* [PATCH v8 00/12] x86emul: further work
@ 2020-05-05  8:10 Jan Beulich
  2020-05-05  8:12 ` [PATCH v8 01/12] x86emul: disable FPU/MMX/SIMD insn emulation when !HVM Jan Beulich
                   ` (12 more replies)
  0 siblings, 13 replies; 51+ messages in thread
From: Jan Beulich @ 2020-05-05  8:10 UTC (permalink / raw)
  To: xen-devel; +Cc: Andrew Cooper, Wei Liu, Roger Pau Monne

The RDPRU patch, at least, is still partly RFC. I'd appreciate it,
though, if at least some of the series could go in sooner rather
than later. Note in particular that there's no strong ordering
throughout the entire series, i.e. certain later parts could be
arranged to go in earlier. This is specifically the case for what
is now the last patch.

 1: x86emul: disable FPU/MMX/SIMD insn emulation when !HVM
 2: x86emul: support MOVDIR{I,64B} insns
 3: x86emul: support ENQCMD insn
 4: x86emul: support SERIALIZE
 5: x86emul: support X{SUS,RES}LDTRK
 6: x86/HVM: make hvmemul_blk() capable of handling r/o operations
 7: x86emul: support FNSTENV and FNSAVE
 8: x86emul: support FLDENV and FRSTOR
 9: x86emul: support FXSAVE/FXRSTOR
10: x86/HVM: scale MPERF values reported to guests (on AMD)
11: x86emul: support RDPRU
12: x86/HVM: don't needlessly intercept APERF/MPERF/TSC MSR reads

Main changes from v7 are the new patch 6 and quite a bit of re-work
of what is now patch 9. See individual patches for revision details.

Jan



* [PATCH v8 01/12] x86emul: disable FPU/MMX/SIMD insn emulation when !HVM
  2020-05-05  8:10 [PATCH v8 00/12] x86emul: further work Jan Beulich
@ 2020-05-05  8:12 ` Jan Beulich
  2020-05-07 18:11   ` Andrew Cooper
  2020-05-05  8:13 ` [PATCH v8 02/12] x86emul: support MOVDIR{I,64B} insns Jan Beulich
                   ` (11 subsequent siblings)
  12 siblings, 1 reply; 51+ messages in thread
From: Jan Beulich @ 2020-05-05  8:12 UTC (permalink / raw)
  To: xen-devel; +Cc: Andrew Cooper, Wei Liu, Roger Pau Monne

In a pure PV environment (the PV shim in particular) we don't really
need emulation of all these. To limit #ifdef-ary, utilize some of the
CASE_*() macros we have, by providing variants expanding to
(effectively) nothing (really a label, which in turn requires passing
-Wno-unused-label to the compiler when building such configurations).

Due to the mixture of macro and #ifdef use, the placement of some of
the #ifdef-s is a little arbitrary.

The resulting object file's .text is less than half the size of the
original, and it also appears to compile a little more quickly.

This is meant as a first step; more parts can likely be disabled down
the road.
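
For illustration (not part of the patch): with both MMX and SIMD
compiled out, a use like

    CASE_SIMD_PACKED_INT(0x0f, 0x60):

expands, via the degenerate

    #define CASE_SIMD_PACKED_INT(pfx, opc) C##pfx##_##opc

variant below, to the plain label "C0x0f_0x60:" instead of to a pair
of case labels. The switch() body hence still compiles unchanged; the
label merely goes unused, whence -Wno-unused-label.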

Suggested-by: Andrew Cooper <andrew.cooper3@citrix.com>
Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v7: Integrate into this series. Re-base.
---
I'll be happy to take suggestions that allow avoiding -Wno-unused-label.

--- a/xen/arch/x86/Makefile
+++ b/xen/arch/x86/Makefile
@@ -73,6 +73,9 @@ obj-y += vm_event.o
 obj-y += xstate.o
 extra-y += asm-macros.i
 
+ifneq ($(CONFIG_HVM),y)
+x86_emulate.o: CFLAGS-y += -Wno-unused-label
+endif
 x86_emulate.o: x86_emulate/x86_emulate.c x86_emulate/x86_emulate.h
 
 efi-y := $(shell if [ ! -r $(BASEDIR)/include/xen/compile.h -o \
--- a/xen/arch/x86/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate.c
@@ -42,6 +42,12 @@
     }                                                      \
 })
 
+#ifndef CONFIG_HVM
+# define X86EMUL_NO_FPU
+# define X86EMUL_NO_MMX
+# define X86EMUL_NO_SIMD
+#endif
+
 #include "x86_emulate/x86_emulate.c"
 
 int x86emul_read_xcr(unsigned int reg, uint64_t *val,
--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -3492,6 +3492,7 @@ x86_decode(
             op_bytes = 4;
         break;
 
+#ifndef X86EMUL_NO_SIMD
     case simd_packed_int:
         switch ( vex.pfx )
         {
@@ -3557,6 +3558,7 @@ x86_decode(
     case simd_256:
         op_bytes = 32;
         break;
+#endif /* !X86EMUL_NO_SIMD */
 
     default:
         op_bytes = 0;
@@ -3711,6 +3713,7 @@ x86_emulate(
         break;
     }
 
+#ifndef X86EMUL_NO_SIMD
     /* With a memory operand, fetch the mask register in use (if any). */
     if ( ea.type == OP_MEM && evex.opmsk &&
          _get_fpu(fpu_type = X86EMUL_FPU_opmask, ctxt, ops) == X86EMUL_OKAY )
@@ -3741,6 +3744,7 @@ x86_emulate(
         put_fpu(X86EMUL_FPU_opmask, false, state, ctxt, ops);
         fpu_type = X86EMUL_FPU_none;
     }
+#endif /* !X86EMUL_NO_SIMD */
 
     /* Decode (but don't fetch) the destination operand: register or memory. */
     switch ( d & DstMask )
@@ -4386,11 +4390,13 @@ x86_emulate(
         singlestep = _regs.eflags & X86_EFLAGS_TF;
         break;
 
+#ifndef X86EMUL_NO_FPU
     case 0x9b:  /* wait/fwait */
         host_and_vcpu_must_have(fpu);
         get_fpu(X86EMUL_FPU_wait);
         emulate_fpu_insn_stub(b);
         break;
+#endif
 
     case 0x9c: /* pushf */
         if ( (_regs.eflags & X86_EFLAGS_VM) &&
@@ -4800,6 +4806,7 @@ x86_emulate(
         break;
     }
 
+#ifndef X86EMUL_NO_FPU
     case 0xd8: /* FPU 0xd8 */
         host_and_vcpu_must_have(fpu);
         get_fpu(X86EMUL_FPU_fpu);
@@ -5134,6 +5141,7 @@ x86_emulate(
             }
         }
         break;
+#endif /* !X86EMUL_NO_FPU */
 
     case 0xe0 ... 0xe2: /* loop{,z,nz} */ {
         unsigned long count = get_loop_count(&_regs, ad_bytes);
@@ -6079,6 +6087,8 @@ x86_emulate(
     case X86EMUL_OPC(0x0f, 0x19) ... X86EMUL_OPC(0x0f, 0x1f): /* nop */
         break;
 
+#ifndef X86EMUL_NO_MMX
+
     case X86EMUL_OPC(0x0f, 0x0e): /* femms */
         host_and_vcpu_must_have(3dnow);
         asm volatile ( "femms" );
@@ -6099,39 +6109,71 @@ x86_emulate(
         state->simd_size = simd_other;
         goto simd_0f_imm8;
 
-#define CASE_SIMD_PACKED_INT(pfx, opc)       \
+#endif /* !X86EMUL_NO_MMX */
+
+#if !defined(X86EMUL_NO_SIMD) && !defined(X86EMUL_NO_MMX)
+# define CASE_SIMD_PACKED_INT(pfx, opc)      \
     case X86EMUL_OPC(pfx, opc):              \
     case X86EMUL_OPC_66(pfx, opc)
-#define CASE_SIMD_PACKED_INT_VEX(pfx, opc)   \
+#elif !defined(X86EMUL_NO_SIMD)
+# define CASE_SIMD_PACKED_INT(pfx, opc)      \
+    case X86EMUL_OPC_66(pfx, opc)
+#elif !defined(X86EMUL_NO_MMX)
+# define CASE_SIMD_PACKED_INT(pfx, opc)      \
+    case X86EMUL_OPC(pfx, opc)
+#else
+# define CASE_SIMD_PACKED_INT(pfx, opc) C##pfx##_##opc
+#endif
+
+#ifndef X86EMUL_NO_SIMD
+
+# define CASE_SIMD_PACKED_INT_VEX(pfx, opc)  \
     CASE_SIMD_PACKED_INT(pfx, opc):          \
     case X86EMUL_OPC_VEX_66(pfx, opc)
 
-#define CASE_SIMD_ALL_FP(kind, pfx, opc)     \
+# define CASE_SIMD_ALL_FP(kind, pfx, opc)    \
     CASE_SIMD_PACKED_FP(kind, pfx, opc):     \
     CASE_SIMD_SCALAR_FP(kind, pfx, opc)
-#define CASE_SIMD_PACKED_FP(kind, pfx, opc)  \
+# define CASE_SIMD_PACKED_FP(kind, pfx, opc) \
     case X86EMUL_OPC##kind(pfx, opc):        \
     case X86EMUL_OPC##kind##_66(pfx, opc)
-#define CASE_SIMD_SCALAR_FP(kind, pfx, opc)  \
+# define CASE_SIMD_SCALAR_FP(kind, pfx, opc) \
     case X86EMUL_OPC##kind##_F3(pfx, opc):   \
     case X86EMUL_OPC##kind##_F2(pfx, opc)
-#define CASE_SIMD_SINGLE_FP(kind, pfx, opc)  \
+# define CASE_SIMD_SINGLE_FP(kind, pfx, opc) \
     case X86EMUL_OPC##kind(pfx, opc):        \
     case X86EMUL_OPC##kind##_F3(pfx, opc)
 
-#define CASE_SIMD_ALL_FP_VEX(pfx, opc)       \
+# define CASE_SIMD_ALL_FP_VEX(pfx, opc)      \
     CASE_SIMD_ALL_FP(, pfx, opc):            \
     CASE_SIMD_ALL_FP(_VEX, pfx, opc)
-#define CASE_SIMD_PACKED_FP_VEX(pfx, opc)    \
+# define CASE_SIMD_PACKED_FP_VEX(pfx, opc)   \
     CASE_SIMD_PACKED_FP(, pfx, opc):         \
     CASE_SIMD_PACKED_FP(_VEX, pfx, opc)
-#define CASE_SIMD_SCALAR_FP_VEX(pfx, opc)    \
+# define CASE_SIMD_SCALAR_FP_VEX(pfx, opc)   \
     CASE_SIMD_SCALAR_FP(, pfx, opc):         \
     CASE_SIMD_SCALAR_FP(_VEX, pfx, opc)
-#define CASE_SIMD_SINGLE_FP_VEX(pfx, opc)    \
+# define CASE_SIMD_SINGLE_FP_VEX(pfx, opc)   \
     CASE_SIMD_SINGLE_FP(, pfx, opc):         \
     CASE_SIMD_SINGLE_FP(_VEX, pfx, opc)
 
+#else
+
+# define CASE_SIMD_PACKED_INT_VEX(pfx, opc)  \
+    CASE_SIMD_PACKED_INT(pfx, opc)
+
+# define CASE_SIMD_ALL_FP(kind, pfx, opc)    C##kind##pfx##_##opc
+# define CASE_SIMD_PACKED_FP(kind, pfx, opc) Cp##kind##pfx##_##opc
+# define CASE_SIMD_SCALAR_FP(kind, pfx, opc) Cs##kind##pfx##_##opc
+# define CASE_SIMD_SINGLE_FP(kind, pfx, opc) C##kind##pfx##_##opc
+
+# define CASE_SIMD_ALL_FP_VEX(pfx, opc)    CASE_SIMD_ALL_FP(, pfx, opc)
+# define CASE_SIMD_PACKED_FP_VEX(pfx, opc) CASE_SIMD_PACKED_FP(, pfx, opc)
+# define CASE_SIMD_SCALAR_FP_VEX(pfx, opc) CASE_SIMD_SCALAR_FP(, pfx, opc)
+# define CASE_SIMD_SINGLE_FP_VEX(pfx, opc) CASE_SIMD_SINGLE_FP(, pfx, opc)
+
+#endif
+
     CASE_SIMD_SCALAR_FP(, 0x0f, 0x2b):     /* movnts{s,d} xmm,mem */
         host_and_vcpu_must_have(sse4a);
         /* fall through */
@@ -6269,6 +6311,8 @@ x86_emulate(
         insn_bytes = EVEX_PFX_BYTES + 2;
         break;
 
+#ifndef X86EMUL_NO_SIMD
+
     case X86EMUL_OPC_66(0x0f, 0x12):       /* movlpd m64,xmm */
     case X86EMUL_OPC_VEX_66(0x0f, 0x12):   /* vmovlpd m64,xmm,xmm */
     CASE_SIMD_PACKED_FP_VEX(0x0f, 0x13):   /* movlp{s,d} xmm,m64 */
@@ -6375,6 +6419,8 @@ x86_emulate(
         avx512_vlen_check(false);
         goto simd_zmm;
 
+#endif /* !X86EMUL_NO_SIMD */
+
     case X86EMUL_OPC(0x0f, 0x20): /* mov cr,reg */
     case X86EMUL_OPC(0x0f, 0x21): /* mov dr,reg */
     case X86EMUL_OPC(0x0f, 0x22): /* mov reg,cr */
@@ -6401,6 +6447,8 @@ x86_emulate(
             goto done;
         break;
 
+#if !defined(X86EMUL_NO_MMX) && !defined(X86EMUL_NO_SIMD)
+
     case X86EMUL_OPC_66(0x0f, 0x2a):       /* cvtpi2pd mm/m64,xmm */
         if ( ea.type == OP_REG )
         {
@@ -6412,6 +6460,8 @@ x86_emulate(
         op_bytes = (b & 4) && (vex.pfx & VEX_PREFIX_DOUBLE_MASK) ? 16 : 8;
         goto simd_0f_fp;
 
+#endif /* !X86EMUL_NO_MMX && !X86EMUL_NO_SIMD */
+
     CASE_SIMD_SCALAR_FP_VEX(0x0f, 0x2a):   /* {,v}cvtsi2s{s,d} r/m,xmm */
         if ( vex.opcx == vex_none )
         {
@@ -6758,6 +6808,8 @@ x86_emulate(
             dst.val = src.val;
         break;
 
+#ifndef X86EMUL_NO_SIMD
+
     case X86EMUL_OPC_VEX(0x0f, 0x4a):    /* kadd{w,q} k,k,k */
         if ( !vex.w )
             host_and_vcpu_must_have(avx512dq);
@@ -6812,6 +6864,8 @@ x86_emulate(
         generate_exception_if(!vex.l || vex.w, EXC_UD);
         goto opmask_common;
 
+#endif /* X86EMUL_NO_SIMD */
+
     CASE_SIMD_PACKED_FP_VEX(0x0f, 0x50):   /* movmskp{s,d} xmm,reg */
                                            /* vmovmskp{s,d} {x,y}mm,reg */
     CASE_SIMD_PACKED_INT_VEX(0x0f, 0xd7):  /* pmovmskb {,x}mm,reg */
@@ -6895,6 +6949,8 @@ x86_emulate(
                          evex.w);
         goto avx512f_all_fp;
 
+#ifndef X86EMUL_NO_SIMD
+
     CASE_SIMD_PACKED_FP_VEX(0x0f, 0x5b):   /* cvt{ps,dq}2{dq,ps} xmm/mem,xmm */
                                            /* vcvt{ps,dq}2{dq,ps} {x,y}mm/mem,{x,y}mm */
     case X86EMUL_OPC_F3(0x0f, 0x5b):       /* cvttps2dq xmm/mem,xmm */
@@ -6925,6 +6981,8 @@ x86_emulate(
         op_bytes = 16 << evex.lr;
         goto simd_zmm;
 
+#endif /* !X86EMUL_NO_SIMD */
+
     CASE_SIMD_PACKED_INT_VEX(0x0f, 0x60): /* punpcklbw {,x}mm/mem,{,x}mm */
                                           /* vpunpcklbw {x,y}mm/mem,{x,y}mm,{x,y}mm */
     CASE_SIMD_PACKED_INT_VEX(0x0f, 0x61): /* punpcklwd {,x}mm/mem,{,x}mm */
@@ -6951,6 +7009,7 @@ x86_emulate(
                                           /* vpackusbw {x,y}mm/mem,{x,y}mm,{x,y}mm */
     CASE_SIMD_PACKED_INT_VEX(0x0f, 0x6b): /* packsswd {,x}mm/mem,{,x}mm */
                                           /* vpacksswd {x,y}mm/mem,{x,y}mm,{x,y}mm */
+#ifndef X86EMUL_NO_SIMD
     case X86EMUL_OPC_66(0x0f, 0x6c):     /* punpcklqdq xmm/m128,xmm */
     case X86EMUL_OPC_VEX_66(0x0f, 0x6c): /* vpunpcklqdq {x,y}mm/mem,{x,y}mm,{x,y}mm */
     case X86EMUL_OPC_66(0x0f, 0x6d):     /* punpckhqdq xmm/m128,xmm */
@@ -7035,6 +7094,7 @@ x86_emulate(
                                           /* vpsubd {x,y}mm/mem,{x,y}mm,{x,y}mm */
     case X86EMUL_OPC_66(0x0f, 0xfb):     /* psubq xmm/m128,xmm */
     case X86EMUL_OPC_VEX_66(0x0f, 0xfb): /* vpsubq {x,y}mm/mem,{x,y}mm,{x,y}mm */
+#endif /* !X86EMUL_NO_SIMD */
     CASE_SIMD_PACKED_INT_VEX(0x0f, 0xfc): /* paddb {,x}mm/mem,{,x}mm */
                                           /* vpaddb {x,y}mm/mem,{x,y}mm,{x,y}mm */
     CASE_SIMD_PACKED_INT_VEX(0x0f, 0xfd): /* paddw {,x}mm/mem,{,x}mm */
@@ -7042,6 +7102,7 @@ x86_emulate(
     CASE_SIMD_PACKED_INT_VEX(0x0f, 0xfe): /* paddd {,x}mm/mem,{,x}mm */
                                           /* vpaddd {x,y}mm/mem,{x,y}mm,{x,y}mm */
     simd_0f_int:
+#ifndef X86EMUL_NO_SIMD
         if ( vex.opcx != vex_none )
         {
     case X86EMUL_OPC_VEX_66(0x0f38, 0x00): /* vpshufb {x,y}mm/mem,{x,y}mm,{x,y}mm */
@@ -7083,11 +7144,14 @@ x86_emulate(
         }
         if ( vex.pfx )
             goto simd_0f_sse2;
+#endif /* !X86EMUL_NO_SIMD */
     simd_0f_mmx:
         host_and_vcpu_must_have(mmx);
         get_fpu(X86EMUL_FPU_mmx);
         goto simd_0f_common;
 
+#ifndef X86EMUL_NO_SIMD
+
     case X86EMUL_OPC_EVEX_66(0x0f, 0xf6): /* vpsadbw [xyz]mm/mem,[xyz]mm,[xyz]mm */
         generate_exception_if(evex.opmsk, EXC_UD);
         /* fall through */
@@ -7181,6 +7245,8 @@ x86_emulate(
         generate_exception_if(!evex.w, EXC_UD);
         goto avx512f_no_sae;
 
+#endif /* X86EMUL_NO_SIMD */
+
     CASE_SIMD_PACKED_INT_VEX(0x0f, 0x6e): /* mov{d,q} r/m,{,x}mm */
                                           /* vmov{d,q} r/m,xmm */
     CASE_SIMD_PACKED_INT_VEX(0x0f, 0x7e): /* mov{d,q} {,x}mm,r/m */
@@ -7222,6 +7288,8 @@ x86_emulate(
         ASSERT(!state->simd_size);
         break;
 
+#ifndef X86EMUL_NO_SIMD
+
     case X86EMUL_OPC_EVEX_66(0x0f, 0x6e): /* vmov{d,q} r/m,xmm */
     case X86EMUL_OPC_EVEX_66(0x0f, 0x7e): /* vmov{d,q} xmm,r/m */
         generate_exception_if((evex.lr || evex.opmsk || evex.brs ||
@@ -7294,11 +7362,15 @@ x86_emulate(
         d |= TwoOp;
         /* fall through */
     case X86EMUL_OPC_66(0x0f, 0xd6):     /* movq xmm,xmm/m64 */
+#endif /* !X86EMUL_NO_SIMD */
+#ifndef X86EMUL_NO_MMX
     case X86EMUL_OPC(0x0f, 0x6f):        /* movq mm/m64,mm */
     case X86EMUL_OPC(0x0f, 0x7f):        /* movq mm,mm/m64 */
+#endif
         op_bytes = 8;
         goto simd_0f_int;
 
+#ifndef X86EMUL_NO_SIMD
     CASE_SIMD_PACKED_INT_VEX(0x0f, 0x70):/* pshuf{w,d} $imm8,{,x}mm/mem,{,x}mm */
                                          /* vpshufd $imm8,{x,y}mm/mem,{x,y}mm */
     case X86EMUL_OPC_F3(0x0f, 0x70):     /* pshufhw $imm8,xmm/m128,xmm */
@@ -7307,12 +7379,15 @@ x86_emulate(
     case X86EMUL_OPC_VEX_F2(0x0f, 0x70): /* vpshuflw $imm8,{x,y}mm/mem,{x,y}mm */
         d = (d & ~SrcMask) | SrcMem | TwoOp;
         op_bytes = vex.pfx ? 16 << vex.l : 8;
+#endif
     simd_0f_int_imm8:
         if ( vex.opcx != vex_none )
         {
+#ifndef X86EMUL_NO_SIMD
     case X86EMUL_OPC_VEX_66(0x0f3a, 0x0e): /* vpblendw $imm8,{x,y}mm/mem,{x,y}mm,{x,y}mm */
     case X86EMUL_OPC_VEX_66(0x0f3a, 0x0f): /* vpalignr $imm8,{x,y}mm/mem,{x,y}mm,{x,y}mm */
     case X86EMUL_OPC_VEX_66(0x0f3a, 0x42): /* vmpsadbw $imm8,{x,y}mm/mem,{x,y}mm,{x,y}mm */
+#endif
             if ( vex.l )
             {
     simd_0f_imm8_avx2:
@@ -7320,6 +7395,7 @@ x86_emulate(
             }
             else
             {
+#ifndef X86EMUL_NO_SIMD
     case X86EMUL_OPC_VEX_66(0x0f3a, 0x08): /* vroundps $imm8,{x,y}mm/mem,{x,y}mm */
     case X86EMUL_OPC_VEX_66(0x0f3a, 0x09): /* vroundpd $imm8,{x,y}mm/mem,{x,y}mm */
     case X86EMUL_OPC_VEX_66(0x0f3a, 0x0a): /* vroundss $imm8,{x,y}mm/mem,{x,y}mm,{x,y}mm */
@@ -7327,6 +7403,7 @@ x86_emulate(
     case X86EMUL_OPC_VEX_66(0x0f3a, 0x0c): /* vblendps $imm8,{x,y}mm/mem,{x,y}mm,{x,y}mm */
     case X86EMUL_OPC_VEX_66(0x0f3a, 0x0d): /* vblendpd $imm8,{x,y}mm/mem,{x,y}mm,{x,y}mm */
     case X86EMUL_OPC_VEX_66(0x0f3a, 0x40): /* vdpps $imm8,{x,y}mm/mem,{x,y}mm,{x,y}mm */
+#endif
     simd_0f_imm8_avx:
                 host_and_vcpu_must_have(avx);
             }
@@ -7360,6 +7437,8 @@ x86_emulate(
         insn_bytes = PFX_BYTES + 3;
         break;
 
+#ifndef X86EMUL_NO_SIMD
+
     case X86EMUL_OPC_EVEX_66(0x0f, 0x70): /* vpshufd $imm8,[xyz]mm/mem,[xyz]mm{k} */
     case X86EMUL_OPC_EVEX_F3(0x0f, 0x70): /* vpshufhw $imm8,[xyz]mm/mem,[xyz]mm{k} */
     case X86EMUL_OPC_EVEX_F2(0x0f, 0x70): /* vpshuflw $imm8,[xyz]mm/mem,[xyz]mm{k} */
@@ -7418,6 +7497,9 @@ x86_emulate(
         opc[1] = modrm;
         opc[2] = imm1;
         insn_bytes = PFX_BYTES + 3;
+
+#endif /* X86EMUL_NO_SIMD */
+
     simd_0f_reg_only:
         opc[insn_bytes - PFX_BYTES] = 0xc3;
 
@@ -7428,6 +7510,8 @@ x86_emulate(
         ASSERT(!state->simd_size);
         break;
 
+#ifndef X86EMUL_NO_SIMD
+
     case X86EMUL_OPC_EVEX_66(0x0f, 0x71): /* Grp12 */
         switch ( modrm_reg & 7 )
         {
@@ -7459,6 +7543,9 @@ x86_emulate(
         }
         goto unrecognized_insn;
 
+#endif /* !X86EMUL_NO_SIMD */
+#ifndef X86EMUL_NO_MMX
+
     case X86EMUL_OPC(0x0f, 0x73):        /* Grp14 */
         switch ( modrm_reg & 7 )
         {
@@ -7468,6 +7555,9 @@ x86_emulate(
         }
         goto unrecognized_insn;
 
+#endif /* !X86EMUL_NO_MMX */
+#ifndef X86EMUL_NO_SIMD
+
     case X86EMUL_OPC_66(0x0f, 0x73):
     case X86EMUL_OPC_VEX_66(0x0f, 0x73):
         switch ( modrm_reg & 7 )
@@ -7498,7 +7588,12 @@ x86_emulate(
         }
         goto unrecognized_insn;
 
+#endif /* !X86EMUL_NO_SIMD */
+
+#ifndef X86EMUL_NO_MMX
     case X86EMUL_OPC(0x0f, 0x77):        /* emms */
+#endif
+#ifndef X86EMUL_NO_SIMD
     case X86EMUL_OPC_VEX(0x0f, 0x77):    /* vzero{all,upper} */
         if ( vex.opcx != vex_none )
         {
@@ -7544,6 +7639,7 @@ x86_emulate(
 #endif
         }
         else
+#endif /* !X86EMUL_NO_SIMD */
         {
             host_and_vcpu_must_have(mmx);
             get_fpu(X86EMUL_FPU_mmx);
@@ -7557,6 +7653,8 @@ x86_emulate(
         insn_bytes = PFX_BYTES + 1;
         goto simd_0f_reg_only;
 
+#ifndef X86EMUL_NO_SIMD
+
     case X86EMUL_OPC_66(0x0f, 0x78):     /* Grp17 */
         switch ( modrm_reg & 7 )
         {
@@ -7654,6 +7752,8 @@ x86_emulate(
         op_bytes = 8;
         goto simd_zmm;
 
+#endif /* !X86EMUL_NO_SIMD */
+
     case X86EMUL_OPC(0x0f, 0x80) ... X86EMUL_OPC(0x0f, 0x8f): /* jcc (near) */
         if ( test_cc(b, _regs.eflags) )
             jmp_rel((int32_t)src.val);
@@ -7664,6 +7764,8 @@ x86_emulate(
         dst.val = test_cc(b, _regs.eflags);
         break;
 
+#ifndef X86EMUL_NO_SIMD
+
     case X86EMUL_OPC_VEX(0x0f, 0x91):    /* kmov{w,q} k,mem */
     case X86EMUL_OPC_VEX_66(0x0f, 0x91): /* kmov{b,d} k,mem */
         generate_exception_if(ea.type != OP_MEM, EXC_UD);
@@ -7812,6 +7914,8 @@ x86_emulate(
         dst.type = OP_NONE;
         break;
 
+#endif /* !X86EMUL_NO_SIMD */
+
     case X86EMUL_OPC(0x0f, 0xa2): /* cpuid */
         msr_val = 0;
         fail_if(ops->cpuid == NULL);
@@ -7908,6 +8012,7 @@ x86_emulate(
     case X86EMUL_OPC(0x0f, 0xae): case X86EMUL_OPC_66(0x0f, 0xae): /* Grp15 */
         switch ( modrm_reg & 7 )
         {
+#ifndef X86EMUL_NO_SIMD
         case 2: /* ldmxcsr */
             generate_exception_if(vex.pfx, EXC_UD);
             vcpu_must_have(sse);
@@ -7926,6 +8031,7 @@ x86_emulate(
             get_fpu(vex.opcx ? X86EMUL_FPU_ymm : X86EMUL_FPU_xmm);
             asm volatile ( "stmxcsr %0" : "=m" (dst.val) );
             break;
+#endif /* X86EMUL_NO_SIMD */
 
         case 5: /* lfence */
             fail_if(modrm_mod != 3);
@@ -7974,6 +8080,8 @@ x86_emulate(
         }
         break;
 
+#ifndef X86EMUL_NO_SIMD
+
     case X86EMUL_OPC_VEX(0x0f, 0xae): /* Grp15 */
         switch ( modrm_reg & 7 )
         {
@@ -7988,6 +8096,8 @@ x86_emulate(
         }
         goto unrecognized_insn;
 
+#endif /* !X86EMUL_NO_SIMD */
+
     case X86EMUL_OPC_F3(0x0f, 0xae): /* Grp15 */
         fail_if(modrm_mod != 3);
         generate_exception_if((modrm_reg & 4) || !mode_64bit(), EXC_UD);
@@ -8227,6 +8337,8 @@ x86_emulate(
         }
         goto simd_0f_imm8_avx;
 
+#ifndef X86EMUL_NO_SIMD
+
     CASE_SIMD_ALL_FP(_EVEX, 0x0f, 0xc2): /* vcmp{p,s}{s,d} $imm8,[xyz]mm/mem,[xyz]mm,k{k} */
         generate_exception_if((evex.w != (evex.pfx & VEX_PREFIX_DOUBLE_MASK) ||
                                (ea.type != OP_REG && evex.brs &&
@@ -8253,6 +8365,8 @@ x86_emulate(
         insn_bytes = EVEX_PFX_BYTES + 3;
         break;
 
+#endif /* !X86EMUL_NO_SIMD */
+
     case X86EMUL_OPC(0x0f, 0xc3): /* movnti */
         /* Ignore the non-temporal hint for now. */
         vcpu_must_have(sse2);
@@ -8267,6 +8381,8 @@ x86_emulate(
         ea.type = OP_MEM;
         goto simd_0f_int_imm8;
 
+#ifndef X86EMUL_NO_SIMD
+
     case X86EMUL_OPC_EVEX_66(0x0f, 0xc4):   /* vpinsrw $imm8,r32/m16,xmm,xmm */
     case X86EMUL_OPC_EVEX_66(0x0f3a, 0x20): /* vpinsrb $imm8,r32/m8,xmm,xmm */
     case X86EMUL_OPC_EVEX_66(0x0f3a, 0x22): /* vpinsr{d,q} $imm8,r/m,xmm,xmm */
@@ -8284,6 +8400,8 @@ x86_emulate(
         state->simd_size = simd_other;
         goto avx512f_imm8_no_sae;
 
+#endif /* !X86EMUL_NO_SIMD */
+
     CASE_SIMD_PACKED_INT_VEX(0x0f, 0xc5):  /* pextrw $imm8,{,x}mm,reg */
                                            /* vpextrw $imm8,xmm,reg */
         generate_exception_if(vex.l, EXC_UD);
@@ -8299,6 +8417,8 @@ x86_emulate(
         insn_bytes = PFX_BYTES + 3;
         goto simd_0f_to_gpr;
 
+#ifndef X86EMUL_NO_SIMD
+
     CASE_SIMD_PACKED_FP(_EVEX, 0x0f, 0xc6): /* vshufp{s,d} $imm8,[xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
         generate_exception_if(evex.w != (evex.pfx & VEX_PREFIX_DOUBLE_MASK),
                               EXC_UD);
@@ -8313,6 +8433,8 @@ x86_emulate(
         avx512_vlen_check(false);
         goto simd_imm8_zmm;
 
+#endif /* X86EMUL_NO_SIMD */
+
     case X86EMUL_OPC(0x0f, 0xc7): /* Grp9 */
     {
         union {
@@ -8503,6 +8625,8 @@ x86_emulate(
         }
         break;
 
+#ifndef X86EMUL_NO_SIMD
+
     case X86EMUL_OPC_EVEX_66(0x0f, 0xd2): /* vpsrld xmm/m128,[xyz]mm,[xyz]mm{k} */
     case X86EMUL_OPC_EVEX_66(0x0f, 0xd3): /* vpsrlq xmm/m128,[xyz]mm,[xyz]mm{k} */
     case X86EMUL_OPC_EVEX_66(0x0f, 0xe2): /* vpsra{d,q} xmm/m128,[xyz]mm,[xyz]mm{k} */
@@ -8524,12 +8648,18 @@ x86_emulate(
         generate_exception_if(evex.w != (b & 1), EXC_UD);
         goto avx512f_no_sae;
 
+#endif /* !X86EMUL_NO_SIMD */
+#ifndef X86EMUL_NO_MMX
+
     case X86EMUL_OPC(0x0f, 0xd4):        /* paddq mm/m64,mm */
     case X86EMUL_OPC(0x0f, 0xf4):        /* pmuludq mm/m64,mm */
     case X86EMUL_OPC(0x0f, 0xfb):        /* psubq mm/m64,mm */
         vcpu_must_have(sse2);
         goto simd_0f_mmx;
 
+#endif /* !X86EMUL_NO_MMX */
+#if !defined(X86EMUL_NO_MMX) && !defined(X86EMUL_NO_SIMD)
+
     case X86EMUL_OPC_F3(0x0f, 0xd6):     /* movq2dq mm,xmm */
     case X86EMUL_OPC_F2(0x0f, 0xd6):     /* movdq2q xmm,mm */
         generate_exception_if(ea.type != OP_REG, EXC_UD);
@@ -8537,6 +8667,9 @@ x86_emulate(
         host_and_vcpu_must_have(mmx);
         goto simd_0f_int;
 
+#endif /* !X86EMUL_NO_MMX && !X86EMUL_NO_SIMD */
+#ifndef X86EMUL_NO_MMX
+
     case X86EMUL_OPC(0x0f, 0xe7):        /* movntq mm,m64 */
         generate_exception_if(ea.type != OP_MEM, EXC_UD);
         sfence = true;
@@ -8552,6 +8685,9 @@ x86_emulate(
         vcpu_must_have(mmxext);
         goto simd_0f_mmx;
 
+#endif /* !X86EMUL_NO_MMX */
+#ifndef X86EMUL_NO_SIMD
+
     case X86EMUL_OPC_EVEX_66(0x0f, 0xda): /* vpminub [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
     case X86EMUL_OPC_EVEX_66(0x0f, 0xde): /* vpmaxub [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
     case X86EMUL_OPC_EVEX_66(0x0f, 0xe4): /* vpmulhuw [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
@@ -8572,6 +8708,8 @@ x86_emulate(
         op_bytes = 8 << (!!(vex.pfx & VEX_PREFIX_DOUBLE_MASK) + vex.l);
         goto simd_0f_cvt;
 
+#endif /* !X86EMUL_NO_SIMD */
+
     CASE_SIMD_PACKED_INT_VEX(0x0f, 0xf7): /* {,v}maskmov{q,dqu} {,x}mm,{,x}mm */
         generate_exception_if(ea.type != OP_REG, EXC_UD);
         if ( vex.opcx != vex_none )
@@ -8675,6 +8813,8 @@ x86_emulate(
         insn_bytes = PFX_BYTES + 3;
         break;
 
+#ifndef X86EMUL_NO_SIMD
+
     case X86EMUL_OPC_VEX_66(0x0f38, 0x19): /* vbroadcastsd xmm/m64,ymm */
     case X86EMUL_OPC_VEX_66(0x0f38, 0x1a): /* vbroadcastf128 m128,ymm */
         generate_exception_if(!vex.l, EXC_UD);
@@ -9257,6 +9397,8 @@ x86_emulate(
         ASSERT(!state->simd_size);
         break;
 
+#endif /* !X86EMUL_NO_SIMD */
+
     case X86EMUL_OPC_66(0x0f38, 0x82): /* invpcid reg,m128 */
         vcpu_must_have(invpcid);
         generate_exception_if(ea.type != OP_MEM, EXC_UD);
@@ -9299,6 +9441,8 @@ x86_emulate(
         state->simd_size = simd_none;
         break;
 
+#ifndef X86EMUL_NO_SIMD
+
     case X86EMUL_OPC_EVEX_66(0x0f38, 0x83): /* vpmultishiftqb [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
         generate_exception_if(!evex.w, EXC_UD);
         host_and_vcpu_must_have(avx512_vbmi);
@@ -9862,6 +10006,8 @@ x86_emulate(
         generate_exception_if(evex.brs || evex.opmsk, EXC_UD);
         goto avx512f_no_sae;
 
+#endif /* !X86EMUL_NO_SIMD */
+
     case X86EMUL_OPC(0x0f38, 0xf0): /* movbe m,r */
     case X86EMUL_OPC(0x0f38, 0xf1): /* movbe r,m */
         vcpu_must_have(movbe);
@@ -10027,6 +10173,8 @@ x86_emulate(
                             : "0" ((uint32_t)src.val), "rm" (_regs.edx) );
         break;
 
+#ifndef X86EMUL_NO_SIMD
+
     case X86EMUL_OPC_VEX_66(0x0f3a, 0x00): /* vpermq $imm8,ymm/m256,ymm */
     case X86EMUL_OPC_VEX_66(0x0f3a, 0x01): /* vpermpd $imm8,ymm/m256,ymm */
         generate_exception_if(!vex.l || !vex.w, EXC_UD);
@@ -10087,6 +10235,8 @@ x86_emulate(
         avx512_vlen_check(b & 2);
         goto simd_imm8_zmm;
 
+#endif /* X86EMUL_NO_SIMD */
+
     CASE_SIMD_PACKED_INT(0x0f3a, 0x0f): /* palignr $imm8,{,x}mm/mem,{,x}mm */
         host_and_vcpu_must_have(ssse3);
         if ( vex.pfx )
@@ -10114,6 +10264,8 @@ x86_emulate(
         insn_bytes = PFX_BYTES + 4;
         break;
 
+#ifndef X86EMUL_NO_SIMD
+
     case X86EMUL_OPC_EVEX_66(0x0f3a, 0x42): /* vdbpsadbw $imm8,[xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
         generate_exception_if(evex.w, EXC_UD);
         /* fall through */
@@ -10612,6 +10764,8 @@ x86_emulate(
         generate_exception_if(vex.l, EXC_UD);
         goto simd_0f_imm8_avx;
 
+#endif /* X86EMUL_NO_SIMD */
+
     case X86EMUL_OPC_VEX_F2(0x0f3a, 0xf0): /* rorx imm,r/m,r */
         vcpu_must_have(bmi2);
         generate_exception_if(vex.l || vex.reg != 0xf, EXC_UD);
@@ -10626,6 +10780,8 @@ x86_emulate(
             asm ( "rorl %b1,%k0" : "=g" (dst.val) : "c" (imm1), "0" (src.val) );
         break;
 
+#ifndef X86EMUL_NO_SIMD
+
     case X86EMUL_OPC_XOP(08, 0x85): /* vpmacssww xmm,xmm/m128,xmm,xmm */
     case X86EMUL_OPC_XOP(08, 0x86): /* vpmacsswd xmm,xmm/m128,xmm,xmm */
     case X86EMUL_OPC_XOP(08, 0x87): /* vpmacssdql xmm,xmm/m128,xmm,xmm */
@@ -10661,6 +10817,8 @@ x86_emulate(
         host_and_vcpu_must_have(xop);
         goto simd_0f_imm8_ymm;
 
+#endif /* X86EMUL_NO_SIMD */
+
     case X86EMUL_OPC_XOP(09, 0x01): /* XOP Grp1 */
         switch ( modrm_reg & 7 )
         {
@@ -10720,6 +10878,8 @@ x86_emulate(
         }
         goto unrecognized_insn;
 
+#ifndef X86EMUL_NO_SIMD
+
     case X86EMUL_OPC_XOP(09, 0x82): /* vfrczss xmm/m128,xmm */
     case X86EMUL_OPC_XOP(09, 0x83): /* vfrczsd xmm/m128,xmm */
         generate_exception_if(vex.l, EXC_UD);
@@ -10775,6 +10935,8 @@ x86_emulate(
         host_and_vcpu_must_have(xop);
         goto simd_0f_ymm;
 
+#endif /* X86EMUL_NO_SIMD */
+
     case X86EMUL_OPC_XOP(0a, 0x10): /* bextr imm,r/m,r */
     {
         uint8_t *buf = get_stub(stub);




* [PATCH v8 02/12] x86emul: support MOVDIR{I,64B} insns
  2020-05-05  8:10 [PATCH v8 00/12] x86emul: further work Jan Beulich
  2020-05-05  8:12 ` [PATCH v8 01/12] x86emul: disable FPU/MMX/SIMD insn emulation when !HVM Jan Beulich
@ 2020-05-05  8:13 ` Jan Beulich
  2020-05-07 18:30   ` Andrew Cooper
  2020-05-05  8:13 ` [PATCH v8 03/12] x86emul: support ENQCMD insns Jan Beulich
                   ` (10 subsequent siblings)
  12 siblings, 1 reply; 51+ messages in thread
From: Jan Beulich @ 2020-05-05  8:13 UTC (permalink / raw)
  To: xen-devel; +Cc: Andrew Cooper, Wei Liu, Roger Pau Monne

Introduce a new blk() hook, paralleling the rmw() one in a certain way,
but intended for larger data sizes; hence its HVM intermediate handling
function doesn't fall back to splitting the operation if the requested
virtual address can't be mapped.

Note that SDM revision 071 doesn't specify exception behavior for
ModRM.mod == 0b11; assuming #UD here.
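
In outline (a sketch only, with error handling trimmed), the HVM
handler added below therefore maps the whole range in one go, and it
reports failure rather than splitting the access:

    mapping = hvmemul_map_linear_addr(addr, bytes, pfec, hvmemul_ctxt);
    if ( !mapping )
        return X86EMUL_UNHANDLEABLE; /* deliberately no piecemeal fallback */
    rc = x86_emul_blk(mapping, p_data, bytes, eflags, state, ctxt);
    hvmemul_unmap_linear_addr(mapping, addr, bytes, hvmemul_ctxt);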

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Reviewed-by: Paul Durrant <paul@xen.org>
---
v7: Add blk_NONE. Move the harness's setting of .blk. Correct indentation.
    Re-base.
v6: Fold MOVDIRI and MOVDIR64B changes again. Use blk() for both. All
    tags dropped.
v5: Introduce/use ->blk() hook. Correct asm() operands.
v4: Split MOVDIRI and MOVDIR64B and move this one ahead. Re-base.
v3: Update description.
---
(SDE: -tnt)

--- a/tools/tests/x86_emulator/test_x86_emulator.c
+++ b/tools/tests/x86_emulator/test_x86_emulator.c
@@ -652,6 +652,18 @@ static int cmpxchg(
     return X86EMUL_OKAY;
 }
 
+static int blk(
+    enum x86_segment seg,
+    unsigned long offset,
+    void *p_data,
+    unsigned int bytes,
+    uint32_t *eflags,
+    struct x86_emulate_state *state,
+    struct x86_emulate_ctxt *ctxt)
+{
+    return x86_emul_blk((void *)offset, p_data, bytes, eflags, state, ctxt);
+}
+
 static int read_segment(
     enum x86_segment seg,
     struct segment_register *reg,
@@ -721,6 +733,7 @@ static struct x86_emulate_ops emulops =
     .insn_fetch = fetch,
     .write      = write,
     .cmpxchg    = cmpxchg,
+    .blk        = blk,
     .read_segment = read_segment,
     .cpuid      = emul_test_cpuid,
     .read_cr    = emul_test_read_cr,
@@ -2339,6 +2352,50 @@ int main(int argc, char **argv)
         goto fail;
     printf("okay\n");
 
+    printf("%-40s", "Testing movdiri %edx,(%ecx)...");
+    if ( stack_exec && cpu_has_movdiri )
+    {
+        instr[0] = 0x0f; instr[1] = 0x38; instr[2] = 0xf9; instr[3] = 0x11;
+
+        regs.eip = (unsigned long)&instr[0];
+        regs.ecx = (unsigned long)memset(res, -1, 16);
+        regs.edx = 0x44332211;
+
+        rc = x86_emulate(&ctxt, &emulops);
+        if ( (rc != X86EMUL_OKAY) ||
+             (regs.eip != (unsigned long)&instr[4]) ||
+             res[0] != 0x44332211 || ~res[1] )
+            goto fail;
+        printf("okay\n");
+    }
+    else
+        printf("skipped\n");
+
+    printf("%-40s", "Testing movdir64b 144(%edx),%ecx...");
+    if ( stack_exec && cpu_has_movdir64b )
+    {
+        instr[0] = 0x66; instr[1] = 0x0f; instr[2] = 0x38; instr[3] = 0xf8;
+        instr[4] = 0x8a; instr[5] = 0x90; instr[8] = instr[7] = instr[6] = 0;
+
+        regs.eip = (unsigned long)&instr[0];
+        for ( i = 0; i < 64; ++i )
+            res[i] = i - 20;
+        regs.edx = (unsigned long)res;
+        regs.ecx = (unsigned long)(res + 16);
+
+        rc = x86_emulate(&ctxt, &emulops);
+        if ( (rc != X86EMUL_OKAY) ||
+             (regs.eip != (unsigned long)&instr[9]) ||
+             res[15] != -5 || res[32] != 12 )
+            goto fail;
+        for ( i = 16; i < 32; ++i )
+            if ( res[i] != i )
+                goto fail;
+        printf("okay\n");
+    }
+    else
+        printf("skipped\n");
+
     printf("%-40s", "Testing movq %mm3,(%ecx)...");
     if ( stack_exec && cpu_has_mmx )
     {
--- a/tools/tests/x86_emulator/x86-emulate.h
+++ b/tools/tests/x86_emulator/x86-emulate.h
@@ -154,6 +154,8 @@ static inline bool xcr0_mask(uint64_t ma
 #define cpu_has_avx512_vnni (cp.feat.avx512_vnni && xcr0_mask(0xe6))
 #define cpu_has_avx512_bitalg (cp.feat.avx512_bitalg && xcr0_mask(0xe6))
 #define cpu_has_avx512_vpopcntdq (cp.feat.avx512_vpopcntdq && xcr0_mask(0xe6))
+#define cpu_has_movdiri    cp.feat.movdiri
+#define cpu_has_movdir64b  cp.feat.movdir64b
 #define cpu_has_avx512_4vnniw (cp.feat.avx512_4vnniw && xcr0_mask(0xe6))
 #define cpu_has_avx512_4fmaps (cp.feat.avx512_4fmaps && xcr0_mask(0xe6))
 #define cpu_has_avx512_bf16 (cp.feat.avx512_bf16 && xcr0_mask(0xe6))
--- a/xen/arch/x86/arch.mk
+++ b/xen/arch/x86/arch.mk
@@ -47,6 +47,7 @@ $(call as-option-add,CFLAGS,CC,"rdseed %
 $(call as-option-add,CFLAGS,CC,"clwb (%rax)",-DHAVE_AS_CLWB)
 $(call as-option-add,CFLAGS,CC,".equ \"x\"$$(comma)1",-DHAVE_AS_QUOTED_SYM)
 $(call as-option-add,CFLAGS,CC,"invpcid (%rax)$$(comma)%rax",-DHAVE_AS_INVPCID)
+$(call as-option-add,CFLAGS,CC,"movdiri %rax$$(comma)(%rax)",-DHAVE_AS_MOVDIR)
 
 # GAS's idea of true is -1.  Clang's idea is 1
 $(call as-option-add,CFLAGS,CC,\
--- a/xen/arch/x86/hvm/emulate.c
+++ b/xen/arch/x86/hvm/emulate.c
@@ -1441,6 +1441,44 @@ static int hvmemul_rmw(
     return rc;
 }
 
+static int hvmemul_blk(
+    enum x86_segment seg,
+    unsigned long offset,
+    void *p_data,
+    unsigned int bytes,
+    uint32_t *eflags,
+    struct x86_emulate_state *state,
+    struct x86_emulate_ctxt *ctxt)
+{
+    struct hvm_emulate_ctxt *hvmemul_ctxt =
+        container_of(ctxt, struct hvm_emulate_ctxt, ctxt);
+    unsigned long addr;
+    uint32_t pfec = PFEC_page_present | PFEC_write_access;
+    int rc;
+    void *mapping = NULL;
+
+    rc = hvmemul_virtual_to_linear(
+        seg, offset, bytes, NULL, hvm_access_write, hvmemul_ctxt, &addr);
+    if ( rc != X86EMUL_OKAY || !bytes )
+        return rc;
+
+    if ( is_x86_system_segment(seg) )
+        pfec |= PFEC_implicit;
+    else if ( hvmemul_ctxt->seg_reg[x86_seg_ss].dpl == 3 )
+        pfec |= PFEC_user_mode;
+
+    mapping = hvmemul_map_linear_addr(addr, bytes, pfec, hvmemul_ctxt);
+    if ( IS_ERR(mapping) )
+        return ~PTR_ERR(mapping);
+    if ( !mapping )
+        return X86EMUL_UNHANDLEABLE;
+
+    rc = x86_emul_blk(mapping, p_data, bytes, eflags, state, ctxt);
+    hvmemul_unmap_linear_addr(mapping, addr, bytes, hvmemul_ctxt);
+
+    return rc;
+}
+
 static int hvmemul_write_discard(
     enum x86_segment seg,
     unsigned long offset,
@@ -2512,6 +2550,7 @@ static const struct x86_emulate_ops hvm_
     .write         = hvmemul_write,
     .rmw           = hvmemul_rmw,
     .cmpxchg       = hvmemul_cmpxchg,
+    .blk           = hvmemul_blk,
     .validate      = hvmemul_validate,
     .rep_ins       = hvmemul_rep_ins,
     .rep_outs      = hvmemul_rep_outs,
--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -548,6 +548,8 @@ static const struct ext0f38_table {
     [0xf1] = { .to_mem = 1, .two_op = 1 },
     [0xf2 ... 0xf3] = {},
     [0xf5 ... 0xf7] = {},
+    [0xf8] = { .simd_size = simd_other },
+    [0xf9] = { .to_mem = 1, .two_op = 1 /* Mov */ },
 };
 
 /* Shift values between src and dst sizes of pmov{s,z}x{b,w,d}{w,d,q}. */
@@ -851,6 +853,10 @@ struct x86_emulate_state {
         rmw_xchg,
         rmw_xor,
     } rmw;
+    enum {
+        blk_NONE,
+        blk_movdir,
+    } blk;
     uint8_t modrm, modrm_mod, modrm_reg, modrm_rm;
     uint8_t sib_index, sib_scale;
     uint8_t rex_prefix;
@@ -1914,6 +1920,8 @@ amd_like(const struct x86_emulate_ctxt *
 #define vcpu_has_avx512_bitalg() (ctxt->cpuid->feat.avx512_bitalg)
 #define vcpu_has_avx512_vpopcntdq() (ctxt->cpuid->feat.avx512_vpopcntdq)
 #define vcpu_has_rdpid()       (ctxt->cpuid->feat.rdpid)
+#define vcpu_has_movdiri()     (ctxt->cpuid->feat.movdiri)
+#define vcpu_has_movdir64b()   (ctxt->cpuid->feat.movdir64b)
 #define vcpu_has_avx512_4vnniw() (ctxt->cpuid->feat.avx512_4vnniw)
 #define vcpu_has_avx512_4fmaps() (ctxt->cpuid->feat.avx512_4fmaps)
 #define vcpu_has_avx512_bf16() (ctxt->cpuid->feat.avx512_bf16)
@@ -2722,10 +2730,12 @@ x86_decode_0f38(
     {
     case 0x00 ... 0xef:
     case 0xf2 ... 0xf5:
-    case 0xf7 ... 0xff:
+    case 0xf7 ... 0xf8:
+    case 0xfa ... 0xff:
         op_bytes = 0;
         /* fall through */
     case 0xf6: /* adcx / adox */
+    case 0xf9: /* movdiri */
         ctxt->opcode |= MASK_INSR(vex.pfx, X86EMUL_OPC_PFX_MASK);
         break;
 
@@ -10173,6 +10183,34 @@ x86_emulate(
                             : "0" ((uint32_t)src.val), "rm" (_regs.edx) );
         break;
 
+    case X86EMUL_OPC_66(0x0f38, 0xf8): /* movdir64b r,m512 */
+        host_and_vcpu_must_have(movdir64b);
+        generate_exception_if(ea.type != OP_MEM, EXC_UD);
+        src.val = truncate_ea(*dst.reg);
+        generate_exception_if(!is_aligned(x86_seg_es, src.val, 64, ctxt, ops),
+                              EXC_GP, 0);
+        fail_if(!ops->blk);
+        state->blk = blk_movdir;
+        BUILD_BUG_ON(sizeof(*mmvalp) < 64);
+        if ( (rc = ops->read(ea.mem.seg, ea.mem.off, mmvalp, 64,
+                             ctxt)) != X86EMUL_OKAY ||
+             (rc = ops->blk(x86_seg_es, src.val, mmvalp, 64, &_regs.eflags,
+                            state, ctxt)) != X86EMUL_OKAY )
+            goto done;
+        state->simd_size = simd_none;
+        break;
+
+    case X86EMUL_OPC(0x0f38, 0xf9): /* movdiri mem,r */
+        host_and_vcpu_must_have(movdiri);
+        generate_exception_if(dst.type != OP_MEM, EXC_UD);
+        fail_if(!ops->blk);
+        state->blk = blk_movdir;
+        if ( (rc = ops->blk(dst.mem.seg, dst.mem.off, &src.val, op_bytes,
+                            &_regs.eflags, state, ctxt)) != X86EMUL_OKAY )
+            goto done;
+        dst.type = OP_NONE;
+        break;
+
 #ifndef X86EMUL_NO_SIMD
 
     case X86EMUL_OPC_VEX_66(0x0f3a, 0x00): /* vpermq $imm8,ymm/m256,ymm */
@@ -11431,6 +11469,77 @@ int x86_emul_rmw(
 
     return X86EMUL_OKAY;
 }
+
+int x86_emul_blk(
+    void *ptr,
+    void *data,
+    unsigned int bytes,
+    uint32_t *eflags,
+    struct x86_emulate_state *state,
+    struct x86_emulate_ctxt *ctxt)
+{
+    switch ( state->blk )
+    {
+        /*
+         * Throughout this switch(), memory clobbers are used to compensate
+         * that other operands may not properly express the (full) memory
+         * ranges covered.
+         */
+    case blk_movdir:
+        switch ( bytes )
+        {
+#ifdef __x86_64__
+        case sizeof(uint32_t):
+# ifdef HAVE_AS_MOVDIR
+            asm ( "movdiri %0, (%1)"
+                  :: "r" (*(uint32_t *)data), "r" (ptr) : "memory" );
+# else
+            /* movdiri %esi, (%rdi) */
+            asm ( ".byte 0x0f, 0x38, 0xf9, 0x37"
+                  :: "S" (*(uint32_t *)data), "D" (ptr) : "memory" );
+# endif
+            break;
+#endif
+
+        case sizeof(unsigned long):
+#ifdef HAVE_AS_MOVDIR
+            asm ( "movdiri %0, (%1)"
+                  :: "r" (*(unsigned long *)data), "r" (ptr) : "memory" );
+#else
+            /* movdiri %rsi, (%rdi) */
+            asm ( ".byte 0x48, 0x0f, 0x38, 0xf9, 0x37"
+                  :: "S" (*(unsigned long *)data), "D" (ptr) : "memory" );
+#endif
+            break;
+
+        case 64:
+            if ( ((unsigned long)ptr & 0x3f) )
+            {
+                ASSERT_UNREACHABLE();
+                return X86EMUL_UNHANDLEABLE;
+            }
+#ifdef HAVE_AS_MOVDIR
+            asm ( "movdir64b (%0), %1" :: "r" (data), "r" (ptr) : "memory" );
+#else
+            /* movdir64b (%rsi), %rdi */
+            asm ( ".byte 0x66, 0x0f, 0x38, 0xf8, 0x3e"
+                  :: "S" (data), "D" (ptr) : "memory" );
+#endif
+            break;
+
+        default:
+            ASSERT_UNREACHABLE();
+            return X86EMUL_UNHANDLEABLE;
+        }
+        break;
+
+    default:
+        ASSERT_UNREACHABLE();
+        return X86EMUL_UNHANDLEABLE;
+    }
+
+    return X86EMUL_OKAY;
+}
 
 static void __init __maybe_unused build_assertions(void)
 {
--- a/xen/arch/x86/x86_emulate/x86_emulate.h
+++ b/xen/arch/x86/x86_emulate/x86_emulate.h
@@ -310,6 +310,22 @@ struct x86_emulate_ops
         struct x86_emulate_ctxt *ctxt);
 
     /*
+     * blk: Emulate a large (block) memory access.
+     * @p_data: [IN/OUT] (optional) Pointer to source/destination buffer.
+     * @eflags: [IN/OUT] Pointer to EFLAGS to be updated according to
+     *                   instruction effects.
+     * @state:  [IN/OUT] Pointer to (opaque) emulator state.
+     */
+    int (*blk)(
+        enum x86_segment seg,
+        unsigned long offset,
+        void *p_data,
+        unsigned int bytes,
+        uint32_t *eflags,
+        struct x86_emulate_state *state,
+        struct x86_emulate_ctxt *ctxt);
+
+    /*
      * validate: Post-decode, pre-emulate hook to allow caller controlled
      * filtering.
      */
@@ -793,6 +809,14 @@ x86_emul_rmw(
     unsigned int bytes,
     uint32_t *eflags,
     struct x86_emulate_state *state,
+    struct x86_emulate_ctxt *ctxt);
+int
+x86_emul_blk(
+    void *ptr,
+    void *data,
+    unsigned int bytes,
+    uint32_t *eflags,
+    struct x86_emulate_state *state,
     struct x86_emulate_ctxt *ctxt);
 
 static inline void x86_emul_hw_exception(
--- a/xen/include/asm-x86/cpufeature.h
+++ b/xen/include/asm-x86/cpufeature.h
@@ -118,6 +118,8 @@
 #define cpu_has_avx512_bitalg   boot_cpu_has(X86_FEATURE_AVX512_BITALG)
 #define cpu_has_avx512_vpopcntdq boot_cpu_has(X86_FEATURE_AVX512_VPOPCNTDQ)
 #define cpu_has_rdpid           boot_cpu_has(X86_FEATURE_RDPID)
+#define cpu_has_movdiri         boot_cpu_has(X86_FEATURE_MOVDIRI)
+#define cpu_has_movdir64b       boot_cpu_has(X86_FEATURE_MOVDIR64B)
 
 /* CPUID level 0x80000007.edx */
 #define cpu_has_itsc            boot_cpu_has(X86_FEATURE_ITSC)
--- a/xen/include/public/arch-x86/cpufeatureset.h
+++ b/xen/include/public/arch-x86/cpufeatureset.h
@@ -238,6 +238,8 @@ XEN_CPUFEATURE(AVX512_BITALG, 6*32+12) /
 XEN_CPUFEATURE(AVX512_VPOPCNTDQ, 6*32+14) /*A  POPCNT for vectors of DW/QW */
 XEN_CPUFEATURE(RDPID,         6*32+22) /*A  RDPID instruction */
 XEN_CPUFEATURE(CLDEMOTE,      6*32+25) /*A  CLDEMOTE instruction */
+XEN_CPUFEATURE(MOVDIRI,       6*32+27) /*A  MOVDIRI instruction */
+XEN_CPUFEATURE(MOVDIR64B,     6*32+28) /*A  MOVDIR64B instruction */
 
 /* AMD-defined CPU features, CPUID level 0x80000007.edx, word 7 */
 XEN_CPUFEATURE(ITSC,          7*32+ 8) /*   Invariant TSC */




* [PATCH v8 03/12] x86emul: support ENQCMD insns
  2020-05-05  8:10 [PATCH v8 00/12] x86emul: further work Jan Beulich
  2020-05-05  8:12 ` [PATCH v8 01/12] x86emul: disable FPU/MMX/SIMD insn emulation when !HVM Jan Beulich
  2020-05-05  8:13 ` [PATCH v8 02/12] x86emul: support MOVDIR{I,64B} insns Jan Beulich
@ 2020-05-05  8:13 ` Jan Beulich
  2020-05-07 18:59   ` Andrew Cooper
  2020-05-05  8:14 ` [PATCH v8 04/12] x86emul: support SERIALIZE Jan Beulich
                   ` (9 subsequent siblings)
  12 siblings, 1 reply; 51+ messages in thread
From: Jan Beulich @ 2020-05-05  8:13 UTC (permalink / raw)
  To: xen-devel; +Cc: Andrew Cooper, Wei Liu, Roger Pau Monne

Note that the ISA extensions document revision 038 doesn't specify
exception behavior for ModRM.mod == 0b11; assuming #UD here.

No tests are being added to the harness - this would be quite hard,
as we can't just issue the insns against RAM. Their similarity with
MOVDIR64B means the test case there should be good enough to cover
any fundamental flaws.
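
The insns' only architectural result is ZF (set when the command was
not accepted). As can be seen below, the blk() handler performs the
actual enqueue via a host-side ENQCMDS in all cases - usable for
ENQCMD as well, since the PASID will have been merged into the data
block already - along the lines of this (simplified) sketch:

    bool zf;

    asm ( "enqcmds (%[src]), %[dst]"
          : [zf] "=@ccz" (zf)
          : [src] "r" (data), [dst] "r" (ptr) : "memory" );
    *eflags &= ~EFLAGS_MASK;
    if ( zf )
        *eflags |= X86_EFLAGS_ZF;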

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
TBD: This doesn't (can't) consult PASID translation tables yet, as we
     have no VMX code for this so far. I guess for this we will want to
     replace the direct ->read_msr(MSR_IA32_PASID, ...) with a new
     ->read_pasid() hook.
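
     A purely hypothetical shape of such a hook, just for discussion
     (not part of this patch), might be

         int (*read_pasid)(uint64_t *val, struct x86_emulate_ctxt *ctxt);

     with the VMX implementation then consulting the PASID translation
     tables.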
---
v7: Re-base.
v6: Re-base.
v5: New.

--- a/xen/arch/x86/arch.mk
+++ b/xen/arch/x86/arch.mk
@@ -48,6 +48,7 @@ $(call as-option-add,CFLAGS,CC,"clwb (%r
 $(call as-option-add,CFLAGS,CC,".equ \"x\"$$(comma)1",-DHAVE_AS_QUOTED_SYM)
 $(call as-option-add,CFLAGS,CC,"invpcid (%rax)$$(comma)%rax",-DHAVE_AS_INVPCID)
 $(call as-option-add,CFLAGS,CC,"movdiri %rax$$(comma)(%rax)",-DHAVE_AS_MOVDIR)
+$(call as-option-add,CFLAGS,CC,"enqcmd (%rax)$$(comma)%rax",-DHAVE_AS_ENQCMD)
 
 # GAS's idea of true is -1.  Clang's idea is 1
 $(call as-option-add,CFLAGS,CC,\
--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -855,6 +855,7 @@ struct x86_emulate_state {
     } rmw;
     enum {
         blk_NONE,
+        blk_enqcmd,
         blk_movdir,
     } blk;
     uint8_t modrm, modrm_mod, modrm_reg, modrm_rm;
@@ -901,6 +902,7 @@ typedef union {
     uint64_t __attribute__ ((aligned(16))) xmm[2];
     uint64_t __attribute__ ((aligned(32))) ymm[4];
     uint64_t __attribute__ ((aligned(64))) zmm[8];
+    uint32_t data32[16];
 } mmval_t;
 
 /*
@@ -1922,6 +1924,7 @@ amd_like(const struct x86_emulate_ctxt *
 #define vcpu_has_rdpid()       (ctxt->cpuid->feat.rdpid)
 #define vcpu_has_movdiri()     (ctxt->cpuid->feat.movdiri)
 #define vcpu_has_movdir64b()   (ctxt->cpuid->feat.movdir64b)
+#define vcpu_has_enqcmd()      (ctxt->cpuid->feat.enqcmd)
 #define vcpu_has_avx512_4vnniw() (ctxt->cpuid->feat.avx512_4vnniw)
 #define vcpu_has_avx512_4fmaps() (ctxt->cpuid->feat.avx512_4fmaps)
 #define vcpu_has_avx512_bf16() (ctxt->cpuid->feat.avx512_bf16)
@@ -10200,6 +10203,36 @@ x86_emulate(
         state->simd_size = simd_none;
         break;
 
+    case X86EMUL_OPC_F2(0x0f38, 0xf8): /* enqcmd r,m512 */
+    case X86EMUL_OPC_F3(0x0f38, 0xf8): /* enqcmds r,m512 */
+        host_and_vcpu_must_have(enqcmd);
+        generate_exception_if(ea.type != OP_MEM, EXC_UD);
+        generate_exception_if(vex.pfx != vex_f2 && !mode_ring0(), EXC_GP, 0);
+        src.val = truncate_ea(*dst.reg);
+        generate_exception_if(!is_aligned(x86_seg_es, src.val, 64, ctxt, ops),
+                              EXC_GP, 0);
+        fail_if(!ops->blk);
+        BUILD_BUG_ON(sizeof(*mmvalp) < 64);
+        if ( (rc = ops->read(ea.mem.seg, ea.mem.off, mmvalp, 64,
+                             ctxt)) != X86EMUL_OKAY )
+            goto done;
+        if ( vex.pfx == vex_f2 ) /* enqcmd */
+        {
+            fail_if(!ops->read_msr);
+            if ( (rc = ops->read_msr(MSR_IA32_PASID,
+                                     &msr_val, ctxt)) != X86EMUL_OKAY )
+                goto done;
+            generate_exception_if(!(msr_val & PASID_VALID), EXC_GP, 0);
+            mmvalp->data32[0] = MASK_EXTR(msr_val, PASID_PASID_MASK);
+        }
+        mmvalp->data32[0] &= ~0x7ff00000;
+        state->blk = blk_enqcmd;
+        if ( (rc = ops->blk(x86_seg_es, src.val, mmvalp, 64, &_regs.eflags,
+                            state, ctxt)) != X86EMUL_OKAY )
+            goto done;
+        state->simd_size = simd_none;
+        break;
+
     case X86EMUL_OPC(0x0f38, 0xf9): /* movdiri mem,r */
         host_and_vcpu_must_have(movdiri);
         generate_exception_if(dst.type != OP_MEM, EXC_UD);
@@ -11480,11 +11513,36 @@ int x86_emul_blk(
 {
     switch ( state->blk )
     {
+        bool zf;
+
         /*
          * Throughout this switch(), memory clobbers are used to compensate
          * that other operands may not properly express the (full) memory
          * ranges covered.
          */
+    case blk_enqcmd:
+        ASSERT(bytes == 64);
+        if ( ((unsigned long)ptr & 0x3f) )
+        {
+            ASSERT_UNREACHABLE();
+            return X86EMUL_UNHANDLEABLE;
+        }
+        *eflags &= ~EFLAGS_MASK;
+#ifdef HAVE_AS_ENQCMD
+        asm ( "enqcmds (%[src]), %[dst]" ASM_FLAG_OUT(, "; setz %0")
+              : [zf] ASM_FLAG_OUT("=@ccz", "=qm") (zf)
+              : [src] "r" (data), [dst] "r" (ptr) : "memory" );
+#else
+        /* enqcmds (%rsi), %rdi */
+        asm ( ".byte 0xf3, 0x0f, 0x38, 0xf8, 0x3e"
+              ASM_FLAG_OUT(, "; setz %[zf]")
+              : [zf] ASM_FLAG_OUT("=@ccz", "=qm") (zf)
+              : "S" (data), "D" (ptr) : "memory" );
+#endif
+        if ( zf )
+            *eflags |= X86_EFLAGS_ZF;
+        break;
+
     case blk_movdir:
         switch ( bytes )
         {
--- a/xen/include/asm-x86/cpufeature.h
+++ b/xen/include/asm-x86/cpufeature.h
@@ -120,6 +120,7 @@
 #define cpu_has_rdpid           boot_cpu_has(X86_FEATURE_RDPID)
 #define cpu_has_movdiri         boot_cpu_has(X86_FEATURE_MOVDIRI)
 #define cpu_has_movdir64b       boot_cpu_has(X86_FEATURE_MOVDIR64B)
+#define cpu_has_enqcmd          boot_cpu_has(X86_FEATURE_ENQCMD)
 
 /* CPUID level 0x80000007.edx */
 #define cpu_has_itsc            boot_cpu_has(X86_FEATURE_ITSC)
--- a/xen/include/asm-x86/msr-index.h
+++ b/xen/include/asm-x86/msr-index.h
@@ -420,6 +420,10 @@
 #define MSR_IA32_TSC_DEADLINE		0x000006E0
 #define MSR_IA32_ENERGY_PERF_BIAS	0x000001b0
 
+#define MSR_IA32_PASID			0x00000d93
+#define  PASID_PASID_MASK		0x000fffff
+#define  PASID_VALID			0x80000000
+
 /* Platform Shared Resource MSRs */
 #define MSR_IA32_CMT_EVTSEL		0x00000c8d
 #define MSR_IA32_CMT_EVTSEL_UE_MASK	0x0000ffff
--- a/xen/include/public/arch-x86/cpufeatureset.h
+++ b/xen/include/public/arch-x86/cpufeatureset.h
@@ -240,6 +240,7 @@ XEN_CPUFEATURE(RDPID,         6*32+22) /
 XEN_CPUFEATURE(CLDEMOTE,      6*32+25) /*A  CLDEMOTE instruction */
 XEN_CPUFEATURE(MOVDIRI,       6*32+27) /*A  MOVDIRI instruction */
 XEN_CPUFEATURE(MOVDIR64B,     6*32+28) /*A  MOVDIR64B instruction */
+XEN_CPUFEATURE(ENQCMD,        6*32+29) /*   ENQCMD{,S} instructions */
 
 /* AMD-defined CPU features, CPUID level 0x80000007.edx, word 7 */
 XEN_CPUFEATURE(ITSC,          7*32+ 8) /*   Invariant TSC */




* [PATCH v8 04/12] x86emul: support SERIALIZE
  2020-05-05  8:10 [PATCH v8 00/12] x86emul: further work Jan Beulich
                   ` (2 preceding siblings ...)
  2020-05-05  8:13 ` [PATCH v8 03/12] x86emul: support ENQCMD insns Jan Beulich
@ 2020-05-05  8:14 ` Jan Beulich
  2020-05-07 19:32   ` Andrew Cooper
  2020-05-05  8:14 ` [PATCH v8 05/12] x86emul: support X{SUS,RES}LDTRK Jan Beulich
                   ` (8 subsequent siblings)
  12 siblings, 1 reply; 51+ messages in thread
From: Jan Beulich @ 2020-05-05  8:14 UTC (permalink / raw)
  To: xen-devel; +Cc: Andrew Cooper, Wei Liu, Roger Pau Monne

... enabling its use by all guest kinds at the same time.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v7: Re-base.
v6: New.

--- a/tools/libxl/libxl_cpuid.c
+++ b/tools/libxl/libxl_cpuid.c
@@ -214,6 +214,7 @@ int libxl_cpuid_parse_config(libxl_cpuid
         {"avx512-4vnniw",0x00000007,  0, CPUID_REG_EDX,  2,  1},
         {"avx512-4fmaps",0x00000007,  0, CPUID_REG_EDX,  3,  1},
         {"md-clear",     0x00000007,  0, CPUID_REG_EDX, 10,  1},
+        {"serialize",    0x00000007,  0, CPUID_REG_EDX, 14,  1},
         {"cet-ibt",      0x00000007,  0, CPUID_REG_EDX, 20,  1},
         {"ibrsb",        0x00000007,  0, CPUID_REG_EDX, 26,  1},
         {"stibp",        0x00000007,  0, CPUID_REG_EDX, 27,  1},
--- a/tools/misc/xen-cpuid.c
+++ b/tools/misc/xen-cpuid.c
@@ -161,6 +161,7 @@ static const char *const str_7d0[32] =
 
     [10] = "md-clear",
     /* 12 */                [13] = "tsx-force-abort",
+    [14] = "serialize",
 
     [18] = "pconfig",
     [20] = "cet-ibt",
--- a/tools/tests/x86_emulator/x86-emulate.h
+++ b/tools/tests/x86_emulator/x86-emulate.h
@@ -158,6 +158,7 @@ static inline bool xcr0_mask(uint64_t ma
 #define cpu_has_movdir64b  cp.feat.movdir64b
 #define cpu_has_avx512_4vnniw (cp.feat.avx512_4vnniw && xcr0_mask(0xe6))
 #define cpu_has_avx512_4fmaps (cp.feat.avx512_4fmaps && xcr0_mask(0xe6))
+#define cpu_has_serialize  cp.feat.serialize
 #define cpu_has_avx512_bf16 (cp.feat.avx512_bf16 && xcr0_mask(0xe6))
 
 #define cpu_has_xgetbv1   (cpu_has_xsave && cp.xstate.xgetbv1)
--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -1927,6 +1927,7 @@ amd_like(const struct x86_emulate_ctxt *
 #define vcpu_has_enqcmd()      (ctxt->cpuid->feat.enqcmd)
 #define vcpu_has_avx512_4vnniw() (ctxt->cpuid->feat.avx512_4vnniw)
 #define vcpu_has_avx512_4fmaps() (ctxt->cpuid->feat.avx512_4fmaps)
+#define vcpu_has_serialize()   (ctxt->cpuid->feat.serialize)
 #define vcpu_has_avx512_bf16() (ctxt->cpuid->feat.avx512_bf16)
 
 #define vcpu_must_have(feat) \
@@ -5660,6 +5661,18 @@ x86_emulate(
                 goto done;
             break;
 
+        case 0xe8:
+            switch ( vex.pfx )
+            {
+            case vex_none: /* serialize */
+                host_and_vcpu_must_have(serialize);
+                asm volatile ( ".byte 0x0f, 0x01, 0xe8" );
+                break;
+            default:
+                goto unimplemented_insn;
+            }
+            break;
+
         case 0xf8: /* swapgs */
             generate_exception_if(!mode_64bit(), EXC_UD);
             generate_exception_if(!mode_ring0(), EXC_GP, 0);
--- a/xen/include/asm-x86/cpufeature.h
+++ b/xen/include/asm-x86/cpufeature.h
@@ -129,6 +129,7 @@
 #define cpu_has_avx512_4vnniw   boot_cpu_has(X86_FEATURE_AVX512_4VNNIW)
 #define cpu_has_avx512_4fmaps   boot_cpu_has(X86_FEATURE_AVX512_4FMAPS)
 #define cpu_has_tsx_force_abort boot_cpu_has(X86_FEATURE_TSX_FORCE_ABORT)
+#define cpu_has_serialize       boot_cpu_has(X86_FEATURE_SERIALIZE)
 
 /* CPUID level 0x00000007:1.eax */
 #define cpu_has_avx512_bf16     boot_cpu_has(X86_FEATURE_AVX512_BF16)
--- a/xen/include/public/arch-x86/cpufeatureset.h
+++ b/xen/include/public/arch-x86/cpufeatureset.h
@@ -258,6 +258,7 @@ XEN_CPUFEATURE(AVX512_4VNNIW, 9*32+ 2) /
 XEN_CPUFEATURE(AVX512_4FMAPS, 9*32+ 3) /*A  AVX512 Multiply Accumulation Single Precision */
 XEN_CPUFEATURE(MD_CLEAR,      9*32+10) /*A  VERW clears microarchitectural buffers */
 XEN_CPUFEATURE(TSX_FORCE_ABORT, 9*32+13) /* MSR_TSX_FORCE_ABORT.RTM_ABORT */
+XEN_CPUFEATURE(SERIALIZE,     9*32+14) /*A  SERIALIZE insn */
 XEN_CPUFEATURE(CET_IBT,       9*32+20) /*   CET - Indirect Branch Tracking */
 XEN_CPUFEATURE(IBRSB,         9*32+26) /*A  IBRS and IBPB support (used by Intel) */
 XEN_CPUFEATURE(STIBP,         9*32+27) /*A  STIBP */




* [PATCH v8 05/12] x86emul: support X{SUS,RES}LDTRK
  2020-05-05  8:10 [PATCH v8 00/12] x86emul: further work Jan Beulich
                   ` (3 preceding siblings ...)
  2020-05-05  8:14 ` [PATCH v8 04/12] x86emul: support SERIALIZE Jan Beulich
@ 2020-05-05  8:14 ` Jan Beulich
  2020-05-07 20:13   ` Andrew Cooper
  2020-05-05  8:15 ` [PATCH v8 06/12] x86/HVM: make hvmemul_blk() capable of handling r/o operations Jan Beulich
                   ` (7 subsequent siblings)
  12 siblings, 1 reply; 51+ messages in thread
From: Jan Beulich @ 2020-05-05  8:14 UTC (permalink / raw)
  To: xen-devel; +Cc: Andrew Cooper, Wei Liu, Roger Pau Monne

There's nothing to be done by the emulator, as we unconditionally abort
any XBEGIN.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v6: New.

--- a/tools/libxl/libxl_cpuid.c
+++ b/tools/libxl/libxl_cpuid.c
@@ -208,6 +208,7 @@ int libxl_cpuid_parse_config(libxl_cpuid
         {"avx512-vnni",  0x00000007,  0, CPUID_REG_ECX, 11,  1},
         {"avx512-bitalg",0x00000007,  0, CPUID_REG_ECX, 12,  1},
         {"avx512-vpopcntdq",0x00000007,0,CPUID_REG_ECX, 14,  1},
+        {"tsxldtrk",     0x00000007,  0, CPUID_REG_ECX, 16,  1},
         {"rdpid",        0x00000007,  0, CPUID_REG_ECX, 22,  1},
         {"cldemote",     0x00000007,  0, CPUID_REG_ECX, 25,  1},
 
--- a/tools/misc/xen-cpuid.c
+++ b/tools/misc/xen-cpuid.c
@@ -128,6 +128,7 @@ static const char *const str_7c0[32] =
     [10] = "vpclmulqdq",       [11] = "avx512_vnni",
     [12] = "avx512_bitalg",
     [14] = "avx512_vpopcntdq",
+    [16] = "tsxldtrk",
 
     [22] = "rdpid",
     /* 24 */                   [25] = "cldemote",
--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -1921,6 +1921,7 @@ amd_like(const struct x86_emulate_ctxt *
 #define vcpu_has_avx512_vnni() (ctxt->cpuid->feat.avx512_vnni)
 #define vcpu_has_avx512_bitalg() (ctxt->cpuid->feat.avx512_bitalg)
 #define vcpu_has_avx512_vpopcntdq() (ctxt->cpuid->feat.avx512_vpopcntdq)
+#define vcpu_has_tsxldtrk()    (ctxt->cpuid->feat.tsxldtrk)
 #define vcpu_has_rdpid()       (ctxt->cpuid->feat.rdpid)
 #define vcpu_has_movdiri()     (ctxt->cpuid->feat.movdiri)
 #define vcpu_has_movdir64b()   (ctxt->cpuid->feat.movdir64b)
@@ -5668,6 +5669,20 @@ x86_emulate(
                 host_and_vcpu_must_have(serialize);
                 asm volatile ( ".byte 0x0f, 0x01, 0xe8" );
                 break;
+            case vex_f2: /* xsusldtrk */
+                vcpu_must_have(tsxldtrk);
+                break;
+            default:
+                goto unimplemented_insn;
+            }
+            break;
+
+        case 0xe9:
+            switch ( vex.pfx )
+            {
+            case vex_f2: /* xresldtrk */
+                vcpu_must_have(tsxldtrk);
+                break;
             default:
                 goto unimplemented_insn;
             }
--- a/xen/include/public/arch-x86/cpufeatureset.h
+++ b/xen/include/public/arch-x86/cpufeatureset.h
@@ -236,6 +236,7 @@ XEN_CPUFEATURE(VPCLMULQDQ,    6*32+10) /
 XEN_CPUFEATURE(AVX512_VNNI,   6*32+11) /*A  Vector Neural Network Instrs */
 XEN_CPUFEATURE(AVX512_BITALG, 6*32+12) /*A  Support for VPOPCNT[B,W] and VPSHUFBITQMB */
 XEN_CPUFEATURE(AVX512_VPOPCNTDQ, 6*32+14) /*A  POPCNT for vectors of DW/QW */
+XEN_CPUFEATURE(TSXLDTRK,      6*32+16) /*A  TSX load tracking suspend/resume insns */
 XEN_CPUFEATURE(RDPID,         6*32+22) /*A  RDPID instruction */
 XEN_CPUFEATURE(CLDEMOTE,      6*32+25) /*A  CLDEMOTE instruction */
 XEN_CPUFEATURE(MOVDIRI,       6*32+27) /*A  MOVDIRI instruction */
--- a/xen/tools/gen-cpuid.py
+++ b/xen/tools/gen-cpuid.py
@@ -284,6 +284,9 @@ def crunch_numbers(state):
         # as dependent features simplifies Xen's logic, and prevents the guest
         # from seeing implausible configurations.
         IBRSB: [STIBP, SSBD],
+
+        # In principle the TSXLDTRK insns could also be considered independent.
+        RTM: [TSXLDTRK],
     }
 
     deep_features = tuple(sorted(deps.keys()))




* [PATCH v8 06/12] x86/HVM: make hvmemul_blk() capable of handling r/o operations
  2020-05-05  8:10 [PATCH v8 00/12] x86emul: further work Jan Beulich
                   ` (4 preceding siblings ...)
  2020-05-05  8:14 ` [PATCH v8 05/12] x86emul: support X{SUS,RES}LDTRK Jan Beulich
@ 2020-05-05  8:15 ` Jan Beulich
  2020-05-05 14:20   ` Paul Durrant
  2020-05-07 20:34   ` Andrew Cooper
  2020-05-05  8:15 ` [PATCH v8 07/12] x86emul: support FNSTENV and FNSAVE Jan Beulich
                   ` (6 subsequent siblings)
  12 siblings, 2 replies; 51+ messages in thread
From: Jan Beulich @ 2020-05-05  8:15 UTC (permalink / raw)
  To: xen-devel; +Cc: Andrew Cooper, Paul Durrant, Wei Liu, Roger Pau Monne

In preparation for handling e.g. FLDENV or {F,FX,X}RSTOR here as well.
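
A sketch of why the distinction matters (names as in the hunk below):
unconditionally demanding write access would make a pure load like
FLDENV fault on pages the guest has mapped read-only:

uint32_t pfec = PFEC_page_present;

if ( x86_insn_is_mem_write(state, ctxt) ) /* e.g. FNSAVE */
    pfec |= PFEC_write_access;
/* else, e.g. FLDENV: probing with PFEC_write_access set could yield a
 * spurious #PF on r/o mappings */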

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v8: New (could be folded into "x86emul: support MOVDIR{I,64B} insns",
    but would invalidate Paul's R-b there).

--- a/xen/arch/x86/hvm/emulate.c
+++ b/xen/arch/x86/hvm/emulate.c
@@ -1453,7 +1453,7 @@ static int hvmemul_blk(
     struct hvm_emulate_ctxt *hvmemul_ctxt =
         container_of(ctxt, struct hvm_emulate_ctxt, ctxt);
     unsigned long addr;
-    uint32_t pfec = PFEC_page_present | PFEC_write_access;
+    uint32_t pfec = PFEC_page_present;
     int rc;
     void *mapping = NULL;
 
@@ -1462,6 +1462,9 @@ static int hvmemul_blk(
     if ( rc != X86EMUL_OKAY || !bytes )
         return rc;
 
+    if ( x86_insn_is_mem_write(state, ctxt) )
+        pfec |= PFEC_write_access;
+
     if ( is_x86_system_segment(seg) )
         pfec |= PFEC_implicit;
     else if ( hvmemul_ctxt->seg_reg[x86_seg_ss].dpl == 3 )




* [PATCH v8 07/12] x86emul: support FNSTENV and FNSAVE
  2020-05-05  8:10 [PATCH v8 00/12] x86emul: further work Jan Beulich
                   ` (5 preceding siblings ...)
  2020-05-05  8:15 ` [PATCH v8 06/12] x86/HVM: make hvmemul_blk() capable of handling r/o operations Jan Beulich
@ 2020-05-05  8:15 ` Jan Beulich
  2020-05-05 12:36   ` Jan Beulich
  2020-05-08 17:58   ` Andrew Cooper
  2020-05-05  8:16 ` [PATCH v8 08/12] x86emul: support FLDENV and FRSTOR Jan Beulich
                   ` (5 subsequent siblings)
  12 siblings, 2 replies; 51+ messages in thread
From: Jan Beulich @ 2020-05-05  8:15 UTC (permalink / raw)
  To: xen-devel; +Cc: Andrew Cooper, Wei Liu, Roger Pau Monne

To avoid introducing another boolean into emulator state, the
rex_prefix field gets (ab)used to convey the real/VM86 vs protected mode
info (affecting structure layout, albeit not size) to x86_emul_blk().
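
The mode matters because the real/VM86 image stores 20-bit linear
FIP/FDP values split across two fields, instead of selector:offset
pairs. A hedged sketch of the conversion done in the block handler
(linear address formed as selector * 16 + offset):

/* protected-mode image -> real-mode image (field names as added below) */
unsigned int fip = env.mode.prot.fip + (env.mode.prot.fcs << 4);

env.mode.real.fip_lo = fip;       /* low 16 bits */
env.mode.real.fip_hi = fip >> 16; /* remaining 4 bits */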

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
TBD: The full 16-bit padding fields in the 32-bit structures get filled
     with all ones by modern CPUs (i.e. contrary to what the comment says
     for FIP and FDP). We may want to mirror this as well (for the real
     mode variant), even if those fields' contents are unspecified.
---
v7: New.

--- a/tools/tests/x86_emulator/x86-emulate.h
+++ b/tools/tests/x86_emulator/x86-emulate.h
@@ -120,6 +120,7 @@ static inline bool xcr0_mask(uint64_t ma
 }
 
 #define cache_line_size() (cp.basic.clflush_size * 8)
+#define cpu_has_fpu        cp.basic.fpu
 #define cpu_has_mmx        cp.basic.mmx
 #define cpu_has_fxsr       cp.basic.fxsr
 #define cpu_has_sse        cp.basic.sse
--- a/tools/tests/x86_emulator/test_x86_emulator.c
+++ b/tools/tests/x86_emulator/test_x86_emulator.c
@@ -748,6 +748,25 @@ static struct x86_emulate_ops emulops =
 
 #define MMAP_ADDR 0x100000
 
+/*
+ * 64-bit OSes may not (be able to) properly restore the two selectors in
+ * the FPU environment. Zap them so that memcmp() on two saved images will
+ * work regardless of whether a context switch occurred in the middle.
+ */
+static void zap_fpsel(unsigned int *env, bool is_32bit)
+{
+    if ( is_32bit )
+    {
+        env[4] &= ~0xffff;
+        env[6] &= ~0xffff;
+    }
+    else
+    {
+        env[2] &= ~0xffff;
+        env[3] &= ~0xffff;
+    }
+}
+
 #ifdef __x86_64__
 # define STKVAL_DISP 64
 static const struct {
@@ -2394,6 +2413,62 @@ int main(int argc, char **argv)
         printf("okay\n");
     }
     else
+        printf("skipped\n");
+
+    printf("%-40s", "Testing fnstenv 4(%ecx)...");
+    if ( stack_exec && cpu_has_fpu )
+    {
+        const uint16_t three = 3;
+
+        asm volatile ( "fninit\n\t"
+                       "fld1\n\t"
+                       "fidivs %1\n\t"
+                       "fstenv %0"
+                       : "=m" (res[9]) : "m" (three) : "memory" );
+        zap_fpsel(&res[9], true);
+        instr[0] = 0xd9; instr[1] = 0x71; instr[2] = 0x04;
+        regs.eip = (unsigned long)&instr[0];
+        regs.ecx = (unsigned long)res;
+        res[8] = 0xaa55aa55;
+        rc = x86_emulate(&ctxt, &emulops);
+        zap_fpsel(&res[1], true);
+        if ( (rc != X86EMUL_OKAY) ||
+             memcmp(res + 1, res + 9, 28) ||
+             res[8] != 0xaa55aa55 ||
+             (regs.eip != (unsigned long)&instr[3]) )
+            goto fail;
+        printf("okay\n");
+    }
+    else
+        printf("skipped\n");
+
+    printf("%-40s", "Testing 16-bit fnsave (%ecx)...");
+    if ( stack_exec && cpu_has_fpu )
+    {
+        const uint16_t five = 5;
+
+        asm volatile ( "fninit\n\t"
+                       "fld1\n\t"
+                       "fidivs %1\n\t"
+                       "fsaves %0"
+                       : "=m" (res[25]) : "m" (five) : "memory" );
+        zap_fpsel(&res[25], false);
+        asm volatile ( "frstors %0" :: "m" (res[25]) : "memory" );
+        instr[0] = 0x66; instr[1] = 0xdd; instr[2] = 0x31;
+        regs.eip = (unsigned long)&instr[0];
+        regs.ecx = (unsigned long)res;
+        res[23] = 0xaa55aa55;
+        res[24] = 0xaa55aa55;
+        rc = x86_emulate(&ctxt, &emulops);
+        if ( (rc != X86EMUL_OKAY) ||
+             memcmp(res, res + 25, 94) ||
+             (res[23] >> 16) != 0xaa55 ||
+             res[24] != 0xaa55aa55 ||
+             (regs.eip != (unsigned long)&instr[3]) )
+            goto fail;
+        printf("okay\n");
+    }
+    else
         printf("skipped\n");
 
     printf("%-40s", "Testing movq %mm3,(%ecx)...");
--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -856,6 +856,9 @@ struct x86_emulate_state {
     enum {
         blk_NONE,
         blk_enqcmd,
+#ifndef X86EMUL_NO_FPU
+        blk_fst, /* FNSTENV, FNSAVE */
+#endif
         blk_movdir,
     } blk;
     uint8_t modrm, modrm_mod, modrm_reg, modrm_rm;
@@ -897,6 +900,50 @@ struct x86_emulate_state {
 #define PTR_POISON NULL /* 32-bit builds are for user-space, so NULL is OK. */
 #endif
 
+#ifndef X86EMUL_NO_FPU
+struct x87_env16 {
+    uint16_t fcw;
+    uint16_t fsw;
+    uint16_t ftw;
+    union {
+        struct {
+            uint16_t fip_lo;
+            uint16_t fop:11, :1, fip_hi:4;
+            uint16_t fdp_lo;
+            uint16_t :12, fdp_hi:4;
+        } real;
+        struct {
+            uint16_t fip;
+            uint16_t fcs;
+            uint16_t fdp;
+            uint16_t fds;
+        } prot;
+    } mode;
+};
+
+struct x87_env32 {
+    uint32_t fcw:16, :16;
+    uint32_t fsw:16, :16;
+    uint32_t ftw:16, :16;
+    union {
+        struct {
+            /* some CPUs/FPUs also store the full FIP here */
+            uint32_t fip_lo:16, :16;
+            uint32_t fop:11, :1, fip_hi:16, :4;
+            /* some CPUs/FPUs also store the full FDP here */
+            uint32_t fdp_lo:16, :16;
+            uint32_t :12, fdp_hi:16, :4;
+        } real;
+        struct {
+            uint32_t fip;
+            uint32_t fcs:16, fop:11, :5;
+            uint32_t fdp;
+            uint32_t fds:16, :16;
+        } prot;
+    } mode;
+};
+#endif
+
 typedef union {
     uint64_t mmx;
     uint64_t __attribute__ ((aligned(16))) xmm[2];
@@ -4912,9 +4959,19 @@ x86_emulate(
                     goto done;
                 emulate_fpu_insn_memsrc(b, modrm_reg & 7, src.val);
                 break;
-            case 6: /* fnstenv - TODO */
+            case 6: /* fnstenv */
+                fail_if(!ops->blk);
+                state->blk = blk_fst;
+                /* REX is meaningless for this insn by this point. */
+                rex_prefix = in_protmode(ctxt, ops);
+                if ( (rc = ops->blk(ea.mem.seg, ea.mem.off, NULL,
+                                    op_bytes > 2 ? sizeof(struct x87_env32)
+                                                 : sizeof(struct x87_env16),
+                                    &_regs.eflags,
+                                    state, ctxt)) != X86EMUL_OKAY )
+                    goto done;
                 state->fpu_ctrl = true;
-                goto unimplemented_insn;
+                break;
             case 7: /* fnstcw m2byte */
                 state->fpu_ctrl = true;
             fpu_memdst16:
@@ -5068,9 +5125,21 @@ x86_emulate(
                 emulate_fpu_insn_memdst(b, modrm_reg & 7, dst.val);
                 break;
             case 4: /* frstor - TODO */
-            case 6: /* fnsave - TODO */
                 state->fpu_ctrl = true;
                 goto unimplemented_insn;
+            case 6: /* fnsave */
+                fail_if(!ops->blk);
+                state->blk = blk_fst;
+                /* REX is meaningless for this insn by this point. */
+                rex_prefix = in_protmode(ctxt, ops);
+                if ( (rc = ops->blk(ea.mem.seg, ea.mem.off, NULL,
+                                    op_bytes > 2 ? sizeof(struct x87_env32) + 80
+                                                 : sizeof(struct x87_env16) + 80,
+                                    &_regs.eflags,
+                                    state, ctxt)) != X86EMUL_OKAY )
+                    goto done;
+                state->fpu_ctrl = true;
+                break;
             case 7: /* fnstsw m2byte */
                 state->fpu_ctrl = true;
                 goto fpu_memdst16;
@@ -11542,6 +11611,12 @@ int x86_emul_blk(
     switch ( state->blk )
     {
         bool zf;
+        struct {
+            struct x87_env32 env;
+            struct {
+               uint8_t bytes[10];
+            } freg[8];
+        } fpstate;
 
         /*
          * Throughout this switch(), memory clobbers are used to compensate
@@ -11571,6 +11646,93 @@ int x86_emul_blk(
             *eflags |= X86_EFLAGS_ZF;
         break;
 
+#ifndef X86EMUL_NO_FPU
+
+    case blk_fst:
+        ASSERT(!data);
+
+        if ( bytes > sizeof(fpstate.env) )
+            asm ( "fnsave %0" : "=m" (fpstate) );
+        else
+            asm ( "fnstenv %0" : "=m" (fpstate.env) );
+
+        /* state->rex_prefix carries CR0.PE && !EFLAGS.VM setting */
+        switch ( bytes )
+        {
+        case sizeof(fpstate.env):
+        case sizeof(fpstate):
+            if ( !state->rex_prefix )
+            {
+                unsigned int fip = fpstate.env.mode.prot.fip +
+                                   (fpstate.env.mode.prot.fcs << 4);
+                unsigned int fdp = fpstate.env.mode.prot.fdp +
+                                   (fpstate.env.mode.prot.fds << 4);
+                unsigned int fop = fpstate.env.mode.prot.fop;
+
+                memset(&fpstate.env.mode, 0, sizeof(fpstate.env.mode));
+                fpstate.env.mode.real.fip_lo = fip;
+                fpstate.env.mode.real.fip_hi = fip >> 16;
+                fpstate.env.mode.real.fop = fop;
+                fpstate.env.mode.real.fdp_lo = fdp;
+                fpstate.env.mode.real.fdp_hi = fdp >> 16;
+            }
+            memcpy(ptr, &fpstate.env, sizeof(fpstate.env));
+            if ( bytes == sizeof(fpstate.env) )
+                ptr = NULL;
+            else
+                ptr += sizeof(fpstate.env);
+            break;
+
+        case sizeof(struct x87_env16):
+        case sizeof(struct x87_env16) + sizeof(fpstate.freg):
+            if ( state->rex_prefix )
+            {
+                struct x87_env16 *env = ptr;
+
+                env->fcw = fpstate.env.fcw;
+                env->fsw = fpstate.env.fsw;
+                env->ftw = fpstate.env.ftw;
+                env->mode.prot.fip = fpstate.env.mode.prot.fip;
+                env->mode.prot.fcs = fpstate.env.mode.prot.fcs;
+                env->mode.prot.fdp = fpstate.env.mode.prot.fdp;
+                env->mode.prot.fds = fpstate.env.mode.prot.fds;
+            }
+            else
+            {
+                unsigned int fip = fpstate.env.mode.prot.fip +
+                                   (fpstate.env.mode.prot.fcs << 4);
+                unsigned int fdp = fpstate.env.mode.prot.fdp +
+                                   (fpstate.env.mode.prot.fds << 4);
+                struct x87_env16 env = {
+                    .fcw = fpstate.env.fcw,
+                    .fsw = fpstate.env.fsw,
+                    .ftw = fpstate.env.ftw,
+                    .mode.real.fip_lo = fip,
+                    .mode.real.fip_hi = fip >> 16,
+                    .mode.real.fop = fpstate.env.mode.prot.fop,
+                    .mode.real.fdp_lo = fdp,
+                    .mode.real.fdp_hi = fdp >> 16
+                };
+
+                memcpy(ptr, &env, sizeof(env));
+            }
+            if ( bytes == sizeof(struct x87_env16) )
+                ptr = NULL;
+            else
+                ptr += sizeof(struct x87_env16);
+            break;
+
+        default:
+            ASSERT_UNREACHABLE();
+            return X86EMUL_UNHANDLEABLE;
+        }
+
+        if ( ptr )
+            memcpy(ptr, fpstate.freg, sizeof(fpstate.freg));
+        break;
+
+#endif /* X86EMUL_NO_FPU */
+
     case blk_movdir:
         switch ( bytes )
         {




* [PATCH v8 08/12] x86emul: support FLDENV and FRSTOR
  2020-05-05  8:10 [PATCH v8 00/12] x86emul: further work Jan Beulich
                   ` (6 preceding siblings ...)
  2020-05-05  8:15 ` [PATCH v8 07/12] x86emul: support FNSTENV and FNSAVE Jan Beulich
@ 2020-05-05  8:16 ` Jan Beulich
  2020-05-08 13:37   ` Roger Pau Monné
  2020-05-08 18:19   ` Andrew Cooper
  2020-05-05  8:16 ` [PATCH v8 09/12] x86emul: support FXSAVE/FXRSTOR Jan Beulich
                   ` (4 subsequent siblings)
  12 siblings, 2 replies; 51+ messages in thread
From: Jan Beulich @ 2020-05-05  8:16 UTC (permalink / raw)
  To: xen-devel; +Cc: Andrew Cooper, Wei Liu, Roger Pau Monne

While the Intel SDM claims that FRSTOR itself may raise #MF upon
completion, Intel has confirmed this to be a doc error which will be
corrected in due course; the behavior is like FLDENV's, as old hard copy
manuals describe it. Otherwise we'd have to emulate the insn by filling
st(N) in suitable order, followed by FLDENV.
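
The FNOP stub used in the hunks below is what raises any pending #MF up
front: despite the leading "N" (which stands for "no operation", not
"no-wait"), FNOP does check for pending unmasked x87 exceptions, unlike
the no-wait control insns (FNSAVE, FNSTENV, etc.). Sketch:

/* Raise #MF now if there are pending unmasked exceptions. */
emulate_fpu_insn_stub(0xd9, 0xd0 /* fnop */);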

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v7: New.

--- a/tools/tests/x86_emulator/test_x86_emulator.c
+++ b/tools/tests/x86_emulator/test_x86_emulator.c
@@ -2442,6 +2442,27 @@ int main(int argc, char **argv)
     else
         printf("skipped\n");
 
+    printf("%-40s", "Testing fldenv 8(%edx)...");
+    if ( stack_exec && cpu_has_fpu )
+    {
+        asm volatile ( "fnstenv %0\n\t"
+                       "fninit"
+                       : "=m" (res[2]) :: "memory" );
+        zap_fpsel(&res[2], true);
+        instr[0] = 0xd9; instr[1] = 0x62; instr[2] = 0x08;
+        regs.eip = (unsigned long)&instr[0];
+        regs.edx = (unsigned long)res;
+        rc = x86_emulate(&ctxt, &emulops);
+        asm volatile ( "fnstenv %0" : "=m" (res[9]) :: "memory" );
+        if ( (rc != X86EMUL_OKAY) ||
+             memcmp(res + 2, res + 9, 28) ||
+             (regs.eip != (unsigned long)&instr[3]) )
+            goto fail;
+        printf("okay\n");
+    }
+    else
+        printf("skipped\n");
+
     printf("%-40s", "Testing 16-bit fnsave (%ecx)...");
     if ( stack_exec && cpu_has_fpu )
     {
@@ -2468,6 +2489,31 @@ int main(int argc, char **argv)
             goto fail;
         printf("okay\n");
     }
+    else
+        printf("skipped\n");
+
+    printf("%-40s", "Testing frstor (%edx)...");
+    if ( stack_exec && cpu_has_fpu )
+    {
+        const uint16_t seven = 7;
+
+        asm volatile ( "fninit\n\t"
+                       "fld1\n\t"
+                       "fidivs %1\n\t"
+                       "fnsave %0\n\t"
+                       : "=&m" (res[0]) : "m" (seven) : "memory" );
+        zap_fpsel(&res[0], true);
+        instr[0] = 0xdd; instr[1] = 0x22;
+        regs.eip = (unsigned long)&instr[0];
+        regs.edx = (unsigned long)res;
+        rc = x86_emulate(&ctxt, &emulops);
+        asm volatile ( "fnsave %0" : "=m" (res[27]) :: "memory" );
+        if ( (rc != X86EMUL_OKAY) ||
+             memcmp(res, res + 27, 108) ||
+             (regs.eip != (unsigned long)&instr[2]) )
+            goto fail;
+        printf("okay\n");
+    }
     else
         printf("skipped\n");
 
--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -857,6 +857,7 @@ struct x86_emulate_state {
         blk_NONE,
         blk_enqcmd,
 #ifndef X86EMUL_NO_FPU
+        blk_fld, /* FLDENV, FRSTOR */
         blk_fst, /* FNSTENV, FNSAVE */
 #endif
         blk_movdir,
@@ -4948,21 +4949,14 @@ x86_emulate(
                 dst.bytes = 4;
                 emulate_fpu_insn_memdst(b, modrm_reg & 7, dst.val);
                 break;
-            case 4: /* fldenv - TODO */
-                state->fpu_ctrl = true;
-                goto unimplemented_insn;
-            case 5: /* fldcw m2byte */
-                state->fpu_ctrl = true;
-            fpu_memsrc16:
-                if ( (rc = ops->read(ea.mem.seg, ea.mem.off, &src.val,
-                                     2, ctxt)) != X86EMUL_OKAY )
-                    goto done;
-                emulate_fpu_insn_memsrc(b, modrm_reg & 7, src.val);
-                break;
+            case 4: /* fldenv */
+                /* Raise #MF now if there are pending unmasked exceptions. */
+                emulate_fpu_insn_stub(0xd9, 0xd0 /* fnop */);
+                /* fall through */
             case 6: /* fnstenv */
                 fail_if(!ops->blk);
-                state->blk = blk_fst;
-                /* REX is meaningless for this insn by this point. */
+                state->blk = modrm_reg & 2 ? blk_fst : blk_fld;
+                /* REX is meaningless for these insns by this point. */
                 rex_prefix = in_protmode(ctxt, ops);
                 if ( (rc = ops->blk(ea.mem.seg, ea.mem.off, NULL,
                                     op_bytes > 2 ? sizeof(struct x87_env32)
@@ -4972,6 +4966,14 @@ x86_emulate(
                     goto done;
                 state->fpu_ctrl = true;
                 break;
+            case 5: /* fldcw m2byte */
+                state->fpu_ctrl = true;
+            fpu_memsrc16:
+                if ( (rc = ops->read(ea.mem.seg, ea.mem.off, &src.val,
+                                     2, ctxt)) != X86EMUL_OKAY )
+                    goto done;
+                emulate_fpu_insn_memsrc(b, modrm_reg & 7, src.val);
+                break;
             case 7: /* fnstcw m2byte */
                 state->fpu_ctrl = true;
             fpu_memdst16:
@@ -5124,13 +5126,14 @@ x86_emulate(
                 dst.bytes = 8;
                 emulate_fpu_insn_memdst(b, modrm_reg & 7, dst.val);
                 break;
-            case 4: /* frstor - TODO */
-                state->fpu_ctrl = true;
-                goto unimplemented_insn;
+            case 4: /* frstor */
+                /* Raise #MF now if there are pending unmasked exceptions. */
+                emulate_fpu_insn_stub(0xd9, 0xd0 /* fnop */);
+                /* fall through */
             case 6: /* fnsave */
                 fail_if(!ops->blk);
-                state->blk = blk_fst;
-                /* REX is meaningless for this insn by this point. */
+                state->blk = modrm_reg & 2 ? blk_fst : blk_fld;
+                /* REX is meaningless for these insns by this point. */
                 rex_prefix = in_protmode(ctxt, ops);
                 if ( (rc = ops->blk(ea.mem.seg, ea.mem.off, NULL,
                                     op_bytes > 2 ? sizeof(struct x87_env32) + 80
@@ -11648,6 +11651,89 @@ int x86_emul_blk(
 
 #ifndef X86EMUL_NO_FPU
 
+    case blk_fld:
+        ASSERT(!data);
+
+        /* state->rex_prefix carries CR0.PE && !EFLAGS.VM setting */
+        switch ( bytes )
+        {
+        case sizeof(fpstate.env):
+        case sizeof(fpstate):
+            memcpy(&fpstate.env, ptr, sizeof(fpstate.env));
+            if ( !state->rex_prefix )
+            {
+                unsigned int fip = fpstate.env.mode.real.fip_lo +
+                                   (fpstate.env.mode.real.fip_hi << 16);
+                unsigned int fdp = fpstate.env.mode.real.fdp_lo +
+                                   (fpstate.env.mode.real.fdp_hi << 16);
+                unsigned int fop = fpstate.env.mode.real.fop;
+
+                fpstate.env.mode.prot.fip = fip & 0xf;
+                fpstate.env.mode.prot.fcs = fip >> 4;
+                fpstate.env.mode.prot.fop = fop;
+                fpstate.env.mode.prot.fdp = fdp & 0xf;
+                fpstate.env.mode.prot.fds = fdp >> 4;
+            }
+
+            if ( bytes == sizeof(fpstate.env) )
+                ptr = NULL;
+            else
+                ptr += sizeof(fpstate.env);
+            break;
+
+        case sizeof(struct x87_env16):
+        case sizeof(struct x87_env16) + sizeof(fpstate.freg):
+        {
+            const struct x87_env16 *env = ptr;
+
+            fpstate.env.fcw = env->fcw;
+            fpstate.env.fsw = env->fsw;
+            fpstate.env.ftw = env->ftw;
+
+            if ( state->rex_prefix )
+            {
+                fpstate.env.mode.prot.fip = env->mode.prot.fip;
+                fpstate.env.mode.prot.fcs = env->mode.prot.fcs;
+                fpstate.env.mode.prot.fdp = env->mode.prot.fdp;
+                fpstate.env.mode.prot.fds = env->mode.prot.fds;
+                fpstate.env.mode.prot.fop = 0; /* unknown */
+            }
+            else
+            {
+                unsigned int fip = env->mode.real.fip_lo +
+                                   (env->mode.real.fip_hi << 16);
+                unsigned int fdp = env->mode.real.fdp_lo +
+                                   (env->mode.real.fdp_hi << 16);
+                unsigned int fop = env->mode.real.fop;
+
+                fpstate.env.mode.prot.fip = fip & 0xf;
+                fpstate.env.mode.prot.fcs = fip >> 4;
+                fpstate.env.mode.prot.fop = fop;
+                fpstate.env.mode.prot.fdp = fdp & 0xf;
+                fpstate.env.mode.prot.fds = fdp >> 4;
+            }
+
+            if ( bytes == sizeof(*env) )
+                ptr = NULL;
+            else
+                ptr += sizeof(*env);
+            break;
+        }
+
+        default:
+            ASSERT_UNREACHABLE();
+            return X86EMUL_UNHANDLEABLE;
+        }
+
+        if ( ptr )
+        {
+            memcpy(fpstate.freg, ptr, sizeof(fpstate.freg));
+            asm volatile ( "frstor %0" :: "m" (fpstate) );
+        }
+        else
+            asm volatile ( "fldenv %0" :: "m" (fpstate.env) );
+        break;
+
     case blk_fst:
         ASSERT(!data);
 




* [PATCH v8 09/12] x86emul: support FXSAVE/FXRSTOR
  2020-05-05  8:10 [PATCH v8 00/12] x86emul: further work Jan Beulich
                   ` (7 preceding siblings ...)
  2020-05-05  8:16 ` [PATCH v8 08/12] x86emul: support FLDENV and FRSTOR Jan Beulich
@ 2020-05-05  8:16 ` Jan Beulich
  2020-05-08 19:31   ` Andrew Cooper
  2020-05-05  8:17 ` [PATCH v8 09/12] x86/HVM: scale MPERF values reported to guests (on AMD) Jan Beulich
                   ` (3 subsequent siblings)
  12 siblings, 1 reply; 51+ messages in thread
From: Jan Beulich @ 2020-05-05  8:16 UTC (permalink / raw)
  To: xen-devel; +Cc: Andrew Cooper, Wei Liu, Roger Pau Monne

Note that, for now, neither FPU selector handling nor MXCSR mask saving
honors differences between host and guest visible featuresets.

While for Intel the operation of these insns with CR4.OSFXSR=0 is
implementation dependent, use the easiest solution there: simply don't
look at the bit in the first place. For AMD and the like the behavior is
well defined, so it gets handled together with FFXSR.
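
For reference, a few hedged sanity checks (not part of the patch)
against the standard 512-byte FXSAVE image layout which the new
struct x86_fxsr mirrors:

BUILD_BUG_ON(offsetof(struct x86_fxsr, mxcsr) != 24);
BUILD_BUG_ON(offsetof(struct x86_fxsr, fpreg) != 32);
BUILD_BUG_ON(offsetof(struct x86_fxsr, xmm)   != 160);
BUILD_BUG_ON(sizeof(struct x86_fxsr)          != 512);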

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v8: Respect EFER.FFXSE and CR4.OSFXSR. Correct wrong X86EMUL_NO_*
    dependencies. Reduce #ifdef-ary.
v7: New.

--- a/tools/tests/x86_emulator/test_x86_emulator.c
+++ b/tools/tests/x86_emulator/test_x86_emulator.c
@@ -767,6 +767,12 @@ static void zap_fpsel(unsigned int *env,
     }
 }
 
+static void zap_xfpsel(unsigned int *env)
+{
+    env[3] &= ~0xffff;
+    env[5] &= ~0xffff;
+}
+
 #ifdef __x86_64__
 # define STKVAL_DISP 64
 static const struct {
@@ -2517,6 +2523,91 @@ int main(int argc, char **argv)
     else
         printf("skipped\n");
 
+    printf("%-40s", "Testing fxsave 4(%ecx)...");
+    if ( stack_exec && cpu_has_fxsr )
+    {
+        const uint16_t nine = 9;
+
+        memset(res + 0x80, 0xcc, 0x400);
+        if ( cpu_has_sse2 )
+            asm volatile ( "pcmpeqd %xmm7, %xmm7\n\t"
+                           "pxor %xmm6, %xmm6\n\t"
+                           "psubw %xmm7, %xmm6" );
+        asm volatile ( "fninit\n\t"
+                       "fld1\n\t"
+                       "fidivs %1\n\t"
+                       "fxsave %0"
+                       : "=m" (res[0x100]) : "m" (nine) : "memory" );
+        zap_xfpsel(&res[0x100]);
+        instr[0] = 0x0f; instr[1] = 0xae; instr[2] = 0x41; instr[3] = 0x04;
+        regs.eip = (unsigned long)&instr[0];
+        regs.ecx = (unsigned long)(res + 0x7f);
+        memset(res + 0x100 + 0x74, 0x33, 0x30);
+        memset(res + 0x80 + 0x74, 0x33, 0x30);
+        rc = x86_emulate(&ctxt, &emulops);
+        zap_xfpsel(&res[0x80]);
+        if ( (rc != X86EMUL_OKAY) ||
+             memcmp(res + 0x80, res + 0x100, 0x200) ||
+             (regs.eip != (unsigned long)&instr[4]) )
+            goto fail;
+        printf("okay\n");
+    }
+    else
+        printf("skipped\n");
+
+    printf("%-40s", "Testing fxrstor -4(%ecx)...");
+    if ( stack_exec && cpu_has_fxsr )
+    {
+        const uint16_t eleven = 11;
+
+        memset(res + 0x80, 0xcc, 0x400);
+        asm volatile ( "fxsave %0" : "=m" (res[0x80]) :: "memory" );
+        zap_xfpsel(&res[0x80]);
+        if ( cpu_has_sse2 )
+            asm volatile ( "pxor %xmm7, %xmm6\n\t"
+                           "pxor %xmm7, %xmm3\n\t"
+                           "pxor %xmm7, %xmm0\n\t"
+                           "pxor %xmm7, %xmm7" );
+        asm volatile ( "fninit\n\t"
+                       "fld1\n\t"
+                       "fidivs %0\n\t"
+                       :: "m" (eleven) );
+        instr[0] = 0x0f; instr[1] = 0xae; instr[2] = 0x49; instr[3] = 0xfc;
+        regs.eip = (unsigned long)&instr[0];
+        regs.ecx = (unsigned long)(res + 0x81);
+        rc = x86_emulate(&ctxt, &emulops);
+        asm volatile ( "fxsave %0" : "=m" (res[0x100]) :: "memory" );
+        if ( (rc != X86EMUL_OKAY) ||
+             memcmp(res + 0x100, res + 0x80, 0x200) ||
+             (regs.eip != (unsigned long)&instr[4]) )
+            goto fail;
+        printf("okay\n");
+    }
+    else
+        printf("skipped\n");
+
+#ifdef __x86_64__
+    printf("%-40s", "Testing fxsaveq 8(%edx)...");
+    if ( stack_exec && cpu_has_fxsr )
+    {
+        memset(res + 0x80, 0xcc, 0x400);
+        asm volatile ( "fxsaveq %0" : "=m" (res[0x100]) :: "memory" );
+        instr[0] = 0x48; instr[1] = 0x0f; instr[2] = 0xae; instr[3] = 0x42; instr[4] = 0x08;
+        regs.eip = (unsigned long)&instr[0];
+        regs.edx = (unsigned long)(res + 0x7e);
+        memset(res + 0x100 + 0x74, 0x33, 0x30);
+        memset(res + 0x80 + 0x74, 0x33, 0x30);
+        rc = x86_emulate(&ctxt, &emulops);
+        if ( (rc != X86EMUL_OKAY) ||
+             memcmp(res + 0x80, res + 0x100, 0x200) ||
+             (regs.eip != (unsigned long)&instr[5]) )
+            goto fail;
+        printf("okay\n");
+    }
+    else
+        printf("skipped\n");
+#endif
+
     printf("%-40s", "Testing movq %mm3,(%ecx)...");
     if ( stack_exec && cpu_has_mmx )
     {
--- a/tools/tests/x86_emulator/x86-emulate.c
+++ b/tools/tests/x86_emulator/x86-emulate.c
@@ -30,6 +30,13 @@ struct cpuid_policy cp;
 static char fpu_save_area[4096] __attribute__((__aligned__((64))));
 static bool use_xsave;
 
+/*
+ * Re-use the area above also as scratch space for the emulator itself.
+ * (When debugging the emulator, care needs to be taken when inserting
+ * printf() or similar function calls into regions using this.)
+ */
+#define FXSAVE_AREA ((struct x86_fxsr *)fpu_save_area)
+
 void emul_save_fpu_state(void)
 {
     if ( use_xsave )
--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -860,6 +860,11 @@ struct x86_emulate_state {
         blk_fld, /* FLDENV, FRSTOR */
         blk_fst, /* FNSTENV, FNSAVE */
 #endif
+#if !defined(X86EMUL_NO_FPU) || !defined(X86EMUL_NO_MMX) || \
+    !defined(X86EMUL_NO_SIMD)
+        blk_fxrstor,
+        blk_fxsave,
+#endif
         blk_movdir,
     } blk;
     uint8_t modrm, modrm_mod, modrm_reg, modrm_rm;
@@ -953,6 +958,29 @@ typedef union {
     uint32_t data32[16];
 } mmval_t;
 
+struct x86_fxsr {
+    uint16_t fcw;
+    uint16_t fsw;
+    uint16_t ftw:8, :8;
+    uint16_t fop;
+    union {
+        struct {
+            uint32_t offs;
+            uint32_t sel:16, :16;
+        };
+        uint64_t addr;
+    } fip, fdp;
+    uint32_t mxcsr;
+    uint32_t mxcsr_mask;
+    struct {
+        uint8_t data[10];
+        uint8_t _[6];
+    } fpreg[8];
+    uint64_t __attribute__ ((aligned(16))) xmm[16][2];
+    uint64_t _[6];
+    uint64_t avl[6];
+};
+
 /*
  * While proper alignment gets specified above, this doesn't get honored by
  * the compiler for automatic variables. Use this helper to instantiate a
@@ -1910,6 +1938,7 @@ amd_like(const struct x86_emulate_ctxt *
 #define vcpu_has_cmov()        (ctxt->cpuid->basic.cmov)
 #define vcpu_has_clflush()     (ctxt->cpuid->basic.clflush)
 #define vcpu_has_mmx()         (ctxt->cpuid->basic.mmx)
+#define vcpu_has_fxsr()        (ctxt->cpuid->basic.fxsr)
 #define vcpu_has_sse()         (ctxt->cpuid->basic.sse)
 #define vcpu_has_sse2()        (ctxt->cpuid->basic.sse2)
 #define vcpu_has_sse3()        (ctxt->cpuid->basic.sse3)
@@ -8125,6 +8154,47 @@ x86_emulate(
     case X86EMUL_OPC(0x0f, 0xae): case X86EMUL_OPC_66(0x0f, 0xae): /* Grp15 */
         switch ( modrm_reg & 7 )
         {
+#if !defined(X86EMUL_NO_FPU) || !defined(X86EMUL_NO_MMX) || \
+    !defined(X86EMUL_NO_SIMD)
+        case 0: /* fxsave */
+        case 1: /* fxrstor */
+            generate_exception_if(vex.pfx, EXC_UD);
+            vcpu_must_have(fxsr);
+            generate_exception_if(ea.type != OP_MEM, EXC_UD);
+            generate_exception_if(!is_aligned(ea.mem.seg, ea.mem.off, 16,
+                                              ctxt, ops),
+                                  EXC_GP, 0);
+            fail_if(!ops->blk);
+            op_bytes =
+#ifdef __x86_64__
+                !mode_64bit() ? offsetof(struct x86_fxsr, xmm[8]) :
+#endif
+                sizeof(struct x86_fxsr);
+            if ( amd_like(ctxt) )
+            {
+                if ( !ops->read_cr ||
+                     ops->read_cr(4, &cr4, ctxt) != X86EMUL_OKAY )
+                    cr4 = X86_CR4_OSFXSR;
+                if ( !ops->read_msr ||
+                     ops->read_msr(MSR_EFER, &msr_val, ctxt) != X86EMUL_OKAY )
+                    msr_val = 0;
+                if ( !(cr4 & X86_CR4_OSFXSR) ||
+                     (mode_64bit() && mode_ring0() && (msr_val & EFER_FFXSE)) )
+                    op_bytes = offsetof(struct x86_fxsr, xmm[0]);
+            }
+            /*
+             * This could also be X86EMUL_FPU_mmx, but it shouldn't be
+             * X86EMUL_FPU_xmm, as we don't want CR4.OSFXSR checked.
+             */
+            get_fpu(X86EMUL_FPU_fpu);
+            state->blk = modrm_reg & 1 ? blk_fxrstor : blk_fxsave;
+            if ( (rc = ops->blk(ea.mem.seg, ea.mem.off, NULL,
+                                sizeof(struct x86_fxsr), &_regs.eflags,
+                                state, ctxt)) != X86EMUL_OKAY )
+                goto done;
+            break;
+#endif /* X86EMUL_NO_{FPU,MMX,SIMD} */
+
 #ifndef X86EMUL_NO_SIMD
         case 2: /* ldmxcsr */
             generate_exception_if(vex.pfx, EXC_UD);
@@ -11611,6 +11681,8 @@ int x86_emul_blk(
     struct x86_emulate_state *state,
     struct x86_emulate_ctxt *ctxt)
 {
+    int rc = X86EMUL_OKAY;
+
     switch ( state->blk )
     {
         bool zf;
@@ -11819,6 +11891,77 @@ int x86_emul_blk(
 
 #endif /* X86EMUL_NO_FPU */
 
+#if !defined(X86EMUL_NO_FPU) || !defined(X86EMUL_NO_MMX) || \
+    !defined(X86EMUL_NO_SIMD)
+
+    case blk_fxrstor:
+    {
+        struct x86_fxsr *fxsr = FXSAVE_AREA;
+
+        ASSERT(!data);
+        ASSERT(bytes == sizeof(*fxsr));
+        ASSERT(state->op_bytes <= bytes);
+
+        if ( state->op_bytes < sizeof(*fxsr) )
+        {
+            if ( state->rex_prefix & REX_W )
+            {
+                /*
+                 * The only way to force fxsaveq on a wide range of gas
+                 * versions. On older versions the rex64 prefix works only if
+                 * we force an addressing mode that doesn't require extended
+                 * registers.
+                 */
+                asm volatile ( ".byte 0x48; fxsave (%1)"
+                               : "=m" (*fxsr) : "R" (fxsr) );
+            }
+            else
+                asm volatile ( "fxsave %0" : "=m" (*fxsr) );
+        }
+
+        memcpy(fxsr, ptr, min(state->op_bytes,
+                              (unsigned int)offsetof(struct x86_fxsr, _)));
+        memset(fxsr->_, 0, sizeof(*fxsr) - offsetof(struct x86_fxsr, _));
+
+        generate_exception_if(fxsr->mxcsr & ~mxcsr_mask, EXC_GP, 0);
+
+        if ( state->rex_prefix & REX_W )
+        {
+            /* See above for why operand/constraints are this way. */
+            asm volatile ( ".byte 0x48; fxrstor (%0)"
+                           :: "R" (fxsr), "m" (*fxsr) );
+        }
+        else
+            asm volatile ( "fxrstor %0" :: "m" (*fxsr) );
+        break;
+    }
+
+    case blk_fxsave:
+    {
+        struct x86_fxsr *fxsr = FXSAVE_AREA;
+
+        ASSERT(!data);
+        ASSERT(bytes == sizeof(*fxsr));
+        ASSERT(state->op_bytes <= bytes);
+
+        if ( state->rex_prefix & REX_W )
+        {
+            /* See above for why operand/constraint are this way. */
+            asm volatile ( ".byte 0x48; fxsave (%0)"
+                           :: "R" (state->op_bytes < sizeof(*fxsr) ? fxsr : ptr)
+                           : "memory" );
+        }
+        else
+            asm volatile ( "fxsave (%0)"
+                           :: "r" (state->op_bytes < sizeof(*fxsr) ? fxsr : ptr)
+                           : "memory" );
+        if ( state->op_bytes < sizeof(*fxsr) )
+            memcpy(ptr, fxsr, state->op_bytes);
+        break;
+    }
+
+#endif /* X86EMUL_NO_{FPU,MMX,SIMD} */
+
     case blk_movdir:
         switch ( bytes )
         {
@@ -11872,7 +12015,8 @@ int x86_emul_blk(
         return X86EMUL_UNHANDLEABLE;
     }
 
-    return X86EMUL_OKAY;
+ done:
+    return rc;
 }
 
 static void __init __maybe_unused build_assertions(void)
--- a/xen/arch/x86/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate.c
@@ -42,6 +42,8 @@
     }                                                      \
 })
 
+#define FXSAVE_AREA current->arch.fpu_ctxt
+
 #ifndef CONFIG_HVM
 # define X86EMUL_NO_FPU
 # define X86EMUL_NO_MMX




* [PATCH v8 09/12] x86/HVM: scale MPERF values reported to guests (on AMD)
  2020-05-05  8:10 [PATCH v8 00/12] x86emul: further work Jan Beulich
                   ` (8 preceding siblings ...)
  2020-05-05  8:16 ` [PATCH v8 09/12] x86emul: support FXSAVE/FXRSTOR Jan Beulich
@ 2020-05-05  8:17 ` Jan Beulich
  2020-05-05  8:19   ` Jan Beulich
  2020-05-05  8:18 ` [PATCH v8 10/12] " Jan Beulich
                   ` (2 subsequent siblings)
  12 siblings, 1 reply; 51+ messages in thread
From: Jan Beulich @ 2020-05-05  8:17 UTC (permalink / raw)
  To: xen-devel; +Cc: Andrew Cooper, Wei Liu, Roger Pau Monne

AMD's PM specifies that MPERF (and its r/o counterpart) reads are
affected by the TSC ratio. Hence when processing such reads in software
we too should scale the values. While we don't currently (yet) expose
the underlying feature flags, the MSRs can nevertheless be read, and
RDPRU is going to expose the values even to user space.

Furthermore, since the feature flags aren't exposed, this change has
the effect of making the two MSRs properly inaccessible for reads.

Note that writes to MPERF (and APERF) continue to be unsupported.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v3: New.
---
I did consider whether to put the code in guest_rdmsr() instead, but
decided that it's better to have it next to TSC handling.

--- a/xen/arch/x86/hvm/hvm.c
+++ b/xen/arch/x86/hvm/hvm.c
@@ -3478,6 +3478,22 @@ int hvm_msr_read_intercept(unsigned int
         *msr_content = v->arch.hvm.msr_tsc_adjust;
         break;
 
+    case MSR_MPERF_RD_ONLY:
+        if ( !d->arch.cpuid->extd.efro )
+        {
+            goto gp_fault;
+
+    case MSR_IA32_MPERF:
+            if ( !(d->arch.cpuid->basic.raw[6].c &
+                   CPUID6_ECX_APERFMPERF_CAPABILITY) )
+                goto gp_fault;
+        }
+        if ( rdmsr_safe(msr, *msr_content) )
+            goto gp_fault;
+        if ( d->arch.cpuid->x86_vendor & (X86_VENDOR_AMD | X86_VENDOR_HYGON) )
+            *msr_content = hvm_get_guest_tsc_fixed(v, *msr_content);
+        break;
+
     case MSR_APIC_BASE:
         *msr_content = vcpu_vlapic(v)->hw.apic_base_msr;
         break;
--- a/xen/include/asm-x86/msr-index.h
+++ b/xen/include/asm-x86/msr-index.h
@@ -405,6 +405,9 @@
 #define MSR_IA32_MPERF			0x000000e7
 #define MSR_IA32_APERF			0x000000e8
 
+#define MSR_MPERF_RD_ONLY		0xc00000e7
+#define MSR_APERF_RD_ONLY		0xc00000e8
+
 #define MSR_IA32_THERM_CONTROL		0x0000019a
 #define MSR_IA32_THERM_INTERRUPT	0x0000019b
 #define MSR_IA32_THERM_STATUS		0x0000019c




* [PATCH v8 10/12] x86/HVM: scale MPERF values reported to guests (on AMD)
  2020-05-05  8:10 [PATCH v8 00/12] x86emul: further work Jan Beulich
                   ` (9 preceding siblings ...)
  2020-05-05  8:17 ` [PATCH v8 09/12] x86/HVM: scale MPERF values reported to guests (on AMD) Jan Beulich
@ 2020-05-05  8:18 ` Jan Beulich
  2020-05-08 20:32   ` Andrew Cooper
  2020-05-05  8:19 ` [PATCH v8 11/12] x86emul: support RDPRU Jan Beulich
  2020-05-05  8:20 ` [PATCH v8 12/12] x86/HVM: don't needlessly intercept APERF/MPERF/TSC MSR reads Jan Beulich
  12 siblings, 1 reply; 51+ messages in thread
From: Jan Beulich @ 2020-05-05  8:18 UTC (permalink / raw)
  To: xen-devel; +Cc: Andrew Cooper, Wei Liu, Roger Pau Monne

AMD's PM specifies that MPERF (and its r/o counterpart) reads are
affected by the TSC ratio. Hence when processing such reads in software
we too should scale the values. While we don't currently (yet) expose
the underlying feature flags, the MSRs can nevertheless be read, and
RDPRU is going to expose the values even to user space.

Furthermore, since the feature flags aren't exposed, this change has
the effect of making the two MSRs properly inaccessible for reads.

Note that writes to MPERF (and APERF) continue to be unsupported.
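
As a hedged illustration of the scaling involved (AMD's TSC ratio is an
8.32 fixed-point multiplier; hvm_get_guest_tsc_fixed() is assumed to
apply the equivalent transformation):

static uint64_t scale_by_tsc_ratio(uint64_t host_val, uint64_t ratio_8_32)
{
    /* guest value = host value * ratio (8 integer, 32 fractional bits) */
    return ((unsigned __int128)host_val * ratio_8_32) >> 32;
}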

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v3: New.
---
I did consider whether to put the code in guest_rdmsr() instead, but
decided that it's better to have it next to TSC handling.

--- a/xen/arch/x86/hvm/hvm.c
+++ b/xen/arch/x86/hvm/hvm.c
@@ -3478,6 +3478,22 @@ int hvm_msr_read_intercept(unsigned int
         *msr_content = v->arch.hvm.msr_tsc_adjust;
         break;
 
+    case MSR_MPERF_RD_ONLY:
+        if ( !d->arch.cpuid->extd.efro )
+        {
+            goto gp_fault;
+
+    case MSR_IA32_MPERF:
+            if ( !(d->arch.cpuid->basic.raw[6].c &
+                   CPUID6_ECX_APERFMPERF_CAPABILITY) )
+                goto gp_fault;
+        }
+        if ( rdmsr_safe(msr, *msr_content) )
+            goto gp_fault;
+        if ( d->arch.cpuid->x86_vendor & (X86_VENDOR_AMD | X86_VENDOR_HYGON) )
+            *msr_content = hvm_get_guest_tsc_fixed(v, *msr_content);
+        break;
+
     case MSR_APIC_BASE:
         *msr_content = vcpu_vlapic(v)->hw.apic_base_msr;
         break;
--- a/xen/include/asm-x86/msr-index.h
+++ b/xen/include/asm-x86/msr-index.h
@@ -405,6 +405,9 @@
 #define MSR_IA32_MPERF			0x000000e7
 #define MSR_IA32_APERF			0x000000e8
 
+#define MSR_MPERF_RD_ONLY		0xc00000e7
+#define MSR_APERF_RD_ONLY		0xc00000e8
+
 #define MSR_IA32_THERM_CONTROL		0x0000019a
 #define MSR_IA32_THERM_INTERRUPT	0x0000019b
 #define MSR_IA32_THERM_STATUS		0x0000019c




* Re: [PATCH v8 09/12] x86/HVM: scale MPERF values reported to guests (on AMD)
  2020-05-05  8:17 ` [PATCH v8 09/12] x86/HVM: scale MPERF values reported to guests (on AMD) Jan Beulich
@ 2020-05-05  8:19   ` Jan Beulich
  0 siblings, 0 replies; 51+ messages in thread
From: Jan Beulich @ 2020-05-05  8:19 UTC (permalink / raw)
  To: xen-devel; +Cc: Andrew Cooper, Wei Liu, Roger Pau Monne

On 05.05.2020 10:17, Jan Beulich wrote:
> AMD's PM specifies that MPERF (and its r/o counterpart) reads are
> affected by the TSC ratio. Hence when processing such reads in software
> we too should scale the values. While we don't currently (yet) expose
> the underlying feature flags, the MSRs can nevertheless be read, and
> RDPRU is going to expose the values even to user space.
> 
> Furthermore, since the feature flags aren't exposed, this change has
> the effect of making the two MSRs properly inaccessible for reads.
> 
> Note that writes to MPERF (and APERF) continue to be unsupported.
> 
> Signed-off-by: Jan Beulich <jbeulich@suse.com>

Sorry, just re-sent this one with correct numbering.

Jan



* [PATCH v8 11/12] x86emul: support RDPRU
  2020-05-05  8:10 [PATCH v8 00/12] x86emul: further work Jan Beulich
                   ` (10 preceding siblings ...)
  2020-05-05  8:18 ` [PATCH v8 10/12] " Jan Beulich
@ 2020-05-05  8:19 ` Jan Beulich
  2020-05-05  8:20 ` [PATCH v8 12/12] x86/HVM: don't needlessly intercept APERF/MPERF/TSC MSR reads Jan Beulich
  12 siblings, 0 replies; 51+ messages in thread
From: Jan Beulich @ 2020-05-05  8:19 UTC (permalink / raw)
  To: xen-devel; +Cc: Andrew Cooper, Wei Liu, Roger Pau Monne

While the PM doesn't say so, this assumes that the MPERF value read this
way gets scaled the same way as when it is read through RDMSR.

Also introduce the SVM related constants on this occasion.
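
For reference (sketch only, not part of the patch): RDPRU takes a
register index in ECX, returns the value in EDX:EAX, and sets CF for a
valid index (clearing it, with a zero result, otherwise):

static inline uint64_t rdpru(uint32_t idx, bool *valid)
{
    uint32_t lo, hi;
    bool cf;

    /* 0F 01 FD = RDPRU (raw opcode, in case the assembler lacks it) */
    asm volatile ( ".byte 0x0f, 0x01, 0xfd"
                   : "=a" (lo), "=d" (hi), "=@ccc" (cf)
                   : "c" (idx) );
    *valid = cf;
    return ((uint64_t)hi << 32) | lo;
}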

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v6: Re-base.
v5: The CPUID field used is just 8 bits wide.
v4: Add GENERAL2_INTERCEPT_RDPRU and VMEXIT_RDPRU enumerators. Fold
    handling of out of bounds indexes into switch(). Avoid
    recalculate_misc() clobbering what recalculate_cpu_policy() has
    done. Re-base.
v3: New.
---
RFC: Andrew promised to take care of the CPUID side of this; re-base
     over his work once available.

--- a/tools/libxl/libxl_cpuid.c
+++ b/tools/libxl/libxl_cpuid.c
@@ -264,6 +264,7 @@ int libxl_cpuid_parse_config(libxl_cpuid
 
         {"clzero",       0x80000008, NA, CPUID_REG_EBX,  0,  1},
         {"rstr-fp-err-ptrs", 0x80000008, NA, CPUID_REG_EBX, 2, 1},
+        {"rdpru",        0x80000008, NA, CPUID_REG_EBX,  4,  1},
         {"wbnoinvd",     0x80000008, NA, CPUID_REG_EBX,  9,  1},
         {"ibpb",         0x80000008, NA, CPUID_REG_EBX, 12,  1},
         {"ppin",         0x80000008, NA, CPUID_REG_EBX, 23,  1},
--- a/tools/misc/xen-cpuid.c
+++ b/tools/misc/xen-cpuid.c
@@ -148,6 +148,8 @@ static const char *const str_e8b[32] =
     [ 0] = "clzero",
     [ 2] = "rstr-fp-err-ptrs",
 
+    [ 4] = "rdpru",
+
     /* [ 8] */            [ 9] = "wbnoinvd",
 
     [12] = "ibpb",
--- a/tools/tests/x86_emulator/test_x86_emulator.c
+++ b/tools/tests/x86_emulator/test_x86_emulator.c
@@ -683,6 +683,13 @@ static int read_msr(
 {
     switch ( reg )
     {
+    case 0x000000e8: /* APERF */
+    case 0xc00000e8: /* APERF_RD_ONLY */
+#define APERF_LO_VALUE 0xAEAEAEAE
+#define APERF_HI_VALUE 0xEAEAEAEA
+        *val = ((uint64_t)APERF_HI_VALUE << 32) | APERF_LO_VALUE;
+        return X86EMUL_OKAY;
+
     case 0xc0000080: /* EFER */
         *val = ctxt->addr_size > 32 ? 0x500 /* LME|LMA */ : 0;
         return X86EMUL_OKAY;
@@ -2421,6 +2428,30 @@ int main(int argc, char **argv)
     else
         printf("skipped\n");
 
+    printf("%-40s", "Testing rdpru...");
+    instr[0] = 0x0f; instr[1] = 0x01; instr[2] = 0xfd;
+    regs.eip = (unsigned long)&instr[0];
+    regs.ecx = 1;
+    regs.eflags = EFLAGS_ALWAYS_SET;
+    rc = x86_emulate(&ctxt, &emulops);
+    if ( (rc != X86EMUL_OKAY) ||
+         (regs.eax != APERF_LO_VALUE) || (regs.edx != APERF_HI_VALUE) ||
+         !(regs.eflags & X86_EFLAGS_CF) ||
+         (regs.eip != (unsigned long)&instr[3]) )
+        goto fail;
+    if ( ctxt.cpuid->extd.rdpru_max < 0xffff )
+    {
+        regs.eip = (unsigned long)&instr[0];
+        regs.ecx = ctxt.cpuid->extd.rdpru_max + 1;
+        regs.eflags = EFLAGS_ALWAYS_SET | X86_EFLAGS_CF;
+        rc = x86_emulate(&ctxt, &emulops);
+        if ( (rc != X86EMUL_OKAY) || regs.eax || regs.edx ||
+             (regs.eflags & X86_EFLAGS_CF) ||
+             (regs.eip != (unsigned long)&instr[3]) )
+            goto fail;
+    }
+    printf("okay\n");
+
     printf("%-40s", "Testing fnstenv 4(%ecx)...");
     if ( stack_exec && cpu_has_fpu )
     {
--- a/tools/tests/x86_emulator/x86-emulate.c
+++ b/tools/tests/x86_emulator/x86-emulate.c
@@ -84,6 +84,8 @@ bool emul_test_init(void)
     cp.feat.avx512pf = cp.feat.avx512f;
     cp.feat.rdpid = true;
     cp.extd.clzero = true;
+    cp.extd.rdpru = true;
+    cp.extd.rdpru_max = 1;
 
     if ( cpu_has_xsave )
     {
@@ -156,11 +158,11 @@ int emul_test_cpuid(
     }
 
     /*
-     * The emulator doesn't itself use CLZERO, so we can always run the
+     * The emulator doesn't itself use CLZERO/RDPRU, so we can always run the
      * respective test(s).
      */
     if ( leaf == 0x80000008 )
-        res->b |= 1U << 0;
+        res->b |= (1U << 0) | (1U << 4);
 
     return X86EMUL_OKAY;
 }
--- a/xen/arch/x86/cpuid.c
+++ b/xen/arch/x86/cpuid.c
@@ -243,8 +243,6 @@ static void recalculate_misc(struct cpui
     /* Most of Power/RAS hidden from guests. */
     p->extd.raw[0x7].a = p->extd.raw[0x7].b = p->extd.raw[0x7].c = 0;
 
-    p->extd.raw[0x8].d = 0;
-
     switch ( p->x86_vendor )
     {
     case X86_VENDOR_INTEL:
@@ -263,6 +261,7 @@ static void recalculate_misc(struct cpui
 
         p->extd.raw[0x8].a &= 0x0000ffff;
         p->extd.raw[0x8].c = 0;
+        p->extd.raw[0x8].d = 0;
         break;
 
     case X86_VENDOR_AMD:
@@ -281,6 +280,7 @@ static void recalculate_misc(struct cpui
 
         p->extd.raw[0x8].a &= 0x0000ffff; /* GuestMaxPhysAddr hidden. */
         p->extd.raw[0x8].c &= 0x0003f0ff;
+        p->extd.raw[0x8].d &= 0xffff0000;
 
         p->extd.raw[0x9] = EMPTY_LEAF;
 
@@ -643,6 +643,11 @@ void recalculate_cpuid_policy(struct dom
 
     p->extd.maxlinaddr = p->extd.lm ? 48 : 32;
 
+    if ( p->extd.rdpru )
+        p->extd.rdpru_max = min(p->extd.rdpru_max, max->extd.rdpru_max);
+    else
+        p->extd.rdpru_max = 0;
+
     recalculate_xstate(p);
     recalculate_misc(p);
 
--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -1967,6 +1967,7 @@ amd_like(const struct x86_emulate_ctxt *
 #define vcpu_has_fma4()        (ctxt->cpuid->extd.fma4)
 #define vcpu_has_tbm()         (ctxt->cpuid->extd.tbm)
 #define vcpu_has_clzero()      (ctxt->cpuid->extd.clzero)
+#define vcpu_has_rdpru()       (ctxt->cpuid->extd.rdpru)
 #define vcpu_has_wbnoinvd()    (ctxt->cpuid->extd.wbnoinvd)
 
 #define vcpu_has_bmi1()        (ctxt->cpuid->feat.bmi1)
@@ -5855,6 +5856,50 @@ x86_emulate(
                 limit -= sizeof(zero);
             }
             break;
+
+        case 0xfd: /* rdpru */
+            vcpu_must_have(rdpru);
+
+            if ( !mode_ring0() )
+            {
+                fail_if(!ops->read_cr);
+                if ( (rc = ops->read_cr(4, &cr4, ctxt)) != X86EMUL_OKAY )
+                    goto done;
+                generate_exception_if(cr4 & X86_CR4_TSD, EXC_UD);
+            }
+
+            switch ( _regs.ecx | -(_regs.ecx > ctxt->cpuid->extd.rdpru_max) )
+            {
+            case 0:  n = MSR_IA32_MPERF; break;
+            case 1:  n = MSR_IA32_APERF; break;
+            default: n = 0; break;
+            }
+
+            _regs.eflags &= ~EFLAGS_MASK;
+            if ( n )
+            {
+                fail_if(!ops->read_msr);
+                switch ( rc = ops->read_msr(n, &msr_val, ctxt) )
+                {
+                case X86EMUL_OKAY:
+                    _regs.eflags |= X86_EFLAGS_CF;
+                    break;
+
+                case X86EMUL_EXCEPTION:
+                    x86_emul_reset_event(ctxt);
+                    rc = X86EMUL_OKAY;
+                    break;
+
+                default:
+                    goto done;
+                }
+            }
+
+            if ( !(_regs.eflags & X86_EFLAGS_CF) )
+                msr_val = 0;
+            _regs.r(dx) = msr_val >> 32;
+            _regs.r(ax) = (uint32_t)msr_val;
+            break;
         }
 
 #define _GRP7(mod, reg) \
--- a/xen/include/asm-x86/hvm/svm/vmcb.h
+++ b/xen/include/asm-x86/hvm/svm/vmcb.h
@@ -74,7 +74,8 @@ enum GenericIntercept2bits
     GENERAL2_INTERCEPT_MONITOR = 1 << 10,
     GENERAL2_INTERCEPT_MWAIT   = 1 << 11,
     GENERAL2_INTERCEPT_MWAIT_CONDITIONAL = 1 << 12,
-    GENERAL2_INTERCEPT_XSETBV  = 1 << 13
+    GENERAL2_INTERCEPT_XSETBV  = 1 << 13,
+    GENERAL2_INTERCEPT_RDPRU   = 1 << 14,
 };
 
 
@@ -298,6 +299,7 @@ enum VMEXIT_EXITCODE
     VMEXIT_MWAIT            = 139, /* 0x8b */
     VMEXIT_MWAIT_CONDITIONAL= 140, /* 0x8c */
     VMEXIT_XSETBV           = 141, /* 0x8d */
+    VMEXIT_RDPRU            = 142, /* 0x8e */
     VMEXIT_NPF              = 1024, /* 0x400, nested paging fault */
     VMEXIT_INVALID          =  -1
 };
--- a/xen/include/public/arch-x86/cpufeatureset.h
+++ b/xen/include/public/arch-x86/cpufeatureset.h
@@ -250,6 +250,7 @@ XEN_CPUFEATURE(EFRO,          7*32+10) /
 /* AMD-defined CPU features, CPUID level 0x80000008.ebx, word 8 */
 XEN_CPUFEATURE(CLZERO,        8*32+ 0) /*A  CLZERO instruction */
 XEN_CPUFEATURE(RSTR_FP_ERR_PTRS, 8*32+ 2) /*A  (F)X{SAVE,RSTOR} always saves/restores FPU Error pointers */
+XEN_CPUFEATURE(RDPRU,         8*32+ 4) /*A  RDPRU instruction */
 XEN_CPUFEATURE(WBNOINVD,      8*32+ 9) /*   WBNOINVD instruction */
 XEN_CPUFEATURE(IBPB,          8*32+12) /*A  IBPB support only (no IBRS, used by AMD) */
 XEN_CPUFEATURE(AMD_PPIN,      8*32+23) /*   Protected Processor Inventory Number */
--- a/xen/include/xen/lib/x86/cpuid.h
+++ b/xen/include/xen/lib/x86/cpuid.h
@@ -263,7 +263,7 @@ struct cpuid_policy
                 struct { DECL_BITFIELD(e8b); };
             };
             uint32_t nc:8, :4, apic_id_size:4, :16;
-            uint32_t /* d */:32;
+            uint8_t :8, :8, rdpru_max, :8;
         };
     } extd;
 




* [PATCH v8 12/12] x86/HVM: don't needlessly intercept APERF/MPERF/TSC MSR reads
  2020-05-05  8:10 [PATCH v8 00/12] x86emul: further work Jan Beulich
                   ` (11 preceding siblings ...)
  2020-05-05  8:19 ` [PATCH v8 11/12] x86emul: support RDPRU Jan Beulich
@ 2020-05-05  8:20 ` Jan Beulich
  2020-05-08 21:04   ` Andrew Cooper
  12 siblings, 1 reply; 51+ messages in thread
From: Jan Beulich @ 2020-05-05  8:20 UTC (permalink / raw)
  To: xen-devel; +Cc: Andrew Cooper, Wei Liu, Roger Pau Monne

If the hardware can handle accesses, we should allow it to do so. This
way we can expose EFRO to HVM guests, and "all" that's left for exposing
APERF/MPERF is to figure out how to handle writes to these MSRs. (Note
that the leaf 6 guest CPUID checks will evaluate to false for now, as
recalculate_misc() zaps the entire leaf.)

For TSC the intercepts are made to mirror the RDTSC ones.
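
To illustrate the asymmetry (sketch only, mirroring the VMX hunk below):
only the read intercept gets lifted, so writes keep trapping and hence
remain unsupported:

if ( cp->basic.raw[6].c & CPUID6_ECX_APERFMPERF_CAPABILITY )
    vmx_clear_msr_intercept(v, MSR_IA32_APERF, VMX_MSR_R); /* reads: direct */
else
    vmx_set_msr_intercept(v, MSR_IA32_APERF, VMX_MSR_R);
/* VMX_MSR_W deliberately remains intercepted in both cases. */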

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Reviewed-by: Kevin Tian <kevin.tian@intel.com>
---
v4: Make TSC intercepts mirror RDTSC ones. Re-base.
v3: New.

--- a/xen/arch/x86/hvm/svm/svm.c
+++ b/xen/arch/x86/hvm/svm/svm.c
@@ -595,6 +595,7 @@ static void svm_cpuid_policy_changed(str
     struct vmcb_struct *vmcb = svm->vmcb;
     const struct cpuid_policy *cp = v->domain->arch.cpuid;
     u32 bitmap = vmcb_get_exception_intercepts(vmcb);
+    unsigned int mode;
 
     if ( opt_hvm_fep ||
          (v->domain->arch.cpuid->x86_vendor != boot_cpu_data.x86_vendor) )
@@ -607,6 +608,17 @@ static void svm_cpuid_policy_changed(str
     /* Give access to MSR_PRED_CMD if the guest has been told about it. */
     svm_intercept_msr(v, MSR_PRED_CMD,
                       cp->extd.ibpb ? MSR_INTERCEPT_NONE : MSR_INTERCEPT_RW);
+
+    /* Allow direct reads from APERF/MPERF if permitted by the policy. */
+    mode = cp->basic.raw[6].c & CPUID6_ECX_APERFMPERF_CAPABILITY
+           ? MSR_INTERCEPT_WRITE : MSR_INTERCEPT_RW;
+    svm_intercept_msr(v, MSR_IA32_APERF, mode);
+    svm_intercept_msr(v, MSR_IA32_MPERF, mode);
+
+    /* Allow direct access to their r/o counterparts if permitted. */
+    mode = cp->extd.efro ? MSR_INTERCEPT_NONE : MSR_INTERCEPT_RW;
+    svm_intercept_msr(v, MSR_APERF_RD_ONLY, mode);
+    svm_intercept_msr(v, MSR_MPERF_RD_ONLY, mode);
 }
 
 void svm_sync_vmcb(struct vcpu *v, enum vmcb_sync_state new_state)
@@ -860,7 +872,10 @@ static void svm_set_rdtsc_exiting(struct
     {
         general1_intercepts |= GENERAL1_INTERCEPT_RDTSC;
         general2_intercepts |= GENERAL2_INTERCEPT_RDTSCP;
+        svm_enable_intercept_for_msr(v, MSR_IA32_TSC);
     }
+    else
+        svm_intercept_msr(v, MSR_IA32_TSC, MSR_INTERCEPT_WRITE);
 
     vmcb_set_general1_intercepts(vmcb, general1_intercepts);
     vmcb_set_general2_intercepts(vmcb, general2_intercepts);
--- a/xen/arch/x86/hvm/svm/vmcb.c
+++ b/xen/arch/x86/hvm/svm/vmcb.c
@@ -108,6 +108,7 @@ static int construct_vmcb(struct vcpu *v
     {
         vmcb->_general1_intercepts |= GENERAL1_INTERCEPT_RDTSC;
         vmcb->_general2_intercepts |= GENERAL2_INTERCEPT_RDTSCP;
+        svm_intercept_msr(v, MSR_IA32_TSC, MSR_INTERCEPT_WRITE);
     }
 
     /* Guest segment limits. */
--- a/xen/arch/x86/hvm/vmx/vmcs.c
+++ b/xen/arch/x86/hvm/vmx/vmcs.c
@@ -1141,8 +1141,13 @@ static int construct_vmcs(struct vcpu *v
         vmx_clear_msr_intercept(v, MSR_IA32_SYSENTER_CS, VMX_MSR_RW);
         vmx_clear_msr_intercept(v, MSR_IA32_SYSENTER_ESP, VMX_MSR_RW);
         vmx_clear_msr_intercept(v, MSR_IA32_SYSENTER_EIP, VMX_MSR_RW);
+
+        if ( !(v->arch.hvm.vmx.exec_control & CPU_BASED_RDTSC_EXITING) )
+            vmx_clear_msr_intercept(v, MSR_IA32_TSC, VMX_MSR_R);
+
         if ( paging_mode_hap(d) && (!is_iommu_enabled(d) || iommu_snoop) )
             vmx_clear_msr_intercept(v, MSR_IA32_CR_PAT, VMX_MSR_RW);
+
         if ( (vmexit_ctl & VM_EXIT_CLEAR_BNDCFGS) &&
              (vmentry_ctl & VM_ENTRY_LOAD_BNDCFGS) )
             vmx_clear_msr_intercept(v, MSR_IA32_BNDCFGS, VMX_MSR_RW);
--- a/xen/arch/x86/hvm/vmx/vmx.c
+++ b/xen/arch/x86/hvm/vmx/vmx.c
@@ -589,6 +589,18 @@ static void vmx_cpuid_policy_changed(str
         vmx_clear_msr_intercept(v, MSR_FLUSH_CMD, VMX_MSR_RW);
     else
         vmx_set_msr_intercept(v, MSR_FLUSH_CMD, VMX_MSR_RW);
+
+    /* Allow direct reads from APERF/MPERF if permitted by the policy. */
+    if ( cp->basic.raw[6].c & CPUID6_ECX_APERFMPERF_CAPABILITY )
+    {
+        vmx_clear_msr_intercept(v, MSR_IA32_APERF, VMX_MSR_R);
+        vmx_clear_msr_intercept(v, MSR_IA32_MPERF, VMX_MSR_R);
+    }
+    else
+    {
+        vmx_set_msr_intercept(v, MSR_IA32_APERF, VMX_MSR_R);
+        vmx_set_msr_intercept(v, MSR_IA32_MPERF, VMX_MSR_R);
+    }
 }
 
 int vmx_guest_x86_mode(struct vcpu *v)
@@ -1254,7 +1266,12 @@ static void vmx_set_rdtsc_exiting(struct
     vmx_vmcs_enter(v);
     v->arch.hvm.vmx.exec_control &= ~CPU_BASED_RDTSC_EXITING;
     if ( enable )
+    {
         v->arch.hvm.vmx.exec_control |= CPU_BASED_RDTSC_EXITING;
+        vmx_set_msr_intercept(v, MSR_IA32_TSC, VMX_MSR_R);
+    }
+    else
+        vmx_clear_msr_intercept(v, MSR_IA32_TSC, VMX_MSR_R);
     vmx_update_cpu_exec_control(v);
     vmx_vmcs_exit(v);
 }
--- a/xen/include/public/arch-x86/cpufeatureset.h
+++ b/xen/include/public/arch-x86/cpufeatureset.h
@@ -245,7 +245,7 @@ XEN_CPUFEATURE(ENQCMD,        6*32+29) /
 
 /* AMD-defined CPU features, CPUID level 0x80000007.edx, word 7 */
 XEN_CPUFEATURE(ITSC,          7*32+ 8) /*   Invariant TSC */
-XEN_CPUFEATURE(EFRO,          7*32+10) /*   APERF/MPERF Read Only interface */
+XEN_CPUFEATURE(EFRO,          7*32+10) /*S  APERF/MPERF Read Only interface */
 
 /* AMD-defined CPU features, CPUID level 0x80000008.ebx, word 8 */
 XEN_CPUFEATURE(CLZERO,        8*32+ 0) /*A  CLZERO instruction */



^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH v8 07/12] x86emul: support FNSTENV and FNSAVE
  2020-05-05  8:15 ` [PATCH v8 07/12] x86emul: support FNSTENV and FNSAVE Jan Beulich
@ 2020-05-05 12:36   ` Jan Beulich
  2020-05-08 17:58   ` Andrew Cooper
  1 sibling, 0 replies; 51+ messages in thread
From: Jan Beulich @ 2020-05-05 12:36 UTC (permalink / raw)
  To: xen-devel; +Cc: Andrew Cooper, Wei Liu, Roger Pau Monne

On 05.05.2020 10:15, Jan Beulich wrote:
> @@ -11542,6 +11611,12 @@ int x86_emul_blk(
>      switch ( state->blk )
>      {
>          bool zf;
> +        struct {
> +            struct x87_env32 env;
> +            struct {
> +               uint8_t bytes[10];
> +            } freg[8];
> +        } fpstate;

This also needs #ifndef X86EMUL_NO_FPU around it for !HVM builds
to work.
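
I.e. (sketch):

#ifndef X86EMUL_NO_FPU
        struct {
            struct x87_env32 env;
            struct {
               uint8_t bytes[10];
            } freg[8];
        } fpstate;
#endif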

Jan



^ permalink raw reply	[flat|nested] 51+ messages in thread

* RE: [PATCH v8 06/12] x86/HVM: make hvmemul_blk() capable of handling r/o operations
  2020-05-05  8:15 ` [PATCH v8 06/12] x86/HVM: make hvmemul_blk() capable of handling r/o operations Jan Beulich
@ 2020-05-05 14:20   ` Paul Durrant
  2020-05-07 20:34   ` Andrew Cooper
  1 sibling, 0 replies; 51+ messages in thread
From: Paul Durrant @ 2020-05-05 14:20 UTC (permalink / raw)
  To: 'Jan Beulich', xen-devel
  Cc: 'Andrew Cooper', 'Wei Liu', 'Roger Pau Monne'

> -----Original Message-----
> From: Jan Beulich <jbeulich@suse.com>
> Sent: 05 May 2020 09:15
> To: xen-devel@lists.xenproject.org
> Cc: Andrew Cooper <andrew.cooper3@citrix.com>; Wei Liu <wl@xen.org>; Roger Pau Monne
> <roger.pau@citrix.com>; Paul Durrant <paul@xen.org>
> Subject: [PATCH v8 06/12] x86/HVM: make hvmemul_blk() capable of handling r/o operations
> 
> In preparation for handling e.g. FLDENV or {F,FX,X}RSTOR here as well.
> 
> Signed-off-by: Jan Beulich <jbeulich@suse.com>

Reviewed-by: Paul Durrant <paul@xen.org>



^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH v8 01/12] x86emul: disable FPU/MMX/SIMD insn emulation when !HVM
  2020-05-05  8:12 ` [PATCH v8 01/12] x86emul: disable FPU/MMX/SIMD insn emulation when !HVM Jan Beulich
@ 2020-05-07 18:11   ` Andrew Cooper
  2020-05-08  8:10     ` Jan Beulich
  0 siblings, 1 reply; 51+ messages in thread
From: Andrew Cooper @ 2020-05-07 18:11 UTC (permalink / raw)
  To: Jan Beulich, xen-devel; +Cc: Wei Liu, Roger Pau Monne

On 05/05/2020 09:12, Jan Beulich wrote:
> In a pure PV environment (the PV shim in particular) we don't really
> need emulation of all these. To limit #ifdef-ary utilize some of the
> CASE_*() macros we have, by providing variants expanding to
> (effectively) nothing (really a label, which in turn requires passing
> -Wno-unused-label to the compiler when building such configurations).
>
> Due to the mixture of macro and #ifdef use, the placement of some of
> the #ifdef-s is a little arbitrary.
>
> The resulting object file's .text is less than half the size of the
> original, and looks to also be compiling a little more quickly.
>
> This is meant as a first step; more parts can likely be disabled down
> the road.
>
> Suggested-by: Andrew Cooper <andrew.cooper3@citrix.com>
> Signed-off-by: Jan Beulich <jbeulich@suse.com>
> ---
> v7: Integrate into this series. Re-base.
> ---
> I'll be happy to take suggestions allowing to avoid -Wno-unused-label.

I already gave you one, along with a far less invasive change.

It is not interesting to be able to conditionally compile these things
separately.  A build of Xen will either need everything, or just the
integer group.

~Andrew


^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH v8 02/12] x86emul: support MOVDIR{I,64B} insns
  2020-05-05  8:13 ` [PATCH v8 02/12] x86emul: support MOVDIR{I,64B} insns Jan Beulich
@ 2020-05-07 18:30   ` Andrew Cooper
  2020-05-08  7:19     ` Jan Beulich
  0 siblings, 1 reply; 51+ messages in thread
From: Andrew Cooper @ 2020-05-07 18:30 UTC (permalink / raw)
  To: Jan Beulich, xen-devel; +Cc: Wei Liu, Roger Pau Monne

On 05/05/2020 09:13, Jan Beulich wrote:
> Introduce a new blk() hook, paralleling the rmw() one in a certain way,
> but being intended for larger data sizes, and hence its HVM intermediate
> handling function doesn't fall back to splitting the operation if the
> requested virtual address can't be mapped.
>
> Note that SDM revision 071 doesn't specify exception behavior for
> ModRM.mod == 0b11; assuming #UD here.

Still stale?  It does #UD on current hardware, and will cease to #UD in
the future when the encoding space gets used for something else.

>
> Signed-off-by: Jan Beulich <jbeulich@suse.com>
> Reviewed-by: Paul Durrant <paul@xen.org>

Acked-by: Andrew Cooper <andrew.cooper3@citrix.com>


^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH v8 03/12] x86emul: support ENQCMD insns
  2020-05-05  8:13 ` [PATCH v8 03/12] x86emul: support ENQCMD insns Jan Beulich
@ 2020-05-07 18:59   ` Andrew Cooper
  2020-05-08  7:32     ` Jan Beulich
  0 siblings, 1 reply; 51+ messages in thread
From: Andrew Cooper @ 2020-05-07 18:59 UTC (permalink / raw)
  To: Jan Beulich, xen-devel; +Cc: Wei Liu, Roger Pau Monne

On 05/05/2020 09:13, Jan Beulich wrote:
> Note that the ISA extensions document revision 038 doesn't specify
> exception behavior for ModRM.mod == 0b11; assuming #UD here.

Stale.

> --- a/xen/arch/x86/x86_emulate/x86_emulate.c
> +++ b/xen/arch/x86/x86_emulate/x86_emulate.c
> @@ -11480,11 +11513,36 @@ int x86_emul_blk(
>  {
>      switch ( state->blk )
>      {
> +        bool zf;
> +
>          /*
>           * Throughout this switch(), memory clobbers are used to compensate
>           * that other operands may not properly express the (full) memory
>           * ranges covered.
>           */
> +    case blk_enqcmd:
> +        ASSERT(bytes == 64);
> +        if ( ((unsigned long)ptr & 0x3f) )
> +        {
> +            ASSERT_UNREACHABLE();
> +            return X86EMUL_UNHANDLEABLE;
> +        }
> +        *eflags &= ~EFLAGS_MASK;
> +#ifdef HAVE_AS_ENQCMD
> +        asm ( "enqcmds (%[src]), %[dst]" ASM_FLAG_OUT(, "; setz %0")

%[zf]

> +              : [zf] ASM_FLAG_OUT("=@ccz", "=qm") (zf)
> +              : [src] "r" (data), [dst] "r" (ptr) : "memory" );

Can't src get away with being "m" (*data)?  There is no need to force it
into a single register, even if it is overwhelmingly likely to end up
with %rsi scheduled here.

> +#else
> +        /* enqcmds (%rsi), %rdi */
> +        asm ( ".byte 0xf3, 0x0f, 0x38, 0xf8, 0x3e"
> +              ASM_FLAG_OUT(, "; setz %[zf]")
> +              : [zf] ASM_FLAG_OUT("=@ccz", "=qm") (zf)
> +              : "S" (data), "D" (ptr) : "memory" );
> +#endif
> +        if ( zf )
> +            *eflags |= X86_EFLAGS_ZF;
> +        break;
> +
>      case blk_movdir:
>          switch ( bytes )
>          {
> --- a/xen/include/asm-x86/msr-index.h
> +++ b/xen/include/asm-x86/msr-index.h
> @@ -420,6 +420,10 @@
>  #define MSR_IA32_TSC_DEADLINE		0x000006E0
>  #define MSR_IA32_ENERGY_PERF_BIAS	0x000001b0
>  
> +#define MSR_IA32_PASID			0x00000d93
> +#define  PASID_PASID_MASK		0x000fffff
> +#define  PASID_VALID			0x80000000
> +

Above the legacy line please as this is using the newer style, and drop
_IA32.  Intel's ideal of architectural-ness isn't interesting or worth
the added code volume.

PASID_PASID_MASK isn't great, but I can't suggest anything better, and
MSR_PASID_MASK doesn't work either.

Otherwise, Acked-by: Andrew Cooper <andrew.cooper3@citrix.com>


^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH v8 04/12] x86emul: support SERIALIZE
  2020-05-05  8:14 ` [PATCH v8 04/12] x86emul: support SERIALIZE Jan Beulich
@ 2020-05-07 19:32   ` Andrew Cooper
  2020-05-08  7:34     ` Jan Beulich
  0 siblings, 1 reply; 51+ messages in thread
From: Andrew Cooper @ 2020-05-07 19:32 UTC (permalink / raw)
  To: Jan Beulich, xen-devel; +Cc: Wei Liu, Roger Pau Monne

On 05/05/2020 09:14, Jan Beulich wrote:
> ... enabling its use by all guest kinds at the same time.
>
> Signed-off-by: Jan Beulich <jbeulich@suse.com>

Acked-by: Andrew Cooper <andrew.cooper3@citrix.com>

> @@ -5660,6 +5661,18 @@ x86_emulate(
>                  goto done;
>              break;
>  
> +        case 0xe8:
> +            switch ( vex.pfx )
> +            {
> +            case vex_none: /* serialize */
> +                host_and_vcpu_must_have(serialize);
> +                asm volatile ( ".byte 0x0f, 0x01, 0xe8" );

There is very little need for an actual implementation here.  The VMExit
to get here is good enough.

The only question is whether pre-unrestricted_guest Intel boxes are
liable to find this in real mode code.

~Andrew


^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH v8 05/12] x86emul: support X{SUS,RES}LDTRK
  2020-05-05  8:14 ` [PATCH v8 05/12] x86emul: support X{SUS,RES}LDTRK Jan Beulich
@ 2020-05-07 20:13   ` Andrew Cooper
  2020-05-08  7:38     ` Jan Beulich
  0 siblings, 1 reply; 51+ messages in thread
From: Andrew Cooper @ 2020-05-07 20:13 UTC (permalink / raw)
  To: Jan Beulich, xen-devel; +Cc: Wei Liu, Roger Pau Monne

On 05/05/2020 09:14, Jan Beulich wrote:
> --- a/xen/tools/gen-cpuid.py
> +++ b/xen/tools/gen-cpuid.py
> @@ -284,6 +284,9 @@ def crunch_numbers(state):
>          # as dependent features simplifies Xen's logic, and prevents the guest
>          # from seeing implausible configurations.
>          IBRSB: [STIBP, SSBD],
> +
> +        # In principle the TSXLDTRK insns could also be considered independent.
> +        RTM: [TSXLDTRK],

Why the link?  There is no relevant interaction AFAICT.

~Andrew


^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH v8 06/12] x86/HVM: make hvmemul_blk() capable of handling r/o operations
  2020-05-05  8:15 ` [PATCH v8 06/12] x86/HVM: make hvmemul_blk() capable of handling r/o operations Jan Beulich
  2020-05-05 14:20   ` Paul Durrant
@ 2020-05-07 20:34   ` Andrew Cooper
  2020-05-08  7:13     ` Jan Beulich
  1 sibling, 1 reply; 51+ messages in thread
From: Andrew Cooper @ 2020-05-07 20:34 UTC (permalink / raw)
  To: Jan Beulich, xen-devel; +Cc: Paul Durrant, Wei Liu, Roger Pau Monne

On 05/05/2020 09:15, Jan Beulich wrote:
> In preparation for handling e.g. FLDENV or {F,FX,X}RSTOR here as well.
>
> Signed-off-by: Jan Beulich <jbeulich@suse.com>
> ---
> v8: New (could be folded into "x86emul: support MOVDIR{I,64B} insns",
>     but would invalidate Paul's R-b there).
>
> --- a/xen/arch/x86/hvm/emulate.c
> +++ b/xen/arch/x86/hvm/emulate.c
> @@ -1453,7 +1453,7 @@ static int hvmemul_blk(
>      struct hvm_emulate_ctxt *hvmemul_ctxt =
>          container_of(ctxt, struct hvm_emulate_ctxt, ctxt);
>      unsigned long addr;
> -    uint32_t pfec = PFEC_page_present | PFEC_write_access;
> +    uint32_t pfec = PFEC_page_present;
>      int rc;
>      void *mapping = NULL;
>  
> @@ -1462,6 +1462,9 @@ static int hvmemul_blk(
>      if ( rc != X86EMUL_OKAY || !bytes )
>          return rc;
>  
> +    if ( x86_insn_is_mem_write(state, ctxt) )
> +        pfec |= PFEC_write_access;
> +

For the instructions with two memory operands, it conflates the
read-only side with the read-write side.

~Andrew


^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH v8 06/12] x86/HVM: make hvmemul_blk() capable of handling r/o operations
  2020-05-07 20:34   ` Andrew Cooper
@ 2020-05-08  7:13     ` Jan Beulich
  0 siblings, 0 replies; 51+ messages in thread
From: Jan Beulich @ 2020-05-08  7:13 UTC (permalink / raw)
  To: Andrew Cooper; +Cc: xen-devel, Paul Durrant, Wei Liu, Roger Pau Monne

On 07.05.2020 22:34, Andrew Cooper wrote:
> On 05/05/2020 09:15, Jan Beulich wrote:
>> In preparation for handling e.g. FLDENV or {F,FX,X}RSTOR here as well.
>>
>> Signed-off-by: Jan Beulich <jbeulich@suse.com>
>> ---
>> v8: New (could be folded into "x86emul: support MOVDIR{I,64B} insns",
>>     but would invalidate Paul's R-b there).
>>
>> --- a/xen/arch/x86/hvm/emulate.c
>> +++ b/xen/arch/x86/hvm/emulate.c
>> @@ -1453,7 +1453,7 @@ static int hvmemul_blk(
>>      struct hvm_emulate_ctxt *hvmemul_ctxt =
>>          container_of(ctxt, struct hvm_emulate_ctxt, ctxt);
>>      unsigned long addr;
>> -    uint32_t pfec = PFEC_page_present | PFEC_write_access;
>> +    uint32_t pfec = PFEC_page_present;
>>      int rc;
>>      void *mapping = NULL;
>>  
>> @@ -1462,6 +1462,9 @@ static int hvmemul_blk(
>>      if ( rc != X86EMUL_OKAY || !bytes )
>>          return rc;
>>  
>> +    if ( x86_insn_is_mem_write(state, ctxt) )
>> +        pfec |= PFEC_write_access;
>> +
> 
> For the instructions with two memory operands, it conflates the
> read-only side with the read-write side.

"Conflates" is probably the wrong word? The bug you point out is
really in x86_insn_is_mem_write(), in that so far it would return
false for MOVDIR64B and ENQCMD{,S}. As a result I'll fix the
issue there while, at the same time, folding this patch into
"x86emul: support MOVDIR{I,64B} insns".

Jan


^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH v8 02/12] x86emul: support MOVDIR{I,64B} insns
  2020-05-07 18:30   ` Andrew Cooper
@ 2020-05-08  7:19     ` Jan Beulich
  0 siblings, 0 replies; 51+ messages in thread
From: Jan Beulich @ 2020-05-08  7:19 UTC (permalink / raw)
  To: Andrew Cooper; +Cc: xen-devel, Wei Liu, Roger Pau Monne

On 07.05.2020 20:30, Andrew Cooper wrote:
> On 05/05/2020 09:13, Jan Beulich wrote:
>> Introduce a new blk() hook, paralleling the rmw() one in a certain way,
>> but being intended for larger data sizes, and hence its HVM intermediate
>> handling function doesn't fall back to splitting the operation if the
>> requested virtual address can't be mapped.
>>
>> Note that SDM revision 071 doesn't specify exception behavior for
>> ModRM.mod == 0b11; assuming #UD here.
> 
> Still stale?  It does #UD on current hardware, and will cease to #UD in
> the future when the encoding space gets used for something else.

What do you mean by "still stale"? Other insns allowing for only
memory operands have the #UD spelled out in the doc. Are you
implying by the 2nd sentence that it should rather be
"goto unrecognized_insn"? I'm afraid we're not very consistent
yet with what we do in such cases; I could certainly work
towards improving this, but the question is whether it is really
sensible in all cases to assume such partially unused encodings
may get used in the future.

>> Signed-off-by: Jan Beulich <jbeulich@suse.com>
>> Reviewed-by: Paul Durrant <paul@xen.org>
> 
> Acked-by: Andrew Cooper <andrew.cooper3@citrix.com>

Thanks, but as per my reply to your patch 6 comment the patch
here will need to be revised, so I'll not apply this just yet
unless you indicate up front that you're fine with me keeping it.

Jan


^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH v8 03/12] x86emul: support ENQCMD insns
  2020-05-07 18:59   ` Andrew Cooper
@ 2020-05-08  7:32     ` Jan Beulich
  0 siblings, 0 replies; 51+ messages in thread
From: Jan Beulich @ 2020-05-08  7:32 UTC (permalink / raw)
  To: Andrew Cooper; +Cc: xen-devel, Wei Liu, Roger Pau Monne

On 07.05.2020 20:59, Andrew Cooper wrote:
> On 05/05/2020 09:13, Jan Beulich wrote:
>> Note that the ISA extensions document revision 038 doesn't specify
>> exception behavior for ModRM.mod == 0b11; assuming #UD here.
> 
> Stale.

In which way (beyond the question of whether to use
goto unrecognized_insn in the code instead)? The doc doesn't
mention ModRM.mod specifics in any way.

>> --- a/xen/arch/x86/x86_emulate/x86_emulate.c
>> +++ b/xen/arch/x86/x86_emulate/x86_emulate.c
>> @@ -11480,11 +11513,36 @@ int x86_emul_blk(
>>  {
>>      switch ( state->blk )
>>      {
>> +        bool zf;
>> +
>>          /*
>>           * Throughout this switch(), memory clobbers are used to compensate
>>           * that other operands may not properly express the (full) memory
>>           * ranges covered.
>>           */
>> +    case blk_enqcmd:
>> +        ASSERT(bytes == 64);
>> +        if ( ((unsigned long)ptr & 0x3f) )
>> +        {
>> +            ASSERT_UNREACHABLE();
>> +            return X86EMUL_UNHANDLEABLE;
>> +        }
>> +        *eflags &= ~EFLAGS_MASK;
>> +#ifdef HAVE_AS_ENQCMD
>> +        asm ( "enqcmds (%[src]), %[dst]" ASM_FLAG_OUT(, "; setz %0")
> 
> %[zf]

Oops, indeed.

>> +              : [zf] ASM_FLAG_OUT("=@ccz", "=qm") (zf)
>> +              : [src] "r" (data), [dst] "r" (ptr) : "memory" );
> 
> Can't src get away with being "m" (*data)?  There is no need to force it
> into a single register, even if it is overwhelmingly likely to end up
> with %rsi scheduled here.

Well, *data can't be used, as data is of void* type. It would
need to have a suitable cast on it, but since that's not
going to avoid the memory clobber I didn't think it was worth
it (also together with the comment ahead of the switch()).
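
I.e. it would then need to be something like (sketch)

    asm ( "enqcmds %[src], %[dst]" ASM_FLAG_OUT(, "; setz %[zf]")
          : [zf] ASM_FLAG_OUT("=@ccz", "=qm") (zf)
          : [src] "m" (*(const char (*)[64])data), [dst] "r" (ptr)
          : "memory" );

with the clobber still in place.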

>> --- a/xen/include/asm-x86/msr-index.h
>> +++ b/xen/include/asm-x86/msr-index.h
>> @@ -420,6 +420,10 @@
>>  #define MSR_IA32_TSC_DEADLINE		0x000006E0
>>  #define MSR_IA32_ENERGY_PERF_BIAS	0x000001b0
>>  
>> +#define MSR_IA32_PASID			0x00000d93
>> +#define  PASID_PASID_MASK		0x000fffff
>> +#define  PASID_VALID			0x80000000
>> +
> 
> Above the legacy line please as this is using the newer style,

Ah, yes, I should have remembered to re-base this over your
change there.

> and drop
> _IA32.  Intel's ideal of architectural-ness isn't interesting or worth
> the added code volume.

We'd been there before, and you know I disagree. I think it
is wrong for me to make the change, but I will do so just
to retain your ack.

> PASID_PASID_MASK isn't great, but I can't suggest anything better, and
> MSR_PASID_MASK doesn't work either.
> 
> Otherwise, Acked-by: Andrew Cooper <andrew.cooper3@citrix.com>

Thanks.

Jan


^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH v8 04/12] x86emul: support SERIALIZE
  2020-05-07 19:32   ` Andrew Cooper
@ 2020-05-08  7:34     ` Jan Beulich
  2020-05-08 13:00       ` Andrew Cooper
  0 siblings, 1 reply; 51+ messages in thread
From: Jan Beulich @ 2020-05-08  7:34 UTC (permalink / raw)
  To: Andrew Cooper; +Cc: xen-devel, Wei Liu, Roger Pau Monne

On 07.05.2020 21:32, Andrew Cooper wrote:
> On 05/05/2020 09:14, Jan Beulich wrote:
>> ... enabling its use by all guest kinds at the same time.
>>
>> Signed-off-by: Jan Beulich <jbeulich@suse.com>
> 
> Acked-by: Andrew Cooper <andrew.cooper3@citrix.com>

Thanks.

>> @@ -5660,6 +5661,18 @@ x86_emulate(
>>                  goto done;
>>              break;
>>  
>> +        case 0xe8:
>> +            switch ( vex.pfx )
>> +            {
>> +            case vex_none: /* serialize */
>> +                host_and_vcpu_must_have(serialize);
>> +                asm volatile ( ".byte 0x0f, 0x01, 0xe8" );
> 
> There is very little need for an actual implementation here.  The VMExit
> to get here is good enough.
> 
> The only question is whether pre-unrestricted_guest Intel boxes are
> liable to find this in real mode code.

Apart from this we also shouldn't assume HVM in the core emulator,
I think.

Jan


^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH v8 05/12] x86emul: support X{SUS,RES}LDTRK
  2020-05-07 20:13   ` Andrew Cooper
@ 2020-05-08  7:38     ` Jan Beulich
  2020-05-08 13:15       ` Andrew Cooper
  0 siblings, 1 reply; 51+ messages in thread
From: Jan Beulich @ 2020-05-08  7:38 UTC (permalink / raw)
  To: Andrew Cooper; +Cc: xen-devel, Wei Liu, Roger Pau Monne

On 07.05.2020 22:13, Andrew Cooper wrote:
> On 05/05/2020 09:14, Jan Beulich wrote:
>> --- a/xen/tools/gen-cpuid.py
>> +++ b/xen/tools/gen-cpuid.py
>> @@ -284,6 +284,9 @@ def crunch_numbers(state):
>>          # as dependent features simplifies Xen's logic, and prevents the guest
>>          # from seeing implausible configurations.
>>          IBRSB: [STIBP, SSBD],
>> +
>> +        # In principle the TSXLDTRK insns could also be considered independent.
>> +        RTM: [TSXLDTRK],
> 
> Why the link?  There is no relevant interaction AFAICT.

Do the insns make any sense without TSX? Anyway - hence the
comment, and if you're convinced the connection does not
need making, I'd be okay dropping it. I would assume though
that we'd better hide TSXLDTRK whenever we hide RTM, which
is most easily achieved by having a connection here.

Jan


^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH v8 01/12] x86emul: disable FPU/MMX/SIMD insn emulation when !HVM
  2020-05-07 18:11   ` Andrew Cooper
@ 2020-05-08  8:10     ` Jan Beulich
  0 siblings, 0 replies; 51+ messages in thread
From: Jan Beulich @ 2020-05-08  8:10 UTC (permalink / raw)
  To: Andrew Cooper; +Cc: xen-devel, Wei Liu, Roger Pau Monne

On 07.05.2020 20:11, Andrew Cooper wrote:
> On 05/05/2020 09:12, Jan Beulich wrote:
>> In a pure PV environment (the PV shim in particular) we don't really
>> need emulation of all these. To limit #ifdef-ary utilize some of the
>> CASE_*() macros we have, by providing variants expanding to
>> (effectively) nothing (really a label, which in turn requires passing
>> -Wno-unused-label to the compiler when building such configurations).
>>
>> Due to the mixture of macro and #ifdef use, the placement of some of
>> the #ifdef-s is a little arbitrary.
>>
>> The resulting object file's .text is less than half the size of the
>> original, and looks to also be compiling a little more quickly.
>>
>> This is meant as a first step; more parts can likely be disabled down
>> the road.
>>
>> Suggested-by: Andrew Cooper <andrew.cooper3@citrix.com>
>> Signed-off-by: Jan Beulich <jbeulich@suse.com>
>> ---
>> v7: Integrate into this series. Re-base.
>> ---
>> I'll be happy to take suggestions allowing to avoid -Wno-unused-label.
> 
> I already gave you one, along with a far less invasive change.
> 
> It is not interesting to be able to conditionally compile these things
> separately.  A build of Xen will either need everything, or just the
> integer group.

And I had replied to that, indicating my (at least partial)
disagreement as well as asking for clarification on an apparently
incomplete sentence in your response.

Note that in an initial (3 months earlier) reply you did say

"On that subject, it would be very helpful to at least be able to
 configure reduced builds from these utilities."

which I responded to

"Yes, I too have been thinking this way. I may get there eventually."

I specifically meant FPU-less, MMX-less, and SIMD-less each on their
own.

I'm also not at all convinced of, as you say, "a build of Xen will
either need everything, or just the integer group". Yes, for now I
unilaterally disable all three for !HVM here, but that's just
because we know of no PV guests trying to update their page tables
in "interesting" ways. Long term I think this would better be a
separate Kconfig option (or multiple ones, if we stick to keeping
the three groups here to have separate controls), merely defaulting
to !HVM. I could of course switch to such an approach right away.
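
E.g. (only a sketch, names invented here)

config X86EMUL_FPU
	bool "FPU instruction emulation" if EXPERT
	default HVM

plus MMX / SIMD counterparts, with the X86EMUL_NO_* #define-s then
keying off of these.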

Jan


^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH v8 04/12] x86emul: support SERIALIZE
  2020-05-08  7:34     ` Jan Beulich
@ 2020-05-08 13:00       ` Andrew Cooper
  2020-05-08 13:59         ` Jan Beulich
  0 siblings, 1 reply; 51+ messages in thread
From: Andrew Cooper @ 2020-05-08 13:00 UTC (permalink / raw)
  To: Jan Beulich; +Cc: xen-devel, Wei Liu, Roger Pau Monne

On 08/05/2020 08:34, Jan Beulich wrote:
>>> @@ -5660,6 +5661,18 @@ x86_emulate(
>>>                  goto done;
>>>              break;
>>>  
>>> +        case 0xe8:
>>> +            switch ( vex.pfx )
>>> +            {
>>> +            case vex_none: /* serialize */
>>> +                host_and_vcpu_must_have(serialize);
>>> +                asm volatile ( ".byte 0x0f, 0x01, 0xe8" );
>> There is very little need for an actual implementation here.  The VMExit
>> to get here is good enough.
>>
>> The only question is whether pre-unrestricted_guest Intel boxes are
>> liable to find this in real mode code.
> Apart from this we also shouldn't assume HVM in the core emulator,
> I think.

It's not assuming HVM.  Its just that there is no way we can end up
emulating this instruction from PV context.

If we could end up here in PV context, the exception causing us to
emulate to begin with would be good enough as well.

~Andrew


^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH v8 05/12] x86emul: support X{SUS,RES}LDTRK
  2020-05-08  7:38     ` Jan Beulich
@ 2020-05-08 13:15       ` Andrew Cooper
  2020-05-08 14:42         ` Jan Beulich
  0 siblings, 1 reply; 51+ messages in thread
From: Andrew Cooper @ 2020-05-08 13:15 UTC (permalink / raw)
  To: Jan Beulich; +Cc: xen-devel, Wei Liu, Roger Pau Monne

On 08/05/2020 08:38, Jan Beulich wrote:
> On 07.05.2020 22:13, Andrew Cooper wrote:
>> On 05/05/2020 09:14, Jan Beulich wrote:
>>> --- a/xen/tools/gen-cpuid.py
>>> +++ b/xen/tools/gen-cpuid.py
>>> @@ -284,6 +284,9 @@ def crunch_numbers(state):
>>>          # as dependent features simplifies Xen's logic, and prevents the guest
>>>          # from seeing implausible configurations.
>>>          IBRSB: [STIBP, SSBD],
>>> +
>>> +        # In principle the TSXLDTRK insns could also be considered independent.
>>> +        RTM: [TSXLDTRK],
>> Why the link?  There is no relevant interaction AFAICT.
> Do the insns make any sense without TSX? Anyway - hence the
> comment, and if you're convinced the connection does not
> need making, I'd be okay dropping it. I would assume though
> that we'd better hide TSXLDTRK whenever we hide RTM, which
> is most easily achieved by having a connection here.

Actually - that is a very good point.  I expect there will (or should)
be an interaction with MSR_TSX_CTRL, as it has CPUID-hiding functionality.

For now, could I ask you to not expose this to guests in this patch?

For the emulator side of things alone I think this is ok (although
looking over it a second time, we could really do with a comment in the
code explaining that we're never in an RTM region, hence the nop behaviour).

I'll follow up with Intel, and we can figure out the CPUID derivation
details at a later point.

If you're happy with this plan, then A-by to save a round trip.

~Andrew


^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH v8 08/12] x86emul: support FLDENV and FRSTOR
  2020-05-05  8:16 ` [PATCH v8 08/12] x86emul: support FLDENV and FRSTOR Jan Beulich
@ 2020-05-08 13:37   ` Roger Pau Monné
  2020-05-08 15:04     ` Jan Beulich
  2020-05-08 18:19   ` Andrew Cooper
  1 sibling, 1 reply; 51+ messages in thread
From: Roger Pau Monné @ 2020-05-08 13:37 UTC (permalink / raw)
  To: Jan Beulich; +Cc: xen-devel, Wei Liu, Andrew Cooper

On Tue, May 05, 2020 at 10:16:20AM +0200, Jan Beulich wrote:
> While the Intel SDM claims that FRSTOR itself may raise #MF upon
> completion, this was confirmed by Intel to be a doc error which will be
> corrected in due course; behavior is like FLDENV, and like old hard copy
> manuals describe it. Otherwise we'd have to emulate the insn by filling
> st(N) in suitable order, followed by FLDENV.
> 
> Signed-off-by: Jan Beulich <jbeulich@suse.com>
> ---
> v7: New.
> 
> --- a/tools/tests/x86_emulator/test_x86_emulator.c
> +++ b/tools/tests/x86_emulator/test_x86_emulator.c
> @@ -2442,6 +2442,27 @@ int main(int argc, char **argv)
>      else
>          printf("skipped\n");
>  
> +    printf("%-40s", "Testing fldenv 8(%edx)...");

Likely a stupid question, but why the added 8? edx will contain the
memory address used to save the state by fnstenv, so I would expect
fldenv to just load from there?

> +    if ( stack_exec && cpu_has_fpu )
> +    {
> +        asm volatile ( "fnstenv %0\n\t"
> +                       "fninit"
> +                       : "=m" (res[2]) :: "memory" );

Why do you need the memory clobber here? I assume it's because res is
of type unsigned int and hence doesn't have the right size that
fnstenv will actually write to?

> +        zap_fpsel(&res[2], true);
> +        instr[0] = 0xd9; instr[1] = 0x62; instr[2] = 0x08;
> +        regs.eip = (unsigned long)&instr[0];
> +        regs.edx = (unsigned long)res;
> +        rc = x86_emulate(&ctxt, &emulops);
> +        asm volatile ( "fnstenv %0" : "=m" (res[9]) :: "memory" );
> +        if ( (rc != X86EMUL_OKAY) ||
> +             memcmp(res + 2, res + 9, 28) ||
> +             (regs.eip != (unsigned long)&instr[3]) )
> +            goto fail;
> +        printf("okay\n");
> +    }
> +    else
> +        printf("skipped\n");
> +
>      printf("%-40s", "Testing 16-bit fnsave (%ecx)...");
>      if ( stack_exec && cpu_has_fpu )
>      {
> @@ -2468,6 +2489,31 @@ int main(int argc, char **argv)
>              goto fail;
>          printf("okay\n");
>      }
> +    else
> +        printf("skipped\n");
> +
> +    printf("%-40s", "Testing frstor (%edx)...");
> +    if ( stack_exec && cpu_has_fpu )
> +    {
> +        const uint16_t seven = 7;
> +
> +        asm volatile ( "fninit\n\t"
> +                       "fld1\n\t"
> +                       "fidivs %1\n\t"
> +                       "fnsave %0\n\t"
> +                       : "=&m" (res[0]) : "m" (seven) : "memory" );
> +        zap_fpsel(&res[0], true);
> +        instr[0] = 0xdd; instr[1] = 0x22;
> +        regs.eip = (unsigned long)&instr[0];
> +        regs.edx = (unsigned long)res;
> +        rc = x86_emulate(&ctxt, &emulops);
> +        asm volatile ( "fnsave %0" : "=m" (res[27]) :: "memory" );
> +        if ( (rc != X86EMUL_OKAY) ||
> +             memcmp(res, res + 27, 108) ||
> +             (regs.eip != (unsigned long)&instr[2]) )
> +            goto fail;
> +        printf("okay\n");
> +    }
>      else
>          printf("skipped\n");
>  
> --- a/xen/arch/x86/x86_emulate/x86_emulate.c
> +++ b/xen/arch/x86/x86_emulate/x86_emulate.c
> @@ -857,6 +857,7 @@ struct x86_emulate_state {
>          blk_NONE,
>          blk_enqcmd,
>  #ifndef X86EMUL_NO_FPU
> +        blk_fld, /* FLDENV, FRSTOR */
>          blk_fst, /* FNSTENV, FNSAVE */
>  #endif
>          blk_movdir,
> @@ -4948,21 +4949,14 @@ x86_emulate(
>                  dst.bytes = 4;
>                  emulate_fpu_insn_memdst(b, modrm_reg & 7, dst.val);
>                  break;
> -            case 4: /* fldenv - TODO */
> -                state->fpu_ctrl = true;
> -                goto unimplemented_insn;
> -            case 5: /* fldcw m2byte */
> -                state->fpu_ctrl = true;
> -            fpu_memsrc16:
> -                if ( (rc = ops->read(ea.mem.seg, ea.mem.off, &src.val,
> -                                     2, ctxt)) != X86EMUL_OKAY )
> -                    goto done;
> -                emulate_fpu_insn_memsrc(b, modrm_reg & 7, src.val);
> -                break;
> +            case 4: /* fldenv */
> +                /* Raise #MF now if there are pending unmasked exceptions. */
> +                emulate_fpu_insn_stub(0xd9, 0xd0 /* fnop */);

Maybe it would make sense to have a wrapper for fnop?
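
(E.g. something trivial like

#define emulate_fnop() emulate_fpu_insn_stub(0xd9, 0xd0)

if only to get rid of the magic numbers.)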

> +                /* fall through */
>              case 6: /* fnstenv */
>                  fail_if(!ops->blk);
> -                state->blk = blk_fst;
> -                /* REX is meaningless for this insn by this point. */
> +                state->blk = modrm_reg & 2 ? blk_fst : blk_fld;
> +                /* REX is meaningless for these insns by this point. */
>                  rex_prefix = in_protmode(ctxt, ops);
>                  if ( (rc = ops->blk(ea.mem.seg, ea.mem.off, NULL,
>                                      op_bytes > 2 ? sizeof(struct x87_env32)
> @@ -4972,6 +4966,14 @@ x86_emulate(
>                      goto done;
>                  state->fpu_ctrl = true;
>                  break;
> +            case 5: /* fldcw m2byte */
> +                state->fpu_ctrl = true;
> +            fpu_memsrc16:
> +                if ( (rc = ops->read(ea.mem.seg, ea.mem.off, &src.val,
> +                                     2, ctxt)) != X86EMUL_OKAY )
> +                    goto done;
> +                emulate_fpu_insn_memsrc(b, modrm_reg & 7, src.val);
> +                break;
>              case 7: /* fnstcw m2byte */
>                  state->fpu_ctrl = true;
>              fpu_memdst16:
> @@ -5124,13 +5126,14 @@ x86_emulate(
>                  dst.bytes = 8;
>                  emulate_fpu_insn_memdst(b, modrm_reg & 7, dst.val);
>                  break;
> -            case 4: /* frstor - TODO */
> -                state->fpu_ctrl = true;
> -                goto unimplemented_insn;
> +            case 4: /* frstor */
> +                /* Raise #MF now if there are pending unmasked exceptions. */
> +                emulate_fpu_insn_stub(0xd9, 0xd0 /* fnop */);
> +                /* fall through */
>              case 6: /* fnsave */
>                  fail_if(!ops->blk);
> -                state->blk = blk_fst;
> -                /* REX is meaningless for this insn by this point. */
> +                state->blk = modrm_reg & 2 ? blk_fst : blk_fld;
> +                /* REX is meaningless for these insns by this point. */
>                  rex_prefix = in_protmode(ctxt, ops);
>                  if ( (rc = ops->blk(ea.mem.seg, ea.mem.off, NULL,
>                                      op_bytes > 2 ? sizeof(struct x87_env32) + 80
> @@ -11648,6 +11651,89 @@ int x86_emul_blk(
>  
>  #ifndef X86EMUL_NO_FPU
>  
> +    case blk_fld:
> +        ASSERT(!data);
> +
> +        /* state->rex_prefix carries CR0.PE && !EFLAGS.VM setting */
> +        switch ( bytes )
> +        {
> +        case sizeof(fpstate.env):
> +        case sizeof(fpstate):
> +            memcpy(&fpstate.env, ptr, sizeof(fpstate.env));
> +            if ( !state->rex_prefix )
> +            {
> +                unsigned int fip = fpstate.env.mode.real.fip_lo +
> +                                   (fpstate.env.mode.real.fip_hi << 16);
> +                unsigned int fdp = fpstate.env.mode.real.fdp_lo +
> +                                   (fpstate.env.mode.real.fdp_hi << 16);
> +                unsigned int fop = fpstate.env.mode.real.fop;
> +
> +                fpstate.env.mode.prot.fip = fip & 0xf;
> +                fpstate.env.mode.prot.fcs = fip >> 4;
> +                fpstate.env.mode.prot.fop = fop;
> +                fpstate.env.mode.prot.fdp = fdp & 0xf;
> +                fpstate.env.mode.prot.fds = fdp >> 4;

I've found the layouts in the SDM vol. 1, but I haven't been able to
found the translation mechanism from real to protected. Could you
maybe add a reference here?

> +            }
> +
> +            if ( bytes == sizeof(fpstate.env) )
> +                ptr = NULL;
> +            else
> +                ptr += sizeof(fpstate.env);
> +            break;
> +
> +        case sizeof(struct x87_env16):
> +        case sizeof(struct x87_env16) + sizeof(fpstate.freg):
> +        {
> +            const struct x87_env16 *env = ptr;
> +
> +            fpstate.env.fcw = env->fcw;
> +            fpstate.env.fsw = env->fsw;
> +            fpstate.env.ftw = env->ftw;
> +
> +            if ( state->rex_prefix )
> +            {
> +                fpstate.env.mode.prot.fip = env->mode.prot.fip;
> +                fpstate.env.mode.prot.fcs = env->mode.prot.fcs;
> +                fpstate.env.mode.prot.fdp = env->mode.prot.fdp;
> +                fpstate.env.mode.prot.fds = env->mode.prot.fds;
> +                fpstate.env.mode.prot.fop = 0; /* unknown */
> +            }
> +            else
> +            {
> +                unsigned int fip = env->mode.real.fip_lo +
> +                                   (env->mode.real.fip_hi << 16);
> +                unsigned int fdp = env->mode.real.fdp_lo +
> +                                   (env->mode.real.fdp_hi << 16);
> +                unsigned int fop = env->mode.real.fop;
> +
> +                fpstate.env.mode.prot.fip = fip & 0xf;
> +                fpstate.env.mode.prot.fcs = fip >> 4;
> +                fpstate.env.mode.prot.fop = fop;
> +                fpstate.env.mode.prot.fdp = fdp & 0xf;
> +                fpstate.env.mode.prot.fds = fdp >> 4;

This looks mostly the same as the translation done above, so maybe
could be abstracted anyway in a macro to avoid the code repetition?
(ie: fpstate_real_to_prot(src, dst) or some such).

Thanks, Roger.


^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH v8 04/12] x86emul: support SERIALIZE
  2020-05-08 13:00       ` Andrew Cooper
@ 2020-05-08 13:59         ` Jan Beulich
  2020-05-08 15:05           ` Andrew Cooper
  0 siblings, 1 reply; 51+ messages in thread
From: Jan Beulich @ 2020-05-08 13:59 UTC (permalink / raw)
  To: Andrew Cooper; +Cc: xen-devel, Wei Liu, Roger Pau Monne

On 08.05.2020 15:00, Andrew Cooper wrote:
> On 08/05/2020 08:34, Jan Beulich wrote:
>>>> @@ -5660,6 +5661,18 @@ x86_emulate(
>>>>                  goto done;
>>>>              break;
>>>>  
>>>> +        case 0xe8:
>>>> +            switch ( vex.pfx )
>>>> +            {
>>>> +            case vex_none: /* serialize */
>>>> +                host_and_vcpu_must_have(serialize);
>>>> +                asm volatile ( ".byte 0x0f, 0x01, 0xe8" );
>>> There is very little need for an actual implementation here.  The VMExit
>>> to get here is good enough.
>>>
>>> The only question is whether pre-unrestricted_guest Intel boxes are
>>> liable to find this in real mode code.
>> Apart from this we also shouldn't assume HVM in the core emulator,
>> I think.
> 
> It's not assuming HVM.  It's just that there is no way we can end up
> emulating this instruction from PV context.
> 
> If we could end up here in PV context, the exception causing us to
> emulate to begin with would be good enough as well.

With the current way of where/how emulation gets involved -
yes. I'd like to remind you though of the 4-insn window
shadow code tries to emulate over for PAE guests. There
is no intervening #VMEXIT there.

So do you want me to drop the asm() then, and switch from
host_and_vcpu_must_have(serialize) to just
vcpu_must_have(serialize)?

Jan


^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH v8 05/12] x86emul: support X{SUS,RES}LDTRK
  2020-05-08 13:15       ` Andrew Cooper
@ 2020-05-08 14:42         ` Jan Beulich
  0 siblings, 0 replies; 51+ messages in thread
From: Jan Beulich @ 2020-05-08 14:42 UTC (permalink / raw)
  To: Andrew Cooper; +Cc: xen-devel, Wei Liu, Roger Pau Monne

On 08.05.2020 15:15, Andrew Cooper wrote:
> On 08/05/2020 08:38, Jan Beulich wrote:
>> On 07.05.2020 22:13, Andrew Cooper wrote:
>>> On 05/05/2020 09:14, Jan Beulich wrote:
>>>> --- a/xen/tools/gen-cpuid.py
>>>> +++ b/xen/tools/gen-cpuid.py
>>>> @@ -284,6 +284,9 @@ def crunch_numbers(state):
>>>>          # as dependent features simplifies Xen's logic, and prevents the guest
>>>>          # from seeing implausible configurations.
>>>>          IBRSB: [STIBP, SSBD],
>>>> +
>>>> +        # In principle the TSXLDTRK insns could also be considered independent.
>>>> +        RTM: [TSXLDTRK],
>>> Why the link?  There is no relevant interaction AFAICT.
>> Do the insns make any sense without TSX? Anyway - hence the
>> comment, and if you're convinced the connection does not
>> need making, I'd be okay dropping it. I would assume though
>> that we'd better hide TSXLDTRK whenever we hide RTM, which
>> is most easily achieved by having a connection here.
> 
> Actually - that is a very good point.  I expect there will (or should)
> be an interaction with MSR_TSX_CTRL, as it has CPUID-hiding functionality.
> 
> For now, could I ask you to not expose this to guests in this patch?

As per our irc discussion, I'd make it 'a' then instead of 'A'.
Will need to wait for gen-cpuid.py to accept 'a' then.

> For the emulator side of things alone I think this is ok (although
> looking over it a second time, we could really do with a comment in the
> code explaining that we're never in an RTM region, hence the nop behaviour).

I've added

                /*
                 * We're never in a transactional region when coming here
                 * - nothing else to do.
                 */

to both paths.

> I'll follow up with Intel, and we can figure out the CPUID derivation
> details at a later point.
> 
> If you're happy with this plan, then A-by to save a round trip.

Thanks.

Jan


^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH v8 08/12] x86emul: support FLDENV and FRSTOR
  2020-05-08 13:37   ` Roger Pau Monné
@ 2020-05-08 15:04     ` Jan Beulich
  2020-05-08 16:21       ` Roger Pau Monné
  2020-05-08 18:29       ` Andrew Cooper
  0 siblings, 2 replies; 51+ messages in thread
From: Jan Beulich @ 2020-05-08 15:04 UTC (permalink / raw)
  To: Roger Pau Monné; +Cc: xen-devel, Wei Liu, Andrew Cooper

On 08.05.2020 15:37, Roger Pau Monné wrote:
> On Tue, May 05, 2020 at 10:16:20AM +0200, Jan Beulich wrote:
>> --- a/tools/tests/x86_emulator/test_x86_emulator.c
>> +++ b/tools/tests/x86_emulator/test_x86_emulator.c
>> @@ -2442,6 +2442,27 @@ int main(int argc, char **argv)
>>      else
>>          printf("skipped\n");
>>  
>> +    printf("%-40s", "Testing fldenv 8(%edx)...");
> 
> Likely a stupid question, but why the added 8? edx will contain the
> memory address used to save the state by fnstenv, so I would expect
> fldenv to just load from there?

The 8 is just to vary ModR/M encodings across the various
tests - it's an arbitrary choice.
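
(For reference: instr[] = { 0xd9, 0x62, 0x08 } is ModRM byte 0x62,
i.e. mod=01 /4 rm=%edx plus disp8 = 8, which is where the 8(%edx)
comes from.)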

>> +    if ( stack_exec && cpu_has_fpu )
>> +    {
>> +        asm volatile ( "fnstenv %0\n\t"
>> +                       "fninit"
>> +                       : "=m" (res[2]) :: "memory" );
> 
> Why do you need the memory clobber here? I assume it's because res is
> of type unsigned int and hence doesn't have the right size that
> fnstenv will actually write to?

Correct.
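
(The alternative would be an output operand of suitably cast type,
e.g. "=m" (*(char (*)[28])&res[2]), at which point the clobber could
be dropped; the clobber is simply the more compact option.)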

>> @@ -4948,21 +4949,14 @@ x86_emulate(
>>                  dst.bytes = 4;
>>                  emulate_fpu_insn_memdst(b, modrm_reg & 7, dst.val);
>>                  break;
>> -            case 4: /* fldenv - TODO */
>> -                state->fpu_ctrl = true;
>> -                goto unimplemented_insn;
>> -            case 5: /* fldcw m2byte */
>> -                state->fpu_ctrl = true;
>> -            fpu_memsrc16:
>> -                if ( (rc = ops->read(ea.mem.seg, ea.mem.off, &src.val,
>> -                                     2, ctxt)) != X86EMUL_OKAY )
>> -                    goto done;
>> -                emulate_fpu_insn_memsrc(b, modrm_reg & 7, src.val);
>> -                break;
>> +            case 4: /* fldenv */
>> +                /* Raise #MF now if there are pending unmasked exceptions. */
>> +                emulate_fpu_insn_stub(0xd9, 0xd0 /* fnop */);
> 
> Maybe it would make sense to have a wrapper for fnop?

I'm not convinced that would be worth it.

>> @@ -11648,6 +11651,89 @@ int x86_emul_blk(
>>  
>>  #ifndef X86EMUL_NO_FPU
>>  
>> +    case blk_fld:
>> +        ASSERT(!data);
>> +
>> +        /* state->rex_prefix carries CR0.PE && !EFLAGS.VM setting */
>> +        switch ( bytes )
>> +        {
>> +        case sizeof(fpstate.env):
>> +        case sizeof(fpstate):
>> +            memcpy(&fpstate.env, ptr, sizeof(fpstate.env));
>> +            if ( !state->rex_prefix )
>> +            {
>> +                unsigned int fip = fpstate.env.mode.real.fip_lo +
>> +                                   (fpstate.env.mode.real.fip_hi << 16);
>> +                unsigned int fdp = fpstate.env.mode.real.fdp_lo +
>> +                                   (fpstate.env.mode.real.fdp_hi << 16);
>> +                unsigned int fop = fpstate.env.mode.real.fop;
>> +
>> +                fpstate.env.mode.prot.fip = fip & 0xf;
>> +                fpstate.env.mode.prot.fcs = fip >> 4;
>> +                fpstate.env.mode.prot.fop = fop;
>> +                fpstate.env.mode.prot.fdp = fdp & 0xf;
>> +                fpstate.env.mode.prot.fds = fdp >> 4;
> 
> I've found the layouts in the SDM vol. 1, but I haven't been able to
> found the translation mechanism from real to protected. Could you
> maybe add a reference here?

A reference to some piece of documentation? I don't think this
is spelled out anywhere. It's also only one of various possible
ways of doing the translation, but among them the most flexible
one for possible consumers of the data (because of using the
smallest possible offsets into the segments).
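
As an example, a real mode linear FIP of 0x12345 gets stored as
FCS=0x1234, FIP=0x5 here; any pair with (FCS << 4) + FIP == 0x12345
would describe the same address, this split merely yields the
smallest offset.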

>> +            }
>> +
>> +            if ( bytes == sizeof(fpstate.env) )
>> +                ptr = NULL;
>> +            else
>> +                ptr += sizeof(fpstate.env);
>> +            break;
>> +
>> +        case sizeof(struct x87_env16):
>> +        case sizeof(struct x87_env16) + sizeof(fpstate.freg):
>> +        {
>> +            const struct x87_env16 *env = ptr;
>> +
>> +            fpstate.env.fcw = env->fcw;
>> +            fpstate.env.fsw = env->fsw;
>> +            fpstate.env.ftw = env->ftw;
>> +
>> +            if ( state->rex_prefix )
>> +            {
>> +                fpstate.env.mode.prot.fip = env->mode.prot.fip;
>> +                fpstate.env.mode.prot.fcs = env->mode.prot.fcs;
>> +                fpstate.env.mode.prot.fdp = env->mode.prot.fdp;
>> +                fpstate.env.mode.prot.fds = env->mode.prot.fds;
>> +                fpstate.env.mode.prot.fop = 0; /* unknown */
>> +            }
>> +            else
>> +            {
>> +                unsigned int fip = env->mode.real.fip_lo +
>> +                                   (env->mode.real.fip_hi << 16);
>> +                unsigned int fdp = env->mode.real.fdp_lo +
>> +                                   (env->mode.real.fdp_hi << 16);
>> +                unsigned int fop = env->mode.real.fop;
>> +
>> +                fpstate.env.mode.prot.fip = fip & 0xf;
>> +                fpstate.env.mode.prot.fcs = fip >> 4;
>> +                fpstate.env.mode.prot.fop = fop;
>> +                fpstate.env.mode.prot.fdp = fdp & 0xf;
>> +                fpstate.env.mode.prot.fds = fdp >> 4;
> 
> This looks mostly the same as the translation done above, so maybe
> could be abstracted anyway in a macro to avoid the code repetition?
> (ie: fpstate_real_to_prot(src, dst) or some such).

Just the 5 assignments could be put in an inline function, but
if we also wanted to abstract away the declarations with their
initializers, it would need to be a macro because of the
different types of fpstate.env and *env. While I'd generally
prefer inline functions, the macro would have the benefit that
it could be #define-d / #undef-d right inside this case block.
Thoughts? 

Jan


^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH v8 04/12] x86emul: support SERIALIZE
  2020-05-08 13:59         ` Jan Beulich
@ 2020-05-08 15:05           ` Andrew Cooper
  0 siblings, 0 replies; 51+ messages in thread
From: Andrew Cooper @ 2020-05-08 15:05 UTC (permalink / raw)
  To: Jan Beulich; +Cc: xen-devel, Wei Liu, Roger Pau Monne

On 08/05/2020 14:59, Jan Beulich wrote:
> On 08.05.2020 15:00, Andrew Cooper wrote:
>> On 08/05/2020 08:34, Jan Beulich wrote:
>>>>> @@ -5660,6 +5661,18 @@ x86_emulate(
>>>>>                  goto done;
>>>>>              break;
>>>>>  
>>>>> +        case 0xe8:
>>>>> +            switch ( vex.pfx )
>>>>> +            {
>>>>> +            case vex_none: /* serialize */
>>>>> +                host_and_vcpu_must_have(serialize);
>>>>> +                asm volatile ( ".byte 0x0f, 0x01, 0xe8" );
>>>> There is very little need for an actual implementation here.  The VMExit
>>>> to get here is good enough.
>>>>
>>>> The only question is whether pre-unrestricted_guest Intel boxes are
>>>> liable to find this in real mode code.
>>> Apart from this we also shouldn't assume HVM in the core emulator,
>>> I think.
>> It's not assuming HVM.  It's just that there is no way we can end up
>> emulating this instruction from PV context.
>>
>> If we could end up here in PV context, the exception causing us to
>> emulate to begin with would be good enough as well.
> With the current way of where/how emulation gets involved -
> yes. I'd like to remind you though of the 4-insn window
> shadow code tries to emulate over for PAE guests. There
> is no intervening #VMEXIT there.
>
> So do you want me to drop the asm() then, and switch from
> host_and_vcpu_must_have(serialize) to just
> vcpu_must_have(serialize)?

No - keep it as is.  This isn't a fastpath, and it is much safer to
assume there might be something we've forgotten.  (Or perhaps some new
future behaviour included in the definition of architecturally serialising).

~Andrew


^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH v8 08/12] x86emul: support FLDENV and FRSTOR
  2020-05-08 15:04     ` Jan Beulich
@ 2020-05-08 16:21       ` Roger Pau Monné
  2020-05-11  7:29         ` Jan Beulich
  2020-05-08 18:29       ` Andrew Cooper
  1 sibling, 1 reply; 51+ messages in thread
From: Roger Pau Monné @ 2020-05-08 16:21 UTC (permalink / raw)
  To: Jan Beulich; +Cc: xen-devel, Wei Liu, Andrew Cooper

On Fri, May 08, 2020 at 05:04:02PM +0200, Jan Beulich wrote:
> On 08.05.2020 15:37, Roger Pau Monné wrote:
> > On Tue, May 05, 2020 at 10:16:20AM +0200, Jan Beulich wrote:
> >> --- a/tools/tests/x86_emulator/test_x86_emulator.c
> >> +++ b/tools/tests/x86_emulator/test_x86_emulator.c
> >> @@ -11648,6 +11651,89 @@ int x86_emul_blk(
> >>  
> >>  #ifndef X86EMUL_NO_FPU
> >>  
> >> +    case blk_fld:
> >> +        ASSERT(!data);
> >> +
> >> +        /* state->rex_prefix carries CR0.PE && !EFLAGS.VM setting */
> >> +        switch ( bytes )
> >> +        {
> >> +        case sizeof(fpstate.env):
> >> +        case sizeof(fpstate):
> >> +            memcpy(&fpstate.env, ptr, sizeof(fpstate.env));
> >> +            if ( !state->rex_prefix )
> >> +            {
> >> +                unsigned int fip = fpstate.env.mode.real.fip_lo +
> >> +                                   (fpstate.env.mode.real.fip_hi << 16);
> >> +                unsigned int fdp = fpstate.env.mode.real.fdp_lo +
> >> +                                   (fpstate.env.mode.real.fdp_hi << 16);
> >> +                unsigned int fop = fpstate.env.mode.real.fop;
> >> +
> >> +                fpstate.env.mode.prot.fip = fip & 0xf;
> >> +                fpstate.env.mode.prot.fcs = fip >> 4;
> >> +                fpstate.env.mode.prot.fop = fop;
> >> +                fpstate.env.mode.prot.fdp = fdp & 0xf;
> >> +                fpstate.env.mode.prot.fds = fdp >> 4;
> > 
> > I've found the layouts in the SDM vol. 1, but I haven't been able to
> > found the translation mechanism from real to protected. Could you
> > maybe add a reference here?
> 
> A reference to some piece of documentation? I don't think this
> is spelled out anywhere. It's also only one of various possible
> ways of doing the translation, but among them the most flexible
> one for possible consumers of the data (because of using the
> smallest possible offsets into the segments).

Having this written down as a comment would help, but maybe that's
just because I'm not familiar at all with all this stuff.

Again, likely a very stupid question, but I would expect:

fpstate.env.mode.prot.fip = fip;

Without the mask.

> >> +            }
> >> +
> >> +            if ( bytes == sizeof(fpstate.env) )
> >> +                ptr = NULL;
> >> +            else
> >> +                ptr += sizeof(fpstate.env);
> >> +            break;
> >> +
> >> +        case sizeof(struct x87_env16):
> >> +        case sizeof(struct x87_env16) + sizeof(fpstate.freg):
> >> +        {
> >> +            const struct x87_env16 *env = ptr;
> >> +
> >> +            fpstate.env.fcw = env->fcw;
> >> +            fpstate.env.fsw = env->fsw;
> >> +            fpstate.env.ftw = env->ftw;
> >> +
> >> +            if ( state->rex_prefix )
> >> +            {
> >> +                fpstate.env.mode.prot.fip = env->mode.prot.fip;
> >> +                fpstate.env.mode.prot.fcs = env->mode.prot.fcs;
> >> +                fpstate.env.mode.prot.fdp = env->mode.prot.fdp;
> >> +                fpstate.env.mode.prot.fds = env->mode.prot.fds;
> >> +                fpstate.env.mode.prot.fop = 0; /* unknown */
> >> +            }
> >> +            else
> >> +            {
> >> +                unsigned int fip = env->mode.real.fip_lo +
> >> +                                   (env->mode.real.fip_hi << 16);
> >> +                unsigned int fdp = env->mode.real.fdp_lo +
> >> +                                   (env->mode.real.fdp_hi << 16);
> >> +                unsigned int fop = env->mode.real.fop;
> >> +
> >> +                fpstate.env.mode.prot.fip = fip & 0xf;
> >> +                fpstate.env.mode.prot.fcs = fip >> 4;
> >> +                fpstate.env.mode.prot.fop = fop;
> >> +                fpstate.env.mode.prot.fdp = fdp & 0xf;
> >> +                fpstate.env.mode.prot.fds = fdp >> 4;
> > 
> > This looks mostly the same as the translation done above, so maybe
> > could be abstracted anyway in a macro to avoid the code repetition?
> > (ie: fpstate_real_to_prot(src, dst) or some such).
> 
> Just the 5 assignments could be put in an inline function, but
> if we also wanted to abstract away the declarations with their
> initializers, it would need to be a macro because of the
> different types of fpstate.env and *env. While I'd generally
> prefer inline functions, the macro would have the benefit that
> it could be #define-d / #undef-d right inside this case block.
> Thoughts? 

I think a macro would be fine IMO.
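
Something like this maybe (only a sketch):

#define fpstate_real_to_prot(env, dst) do { \
    unsigned int fip_ = (env)->mode.real.fip_lo + \
                        ((env)->mode.real.fip_hi << 16); \
    unsigned int fdp_ = (env)->mode.real.fdp_lo + \
                        ((env)->mode.real.fdp_hi << 16); \
\
    (dst)->mode.prot.fip = fip_ & 0xf; \
    (dst)->mode.prot.fcs = fip_ >> 4; \
    (dst)->mode.prot.fop = (env)->mode.real.fop; \
    (dst)->mode.prot.fdp = fdp_ & 0xf; \
    (dst)->mode.prot.fds = fdp_ >> 4; \
} while ( false )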

Thanks, Roger.


^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH v8 07/12] x86emul: support FNSTENV and FNSAVE
  2020-05-05  8:15 ` [PATCH v8 07/12] x86emul: support FNSTENV and FNSAVE Jan Beulich
  2020-05-05 12:36   ` Jan Beulich
@ 2020-05-08 17:58   ` Andrew Cooper
  2020-05-13 12:07     ` Jan Beulich
  1 sibling, 1 reply; 51+ messages in thread
From: Andrew Cooper @ 2020-05-08 17:58 UTC (permalink / raw)
  To: Jan Beulich, xen-devel; +Cc: Wei Liu, Roger Pau Monne

On 05/05/2020 09:15, Jan Beulich wrote:
> To avoid introducing another boolean into emulator state, the
> rex_prefix field gets (ab)used to convey the real/VM86 vs protected mode
> info (affecting structure layout, albeit not size) to x86_emul_blk().
>
> Signed-off-by: Jan Beulich <jbeulich@suse.com>
> ---
> TBD: The full 16-bit padding fields in the 32-bit structures get filled
>      with all ones by modern CPUs (i.e. other than the comment says for

You really mean "unlike" here, rather than "other".  They do not have
the same meaning in this context.

(I think you're also missing a "what", which I'm guessing is just an
oversight.)

>      FIP and FDP). We may want to mirror this as well (for the real mode
>      variant), even if those fields' contents are unspecified.

This is surprising behaviour, but I expect it dates back to external x87
processors and default MMIO behaviour.

If it is entirely consistent, it might be nice to match.  OTOH, the
manuals are very clear that it is reserved, which I think gives us the
liberty to use the easier implementation.

> ---
> v7: New.
>
> --- a/xen/arch/x86/x86_emulate/x86_emulate.c
> +++ b/xen/arch/x86/x86_emulate/x86_emulate.c
> @@ -897,6 +900,50 @@ struct x86_emulate_state {
>  #define PTR_POISON NULL /* 32-bit builds are for user-space, so NULL is OK. */
>  #endif
>  
> +#ifndef X86EMUL_NO_FPU
> +struct x87_env16 {
> +    uint16_t fcw;
> +    uint16_t fsw;
> +    uint16_t ftw;
> +    union {
> +        struct {
> +            uint16_t fip_lo;
> +            uint16_t fop:11, :1, fip_hi:4;
> +            uint16_t fdp_lo;
> +            uint16_t :12, fdp_hi:4;
> +        } real;
> +        struct {
> +            uint16_t fip;
> +            uint16_t fcs;
> +            uint16_t fdp;
> +            uint16_t fds;
> +        } prot;
> +    } mode;
> +};
> +
> +struct x87_env32 {
> +    uint32_t fcw:16, :16;
> +    uint32_t fsw:16, :16;
> +    uint32_t ftw:16, :16;

uint16_t fcw, :16;
uint16_t fsw, :16;
uint16_t ftw, :16;

which reduces the number of 16 bit bitfields.

> +    union {
> +        struct {
> +            /* some CPUs/FPUs also store the full FIP here */
> +            uint32_t fip_lo:16, :16;
> +            uint32_t fop:11, :1, fip_hi:16, :4;
> +            /* some CPUs/FPUs also store the full FDP here */
> +            uint32_t fdp_lo:16, :16;
> +            uint32_t :12, fdp_hi:16, :4;

Annoyingly, two of these lines can't use uint16_t.  I'm torn as to
whether to suggest converting the other two which can.

> +        } real;
> +        struct {
> +            uint32_t fip;
> +            uint32_t fcs:16, fop:11, :5;
> +            uint32_t fdp;
> +            uint32_t fds:16, :16;

These two can be converted safely.

> @@ -4912,9 +4959,19 @@ x86_emulate(
>                      goto done;
>                  emulate_fpu_insn_memsrc(b, modrm_reg & 7, src.val);
>                  break;
> -            case 6: /* fnstenv - TODO */
> +            case 6: /* fnstenv */
> +                fail_if(!ops->blk);
> +                state->blk = blk_fst;
> +                /* REX is meaningless for this insn by this point. */
> +                rex_prefix = in_protmode(ctxt, ops);

Code like this is why I have such a strong objection to obfuscating macros.

It reads as if you're updating a local variable, alongside a comment
explaining that it is meaningless.

It is critically important for clarity that the comment state that
you're (ab)using the field to pass information into ->blk(), and I'd go
so far as suggesting

/*state->*/rex_prefix = in_protmode(ctxt, ops);

to reinforce the point that rex_prefix isn't a local variable, seeing
the obfuscation prevents a real state->rex_prefix from working.

> +                if ( (rc = ops->blk(ea.mem.seg, ea.mem.off, NULL,
> +                                    op_bytes > 2 ? sizeof(struct x87_env32)
> +                                                 : sizeof(struct x87_env16),
> +                                    &_regs.eflags,
> +                                    state, ctxt)) != X86EMUL_OKAY )
> +                    goto done;
>                  state->fpu_ctrl = true;
> -                goto unimplemented_insn;
> +                break;
>              case 7: /* fnstcw m2byte */
>                  state->fpu_ctrl = true;
>              fpu_memdst16:
> @@ -5068,9 +5125,21 @@ x86_emulate(
>                  emulate_fpu_insn_memdst(b, modrm_reg & 7, dst.val);
>                  break;
>              case 4: /* frstor - TODO */
> -            case 6: /* fnsave - TODO */
>                  state->fpu_ctrl = true;
>                  goto unimplemented_insn;
> +            case 6: /* fnsave */
> +                fail_if(!ops->blk);
> +                state->blk = blk_fst;
> +                /* REX is meaningless for this insn by this point. */
> +                rex_prefix = in_protmode(ctxt, ops);
> +                if ( (rc = ops->blk(ea.mem.seg, ea.mem.off, NULL,
> +                                    op_bytes > 2 ? sizeof(struct x87_env32) + 80
> +                                                 : sizeof(struct x87_env16) + 80,
> +                                    &_regs.eflags,
> +                                    state, ctxt)) != X86EMUL_OKAY )
> +                    goto done;
> +                state->fpu_ctrl = true;
> +                break;
>              case 7: /* fnstsw m2byte */
>                  state->fpu_ctrl = true;
>                  goto fpu_memdst16;
> @@ -11542,6 +11611,12 @@ int x86_emul_blk(
>      switch ( state->blk )
>      {
>          bool zf;
> +        struct {
> +            struct x87_env32 env;
> +            struct {
> +               uint8_t bytes[10];
> +            } freg[8];
> +        } fpstate;
>  
>          /*
>           * Throughout this switch(), memory clobbers are used to compensate
> @@ -11571,6 +11646,93 @@ int x86_emul_blk(
>              *eflags |= X86_EFLAGS_ZF;
>          break;
>  
> +#ifndef X86EMUL_NO_FPU
> +
> +    case blk_fst:
> +        ASSERT(!data);
> +
> +        if ( bytes > sizeof(fpstate.env) )
> +            asm ( "fnsave %0" : "=m" (fpstate) );
> +        else
> +            asm ( "fnstenv %0" : "=m" (fpstate.env) );

We have 4 possible sizes to deal with here - the 16 and 32bit formats of
prot vs real/vm86 modes, and it is not clear (code wise) why
sizeof(fpstate.env) is a suitable boundary.

Given that these are legacy instructions and not a hotpath in the
slightest, it is possibly faster (by removing the branch) and definitely
more obvious to use fnsave unconditionally, and derive all of the
smaller layouts that way.
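
i.e. (untested):

    asm ( "fnsave %0" : "=m" (fpstate) );

with the env-only (FNSTENV) cases then simply copying out the leading
portion of fpstate.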

Critically however, it prevents us from needing a CVE/XSA if ... [bottom
comment]

> +
> +        /* state->rex_prefix carries CR0.PE && !EFLAGS.VM setting */
> +        switch ( bytes )
> +        {
> +        case sizeof(fpstate.env):
> +        case sizeof(fpstate):

These case labels don't match up in any kind of obvious way to the
caller.  I think you need /* 32bit FNSAVE */ and /* 32bit FNSTENV */
here, and

> +            if ( !state->rex_prefix )

if ( !state->rex_prefix ) /* Convert 32bit prot to 32bit real/vm86 format */

here.

> +            {
> +                unsigned int fip = fpstate.env.mode.prot.fip +
> +                                   (fpstate.env.mode.prot.fcs << 4);
> +                unsigned int fdp = fpstate.env.mode.prot.fdp +
> +                                   (fpstate.env.mode.prot.fds << 4);
> +                unsigned int fop = fpstate.env.mode.prot.fop;
> +
> +                memset(&fpstate.env.mode, 0, sizeof(fpstate.env.mode));
> +                fpstate.env.mode.real.fip_lo = fip;
> +                fpstate.env.mode.real.fip_hi = fip >> 16;
> +                fpstate.env.mode.real.fop = fop;
> +                fpstate.env.mode.real.fdp_lo = fdp;
> +                fpstate.env.mode.real.fdp_hi = fdp >> 16;
> +            }
> +            memcpy(ptr, &fpstate.env, sizeof(fpstate.env));
> +            if ( bytes == sizeof(fpstate.env) )
> +                ptr = NULL;
> +            else
> +                ptr += sizeof(fpstate.env);
> +            break;
> +
> +        case sizeof(struct x87_env16):
> +        case sizeof(struct x87_env16) + sizeof(fpstate.freg):

Similarly, this wants /* 16bit FNSAVE */ and /* 16bit FNSTENV */.  I'm
tempted to suggest a literal 80 rather than sizeof(fpstate.freg) to
match the caller.

> +            if ( state->rex_prefix )

/* Convert 32bit prot to 16bit prot format */

> +            {
> +                struct x87_env16 *env = ptr;
> +
> +                env->fcw = fpstate.env.fcw;
> +                env->fsw = fpstate.env.fsw;
> +                env->ftw = fpstate.env.ftw;
> +                env->mode.prot.fip = fpstate.env.mode.prot.fip;
> +                env->mode.prot.fcs = fpstate.env.mode.prot.fcs;
> +                env->mode.prot.fdp = fpstate.env.mode.prot.fdp;
> +                env->mode.prot.fds = fpstate.env.mode.prot.fds;
> +            }
> +            else
> +            {

/* Convert 32bit prot to 16bit real/vm86 format */

> +                unsigned int fip = fpstate.env.mode.prot.fip +
> +                                   (fpstate.env.mode.prot.fcs << 4);
> +                unsigned int fdp = fpstate.env.mode.prot.fdp +
> +                                   (fpstate.env.mode.prot.fds << 4);
> +                struct x87_env16 env = {
> +                    .fcw = fpstate.env.fcw,
> +                    .fsw = fpstate.env.fsw,
> +                    .ftw = fpstate.env.ftw,
> +                    .mode.real.fip_lo = fip,
> +                    .mode.real.fip_hi = fip >> 16,
> +                    .mode.real.fop = fpstate.env.mode.prot.fop,
> +                    .mode.real.fdp_lo = fdp,
> +                    .mode.real.fdp_hi = fdp >> 16
> +                };
> +
> +                memcpy(ptr, &env, sizeof(env));
> +            }
> +            if ( bytes == sizeof(struct x87_env16) )
> +                ptr = NULL;
> +            else
> +                ptr += sizeof(struct x87_env16);
> +            break;
> +
> +        default:
> +            ASSERT_UNREACHABLE();
> +            return X86EMUL_UNHANDLEABLE;
> +        }
> +
> +        if ( ptr )
> +            memcpy(ptr, fpstate.freg, sizeof(fpstate.freg));

... we get here accidentally, and then copy stack rubble into the guest.

~Andrew

> +        break;
> +
> +#endif /* X86EMUL_NO_FPU */
> +
>      case blk_movdir:
>          switch ( bytes )
>          {
>



^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH v8 08/12] x86emul: support FLDENV and FRSTOR
  2020-05-05  8:16 ` [PATCH v8 08/12] x86emul: support FLDENV and FRSTOR Jan Beulich
  2020-05-08 13:37   ` Roger Pau Monné
@ 2020-05-08 18:19   ` Andrew Cooper
  1 sibling, 0 replies; 51+ messages in thread
From: Andrew Cooper @ 2020-05-08 18:19 UTC (permalink / raw)
  To: Jan Beulich, xen-devel; +Cc: Wei Liu, Roger Pau Monne

On 05/05/2020 09:16, Jan Beulich wrote:
> While the Intel SDM claims that FRSTOR itself may raise #MF upon
> completion, this was confirmed by Intel to be a doc error which will be
> corrected in due course; behavior is like FLDENV, and like old hard copy
> manuals describe it. Otherwise we'd have to emulate the insn by filling
> st(N) in suitable order, followed by FLDENV.

I wouldn't bother calling this out at all.  We know the doc is going to
be corrected in the next revision, and "what we would have done if the
docs error was in fact accurate" only adds confusion to an already
complicated change.

> --- a/xen/arch/x86/x86_emulate/x86_emulate.c
> +++ b/xen/arch/x86/x86_emulate/x86_emulate.c
> @@ -857,6 +857,7 @@ struct x86_emulate_state {
>          blk_NONE,
>          blk_enqcmd,
>  #ifndef X86EMUL_NO_FPU
> +        blk_fld, /* FLDENV, FRSTOR */
>          blk_fst, /* FNSTENV, FNSAVE */
>  #endif
>          blk_movdir,
> @@ -4948,21 +4949,14 @@ x86_emulate(
>                  dst.bytes = 4;
>                  emulate_fpu_insn_memdst(b, modrm_reg & 7, dst.val);
>                  break;
> -            case 4: /* fldenv - TODO */
> -                state->fpu_ctrl = true;
> -                goto unimplemented_insn;
> -            case 5: /* fldcw m2byte */
> -                state->fpu_ctrl = true;
> -            fpu_memsrc16:
> -                if ( (rc = ops->read(ea.mem.seg, ea.mem.off, &src.val,
> -                                     2, ctxt)) != X86EMUL_OKAY )
> -                    goto done;
> -                emulate_fpu_insn_memsrc(b, modrm_reg & 7, src.val);
> -                break;

The commit message should however note that the larger-than-expected
diff is purely code motion for case 5, to arrange for fldenv to fall
through into fnstenv.

Alternatively, could be pulled out into an earlier patch.

> +            case 4: /* fldenv */
> +                /* Raise #MF now if there are pending unmasked exceptions. */
> +                emulate_fpu_insn_stub(0xd9, 0xd0 /* fnop */);
> +                /* fall through */
>              case 6: /* fnstenv */
>                  fail_if(!ops->blk);
> -                state->blk = blk_fst;
> -                /* REX is meaningless for this insn by this point. */
> +                state->blk = modrm_reg & 2 ? blk_fst : blk_fld;
> +                /* REX is meaningless for these insns by this point. */
>                  rex_prefix = in_protmode(ctxt, ops);
>                  if ( (rc = ops->blk(ea.mem.seg, ea.mem.off, NULL,
>                                      op_bytes > 2 ? sizeof(struct x87_env32)
> @@ -4972,6 +4966,14 @@ x86_emulate(
>                      goto done;
>                  state->fpu_ctrl = true;
>                  break;
> +            case 5: /* fldcw m2byte */
> +                state->fpu_ctrl = true;
> +            fpu_memsrc16:
> +                if ( (rc = ops->read(ea.mem.seg, ea.mem.off, &src.val,
> +                                     2, ctxt)) != X86EMUL_OKAY )
> +                    goto done;
> +                emulate_fpu_insn_memsrc(b, modrm_reg & 7, src.val);
> +                break;
>              case 7: /* fnstcw m2byte */
>                  state->fpu_ctrl = true;
>              fpu_memdst16:
> @@ -5124,13 +5126,14 @@ x86_emulate(
>                  dst.bytes = 8;
>                  emulate_fpu_insn_memdst(b, modrm_reg & 7, dst.val);
>                  break;
> -            case 4: /* frstor - TODO */
> -                state->fpu_ctrl = true;
> -                goto unimplemented_insn;
> +            case 4: /* frstor */
> +                /* Raise #MF now if there are pending unmasked exceptions. */
> +                emulate_fpu_insn_stub(0xd9, 0xd0 /* fnop */);
> +                /* fall through */
>              case 6: /* fnsave */
>                  fail_if(!ops->blk);
> -                state->blk = blk_fst;
> -                /* REX is meaningless for this insn by this point. */
> +                state->blk = modrm_reg & 2 ? blk_fst : blk_fld;
> +                /* REX is meaningless for these insns by this point. */
>                  rex_prefix = in_protmode(ctxt, ops);
>                  if ( (rc = ops->blk(ea.mem.seg, ea.mem.off, NULL,
>                                      op_bytes > 2 ? sizeof(struct x87_env32) + 80
> @@ -11648,6 +11651,89 @@ int x86_emul_blk(
>  
>  #ifndef X86EMUL_NO_FPU
>  
> +    case blk_fld:
> +        ASSERT(!data);
> +
> +        /* state->rex_prefix carries CR0.PE && !EFLAGS.VM setting */
> +        switch ( bytes )
> +        {
> +        case sizeof(fpstate.env):
> +        case sizeof(fpstate):
> +            memcpy(&fpstate.env, ptr, sizeof(fpstate.env));
> +            if ( !state->rex_prefix )
> +            {
> +                unsigned int fip = fpstate.env.mode.real.fip_lo +
> +                                   (fpstate.env.mode.real.fip_hi << 16);
> +                unsigned int fdp = fpstate.env.mode.real.fdp_lo +
> +                                   (fpstate.env.mode.real.fdp_hi << 16);
> +                unsigned int fop = fpstate.env.mode.real.fop;
> +
> +                fpstate.env.mode.prot.fip = fip & 0xf;
> +                fpstate.env.mode.prot.fcs = fip >> 4;
> +                fpstate.env.mode.prot.fop = fop;
> +                fpstate.env.mode.prot.fdp = fdp & 0xf;
> +                fpstate.env.mode.prot.fds = fdp >> 4;

The same comments about comments from the previous patch apply here.

~Andrew


^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH v8 08/12] x86emul: support FLDENV and FRSTOR
  2020-05-08 15:04     ` Jan Beulich
  2020-05-08 16:21       ` Roger Pau Monné
@ 2020-05-08 18:29       ` Andrew Cooper
  2020-05-11  7:25         ` Jan Beulich
  1 sibling, 1 reply; 51+ messages in thread
From: Andrew Cooper @ 2020-05-08 18:29 UTC (permalink / raw)
  To: Jan Beulich, Roger Pau Monné; +Cc: xen-devel, Wei Liu

On 08/05/2020 16:04, Jan Beulich wrote:
>>> +            }
>>> +
>>> +            if ( bytes == sizeof(fpstate.env) )
>>> +                ptr = NULL;
>>> +            else
>>> +                ptr += sizeof(fpstate.env);
>>> +            break;
>>> +
>>> +        case sizeof(struct x87_env16):
>>> +        case sizeof(struct x87_env16) + sizeof(fpstate.freg):
>>> +        {
>>> +            const struct x87_env16 *env = ptr;
>>> +
>>> +            fpstate.env.fcw = env->fcw;
>>> +            fpstate.env.fsw = env->fsw;
>>> +            fpstate.env.ftw = env->ftw;
>>> +
>>> +            if ( state->rex_prefix )
>>> +            {
>>> +                fpstate.env.mode.prot.fip = env->mode.prot.fip;
>>> +                fpstate.env.mode.prot.fcs = env->mode.prot.fcs;
>>> +                fpstate.env.mode.prot.fdp = env->mode.prot.fdp;
>>> +                fpstate.env.mode.prot.fds = env->mode.prot.fds;
>>> +                fpstate.env.mode.prot.fop = 0; /* unknown */
>>> +            }
>>> +            else
>>> +            {
>>> +                unsigned int fip = env->mode.real.fip_lo +
>>> +                                   (env->mode.real.fip_hi << 16);
>>> +                unsigned int fdp = env->mode.real.fdp_lo +
>>> +                                   (env->mode.real.fdp_hi << 16);
>>> +                unsigned int fop = env->mode.real.fop;
>>> +
>>> +                fpstate.env.mode.prot.fip = fip & 0xf;
>>> +                fpstate.env.mode.prot.fcs = fip >> 4;
>>> +                fpstate.env.mode.prot.fop = fop;
>>> +                fpstate.env.mode.prot.fdp = fdp & 0xf;
>>> +                fpstate.env.mode.prot.fds = fdp >> 4;
>> This looks mostly the same as the translation done above, so maybe
>> could be abstracted anyway in a macro to avoid the code repetition?
>> (ie: fpstate_real_to_prot(src, dst) or some such).
> Just the 5 assignments could be put in an inline function, but
> if we also wanted to abstract away the declarations with their
> initializers, it would need to be a macro because of the
> different types of fpstate.env and *env. While I'd generally
> prefer inline functions, the macro would have the benefit that
> it could be #define-d / #undef-d right inside this case block.
> Thoughts?

Code like this is large in terms of volume, but it is completely crystal
clear (with the requested comments in place) and easy to follow.

I don't see how attempting to abstract out two small portions is going
to be an improvement.

~Andrew


^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH v8 09/12] x86emul: support FXSAVE/FXRSTOR
  2020-05-05  8:16 ` [PATCH v8 09/12] x86emul: support FXSAVE/FXRSTOR Jan Beulich
@ 2020-05-08 19:31   ` Andrew Cooper
  2020-05-13 13:24     ` Jan Beulich
  0 siblings, 1 reply; 51+ messages in thread
From: Andrew Cooper @ 2020-05-08 19:31 UTC (permalink / raw)
  To: Jan Beulich, xen-devel; +Cc: Wei Liu, Roger Pau Monne

On 05/05/2020 09:16, Jan Beulich wrote:
> Note that FPU selector handling as well as MXCSR mask saving for now
> does not honor differences between host and guest visible featuresets.
>
> While for Intel operation of the insns with CR4.OSFXSR=0 is
> implementation dependent, use the easiest solution there: Simply don't
> look at the bit in the first place. For AMD and alike the behavior is
> well defined, so it gets handled together with FFXSR.
>
> Signed-off-by: Jan Beulich <jbeulich@suse.com>
> ---
> v8: Respect EFER.FFXSE and CR4.OSFXSR. Correct wrong X86EMUL_NO_*
>     dependencies. Reduce #ifdef-ary.
> v7: New.
>
> --- a/xen/arch/x86/x86_emulate/x86_emulate.c
> +++ b/xen/arch/x86/x86_emulate/x86_emulate.c
> @@ -953,6 +958,29 @@ typedef union {
>      uint32_t data32[16];
>  } mmval_t;
>  
> +struct x86_fxsr {
> +    uint16_t fcw;
> +    uint16_t fsw;
> +    uint16_t ftw:8, :8;

uint8_t

> +    uint16_t fop;
> +    union {
> +        struct {
> +            uint32_t offs;
> +            uint32_t sel:16, :16;

uint16_t

> +        };
> +        uint64_t addr;
> +    } fip, fdp;
> +    uint32_t mxcsr;
> +    uint32_t mxcsr_mask;
> +    struct {
> +        uint8_t data[10];
> +        uint8_t _[6];

I'd be tempted to suggest uint16_t :16, :16, :16; here, which gets rid
of the named field, or naming it rsvd explicitly to match below.

> +    } fpreg[8];
> +    uint64_t __attribute__ ((aligned(16))) xmm[16][2];
> +    uint64_t _[6];

This field however is used below.  It would be clearer being named 'rsvd'.

> +    uint64_t avl[6];
> +};
> +
>  /*
>   * While proper alignment gets specified above, this doesn't get honored by
>   * the compiler for automatic variables. Use this helper to instantiate a
> @@ -1910,6 +1938,7 @@ amd_like(const struct x86_emulate_ctxt *
>  #define vcpu_has_cmov()        (ctxt->cpuid->basic.cmov)
>  #define vcpu_has_clflush()     (ctxt->cpuid->basic.clflush)
>  #define vcpu_has_mmx()         (ctxt->cpuid->basic.mmx)
> +#define vcpu_has_fxsr()        (ctxt->cpuid->basic.fxsr)
>  #define vcpu_has_sse()         (ctxt->cpuid->basic.sse)
>  #define vcpu_has_sse2()        (ctxt->cpuid->basic.sse2)
>  #define vcpu_has_sse3()        (ctxt->cpuid->basic.sse3)
> @@ -8125,6 +8154,47 @@ x86_emulate(
>      case X86EMUL_OPC(0x0f, 0xae): case X86EMUL_OPC_66(0x0f, 0xae): /* Grp15 */
>          switch ( modrm_reg & 7 )
>          {
> +#if !defined(X86EMUL_NO_FPU) || !defined(X86EMUL_NO_MMX) || \
> +    !defined(X86EMUL_NO_SIMD)
> +        case 0: /* fxsave */
> +        case 1: /* fxrstor */
> +            generate_exception_if(vex.pfx, EXC_UD);
> +            vcpu_must_have(fxsr);
> +            generate_exception_if(ea.type != OP_MEM, EXC_UD);
> +            generate_exception_if(!is_aligned(ea.mem.seg, ea.mem.off, 16,
> +                                              ctxt, ops),
> +                                  EXC_GP, 0);
> +            fail_if(!ops->blk);
> +            op_bytes =
> +#ifdef __x86_64__
> +                !mode_64bit() ? offsetof(struct x86_fxsr, xmm[8]) :
> +#endif
> +                sizeof(struct x86_fxsr);
> +            if ( amd_like(ctxt) )
> +            {
> +                if ( !ops->read_cr ||
> +                     ops->read_cr(4, &cr4, ctxt) != X86EMUL_OKAY )
> +                    cr4 = X86_CR4_OSFXSR;

Why do we want to assume OSFXSR in the case of a read_cr() failure,
rather than bailing on the entire instruction?

> +                if ( !ops->read_msr ||
> +                     ops->read_msr(MSR_EFER, &msr_val, ctxt) != X86EMUL_OKAY )
> +                    msr_val = 0;
> +                if ( !(cr4 & X86_CR4_OSFXSR) ||
> +                     (mode_64bit() && mode_ring0() && (msr_val & EFER_FFXSE)) )
> +                    op_bytes = offsetof(struct x86_fxsr, xmm[0]);

This is a very peculiar optimisation in AMD processors...  (but your
logic here does agree with the description AFAICT)

> +            }
> +            /*
> +             * This could also be X86EMUL_FPU_mmx, but it shouldn't be
> +             * X86EMUL_FPU_xmm, as we don't want CR4.OSFXSR checked.
> +             */
> +            get_fpu(X86EMUL_FPU_fpu);
> +            state->blk = modrm_reg & 1 ? blk_fxrstor : blk_fxsave;
> +            if ( (rc = ops->blk(ea.mem.seg, ea.mem.off, NULL,
> +                                sizeof(struct x86_fxsr), &_regs.eflags,
> +                                state, ctxt)) != X86EMUL_OKAY )
> +                goto done;
> +            break;
> +#endif /* X86EMUL_NO_{FPU,MMX,SIMD} */
> +
>  #ifndef X86EMUL_NO_SIMD
>          case 2: /* ldmxcsr */
>              generate_exception_if(vex.pfx, EXC_UD);
> @@ -11611,6 +11681,8 @@ int x86_emul_blk(
>      struct x86_emulate_state *state,
>      struct x86_emulate_ctxt *ctxt)
>  {
> +    int rc = X86EMUL_OKAY;
> +
>      switch ( state->blk )
>      {
>          bool zf;
> @@ -11819,6 +11891,77 @@ int x86_emul_blk(
>  
>  #endif /* X86EMUL_NO_FPU */
>  
> +#if !defined(X86EMUL_NO_FPU) || !defined(X86EMUL_NO_MMX) || \
> +    !defined(X86EMUL_NO_SIMD)
> +
> +    case blk_fxrstor:
> +    {
> +        struct x86_fxsr *fxsr = FXSAVE_AREA;
> +
> +        ASSERT(!data);
> +        ASSERT(bytes == sizeof(*fxsr));
> +        ASSERT(state->op_bytes <= bytes);
> +
> +        if ( state->op_bytes < sizeof(*fxsr) )
> +        {
> +            if ( state->rex_prefix & REX_W )
> +            {
> +                /*
> +                 * The only way to force fxsaveq on a wide range of gas
> +                 * versions. On older versions the rex64 prefix works only if
> +                 * we force an addressing mode that doesn't require extended
> +                 * registers.
> +                 */
> +                asm volatile ( ".byte 0x48; fxsave (%1)"
> +                               : "=m" (*fxsr) : "R" (fxsr) );
> +            }
> +            else
> +                asm volatile ( "fxsave %0" : "=m" (*fxsr) );
> +        }
> +
> +        memcpy(fxsr, ptr, min(state->op_bytes,
> +                              (unsigned int)offsetof(struct x86_fxsr, _)));
> +        memset(fxsr->_, 0, sizeof(*fxsr) - offsetof(struct x86_fxsr, _));

I'm completely lost trying to follow what's going on here.  Why are we
constructing something different to what the guest gave us?

> +
> +        generate_exception_if(fxsr->mxcsr & ~mxcsr_mask, EXC_GP, 0);
> +
> +        if ( state->rex_prefix & REX_W )
> +        {
> +            /* See above for why operand/constraints are this way. */
> +            asm volatile ( ".byte 0x48; fxrstor (%0)"
> +                           :: "R" (fxsr), "m" (*fxsr) );
> +        }
> +        else
> +            asm volatile ( "fxrstor %0" :: "m" (*fxsr) );
> +        break;
> +    }
> +
> +    case blk_fxsave:
> +    {
> +        struct x86_fxsr *fxsr = FXSAVE_AREA;
> +
> +        ASSERT(!data);
> +        ASSERT(bytes == sizeof(*fxsr));
> +        ASSERT(state->op_bytes <= bytes);
> +
> +        if ( state->rex_prefix & REX_W )
> +        {
> +            /* See above for why operand/constraint are this way. */
> +            asm volatile ( ".byte 0x48; fxsave (%0)"
> +                           :: "R" (state->op_bytes < sizeof(*fxsr) ? fxsr : ptr)
> +                           : "memory" );
> +        }
> +        else
> +            asm volatile ( "fxsave (%0)"
> +                           :: "r" (state->op_bytes < sizeof(*fxsr) ? fxsr : ptr)
> +                           : "memory" );
> +        if ( state->op_bytes < sizeof(*fxsr) )
> +            memcpy(ptr, fxsr, state->op_bytes);

I think this logic would be clearer to follow with:

void *buf = state->op_bytes < sizeof(*fxsr) ? fxsr : ptr;

...

if ( buf != ptr )
    memcpy(ptr, fxsr, state->op_bytes);

This more clearly highlights the "we either fxsave'd straight into the
destination pointer, or into a local buffer if we only want a subset"
property.

> --- a/xen/arch/x86/x86_emulate.c
> +++ b/xen/arch/x86/x86_emulate.c
> @@ -42,6 +42,8 @@
>      }                                                      \
>  })
>  
> +#define FXSAVE_AREA current->arch.fpu_ctxt

How safe is this?  Don't we already use this buffer to recover the old
state in the case of an exception?

~Andrew


^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH v8 10/12] x86/HVM: scale MPERF values reported to guests (on AMD)
  2020-05-05  8:18 ` [PATCH v8 10/12] " Jan Beulich
@ 2020-05-08 20:32   ` Andrew Cooper
  0 siblings, 0 replies; 51+ messages in thread
From: Andrew Cooper @ 2020-05-08 20:32 UTC (permalink / raw)
  To: Jan Beulich, xen-devel; +Cc: Wei Liu, Roger Pau Monne

On 05/05/2020 09:18, Jan Beulich wrote:
> AMD's PM specifies that MPERF (and its r/o counterpart) reads are
> affected by the TSC ratio. Hence when processing such reads in software
> we too should scale the values. While we don't currently (yet) expose
> the underlying feature flags, besides us allowing the MSRs to be read
> nevertheless, RDPRU is going to expose the values even to user space.
>
> Furthermore, due to the not exposed feature flags, this change has the
> effect of making properly inaccessible (for reads) the two MSRs.
>
> Note that writes to MPERF (and APERF) continue to be unsupported.

Linux is now using MPERF/APERF for its frequency-invariant scheduling
logic.  Irritatingly, via its read/write alias rather than its read-only
alias.  Even more irritatingly, Intel's reference algorithm recommends
writing to both, despite this being far less efficient than (one
of) AMD's (two) algorithm(s) which tells you just to subtract the values
you last sampled.
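
(The delta-based scheme amounts to roughly the following - variable
names made up:

    freq = ref_freq * (aperf - prev_aperf) / (mperf - prev_mperf);
    prev_aperf = aperf;
    prev_mperf = mperf;

i.e. no MSR writes are needed at all.)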

On the one hand, I'm tempted to suggest that we offer EFRO on Intel and
update Linux to use it.  OTOH, that would VMExit as Intel CPUs don't
understand the EFRO interface.

I can't see any sane way to virtualise the write behaviour for MPERF/APERF.

>
> Signed-off-by: Jan Beulich <jbeulich@suse.com>
> ---
> v3: New.
> ---
> I did consider whether to put the code in guest_rdmsr() instead, but
> decided that it's better to have it next to TSC handling.

Please do put it in guest_rdmsr().  This is code hygiene just as much as
bool_t or style fixes are.

The relationship to TSC is passing at best.

>
> --- a/xen/arch/x86/hvm/hvm.c
> +++ b/xen/arch/x86/hvm/hvm.c
> @@ -3478,6 +3478,22 @@ int hvm_msr_read_intercept(unsigned int
>          *msr_content = v->arch.hvm.msr_tsc_adjust;
>          break;
>  
> +    case MSR_MPERF_RD_ONLY:
> +        if ( !d->arch.cpuid->extd.efro )
> +        {
> +            goto gp_fault;
> +
> +    case MSR_IA32_MPERF:
> +            if ( !(d->arch.cpuid->basic.raw[6].c &
> +                   CPUID6_ECX_APERFMPERF_CAPABILITY) )
> +                goto gp_fault;
> +        }
> +        if ( rdmsr_safe(msr, *msr_content) )
> +            goto gp_fault;
> +        if ( d->arch.cpuid->x86_vendor & (X86_VENDOR_AMD | X86_VENDOR_HYGON) )

I suspect we want to gain amd_like() outside of the emulator.
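Something like this perhaps (sketch only, mirroring the vendor check
above):

static inline bool amd_like(const struct domain *d)
{
    return d->arch.cpuid->x86_vendor & (X86_VENDOR_AMD | X86_VENDOR_HYGON);
}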

> +            *msr_content = hvm_get_guest_tsc_fixed(v, *msr_content);
> +        break;
> +
>      case MSR_APIC_BASE:
>          *msr_content = vcpu_vlapic(v)->hw.apic_base_msr;
>          break;
> --- a/xen/include/asm-x86/msr-index.h
> +++ b/xen/include/asm-x86/msr-index.h
> @@ -405,6 +405,9 @@
>  #define MSR_IA32_MPERF			0x000000e7
>  #define MSR_IA32_APERF			0x000000e8
>  
> +#define MSR_MPERF_RD_ONLY		0xc00000e7
> +#define MSR_APERF_RD_ONLY		0xc00000e8

S/RD_ONLY/RO/ ?  No loss of meaning.  Also, above the legacy line please.

~Andrew

> +
>  #define MSR_IA32_THERM_CONTROL		0x0000019a
>  #define MSR_IA32_THERM_INTERRUPT	0x0000019b
>  #define MSR_IA32_THERM_STATUS		0x0000019c
>



^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH v8 12/12] x86/HVM: don't needlessly intercept APERF/MPERF/TSC MSR reads
  2020-05-05  8:20 ` [PATCH v8 12/12] x86/HVM: don't needlessly intercept APERF/MPERF/TSC MSR reads Jan Beulich
@ 2020-05-08 21:04   ` Andrew Cooper
  2020-05-13 13:35     ` Jan Beulich
  0 siblings, 1 reply; 51+ messages in thread
From: Andrew Cooper @ 2020-05-08 21:04 UTC (permalink / raw)
  To: Jan Beulich, xen-devel; +Cc: Wei Liu, Roger Pau Monne

On 05/05/2020 09:20, Jan Beulich wrote:
> If the hardware can handle accesses, we should allow it to do so. This
> way we can expose EFRO to HVM guests,

I'm reminded now of the conversation I'm sure we've had before, although
I have a feeling it was on IRC.

APERF/MPERF (including the EFRO interface on AMD) are free running
counters but only in C0.  The raw values are not synchronised across
threads.

A vCPU which gets rescheduled has a 50% chance of finding one or
both values going backwards, and a 100% chance of a totally bogus
calculation.

There is no point exposing APERF/MPERF to guests.  It can only be used
safely in hypervisor context, on AMD hardware with a CLGI/STGI region,
or on Intel hardware in an NMI handler if you trust that a machine check
isn't going to ruin your day.

VMs have no way of achieving the sampling requirements, and have a fair
chance of getting a plausible-but-wrong answer.

The only possibility to do this safely is on a VM which is pinned to
pCPU for its lifetime, but even I'm unconvinced of the correctness.

I don't think we should be exposing this functionality to guests at all,
although I might be persuaded if someone wanting to use it in a VM can
provide a concrete justification of why the above problems won't get in
their way.

~Andrew


^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH v8 08/12] x86emul: support FLDENV and FRSTOR
  2020-05-08 18:29       ` Andrew Cooper
@ 2020-05-11  7:25         ` Jan Beulich
  2020-05-11  8:02           ` Roger Pau Monné
  0 siblings, 1 reply; 51+ messages in thread
From: Jan Beulich @ 2020-05-11  7:25 UTC (permalink / raw)
  To: Andrew Cooper, Roger Pau Monné; +Cc: xen-devel, Wei Liu

On 08.05.2020 20:29, Andrew Cooper wrote:
> On 08/05/2020 16:04, Jan Beulich wrote:
>>>> +            }
>>>> +
>>>> +            if ( bytes == sizeof(fpstate.env) )
>>>> +                ptr = NULL;
>>>> +            else
>>>> +                ptr += sizeof(fpstate.env);
>>>> +            break;
>>>> +
>>>> +        case sizeof(struct x87_env16):
>>>> +        case sizeof(struct x87_env16) + sizeof(fpstate.freg):
>>>> +        {
>>>> +            const struct x87_env16 *env = ptr;
>>>> +
>>>> +            fpstate.env.fcw = env->fcw;
>>>> +            fpstate.env.fsw = env->fsw;
>>>> +            fpstate.env.ftw = env->ftw;
>>>> +
>>>> +            if ( state->rex_prefix )
>>>> +            {
>>>> +                fpstate.env.mode.prot.fip = env->mode.prot.fip;
>>>> +                fpstate.env.mode.prot.fcs = env->mode.prot.fcs;
>>>> +                fpstate.env.mode.prot.fdp = env->mode.prot.fdp;
>>>> +                fpstate.env.mode.prot.fds = env->mode.prot.fds;
>>>> +                fpstate.env.mode.prot.fop = 0; /* unknown */
>>>> +            }
>>>> +            else
>>>> +            {
>>>> +                unsigned int fip = env->mode.real.fip_lo +
>>>> +                                   (env->mode.real.fip_hi << 16);
>>>> +                unsigned int fdp = env->mode.real.fdp_lo +
>>>> +                                   (env->mode.real.fdp_hi << 16);
>>>> +                unsigned int fop = env->mode.real.fop;
>>>> +
>>>> +                fpstate.env.mode.prot.fip = fip & 0xf;
>>>> +                fpstate.env.mode.prot.fcs = fip >> 4;
>>>> +                fpstate.env.mode.prot.fop = fop;
>>>> +                fpstate.env.mode.prot.fdp = fdp & 0xf;
>>>> +                fpstate.env.mode.prot.fds = fdp >> 4;
>>> This looks mostly the same as the translation done above, so maybe
>>> could be abstracted anyway in a macro to avoid the code repetition?
>>> (ie: fpstate_real_to_prot(src, dst) or some such).
>> Just the 5 assignments could be put in an inline function, but
>> if we also wanted to abstract away the declarations with their
>> initializers, it would need to be a macro because of the
>> different types of fpstate.env and *env. While I'd generally
>> prefer inline functions, the macro would have the benefit that
>> it could be #define-d / #undef-d right inside this case block.
>> Thoughts?
> 
> Code like this is large in terms of volume, but it is completely crystal
> clear (with the requested comments in place) and easy to follow.
> 
> I don't see how attempting to abstract out two small portions is going
> to be an improvement.

Okay, easier for me if I don't need to touch it. Roger, can you
live with that?

Jan


^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH v8 08/12] x86emul: support FLDENV and FRSTOR
  2020-05-08 16:21       ` Roger Pau Monné
@ 2020-05-11  7:29         ` Jan Beulich
  2020-05-11  9:22           ` Roger Pau Monné
  0 siblings, 1 reply; 51+ messages in thread
From: Jan Beulich @ 2020-05-11  7:29 UTC (permalink / raw)
  To: Roger Pau Monné; +Cc: xen-devel, Wei Liu, Andrew Cooper

On 08.05.2020 18:21, Roger Pau Monné wrote:
> On Fri, May 08, 2020 at 05:04:02PM +0200, Jan Beulich wrote:
>> On 08.05.2020 15:37, Roger Pau Monné wrote:
>>> On Tue, May 05, 2020 at 10:16:20AM +0200, Jan Beulich wrote:
>>>> --- a/tools/tests/x86_emulator/test_x86_emulator.c
>>>> +++ b/tools/tests/x86_emulator/test_x86_emulator.c
>>>> @@ -11648,6 +11651,89 @@ int x86_emul_blk(
>>>>  
>>>>  #ifndef X86EMUL_NO_FPU
>>>>  
>>>> +    case blk_fld:
>>>> +        ASSERT(!data);
>>>> +
>>>> +        /* state->rex_prefix carries CR0.PE && !EFLAGS.VM setting */
>>>> +        switch ( bytes )
>>>> +        {
>>>> +        case sizeof(fpstate.env):
>>>> +        case sizeof(fpstate):
>>>> +            memcpy(&fpstate.env, ptr, sizeof(fpstate.env));
>>>> +            if ( !state->rex_prefix )
>>>> +            {
>>>> +                unsigned int fip = fpstate.env.mode.real.fip_lo +
>>>> +                                   (fpstate.env.mode.real.fip_hi << 16);
>>>> +                unsigned int fdp = fpstate.env.mode.real.fdp_lo +
>>>> +                                   (fpstate.env.mode.real.fdp_hi << 16);
>>>> +                unsigned int fop = fpstate.env.mode.real.fop;
>>>> +
>>>> +                fpstate.env.mode.prot.fip = fip & 0xf;
>>>> +                fpstate.env.mode.prot.fcs = fip >> 4;
>>>> +                fpstate.env.mode.prot.fop = fop;
>>>> +                fpstate.env.mode.prot.fdp = fdp & 0xf;
>>>> +                fpstate.env.mode.prot.fds = fdp >> 4;
>>>
>>> I've found the layouts in the SDM vol. 1, but I haven't been able to
>>> found the translation mechanism from real to protected. Could you
>>> maybe add a reference here?
>>
>> A reference to some piece of documentation? I don't think this
>> is spelled out anywhere. It's also only one of various possible
>> ways of doing the translation, but among them the most flexible
>> one for possible consumers of the data (because of using the
>> smallest possible offsets into the segments).
> 
> Having this written down as a comment would help, but maybe that's
> just because I'm not familiar at all with all this stuff.
> 
> Again, likely a very stupid question, but I would expect:
> 
> fpstate.env.mode.prot.fip = fip;
> 
> Without the mask.

How so? A linear address has many ways of decomposing into a
real/vm86 mode ssss:oooo pair, but what you suggest is not one
of them. The other extreme to the one chosen would be

                fpstate.env.mode.prot.fip = fip & 0xffff;
                fpstate.env.mode.prot.fcs = (fip >> 4) & 0xf000;

Except that when doing it this way, even the full insn (or for
fcs:fdp the full operand) may not be accessible through the
resulting ssss, due to segment wraparound.
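
E.g. a linear 0x12345 decomposes as 1234:0005 the way chosen here, but
as 1000:2345 this other way.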

Jan


^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH v8 08/12] x86emul: support FLDENV and FRSTOR
  2020-05-11  7:25         ` Jan Beulich
@ 2020-05-11  8:02           ` Roger Pau Monné
  0 siblings, 0 replies; 51+ messages in thread
From: Roger Pau Monné @ 2020-05-11  8:02 UTC (permalink / raw)
  To: Jan Beulich; +Cc: Andrew Cooper, Wei Liu, xen-devel

On Mon, May 11, 2020 at 09:25:54AM +0200, Jan Beulich wrote:
> On 08.05.2020 20:29, Andrew Cooper wrote:
> > On 08/05/2020 16:04, Jan Beulich wrote:
> >>>> +            }
> >>>> +
> >>>> +            if ( bytes == sizeof(fpstate.env) )
> >>>> +                ptr = NULL;
> >>>> +            else
> >>>> +                ptr += sizeof(fpstate.env);
> >>>> +            break;
> >>>> +
> >>>> +        case sizeof(struct x87_env16):
> >>>> +        case sizeof(struct x87_env16) + sizeof(fpstate.freg):
> >>>> +        {
> >>>> +            const struct x87_env16 *env = ptr;
> >>>> +
> >>>> +            fpstate.env.fcw = env->fcw;
> >>>> +            fpstate.env.fsw = env->fsw;
> >>>> +            fpstate.env.ftw = env->ftw;
> >>>> +
> >>>> +            if ( state->rex_prefix )
> >>>> +            {
> >>>> +                fpstate.env.mode.prot.fip = env->mode.prot.fip;
> >>>> +                fpstate.env.mode.prot.fcs = env->mode.prot.fcs;
> >>>> +                fpstate.env.mode.prot.fdp = env->mode.prot.fdp;
> >>>> +                fpstate.env.mode.prot.fds = env->mode.prot.fds;
> >>>> +                fpstate.env.mode.prot.fop = 0; /* unknown */
> >>>> +            }
> >>>> +            else
> >>>> +            {
> >>>> +                unsigned int fip = env->mode.real.fip_lo +
> >>>> +                                   (env->mode.real.fip_hi << 16);
> >>>> +                unsigned int fdp = env->mode.real.fdp_lo +
> >>>> +                                   (env->mode.real.fdp_hi << 16);
> >>>> +                unsigned int fop = env->mode.real.fop;
> >>>> +
> >>>> +                fpstate.env.mode.prot.fip = fip & 0xf;
> >>>> +                fpstate.env.mode.prot.fcs = fip >> 4;
> >>>> +                fpstate.env.mode.prot.fop = fop;
> >>>> +                fpstate.env.mode.prot.fdp = fdp & 0xf;
> >>>> +                fpstate.env.mode.prot.fds = fdp >> 4;
> >>> This looks mostly the same as the translation done above, so maybe
> >>> could be abstracted anyway in a macro to avoid the code repetition?
> >>> (ie: fpstate_real_to_prot(src, dst) or some such).
> >> Just the 5 assignments could be put in an inline function, but
> >> if we also wanted to abstract away the declarations with their
> >> initializers, it would need to be a macro because of the
> >> different types of fpstate.env and *env. While I'd generally
> >> prefer inline functions, the macro would have the benefit that
> >> it could be #define-d / #undef-d right inside this case block.
> >> Thoughts?
> > 
> > Code like this is large in terms of volume, but it is completely crystal
> > clear (with the requested comments in place) and easy to follow.
> > 
> > I don't see how attempting to abstract out two small portions is going
> > to be an improvement.
> 
> Okay, easier for me if I don't need to touch it. Roger, can you
> live with that?

Yes, that's fine.

Thanks.


^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH v8 08/12] x86emul: support FLDENV and FRSTOR
  2020-05-11  7:29         ` Jan Beulich
@ 2020-05-11  9:22           ` Roger Pau Monné
  0 siblings, 0 replies; 51+ messages in thread
From: Roger Pau Monné @ 2020-05-11  9:22 UTC (permalink / raw)
  To: Jan Beulich; +Cc: xen-devel, Wei Liu, Andrew Cooper

On Mon, May 11, 2020 at 09:29:27AM +0200, Jan Beulich wrote:
> On 08.05.2020 18:21, Roger Pau Monné wrote:
> > On Fri, May 08, 2020 at 05:04:02PM +0200, Jan Beulich wrote:
> >> On 08.05.2020 15:37, Roger Pau Monné wrote:
> >>> On Tue, May 05, 2020 at 10:16:20AM +0200, Jan Beulich wrote:
> >>>> --- a/tools/tests/x86_emulator/test_x86_emulator.c
> >>>> +++ b/tools/tests/x86_emulator/test_x86_emulator.c
> >>>> @@ -11648,6 +11651,89 @@ int x86_emul_blk(
> >>>>  
> >>>>  #ifndef X86EMUL_NO_FPU
> >>>>  
> >>>> +    case blk_fld:
> >>>> +        ASSERT(!data);
> >>>> +
> >>>> +        /* state->rex_prefix carries CR0.PE && !EFLAGS.VM setting */
> >>>> +        switch ( bytes )
> >>>> +        {
> >>>> +        case sizeof(fpstate.env):
> >>>> +        case sizeof(fpstate):
> >>>> +            memcpy(&fpstate.env, ptr, sizeof(fpstate.env));
> >>>> +            if ( !state->rex_prefix )
> >>>> +            {
> >>>> +                unsigned int fip = fpstate.env.mode.real.fip_lo +
> >>>> +                                   (fpstate.env.mode.real.fip_hi << 16);
> >>>> +                unsigned int fdp = fpstate.env.mode.real.fdp_lo +
> >>>> +                                   (fpstate.env.mode.real.fdp_hi << 16);
> >>>> +                unsigned int fop = fpstate.env.mode.real.fop;
> >>>> +
> >>>> +                fpstate.env.mode.prot.fip = fip & 0xf;
> >>>> +                fpstate.env.mode.prot.fcs = fip >> 4;
> >>>> +                fpstate.env.mode.prot.fop = fop;
> >>>> +                fpstate.env.mode.prot.fdp = fdp & 0xf;
> >>>> +                fpstate.env.mode.prot.fds = fdp >> 4;
> >>>
> >>> I've found the layouts in the SDM vol. 1, but I haven't been able to
> >>> found the translation mechanism from real to protected. Could you
> >>> maybe add a reference here?
> >>
> >> A reference to some piece of documentation? I don't think this
> >> is spelled out anywhere. It's also only one of various possible
> >> ways of doing the translation, but among them the most flexible
> >> one for possible consumers of the data (because of using the
> >> smallest possible offsets into the segments).
> > 
> > Having this written down as a comment would help, but maybe that's
> > just because I'm not familiar at all with all this stuff.
> > 
> > Again, likely a very stupid question, but I would expect:
> > 
> > fpstate.env.mode.prot.fip = fip;
> > 
> > Without the mask.
> 
> How that? A linear address has many ways of decomposing into a
> real/vm86 mode ssss:oooo pair, but what you suggest is not one
> of them. The other extreme to the one chosen would be
> 
>                 fpstate.env.mode.prot.fip = fip & 0xffff;
>                 fpstate.env.mode.prot.fcs = (fip >> 4) & 0xf000;
> 
> Except that when doing it this way, even the full insn (or for
> fcs:fdp the full operand) may not be accessible through the
> resulting ssss, due to segment wraparound.

Thanks for the explanation. I see it's better to keep only the low 4
bits in the offset, in order to avoid segment wraparound.

Roger.


^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH v8 07/12] x86emul: support FNSTENV and FNSAVE
  2020-05-08 17:58   ` Andrew Cooper
@ 2020-05-13 12:07     ` Jan Beulich
  0 siblings, 0 replies; 51+ messages in thread
From: Jan Beulich @ 2020-05-13 12:07 UTC (permalink / raw)
  To: Andrew Cooper; +Cc: xen-devel, Wei Liu, Roger Pau Monne

On 08.05.2020 19:58, Andrew Cooper wrote:
> On 05/05/2020 09:15, Jan Beulich wrote:
>> To avoid introducing another boolean into emulator state, the
>> rex_prefix field gets (ab)used to convey the real/VM86 vs protected mode
>> info (affecting structure layout, albeit not size) to x86_emul_blk().
>>
>> Signed-off-by: Jan Beulich <jbeulich@suse.com>
>> ---
>> TBD: The full 16-bit padding fields in the 32-bit structures get filled
>>      with all ones by modern CPUs (i.e. other than the comment says for
> 
> You really mean "unlike" here, rather than "other".  They do not have
> the same meaning in this context.
> 
> (I think you're also missing a "what", which I'm guessing is just an
> oversight.)

Well, it's really s/other than/unlike what/ then afaics.

>>      FIP and FDP). We may want to mirror this as well (for the real mode
>>      variant), even if those fields' contents are unspecified.
> 
> This is surprising behaviour, but I expect it dates back to external x87
> processors and default MMIO behaviour.
> 
> If it is entirely consistent, it might be nice to match.  OTOH, the
> manuals are very clear that it is reserved, which I think gives us the
> liberty to use the easier implementation.

I've meanwhile checked on an AMD system, and it's the same
there. The mentioned comment really refers to observations back
on a 386/387 pair. I think really old CPUs didn't write the
full 16-bit padding fields at all, and the 386/387 then started
writing full 32 bits of FIP and FDP alongside their "high 16
bits" secondary fields. I further assume that this "don't
write parts of the struct at all" behavior was considered
unsafe, or unhelpful when trying to write things out in bigger
chunks (ideally in full cachelines).

I'll leave this as is for now; we can still consider to store
all ones there later on.

>> --- a/xen/arch/x86/x86_emulate/x86_emulate.c
>> +++ b/xen/arch/x86/x86_emulate/x86_emulate.c
>> @@ -897,6 +900,50 @@ struct x86_emulate_state {
>>  #define PTR_POISON NULL /* 32-bit builds are for user-space, so NULL is OK. */
>>  #endif
>>  
>> +#ifndef X86EMUL_NO_FPU
>> +struct x87_env16 {
>> +    uint16_t fcw;
>> +    uint16_t fsw;
>> +    uint16_t ftw;
>> +    union {
>> +        struct {
>> +            uint16_t fip_lo;
>> +            uint16_t fop:11, :1, fip_hi:4;
>> +            uint16_t fdp_lo;
>> +            uint16_t :12, fdp_hi:4;
>> +        } real;
>> +        struct {
>> +            uint16_t fip;
>> +            uint16_t fcs;
>> +            uint16_t fdp;
>> +            uint16_t fds;
>> +        } prot;
>> +    } mode;
>> +};
>> +
>> +struct x87_env32 {
>> +    uint32_t fcw:16, :16;
>> +    uint32_t fsw:16, :16;
>> +    uint32_t ftw:16, :16;
> 
> uint16_t fcw, :16;
> uint16_t fsw, :16;
> uint16_t ftw, :16;
> 
> which reduces the number of 16 bit bitfields.

I'm unconvinced of this being helpful in any way. My goal here
was really to consistently use all uint16_t in the 16-bit
struct, and all uint32_t in the 32-bit one, not least
after ...

>> +    union {
>> +        struct {
>> +            /* some CPUs/FPUs also store the full FIP here */
>> +            uint32_t fip_lo:16, :16;
>> +            uint32_t fop:11, :1, fip_hi:16, :4;
>> +            /* some CPUs/FPUs also store the full FDP here */
>> +            uint32_t fdp_lo:16, :16;
>> +            uint32_t :12, fdp_hi:16, :4;
> 
> Annoyingly, two of these lines can't use uint16_t.  I'm torn as to
> whether to suggest converting the other two which can.

... observing this. (Really I had it the other way around
initially. I'd be okay to switch back if there was a halfway
compelling argument - reducing the number of 16-bit
bitfields doesn't sound like one to me, though, unless
there are implications from this that I don't see.)

>> @@ -11571,6 +11646,93 @@ int x86_emul_blk(
>>              *eflags |= X86_EFLAGS_ZF;
>>          break;
>>  
>> +#ifndef X86EMUL_NO_FPU
>> +
>> +    case blk_fst:
>> +        ASSERT(!data);
>> +
>> +        if ( bytes > sizeof(fpstate.env) )
>> +            asm ( "fnsave %0" : "=m" (fpstate) );
>> +        else
>> +            asm ( "fnstenv %0" : "=m" (fpstate.env) );
> 
> We have 4 possible sizes to deal with here - the 16 and 32bit formats of
> prot vs real/vm86 modes, and it is not clear (code wise) why
> sizeof(fpstate.env) is a suitable boundary.

See the definitions of struct x87_env16 and struct x87_env32:
They're intentionally in part using a union, to make more
obvious that in fact there are just two different sizes to deal
with.

> Given that these are legacy instructions and not a hotpath in the
> slightest, it is possibly faster (by removing the branch) and definitely
> more obvious to use fnsave unconditionally, and derive all of the
> smaller layouts that way.

I can accept the "not a hotpath" argument, but I'm against
using insns other than the intended one for no good reason.

> Critically however, it prevents us from needing a CVE/XSA if ... [bottom
> comment]

This is a legitimate concern, but imo not to be addressed by
using FNSAVE uniformly: There being fields which have
undefined contents even with FNSTENV (and which hence in
principle might not get written at all), I'm instead going
to memset() the entire structure upfront. I'll use 0 for
now, but using ~0 might be an option to fill the padding
fields (see above); the downside then would be that we'd
also fill the less-than-16-bit padding fields this way,
where hardware stores 0 (and where we are at risk of breaking
consumers which don't mask as necessary).
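
I.e. (sketch, relative to the code quoted above):

    case blk_fst:
        ASSERT(!data);

        memset(&fpstate, 0, sizeof(fpstate));
        if ( bytes > sizeof(fpstate.env) )
            asm ( "fnsave %0" : "=m" (fpstate) );
        else
            asm ( "fnstenv %0" : "=m" (fpstate.env) );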

Jan


^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH v8 09/12] x86emul: support FXSAVE/FXRSTOR
  2020-05-08 19:31   ` Andrew Cooper
@ 2020-05-13 13:24     ` Jan Beulich
  0 siblings, 0 replies; 51+ messages in thread
From: Jan Beulich @ 2020-05-13 13:24 UTC (permalink / raw)
  To: Andrew Cooper; +Cc: xen-devel, Wei Liu, Roger Pau Monne

On 08.05.2020 21:31, Andrew Cooper wrote:
> On 05/05/2020 09:16, Jan Beulich wrote:
>> @@ -8125,6 +8154,47 @@ x86_emulate(
>>      case X86EMUL_OPC(0x0f, 0xae): case X86EMUL_OPC_66(0x0f, 0xae): /* Grp15 */
>>          switch ( modrm_reg & 7 )
>>          {
>> +#if !defined(X86EMUL_NO_FPU) || !defined(X86EMUL_NO_MMX) || \
>> +    !defined(X86EMUL_NO_SIMD)
>> +        case 0: /* fxsave */
>> +        case 1: /* fxrstor */
>> +            generate_exception_if(vex.pfx, EXC_UD);
>> +            vcpu_must_have(fxsr);
>> +            generate_exception_if(ea.type != OP_MEM, EXC_UD);
>> +            generate_exception_if(!is_aligned(ea.mem.seg, ea.mem.off, 16,
>> +                                              ctxt, ops),
>> +                                  EXC_GP, 0);
>> +            fail_if(!ops->blk);
>> +            op_bytes =
>> +#ifdef __x86_64__
>> +                !mode_64bit() ? offsetof(struct x86_fxsr, xmm[8]) :
>> +#endif
>> +                sizeof(struct x86_fxsr);
>> +            if ( amd_like(ctxt) )
>> +            {
>> +                if ( !ops->read_cr ||
>> +                     ops->read_cr(4, &cr4, ctxt) != X86EMUL_OKAY )
>> +                    cr4 = X86_CR4_OSFXSR;
> 
> Why do we want to assume OSFXSR in the case of a read_cr() failure,
> rather than bailing on the entire instruction?

I prefer to assume "normal" operation over failing in such
cases. We have a few similar examples elsewhere. I'll add
a comment to this effect.

>> @@ -11819,6 +11891,77 @@ int x86_emul_blk(
>>  
>>  #endif /* X86EMUL_NO_FPU */
>>  
>> +#if !defined(X86EMUL_NO_FPU) || !defined(X86EMUL_NO_MMX) || \
>> +    !defined(X86EMUL_NO_SIMD)
>> +
>> +    case blk_fxrstor:
>> +    {
>> +        struct x86_fxsr *fxsr = FXSAVE_AREA;
>> +
>> +        ASSERT(!data);
>> +        ASSERT(bytes == sizeof(*fxsr));
>> +        ASSERT(state->op_bytes <= bytes);
>> +
>> +        if ( state->op_bytes < sizeof(*fxsr) )
>> +        {
>> +            if ( state->rex_prefix & REX_W )
>> +            {
>> +                /*
>> +                 * The only way to force fxsaveq on a wide range of gas
>> +                 * versions. On older versions the rex64 prefix works only if
>> +                 * we force an addressing mode that doesn't require extended
>> +                 * registers.
>> +                 */
>> +                asm volatile ( ".byte 0x48; fxsave (%1)"
>> +                               : "=m" (*fxsr) : "R" (fxsr) );
>> +            }
>> +            else
>> +                asm volatile ( "fxsave %0" : "=m" (*fxsr) );
>> +        }
>> +
>> +        memcpy(fxsr, ptr, min(state->op_bytes,
>> +                              (unsigned int)offsetof(struct x86_fxsr, _)));
>> +        memset(fxsr->_, 0, sizeof(*fxsr) - offsetof(struct x86_fxsr, _));
> 
> I'm completely lost trying to follow what's going on here.  Why are we
> constructing something different to what the guest gave us?

This part of the structure may not get written by FXSAVE. Hence
I'd prefer to assume it has unknown contents (which we shouldn't
leak) over assuming this would never get written (and hence
remain zeroed). Furthermore we mean to pass this to FXRSTOR,
which we know can raise #GP in principle. While this is a legacy
insns and hence unlikely to change in behavior, it seems more
safe to have well known values in at least the reserved range.

I'll add an abbreviated variant of this as a comment.
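Something like:

    /*
     * FXSAVE may leave the reserved portion of the save area unwritten.
     * Zero it, to avoid both leaking prior contents and handing
     * unspecified values to FXRSTOR (which can raise #GP in principle).
     */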

>> +
>> +        generate_exception_if(fxsr->mxcsr & ~mxcsr_mask, EXC_GP, 0);
>> +
>> +        if ( state->rex_prefix & REX_W )
>> +        {
>> +            /* See above for why operand/constraints are this way. */
>> +            asm volatile ( ".byte 0x48; fxrstor (%0)"
>> +                           :: "R" (fxsr), "m" (*fxsr) );
>> +        }
>> +        else
>> +            asm volatile ( "fxrstor %0" :: "m" (*fxsr) );
>> +        break;
>> +    }
>> +
>> +    case blk_fxsave:
>> +    {
>> +        struct x86_fxsr *fxsr = FXSAVE_AREA;
>> +
>> +        ASSERT(!data);
>> +        ASSERT(bytes == sizeof(*fxsr));
>> +        ASSERT(state->op_bytes <= bytes);
>> +
>> +        if ( state->rex_prefix & REX_W )
>> +        {
>> +            /* See above for why operand/constraint are this way. */
>> +            asm volatile ( ".byte 0x48; fxsave (%0)"
>> +                           :: "R" (state->op_bytes < sizeof(*fxsr) ? fxsr : ptr)
>> +                           : "memory" );
>> +        }
>> +        else
>> +            asm volatile ( "fxsave (%0)"
>> +                           :: "r" (state->op_bytes < sizeof(*fxsr) ? fxsr : ptr)
>> +                           : "memory" );
>> +        if ( state->op_bytes < sizeof(*fxsr) )
>> +            memcpy(ptr, fxsr, state->op_bytes);
> 
> I think this logic would be clearer to follow with:
> 
> void *buf = state->op_bytes < sizeof(*fxsr) ? fxsr : ptr;
> 
> ...
> 
> if ( buf != ptr )
>     memcpy(ptr, fxsr, state->op_bytes);
> 
> This more clearly highlights the "we either fxsave'd straight into the
> destination pointer, or into a local buffer if we only want a subset"
> property.

Ah, yes, and by giving buf (or really the repurposed fxsr) a
proper type I can then also avoid the memory clobbers, making
the asm()s more similar to the ones used for FXRSTOR emulation.
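Roughly (untested):

    struct x86_fxsr *buf = state->op_bytes < sizeof(*fxsr) ? fxsr : ptr;

    if ( state->rex_prefix & REX_W )
        asm volatile ( ".byte 0x48; fxsave (%1)"
                       : "=m" (*buf) : "R" (buf) );
    else
        asm volatile ( "fxsave %0" : "=m" (*buf) );
    if ( buf != ptr )
        memcpy(ptr, fxsr, state->op_bytes);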

>> --- a/xen/arch/x86/x86_emulate.c
>> +++ b/xen/arch/x86/x86_emulate.c
>> @@ -42,6 +42,8 @@
>>      }                                                      \
>>  })
>>  
>> +#define FXSAVE_AREA current->arch.fpu_ctxt
> 
> How safe is this?  Don't we already use this buffer to recover the old
> state in the case of an exception?

For a memory write fault after register state was already
updated, yes. But that can't be the case here. Nevertheless,
being made to look at this again turned up a bug: we need to
set state->fpu_ctrl in order to keep {,hvmemul_}put_fpu()
from trying to replace FIP/FOP/FDP.
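
I.e. something like this on the blk_fx{save,rstor} paths (a sketch
of the fix being described, assuming the existing fpu_ctrl boolean
in the emulator state):

    /* Keep {,hvmemul_}put_fpu() from replacing FIP/FOP/FDP. */
    state->fpu_ctrl = true;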

Jan



* Re: [PATCH v8 12/12] x86/HVM: don't needlessly intercept APERF/MPERF/TSC MSR reads
  2020-05-08 21:04   ` Andrew Cooper
@ 2020-05-13 13:35     ` Jan Beulich
  2020-05-14  8:52       ` Jan Beulich
  0 siblings, 1 reply; 51+ messages in thread
From: Jan Beulich @ 2020-05-13 13:35 UTC (permalink / raw)
  To: Andrew Cooper; +Cc: xen-devel, Wei Liu, Roger Pau Monne

On 08.05.2020 23:04, Andrew Cooper wrote:
> On 05/05/2020 09:20, Jan Beulich wrote:
>> If the hardware can handle accesses, we should allow it to do so. This
>> way we can expose EFRO to HVM guests,
> 
> I'm reminded now of the conversation I'm sure we've had before, although
> I have a feeling it was on IRC.
> 
> APERF/MPERF (including the EFRO interface on AMD) are free running
> counters but only in C0.  The raw values are not synchronised across
> threads.
> 
> A vCPU which gets rescheduled has a 50% chance of finding one or
> both values going backwards, and a 100% chance of a totally bogus
> calculation.
> 
> There is no point exposing APERF/MPERF to guests.  It can only be used
> safely in hypervisor context, on AMD hardware with a CLGI/STGI region,
> or on Intel hardware in an NMI handler if you trust that a machine check
> isn't going to ruin your day.
> 
> VMs have no way of achieving the sampling requirements, and have a
> fair chance of getting a plausible-but-wrong answer.
> 
> The only possibility to do this safely is on a VM which is pinned to
> a pCPU for its lifetime, but even I'm unconvinced of the correctness.
> 
> I don't think we should be exposing this functionality to guests at all,
> although I might be persuaded if someone wanting to use it in a VM can
> provide a concrete justification of why the above problems won't get in
> their way.
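
[ For context, the usual host-side use of these counters is
  delta-based, with both MSRs sampled back to back on the same CPU
  and without interruption in between; a sketch, where rdmsr() is
  just a stand-in for whatever accessor is at hand:

      uint64_t a0 = rdmsr(MSR_IA32_APERF), m0 = rdmsr(MSR_IA32_MPERF);
      /* ... measurement interval, no migration between samples ... */
      uint64_t a1 = rdmsr(MSR_IA32_APERF), m1 = rdmsr(MSR_IA32_MPERF);
      /* effective frequency ~= base frequency * (a1 - a0) / (m1 - m0) */

  A vCPU bouncing between pCPUs has no way to meet these conditions. ]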

Am I getting it right then that here you're retracting what you said
on patch 10: "I'm tempted to suggest that we offer EFRO on Intel ..."?
And hence your request is to drop both that and this patch?

Jan



* Re: [PATCH v8 12/12] x86/HVM: don't needlessly intercept APERF/MPERF/TSC MSR reads
  2020-05-13 13:35     ` Jan Beulich
@ 2020-05-14  8:52       ` Jan Beulich
  0 siblings, 0 replies; 51+ messages in thread
From: Jan Beulich @ 2020-05-14  8:52 UTC (permalink / raw)
  To: Andrew Cooper; +Cc: xen-devel, Wei Liu, Roger Pau Monne

On 13.05.2020 15:35, Jan Beulich wrote:
> On 08.05.2020 23:04, Andrew Cooper wrote:
>> On 05/05/2020 09:20, Jan Beulich wrote:
>>> If the hardware can handle accesses, we should allow it to do so. This
>>> way we can expose EFRO to HVM guests,
>>
>> I'm reminded now of the conversation I'm sure we've had before, although
>> I have a feeling it was on IRC.
>>
>> APERF/MPERF (including the EFRO interface on AMD) are free running
>> counters but only in C0.  The raw values are not synchronised across
>> threads.
>>
>> A vCPU which gets rescheduled has a 50% chance of finding one or
>> both values going backwards, and a 100% chance of a totally bogus
>> calculation.
>>
>> There is no point exposing APERF/MPERF to guests.  It can only be used
>> safely in hypervisor context, on AMD hardware with a CLGI/STGI region,
>> or on Intel hardware in an NMI handler if you trust that a machine check
>> isn't going to ruin your day.
>>
>> VMs have no way of achieving the sampling requirements, and have a
>> fair chance of getting a plausible-but-wrong answer.
>>
>> The only possibility to do this safely is on a VM which is pinned to
>> a pCPU for its lifetime, but even I'm unconvinced of the correctness.
>>
>> I don't think we should be exposing this functionality to guests at all,
>> although I might be persuaded if someone wanting to use it in a VM can
>> provide a concrete justification of why the above problems won't get in
>> their way.
> 
> Am I getting it right then that here you're retracting what you said
> on patch 10: "I'm tempted to suggest that we offer EFRO on Intel ..."?
> And hence your request is to drop both that and this patch?

Which in turn would then mean also dropping patch 11. Not
supporting RDPRU now, because of the APERF/MPERF peculiarities,
also means we won't be able to support it once further leaves
get defined: its specification only halfway allows for
supporting higher leaves but not lower ones (the lower ones
would merely return zero, which callers of the two initially
defined leaves may not expect).
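
(For reference, a minimal sketch of the insn's interface as
specified, with ECX selecting the leaf, 0 being MPERF and 1 being
APERF, and the result returned in EDX:EAX:

    static inline uint64_t rdpru(uint32_t leaf)
    {
        uint32_t lo, hi;

        /* RDPRU is encoded as 0f 01 fd. */
        asm volatile ( ".byte 0x0f, 0x01, 0xfd"
                       : "=a" (lo), "=d" (hi) : "c" (leaf) );

        return ((uint64_t)hi << 32) | lo;
    }

A caller asking for leaf 0 or 1 would hence silently get zero back
from an implementation supporting only higher leaves.)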

Since I don't see us settling on this very soon, I guess I'll
re-submit the series with the last three patches left out.

Jan




Thread overview: 51+ messages
2020-05-05  8:10 [PATCH v8 00/12] x86emul: further work Jan Beulich
2020-05-05  8:12 ` [PATCH v8 01/12] x86emul: disable FPU/MMX/SIMD insn emulation when !HVM Jan Beulich
2020-05-07 18:11   ` Andrew Cooper
2020-05-08  8:10     ` Jan Beulich
2020-05-05  8:13 ` [PATCH v8 02/12] x86emul: support MOVDIR{I,64B} insns Jan Beulich
2020-05-07 18:30   ` Andrew Cooper
2020-05-08  7:19     ` Jan Beulich
2020-05-05  8:13 ` [PATCH v8 03/12] x86emul: support ENQCMD insns Jan Beulich
2020-05-07 18:59   ` Andrew Cooper
2020-05-08  7:32     ` Jan Beulich
2020-05-05  8:14 ` [PATCH v8 04/12] x86emul: support SERIALIZE Jan Beulich
2020-05-07 19:32   ` Andrew Cooper
2020-05-08  7:34     ` Jan Beulich
2020-05-08 13:00       ` Andrew Cooper
2020-05-08 13:59         ` Jan Beulich
2020-05-08 15:05           ` Andrew Cooper
2020-05-05  8:14 ` [PATCH v8 05/12] x86emul: support X{SUS,RES}LDTRK Jan Beulich
2020-05-07 20:13   ` Andrew Cooper
2020-05-08  7:38     ` Jan Beulich
2020-05-08 13:15       ` Andrew Cooper
2020-05-08 14:42         ` Jan Beulich
2020-05-05  8:15 ` [PATCH v8 06/12] x86/HVM: make hvmemul_blk() capable of handling r/o operations Jan Beulich
2020-05-05 14:20   ` Paul Durrant
2020-05-07 20:34   ` Andrew Cooper
2020-05-08  7:13     ` Jan Beulich
2020-05-05  8:15 ` [PATCH v8 07/12] x86emul: support FNSTENV and FNSAVE Jan Beulich
2020-05-05 12:36   ` Jan Beulich
2020-05-08 17:58   ` Andrew Cooper
2020-05-13 12:07     ` Jan Beulich
2020-05-05  8:16 ` [PATCH v8 08/12] x86emul: support FLDENV and FRSTOR Jan Beulich
2020-05-08 13:37   ` Roger Pau Monné
2020-05-08 15:04     ` Jan Beulich
2020-05-08 16:21       ` Roger Pau Monné
2020-05-11  7:29         ` Jan Beulich
2020-05-11  9:22           ` Roger Pau Monné
2020-05-08 18:29       ` Andrew Cooper
2020-05-11  7:25         ` Jan Beulich
2020-05-11  8:02           ` Roger Pau Monné
2020-05-08 18:19   ` Andrew Cooper
2020-05-05  8:16 ` [PATCH v8 09/12] x86emul: support FXSAVE/FXRSTOR Jan Beulich
2020-05-08 19:31   ` Andrew Cooper
2020-05-13 13:24     ` Jan Beulich
2020-05-05  8:17 ` [PATCH v8 09/12] x86/HVM: scale MPERF values reported to guests (on AMD) Jan Beulich
2020-05-05  8:19   ` Jan Beulich
2020-05-05  8:18 ` [PATCH v8 10/12] " Jan Beulich
2020-05-08 20:32   ` Andrew Cooper
2020-05-05  8:19 ` [PATCH v8 11/12] x86emul: support RDPRU Jan Beulich
2020-05-05  8:20 ` [PATCH v8 12/12] x86/HVM: don't needlessly intercept APERF/MPERF/TSC MSR reads Jan Beulich
2020-05-08 21:04   ` Andrew Cooper
2020-05-13 13:35     ` Jan Beulich
2020-05-14  8:52       ` Jan Beulich
