From: "Jan Beulich"
Subject: [PATCH v2 04/11] x86emul: support MMX/SSE/SSE2 moves
Date: Wed, 01 Feb 2017 04:14:56 -0700
Message-ID: <5891D1400200007800135BFB@prv-mh.provo.novell.com>
In-Reply-To: <5891CF990200007800135BC5@prv-mh.provo.novell.com>
References: <5891CF990200007800135BC5@prv-mh.provo.novell.com>
To: xen-devel
Cc: Andrew Cooper
List-Id: xen-devel@lists.xenproject.org

Previously supported insns are being converted to the new model, and
several new ones are being added.

To keep the stub handling reasonably simple, integrate SET_SSE_PREFIX()
into copy_REX_VEX(), at once switching the stubs to use an empty REX
prefix instead of a double DS: one (no byte registers are being
accessed, so an empty REX prefix has no effect), except (of course) for
the 32-bit test harness build.

Signed-off-by: Jan Beulich
---
v2: Don't clear TwoOp for vmov{l,h}p{s,d} to memory. Move re-setting of
    TwoOp into VEX-specific code paths where possible. Special case
    {,v}maskmov{q,dqu} in stub invocation. Move {,v}movq code block to
    proper position. Add zero-mask {,v}maskmov{q,dqu} tests.
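
For reference, a minimal sketch of the fixed three-byte prefix area
every stub now starts with (64-bit build assumed; the helper names are
illustrative stand-ins for the init_prefixes() / copy_REX_VEX() macros
in the patch, not code taken from it):

#include <stdint.h>

#define PFX_BYTES 3

static const uint8_t sse_prefix[] = { 0x66, 0xf3, 0xf2 };

/* Lay down DS:, an empty REX, and the 0x0f escape ahead of the opcode. */
static uint8_t *init_stub_prefixes(uint8_t *buf)
{
    buf[0] = 0x3e;          /* DS: override - harmless filler */
    buf[1] = 0x40;          /* empty REX - no effect, no byte regs used */
    buf[2] = 0x0f;          /* two-byte opcode escape */
    return buf + PFX_BYTES; /* opcode / ModRM bytes get emitted here */
}

/* Legacy/SSE encoding: patch the filler bytes in place. */
static void finish_legacy(uint8_t *opc, uint8_t rex, unsigned int pfx)
{
    if ( pfx )                      /* 66 / F3 / F2 operand prefix */
        opc[0 - PFX_BYTES] = sse_prefix[pfx - 1];
    opc[1 - PFX_BYTES] |= rex;      /* fold REX bits into the empty 0x40 */
}

/* VEX encoding: the 3-byte VEX prefix overwrites the whole area. */
static void finish_vex(uint8_t *opc, const uint8_t vex_raw[2])
{
    opc[0 - PFX_BYTES] = 0xc4;
    opc[1 - PFX_BYTES] = vex_raw[0];
    opc[2 - PFX_BYTES] = vex_raw[1];
}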
--- a/tools/tests/x86_emulator/test_x86_emulator.c +++ b/tools/tests/x86_emulator/test_x86_emulator.c @@ -1557,6 +1557,29 @@ int main(int argc, char **argv) else printf("skipped\n"); =20 + printf("%-40s", "Testing movq 32(%ecx),%xmm1..."); + if ( stack_exec && cpu_has_sse2 ) + { + decl_insn(movq_from_mem2); + + asm volatile ( "pcmpeqb %%xmm1, %%xmm1\n" + put_insn(movq_from_mem2, "movq 32(%0), %%xmm1") + :: "c" (NULL) ); + + set_insn(movq_from_mem2); + rc =3D x86_emulate(&ctxt, &emulops); + if ( rc !=3D X86EMUL_OKAY || !check_eip(movq_from_mem2) ) + goto fail; + asm ( "pcmpgtb %%xmm0, %%xmm0\n\t" + "pcmpeqb %%xmm1, %%xmm0\n\t" + "pmovmskb %%xmm0, %0" : "=3Dr" (rc) ); + if ( rc !=3D 0xffff ) + goto fail; + printf("okay\n"); + } + else + printf("skipped\n"); + printf("%-40s", "Testing vmovq %xmm1,32(%edx)..."); if ( stack_exec && cpu_has_avx ) { @@ -1581,6 +1604,29 @@ int main(int argc, char **argv) else printf("skipped\n"); =20 + printf("%-40s", "Testing vmovq 32(%edx),%xmm0..."); + if ( stack_exec && cpu_has_avx ) + { + decl_insn(vmovq_from_mem); + + asm volatile ( "pcmpeqb %%xmm0, %%xmm0\n" + put_insn(vmovq_from_mem, "vmovq 32(%0), %%xmm0") + :: "d" (NULL) ); + + set_insn(vmovq_from_mem); + rc =3D x86_emulate(&ctxt, &emulops); + if ( rc !=3D X86EMUL_OKAY || !check_eip(vmovq_from_mem) ) + goto fail; + asm ( "pcmpgtb %%xmm1, %%xmm1\n\t" + "pcmpeqb %%xmm0, %%xmm1\n\t" + "pmovmskb %%xmm1, %0" : "=3Dr" (rc) ); + if ( rc !=3D 0xffff ) + goto fail; + printf("okay\n"); + } + else + printf("skipped\n"); + printf("%-40s", "Testing movdqu %xmm2,(%ecx)..."); if ( stack_exec && cpu_has_sse2 ) { @@ -1812,6 +1858,33 @@ int main(int argc, char **argv) else printf("skipped\n"); =20 + printf("%-40s", "Testing movd 32(%ecx),%mm4..."); + if ( stack_exec && cpu_has_mmx ) + { + decl_insn(movd_from_mem); + + asm volatile ( "pcmpgtb %%mm4, %%mm4\n" + put_insn(movd_from_mem, "movd 32(%0), %%mm4") + :: "c" (NULL) ); + + set_insn(movd_from_mem); + rc =3D x86_emulate(&ctxt, &emulops); + if ( rc !=3D X86EMUL_OKAY || !check_eip(movd_from_mem) ) + goto fail; + asm ( "pxor %%mm2,%%mm2\n\t" + "pcmpeqb %%mm4, %%mm2\n\t" + "pmovmskb %%mm2, %0" : "=3Dr" (rc) ); + if ( rc !=3D 0xf0 ) + goto fail; + asm ( "pcmpeqb %%mm4, %%mm3\n\t" + "pmovmskb %%mm3, %0" : "=3Dr" (rc) ); + if ( rc !=3D 0x0f ) + goto fail; + printf("okay\n"); + } + else + printf("skipped\n"); + printf("%-40s", "Testing movd %xmm2,32(%edx)..."); if ( stack_exec && cpu_has_sse2 ) { @@ -1836,6 +1909,34 @@ int main(int argc, char **argv) else printf("skipped\n"); =20 + printf("%-40s", "Testing movd 32(%edx),%xmm3..."); + if ( stack_exec && cpu_has_sse2 ) + { + decl_insn(movd_from_mem2); + + asm volatile ( "pcmpeqb %%xmm3, %%xmm3\n" + put_insn(movd_from_mem2, "movd 32(%0), %%xmm3") + :: "d" (NULL) ); + + set_insn(movd_from_mem2); + rc =3D x86_emulate(&ctxt, &emulops); + if ( rc !=3D X86EMUL_OKAY || !check_eip(movd_from_mem2) ) + goto fail; + asm ( "pxor %%xmm1,%%xmm1\n\t" + "pcmpeqb %%xmm3, %%xmm1\n\t" + "pmovmskb %%xmm1, %0" : "=3Dr" (rc) ); + if ( rc !=3D 0xfff0 ) + goto fail; + asm ( "pcmpeqb %%xmm2, %%xmm2\n\t" + "pcmpeqb %%xmm3, %%xmm2\n\t" + "pmovmskb %%xmm2, %0" : "=3Dr" (rc) ); + if ( rc !=3D 0x000f ) + goto fail; + printf("okay\n"); + } + else + printf("skipped\n"); + printf("%-40s", "Testing vmovd %xmm1,32(%ecx)..."); if ( stack_exec && cpu_has_avx ) { @@ -1860,6 +1961,34 @@ int main(int argc, char **argv) else printf("skipped\n"); =20 + printf("%-40s", "Testing vmovd 32(%ecx),%xmm2..."); + if ( stack_exec && cpu_has_avx ) + { + decl_insn(vmovd_from_mem); 
+ + asm volatile ( "pcmpeqb %%xmm2, %%xmm2\n" + put_insn(vmovd_from_mem, "vmovd 32(%0), %%xmm2") + :: "c" (NULL) ); + + set_insn(vmovd_from_mem); + rc =3D x86_emulate(&ctxt, &emulops); + if ( rc !=3D X86EMUL_OKAY || !check_eip(vmovd_from_mem) ) + goto fail; + asm ( "pxor %%xmm0,%%xmm0\n\t" + "pcmpeqb %%xmm2, %%xmm0\n\t" + "pmovmskb %%xmm0, %0" : "=3Dr" (rc) ); + if ( rc !=3D 0xfff0 ) + goto fail; + asm ( "pcmpeqb %%xmm1, %%xmm1\n\t" + "pcmpeqb %%xmm2, %%xmm1\n\t" + "pmovmskb %%xmm1, %0" : "=3Dr" (rc) ); + if ( rc !=3D 0x000f ) + goto fail; + printf("okay\n"); + } + else + printf("skipped\n"); + printf("%-40s", "Testing movd %mm3,%ebx..."); if ( stack_exec && cpu_has_mmx ) { @@ -1890,6 +2019,34 @@ int main(int argc, char **argv) else printf("skipped\n"); =20 + printf("%-40s", "Testing movd %ebx,%mm4..."); + if ( stack_exec && cpu_has_mmx ) + { + decl_insn(movd_from_reg); + + /* See comment next to movd above. */ + asm volatile ( "pcmpgtb %%mm4, %%mm4\n" + put_insn(movd_from_reg, "movd %%ebx, %%mm4") + :: ); + + set_insn(movd_from_reg); + rc =3D x86_emulate(&ctxt, &emulops); + if ( (rc !=3D X86EMUL_OKAY) || !check_eip(movd_from_reg) ) + goto fail; + asm ( "pxor %%mm2,%%mm2\n\t" + "pcmpeqb %%mm4, %%mm2\n\t" + "pmovmskb %%mm2, %0" : "=3Dr" (rc) ); + if ( rc !=3D 0xf0 ) + goto fail; + asm ( "pcmpeqb %%mm4, %%mm3\n\t" + "pmovmskb %%mm3, %0" : "=3Dr" (rc) ); + if ( rc !=3D 0x0f ) + goto fail; + printf("okay\n"); + } + else + printf("skipped\n"); + printf("%-40s", "Testing movd %xmm2,%ebx..."); if ( stack_exec && cpu_has_sse2 ) { @@ -1915,6 +2072,35 @@ int main(int argc, char **argv) else printf("skipped\n"); =20 + printf("%-40s", "Testing movd %ebx,%xmm3..."); + if ( stack_exec && cpu_has_sse2 ) + { + decl_insn(movd_from_reg2); + + /* See comment next to movd above. */ + asm volatile ( "pcmpgtb %%xmm3, %%xmm3\n" + put_insn(movd_from_reg2, "movd %%ebx, %%xmm3") + :: ); + + set_insn(movd_from_reg2); + rc =3D x86_emulate(&ctxt, &emulops); + if ( (rc !=3D X86EMUL_OKAY) || !check_eip(movd_from_reg2) ) + goto fail; + asm ( "pxor %%xmm1,%%xmm1\n\t" + "pcmpeqb %%xmm3, %%xmm1\n\t" + "pmovmskb %%xmm1, %0" : "=3Dr" (rc) ); + if ( rc !=3D 0xfff0 ) + goto fail; + asm ( "pcmpeqb %%xmm2, %%xmm2\n\t" + "pcmpeqb %%xmm3, %%xmm2\n\t" + "pmovmskb %%xmm2, %0" : "=3Dr" (rc) ); + if ( rc !=3D 0x000f ) + goto fail; + printf("okay\n"); + } + else + printf("skipped\n"); + printf("%-40s", "Testing vmovd %xmm1,%ebx..."); if ( stack_exec && cpu_has_avx ) { @@ -1940,6 +2126,35 @@ int main(int argc, char **argv) else printf("skipped\n"); =20 + printf("%-40s", "Testing vmovd %ebx,%xmm2..."); + if ( stack_exec && cpu_has_avx ) + { + decl_insn(vmovd_from_reg); + + /* See comment next to movd above. 
*/ + asm volatile ( "pcmpgtb %%xmm2, %%xmm2\n" + put_insn(vmovd_from_reg, "vmovd %%ebx, %%xmm2") + :: ); + + set_insn(vmovd_from_reg); + rc =3D x86_emulate(&ctxt, &emulops); + if ( (rc !=3D X86EMUL_OKAY) || !check_eip(vmovd_from_reg) ) + goto fail; + asm ( "pxor %%xmm0,%%xmm0\n\t" + "pcmpeqb %%xmm2, %%xmm0\n\t" + "pmovmskb %%xmm0, %0" : "=3Dr" (rc) ); + if ( rc !=3D 0xfff0 ) + goto fail; + asm ( "pcmpeqb %%xmm1, %%xmm1\n\t" + "pcmpeqb %%xmm2, %%xmm1\n\t" + "pmovmskb %%xmm1, %0" : "=3Dr" (rc) ); + if ( rc !=3D 0x000f ) + goto fail; + printf("okay\n"); + } + else + printf("skipped\n"); + #ifdef __x86_64__ printf("%-40s", "Testing movq %mm3,32(%ecx)..."); if ( stack_exec && cpu_has_mmx ) @@ -2078,6 +2293,41 @@ int main(int argc, char **argv) printf("skipped\n"); #endif =20 + printf("%-40s", "Testing maskmovq (zero mask)..."); + if ( stack_exec && cpu_has_sse ) + { + decl_insn(maskmovq); + + asm volatile ( "pcmpgtb %mm4, %mm4\n" + put_insn(maskmovq, "maskmovq %mm4, %mm4") ); + + set_insn(maskmovq); + regs.edi =3D 0; + rc =3D x86_emulate(&ctxt, &emulops); + if ( rc !=3D X86EMUL_OKAY || !check_eip(maskmovq) ) + goto fail; + printf("okay\n"); + } + else + printf("skipped\n"); + + printf("%-40s", "Testing maskmovdqu (zero mask)..."); + if ( stack_exec && cpu_has_sse2 ) + { + decl_insn(maskmovdqu); + + asm volatile ( "pcmpgtb %xmm3, %xmm3\n" + put_insn(maskmovdqu, "maskmovdqu %xmm3, %xmm3") ); + + set_insn(maskmovdqu); + rc =3D x86_emulate(&ctxt, &emulops); + if ( rc !=3D X86EMUL_OKAY || !check_eip(maskmovdqu) ) + goto fail; + printf("okay\n"); + } + else + printf("skipped\n"); + printf("%-40s", "Testing lddqu 4(%edx),%xmm4..."); if ( stack_exec && cpu_has_sse3 ) { --- a/xen/arch/x86/x86_emulate/x86_emulate.c +++ b/xen/arch/x86/x86_emulate/x86_emulate.c @@ -236,9 +236,12 @@ static const struct { [0x0f] =3D { ModRM|SrcImmByte }, [0x10] =3D { DstImplicit|SrcMem|ModRM|Mov, simd_any_fp }, [0x11] =3D { DstMem|SrcImplicit|ModRM|Mov, simd_any_fp }, - [0x12 ... 0x13] =3D { ImplicitOps|ModRM }, + [0x12] =3D { DstImplicit|SrcMem|ModRM|Mov, simd_other }, + [0x13] =3D { DstMem|SrcImplicit|ModRM|Mov, simd_other }, [0x14 ... 0x15] =3D { DstImplicit|SrcMem|ModRM, simd_packed_fp }, - [0x16 ... 0x1f] =3D { ImplicitOps|ModRM }, + [0x16] =3D { DstImplicit|SrcMem|ModRM|Mov, simd_other }, + [0x17] =3D { DstMem|SrcImplicit|ModRM|Mov, simd_other }, + [0x18 ... 0x1f] =3D { ImplicitOps|ModRM }, [0x20 ... 0x21] =3D { DstMem|SrcImplicit|ModRM }, [0x22 ... 0x23] =3D { DstImplicit|SrcMem|ModRM }, [0x28] =3D { DstImplicit|SrcMem|ModRM|Mov, simd_packed_fp }, @@ -251,7 +254,7 @@ static const struct { [0x38] =3D { DstReg|SrcMem|ModRM }, [0x3a] =3D { DstReg|SrcImmByte|ModRM }, [0x40 ... 0x4f] =3D { DstReg|SrcMem|ModRM|Mov }, - [0x50] =3D { ModRM }, + [0x50] =3D { DstReg|SrcImplicit|ModRM|Mov }, [0x51] =3D { DstImplicit|SrcMem|ModRM|TwoOp, simd_any_fp }, [0x52 ... 0x53] =3D { DstImplicit|SrcMem|ModRM|TwoOp, simd_single_fp = }, [0x54 ... 0x57] =3D { DstImplicit|SrcMem|ModRM, simd_packed_fp }, @@ -262,14 +265,16 @@ static const struct { [0x63 ... 0x67] =3D { DstImplicit|SrcMem|ModRM, simd_packed_int }, [0x68 ... 0x6a] =3D { DstImplicit|SrcMem|ModRM, simd_other }, [0x6b ... 0x6d] =3D { DstImplicit|SrcMem|ModRM, simd_packed_int }, - [0x6e ... 0x6f] =3D { ImplicitOps|ModRM }, + [0x6e] =3D { DstImplicit|SrcMem|ModRM|Mov }, + [0x6f] =3D { DstImplicit|SrcMem|ModRM|Mov, simd_packed_int }, [0x70] =3D { SrcImmByte|ModRM|TwoOp, simd_other }, [0x71 ... 0x73] =3D { SrcImmByte|ModRM }, [0x74 ... 
0x76] =3D { DstImplicit|SrcMem|ModRM, simd_packed_int }, [0x77] =3D { DstImplicit|SrcNone }, [0x78 ... 0x79] =3D { ModRM }, [0x7c ... 0x7d] =3D { DstImplicit|SrcMem|ModRM, simd_other }, - [0x7e ... 0x7f] =3D { ImplicitOps|ModRM }, + [0x7e] =3D { DstMem|SrcImplicit|ModRM|Mov }, + [0x7f] =3D { DstMem|SrcImplicit|ModRM|Mov, simd_packed_int }, [0x80 ... 0x8f] =3D { DstImplicit|SrcImm }, [0x90 ... 0x9f] =3D { ByteOp|DstMem|SrcNone|ModRM|Mov }, [0xa0 ... 0xa1] =3D { ImplicitOps|Mov }, @@ -311,19 +316,19 @@ static const struct { [0xd0] =3D { DstImplicit|SrcMem|ModRM, simd_other }, [0xd1 ... 0xd3] =3D { DstImplicit|SrcMem|ModRM, simd_other }, [0xd4 ... 0xd5] =3D { DstImplicit|SrcMem|ModRM, simd_packed_int }, - [0xd6] =3D { ImplicitOps|ModRM }, - [0xd7] =3D { ModRM }, + [0xd6] =3D { DstMem|SrcImplicit|ModRM|Mov, simd_other }, + [0xd7] =3D { DstReg|SrcImplicit|ModRM|Mov }, [0xd8 ... 0xdf] =3D { DstImplicit|SrcMem|ModRM, simd_packed_int }, [0xe0] =3D { DstImplicit|SrcMem|ModRM, simd_packed_int }, [0xe1 ... 0xe2] =3D { DstImplicit|SrcMem|ModRM, simd_other }, [0xe3 ... 0xe5] =3D { DstImplicit|SrcMem|ModRM, simd_packed_int }, [0xe6] =3D { ModRM }, - [0xe7] =3D { ImplicitOps|ModRM }, + [0xe7] =3D { DstMem|SrcImplicit|ModRM|Mov, simd_packed_int }, [0xe8 ... 0xef] =3D { DstImplicit|SrcMem|ModRM, simd_packed_int }, [0xf0] =3D { DstImplicit|SrcMem|ModRM|Mov, simd_other }, [0xf1 ... 0xf3] =3D { DstImplicit|SrcMem|ModRM, simd_other }, [0xf4 ... 0xf6] =3D { DstImplicit|SrcMem|ModRM, simd_packed_int }, - [0xf7] =3D { ModRM }, + [0xf7] =3D { DstMem|SrcMem|ModRM|Mov, simd_packed_int }, [0xf8 ... 0xfe] =3D { DstImplicit|SrcMem|ModRM, simd_packed_int }, [0xff] =3D { ModRM } }; @@ -359,11 +364,6 @@ enum vex_pfx { =20 static const uint8_t sse_prefix[] =3D { 0x66, 0xf3, 0xf2 }; =20 -#define SET_SSE_PREFIX(dst, vex_pfx) do { \ - if ( vex_pfx ) \ - (dst) =3D sse_prefix[(vex_pfx) - 1]; \ -} while (0) - union vex { uint8_t raw[2]; struct { @@ -378,15 +378,35 @@ union vex { }; }; =20 +#ifdef __x86_64__ +# define PFX2 REX_PREFIX +#else +# define PFX2 0x3e +#endif +#define PFX_BYTES 3 +#define init_prefixes(stub) ({ \ + uint8_t *buf_ =3D get_stub(stub); \ + buf_[0] =3D 0x3e; \ + buf_[1] =3D PFX2; \ + buf_[2] =3D 0x0f; \ + buf_ + 3; \ +}) + #define copy_REX_VEX(ptr, rex, vex) do { \ if ( (vex).opcx !=3D vex_none ) \ { \ if ( !mode_64bit() ) \ vex.reg |=3D 8; \ - ptr[0] =3D 0xc4, ptr[1] =3D (vex).raw[0], ptr[2] =3D (vex).raw[1];= \ + (ptr)[0 - PFX_BYTES] =3D 0xc4; \ + (ptr)[1 - PFX_BYTES] =3D (vex).raw[0]; \ + (ptr)[2 - PFX_BYTES] =3D (vex).raw[1]; \ + } \ + else \ + { \ + if ( (vex).pfx ) \ + (ptr)[0 - PFX_BYTES] =3D sse_prefix[(vex).pfx - 1]; \ + (ptr)[1 - PFX_BYTES] |=3D rex; \ } \ - else if ( mode_64bit() ) \ - ptr[1] =3D rex | REX_PREFIX; \ } while (0) =20 union evex { @@ -2159,7 +2179,8 @@ x86_decode_twobyte( case 0x10 ... 0x18: case 0x28 ... 0x2f: case 0x50 ... 0x77: - case 0x79 ... 0x7f: + case 0x79 ... 0x7d: + case 0x7f: case 0xae: case 0xc2 ... 0xc3: case 0xc5 ... 0xc6: @@ -2179,6 +2200,18 @@ x86_decode_twobyte( op_bytes =3D mode_64bit() ? 8 : 4; break; =20 + case 0x7e: + ctxt->opcode |=3D MASK_INSR(vex.pfx, X86EMUL_OPC_PFX_MASK); + if ( vex.pfx =3D=3D vex_f3 ) /* movq xmm/m64,xmm */ + { + case X86EMUL_OPC_VEX_F3(0, 0x7e): /* vmovq xmm/m64,xmm */ + state->desc =3D DstImplicit | SrcMem | ModRM | Mov; + state->simd_size =3D simd_other; + /* Avoid the state->desc adjustment below. 
*/ + return X86EMUL_OKAY; + } + break; + case 0xb8: /* jmpe / popcnt */ if ( rep_prefix() ) ctxt->opcode |=3D MASK_INSR(vex.pfx, X86EMUL_OPC_PFX_MASK); @@ -2776,7 +2809,7 @@ x86_emulate( struct cpu_user_regs _regs =3D *ctxt->regs; struct x86_emulate_state state; int rc; - uint8_t b, d; + uint8_t b, d, *opc =3D NULL; bool singlestep =3D (_regs._eflags & EFLG_TF) && !is_branch_step(ctxt,= ops); bool sfence =3D false; struct operand src =3D { .reg =3D PTR_POISON }; @@ -5255,6 +5288,7 @@ x86_emulate( CASE_SIMD_ALL_FP(_VEX, 0x0f, 0x5e): /* vdiv{p,s}{s,d} {x,y}mm/mem,{= x,y}mm,{x,y}mm */ CASE_SIMD_ALL_FP(, 0x0f, 0x5f): /* max{p,s}{s,d} xmm/mem,xmm = */ CASE_SIMD_ALL_FP(_VEX, 0x0f, 0x5f): /* vmax{p,s}{s,d} {x,y}mm/mem,{= x,y}mm,{x,y}mm */ + simd_0f_fp: if ( vex.opcx =3D=3D vex_none ) { if ( vex.pfx & VEX_PREFIX_DOUBLE_MASK ) @@ -5273,24 +5307,63 @@ x86_emulate( get_fpu(X86EMUL_FPU_ymm, &fic); } simd_0f_common: - { - uint8_t *buf =3D get_stub(stub); - - buf[0] =3D 0x3e; - buf[1] =3D 0x3e; - buf[2] =3D 0x0f; - buf[3] =3D b; - buf[4] =3D modrm; + opc =3D init_prefixes(stub); + opc[0] =3D b; + opc[1] =3D modrm; if ( ea.type =3D=3D OP_MEM ) { /* convert memory operand to (%rAX) */ rex_prefix &=3D ~REX_B; vex.b =3D 1; - buf[4] &=3D 0x38; + opc[1] &=3D 0x38; } - fic.insn_bytes =3D 5; + fic.insn_bytes =3D PFX_BYTES + 2; break; - } + + case X86EMUL_OPC_66(0x0f, 0x12): /* movlpd m64,xmm */ + case X86EMUL_OPC_VEX_66(0x0f, 0x12): /* vmovlpd m64,xmm,xmm */ + CASE_SIMD_PACKED_FP(, 0x0f, 0x13): /* movlp{s,d} xmm,m64 */ + CASE_SIMD_PACKED_FP(_VEX, 0x0f, 0x13): /* vmovlp{s,d} xmm,m64 */ + case X86EMUL_OPC_66(0x0f, 0x16): /* movhpd m64,xmm */ + case X86EMUL_OPC_VEX_66(0x0f, 0x16): /* vmovhpd m64,xmm,xmm */ + CASE_SIMD_PACKED_FP(, 0x0f, 0x17): /* movhp{s,d} xmm,m64 */ + CASE_SIMD_PACKED_FP(_VEX, 0x0f, 0x17): /* vmovhp{s,d} xmm,m64 */ + generate_exception_if(ea.type !=3D OP_MEM, EXC_UD); + /* fall through */ + case X86EMUL_OPC(0x0f, 0x12): /* movlps m64,xmm */ + /* movhlps xmm,xmm */ + case X86EMUL_OPC_VEX(0x0f, 0x12): /* vmovlps m64,xmm,xmm */ + /* vmovhlps xmm,xmm,xmm */ + case X86EMUL_OPC(0x0f, 0x16): /* movhps m64,xmm */ + /* movlhps xmm,xmm */ + case X86EMUL_OPC_VEX(0x0f, 0x16): /* vmovhps m64,xmm,xmm */ + /* vmovlhps xmm,xmm,xmm */ + generate_exception_if(vex.l, EXC_UD); + if ( (d & DstMask) !=3D DstMem ) + d &=3D ~TwoOp; + op_bytes =3D 8; + goto simd_0f_fp; + + case X86EMUL_OPC_F3(0x0f, 0x12): /* movsldup xmm/m128,xmm */ + case X86EMUL_OPC_VEX_F3(0x0f, 0x12): /* vmovsldup {x,y}mm/mem,{x,y}m= m */ + case X86EMUL_OPC_F2(0x0f, 0x12): /* movddup xmm/m64,xmm */ + case X86EMUL_OPC_VEX_F2(0x0f, 0x12): /* vmovddup {x,y}mm/mem,{x,y}mm= */ + case X86EMUL_OPC_F3(0x0f, 0x16): /* movshdup xmm/m128,xmm */ + case X86EMUL_OPC_VEX_F3(0x0f, 0x16): /* vmovshdup {x,y}mm/mem,{x,y}m= m */ + d |=3D TwoOp; + op_bytes =3D !(vex.pfx & VEX_PREFIX_DOUBLE_MASK) || vex.l + ? 
16 << vex.l : 8; + if ( vex.opcx =3D=3D vex_none ) + { + host_and_vcpu_must_have(sse3); + get_fpu(X86EMUL_FPU_xmm, &fic); + } + else + { + host_and_vcpu_must_have(avx); + get_fpu(X86EMUL_FPU_ymm, &fic); + } + goto simd_0f_common; =20 case X86EMUL_OPC(0x0f, 0x20): /* mov cr,reg */ case X86EMUL_OPC(0x0f, 0x21): /* mov dr,reg */ @@ -5451,6 +5524,57 @@ x86_emulate( break; } =20 + CASE_SIMD_PACKED_FP(, 0x0f, 0x50): /* movmskp{s,d} xmm,reg */ + CASE_SIMD_PACKED_FP(_VEX, 0x0f, 0x50): /* vmovmskp{s,d} {x,y}mm,reg = */ + CASE_SIMD_PACKED_INT(0x0f, 0xd7): /* pmovmskb {,x}mm,reg */ + case X86EMUL_OPC_VEX_66(0x0f, 0xd7): /* vpmovmskb {x,y}mm,reg */ + generate_exception_if(ea.type !=3D OP_REG, EXC_UD); + + if ( vex.opcx =3D=3D vex_none ) + { + if ( vex.pfx & VEX_PREFIX_DOUBLE_MASK ) + vcpu_must_have(sse2); + else + { + if ( b !=3D 0x50 ) + host_and_vcpu_must_have(mmx); + vcpu_must_have(sse); + } + if ( b =3D=3D 0x50 || (vex.pfx & VEX_PREFIX_DOUBLE_MASK) ) + get_fpu(X86EMUL_FPU_xmm, &fic); + else + get_fpu(X86EMUL_FPU_mmx, &fic); + } + else + { + generate_exception_if(vex.reg !=3D 0xf, EXC_UD); + if ( b =3D=3D 0x50 || !vex.l ) + host_and_vcpu_must_have(avx); + else + host_and_vcpu_must_have(avx2); + get_fpu(X86EMUL_FPU_ymm, &fic); + } + + opc =3D init_prefixes(stub); + opc[0] =3D b; + /* Convert GPR destination to %rAX. */ + rex_prefix &=3D ~REX_R; + vex.r =3D 1; + if ( !mode_64bit() ) + vex.w =3D 0; + opc[1] =3D modrm & 0xc7; + fic.insn_bytes =3D PFX_BYTES + 2; + opc[2] =3D 0xc3; + + copy_REX_VEX(opc, rex_prefix, vex); + invoke_stub("", "", "=3Da" (dst.val) : [dummy] "i" (0)); + + put_stub(stub); + put_fpu(&fic); + + dst.bytes =3D 4; + break; + CASE_SIMD_PACKED_INT(0x0f, 0x60): /* punpcklbw {,x}mm/mem,{,x}mm = */ case X86EMUL_OPC_VEX_66(0x0f, 0x60): /* vpunpcklbw {x,y}mm/mem,{x,y}mm= ,{x,y}mm */ CASE_SIMD_PACKED_INT(0x0f, 0x61): /* punpcklwd {,x}mm/mem,{,x}mm = */ @@ -5570,134 +5694,82 @@ x86_emulate( } goto simd_0f_common; =20 - case X86EMUL_OPC(0x0f, 0xe7): /* movntq mm,m64 */ + CASE_SIMD_PACKED_INT(0x0f, 0x6e): /* mov{d,q} r/m,{,x}mm */ + case X86EMUL_OPC_VEX_66(0x0f, 0x6e): /* vmov{d,q} r/m,xmm */ + CASE_SIMD_PACKED_INT(0x0f, 0x7e): /* mov{d,q} {,x}mm,r/m */ + case X86EMUL_OPC_VEX_66(0x0f, 0x7e): /* vmov{d,q} xmm,r/m */ + if ( vex.opcx !=3D vex_none ) + { + generate_exception_if(vex.l || vex.reg !=3D 0xf, EXC_UD); + host_and_vcpu_must_have(avx); + get_fpu(X86EMUL_FPU_ymm, &fic); + } + else if ( vex.pfx ) + { + vcpu_must_have(sse2); + get_fpu(X86EMUL_FPU_xmm, &fic); + } + else + { + host_and_vcpu_must_have(mmx); + get_fpu(X86EMUL_FPU_mmx, &fic); + } + + opc =3D init_prefixes(stub); + opc[0] =3D b; + /* Convert memory/GPR operand to (%rAX). 
*/ + rex_prefix &=3D ~REX_B; + vex.b =3D 1; + if ( !mode_64bit() ) + vex.w =3D 0; + opc[1] =3D modrm & 0x38; + fic.insn_bytes =3D PFX_BYTES + 2; + opc[2] =3D 0xc3; + + copy_REX_VEX(opc, rex_prefix, vex); + invoke_stub("", "", "+m" (src.val) : "a" (&src.val)); + dst.val =3D src.val; + + put_stub(stub); + put_fpu(&fic); + break; + case X86EMUL_OPC_66(0x0f, 0xe7): /* movntdq xmm,m128 */ - case X86EMUL_OPC_VEX_66(0x0f, 0xe7): /* vmovntdq xmm,m128 */ - /* vmovntdq ymm,m256 */ - fail_if(ea.type !=3D OP_MEM); + case X86EMUL_OPC_VEX_66(0x0f, 0xe7): /* vmovntdq {x,y}mm,mem */ + generate_exception_if(ea.type !=3D OP_MEM, EXC_UD); + sfence =3D true; /* fall through */ - case X86EMUL_OPC(0x0f, 0x6f): /* movq mm/m64,mm */ case X86EMUL_OPC_66(0x0f, 0x6f): /* movdqa xmm/m128,xmm */ + case X86EMUL_OPC_VEX_66(0x0f, 0x6f): /* vmovdqa {x,y}mm/mem,{x,y}mm = */ case X86EMUL_OPC_F3(0x0f, 0x6f): /* movdqu xmm/m128,xmm */ - case X86EMUL_OPC_VEX_66(0x0f, 0x6f): /* vmovdqa xmm/m128,xmm */ - /* vmovdqa ymm/m256,ymm */ - case X86EMUL_OPC_VEX_F3(0x0f, 0x6f): /* vmovdqu xmm/m128,xmm */ - /* vmovdqu ymm/m256,ymm */ - case X86EMUL_OPC(0x0f, 0x7e): /* movd mm,r/m32 */ - /* movq mm,r/m64 */ - case X86EMUL_OPC_66(0x0f, 0x7e): /* movd xmm,r/m32 */ - /* movq xmm,r/m64 */ - case X86EMUL_OPC_VEX_66(0x0f, 0x7e): /* vmovd xmm,r/m32 */ - /* vmovq xmm,r/m64 */ - case X86EMUL_OPC(0x0f, 0x7f): /* movq mm,mm/m64 */ + case X86EMUL_OPC_VEX_F3(0x0f, 0x6f): /* vmovdqu {x,y}mm/mem,{x,y}mm = */ case X86EMUL_OPC_66(0x0f, 0x7f): /* movdqa xmm,xmm/m128 */ - case X86EMUL_OPC_VEX_66(0x0f, 0x7f): /* vmovdqa xmm,xmm/m128 */ - /* vmovdqa ymm,ymm/m256 */ + case X86EMUL_OPC_VEX_66(0x0f, 0x7f): /* vmovdqa {x,y}mm,{x,y}mm/m128 = */ case X86EMUL_OPC_F3(0x0f, 0x7f): /* movdqu xmm,xmm/m128 */ - case X86EMUL_OPC_VEX_F3(0x0f, 0x7f): /* vmovdqu xmm,xmm/m128 */ - /* vmovdqu ymm,ymm/m256 */ - case X86EMUL_OPC_66(0x0f, 0xd6): /* movq xmm,xmm/m64 */ - case X86EMUL_OPC_VEX_66(0x0f, 0xd6): /* vmovq xmm,xmm/m64 */ - { - uint8_t *buf =3D get_stub(stub); - - fic.insn_bytes =3D 5; - buf[0] =3D 0x3e; - buf[1] =3D 0x3e; - buf[2] =3D 0x0f; - buf[3] =3D b; - buf[4] =3D modrm; - buf[5] =3D 0xc3; - if ( vex.opcx =3D=3D vex_none ) - { - switch ( vex.pfx ) - { - case vex_66: - case vex_f3: - vcpu_must_have(sse2); - /* Converting movdqu to movdqa here: Our buffer is = aligned. 
*/ - buf[0] =3D 0x66; - get_fpu(X86EMUL_FPU_xmm, &fic); - ea.bytes =3D 16; - break; - case vex_none: - if ( b !=3D 0xe7 ) - host_and_vcpu_must_have(mmx); - else - vcpu_must_have(sse); - get_fpu(X86EMUL_FPU_mmx, &fic); - ea.bytes =3D 8; - break; - default: - goto cannot_emulate; - } - } - else + case X86EMUL_OPC_VEX_F3(0x0f, 0x7f): /* vmovdqu {x,y}mm,{x,y}mm/mem = */ + if ( vex.opcx !=3D vex_none ) { - fail_if(vex.reg !=3D 0xf); host_and_vcpu_must_have(avx); get_fpu(X86EMUL_FPU_ymm, &fic); - ea.bytes =3D 16 << vex.l; } - switch ( b ) + else { - case 0x7e: - generate_exception_if(vex.l, EXC_UD); - ea.bytes =3D op_bytes; - break; - case 0xd6: - generate_exception_if(vex.l, EXC_UD); - ea.bytes =3D 8; - break; + vcpu_must_have(sse2); + get_fpu(X86EMUL_FPU_xmm, &fic); } - if ( ea.type =3D=3D OP_MEM ) - { - uint32_t mxcsr =3D 0; + d |=3D TwoOp; + op_bytes =3D 16 << vex.l; + goto simd_0f_common; =20 - if ( ea.bytes < 16 || vex.pfx =3D=3D vex_f3 ) - mxcsr =3D MXCSR_MM; - else if ( vcpu_has_misalignsse() ) - asm ( "stmxcsr %0" : "=3Dm" (mxcsr) ); - generate_exception_if(!(mxcsr & MXCSR_MM) && - !is_aligned(ea.mem.seg, ea.mem.off, = ea.bytes, - ctxt, ops), - EXC_GP, 0); - if ( b =3D=3D 0x6f ) - rc =3D ops->read(ea.mem.seg, ea.mem.off+0, mmvalp, - ea.bytes, ctxt); - else - fail_if(!ops->write); /* Check before running the stub. = */ - } - if ( ea.type =3D=3D OP_MEM || b =3D=3D 0x7e ) - { - /* Convert memory operand or GPR destination to (%rAX) */ - rex_prefix &=3D ~REX_B; - vex.b =3D 1; - buf[4] &=3D 0x38; - if ( ea.type =3D=3D OP_MEM ) - ea.reg =3D (void *)mmvalp; - else /* Ensure zero-extension of a 32-bit result. */ - *ea.reg =3D 0; - } - if ( !rc ) - { - copy_REX_VEX(buf, rex_prefix, vex); - asm volatile ( "call *%0" : : "r" (stub.func), "a" (ea.reg) - : "memory" ); - } - put_fpu(&fic); - put_stub(stub); - if ( !rc && (b !=3D 0x6f) && (ea.type =3D=3D OP_MEM) ) - { - ASSERT(ops->write); /* See the fail_if() above. */ - rc =3D ops->write(ea.mem.seg, ea.mem.off, mmvalp, - ea.bytes, ctxt); - } - if ( rc ) - goto done; - dst.type =3D OP_NONE; - break; - } + case X86EMUL_OPC_VEX_66(0x0f, 0xd6): /* vmovq xmm,xmm/m64 */ + generate_exception_if(vex.l, EXC_UD); + d |=3D TwoOp; + /* fall through */ + case X86EMUL_OPC_66(0x0f, 0xd6): /* movq xmm,xmm/m64 */ + case X86EMUL_OPC(0x0f, 0x6f): /* movq mm/m64,mm */ + case X86EMUL_OPC(0x0f, 0x7f): /* movq mm,mm/m64 */ + op_bytes =3D 8; + goto simd_0f_int; =20 CASE_SIMD_PACKED_INT(0x0f, 0x70): /* pshuf{w,d} $imm8,{,x}mm/mem,{,= x}mm */ case X86EMUL_OPC_VEX_66(0x0f, 0x70): /* vpshufd $imm8,{x,y}mm/mem,{x,y= }mm */ @@ -5728,25 +5800,25 @@ x86_emulate( get_fpu(X86EMUL_FPU_mmx, &fic); } simd_0f_imm8: - { - uint8_t *buf =3D get_stub(stub); - - buf[0] =3D 0x3e; - buf[1] =3D 0x3e; - buf[2] =3D 0x0f; - buf[3] =3D b; - buf[4] =3D modrm; + opc =3D init_prefixes(stub); + opc[0] =3D b; + opc[1] =3D modrm; if ( ea.type =3D=3D OP_MEM ) { /* Convert memory operand to (%rAX). 
*/ rex_prefix &=3D ~REX_B; vex.b =3D 1; - buf[4] &=3D 0x38; + opc[1] &=3D 0x38; } - buf[5] =3D imm1; - fic.insn_bytes =3D 6; + opc[2] =3D imm1; + fic.insn_bytes =3D PFX_BYTES + 3; break; - } + + case X86EMUL_OPC_F3(0x0f, 0x7e): /* movq xmm/m64,xmm */ + case X86EMUL_OPC_VEX_F3(0x0f, 0x7e): /* vmovq xmm/m64,xmm */ + generate_exception_if(vex.l, EXC_UD); + op_bytes =3D 8; + goto simd_0f_int; =20 case X86EMUL_OPC_F2(0x0f, 0xf0): /* lddqu m128,xmm */ case X86EMUL_OPC_VEX_F2(0x0f, 0xf0): /* vlddqu mem,{x,y}mm */ @@ -6319,6 +6391,17 @@ x86_emulate( get_fpu(X86EMUL_FPU_mmx, &fic); goto simd_0f_common; =20 + case X86EMUL_OPC_F3(0x0f, 0xd6): /* movq2dq mm,xmm */ + case X86EMUL_OPC_F2(0x0f, 0xd6): /* movdq2q xmm,mm */ + generate_exception_if(ea.type !=3D OP_REG, EXC_UD); + op_bytes =3D 8; + host_and_vcpu_must_have(mmx); + goto simd_0f_int; + + case X86EMUL_OPC(0x0f, 0xe7): /* movntq mm,m64 */ + generate_exception_if(ea.type !=3D OP_MEM, EXC_UD); + sfence =3D true; + /* fall through */ case X86EMUL_OPC(0x0f, 0xda): /* pminub mm/m64,mm */ case X86EMUL_OPC(0x0f, 0xde): /* pmaxub mm/m64,mm */ case X86EMUL_OPC(0x0f, 0xea): /* pminsw mm/m64,mm */ @@ -6332,6 +6415,73 @@ x86_emulate( get_fpu(X86EMUL_FPU_mmx, &fic); goto simd_0f_common; =20 + CASE_SIMD_PACKED_INT(0x0f, 0xf7): /* maskmov{q,dqu} {,x}mm,{,x}mm = */ + case X86EMUL_OPC_VEX_66(0x0f, 0xf7): /* vmaskmovdqu xmm,xmm */ + generate_exception_if(ea.type !=3D OP_REG, EXC_UD); + if ( vex.opcx !=3D vex_none ) + { + generate_exception_if(vex.l || vex.reg !=3D 0xf, EXC_UD); + d |=3D TwoOp; + host_and_vcpu_must_have(avx); + get_fpu(X86EMUL_FPU_ymm, &fic); + } + else if ( vex.pfx ) + { + vcpu_must_have(sse2); + get_fpu(X86EMUL_FPU_xmm, &fic); + } + else + { + host_and_vcpu_must_have(mmx); + vcpu_must_have(sse); + get_fpu(X86EMUL_FPU_mmx, &fic); + } + + /* + * While we can't reasonably provide fully correct behavior here + * (in particular avoiding the memory read in anticipation of all + * bytes in the range eventually being written), we can (and = should) + * still suppress the memory access if all mask bits are clear. = Read + * the mask bits via {,v}pmovmskb for that purpose. + */ + opc =3D init_prefixes(stub); + opc[0] =3D 0xd7; /* {,v}pmovmskb */ + /* (Ab)use "sfence" for latching the original REX.R / VEX.R. */ + sfence =3D rex_prefix & REX_R; + /* Convert GPR destination to %rAX. */ + rex_prefix &=3D ~REX_R; + vex.r =3D 1; + if ( !mode_64bit() ) + vex.w =3D 0; + opc[1] =3D modrm & 0xc7; + fic.insn_bytes =3D PFX_BYTES + 2; + opc[2] =3D 0xc3; + + copy_REX_VEX(opc, rex_prefix, vex); + invoke_stub("", "", "=3Da" (ea.val) : [dummy] "i" (0)); + + put_stub(stub); + if ( !ea.val ) + { + put_fpu(&fic); + goto complete_insn; + } + + opc =3D init_prefixes(stub); + opc[0] =3D b; + opc[1] =3D modrm; + /* Restore high bit of XMM destination. 
*/ + if ( sfence ) + { + rex_prefix |=3D REX_R; + vex.r =3D 0; + } + + ea.type =3D OP_MEM; + ea.mem.off =3D truncate_ea(_regs.r(di)); + sfence =3D true; + break; + case X86EMUL_OPC(0x0f38, 0xf0): /* movbe m,r */ case X86EMUL_OPC(0x0f38, 0xf1): /* movbe r,m */ vcpu_must_have(movbe); @@ -6595,23 +6745,14 @@ x86_emulate( =20 if ( state->simd_size ) { -#ifdef __XEN__ - uint8_t *buf =3D stub.ptr; -#else - uint8_t *buf =3D get_stub(stub); -#endif - generate_exception_if(!op_bytes, EXC_UD); generate_exception_if(vex.opcx && (d & TwoOp) && vex.reg !=3D = 0xf, EXC_UD); =20 - if ( !buf ) + if ( !opc ) BUG(); - if ( vex.opcx =3D=3D vex_none ) - SET_SSE_PREFIX(buf[0], vex.pfx); - - buf[fic.insn_bytes] =3D 0xc3; - copy_REX_VEX(buf, rex_prefix, vex); + opc[fic.insn_bytes - PFX_BYTES] =3D 0xc3; + copy_REX_VEX(opc, rex_prefix, vex); =20 if ( ea.type =3D=3D OP_MEM ) { @@ -6619,10 +6760,16 @@ x86_emulate( =20 if ( op_bytes < 16 || (vex.opcx - ? /* vmov{a,nt}p{s,d} are exceptions. */ - ext !=3D ext_0f || ((b | 1) !=3D 0x29 && b !=3D 0x2b) - : /* movup{s,d} and lddqu are exceptions. */ - ext =3D=3D ext_0f && ((b | 1) =3D=3D 0x11 || b =3D=3D = 0xf0)) ) + ? /* vmov{{a,nt}p{s,d},dqa,ntdq} are exceptions. */ + ext !=3D ext_0f || + ((b | 1) !=3D 0x29 && b !=3D 0x2b && + ((b | 0x10) !=3D 0x7f || vex.pfx !=3D vex_66) && + b !=3D 0xe7) + : /* movup{s,d}, {,mask}movdqu, and lddqu are exceptions= . */ + ext =3D=3D ext_0f && + ((b | 1) =3D=3D 0x11 || + ((b | 0x10) =3D=3D 0x7f && vex.pfx =3D=3D vex_f3) || + b =3D=3D 0xf7 || b =3D=3D 0xf0)) ) mxcsr =3D MXCSR_MM; else if ( vcpu_has_misalignsse() ) asm ( "stmxcsr %0" : "=3Dm" (mxcsr) ); @@ -6630,14 +6777,25 @@ x86_emulate( !is_aligned(ea.mem.seg, ea.mem.off, = op_bytes, ctxt, ops), EXC_GP, 0); - if ( (d & SrcMask) =3D=3D SrcMem ) + switch ( d & SrcMask ) { + case SrcMem: rc =3D ops->read(ea.mem.seg, ea.mem.off, mmvalp, = op_bytes, ctxt); if ( rc !=3D X86EMUL_OKAY ) goto done; + /* fall through */ + case SrcMem16: dst.type =3D OP_NONE; + break; + default: + if ( (d & DstMask) !=3D DstMem ) + { + ASSERT_UNREACHABLE(); + return X86EMUL_UNHANDLEABLE; + } + break; } - else if ( (d & DstMask) =3D=3D DstMem ) + if ( (d & DstMask) =3D=3D DstMem ) { fail_if(!ops->write); /* Check before running the stub. = */ ASSERT(d & Mov); @@ -6645,18 +6803,17 @@ x86_emulate( dst.bytes =3D op_bytes; dst.mem =3D ea.mem; } - else if ( (d & SrcMask) =3D=3D SrcMem16 ) - dst.type =3D OP_NONE; - else - { - ASSERT_UNREACHABLE(); - return X86EMUL_UNHANDLEABLE; - } } else dst.type =3D OP_NONE; =20 - invoke_stub("", "", "+m" (*mmvalp) : "a" (mmvalp)); + /* {,v}maskmov{q,dqu}, as an exception, uses rDI. */ + if ( likely((ctxt->opcode & ~(X86EMUL_OPC_PFX_MASK | + X86EMUL_OPC_ENCODING_MASK)) !=3D + X86EMUL_OPC(0x0f, 0xf7)) ) + invoke_stub("", "", "+m" (*mmvalp) : "a" (mmvalp)); + else + invoke_stub("", "", "+m" (*mmvalp) : "D" (mmvalp)); =20 put_stub(stub); put_fpu(&fic); @@ -6912,6 +7069,8 @@ x86_insn_is_mem_access(const struct x86_ case 0xa4 ... 0xa7: /* MOVS / CMPS */ case 0xaa ... 0xaf: /* STOS / LODS / SCAS */ case 0xd7: /* XLAT */ + CASE_SIMD_PACKED_INT(0x0f, 0xf7): /* MASKMOV{Q,DQU} */ + case X86EMUL_OPC_VEX_66(0x0f, 0xf7): /* VMASKMOVDQU */ return true; =20 case X86EMUL_OPC(0x0f, 0x01): @@ -6929,7 +7088,8 @@ x86_insn_is_mem_write(const struct x86_e switch ( state->desc & DstMask ) { case DstMem: - return state->modrm_mod !=3D 3; + /* The SrcMem check is to cover {,V}MASKMOV{Q,DQU}. 
*/ + return state->modrm_mod !=3D 3 || (state->desc & SrcMask) =3D=3D = SrcMem; =20 case DstBitBase: case DstImplicit: @@ -6949,22 +7109,9 @@ x86_insn_is_mem_write(const struct x86_e case 0x6c: case 0x6d: /* INS */ case 0xa4: case 0xa5: /* MOVS */ case 0xaa: case 0xab: /* STOS */ - case X86EMUL_OPC(0x0f, 0x7e): /* MOVD/MOVQ */ - case X86EMUL_OPC_66(0x0f, 0x7e): /* MOVD/MOVQ */ - case X86EMUL_OPC_VEX_66(0x0f, 0x7e): /* VMOVD/VMOVQ */ - case X86EMUL_OPC(0x0f, 0x7f): /* VMOVQ */ - case X86EMUL_OPC_66(0x0f, 0x7f): /* MOVDQA */ - case X86EMUL_OPC_VEX_66(0x0f, 0x7f): /* VMOVDQA */ - case X86EMUL_OPC_F3(0x0f, 0x7f): /* MOVDQU */ - case X86EMUL_OPC_VEX_F3(0x0f, 0x7f): /* VMOVDQU */ case X86EMUL_OPC(0x0f, 0xab): /* BTS */ case X86EMUL_OPC(0x0f, 0xb3): /* BTR */ case X86EMUL_OPC(0x0f, 0xbb): /* BTC */ - case X86EMUL_OPC_66(0x0f, 0xd6): /* MOVQ */ - case X86EMUL_OPC_VEX_66(0x0f, 0xd6): /* VMOVQ */ - case X86EMUL_OPC(0x0f, 0xe7): /* MOVNTQ */ - case X86EMUL_OPC_66(0x0f, 0xe7): /* MOVNTDQ */ - case X86EMUL_OPC_VEX_66(0x0f, 0xe7): /* VMOVNTDQ */ return true; =20 case 0xd9:
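
Side note on the {,v}maskmov{q,dqu} handling above: the sketch below is
just an illustration (not the emulator's code path) of why an all-zero
result from the {,v}pmovmskb stub lets the memory access be suppressed -
byte N gets stored only when the top bit of mask byte N is set, which is
exactly the bit pmovmskb places at position N.

#include <stdbool.h>
#include <stdint.h>

static bool maskmov_would_write(const uint8_t *mask, unsigned int bytes)
{
    uint32_t msk = 0;
    unsigned int i;

    /* This is what {,v}pmovmskb computes over the 8/16 mask bytes. */
    for ( i = 0; i < bytes; ++i )
        msk |= (uint32_t)(mask[i] >> 7) << i;

    return msk != 0;   /* zero => nothing would be written => skip */
}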
*/=0A+ if ( sfence )=0A+ {=0A+ = rex_prefix |=3D REX_R;=0A+ vex.r =3D 0;=0A+ }=0A+=0A+ = ea.type =3D OP_MEM;=0A+ ea.mem.off =3D truncate_ea(_regs.r(di))= ;=0A+ sfence =3D true;=0A+ break;=0A+=0A case X86EMUL_OPC= (0x0f38, 0xf0): /* movbe m,r */=0A case X86EMUL_OPC(0x0f38, 0xf1): /* = movbe r,m */=0A vcpu_must_have(movbe);=0A@@ -6595,23 +6745,14 @@ = x86_emulate(=0A =0A if ( state->simd_size )=0A {=0A-#ifdef = __XEN__=0A- uint8_t *buf =3D stub.ptr;=0A-#else=0A- uint8_t = *buf =3D get_stub(stub);=0A-#endif=0A-=0A generate_exception_if(!op= _bytes, EXC_UD);=0A generate_exception_if(vex.opcx && (d & TwoOp) = && vex.reg !=3D 0xf,=0A EXC_UD);=0A =0A- = if ( !buf )=0A+ if ( !opc )=0A BUG();=0A- if = ( vex.opcx =3D=3D vex_none )=0A- SET_SSE_PREFIX(buf[0], = vex.pfx);=0A-=0A- buf[fic.insn_bytes] =3D 0xc3;=0A- = copy_REX_VEX(buf, rex_prefix, vex);=0A+ opc[fic.insn_bytes - = PFX_BYTES] =3D 0xc3;=0A+ copy_REX_VEX(opc, rex_prefix, vex);=0A =0A = if ( ea.type =3D=3D OP_MEM )=0A {=0A@@ -6619,10 +6760,16 = @@ x86_emulate(=0A =0A if ( op_bytes < 16 ||=0A = (vex.opcx=0A- ? /* vmov{a,nt}p{s,d} are exceptions. = */=0A- ext !=3D ext_0f || ((b | 1) !=3D 0x29 && b !=3D = 0x2b)=0A- : /* movup{s,d} and lddqu are exceptions. = */=0A- ext =3D=3D ext_0f && ((b | 1) =3D=3D 0x11 || b = =3D=3D 0xf0)) )=0A+ ? /* vmov{{a,nt}p{s,d},dqa,ntdq} are = exceptions. */=0A+ ext !=3D ext_0f ||=0A+ = ((b | 1) !=3D 0x29 && b !=3D 0x2b &&=0A+ ((b | = 0x10) !=3D 0x7f || vex.pfx !=3D vex_66) &&=0A+ b !=3D = 0xe7)=0A+ : /* movup{s,d}, {,mask}movdqu, and lddqu are = exceptions. */=0A+ ext =3D=3D ext_0f &&=0A+ = ((b | 1) =3D=3D 0x11 ||=0A+ ((b | 0x10) =3D=3D = 0x7f && vex.pfx =3D=3D vex_f3) ||=0A+ b =3D=3D 0xf7 || = b =3D=3D 0xf0)) )=0A mxcsr =3D MXCSR_MM;=0A = else if ( vcpu_has_misalignsse() )=0A asm ( "stmxcsr %0" : = "=3Dm" (mxcsr) );=0A@@ -6630,14 +6777,25 @@ x86_emulate(=0A = !is_aligned(ea.mem.seg, ea.mem.off, op_bytes,=0A = ctxt, ops),=0A = EXC_GP, 0);=0A- if ( (d & SrcMask) =3D=3D SrcMem = )=0A+ switch ( d & SrcMask )=0A {=0A+ = case SrcMem:=0A rc =3D ops->read(ea.mem.seg, ea.mem.off, = mmvalp, op_bytes, ctxt);=0A if ( rc !=3D X86EMUL_OKAY )=0A = goto done;=0A+ /* fall through */=0A+ = case SrcMem16:=0A dst.type =3D OP_NONE;=0A+ = break;=0A+ default:=0A+ if ( (d & = DstMask) !=3D DstMem )=0A+ {=0A+ = ASSERT_UNREACHABLE();=0A+ return X86EMUL_UNHANDLEABLE;= =0A+ }=0A+ break;=0A }=0A- = else if ( (d & DstMask) =3D=3D DstMem )=0A+ if ( (d & = DstMask) =3D=3D DstMem )=0A {=0A fail_if(!ops->= write); /* Check before running the stub. */=0A ASSERT(d & = Mov);=0A@@ -6645,18 +6803,17 @@ x86_emulate(=0A dst.bytes = =3D op_bytes;=0A dst.mem =3D ea.mem;=0A }=0A- = else if ( (d & SrcMask) =3D=3D SrcMem16 )=0A- = dst.type =3D OP_NONE;=0A- else=0A- {=0A- = ASSERT_UNREACHABLE();=0A- return X86EMUL_UNHANDLEABLE;=0A= - }=0A }=0A else=0A dst.type =3D = OP_NONE;=0A =0A- invoke_stub("", "", "+m" (*mmvalp) : "a" (mmvalp));= =0A+ /* {,v}maskmov{q,dqu}, as an exception, uses rDI. */=0A+ = if ( likely((ctxt->opcode & ~(X86EMUL_OPC_PFX_MASK |=0A+ = X86EMUL_OPC_ENCODING_MASK)) !=3D=0A+ = X86EMUL_OPC(0x0f, 0xf7)) )=0A+ invoke_stub("", "", "+m" = (*mmvalp) : "a" (mmvalp));=0A+ else=0A+ invoke_stub("", = "", "+m" (*mmvalp) : "D" (mmvalp));=0A =0A put_stub(stub);=0A = put_fpu(&fic);=0A@@ -6912,6 +7069,8 @@ x86_insn_is_mem_access(const = struct x86_=0A case 0xa4 ... 0xa7: /* MOVS / CMPS */=0A case 0xaa = ... 
0xaf: /* STOS / LODS / SCAS */=0A case 0xd7: /* XLAT = */=0A+ CASE_SIMD_PACKED_INT(0x0f, 0xf7): /* MASKMOV{Q,DQU} */=0A+ = case X86EMUL_OPC_VEX_66(0x0f, 0xf7): /* VMASKMOVDQU */=0A return = true;=0A =0A case X86EMUL_OPC(0x0f, 0x01):=0A@@ -6929,7 +7088,8 @@ = x86_insn_is_mem_write(const struct x86_e=0A switch ( state->desc & = DstMask )=0A {=0A case DstMem:=0A- return state->modrm_mod = !=3D 3;=0A+ /* The SrcMem check is to cover {,V}MASKMOV{Q,DQU}. = */=0A+ return state->modrm_mod !=3D 3 || (state->desc & SrcMask) = =3D=3D SrcMem;=0A =0A case DstBitBase:=0A case DstImplicit:=0A@@ = -6949,22 +7109,9 @@ x86_insn_is_mem_write(const struct x86_e=0A case = 0x6c: case 0x6d: /* INS */=0A case 0xa4: case 0xa5: = /* MOVS */=0A case 0xaa: case 0xab: /* STOS = */=0A- case X86EMUL_OPC(0x0f, 0x7e): /* MOVD/MOVQ */=0A- case = X86EMUL_OPC_66(0x0f, 0x7e): /* MOVD/MOVQ */=0A- case X86EMUL_OPC_VEX= _66(0x0f, 0x7e): /* VMOVD/VMOVQ */=0A- case X86EMUL_OPC(0x0f, 0x7f): = /* VMOVQ */=0A- case X86EMUL_OPC_66(0x0f, 0x7f): /* MOVDQA = */=0A- case X86EMUL_OPC_VEX_66(0x0f, 0x7f): /* VMOVDQA */=0A- case = X86EMUL_OPC_F3(0x0f, 0x7f): /* MOVDQU */=0A- case X86EMUL_OPC_VEX_F3= (0x0f, 0x7f): /* VMOVDQU */=0A case X86EMUL_OPC(0x0f, 0xab): /* = BTS */=0A case X86EMUL_OPC(0x0f, 0xb3): /* BTR */=0A case = X86EMUL_OPC(0x0f, 0xbb): /* BTC */=0A- case X86EMUL_OPC_66(0x0f, = 0xd6): /* MOVQ */=0A- case X86EMUL_OPC_VEX_66(0x0f, 0xd6): /* VMOVQ = */=0A- case X86EMUL_OPC(0x0f, 0xe7): /* MOVNTQ */=0A- case = X86EMUL_OPC_66(0x0f, 0xe7): /* MOVNTDQ */=0A- case X86EMUL_OPC_VEX_6= 6(0x0f, 0xe7): /* VMOVNTDQ */=0A return true;=0A =0A case = 0xd9:=0A --=__Part427B8620.1__= Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: base64 Content-Disposition: inline X19fX19fX19fX19fX19fX19fX19fX19fX19fX19fX19fX19fX19fX19fX19fX18KWGVuLWRldmVs IG1haWxpbmcgbGlzdApYZW4tZGV2ZWxAbGlzdHMueGVuLm9yZwpodHRwczovL2xpc3RzLnhlbi5v cmcveGVuLWRldmVsCg== --=__Part427B8620.1__=--
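A note on the {,v}maskmov{q,dqu} handling above: the store is byte-granular, so when no
mask byte has its top bit set, no memory access should become architecturally visible at
all, which is why the emulator bails out early via {,v}pmovmskb. A minimal, hypothetical
C sketch of those store semantics (illustration only, not code from this patch):

static void masked_store(uint8_t *dst, const uint8_t *src,
                         const uint8_t *mask, size_t bytes)
{
    size_t i;

    /* Only bytes whose mask byte has bit 7 set get written. */
    for ( i = 0; i < bytes; ++i )
        if ( mask[i] & 0x80 )
            dst[i] = src[i];
}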
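For readers unfamiliar with the stub approach used throughout (copy the instruction into
an executable buffer with its ModRM register field redirected to %rAX / (%rAX), append a
RET, and call it), here is a stand-alone user-space sketch of the same idea. It is an
assumption-laden illustration only: it presumes x86-64, the SysV calling convention, and
a system that permits writable+executable anonymous mappings, and it does not use the
emulator's init_prefixes()/invoke_stub() machinery.

#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

int main(void)
{
    /* Stub: movdqu (%rdi),%xmm1 ; pmovmskb %xmm1,%eax ; ret */
    static const uint8_t stub[] = {
        0xf3, 0x0f, 0x6f, 0x0f,   /* movdqu (%rdi),%xmm1 */
        0x66, 0x0f, 0xd7, 0xc1,   /* pmovmskb %xmm1,%eax */
        0xc3                      /* ret */
    };
    void *buf = mmap(NULL, 4096, PROT_READ | PROT_WRITE | PROT_EXEC,
                     MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
    unsigned int (*fn)(const void *);
    uint8_t mask[16] = { 0 };     /* all mask bits clear */

    if ( buf == MAP_FAILED )
        return 1;
    memcpy(buf, stub, sizeof(stub));
    fn = (unsigned int (*)(const void *))buf;

    /* 0 here corresponds to the case where the maskmov store is skipped. */
    printf("mask bits: %#x\n", fn(mask));

    mask[3] = 0x80;               /* set one mask bit */
    printf("mask bits: %#x\n", fn(mask));

    munmap(buf, 4096);
    return 0;
}

The emulator's real stubs differ in that the prefixes/REX/VEX bytes are rebuilt by
copy_REX_VEX() and the operand pointer is passed in %rAX (or %rDI for the maskmov
special case), but the control flow - build bytes, terminate with 0xc3, call, inspect
the result - is the same.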