* [PATCH 2/8] x86emul: support most memory accessing MMX/SSE/SSE2 insns
2017-01-25 14:49 [PATCH 0/8] x86emul: MMX/SSE/SSE2 support Jan Beulich
2017-01-25 15:03 ` [PATCH 1/8] x86emul: catch exceptions occurring in stubs Jan Beulich
@ 2017-01-25 15:04 ` Jan Beulich
2017-01-25 15:04 ` [PATCH 3/8] x86emul: support MMX/SSE/SSE2 moves Jan Beulich
` (5 subsequent siblings)
7 siblings, 0 replies; 9+ messages in thread
From: Jan Beulich @ 2017-01-25 15:04 UTC
To: xen-devel; +Cc: Andrew Cooper
[-- Attachment #1: Type: text/plain, Size: 55386 bytes --]
This aims at covering most MMX/SSEn/AVX instructions in the 0x0f-escape
space with memory operands. Not covered here are irregular moves,
converts, and {,U}COMIS{S,D} (modifying EFLAGS).
Note that the distinction between the simd_*_fp entries isn't strictly
needed, but I've kept them separate since an earlier version required
it, and we may well find the distinction useful down the road.
Also take the opportunity and adjust the vmovdqu test case which the
new LDDQU one here has been cloned from: To zero a ymm register we
don't need to jump through hoops, as 128-bit AVX insns zero the upper
portion of the destination register; additionally, the disabled AVX2
code was using a wrong YMM register.
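To illustrate the zeroing property being relied upon (a standalone
sketch, not part of the patch; needs an AVX-capable CPU to run):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
    uint8_t out[32];
    static const uint8_t zero[32];

    asm volatile ( "vpcmpeqb %%xmm4, %%xmm4, %%xmm4\n\t"
                   "vinsertf128 $1, %%xmm4, %%ymm4, %%ymm4\n\t" /* set all 256 bits */
                   "vpxor %%xmm4, %%xmm4, %%xmm4\n\t" /* 128-bit insn zeroes bits 255:128 too */
                   "vmovdqu %%ymm4, %0"
                   : "=m" (out) : : "xmm4" );

    puts(memcmp(out, zero, sizeof(out)) ? "upper half survived?!"
                                        : "whole ymm register is zero");
    return 0;
}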
Signed-off-by: Jan Beulich <jbeulich@suse.com>
--- a/tools/tests/x86_emulator/test_x86_emulator.c
+++ b/tools/tests/x86_emulator/test_x86_emulator.c
@@ -1656,12 +1656,7 @@ int main(int argc, char **argv)
{
decl_insn(vmovdqu_from_mem);
-#if 0 /* Don't use AVX2 instructions for now */
- asm volatile ( "vpcmpgtb %%ymm4, %%ymm4, %%ymm4\n"
-#else
- asm volatile ( "vpcmpgtb %%xmm4, %%xmm4, %%xmm4\n\t"
- "vinsertf128 $1, %%xmm4, %%ymm4, %%ymm4\n"
-#endif
+ asm volatile ( "vpxor %%xmm4, %%xmm4, %%xmm4\n"
put_insn(vmovdqu_from_mem, "vmovdqu (%0), %%ymm4")
:: "d" (NULL) );
@@ -1675,7 +1670,7 @@ int main(int argc, char **argv)
#if 0 /* Don't use AVX2 instructions for now */
asm ( "vpcmpeqb %%ymm2, %%ymm2, %%ymm2\n\t"
"vpcmpeqb %%ymm4, %%ymm2, %%ymm0\n\t"
- "vpmovmskb %%ymm1, %0" : "=r" (rc) );
+ "vpmovmskb %%ymm0, %0" : "=r" (rc) );
#else
asm ( "vextractf128 $1, %%ymm4, %%xmm3\n\t"
"vpcmpeqb %%xmm2, %%xmm2, %%xmm2\n\t"
@@ -2083,6 +2078,67 @@ int main(int argc, char **argv)
printf("skipped\n");
#endif
+ printf("%-40s", "Testing lddqu 4(%edx),%xmm4...");
+ if ( stack_exec && cpu_has_sse3 )
+ {
+ decl_insn(lddqu);
+
+ asm volatile ( "pcmpgtb %%xmm4, %%xmm4\n"
+ put_insn(lddqu, "lddqu 4(%0), %%xmm4")
+ :: "d" (NULL) );
+
+ set_insn(lddqu);
+ memset(res, 0x55, 64);
+ memset(res + 1, 0xff, 16);
+ regs.edx = (unsigned long)res;
+ rc = x86_emulate(&ctxt, &emulops);
+ if ( rc != X86EMUL_OKAY || !check_eip(lddqu) )
+ goto fail;
+ asm ( "pcmpeqb %%xmm2, %%xmm2\n\t"
+ "pcmpeqb %%xmm4, %%xmm2\n\t"
+ "pmovmskb %%xmm2, %0" : "=r" (rc) );
+ if ( rc != 0xffff )
+ goto fail;
+ printf("okay\n");
+ }
+ else
+ printf("skipped\n");
+
+ printf("%-40s", "Testing vlddqu (%ecx),%ymm4...");
+ if ( stack_exec && cpu_has_avx )
+ {
+ decl_insn(vlddqu);
+
+ asm volatile ( "vpxor %%xmm4, %%xmm4, %%xmm4\n"
+ put_insn(vlddqu, "vlddqu (%0), %%ymm4")
+ :: "c" (NULL) );
+
+ set_insn(vlddqu);
+ memset(res + 1, 0xff, 32);
+ regs.ecx = (unsigned long)(res + 1);
+ rc = x86_emulate(&ctxt, &emulops);
+ if ( rc != X86EMUL_OKAY || !check_eip(vlddqu) )
+ goto fail;
+#if 0 /* Don't use AVX2 instructions for now */
+ asm ( "vpcmpeqb %%ymm2, %%ymm2, %%ymm2\n\t"
+ "vpcmpeqb %%ymm4, %%ymm2, %%ymm0\n\t"
+ "vpmovmskb %%ymm0, %0" : "=r" (rc) );
+#else
+ asm ( "vextractf128 $1, %%ymm4, %%xmm3\n\t"
+ "vpcmpeqb %%xmm2, %%xmm2, %%xmm2\n\t"
+ "vpcmpeqb %%xmm4, %%xmm2, %%xmm0\n\t"
+ "vpcmpeqb %%xmm3, %%xmm2, %%xmm1\n\t"
+ "vpmovmskb %%xmm0, %0\n\t"
+ "vpmovmskb %%xmm1, %1" : "=r" (rc), "=r" (i) );
+ rc |= i << 16;
+#endif
+ if ( ~rc )
+ goto fail;
+ printf("okay\n");
+ }
+ else
+ printf("skipped\n");
+
#undef decl_insn
#undef put_insn
#undef set_insn
--- a/tools/tests/x86_emulator/x86_emulate.h
+++ b/tools/tests/x86_emulator/x86_emulate.h
@@ -81,6 +81,12 @@ static inline uint64_t xgetbv(uint32_t x
(res.d & (1U << 26)) != 0; \
})
+#define cpu_has_sse3 ({ \
+ struct cpuid_leaf res; \
+ emul_test_cpuid(1, 0, &res, NULL); \
+ (res.c & (1U << 0)) != 0; \
+})
+
#define cpu_has_popcnt ({ \
struct cpuid_leaf res; \
emul_test_cpuid(1, 0, &res, NULL); \
--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -45,6 +45,8 @@
#define ModRM (1<<6)
/* Destination is only written; never read. */
#define Mov (1<<7)
+/* VEX/EVEX (SIMD only): 2nd source operand unused (must be all ones) */
+#define TwoOp Mov
/* All operands are implicit in the opcode. */
#define ImplicitOps (DstImplicit|SrcImplicit)
@@ -180,104 +182,171 @@ static const opcode_desc_t opcode_table[
ImplicitOps, ImplicitOps, ByteOp|DstMem|SrcNone|ModRM, DstMem|SrcNone|ModRM
};
-static const opcode_desc_t twobyte_table[256] = {
- /* 0x00 - 0x07 */
- ModRM, ImplicitOps|ModRM, DstReg|SrcMem16|ModRM, DstReg|SrcMem16|ModRM,
- 0, ImplicitOps, ImplicitOps, ImplicitOps,
- /* 0x08 - 0x0F */
- ImplicitOps, ImplicitOps, 0, ImplicitOps,
- 0, ImplicitOps|ModRM, ImplicitOps, ModRM|SrcImmByte,
- /* 0x10 - 0x17 */
- ImplicitOps|ModRM, ImplicitOps|ModRM, ImplicitOps|ModRM, ImplicitOps|ModRM,
- ImplicitOps|ModRM, ImplicitOps|ModRM, ImplicitOps|ModRM, ImplicitOps|ModRM,
- /* 0x18 - 0x1F */
- ImplicitOps|ModRM, ImplicitOps|ModRM, ImplicitOps|ModRM, ImplicitOps|ModRM,
- ImplicitOps|ModRM, ImplicitOps|ModRM, ImplicitOps|ModRM, ImplicitOps|ModRM,
- /* 0x20 - 0x27 */
- DstMem|SrcImplicit|ModRM, DstMem|SrcImplicit|ModRM,
- DstImplicit|SrcMem|ModRM, DstImplicit|SrcMem|ModRM,
- 0, 0, 0, 0,
- /* 0x28 - 0x2F */
- ImplicitOps|ModRM, ImplicitOps|ModRM, ImplicitOps|ModRM, ImplicitOps|ModRM,
- ImplicitOps|ModRM, ImplicitOps|ModRM, ImplicitOps|ModRM, ImplicitOps|ModRM,
- /* 0x30 - 0x37 */
- ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
- ImplicitOps, ImplicitOps, 0, ImplicitOps,
- /* 0x38 - 0x3F */
- DstReg|SrcMem|ModRM, 0, DstReg|SrcImmByte|ModRM, 0, 0, 0, 0, 0,
- /* 0x40 - 0x47 */
- DstReg|SrcMem|ModRM|Mov, DstReg|SrcMem|ModRM|Mov,
- DstReg|SrcMem|ModRM|Mov, DstReg|SrcMem|ModRM|Mov,
- DstReg|SrcMem|ModRM|Mov, DstReg|SrcMem|ModRM|Mov,
- DstReg|SrcMem|ModRM|Mov, DstReg|SrcMem|ModRM|Mov,
- /* 0x48 - 0x4F */
- DstReg|SrcMem|ModRM|Mov, DstReg|SrcMem|ModRM|Mov,
- DstReg|SrcMem|ModRM|Mov, DstReg|SrcMem|ModRM|Mov,
- DstReg|SrcMem|ModRM|Mov, DstReg|SrcMem|ModRM|Mov,
- DstReg|SrcMem|ModRM|Mov, DstReg|SrcMem|ModRM|Mov,
- /* 0x50 - 0x5F */
- ModRM, ModRM, ModRM, ModRM, ModRM, ModRM, ModRM, ModRM,
- ModRM, ModRM, ModRM, ModRM, ModRM, ModRM, ModRM, ModRM,
- /* 0x60 - 0x6F */
- ModRM, ModRM, ModRM, ModRM, ModRM, ModRM, ModRM, ModRM,
- ModRM, ModRM, ModRM, ModRM, ModRM, ModRM, ModRM, ImplicitOps|ModRM,
- /* 0x70 - 0x7F */
- SrcImmByte|ModRM, SrcImmByte|ModRM, SrcImmByte|ModRM, SrcImmByte|ModRM,
- ModRM, ModRM, ModRM, ImplicitOps,
- ModRM, ModRM, 0, 0, ModRM, ModRM, ImplicitOps|ModRM, ImplicitOps|ModRM,
- /* 0x80 - 0x87 */
- DstImplicit|SrcImm, DstImplicit|SrcImm,
- DstImplicit|SrcImm, DstImplicit|SrcImm,
- DstImplicit|SrcImm, DstImplicit|SrcImm,
- DstImplicit|SrcImm, DstImplicit|SrcImm,
- /* 0x88 - 0x8F */
- DstImplicit|SrcImm, DstImplicit|SrcImm,
- DstImplicit|SrcImm, DstImplicit|SrcImm,
- DstImplicit|SrcImm, DstImplicit|SrcImm,
- DstImplicit|SrcImm, DstImplicit|SrcImm,
- /* 0x90 - 0x97 */
- ByteOp|DstMem|SrcNone|ModRM|Mov, ByteOp|DstMem|SrcNone|ModRM|Mov,
- ByteOp|DstMem|SrcNone|ModRM|Mov, ByteOp|DstMem|SrcNone|ModRM|Mov,
- ByteOp|DstMem|SrcNone|ModRM|Mov, ByteOp|DstMem|SrcNone|ModRM|Mov,
- ByteOp|DstMem|SrcNone|ModRM|Mov, ByteOp|DstMem|SrcNone|ModRM|Mov,
- /* 0x98 - 0x9F */
- ByteOp|DstMem|SrcNone|ModRM|Mov, ByteOp|DstMem|SrcNone|ModRM|Mov,
- ByteOp|DstMem|SrcNone|ModRM|Mov, ByteOp|DstMem|SrcNone|ModRM|Mov,
- ByteOp|DstMem|SrcNone|ModRM|Mov, ByteOp|DstMem|SrcNone|ModRM|Mov,
- ByteOp|DstMem|SrcNone|ModRM|Mov, ByteOp|DstMem|SrcNone|ModRM|Mov,
- /* 0xA0 - 0xA7 */
- ImplicitOps|Mov, ImplicitOps|Mov, ImplicitOps, DstBitBase|SrcReg|ModRM,
- DstMem|SrcImmByte|ModRM, DstMem|SrcReg|ModRM, ModRM, ModRM,
- /* 0xA8 - 0xAF */
- ImplicitOps|Mov, ImplicitOps|Mov, ImplicitOps, DstBitBase|SrcReg|ModRM,
- DstMem|SrcImmByte|ModRM, DstMem|SrcReg|ModRM,
- ImplicitOps|ModRM, DstReg|SrcMem|ModRM,
- /* 0xB0 - 0xB7 */
- ByteOp|DstMem|SrcReg|ModRM, DstMem|SrcReg|ModRM,
- DstReg|SrcMem|ModRM|Mov, DstBitBase|SrcReg|ModRM,
- DstReg|SrcMem|ModRM|Mov, DstReg|SrcMem|ModRM|Mov,
- ByteOp|DstReg|SrcMem|ModRM|Mov, DstReg|SrcMem16|ModRM|Mov,
- /* 0xB8 - 0xBF */
- DstReg|SrcMem|ModRM, ModRM,
- DstBitBase|SrcImmByte|ModRM, DstBitBase|SrcReg|ModRM,
- DstReg|SrcMem|ModRM, DstReg|SrcMem|ModRM,
- ByteOp|DstReg|SrcMem|ModRM|Mov, DstReg|SrcMem16|ModRM|Mov,
- /* 0xC0 - 0xC7 */
- ByteOp|DstMem|SrcReg|ModRM, DstMem|SrcReg|ModRM,
- SrcImmByte|ModRM, DstMem|SrcReg|ModRM|Mov,
- SrcImmByte|ModRM, SrcImmByte|ModRM, SrcImmByte|ModRM, ImplicitOps|ModRM,
- /* 0xC8 - 0xCF */
- ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
- ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
- /* 0xD0 - 0xDF */
- ModRM, ModRM, ModRM, ModRM, ModRM, ModRM, ImplicitOps|ModRM, ModRM,
- ModRM, ModRM, ModRM, ModRM, ModRM, ModRM, ModRM, ModRM,
- /* 0xE0 - 0xEF */
- ModRM, ModRM, ModRM, ModRM, ModRM, ModRM, ModRM, ImplicitOps|ModRM,
- ModRM, ModRM, ModRM, ModRM, ModRM, ModRM, ModRM, ModRM,
- /* 0xF0 - 0xFF */
- ModRM, ModRM, ModRM, ModRM, ModRM, ModRM, ModRM, ModRM,
- ModRM, ModRM, ModRM, ModRM, ModRM, ModRM, ModRM, ModRM
+enum simd_opsize {
+ simd_none,
+ /*
+ * Ordinary packed integers:
+ * - 64 bits without prefix 66 (MMX)
+ * - 128 bits with prefix 66 (SSEn)
+ * - 128/256 bits depending on VEX.L (AVX)
+ */
+ simd_packed_int,
+ /*
+ * Ordinary packed/scalar floating point:
+ * - 128 bits without prefix or with prefix 66 (SSEn)
+ * - 128/256 bits depending on VEX.L (AVX)
+ * - 32 bits with prefix F3 (scalar single)
+ * - 64 bits with prefix F2 (scalar double)
+ */
+ simd_any_fp,
+ /*
+ * Packed floating point:
+ * - 128 bits without prefix or with prefix 66 (SSEn)
+ * - 128/256 bits depending on VEX.L (AVX)
+ */
+ simd_packed_fp,
+ /*
+ * Single precision packed/scalar floating point:
+ * - 128 bits without prefix (SSEn)
+ * - 128/256 bits depending on VEX.L, no prefix (AVX)
+ * - 32 bits with prefix F3 (scalar)
+ */
+ simd_single_fp,
+ /* Operand size encoded in non-standard way. */
+ simd_other
+};
+typedef uint8_t simd_opsize_t;
+
+static const struct {
+ opcode_desc_t desc;
+ simd_opsize_t size;
+} twobyte_table[256] = {
+ [0x00] = { ModRM },
+ [0x01] = { ImplicitOps|ModRM },
+ [0x02] = { DstReg|SrcMem16|ModRM },
+ [0x03] = { DstReg|SrcMem16|ModRM },
+ [0x05] = { ImplicitOps },
+ [0x06] = { ImplicitOps },
+ [0x07] = { ImplicitOps },
+ [0x08] = { ImplicitOps },
+ [0x09] = { ImplicitOps },
+ [0x0b] = { ImplicitOps },
+ [0x0d] = { ImplicitOps|ModRM },
+ [0x0e] = { ImplicitOps },
+ [0x0f] = { ModRM|SrcImmByte },
+ [0x10] = { DstImplicit|SrcMem|ModRM|Mov, simd_any_fp },
+ [0x11] = { DstMem|SrcImplicit|ModRM|Mov, simd_any_fp },
+ [0x12] = { ImplicitOps|ModRM },
+ [0x13] = { ImplicitOps|ModRM },
+ [0x14 ... 0x15] = { DstImplicit|SrcMem|ModRM, simd_packed_fp },
+ [0x16] = { ImplicitOps|ModRM },
+ [0x17] = { ImplicitOps|ModRM },
+ [0x18] = { ImplicitOps|ModRM },
+ [0x19] = { ImplicitOps|ModRM },
+ [0x1a] = { ImplicitOps|ModRM },
+ [0x1b] = { ImplicitOps|ModRM },
+ [0x1c] = { ImplicitOps|ModRM },
+ [0x1d] = { ImplicitOps|ModRM },
+ [0x1e] = { ImplicitOps|ModRM },
+ [0x1f] = { ImplicitOps|ModRM },
+ [0x20 ... 0x21] = { DstMem|SrcImplicit|ModRM },
+ [0x22 ... 0x23] = { DstImplicit|SrcMem|ModRM },
+ [0x28] = { DstImplicit|SrcMem|ModRM|Mov, simd_packed_fp },
+ [0x29] = { DstMem|SrcImplicit|ModRM|Mov, simd_packed_fp },
+ [0x2a] = { ImplicitOps|ModRM },
+ [0x2b] = { DstMem|SrcImplicit|ModRM|Mov, simd_any_fp },
+ [0x2c ... 0x2d] = { ImplicitOps|ModRM },
+ [0x2e ... 0x2f] = { ImplicitOps|ModRM },
+ [0x30] = { ImplicitOps },
+ [0x31] = { ImplicitOps },
+ [0x32] = { ImplicitOps },
+ [0x33] = { ImplicitOps },
+ [0x34] = { ImplicitOps },
+ [0x35] = { ImplicitOps },
+ [0x37] = { ImplicitOps },
+ [0x38] = { DstReg|SrcMem|ModRM },
+ [0x3a] = { DstReg|SrcImmByte|ModRM },
+ [0x40 ... 0x4f] = { DstReg|SrcMem|ModRM|Mov },
+ [0x50] = { ModRM },
+ [0x51] = { DstImplicit|SrcMem|ModRM|TwoOp, simd_any_fp },
+ [0x52 ... 0x53] = { DstImplicit|SrcMem|ModRM|TwoOp, simd_single_fp },
+ [0x54 ... 0x57] = { DstImplicit|SrcMem|ModRM, simd_packed_fp },
+ [0x58 ... 0x59] = { DstImplicit|SrcMem|ModRM, simd_any_fp },
+ [0x5a] = { ModRM },
+ [0x5b] = { ModRM },
+ [0x5c ... 0x5f] = { DstImplicit|SrcMem|ModRM, simd_any_fp },
+ [0x60 ... 0x62] = { DstImplicit|SrcMem|ModRM, simd_other },
+ [0x63 ... 0x67] = { DstImplicit|SrcMem|ModRM, simd_packed_int },
+ [0x68 ... 0x6a] = { DstImplicit|SrcMem|ModRM, simd_other },
+ [0x6b] = { DstImplicit|SrcMem|ModRM, simd_packed_int },
+ [0x6c ... 0x6d] = { DstImplicit|SrcMem|ModRM, simd_packed_int },
+ [0x6e ... 0x6f] = { ImplicitOps|ModRM },
+ [0x70] = { SrcImmByte|ModRM|TwoOp, simd_other },
+ [0x71 ... 0x73] = { SrcImmByte|ModRM },
+ [0x74 ... 0x76] = { DstImplicit|SrcMem|ModRM, simd_packed_int },
+ [0x77] = { DstImplicit|SrcNone },
+ [0x78] = { ModRM },
+ [0x79] = { ModRM },
+ [0x7c] = { DstImplicit|SrcMem|ModRM, simd_other },
+ [0x7d] = { DstImplicit|SrcMem|ModRM, simd_other },
+ [0x7e ... 0x7f] = { ImplicitOps|ModRM },
+ [0x80 ... 0x8f] = { DstImplicit|SrcImm },
+ [0x90 ... 0x9f] = { ByteOp|DstMem|SrcNone|ModRM|Mov },
+ [0xa0 ... 0xa1] = { ImplicitOps|Mov },
+ [0xa2] = { ImplicitOps },
+ [0xa3] = { DstBitBase|SrcReg|ModRM },
+ [0xa4] = { DstMem|SrcImmByte|ModRM },
+ [0xa5] = { DstMem|SrcReg|ModRM },
+ [0xa6] = { ModRM },
+ [0xa7] = { ModRM },
+ [0xa8 ... 0xa9] = { ImplicitOps|Mov },
+ [0xaa] = { ImplicitOps },
+ [0xab] = { DstBitBase|SrcReg|ModRM },
+ [0xac] = { DstMem|SrcImmByte|ModRM },
+ [0xad] = { DstMem|SrcReg|ModRM },
+ [0xae] = { ImplicitOps|ModRM },
+ [0xaf] = { DstReg|SrcMem|ModRM },
+ [0xb0] = { ByteOp|DstMem|SrcReg|ModRM },
+ [0xb1] = { DstMem|SrcReg|ModRM },
+ [0xb2] = { DstReg|SrcMem|ModRM|Mov },
+ [0xb3] = { DstBitBase|SrcReg|ModRM },
+ [0xb4 ... 0xb5] = { DstReg|SrcMem|ModRM|Mov },
+ [0xb6] = { ByteOp|DstReg|SrcMem|ModRM|Mov },
+ [0xb7] = { DstReg|SrcMem16|ModRM|Mov },
+ [0xb8] = { DstReg|SrcMem|ModRM },
+ [0xb9] = { ModRM },
+ [0xba] = { DstBitBase|SrcImmByte|ModRM },
+ [0xbb] = { DstBitBase|SrcReg|ModRM },
+ [0xbc ... 0xbd] = { DstReg|SrcMem|ModRM },
+ [0xbe] = { ByteOp|DstReg|SrcMem|ModRM|Mov },
+ [0xbf] = { DstReg|SrcMem16|ModRM|Mov },
+ [0xc0] = { ByteOp|DstMem|SrcReg|ModRM },
+ [0xc1] = { DstMem|SrcReg|ModRM },
+ [0xc2] = { DstImplicit|SrcImmByte|ModRM, simd_any_fp },
+ [0xc3] = { DstMem|SrcReg|ModRM|Mov },
+ [0xc4] = { DstReg|SrcImmByte|ModRM, simd_packed_int },
+ [0xc5] = { SrcImmByte|ModRM },
+ [0xc6] = { DstImplicit|SrcImmByte|ModRM, simd_packed_fp },
+ [0xc7] = { ImplicitOps|ModRM },
+ [0xc8 ... 0xcf] = { ImplicitOps },
+ [0xd0] = { DstImplicit|SrcMem|ModRM, simd_other },
+ [0xd1 ... 0xd3] = { DstImplicit|SrcMem|ModRM, simd_other },
+ [0xd4 ... 0xd5] = { DstImplicit|SrcMem|ModRM, simd_packed_int },
+ [0xd6] = { ImplicitOps|ModRM },
+ [0xd7] = { ModRM },
+ [0xd8 ... 0xdf] = { DstImplicit|SrcMem|ModRM, simd_packed_int },
+ [0xe0] = { DstImplicit|SrcMem|ModRM, simd_packed_int },
+ [0xe1 ... 0xe2] = { DstImplicit|SrcMem|ModRM, simd_other },
+ [0xe3 ... 0xe5] = { DstImplicit|SrcMem|ModRM, simd_packed_int },
+ [0xe6] = { ModRM },
+ [0xe7] = { ImplicitOps|ModRM },
+ [0xe8 ... 0xef] = { DstImplicit|SrcMem|ModRM, simd_packed_int },
+ [0xf0] = { DstImplicit|SrcMem|ModRM|Mov, simd_other },
+ [0xf1 ... 0xf3] = { DstImplicit|SrcMem|ModRM, simd_other },
+ [0xf4 ... 0xf6] = { DstImplicit|SrcMem|ModRM, simd_packed_int },
+ [0xf7] = { ModRM },
+ [0xf8 ... 0xfe] = { DstImplicit|SrcMem|ModRM, simd_packed_int },
+ [0xff] = { ModRM }
};
static const opcode_desc_t xop_table[] = {
@@ -1372,10 +1441,12 @@ static bool vcpu_has(
#define vcpu_has_lahf_lm() vcpu_has(0x80000001, ECX, 0, ctxt, ops)
#define vcpu_has_cr8_legacy() vcpu_has(0x80000001, ECX, 4, ctxt, ops)
#define vcpu_has_lzcnt() vcpu_has(0x80000001, ECX, 5, ctxt, ops)
+#define vcpu_has_sse4a() vcpu_has(0x80000001, ECX, 6, ctxt, ops)
#define vcpu_has_misalignsse() vcpu_has(0x80000001, ECX, 7, ctxt, ops)
#define vcpu_has_tbm() vcpu_has(0x80000001, ECX, 21, ctxt, ops)
#define vcpu_has_bmi1() vcpu_has( 7, EBX, 3, ctxt, ops)
#define vcpu_has_hle() vcpu_has( 7, EBX, 4, ctxt, ops)
+#define vcpu_has_avx2() vcpu_has( 7, EBX, 5, ctxt, ops)
#define vcpu_has_bmi2() vcpu_has( 7, EBX, 8, ctxt, ops)
#define vcpu_has_rtm() vcpu_has( 7, EBX, 11, ctxt, ops)
#define vcpu_has_mpx() vcpu_has( 7, EBX, 14, ctxt, ops)
@@ -1975,6 +2046,7 @@ struct x86_emulate_state {
opcode_desc_t desc;
union vex vex;
union evex evex;
+ enum simd_opsize simd_size;
/*
* Data operand effective address (usually computed from ModRM).
@@ -2110,7 +2182,8 @@ x86_decode_twobyte(
case 0x50 ... 0x77:
case 0x79 ... 0x7f:
case 0xae:
- case 0xc2 ... 0xc6:
+ case 0xc2 ... 0xc3:
+ case 0xc5 ... 0xc6:
case 0xd0 ... 0xfe:
ctxt->opcode |= MASK_INSR(vex.pfx, X86EMUL_OPC_PFX_MASK);
break;
@@ -2137,8 +2210,23 @@ x86_decode_twobyte(
case 0xbd: bsr / lzcnt
* They're being dealt with in the execution phase (if at all).
*/
+
+ case 0xc4: /* pinsrw */
+ ctxt->opcode |= MASK_INSR(vex.pfx, X86EMUL_OPC_PFX_MASK);
+ /* fall through */
+ case X86EMUL_OPC_VEX_66(0, 0xc4): /* vpinsrw */
+ state->desc = DstReg | SrcMem16 | ModRM;
+ break;
}
+ /*
+ * Register forms of most VEX-encoded TwoOp instructions have
+ * three operands.
+ */
+ if ( state->simd_size && ea.type != OP_MEM &&
+ vex.opcx && (vex.pfx & VEX_PREFIX_SCALAR_MASK) )
+ state->desc &= ~TwoOp;
+
done:
return rc;
}
@@ -2270,12 +2358,13 @@ x86_decode(
{
/* Two-byte opcode. */
b = insn_fetch_type(uint8_t);
- d = twobyte_table[b];
+ d = twobyte_table[b].desc;
switch ( b )
{
default:
opcode = b | MASK_INSR(0x0f, X86EMUL_OPC_EXT_MASK);
ext = ext_0f;
+ state->simd_size = twobyte_table[b].size;
break;
case 0x38:
b = insn_fetch_type(uint8_t);
@@ -2381,15 +2470,16 @@ x86_decode(
{
case vex_0f:
opcode |= MASK_INSR(0x0f, X86EMUL_OPC_EXT_MASK);
- d = twobyte_table[b];
+ d = twobyte_table[b].desc;
+ state->simd_size = twobyte_table[b].size;
break;
case vex_0f38:
opcode |= MASK_INSR(0x0f38, X86EMUL_OPC_EXT_MASK);
- d = twobyte_table[0x38];
+ d = twobyte_table[0x38].desc;
break;
case vex_0f3a:
opcode |= MASK_INSR(0x0f3a, X86EMUL_OPC_EXT_MASK);
- d = twobyte_table[0x3a];
+ d = twobyte_table[0x3a].desc;
break;
default:
rc = X86EMUL_UNHANDLEABLE;
@@ -2639,13 +2729,53 @@ x86_decode(
ea.mem.off = truncate_ea(ea.mem.off);
}
- /*
- * When prefix 66 has a meaning different from operand-size override,
- * operand size defaults to 4 and can't be overridden to 2.
- */
- if ( op_bytes == 2 &&
- (ctxt->opcode & X86EMUL_OPC_PFX_MASK) == X86EMUL_OPC_66(0, 0) )
- op_bytes = 4;
+ switch ( state->simd_size )
+ {
+ case simd_none:
+ /*
+ * When prefix 66 has a meaning different from operand-size override,
+ * operand size defaults to 4 and can't be overridden to 2.
+ */
+ if ( op_bytes == 2 &&
+ (ctxt->opcode & X86EMUL_OPC_PFX_MASK) == X86EMUL_OPC_66(0, 0) )
+ op_bytes = 4;
+ break;
+
+ case simd_packed_int:
+ switch ( vex.pfx )
+ {
+ case vex_none: op_bytes = 8; break;
+ case vex_66: op_bytes = 16 << vex.l; break;
+ default: op_bytes = 0; break;
+ }
+ break;
+
+ case simd_single_fp:
+ if ( vex.pfx & VEX_PREFIX_DOUBLE_MASK )
+ {
+ op_bytes = 0;
+ break;
+ case simd_packed_fp:
+ if ( vex.pfx & VEX_PREFIX_SCALAR_MASK )
+ {
+ op_bytes = 0;
+ break;
+ }
+ }
+ /* fall through */
+ case simd_any_fp:
+ switch ( vex.pfx )
+ {
+ default: op_bytes = 16 << vex.l; break;
+ case vex_f3: op_bytes = 4; break;
+ case vex_f2: op_bytes = 8; break;
+ }
+ break;
+
+ default:
+ op_bytes = 0;
+ break;
+ }
done:
return rc;
@@ -2669,8 +2799,10 @@ x86_emulate(
int rc;
uint8_t b, d;
bool singlestep = (_regs._eflags & EFLG_TF) && !is_branch_step(ctxt, ops);
+ bool sfence = false;
struct operand src = { .reg = PTR_POISON };
struct operand dst = { .reg = PTR_POISON };
+ unsigned long cr4;
enum x86_swint_type swint_type;
struct fpu_insn_ctxt fic;
struct x86_emulate_stub stub = {};
@@ -2737,6 +2869,8 @@ x86_emulate(
ea.bytes = 2;
goto srcmem_common;
case SrcMem:
+ if ( state->simd_size )
+ break;
ea.bytes = (d & ByteOp) ? 1 : op_bytes;
srcmem_common:
src = ea;
@@ -2837,6 +2971,11 @@ x86_emulate(
d = (d & ~DstMask) | DstMem;
/* Becomes a normal DstMem operation from here on. */
case DstMem:
+ if ( state->simd_size )
+ {
+ generate_exception_if(lock_prefix, EXC_UD);
+ break;
+ }
ea.bytes = (d & ByteOp) ? 1 : op_bytes;
dst = ea;
if ( dst.type == OP_REG )
@@ -2871,7 +3010,6 @@ x86_emulate(
{
enum x86_segment seg;
struct segment_register cs, sreg;
- unsigned long cr4;
struct cpuid_leaf cpuid_leaf;
case 0x00 ... 0x05: add: /* add */
@@ -5066,116 +5204,109 @@ x86_emulate(
case X86EMUL_OPC(0x0f, 0x19) ... X86EMUL_OPC(0x0f, 0x1f): /* nop */
break;
- case X86EMUL_OPC(0x0f, 0x2b): /* movntps xmm,m128 */
- case X86EMUL_OPC_VEX(0x0f, 0x2b): /* vmovntps xmm,m128 */
- /* vmovntps ymm,m256 */
- case X86EMUL_OPC_66(0x0f, 0x2b): /* movntpd xmm,m128 */
- case X86EMUL_OPC_VEX_66(0x0f, 0x2b): /* vmovntpd xmm,m128 */
- /* vmovntpd ymm,m256 */
- fail_if(ea.type != OP_MEM);
- /* fall through */
- case X86EMUL_OPC(0x0f, 0x28): /* movaps xmm/m128,xmm */
- case X86EMUL_OPC_VEX(0x0f, 0x28): /* vmovaps xmm/m128,xmm */
- /* vmovaps ymm/m256,ymm */
- case X86EMUL_OPC_66(0x0f, 0x28): /* movapd xmm/m128,xmm */
- case X86EMUL_OPC_VEX_66(0x0f, 0x28): /* vmovapd xmm/m128,xmm */
- /* vmovapd ymm/m256,ymm */
- case X86EMUL_OPC(0x0f, 0x29): /* movaps xmm,xmm/m128 */
- case X86EMUL_OPC_VEX(0x0f, 0x29): /* vmovaps xmm,xmm/m128 */
- /* vmovaps ymm,ymm/m256 */
- case X86EMUL_OPC_66(0x0f, 0x29): /* movapd xmm,xmm/m128 */
- case X86EMUL_OPC_VEX_66(0x0f, 0x29): /* vmovapd xmm,xmm/m128 */
- /* vmovapd ymm,ymm/m256 */
- case X86EMUL_OPC(0x0f, 0x10): /* movups xmm/m128,xmm */
- case X86EMUL_OPC_VEX(0x0f, 0x10): /* vmovups xmm/m128,xmm */
- /* vmovups ymm/m256,ymm */
- case X86EMUL_OPC_66(0x0f, 0x10): /* movupd xmm/m128,xmm */
- case X86EMUL_OPC_VEX_66(0x0f, 0x10): /* vmovupd xmm/m128,xmm */
- /* vmovupd ymm/m256,ymm */
- case X86EMUL_OPC_F3(0x0f, 0x10): /* movss xmm/m32,xmm */
- case X86EMUL_OPC_VEX_F3(0x0f, 0x10): /* vmovss xmm/m32,xmm */
- case X86EMUL_OPC_F2(0x0f, 0x10): /* movsd xmm/m64,xmm */
- case X86EMUL_OPC_VEX_F2(0x0f, 0x10): /* vmovsd xmm/m64,xmm */
- case X86EMUL_OPC(0x0f, 0x11): /* movups xmm,xmm/m128 */
- case X86EMUL_OPC_VEX(0x0f, 0x11): /* vmovups xmm,xmm/m128 */
- /* vmovups ymm,ymm/m256 */
- case X86EMUL_OPC_66(0x0f, 0x11): /* movupd xmm,xmm/m128 */
- case X86EMUL_OPC_VEX_66(0x0f, 0x11): /* vmovupd xmm,xmm/m128 */
- /* vmovupd ymm,ymm/m256 */
- case X86EMUL_OPC_F3(0x0f, 0x11): /* movss xmm,xmm/m32 */
- case X86EMUL_OPC_VEX_F3(0x0f, 0x11): /* vmovss xmm,xmm/m32 */
- case X86EMUL_OPC_F2(0x0f, 0x11): /* movsd xmm,xmm/m64 */
- case X86EMUL_OPC_VEX_F2(0x0f, 0x11): /* vmovsd xmm,xmm/m64 */
- {
- uint8_t *buf = get_stub(stub);
+#define CASE_SIMD_PACKED_INT(pfx, opc) \
+ case X86EMUL_OPC(pfx, opc): \
+ case X86EMUL_OPC_66(pfx, opc)
+#define CASE_SIMD_SINGLE_FP(kind, pfx, opc) \
+ case X86EMUL_OPC##kind(pfx, opc): \
+ case X86EMUL_OPC##kind##_F3(pfx, opc)
+#define CASE_SIMD_DOUBLE_FP(kind, pfx, opc) \
+ case X86EMUL_OPC##kind##_66(pfx, opc): \
+ case X86EMUL_OPC##kind##_F2(pfx, opc)
+#define CASE_SIMD_ALL_FP(kind, pfx, opc) \
+ CASE_SIMD_SINGLE_FP(kind, pfx, opc): \
+ CASE_SIMD_DOUBLE_FP(kind, pfx, opc)
+#define CASE_SIMD_PACKED_FP(kind, pfx, opc) \
+ case X86EMUL_OPC##kind(pfx, opc): \
+ case X86EMUL_OPC##kind##_66(pfx, opc)
+#define CASE_SIMD_SCALAR_FP(kind, pfx, opc) \
+ case X86EMUL_OPC##kind##_F3(pfx, opc): \
+ case X86EMUL_OPC##kind##_F2(pfx, opc)
- fic.insn_bytes = 5;
- buf[0] = 0x3e;
- buf[1] = 0x3e;
- buf[2] = 0x0f;
- buf[3] = b;
- buf[4] = modrm;
- buf[5] = 0xc3;
+ CASE_SIMD_SCALAR_FP(, 0x0f, 0x2b): /* movnts{s,d} xmm,mem */
+ host_and_vcpu_must_have(sse4a);
+ /* fall through */
+ CASE_SIMD_PACKED_FP(, 0x0f, 0x2b): /* movntp{s,d} xmm,m128 */
+ CASE_SIMD_PACKED_FP(_VEX, 0x0f, 0x2b): /* vmovntp{s,d} {x,y}mm,mem */
+ generate_exception_if(ea.type != OP_MEM, EXC_UD);
+ sfence = true;
+ /* fall through */
+ CASE_SIMD_ALL_FP(, 0x0f, 0x10): /* mov{up,s}{s,d} xmm/mem,xmm */
+ CASE_SIMD_ALL_FP(, 0x0f, 0x11): /* mov{up,s}{s,d} xmm,xmm/mem */
+ CASE_SIMD_PACKED_FP(_VEX, 0x0f, 0x10): /* vmovup{s,d} {x,y}mm/mem,{x,y}mm */
+ CASE_SIMD_SCALAR_FP(_VEX, 0x0f, 0x10): /* vmovs{s,d} mem,xmm */
+ /* vmovs{s,d} xmm,xmm,xmm */
+ CASE_SIMD_PACKED_FP(_VEX, 0x0f, 0x11): /* vmovup{s,d} {x,y}mm,{x,y}mm/mem */
+ CASE_SIMD_SCALAR_FP(_VEX, 0x0f, 0x11): /* vmovs{s,d} xmm,mem */
+ /* vmovs{s,d} xmm,xmm,xmm */
+ CASE_SIMD_PACKED_FP(, 0x0f, 0x14): /* unpcklp{s,d} xmm/m128,xmm */
+ CASE_SIMD_PACKED_FP(_VEX, 0x0f, 0x14): /* vunpcklp{s,d} {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ CASE_SIMD_PACKED_FP(, 0x0f, 0x15): /* unpckhp{s,d} xmm/m128,xmm */
+ CASE_SIMD_PACKED_FP(_VEX, 0x0f, 0x15): /* vunpckhp{s,d} {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ CASE_SIMD_PACKED_FP(, 0x0f, 0x28): /* movap{s,d} xmm/m128,xmm */
+ CASE_SIMD_PACKED_FP(_VEX, 0x0f, 0x28): /* vmovap{s,d} {x,y}mm/mem,{x,y}mm */
+ CASE_SIMD_PACKED_FP(, 0x0f, 0x29): /* movap{s,d} xmm,xmm/m128 */
+ CASE_SIMD_PACKED_FP(_VEX, 0x0f, 0x29): /* vmovap{s,d} {x,y}mm,{x,y}mm/mem */
+ CASE_SIMD_ALL_FP(, 0x0f, 0x51): /* sqrt{p,s}{s,d} xmm/mem,xmm */
+ CASE_SIMD_ALL_FP(_VEX, 0x0f, 0x51): /* vsqrtp{s,d} {x,y}mm/mem,{x,y}mm */
+ /* vsqrts{s,d} xmm,xmm,xmm */
+ CASE_SIMD_SINGLE_FP(, 0x0f, 0x52): /* rsqrt{p,s}s xmm/mem,xmm */
+ CASE_SIMD_SINGLE_FP(_VEX, 0x0f, 0x52): /* vrsqrtps {x,y}mm/mem,{x,y}mm */
+ /* vrsqrtss xmm,xmm,xmm */
+ CASE_SIMD_SINGLE_FP(, 0x0f, 0x53): /* rcp{p,s}s xmm/mem,xmm */
+ CASE_SIMD_SINGLE_FP(_VEX, 0x0f, 0x53): /* vrcpps {x,y}mm/mem,{x,y}mm */
+ /* vrcpss xmm,xmm,xmm */
+ CASE_SIMD_PACKED_FP(, 0x0f, 0x54): /* andp{s,d} xmm/m128,xmm */
+ CASE_SIMD_PACKED_FP(_VEX, 0x0f, 0x54): /* vandp{s,d} {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ CASE_SIMD_PACKED_FP(, 0x0f, 0x55): /* andnp{s,d} xmm/m128,xmm */
+ CASE_SIMD_PACKED_FP(_VEX, 0x0f, 0x55): /* vandnp{s,d} {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ CASE_SIMD_PACKED_FP(, 0x0f, 0x56): /* orp{s,d} xmm/m128,xmm */
+ CASE_SIMD_PACKED_FP(_VEX, 0x0f, 0x56): /* vorp{s,d} {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ CASE_SIMD_PACKED_FP(, 0x0f, 0x57): /* xorp{s,d} xmm/m128,xmm */
+ CASE_SIMD_PACKED_FP(_VEX, 0x0f, 0x57): /* vxorp{s,d} {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ CASE_SIMD_ALL_FP(, 0x0f, 0x58): /* add{p,s}{s,d} xmm/mem,xmm */
+ CASE_SIMD_ALL_FP(_VEX, 0x0f, 0x58): /* vadd{p,s}{s,d} {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ CASE_SIMD_ALL_FP(, 0x0f, 0x59): /* mul{p,s}{s,d} xmm/mem,xmm */
+ CASE_SIMD_ALL_FP(_VEX, 0x0f, 0x59): /* vmul{p,s}{s,d} {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ CASE_SIMD_ALL_FP(, 0x0f, 0x5c): /* sub{p,s}{s,d} xmm/mem,xmm */
+ CASE_SIMD_ALL_FP(_VEX, 0x0f, 0x5c): /* vsub{p,s}{s,d} {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ CASE_SIMD_ALL_FP(, 0x0f, 0x5d): /* min{p,s}{s,d} xmm/mem,xmm */
+ CASE_SIMD_ALL_FP(_VEX, 0x0f, 0x5d): /* vmin{p,s}{s,d} {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ CASE_SIMD_ALL_FP(, 0x0f, 0x5e): /* div{p,s}{s,d} xmm/mem,xmm */
+ CASE_SIMD_ALL_FP(_VEX, 0x0f, 0x5e): /* vdiv{p,s}{s,d} {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ CASE_SIMD_ALL_FP(, 0x0f, 0x5f): /* max{p,s}{s,d} xmm/mem,xmm */
+ CASE_SIMD_ALL_FP(_VEX, 0x0f, 0x5f): /* vmax{p,s}{s,d} {x,y}mm/mem,{x,y}mm,{x,y}mm */
if ( vex.opcx == vex_none )
{
if ( vex.pfx & VEX_PREFIX_DOUBLE_MASK )
vcpu_must_have(sse2);
else
vcpu_must_have(sse);
- ea.bytes = 16;
- SET_SSE_PREFIX(buf[0], vex.pfx);
get_fpu(X86EMUL_FPU_xmm, &fic);
}
else
{
- fail_if((vex.reg != 0xf) &&
- ((ea.type == OP_MEM) ||
- !(vex.pfx & VEX_PREFIX_SCALAR_MASK)));
host_and_vcpu_must_have(avx);
+ fail_if((vex.pfx & VEX_PREFIX_SCALAR_MASK) && vex.l);
get_fpu(X86EMUL_FPU_ymm, &fic);
- ea.bytes = 16 << vex.l;
}
- if ( vex.pfx & VEX_PREFIX_SCALAR_MASK )
- ea.bytes = vex.pfx & VEX_PREFIX_DOUBLE_MASK ? 8 : 4;
+ simd_0f_common:
+ {
+ uint8_t *buf = get_stub(stub);
+
+ buf[0] = 0x3e;
+ buf[1] = 0x3e;
+ buf[2] = 0x0f;
+ buf[3] = b;
+ buf[4] = modrm;
if ( ea.type == OP_MEM )
{
- uint32_t mxcsr = 0;
-
- if ( b < 0x28 )
- mxcsr = MXCSR_MM;
- else if ( vcpu_has_misalignsse() )
- asm ( "stmxcsr %0" : "=m" (mxcsr) );
- generate_exception_if(!(mxcsr & MXCSR_MM) &&
- !is_aligned(ea.mem.seg, ea.mem.off, ea.bytes,
- ctxt, ops),
- EXC_GP, 0);
- if ( !(b & 1) )
- rc = ops->read(ea.mem.seg, ea.mem.off+0, mmvalp,
- ea.bytes, ctxt);
- else
- fail_if(!ops->write); /* Check before running the stub. */
/* convert memory operand to (%rAX) */
rex_prefix &= ~REX_B;
vex.b = 1;
buf[4] &= 0x38;
}
- if ( !rc )
- {
- copy_REX_VEX(buf, rex_prefix, vex);
- asm volatile ( "call *%0" : : "r" (stub.func), "a" (mmvalp)
- : "memory" );
- }
- put_fpu(&fic);
- put_stub(stub);
- if ( !rc && (b & 1) && (ea.type == OP_MEM) )
- {
- ASSERT(ops->write); /* See the fail_if() above. */
- rc = ops->write(ea.mem.seg, ea.mem.off, mmvalp,
- ea.bytes, ctxt);
- }
- if ( rc )
- goto done;
- dst.type = OP_NONE;
+ fic.insn_bytes = 5;
break;
}
@@ -5338,6 +5469,125 @@ x86_emulate(
break;
}
+ CASE_SIMD_PACKED_INT(0x0f, 0x60): /* punpcklbw {,x}mm/mem,{,x}mm */
+ case X86EMUL_OPC_VEX_66(0x0f, 0x60): /* vpunpcklbw {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ CASE_SIMD_PACKED_INT(0x0f, 0x61): /* punpcklwd {,x}mm/mem,{,x}mm */
+ case X86EMUL_OPC_VEX_66(0x0f, 0x61): /* vpunpcklwd {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ CASE_SIMD_PACKED_INT(0x0f, 0x62): /* punpckldq {,x}mm/mem,{,x}mm */
+ case X86EMUL_OPC_VEX_66(0x0f, 0x62): /* vpunpckldq {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ CASE_SIMD_PACKED_INT(0x0f, 0x68): /* punpckhbw {,x}mm/mem,{,x}mm */
+ case X86EMUL_OPC_VEX_66(0x0f, 0x68): /* vpunpckhbw {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ CASE_SIMD_PACKED_INT(0x0f, 0x69): /* punpckhwd {,x}mm/mem,{,x}mm */
+ case X86EMUL_OPC_VEX_66(0x0f, 0x69): /* vpunpckhwd {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ CASE_SIMD_PACKED_INT(0x0f, 0x6a): /* punpckhdq {,x}mm/mem,{,x}mm */
+ case X86EMUL_OPC_VEX_66(0x0f, 0x6a): /* vpunpckhdq {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ op_bytes = vex.pfx ? 16 << vex.l : 4;
+ /* fall through */
+ CASE_SIMD_PACKED_INT(0x0f, 0x63): /* packsswb {,x}mm/mem,{,x}mm */
+ case X86EMUL_OPC_VEX_66(0x0f, 0x63): /* vpacksswb {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ CASE_SIMD_PACKED_INT(0x0f, 0x64): /* pcmpgtb {,x}mm/mem,{,x}mm */
+ case X86EMUL_OPC_VEX_66(0x0f, 0x64): /* vpcmpgtb {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ CASE_SIMD_PACKED_INT(0x0f, 0x65): /* pcmpgtw {,x}mm/mem,{,x}mm */
+ case X86EMUL_OPC_VEX_66(0x0f, 0x65): /* vpcmpgtw {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ CASE_SIMD_PACKED_INT(0x0f, 0x66): /* pcmpgtd {,x}mm/mem,{,x}mm */
+ case X86EMUL_OPC_VEX_66(0x0f, 0x66): /* vpcmpgtd {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ CASE_SIMD_PACKED_INT(0x0f, 0x67): /* packuswb {,x}mm/mem,{,x}mm */
+ case X86EMUL_OPC_VEX_66(0x0f, 0x67): /* vpackuswb {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ CASE_SIMD_PACKED_INT(0x0f, 0x6b): /* packssdw {,x}mm/mem,{,x}mm */
+ case X86EMUL_OPC_VEX_66(0x0f, 0x6b): /* vpackssdw {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ case X86EMUL_OPC_66(0x0f, 0x6c): /* punpcklqdq xmm/m128,xmm */
+ case X86EMUL_OPC_VEX_66(0x0f, 0x6c): /* vpunpcklqdq {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ case X86EMUL_OPC_66(0x0f, 0x6d): /* punpckhqdq xmm/m128,xmm */
+ case X86EMUL_OPC_VEX_66(0x0f, 0x6d): /* vpunpckhqdq {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ CASE_SIMD_PACKED_INT(0x0f, 0x74): /* pcmpeqb {,x}mm/mem,{,x}mm */
+ case X86EMUL_OPC_VEX_66(0x0f, 0x74): /* vpcmpeqb {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ CASE_SIMD_PACKED_INT(0x0f, 0x75): /* pcmpeqw {,x}mm/mem,{,x}mm */
+ case X86EMUL_OPC_VEX_66(0x0f, 0x75): /* vpcmpeqw {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ CASE_SIMD_PACKED_INT(0x0f, 0x76): /* pcmpeqd {,x}mm/mem,{,x}mm */
+ case X86EMUL_OPC_VEX_66(0x0f, 0x76): /* vpcmpeqd {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ case X86EMUL_OPC_66(0x0f, 0xd4): /* paddq xmm/m128,xmm */
+ case X86EMUL_OPC_VEX_66(0x0f, 0xd4): /* vpaddq {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ CASE_SIMD_PACKED_INT(0x0f, 0xd5): /* pmullw {,x}mm/mem,{,x}mm */
+ case X86EMUL_OPC_VEX_66(0x0f, 0xd5): /* vpmullw {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ CASE_SIMD_PACKED_INT(0x0f, 0xd8): /* psubusb {,x}mm/mem,{,x}mm */
+ case X86EMUL_OPC_VEX_66(0x0f, 0xd8): /* vpsubusb {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ CASE_SIMD_PACKED_INT(0x0f, 0xd9): /* psubusw {,x}mm/mem,{,x}mm */
+ case X86EMUL_OPC_VEX_66(0x0f, 0xd9): /* vpsubusw {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ case X86EMUL_OPC_66(0x0f, 0xda): /* pminub xmm/m128,xmm */
+ case X86EMUL_OPC_VEX_66(0x0f, 0xda): /* vpminub {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ CASE_SIMD_PACKED_INT(0x0f, 0xdb): /* pand {,x}mm/mem,{,x}mm */
+ case X86EMUL_OPC_VEX_66(0x0f, 0xdb): /* vpand {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ CASE_SIMD_PACKED_INT(0x0f, 0xdc): /* paddusb {,x}mm/mem,{,x}mm */
+ case X86EMUL_OPC_VEX_66(0x0f, 0xdc): /* vpaddusb {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ CASE_SIMD_PACKED_INT(0x0f, 0xdd): /* paddusw {,x}mm/mem,{,x}mm */
+ case X86EMUL_OPC_VEX_66(0x0f, 0xdd): /* vpaddusw {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ case X86EMUL_OPC_66(0x0f, 0xde): /* pmaxub xmm/m128,xmm */
+ case X86EMUL_OPC_VEX_66(0x0f, 0xde): /* vpmaxub {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ CASE_SIMD_PACKED_INT(0x0f, 0xdf): /* pandn {,x}mm/mem,{,x}mm */
+ case X86EMUL_OPC_VEX_66(0x0f, 0xdf): /* vpandn {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ case X86EMUL_OPC_66(0x0f, 0xe0): /* pavgb xmm/m128,xmm */
+ case X86EMUL_OPC_VEX_66(0x0f, 0xe0): /* vpavgb {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ case X86EMUL_OPC_66(0x0f, 0xe3): /* pavgw xmm/m128,xmm */
+ case X86EMUL_OPC_VEX_66(0x0f, 0xe3): /* vpavgw {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ case X86EMUL_OPC_66(0x0f, 0xe4): /* pmulhuw xmm/m128,xmm */
+ case X86EMUL_OPC_VEX_66(0x0f, 0xe4): /* vpmulhuw {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ CASE_SIMD_PACKED_INT(0x0f, 0xe5): /* pmulhw {,x}mm/mem,{,x}mm */
+ case X86EMUL_OPC_VEX_66(0x0f, 0xe5): /* vpmulhw {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ CASE_SIMD_PACKED_INT(0x0f, 0xe8): /* psubsb {,x}mm/mem,{,x}mm */
+ case X86EMUL_OPC_VEX_66(0x0f, 0xe8): /* vpsubsb {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ CASE_SIMD_PACKED_INT(0x0f, 0xe9): /* psubsw {,x}mm/mem,{,x}mm */
+ case X86EMUL_OPC_VEX_66(0x0f, 0xe9): /* vpsubsw {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ case X86EMUL_OPC_66(0x0f, 0xea): /* pminsw xmm/m128,xmm */
+ case X86EMUL_OPC_VEX_66(0x0f, 0xea): /* vpminsw {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ CASE_SIMD_PACKED_INT(0x0f, 0xeb): /* por {,x}mm/mem,{,x}mm */
+ case X86EMUL_OPC_VEX_66(0x0f, 0xeb): /* vpor {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ CASE_SIMD_PACKED_INT(0x0f, 0xec): /* paddsb {,x}mm/mem,{,x}mm */
+ case X86EMUL_OPC_VEX_66(0x0f, 0xec): /* vpaddsb {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ CASE_SIMD_PACKED_INT(0x0f, 0xed): /* paddsw {,x}mm/mem,{,x}mm */
+ case X86EMUL_OPC_VEX_66(0x0f, 0xed): /* vpaddsw {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ case X86EMUL_OPC_66(0x0f, 0xee): /* pmaxsw xmm/m128,xmm */
+ case X86EMUL_OPC_VEX_66(0x0f, 0xee): /* vpmaxsw {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ CASE_SIMD_PACKED_INT(0x0f, 0xef): /* pxor {,x}mm/mem,{,x}mm */
+ case X86EMUL_OPC_VEX_66(0x0f, 0xef): /* vpxor {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ case X86EMUL_OPC_66(0x0f, 0xf4): /* pmuludq xmm/m128,xmm */
+ case X86EMUL_OPC_VEX_66(0x0f, 0xf4): /* vpmuludq {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ case X86EMUL_OPC_66(0x0f, 0xf6): /* psadbw xmm/m128,xmm */
+ case X86EMUL_OPC_VEX_66(0x0f, 0xf6): /* vpsadbw {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ CASE_SIMD_PACKED_INT(0x0f, 0xf8): /* psubb {,x}mm/mem,{,x}mm */
+ case X86EMUL_OPC_VEX_66(0x0f, 0xf8): /* vpsubb {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ CASE_SIMD_PACKED_INT(0x0f, 0xf9): /* psubw {,x}mm/mem,{,x}mm */
+ case X86EMUL_OPC_VEX_66(0x0f, 0xf9): /* vpsubw {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ CASE_SIMD_PACKED_INT(0x0f, 0xfa): /* psubd {,x}mm/mem,{,x}mm */
+ case X86EMUL_OPC_VEX_66(0x0f, 0xfa): /* vpsubd {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ case X86EMUL_OPC_66(0x0f, 0xfb): /* psubq xmm/m128,xmm */
+ case X86EMUL_OPC_VEX_66(0x0f, 0xfb): /* vpsubq {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ CASE_SIMD_PACKED_INT(0x0f, 0xfc): /* paddb {,x}mm/mem,{,x}mm */
+ case X86EMUL_OPC_VEX_66(0x0f, 0xfc): /* vpaddb {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ CASE_SIMD_PACKED_INT(0x0f, 0xfd): /* paddw {,x}mm/mem,{,x}mm */
+ case X86EMUL_OPC_VEX_66(0x0f, 0xfd): /* vpaddw {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ CASE_SIMD_PACKED_INT(0x0f, 0xfe): /* paddd {,x}mm/mem,{,x}mm */
+ case X86EMUL_OPC_VEX_66(0x0f, 0xfe): /* vpaddd {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ simd_0f_int:
+ if ( vex.opcx != vex_none )
+ {
+ if ( vex.l )
+ host_and_vcpu_must_have(avx2);
+ else
+ host_and_vcpu_must_have(avx);
+ get_fpu(X86EMUL_FPU_ymm, &fic);
+ }
+ else if ( vex.pfx )
+ {
+ vcpu_must_have(sse2);
+ get_fpu(X86EMUL_FPU_xmm, &fic);
+ }
+ else
+ {
+ host_and_vcpu_must_have(mmx);
+ get_fpu(X86EMUL_FPU_mmx, &fic);
+ }
+ goto simd_0f_common;
+
case X86EMUL_OPC(0x0f, 0xe7): /* movntq mm,m64 */
case X86EMUL_OPC_66(0x0f, 0xe7): /* movntdq xmm,m128 */
case X86EMUL_OPC_VEX_66(0x0f, 0xe7): /* vmovntdq xmm,m128 */
@@ -5467,6 +5717,82 @@ x86_emulate(
break;
}
+ CASE_SIMD_PACKED_INT(0x0f, 0x70): /* pshuf{w,d} $imm8,{,x}mm/mem,{,x}mm */
+ case X86EMUL_OPC_VEX_66(0x0f, 0x70): /* vpshufd $imm8,{x,y}mm/mem,{x,y}mm */
+ case X86EMUL_OPC_VEX_F3(0x0f, 0x70): /* vpshufhw $imm8,{x,y}mm/mem,{x,y}mm */
+ case X86EMUL_OPC_VEX_F2(0x0f, 0x70): /* vpshuflw $imm8,{x,y}mm/mem,{x,y}mm */
+ d = (d & ~SrcMask) | SrcMem | TwoOp;
+ op_bytes = vex.pfx ? 16 << vex.l : 8;
+ simd_0f_int_imm8:
+ if ( vex.opcx != vex_none )
+ {
+ if ( vex.l )
+ host_and_vcpu_must_have(avx2);
+ else
+ host_and_vcpu_must_have(avx);
+ get_fpu(X86EMUL_FPU_ymm, &fic);
+ }
+ else if ( vex.pfx )
+ {
+ vcpu_must_have(sse2);
+ get_fpu(X86EMUL_FPU_xmm, &fic);
+ }
+ else
+ {
+ host_and_vcpu_must_have(mmx);
+ vcpu_must_have(sse);
+ get_fpu(X86EMUL_FPU_mmx, &fic);
+ }
+ simd_0f_imm8:
+ {
+ uint8_t *buf = get_stub(stub);
+
+ buf[0] = 0x3e;
+ buf[1] = 0x3e;
+ buf[2] = 0x0f;
+ buf[3] = b;
+ buf[4] = modrm;
+ if ( ea.type == OP_MEM )
+ {
+ /* Convert memory operand to (%rAX). */
+ rex_prefix &= ~REX_B;
+ vex.b = 1;
+ buf[4] &= 0x38;
+ }
+ buf[5] = imm1;
+ fic.insn_bytes = 6;
+ break;
+ }
+
+ case X86EMUL_OPC_F2(0x0f, 0xf0): /* lddqu m128,xmm */
+ case X86EMUL_OPC_VEX_F2(0x0f, 0xf0): /* vlddqu mem,{x,y}mm */
+ generate_exception_if(ea.type != OP_MEM, EXC_UD);
+ /* fall through */
+ case X86EMUL_OPC_66(0x0f, 0x7c): /* haddpd xmm/m128,xmm */
+ case X86EMUL_OPC_F2(0x0f, 0x7c): /* haddps xmm/m128,xmm */
+ case X86EMUL_OPC_VEX_66(0x0f, 0x7c): /* vhaddpd {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ case X86EMUL_OPC_VEX_F2(0x0f, 0x7c): /* vhaddps {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ case X86EMUL_OPC_66(0x0f, 0x7d): /* hsubpd xmm/m128,xmm */
+ case X86EMUL_OPC_F2(0x0f, 0x7d): /* hsubps xmm/m128,xmm */
+ case X86EMUL_OPC_VEX_66(0x0f, 0x7d): /* vhsubpd {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ case X86EMUL_OPC_VEX_F2(0x0f, 0x7d): /* vhsubps {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ case X86EMUL_OPC_66(0x0f, 0xd0): /* addsubpd xmm/m128,xmm */
+ case X86EMUL_OPC_F2(0x0f, 0xd0): /* addsubps xmm/m128,xmm */
+ case X86EMUL_OPC_VEX_66(0x0f, 0xd0): /* vaddsubpd {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ case X86EMUL_OPC_VEX_F2(0x0f, 0xd0): /* vaddsubps {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ op_bytes = 16 << vex.l;
+ if ( vex.opcx != vex_none )
+ {
+ host_and_vcpu_must_have(avx);
+ get_fpu(X86EMUL_FPU_ymm, &fic);
+ }
+ else
+ {
+ host_and_vcpu_must_have(sse3);
+ get_fpu(X86EMUL_FPU_xmm, &fic);
+ }
+ goto simd_0f_common;
+
case X86EMUL_OPC(0x0f, 0x80) ... X86EMUL_OPC(0x0f, 0x8f): /* jcc (near) */
if ( test_cc(b, _regs._eflags) )
jmp_rel((int32_t)src.val);
@@ -5767,12 +6093,41 @@ x86_emulate(
}
goto add;
+ CASE_SIMD_ALL_FP(, 0x0f, 0xc2): /* cmp{p,s}{s,d} $imm8,xmm/mem,xmm */
+ CASE_SIMD_ALL_FP(_VEX, 0x0f, 0xc2): /* vcmp{p,s}{s,d} $imm8,{x,y}mm/mem,{x,y}mm */
+ CASE_SIMD_PACKED_FP(, 0x0f, 0xc6): /* shufp{s,d} $imm8,xmm/mem,xmm */
+ CASE_SIMD_PACKED_FP(_VEX, 0x0f, 0xc6): /* vshufp{s,d} $imm8,{x,y}mm/mem,{x,y}mm */
+ d = (d & ~SrcMask) | SrcMem;
+ if ( vex.opcx == vex_none )
+ {
+ if ( vex.pfx & VEX_PREFIX_DOUBLE_MASK )
+ vcpu_must_have(sse2);
+ else
+ vcpu_must_have(sse);
+ get_fpu(X86EMUL_FPU_xmm, &fic);
+ }
+ else
+ {
+ host_and_vcpu_must_have(avx);
+ fail_if((vex.pfx & VEX_PREFIX_SCALAR_MASK) && vex.l);
+ get_fpu(X86EMUL_FPU_ymm, &fic);
+ }
+ goto simd_0f_imm8;
+
case X86EMUL_OPC(0x0f, 0xc3): /* movnti */
/* Ignore the non-temporal hint for now. */
vcpu_must_have(sse2);
dst.val = src.val;
+ sfence = true;
break;
+ CASE_SIMD_PACKED_INT(0x0f, 0xc4): /* pinsrw $imm8,r32/m16,{,x}mm */
+ case X86EMUL_OPC_VEX_66(0x0f, 0xc4): /* vpinsrw $imm8,r32/m16,xmm,xmm */
+ generate_exception_if(vex.l, EXC_UD);
+ memcpy(mmvalp, &src.val, 2);
+ ea.type = OP_MEM;
+ goto simd_0f_int_imm8;
+
case X86EMUL_OPC(0x0f, 0xc7): /* Grp9 */
{
union {
@@ -5953,6 +6308,46 @@ x86_emulate(
}
break;
+ CASE_SIMD_PACKED_INT(0x0f, 0xd1): /* psrlw {,x}mm/mem,{,x}mm */
+ case X86EMUL_OPC_VEX_66(0x0f, 0xd1): /* vpsrlw xmm/m128,{x,y}mm,{x,y}mm */
+ CASE_SIMD_PACKED_INT(0x0f, 0xd2): /* psrld {,x}mm/mem,{,x}mm */
+ case X86EMUL_OPC_VEX_66(0x0f, 0xd2): /* vpsrld xmm/m128,{x,y}mm,{x,y}mm */
+ CASE_SIMD_PACKED_INT(0x0f, 0xd3): /* psrlq {,x}mm/mem,{,x}mm */
+ case X86EMUL_OPC_VEX_66(0x0f, 0xd3): /* vpsrlq xmm/m128,{x,y}mm,{x,y}mm */
+ CASE_SIMD_PACKED_INT(0x0f, 0xe1): /* psraw {,x}mm/mem,{,x}mm */
+ case X86EMUL_OPC_VEX_66(0x0f, 0xe1): /* vpsraw xmm/m128,{x,y}mm,{x,y}mm */
+ CASE_SIMD_PACKED_INT(0x0f, 0xe2): /* psrad {,x}mm/mem,{,x}mm */
+ case X86EMUL_OPC_VEX_66(0x0f, 0xe2): /* vpsrad xmm/m128,{x,y}mm,{x,y}mm */
+ CASE_SIMD_PACKED_INT(0x0f, 0xf1): /* psllw {,x}mm/mem,{,x}mm */
+ case X86EMUL_OPC_VEX_66(0x0f, 0xf1): /* vpsllw xmm/m128,{x,y}mm,{x,y}mm */
+ CASE_SIMD_PACKED_INT(0x0f, 0xf2): /* pslld {,x}mm/mem,{,x}mm */
+ case X86EMUL_OPC_VEX_66(0x0f, 0xf2): /* vpslld xmm/m128,{x,y}mm,{x,y}mm */
+ CASE_SIMD_PACKED_INT(0x0f, 0xf3): /* psllq {,x}mm/mem,{,x}mm */
+ case X86EMUL_OPC_VEX_66(0x0f, 0xf3): /* vpsllq xmm/m128,{x,y}mm,{x,y}mm */
+ op_bytes = vex.pfx ? 16 : 8;
+ goto simd_0f_int;
+
+ case X86EMUL_OPC(0x0f, 0xd4): /* paddq mm/m64,mm */
+ case X86EMUL_OPC(0x0f, 0xf4): /* pmuludq mm/m64,mm */
+ case X86EMUL_OPC(0x0f, 0xfb): /* psubq mm/m64,mm */
+ host_and_vcpu_must_have(mmx);
+ vcpu_must_have(sse2);
+ get_fpu(X86EMUL_FPU_mmx, &fic);
+ goto simd_0f_common;
+
+ case X86EMUL_OPC(0x0f, 0xda): /* pminub mm/m64,mm */
+ case X86EMUL_OPC(0x0f, 0xde): /* pmaxub mm/m64,mm */
+ case X86EMUL_OPC(0x0f, 0xea): /* pminsw mm/m64,mm */
+ case X86EMUL_OPC(0x0f, 0xee): /* pmaxsw mm/m64,mm */
+ case X86EMUL_OPC(0x0f, 0xe0): /* pavgb mm/m64,mm */
+ case X86EMUL_OPC(0x0f, 0xe3): /* pavgw mm/m64,mm */
+ case X86EMUL_OPC(0x0f, 0xe4): /* pmulhuw mm/m64,mm */
+ case X86EMUL_OPC(0x0f, 0xf6): /* psadbw mm/m64,mm */
+ host_and_vcpu_must_have(mmx);
+ vcpu_must_have(sse);
+ get_fpu(X86EMUL_FPU_mmx, &fic);
+ goto simd_0f_common;
+
case X86EMUL_OPC(0x0f38, 0xf0): /* movbe m,r */
case X86EMUL_OPC(0x0f38, 0xf1): /* movbe r,m */
vcpu_must_have(movbe);
@@ -6214,6 +6609,75 @@ x86_emulate(
goto cannot_emulate;
}
+ if ( state->simd_size )
+ {
+#ifdef __XEN__
+ uint8_t *buf = stub.ptr;
+#else
+ uint8_t *buf = get_stub(stub);
+#endif
+
+ generate_exception_if(!op_bytes, EXC_UD);
+ generate_exception_if(vex.opcx && (d & TwoOp) && vex.reg != 0xf,
+ EXC_UD);
+
+ if ( !buf )
+ BUG();
+ if ( vex.opcx == vex_none )
+ SET_SSE_PREFIX(buf[0], vex.pfx);
+
+ buf[fic.insn_bytes] = 0xc3;
+ copy_REX_VEX(buf, rex_prefix, vex);
+
+ if ( ea.type == OP_MEM )
+ {
+ uint32_t mxcsr = 0;
+
+ if ( op_bytes < 16 ||
+ (vex.opcx
+ ? /* vmov{a,nt}p{s,d} are exceptions. */
+ ext != ext_0f || ((b | 1) != 0x29 && b != 0x2b)
+ : /* movup{s,d} and lddqu are exceptions. */
+ ext == ext_0f && ((b | 1) == 0x11 || b == 0xf0)) )
+ mxcsr = MXCSR_MM;
+ else if ( vcpu_has_misalignsse() )
+ asm ( "stmxcsr %0" : "=m" (mxcsr) );
+ generate_exception_if(!(mxcsr & MXCSR_MM) &&
+ !is_aligned(ea.mem.seg, ea.mem.off, op_bytes,
+ ctxt, ops),
+ EXC_GP, 0);
+ if ( (d & SrcMask) == SrcMem )
+ {
+ rc = ops->read(ea.mem.seg, ea.mem.off, mmvalp, op_bytes, ctxt);
+ if ( rc != X86EMUL_OKAY )
+ goto done;
+ dst.type = OP_NONE;
+ }
+ else if ( (d & DstMask) == DstMem )
+ {
+ fail_if(!ops->write); /* Check before running the stub. */
+ ASSERT(d & Mov);
+ dst.type = OP_MEM;
+ dst.bytes = op_bytes;
+ dst.mem = ea.mem;
+ }
+ else if ( (d & SrcMask) == SrcMem16 )
+ dst.type = OP_NONE;
+ else
+ {
+ ASSERT_UNREACHABLE();
+ return X86EMUL_UNHANDLEABLE;
+ }
+ }
+ else
+ dst.type = OP_NONE;
+
+ invoke_stub("", "", "+m" (*mmvalp) : "a" (mmvalp));
+
+ put_stub(stub);
+ put_fpu(&fic);
+ }
+
switch ( dst.type )
{
case OP_REG:
@@ -6240,8 +6704,11 @@ x86_emulate(
else
{
fail_if(!ops->write);
- rc = ops->write(
- dst.mem.seg, dst.mem.off, &dst.val, dst.bytes, ctxt);
+ rc = ops->write(dst.mem.seg, dst.mem.off,
+ !state->simd_size ? &dst.val : (void *)mmvalp,
+ dst.bytes, ctxt);
+ if ( sfence )
+ asm volatile ( "sfence" ::: "memory" );
}
if ( rc != 0 )
goto done;
@@ -6498,22 +6965,6 @@ x86_insn_is_mem_write(const struct x86_e
case 0x6c: case 0x6d: /* INS */
case 0xa4: case 0xa5: /* MOVS */
case 0xaa: case 0xab: /* STOS */
- case X86EMUL_OPC(0x0f, 0x11): /* MOVUPS */
- case X86EMUL_OPC_VEX(0x0f, 0x11): /* VMOVUPS */
- case X86EMUL_OPC_66(0x0f, 0x11): /* MOVUPD */
- case X86EMUL_OPC_VEX_66(0x0f, 0x11): /* VMOVUPD */
- case X86EMUL_OPC_F3(0x0f, 0x11): /* MOVSS */
- case X86EMUL_OPC_VEX_F3(0x0f, 0x11): /* VMOVSS */
- case X86EMUL_OPC_F2(0x0f, 0x11): /* MOVSD */
- case X86EMUL_OPC_VEX_F2(0x0f, 0x11): /* VMOVSD */
- case X86EMUL_OPC(0x0f, 0x29): /* MOVAPS */
- case X86EMUL_OPC_VEX(0x0f, 0x29): /* VMOVAPS */
- case X86EMUL_OPC_66(0x0f, 0x29): /* MOVAPD */
- case X86EMUL_OPC_VEX_66(0x0f, 0x29): /* VMOVAPD */
- case X86EMUL_OPC(0x0f, 0x2b): /* MOVNTPS */
- case X86EMUL_OPC_VEX(0x0f, 0x2b): /* VMOVNTPS */
- case X86EMUL_OPC_66(0x0f, 0x2b): /* MOVNTPD */
- case X86EMUL_OPC_VEX_66(0x0f, 0x2b): /* VMOVNTPD */
case X86EMUL_OPC(0x0f, 0x7e): /* MOVD/MOVQ */
case X86EMUL_OPC_66(0x0f, 0x7e): /* MOVD/MOVQ */
case X86EMUL_OPC_VEX_66(0x0f, 0x7e): /* VMOVD/VMOVQ */
--- a/xen/include/asm-x86/cpufeature.h
+++ b/xen/include/asm-x86/cpufeature.h
@@ -71,12 +71,14 @@
#define cpu_has_xsavec boot_cpu_has(X86_FEATURE_XSAVEC)
#define cpu_has_xgetbv1 boot_cpu_has(X86_FEATURE_XGETBV1)
#define cpu_has_xsaves boot_cpu_has(X86_FEATURE_XSAVES)
+#define cpu_has_avx2 boot_cpu_has(X86_FEATURE_AVX2)
#define cpu_has_monitor boot_cpu_has(X86_FEATURE_MONITOR)
#define cpu_has_eist boot_cpu_has(X86_FEATURE_EIST)
#define cpu_has_hypervisor boot_cpu_has(X86_FEATURE_HYPERVISOR)
#define cpu_has_rdrand boot_cpu_has(X86_FEATURE_RDRAND)
#define cpu_has_rdseed boot_cpu_has(X86_FEATURE_RDSEED)
#define cpu_has_cmp_legacy boot_cpu_has(X86_FEATURE_CMP_LEGACY)
+#define cpu_has_sse4a boot_cpu_has(X86_FEATURE_SSE4A)
#define cpu_has_tbm boot_cpu_has(X86_FEATURE_TBM)
enum _cache_type {
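A note on the recurring "buf[4] &= 0x38" in the stub setup above: it
rewrites the copied instruction's ModRM byte so that whatever memory
operand the guest used is replaced by (%rAX) (together with clearing
REX.B / setting VEX.b so the base really is %rax), letting the stub be
invoked with %rax pointing at the local mmvalp buffer. A minimal
standalone sketch (the helper name is made up for illustration):

#include <stdint.h>
#include <stdio.h>

/* Hypothetical helper, for illustration only (the patch open-codes this). */
static uint8_t modrm_to_rax(uint8_t modrm)
{
    /* ModRM = mod[7:6] reg[5:3] rm[2:0]; mod=00,rm=000 encodes (%rAX). */
    return modrm & 0x38;        /* keep reg field, force mod=00 rm=000 */
}

int main(void)
{
    /* 0x94: mod=10 reg=010 rm=100, i.e. a SIB+disp32 memory operand. */
    printf("%#04x -> %#04x\n", 0x94, modrm_to_rax(0x94)); /* -> 0x10 */
    return 0;
}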
+ [0xbe] = { ByteOp|DstReg|SrcMem|ModRM|Mov },
+ [0xbf] = { DstReg|SrcMem16|ModRM|Mov },
+ [0xc0] = { ByteOp|DstMem|SrcReg|ModRM },
+ [0xc1] = { DstMem|SrcReg|ModRM },
+ [0xc2] = { DstImplicit|SrcImmByte|ModRM, simd_any_fp },
+ [0xc3] = { DstMem|SrcReg|ModRM|Mov },
+ [0xc4] = { DstReg|SrcImmByte|ModRM, simd_packed_int },
+ [0xc5] = { SrcImmByte|ModRM },
+ [0xc6] = { DstImplicit|SrcImmByte|ModRM, simd_packed_fp },
+ [0xc7] = { ImplicitOps|ModRM },
+ [0xc8 ... 0xcf] = { ImplicitOps },
+ [0xd0] = { DstImplicit|SrcMem|ModRM, simd_other },
+ [0xd1 ... 0xd3] = { DstImplicit|SrcMem|ModRM, simd_other },
+ [0xd4 ... 0xd5] = { DstImplicit|SrcMem|ModRM, simd_packed_int },
+ [0xd6] = { ImplicitOps|ModRM },
+ [0xd7] = { ModRM },
+ [0xd8 ... 0xdf] = { DstImplicit|SrcMem|ModRM, simd_packed_int },
+ [0xe0] = { DstImplicit|SrcMem|ModRM, simd_packed_int },
+ [0xe1 ... 0xe2] = { DstImplicit|SrcMem|ModRM, simd_other },
+ [0xe3 ... 0xe5] = { DstImplicit|SrcMem|ModRM, simd_packed_int },
+ [0xe6] = { ModRM },
+ [0xe7] = { ImplicitOps|ModRM },
+ [0xe8 ... 0xef] = { DstImplicit|SrcMem|ModRM, simd_packed_int },
+ [0xf0] = { DstImplicit|SrcMem|ModRM|Mov, simd_other },
+ [0xf1 ... 0xf3] = { DstImplicit|SrcMem|ModRM, simd_other },
+ [0xf4 ... 0xf6] = { DstImplicit|SrcMem|ModRM, simd_packed_int },
+ [0xf7] = { ModRM },
+ [0xf8 ... 0xfe] = { DstImplicit|SrcMem|ModRM, simd_packed_int },
+ [0xff] = { ModRM }
};
static const opcode_desc_t xop_table[] = {
@@ -1372,10 +1441,12 @@ static bool vcpu_has(
#define vcpu_has_lahf_lm() vcpu_has(0x80000001, ECX, 0, ctxt, ops)
#define vcpu_has_cr8_legacy() vcpu_has(0x80000001, ECX, 4, ctxt, ops)
#define vcpu_has_lzcnt() vcpu_has(0x80000001, ECX, 5, ctxt, ops)
+#define vcpu_has_sse4a() vcpu_has(0x80000001, ECX, 6, ctxt, ops)
#define vcpu_has_misalignsse() vcpu_has(0x80000001, ECX, 7, ctxt, ops)
#define vcpu_has_tbm() vcpu_has(0x80000001, ECX, 21, ctxt, ops)
#define vcpu_has_bmi1() vcpu_has( 7, EBX, 3, ctxt, ops)
#define vcpu_has_hle() vcpu_has( 7, EBX, 4, ctxt, ops)
+#define vcpu_has_avx2() vcpu_has( 7, EBX, 5, ctxt, ops)
#define vcpu_has_bmi2() vcpu_has( 7, EBX, 8, ctxt, ops)
#define vcpu_has_rtm() vcpu_has( 7, EBX, 11, ctxt, ops)
#define vcpu_has_mpx() vcpu_has( 7, EBX, 14, ctxt, ops)
@@ -1975,6 +2046,7 @@ struct x86_emulate_state {
opcode_desc_t desc;
union vex vex;
union evex evex;
+ enum simd_opsize simd_size;
/*
* Data operand effective address (usually computed from ModRM).
@@ -2110,7 +2182,8 @@ x86_decode_twobyte(
case 0x50 ... 0x77:
case 0x79 ... 0x7f:
case 0xae:
- case 0xc2 ... 0xc6:
+ case 0xc2 ... 0xc3:
+ case 0xc5 ... 0xc6:
case 0xd0 ... 0xfe:
ctxt->opcode |= MASK_INSR(vex.pfx, X86EMUL_OPC_PFX_MASK);
break;
@@ -2137,8 +2210,23 @@ x86_decode_twobyte(
* case 0xbd: bsr / lzcnt
* They're being dealt with in the execution phase (if at all).
*/
+
+ case 0xc4: /* pinsrw */
+ ctxt->opcode |= MASK_INSR(vex.pfx, X86EMUL_OPC_PFX_MASK);
+ /* fall through */
+ case X86EMUL_OPC_VEX_66(0, 0xc4): /* vpinsrw */
+ state->desc = DstReg | SrcMem16 | ModRM;
+ break;
}
+ /*
+ * Register forms of most VEX-encoded TwoOp instructions have
+ * three operands.
+ */
+ if ( state->simd_size && ea.type != OP_MEM &&
+ vex.opcx && (vex.pfx & VEX_PREFIX_SCALAR_MASK) )
+ state->desc &= ~TwoOp;
+
done:
return rc;
}
@@ -2270,12 +2358,13 @@ x86_decode(
{
/* Two-byte opcode. */
b = insn_fetch_type(uint8_t);
- d = twobyte_table[b];
+ d = twobyte_table[b].desc;
switch ( b )
{
default:
opcode = b | MASK_INSR(0x0f, X86EMUL_OPC_EXT_MASK);
ext = ext_0f;
+ state->simd_size = twobyte_table[b].size;
break;
case 0x38:
b = insn_fetch_type(uint8_t);
@@ -2381,15 +2470,16 @@ x86_decode(
{
case vex_0f:
opcode |= MASK_INSR(0x0f, X86EMUL_OPC_EXT_MASK);
- d = twobyte_table[b];
+ d = twobyte_table[b].desc;
+ state->simd_size = twobyte_table[b].size;
break;
case vex_0f38:
opcode |= MASK_INSR(0x0f38, X86EMUL_OPC_EXT_MASK);
- d = twobyte_table[0x38];
+ d = twobyte_table[0x38].desc;
break;
case vex_0f3a:
opcode |= MASK_INSR(0x0f3a, X86EMUL_OPC_EXT_MASK);
- d = twobyte_table[0x3a];
+ d = twobyte_table[0x3a].desc;
break;
default:
rc = X86EMUL_UNHANDLEABLE;
@@ -2639,13 +2729,53 @@ x86_decode(
ea.mem.off = truncate_ea(ea.mem.off);
}
- /*
- * When prefix 66 has a meaning different from operand-size override,
- * operand size defaults to 4 and can't be overridden to 2.
- */
- if ( op_bytes == 2 &&
- (ctxt->opcode & X86EMUL_OPC_PFX_MASK) == X86EMUL_OPC_66(0, 0) )
- op_bytes = 4;
+ switch ( state->simd_size )
+ {
+ case simd_none:
+ /*
+ * When prefix 66 has a meaning different from operand-size override,
+ * operand size defaults to 4 and can't be overridden to 2.
+ */
+ if ( op_bytes == 2 &&
+ (ctxt->opcode & X86EMUL_OPC_PFX_MASK) == X86EMUL_OPC_66(0, 0) )
+ op_bytes = 4;
+ break;
+
+ case simd_packed_int:
+ switch ( vex.pfx )
+ {
+ case vex_none: op_bytes = 8; break;
+ case vex_66: op_bytes = 16 << vex.l; break;
+ default: op_bytes = 0; break;
+ }
+ break;
+
+ case simd_single_fp:
+ if ( vex.pfx & VEX_PREFIX_DOUBLE_MASK )
+ {
+ op_bytes = 0;
+ break;
+ case simd_packed_fp:
+ if ( vex.pfx & VEX_PREFIX_SCALAR_MASK )
+ {
+ op_bytes = 0;
+ break;
+ }
+ }
+ /* fall through */
+ case simd_any_fp:
+ switch ( vex.pfx )
+ {
+ default: op_bytes = 16 << vex.l; break;
+ case vex_f3: op_bytes = 4; break;
+ case vex_f2: op_bytes = 8; break;
+ }
+ break;
+
+ default:
+ op_bytes = 0;
+ break;
+ }
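+
+ /*
+ * Worked example (illustrative comment, not part of the original patch):
+ * 66 0f fe /r (paddd xmm/m128,xmm) is tagged simd_packed_int, so the
+ * vex_66 arm above yields op_bytes = 16 (32 for the VEX form with VEX.L
+ * set), while the unprefixed MMX form 0f fe /r gets op_bytes = 8.
+ */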
done:
return rc;
@@ -2669,8 +2799,10 @@ x86_emulate(
int rc;
uint8_t b, d;
bool singlestep = (_regs._eflags & EFLG_TF) && !is_branch_step(ctxt, ops);
+ bool sfence = false;
struct operand src = { .reg = PTR_POISON };
struct operand dst = { .reg = PTR_POISON };
+ unsigned long cr4;
enum x86_swint_type swint_type;
struct fpu_insn_ctxt fic;
struct x86_emulate_stub stub = {};
@@ -2737,6 +2869,8 @@ x86_emulate(
ea.bytes = 2;
goto srcmem_common;
case SrcMem:
+ if ( state->simd_size )
+ break;
ea.bytes = (d & ByteOp) ? 1 : op_bytes;
srcmem_common:
src = ea;
@@ -2837,6 +2971,11 @@ x86_emulate(
d = (d & ~DstMask) | DstMem;
/* Becomes a normal DstMem operation from here on. */
case DstMem:
+ if ( state->simd_size )
+ {
+ generate_exception_if(lock_prefix, EXC_UD);
+ break;
+ }
ea.bytes = (d & ByteOp) ? 1 : op_bytes;
dst = ea;
if ( dst.type == OP_REG )
@@ -2871,7 +3010,6 @@ x86_emulate(
{
enum x86_segment seg;
struct segment_register cs, sreg;
- unsigned long cr4;
struct cpuid_leaf cpuid_leaf;
case 0x00 ... 0x05: add: /* add */
@@ -5066,116 +5204,109 @@ x86_emulate(
case X86EMUL_OPC(0x0f, 0x19) ... X86EMUL_OPC(0x0f, 0x1f): /* nop */
break;
- case X86EMUL_OPC(0x0f, 0x2b): /* movntps xmm,m128 */
- case X86EMUL_OPC_VEX(0x0f, 0x2b): /* vmovntps xmm,m128 */
- /* vmovntps ymm,m256 */
- case X86EMUL_OPC_66(0x0f, 0x2b): /* movntpd xmm,m128 */
- case X86EMUL_OPC_VEX_66(0x0f, 0x2b): /* vmovntpd xmm,m128 */
- /* vmovntpd ymm,m256 */
- fail_if(ea.type != OP_MEM);
- /* fall through */
- case X86EMUL_OPC(0x0f, 0x28): /* movaps xmm/m128,xmm */
- case X86EMUL_OPC_VEX(0x0f, 0x28): /* vmovaps xmm/m128,xmm */
- /* vmovaps ymm/m256,ymm */
- case X86EMUL_OPC_66(0x0f, 0x28): /* movapd xmm/m128,xmm */
- case X86EMUL_OPC_VEX_66(0x0f, 0x28): /* vmovapd xmm/m128,xmm */
- /* vmovapd ymm/m256,ymm */
- case X86EMUL_OPC(0x0f, 0x29): /* movaps xmm,xmm/m128 */
- case X86EMUL_OPC_VEX(0x0f, 0x29): /* vmovaps xmm,xmm/m128 */
- /* vmovaps ymm,ymm/m256 */
- case X86EMUL_OPC_66(0x0f, 0x29): /* movapd xmm,xmm/m128 */
- case X86EMUL_OPC_VEX_66(0x0f, 0x29): /* vmovapd xmm,xmm/m128 */
- /* vmovapd ymm,ymm/m256 */
- case X86EMUL_OPC(0x0f, 0x10): /* movups xmm/m128,xmm */
- case X86EMUL_OPC_VEX(0x0f, 0x10): /* vmovups xmm/m128,xmm */
- /* vmovups ymm/m256,ymm */
- case X86EMUL_OPC_66(0x0f, 0x10): /* movupd xmm/m128,xmm */
- case X86EMUL_OPC_VEX_66(0x0f, 0x10): /* vmovupd xmm/m128,xmm */
- /* vmovupd ymm/m256,ymm */
- case X86EMUL_OPC_F3(0x0f, 0x10): /* movss xmm/m32,xmm */
- case X86EMUL_OPC_VEX_F3(0x0f, 0x10): /* vmovss xmm/m32,xmm */
- case X86EMUL_OPC_F2(0x0f, 0x10): /* movsd xmm/m64,xmm */
- case X86EMUL_OPC_VEX_F2(0x0f, 0x10): /* vmovsd xmm/m64,xmm */
- case X86EMUL_OPC(0x0f, 0x11): /* movups xmm,xmm/m128 */
- case X86EMUL_OPC_VEX(0x0f, 0x11): /* vmovups xmm,xmm/m128 */
- /* vmovups ymm,ymm/m256 */
- case X86EMUL_OPC_66(0x0f, 0x11): /* movupd xmm,xmm/m128 */
- case X86EMUL_OPC_VEX_66(0x0f, 0x11): /* vmovupd xmm,xmm/m128 */
- /* vmovupd ymm,ymm/m256 */
- case X86EMUL_OPC_F3(0x0f, 0x11): /* movss xmm,xmm/m32 */
- case X86EMUL_OPC_VEX_F3(0x0f, 0x11): /* vmovss xmm,xmm/m32 */
- case X86EMUL_OPC_F2(0x0f, 0x11): /* movsd xmm,xmm/m64 */
- case X86EMUL_OPC_VEX_F2(0x0f, 0x11): /* vmovsd xmm,xmm/m64 */
- {
- uint8_t *buf = get_stub(stub);
+#define CASE_SIMD_PACKED_INT(pfx, opc) \
+ case X86EMUL_OPC(pfx, opc): \
+ case X86EMUL_OPC_66(pfx, opc)
+#define CASE_SIMD_SINGLE_FP(kind, pfx, opc) \
+ case X86EMUL_OPC##kind(pfx, opc): \
+ case X86EMUL_OPC##kind##_F3(pfx, opc)
+#define CASE_SIMD_DOUBLE_FP(kind, pfx, opc) \
+ case X86EMUL_OPC##kind##_66(pfx, opc): \
+ case X86EMUL_OPC##kind##_F2(pfx, opc)
+#define CASE_SIMD_ALL_FP(kind, pfx, opc) \
+ CASE_SIMD_SINGLE_FP(kind, pfx, opc): \
+ CASE_SIMD_DOUBLE_FP(kind, pfx, opc)
+#define CASE_SIMD_PACKED_FP(kind, pfx, opc) \
+ case X86EMUL_OPC##kind(pfx, opc): \
+ case X86EMUL_OPC##kind##_66(pfx, opc)
+#define CASE_SIMD_SCALAR_FP(kind, pfx, opc) \
+ case X86EMUL_OPC##kind##_F3(pfx, opc): \
+ case X86EMUL_OPC##kind##_F2(pfx, opc)
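+
+ /*
+ * Example expansion (illustrative comment, not part of the original
+ * patch): CASE_SIMD_ALL_FP(_VEX, 0x0f, 0x58) becomes
+ * case X86EMUL_OPC_VEX(0x0f, 0x58):
+ * case X86EMUL_OPC_VEX_F3(0x0f, 0x58):
+ * case X86EMUL_OPC_VEX_66(0x0f, 0x58):
+ * case X86EMUL_OPC_VEX_F2(0x0f, 0x58)
+ * i.e. vaddps, vaddss, vaddpd, and vaddsd all share one case block.
+ */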
- fic.insn_bytes = 5;
- buf[0] = 0x3e;
- buf[1] = 0x3e;
- buf[2] = 0x0f;
- buf[3] = b;
- buf[4] = modrm;
- buf[5] = 0xc3;
+ CASE_SIMD_SCALAR_FP(, 0x0f, 0x2b): /* movnts{s,d} xmm,mem */
+ host_and_vcpu_must_have(sse4a);
+ /* fall through */
+ CASE_SIMD_PACKED_FP(, 0x0f, 0x2b): /* movntp{s,d} xmm,m128 */
+ CASE_SIMD_PACKED_FP(_VEX, 0x0f, 0x2b): /* vmovntp{s,d} {x,y}mm,mem */
+ generate_exception_if(ea.type != OP_MEM, EXC_UD);
+ sfence = true;
+ /* fall through */
+ CASE_SIMD_ALL_FP(, 0x0f, 0x10): /* mov{up,s}{s,d} xmm/mem,xmm */
+ CASE_SIMD_ALL_FP(, 0x0f, 0x11): /* mov{up,s}{s,d} xmm,xmm/mem */
+ CASE_SIMD_PACKED_FP(_VEX, 0x0f, 0x10): /* vmovup{s,d} {x,y}mm/mem,{x,y}mm */
+ CASE_SIMD_SCALAR_FP(_VEX, 0x0f, 0x10): /* vmovs{s,d} mem,xmm */
+ /* vmovs{s,d} xmm,xmm,xmm */
+ CASE_SIMD_PACKED_FP(_VEX, 0x0f, 0x11): /* vmovup{s,d} {x,y}mm,{x,y}mm/mem */
+ CASE_SIMD_SCALAR_FP(_VEX, 0x0f, 0x11): /* vmovs{s,d} xmm,mem */
+ /* vmovs{s,d} xmm,xmm,xmm */
+ CASE_SIMD_PACKED_FP(, 0x0f, 0x14): /* unpcklp{s,d} xmm/m128,xmm */
+ CASE_SIMD_PACKED_FP(_VEX, 0x0f, 0x14): /* vunpcklp{s,d} {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ CASE_SIMD_PACKED_FP(, 0x0f, 0x15): /* unpckhp{s,d} xmm/m128,xmm */
+ CASE_SIMD_PACKED_FP(_VEX, 0x0f, 0x15): /* vunpckhp{s,d} {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ CASE_SIMD_PACKED_FP(, 0x0f, 0x28): /* movap{s,d} xmm/m128,xmm */
+ CASE_SIMD_PACKED_FP(_VEX, 0x0f, 0x28): /* vmovap{s,d} {x,y}mm/mem,{x,y}mm */
+ CASE_SIMD_PACKED_FP(, 0x0f, 0x29): /* movap{s,d} xmm,xmm/m128 */
+ CASE_SIMD_PACKED_FP(_VEX, 0x0f, 0x29): /* vmovap{s,d} {x,y}mm,{x,y}mm/mem */
+ CASE_SIMD_ALL_FP(, 0x0f, 0x51): /* sqrt{p,s}{s,d} xmm/mem,xmm */
+ CASE_SIMD_ALL_FP(_VEX, 0x0f, 0x51): /* vsqrtp{s,d} {x,y}mm/mem,{x,y}mm */
+ /* vsqrts{s,d} xmm,xmm,xmm */
+ CASE_SIMD_SINGLE_FP(, 0x0f, 0x52): /* rsqrt{p,s}s xmm/mem,xmm */
+ CASE_SIMD_SINGLE_FP(_VEX, 0x0f, 0x52): /* vrsqrtps {x,y}mm/mem,{x,y}mm */
+ /* vrsqrtss xmm,xmm,xmm */
+ CASE_SIMD_SINGLE_FP(, 0x0f, 0x53): /* rcp{p,s}s xmm/mem,xmm */
+ CASE_SIMD_SINGLE_FP(_VEX, 0x0f, 0x53): /* vrcpps {x,y}mm/mem,{x,y}mm */
+ /* vrcpss xmm,xmm,xmm */
+ CASE_SIMD_PACKED_FP(, 0x0f, 0x54): /* andp{s,d} xmm/m128,xmm */
+ CASE_SIMD_PACKED_FP(_VEX, 0x0f, 0x54): /* vandp{s,d} {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ CASE_SIMD_PACKED_FP(, 0x0f, 0x55): /* andnp{s,d} xmm/m128,xmm */
+ CASE_SIMD_PACKED_FP(_VEX, 0x0f, 0x55): /* vandnp{s,d} {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ CASE_SIMD_PACKED_FP(, 0x0f, 0x56): /* orp{s,d} xmm/m128,xmm */
+ CASE_SIMD_PACKED_FP(_VEX, 0x0f, 0x56): /* vorp{s,d} {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ CASE_SIMD_PACKED_FP(, 0x0f, 0x57): /* xorp{s,d} xmm/m128,xmm */
+ CASE_SIMD_PACKED_FP(_VEX, 0x0f, 0x57): /* vxorp{s,d} {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ CASE_SIMD_ALL_FP(, 0x0f, 0x58): /* add{p,s}{s,d} xmm/mem,xmm */
+ CASE_SIMD_ALL_FP(_VEX, 0x0f, 0x58): /* vadd{p,s}{s,d} {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ CASE_SIMD_ALL_FP(, 0x0f, 0x59): /* mul{p,s}{s,d} xmm/mem,xmm */
+ CASE_SIMD_ALL_FP(_VEX, 0x0f, 0x59): /* vmul{p,s}{s,d} {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ CASE_SIMD_ALL_FP(, 0x0f, 0x5c): /* sub{p,s}{s,d} xmm/mem,xmm */
+ CASE_SIMD_ALL_FP(_VEX, 0x0f, 0x5c): /* vsub{p,s}{s,d} {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ CASE_SIMD_ALL_FP(, 0x0f, 0x5d): /* min{p,s}{s,d} xmm/mem,xmm */
+ CASE_SIMD_ALL_FP(_VEX, 0x0f, 0x5d): /* vmin{p,s}{s,d} {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ CASE_SIMD_ALL_FP(, 0x0f, 0x5e): /* div{p,s}{s,d} xmm/mem,xmm */
+ CASE_SIMD_ALL_FP(_VEX, 0x0f, 0x5e): /* vdiv{p,s}{s,d} {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ CASE_SIMD_ALL_FP(, 0x0f, 0x5f): /* max{p,s}{s,d} xmm/mem,xmm */
+ CASE_SIMD_ALL_FP(_VEX, 0x0f, 0x5f): /* vmax{p,s}{s,d} {x,y}mm/mem,{x,y}mm,{x,y}mm */
if ( vex.opcx == vex_none )
{
if ( vex.pfx & VEX_PREFIX_DOUBLE_MASK )
vcpu_must_have(sse2);
else
vcpu_must_have(sse);
- ea.bytes = 16;
- SET_SSE_PREFIX(buf[0], vex.pfx);
get_fpu(X86EMUL_FPU_xmm, &fic);
}
else
{
- fail_if((vex.reg != 0xf) &&
- ((ea.type == OP_MEM) ||
- !(vex.pfx & VEX_PREFIX_SCALAR_MASK)));
host_and_vcpu_must_have(avx);
+ fail_if((vex.pfx & VEX_PREFIX_SCALAR_MASK) && vex.l);
get_fpu(X86EMUL_FPU_ymm, &fic);
- ea.bytes = 16 << vex.l;
}
- if ( vex.pfx & VEX_PREFIX_SCALAR_MASK )
- ea.bytes = vex.pfx & VEX_PREFIX_DOUBLE_MASK ? 8 : 4;
+ simd_0f_common:
+ {
+ uint8_t *buf = get_stub(stub);
+
+ buf[0] = 0x3e;
+ buf[1] = 0x3e;
+ buf[2] = 0x0f;
+ buf[3] = b;
+ buf[4] = modrm;
if ( ea.type == OP_MEM )
{
- uint32_t mxcsr = 0;
-
- if ( b < 0x28 )
- mxcsr = MXCSR_MM;
- else if ( vcpu_has_misalignsse() )
- asm ( "stmxcsr %0" : "=m" (mxcsr) );
- generate_exception_if(!(mxcsr & MXCSR_MM) &&
- !is_aligned(ea.mem.seg, ea.mem.off, ea.bytes,
- ctxt, ops),
- EXC_GP, 0);
- if ( !(b & 1) )
- rc = ops->read(ea.mem.seg, ea.mem.off+0, mmvalp,
- ea.bytes, ctxt);
- else
- fail_if(!ops->write); /* Check before running the stub. */
/* convert memory operand to (%rAX) */
rex_prefix &= ~REX_B;
vex.b = 1;
buf[4] &= 0x38;
}
- if ( !rc )
- {
- copy_REX_VEX(buf, rex_prefix, vex);
- asm volatile ( "call *%0" : : "r" (stub.func), "a" (mmvalp)
- : "memory" );
- }
- put_fpu(&fic);
- put_stub(stub);
- if ( !rc && (b & 1) && (ea.type == OP_MEM) )
- {
- ASSERT(ops->write); /* See the fail_if() above. */
- rc = ops->write(ea.mem.seg, ea.mem.off, mmvalp,
- ea.bytes, ctxt);
- }
- if ( rc )
- goto done;
- dst.type = OP_NONE;
+ fic.insn_bytes = 5;
break;
}
@@ -5338,6 +5469,125 @@ x86_emulate(
break;
}
+ CASE_SIMD_PACKED_INT(0x0f, 0x60): /* punpcklbw {,x}mm/mem,{,x}mm */
+ case X86EMUL_OPC_VEX_66(0x0f, 0x60): /* vpunpcklbw {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ CASE_SIMD_PACKED_INT(0x0f, 0x61): /* punpcklwd {,x}mm/mem,{,x}mm */
+ case X86EMUL_OPC_VEX_66(0x0f, 0x61): /* vpunpcklwd {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ CASE_SIMD_PACKED_INT(0x0f, 0x62): /* punpckldq {,x}mm/mem,{,x}mm */
+ case X86EMUL_OPC_VEX_66(0x0f, 0x62): /* vpunpckldq {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ CASE_SIMD_PACKED_INT(0x0f, 0x68): /* punpckhbw {,x}mm/mem,{,x}mm */
+ case X86EMUL_OPC_VEX_66(0x0f, 0x68): /* vpunpckhbw {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ CASE_SIMD_PACKED_INT(0x0f, 0x69): /* punpckhwd {,x}mm/mem,{,x}mm */
+ case X86EMUL_OPC_VEX_66(0x0f, 0x69): /* vpunpckhwd {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ CASE_SIMD_PACKED_INT(0x0f, 0x6a): /* punpckhdq {,x}mm/mem,{,x}mm */
+ case X86EMUL_OPC_VEX_66(0x0f, 0x6a): /* vpunpckhdq {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ op_bytes = vex.pfx ? 16 << vex.l : 4;
+ /* fall through */
+ CASE_SIMD_PACKED_INT(0x0f, 0x63): /* packsswb {,x}mm/mem,{,x}mm */
+ case X86EMUL_OPC_VEX_66(0x0f, 0x63): /* vpacksswb {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ CASE_SIMD_PACKED_INT(0x0f, 0x64): /* pcmpgtb {,x}mm/mem,{,x}mm */
+ case X86EMUL_OPC_VEX_66(0x0f, 0x64): /* vpcmpgtb {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ CASE_SIMD_PACKED_INT(0x0f, 0x65): /* pcmpgtw {,x}mm/mem,{,x}mm */
+ case X86EMUL_OPC_VEX_66(0x0f, 0x65): /* vpcmpgtw {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ CASE_SIMD_PACKED_INT(0x0f, 0x66): /* pcmpgtd {,x}mm/mem,{,x}mm */
+ case X86EMUL_OPC_VEX_66(0x0f, 0x66): /* vpcmpgtd {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ CASE_SIMD_PACKED_INT(0x0f, 0x67): /* packuswb {,x}mm/mem,{,x}mm */
+ case X86EMUL_OPC_VEX_66(0x0f, 0x67): /* vpackuswb {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ CASE_SIMD_PACKED_INT(0x0f, 0x6b): /* packssdw {,x}mm/mem,{,x}mm */
+ case X86EMUL_OPC_VEX_66(0x0f, 0x6b): /* vpackssdw {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ case X86EMUL_OPC_66(0x0f, 0x6c): /* punpcklqdq xmm/m128,xmm */
+ case X86EMUL_OPC_VEX_66(0x0f, 0x6c): /* vpunpcklqdq {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ case X86EMUL_OPC_66(0x0f, 0x6d): /* punpckhqdq xmm/m128,xmm */
+ case X86EMUL_OPC_VEX_66(0x0f, 0x6d): /* vpunpckhqdq {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ CASE_SIMD_PACKED_INT(0x0f, 0x74): /* pcmpeqb {,x}mm/mem,{,x}mm */
+ case X86EMUL_OPC_VEX_66(0x0f, 0x74): /* vpcmpeqb {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ CASE_SIMD_PACKED_INT(0x0f, 0x75): /* pcmpeqw {,x}mm/mem,{,x}mm */
+ case X86EMUL_OPC_VEX_66(0x0f, 0x75): /* vpcmpeqw {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ CASE_SIMD_PACKED_INT(0x0f, 0x76): /* pcmpeqd {,x}mm/mem,{,x}mm */
+ case X86EMUL_OPC_VEX_66(0x0f, 0x76): /* vpcmpeqd {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ case X86EMUL_OPC_66(0x0f, 0xd4): /* paddq xmm/m128,xmm */
+ case X86EMUL_OPC_VEX_66(0x0f, 0xd4): /* vpaddq {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ CASE_SIMD_PACKED_INT(0x0f, 0xd5): /* pmullw {,x}mm/mem,{,x}mm */
+ case X86EMUL_OPC_VEX_66(0x0f, 0xd5): /* vpmullw {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ CASE_SIMD_PACKED_INT(0x0f, 0xd8): /* psubusb {,x}mm/mem,{,x}mm */
+ case X86EMUL_OPC_VEX_66(0x0f, 0xd8): /* vpsubusb {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ CASE_SIMD_PACKED_INT(0x0f, 0xd9): /* psubusw {,x}mm/mem,{,x}mm */
+ case X86EMUL_OPC_VEX_66(0x0f, 0xd9): /* vpsubusw {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ case X86EMUL_OPC_66(0x0f, 0xda): /* pminub xmm/m128,xmm */
+ case X86EMUL_OPC_VEX_66(0x0f, 0xda): /* vpminub {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ CASE_SIMD_PACKED_INT(0x0f, 0xdb): /* pand {,x}mm/mem,{,x}mm */
+ case X86EMUL_OPC_VEX_66(0x0f, 0xdb): /* vpand {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ CASE_SIMD_PACKED_INT(0x0f, 0xdc): /* paddusb {,x}mm/mem,{,x}mm */
+ case X86EMUL_OPC_VEX_66(0x0f, 0xdc): /* vpaddusb {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ CASE_SIMD_PACKED_INT(0x0f, 0xdd): /* paddusw {,x}mm/mem,{,x}mm */
+ case X86EMUL_OPC_VEX_66(0x0f, 0xdd): /* vpaddusw {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ case X86EMUL_OPC_66(0x0f, 0xde): /* pmaxub xmm/m128,xmm */
+ case X86EMUL_OPC_VEX_66(0x0f, 0xde): /* vpmaxub {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ CASE_SIMD_PACKED_INT(0x0f, 0xdf): /* pandn {,x}mm/mem,{,x}mm */
+ case X86EMUL_OPC_VEX_66(0x0f, 0xdf): /* vpandn {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ case X86EMUL_OPC_66(0x0f, 0xe0): /* pavgb xmm/m128,xmm */
+ case X86EMUL_OPC_VEX_66(0x0f, 0xe0): /* vpavgb {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ case X86EMUL_OPC_66(0x0f, 0xe3): /* pavgw xmm/m128,xmm */
+ case X86EMUL_OPC_VEX_66(0x0f, 0xe3): /* vpavgw {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ case X86EMUL_OPC_66(0x0f, 0xe4): /* pmulhuw xmm/m128,xmm */
+ case X86EMUL_OPC_VEX_66(0x0f, 0xe4): /* vpmulhuw {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ CASE_SIMD_PACKED_INT(0x0f, 0xe5): /* pmulhw {,x}mm/mem,{,x}mm */
+ case X86EMUL_OPC_VEX_66(0x0f, 0xe5): /* vpmulhw {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ CASE_SIMD_PACKED_INT(0x0f, 0xe8): /* psubsb {,x}mm/mem,{,x}mm */
+ case X86EMUL_OPC_VEX_66(0x0f, 0xe8): /* vpsubsb {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ CASE_SIMD_PACKED_INT(0x0f, 0xe9): /* psubsw {,x}mm/mem,{,x}mm */
+ case X86EMUL_OPC_VEX_66(0x0f, 0xe9): /* vpsubsw {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ case X86EMUL_OPC_66(0x0f, 0xea): /* pminsw xmm/m128,xmm */
+ case X86EMUL_OPC_VEX_66(0x0f, 0xea): /* vpminsw {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ CASE_SIMD_PACKED_INT(0x0f, 0xeb): /* por {,x}mm/mem,{,x}mm */
+ case X86EMUL_OPC_VEX_66(0x0f, 0xeb): /* vpor {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ CASE_SIMD_PACKED_INT(0x0f, 0xec): /* paddsb {,x}mm/mem,{,x}mm */
+ case X86EMUL_OPC_VEX_66(0x0f, 0xec): /* vpaddsb {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ CASE_SIMD_PACKED_INT(0x0f, 0xed): /* paddsw {,x}mm/mem,{,x}mm */
+ case X86EMUL_OPC_VEX_66(0x0f, 0xed): /* vpaddsw {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ case X86EMUL_OPC_66(0x0f, 0xee): /* pmaxsw xmm/m128,xmm */
+ case X86EMUL_OPC_VEX_66(0x0f, 0xee): /* vpmaxsw {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ CASE_SIMD_PACKED_INT(0x0f, 0xef): /* pxor {,x}mm/mem,{,x}mm */
+ case X86EMUL_OPC_VEX_66(0x0f, 0xef): /* vpxor {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ case X86EMUL_OPC_66(0x0f, 0xf4): /* pmuludq xmm/m128,xmm */
+ case X86EMUL_OPC_VEX_66(0x0f, 0xf4): /* vpmuludq {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ case X86EMUL_OPC_66(0x0f, 0xf6): /* psadbw xmm/m128,xmm */
+ case X86EMUL_OPC_VEX_66(0x0f, 0xf6): /* vpsadbw {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ CASE_SIMD_PACKED_INT(0x0f, 0xf8): /* psubb {,x}mm/mem,{,x}mm */
+ case X86EMUL_OPC_VEX_66(0x0f, 0xf8): /* vpsubb {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ CASE_SIMD_PACKED_INT(0x0f, 0xf9): /* psubw {,x}mm/mem,{,x}mm */
+ case X86EMUL_OPC_VEX_66(0x0f, 0xf9): /* vpsubw {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ CASE_SIMD_PACKED_INT(0x0f, 0xfa): /* psubd {,x}mm/mem,{,x}mm */
+ case X86EMUL_OPC_VEX_66(0x0f, 0xfa): /* vpsubd {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ case X86EMUL_OPC_66(0x0f, 0xfb): /* psubq xmm/m128,xmm */
+ case X86EMUL_OPC_VEX_66(0x0f, 0xfb): /* vpsubq {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ CASE_SIMD_PACKED_INT(0x0f, 0xfc): /* paddb {,x}mm/mem,{,x}mm */
+ case X86EMUL_OPC_VEX_66(0x0f, 0xfc): /* vpaddb {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ CASE_SIMD_PACKED_INT(0x0f, 0xfd): /* paddw {,x}mm/mem,{,x}mm */
+ case X86EMUL_OPC_VEX_66(0x0f, 0xfd): /* vpaddw {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ CASE_SIMD_PACKED_INT(0x0f, 0xfe): /* paddd {,x}mm/mem,{,x}mm */
+ case X86EMUL_OPC_VEX_66(0x0f, 0xfe): /* vpaddd {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ simd_0f_int:
+ if ( vex.opcx != vex_none )
+ {
+ if ( vex.l )
+ host_and_vcpu_must_have(avx2);
+ else
+ host_and_vcpu_must_have(avx);
+ get_fpu(X86EMUL_FPU_ymm, &fic);
+ }
+ else if ( vex.pfx )
+ {
+ vcpu_must_have(sse2);
+ get_fpu(X86EMUL_FPU_xmm, &fic);
+ }
+ else
+ {
+ host_and_vcpu_must_have(mmx);
+ get_fpu(X86EMUL_FPU_mmx, &fic);
+ }
+ goto simd_0f_common;
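+
+ /*
+ * For illustration (comment added here, not part of the original patch):
+ * paddb mm/m64,mm takes the MMX arm above, the 66-prefixed form
+ * paddb xmm/m128,xmm requires SSE2, and the VEX forms require AVX
+ * (128-bit) or AVX2 (256-bit).
+ */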
+
case X86EMUL_OPC(0x0f, 0xe7): /* movntq mm,m64 */
case X86EMUL_OPC_66(0x0f, 0xe7): /* movntdq xmm,m128 */
case X86EMUL_OPC_VEX_66(0x0f, 0xe7): /* vmovntdq xmm,m128 */
@@ -5467,6 +5717,82 @@ x86_emulate(
break;
}
+ CASE_SIMD_PACKED_INT(0x0f, 0x70): /* pshuf{w,d} $imm8,{,x}mm/mem,{,x}mm */
+ case X86EMUL_OPC_VEX_66(0x0f, 0x70): /* vpshufd $imm8,{x,y}mm/mem,{x,y}mm */
+ case X86EMUL_OPC_VEX_F3(0x0f, 0x70): /* vpshufhw $imm8,{x,y}mm/mem,{x,y}mm */
+ case X86EMUL_OPC_VEX_F2(0x0f, 0x70): /* vpshuflw $imm8,{x,y}mm/mem,{x,y}mm */
+ d = (d & ~SrcMask) | SrcMem | TwoOp;
+ op_bytes = vex.pfx ? 16 << vex.l : 8;
+ simd_0f_int_imm8:
+ if ( vex.opcx != vex_none )
+ {
+ if ( vex.l )
+ host_and_vcpu_must_have(avx2);
+ else
+ host_and_vcpu_must_have(avx);
+ get_fpu(X86EMUL_FPU_ymm, &fic);
+ }
+ else if ( vex.pfx )
+ {
+ vcpu_must_have(sse2);
+ get_fpu(X86EMUL_FPU_xmm, &fic);
+ }
+ else
+ {
+ host_and_vcpu_must_have(mmx);
+ vcpu_must_have(sse);
+ get_fpu(X86EMUL_FPU_mmx, &fic);
+ }
+ simd_0f_imm8:
+ {
+ uint8_t *buf = get_stub(stub);
+
+ buf[0] = 0x3e;
+ buf[1] = 0x3e;
+ buf[2] = 0x0f;
+ buf[3] = b;
+ buf[4] = modrm;
+ if ( ea.type == OP_MEM )
+ {
+ /* Convert memory operand to (%rAX). */
+ rex_prefix &= ~REX_B;
+ vex.b = 1;
+ buf[4] &= 0x38;
+ }
+ buf[5] = imm1;
+ fic.insn_bytes = 6;
+ break;
+ }
+
+ case X86EMUL_OPC_F2(0x0f, 0xf0): /* lddqu m128,xmm */
+ case X86EMUL_OPC_VEX_F2(0x0f, 0xf0): /* vlddqu mem,{x,y}mm */
+ generate_exception_if(ea.type != OP_MEM, EXC_UD);
+ /* fall through */
+ case X86EMUL_OPC_66(0x0f, 0x7c): /* haddpd xmm/m128,xmm */
+ case X86EMUL_OPC_F2(0x0f, 0x7c): /* haddps xmm/m128,xmm */
+ case X86EMUL_OPC_VEX_66(0x0f, 0x7c): /* vhaddpd {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ case X86EMUL_OPC_VEX_F2(0x0f, 0x7c): /* vhaddps {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ case X86EMUL_OPC_66(0x0f, 0x7d): /* hsubpd xmm/m128,xmm */
+ case X86EMUL_OPC_F2(0x0f, 0x7d): /* hsubps xmm/m128,xmm */
+ case X86EMUL_OPC_VEX_66(0x0f, 0x7d): /* vhsubpd {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ case X86EMUL_OPC_VEX_F2(0x0f, 0x7d): /* vhsubps {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ case X86EMUL_OPC_66(0x0f, 0xd0): /* addsubpd xmm/m128,xmm */
+ case X86EMUL_OPC_F2(0x0f, 0xd0): /* addsubps xmm/m128,xmm */
+ case X86EMUL_OPC_VEX_66(0x0f, 0xd0): /* vaddsubpd {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ case X86EMUL_OPC_VEX_F2(0x0f, 0xd0): /* vaddsubps {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ op_bytes = 16 << vex.l;
+ if ( vex.opcx != vex_none )
+ {
+ host_and_vcpu_must_have(avx);
+ get_fpu(X86EMUL_FPU_ymm, &fic);
+ }
+ else
+ {
+ host_and_vcpu_must_have(sse3);
+ get_fpu(X86EMUL_FPU_xmm, &fic);
+ }
+ goto simd_0f_common;
+
case X86EMUL_OPC(0x0f, 0x80) ... X86EMUL_OPC(0x0f, 0x8f): /* jcc (near) */
if ( test_cc(b, _regs._eflags) )
jmp_rel((int32_t)src.val);
@@ -5767,12 +6093,41 @@ x86_emulate(
}
goto add;
+ CASE_SIMD_ALL_FP(, 0x0f, 0xc2): /* cmp{p,s}{s,d} $imm8,xmm/mem,xmm */
+ CASE_SIMD_ALL_FP(_VEX, 0x0f, 0xc2): /* vcmp{p,s}{s,d} $imm8,{x,y}mm/mem,{x,y}mm */
+ CASE_SIMD_PACKED_FP(, 0x0f, 0xc6): /* shufp{s,d} $imm8,xmm/mem,xmm */
+ CASE_SIMD_PACKED_FP(_VEX, 0x0f, 0xc6): /* vshufp{s,d} $imm8,{x,y}mm/mem,{x,y}mm */
+ d = (d & ~SrcMask) | SrcMem;
+ if ( vex.opcx == vex_none )
+ {
+ if ( vex.pfx & VEX_PREFIX_DOUBLE_MASK )
+ vcpu_must_have(sse2);
+ else
+ vcpu_must_have(sse);
+ get_fpu(X86EMUL_FPU_xmm, &fic);
+ }
+ else
+ {
+ host_and_vcpu_must_have(avx);
+ fail_if((vex.pfx & VEX_PREFIX_SCALAR_MASK) && vex.l);
+ get_fpu(X86EMUL_FPU_ymm, &fic);
+ }
+ goto simd_0f_imm8;
+
case X86EMUL_OPC(0x0f, 0xc3): /* movnti */
/* Ignore the non-temporal hint for now. */
vcpu_must_have(sse2);
dst.val = src.val;
+ sfence = true;
break;
+ CASE_SIMD_PACKED_INT(0x0f, 0xc4): /* pinsrw $imm8,r32/m16,{,x}mm */
+ case X86EMUL_OPC_VEX_66(0x0f, 0xc4): /* vpinsrw $imm8,r32/m16,xmm,xmm */
+ generate_exception_if(vex.l, EXC_UD);
+ memcpy(mmvalp, &src.val, 2);
+ ea.type = OP_MEM;
+ goto simd_0f_int_imm8;
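+
+ /*
+ * Note (illustrative comment, not part of the original patch): both
+ * pinsrw forms are funnelled through the memory path - the generic
+ * SrcMem16 handling has already fetched the 16-bit source (register
+ * or m16) into src.val, which is staged in mmvalp above with ea.type
+ * forced to OP_MEM, so the stub reads it via (%rAX).
+ */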
+
case X86EMUL_OPC(0x0f, 0xc7): /* Grp9 */
{
union {
@@ -5953,6 +6308,46 @@ x86_emulate(
}
break;
+ CASE_SIMD_PACKED_INT(0x0f, 0xd1): /* psrlw {,x}mm/mem,{,x}mm */
+ case X86EMUL_OPC_VEX_66(0x0f, 0xd1): /* vpsrlw xmm/m128,{x,y}mm,{x,y}mm */
+ CASE_SIMD_PACKED_INT(0x0f, 0xd2): /* psrld {,x}mm/mem,{,x}mm */
+ case X86EMUL_OPC_VEX_66(0x0f, 0xd2): /* vpsrld xmm/m128,{x,y}mm,{x,y}mm */
+ CASE_SIMD_PACKED_INT(0x0f, 0xd3): /* psrlq {,x}mm/mem,{,x}mm */
+ case X86EMUL_OPC_VEX_66(0x0f, 0xd3): /* vpsrlq xmm/m128,{x,y}mm,{x,y}mm */
+ CASE_SIMD_PACKED_INT(0x0f, 0xe1): /* psraw {,x}mm/mem,{,x}mm */
+ case X86EMUL_OPC_VEX_66(0x0f, 0xe1): /* vpsraw xmm/m128,{x,y}mm,{x,y}mm */
+ CASE_SIMD_PACKED_INT(0x0f, 0xe2): /* psrad {,x}mm/mem,{,x}mm */
+ case X86EMUL_OPC_VEX_66(0x0f, 0xe2): /* vpsrad xmm/m128,{x,y}mm,{x,y}mm */
+ CASE_SIMD_PACKED_INT(0x0f, 0xf1): /* psllw {,x}mm/mem,{,x}mm */
+ case X86EMUL_OPC_VEX_66(0x0f, 0xf1): /* vpsllw xmm/m128,{x,y}mm,{x,y}mm */
+ CASE_SIMD_PACKED_INT(0x0f, 0xf2): /* pslld {,x}mm/mem,{,x}mm */
+ case X86EMUL_OPC_VEX_66(0x0f, 0xf2): /* vpslld xmm/m128,{x,y}mm,{x,y}mm */
+ CASE_SIMD_PACKED_INT(0x0f, 0xf3): /* psllq {,x}mm/mem,{,x}mm */
+ case X86EMUL_OPC_VEX_66(0x0f, 0xf3): /* vpsllq xmm/m128,{x,y}mm,{x,y}mm */
+ op_bytes = vex.pfx ? 16 : 8;
+ goto simd_0f_int;
+
+ case X86EMUL_OPC(0x0f, 0xd4): /* paddq mm/m64,mm */
+ case X86EMUL_OPC(0x0f, 0xf4): /* pmuludq mm/m64,mm */
+ case X86EMUL_OPC(0x0f, 0xfb): /* psubq mm/m64,mm */
+ host_and_vcpu_must_have(mmx);
+ vcpu_must_have(sse2);
+ get_fpu(X86EMUL_FPU_mmx, &fic);
+ goto simd_0f_common;
+
+ case X86EMUL_OPC(0x0f, 0xda): /* pminub mm/m64,mm */
+ case X86EMUL_OPC(0x0f, 0xde): /* pmaxub mm/m64,mm */
+ case X86EMUL_OPC(0x0f, 0xea): /* pminsw mm/m64,mm */
+ case X86EMUL_OPC(0x0f, 0xee): /* pmaxsw mm/m64,mm */
+ case X86EMUL_OPC(0x0f, 0xe0): /* pavgb mm/m64,mm */
+ case X86EMUL_OPC(0x0f, 0xe3): /* pavgw mm/m64,mm */
+ case X86EMUL_OPC(0x0f, 0xe4): /* pmulhuw mm/m64,mm */
+ case X86EMUL_OPC(0x0f, 0xf6): /* psadbw mm/m64,mm */
+ host_and_vcpu_must_have(mmx);
+ vcpu_must_have(sse);
+ get_fpu(X86EMUL_FPU_mmx, &fic);
+ goto simd_0f_common;
+
case X86EMUL_OPC(0x0f38, 0xf0): /* movbe m,r */
case X86EMUL_OPC(0x0f38, 0xf1): /* movbe r,m */
vcpu_must_have(movbe);
@@ -6214,6 +6609,75 @@ x86_emulate(
goto cannot_emulate;
}
+ if ( state->simd_size )
+ {
+#ifdef __XEN__
+ uint8_t *buf = stub.ptr;
+#else
+ uint8_t *buf = get_stub(stub);
+#endif
+
+ generate_exception_if(!op_bytes, EXC_UD);
+ generate_exception_if(vex.opcx && (d & TwoOp) && vex.reg != 0xf,
+ EXC_UD);
+
+ if ( !buf )
+ BUG();
+ if ( vex.opcx == vex_none )
+ SET_SSE_PREFIX(buf[0], vex.pfx);
+
+ buf[fic.insn_bytes] = 0xc3;
+ copy_REX_VEX(buf, rex_prefix, vex);
+
+ if ( ea.type == OP_MEM )
+ {
+ uint32_t mxcsr = 0;
+
+ if ( op_bytes < 16 ||
+ (vex.opcx
+ ? /* vmov{a,nt}p{s,d} are exceptions. */
+ ext != ext_0f || ((b | 1) != 0x29 && b != 0x2b)
+ : /* movup{s,d} and lddqu are exceptions. */
+ ext == ext_0f && ((b | 1) == 0x11 || b == 0xf0)) )
+ mxcsr = MXCSR_MM;
+ else if ( vcpu_has_misalignsse() )
+ asm ( "stmxcsr %0" : "=m" (mxcsr) );
+ generate_exception_if(!(mxcsr & MXCSR_MM) &&
+ !is_aligned(ea.mem.seg, ea.mem.off, op_bytes,
+ ctxt, ops),
+ EXC_GP, 0);
+ if ( (d & SrcMask) == SrcMem )
+ {
+ rc = ops->read(ea.mem.seg, ea.mem.off, mmvalp, op_bytes, ctxt);
+ if ( rc != X86EMUL_OKAY )
+ goto done;
+ dst.type = OP_NONE;
+ }
+ else if ( (d & DstMask) == DstMem )
+ {
+ fail_if(!ops->write); /* Check before running the stub. */
+ ASSERT(d & Mov);
+ dst.type = OP_MEM;
+ dst.bytes = op_bytes;
+ dst.mem = ea.mem;
+ }
+ else if ( (d & SrcMask) == SrcMem16 )
+ dst.type = OP_NONE;
+ else
+ {
+ ASSERT_UNREACHABLE();
+ return X86EMUL_UNHANDLEABLE;
+ }
+ }
+ else
+ dst.type = OP_NONE;
+
+ invoke_stub("", "", "+m" (*mmvalp) : "a" (mmvalp));
+
+ put_stub(stub);
+ put_fpu(&fic);
+ }
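+
+ /*
+ * Worked example (illustrative comment, not part of the original patch):
+ * emulating addps (%rdx),%xmm3 (ModRM 1a, rewritten to 18, i.e. (%rAX))
+ * produces the stub bytes 3e 3e 0f 58 18 c3 in the 32-bit harness (the
+ * second DS: byte becomes an empty REX prefix, 40, on 64-bit), invoked
+ * with %rAX pointing at mmvalp, which holds the just-read memory operand.
+ */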
+
switch ( dst.type )
{
case OP_REG:
@@ -6240,8 +6704,11 @@ x86_emulate(
else
{
fail_if(!ops->write);
- rc = ops->write(
- dst.mem.seg, dst.mem.off, &dst.val, dst.bytes, ctxt);
+ rc = ops->write(dst.mem.seg, dst.mem.off,
+ !state->simd_size ? &dst.val : (void *)mmvalp,
+ dst.bytes, ctxt);
+ if ( sfence )
+ asm volatile ( "sfence" ::: "memory" );
}
if ( rc != 0 )
goto done;
@@ -6498,22 +6965,6 @@ x86_insn_is_mem_write(const struct x86_e
case 0x6c: case 0x6d: /* INS */
case 0xa4: case 0xa5: /* MOVS */
case 0xaa: case 0xab: /* STOS */
- case X86EMUL_OPC(0x0f, 0x11): /* MOVUPS */
- case X86EMUL_OPC_VEX(0x0f, 0x11): /* VMOVUPS */
- case X86EMUL_OPC_66(0x0f, 0x11): /* MOVUPD */
- case X86EMUL_OPC_VEX_66(0x0f, 0x11): /* VMOVUPD */
- case X86EMUL_OPC_F3(0x0f, 0x11): /* MOVSS */
- case X86EMUL_OPC_VEX_F3(0x0f, 0x11): /* VMOVSS */
- case X86EMUL_OPC_F2(0x0f, 0x11): /* MOVSD */
- case X86EMUL_OPC_VEX_F2(0x0f, 0x11): /* VMOVSD */
- case X86EMUL_OPC(0x0f, 0x29): /* MOVAPS */
- case X86EMUL_OPC_VEX(0x0f, 0x29): /* VMOVAPS */
- case X86EMUL_OPC_66(0x0f, 0x29): /* MOVAPD */
- case X86EMUL_OPC_VEX_66(0x0f, 0x29): /* VMOVAPD */
- case X86EMUL_OPC(0x0f, 0x2b): /* MOVNTPS */
- case X86EMUL_OPC_VEX(0x0f, 0x2b): /* VMOVNTPS */
- case X86EMUL_OPC_66(0x0f, 0x2b): /* MOVNTPD */
- case X86EMUL_OPC_VEX_66(0x0f, 0x2b): /* VMOVNTPD */
case X86EMUL_OPC(0x0f, 0x7e): /* MOVD/MOVQ */
case X86EMUL_OPC_66(0x0f, 0x7e): /* MOVD/MOVQ */
case X86EMUL_OPC_VEX_66(0x0f, 0x7e): /* VMOVD/VMOVQ */
--- a/xen/include/asm-x86/cpufeature.h
+++ b/xen/include/asm-x86/cpufeature.h
@@ -71,12 +71,14 @@
#define cpu_has_xsavec boot_cpu_has(X86_FEATURE_XSAVEC)
#define cpu_has_xgetbv1 boot_cpu_has(X86_FEATURE_XGETBV1)
#define cpu_has_xsaves boot_cpu_has(X86_FEATURE_XSAVES)
+#define cpu_has_avx2 boot_cpu_has(X86_FEATURE_AVX2)
#define cpu_has_monitor boot_cpu_has(X86_FEATURE_MONITOR)
#define cpu_has_eist boot_cpu_has(X86_FEATURE_EIST)
#define cpu_has_hypervisor boot_cpu_has(X86_FEATURE_HYPERVISOR)
#define cpu_has_rdrand boot_cpu_has(X86_FEATURE_RDRAND)
#define cpu_has_rdseed boot_cpu_has(X86_FEATURE_RDSEED)
#define cpu_has_cmp_legacy boot_cpu_has(X86_FEATURE_CMP_LEGACY)
+#define cpu_has_sse4a boot_cpu_has(X86_FEATURE_SSE4A)
#define cpu_has_tbm boot_cpu_has(X86_FEATURE_TBM)
enum _cache_type {
* [PATCH 3/8] x86emul: support MMX/SSE/SSE2 moves
2017-01-25 14:49 [PATCH 0/8] x86emul: MMX/SSE/SSE2 support Jan Beulich
2017-01-25 15:03 ` [PATCH 1/8] x86emul: catch exceptions occurring in stubs Jan Beulich
2017-01-25 15:04 ` [PATCH 2/8] x86emul: support most memory accessing MMX/SSE/SSE2 insns Jan Beulich
@ 2017-01-25 15:04 ` Jan Beulich
2017-01-25 15:05 ` [PATCH 4/8] x86emul: support MMX/SSE/SSE2 converts Jan Beulich
` (4 subsequent siblings)
7 siblings, 0 replies; 9+ messages in thread
From: Jan Beulich @ 2017-01-25 15:04 UTC (permalink / raw)
To: xen-devel; +Cc: Andrew Cooper
[-- Attachment #1: Type: text/plain, Size: 38947 bytes --]
Previously supported insns are being converted to the new model, and
several new ones are being added.
To keep the stub handling reasonably simple, integrate SET_SSE_PREFIX()
into copy_REX_VEX(), at once switching the stubs to use an empty REX
prefix instead of a double DS: one (no byte registers are being
accessed, so an empty REX prefix has no effect), except (of course) for
the 32-bit test harness build.
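
A condensed sketch of the resulting stub layout, distilled from the
hunks below for easier review (illustrative only):

  buf[0]  0x3e   DS: - overwritten with 66/F3/F2 when an SSE prefix is
                 needed, or with 0xc4 (plus the two VEX payload bytes
                 in buf[1]/buf[2]) for VEX encodings
  buf[1]  PFX2   an empty REX prefix (0x40) for 64-bit, DS: (0x3e) for
                 the 32-bit harness; REX bits get OR-ed in here
  buf[2]  0x0f   two-byte escape
  then the opcode byte, ModRM (plus any immediate), and a trailing RET.

init_prefixes() returns a pointer past these three prefix bytes, and
copy_REX_VEX() patches them through negative offsets (ptr[0 - PFX_BYTES]
and so on).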
Signed-off-by: Jan Beulich <jbeulich@suse.com>
--- a/tools/tests/x86_emulator/test_x86_emulator.c
+++ b/tools/tests/x86_emulator/test_x86_emulator.c
@@ -1557,6 +1557,29 @@ int main(int argc, char **argv)
else
printf("skipped\n");
+ printf("%-40s", "Testing movq 32(%ecx),%xmm1...");
+ if ( stack_exec && cpu_has_sse2 )
+ {
+ decl_insn(movq_from_mem2);
+
+ asm volatile ( "pcmpeqb %%xmm1, %%xmm1\n"
+ put_insn(movq_from_mem2, "movq 32(%0), %%xmm1")
+ :: "c" (NULL) );
+
+ set_insn(movq_from_mem2);
+ rc = x86_emulate(&ctxt, &emulops);
+ if ( rc != X86EMUL_OKAY || !check_eip(movq_from_mem2) )
+ goto fail;
+ asm ( "pcmpgtb %%xmm0, %%xmm0\n\t"
+ "pcmpeqb %%xmm1, %%xmm0\n\t"
+ "pmovmskb %%xmm0, %0" : "=r" (rc) );
+ if ( rc != 0xffff )
+ goto fail;
+ printf("okay\n");
+ }
+ else
+ printf("skipped\n");
+
printf("%-40s", "Testing vmovq %xmm1,32(%edx)...");
if ( stack_exec && cpu_has_avx )
{
@@ -1581,6 +1604,29 @@ int main(int argc, char **argv)
else
printf("skipped\n");
+ printf("%-40s", "Testing vmovq 32(%edx),%xmm0...");
+ if ( stack_exec && cpu_has_avx )
+ {
+ decl_insn(vmovq_from_mem);
+
+ asm volatile ( "pcmpeqb %%xmm0, %%xmm0\n"
+ put_insn(vmovq_from_mem, "vmovq 32(%0), %%xmm0")
+ :: "d" (NULL) );
+
+ set_insn(vmovq_from_mem);
+ rc = x86_emulate(&ctxt, &emulops);
+ if ( rc != X86EMUL_OKAY || !check_eip(vmovq_from_mem) )
+ goto fail;
+ asm ( "pcmpgtb %%xmm1, %%xmm1\n\t"
+ "pcmpeqb %%xmm0, %%xmm1\n\t"
+ "pmovmskb %%xmm1, %0" : "=r" (rc) );
+ if ( rc != 0xffff )
+ goto fail;
+ printf("okay\n");
+ }
+ else
+ printf("skipped\n");
+
printf("%-40s", "Testing movdqu %xmm2,(%ecx)...");
if ( stack_exec && cpu_has_sse2 )
{
@@ -1812,6 +1858,33 @@ int main(int argc, char **argv)
else
printf("skipped\n");
+ printf("%-40s", "Testing movd 32(%ecx),%mm4...");
+ if ( stack_exec && cpu_has_mmx )
+ {
+ decl_insn(movd_from_mem);
+
+ asm volatile ( "pcmpgtb %%mm4, %%mm4\n"
+ put_insn(movd_from_mem, "movd 32(%0), %%mm4")
+ :: "c" (NULL) );
+
+ set_insn(movd_from_mem);
+ rc = x86_emulate(&ctxt, &emulops);
+ if ( rc != X86EMUL_OKAY || !check_eip(movd_from_mem) )
+ goto fail;
+ asm ( "pxor %%mm2,%%mm2\n\t"
+ "pcmpeqb %%mm4, %%mm2\n\t"
+ "pmovmskb %%mm2, %0" : "=r" (rc) );
+ if ( rc != 0xf0 )
+ goto fail;
+ asm ( "pcmpeqb %%mm4, %%mm3\n\t"
+ "pmovmskb %%mm3, %0" : "=r" (rc) );
+ if ( rc != 0x0f )
+ goto fail;
+ printf("okay\n");
+ }
+ else
+ printf("skipped\n");
+
printf("%-40s", "Testing movd %xmm2,32(%edx)...");
if ( stack_exec && cpu_has_sse2 )
{
@@ -1836,6 +1909,34 @@ int main(int argc, char **argv)
else
printf("skipped\n");
+ printf("%-40s", "Testing movd 32(%edx),%xmm3...");
+ if ( stack_exec && cpu_has_sse2 )
+ {
+ decl_insn(movd_from_mem2);
+
+ asm volatile ( "pcmpeqb %%xmm3, %%xmm3\n"
+ put_insn(movd_from_mem2, "movd 32(%0), %%xmm3")
+ :: "d" (NULL) );
+
+ set_insn(movd_from_mem2);
+ rc = x86_emulate(&ctxt, &emulops);
+ if ( rc != X86EMUL_OKAY || !check_eip(movd_from_mem2) )
+ goto fail;
+ asm ( "pxor %%xmm1,%%xmm1\n\t"
+ "pcmpeqb %%xmm3, %%xmm1\n\t"
+ "pmovmskb %%xmm1, %0" : "=r" (rc) );
+ if ( rc != 0xfff0 )
+ goto fail;
+ asm ( "pcmpeqb %%xmm2, %%xmm2\n\t"
+ "pcmpeqb %%xmm3, %%xmm2\n\t"
+ "pmovmskb %%xmm2, %0" : "=r" (rc) );
+ if ( rc != 0x000f )
+ goto fail;
+ printf("okay\n");
+ }
+ else
+ printf("skipped\n");
+
printf("%-40s", "Testing vmovd %xmm1,32(%ecx)...");
if ( stack_exec && cpu_has_avx )
{
@@ -1860,6 +1961,34 @@ int main(int argc, char **argv)
else
printf("skipped\n");
+ printf("%-40s", "Testing vmovd 32(%ecx),%xmm2...");
+ if ( stack_exec && cpu_has_avx )
+ {
+ decl_insn(vmovd_from_mem);
+
+ asm volatile ( "pcmpeqb %%xmm2, %%xmm2\n"
+ put_insn(vmovd_from_mem, "vmovd 32(%0), %%xmm2")
+ :: "c" (NULL) );
+
+ set_insn(vmovd_from_mem);
+ rc = x86_emulate(&ctxt, &emulops);
+ if ( rc != X86EMUL_OKAY || !check_eip(vmovd_from_mem) )
+ goto fail;
+ asm ( "pxor %%xmm0,%%xmm0\n\t"
+ "pcmpeqb %%xmm2, %%xmm0\n\t"
+ "pmovmskb %%xmm0, %0" : "=r" (rc) );
+ if ( rc != 0xfff0 )
+ goto fail;
+ asm ( "pcmpeqb %%xmm1, %%xmm1\n\t"
+ "pcmpeqb %%xmm2, %%xmm1\n\t"
+ "pmovmskb %%xmm1, %0" : "=r" (rc) );
+ if ( rc != 0x000f )
+ goto fail;
+ printf("okay\n");
+ }
+ else
+ printf("skipped\n");
+
printf("%-40s", "Testing movd %mm3,%ebx...");
if ( stack_exec && cpu_has_mmx )
{
@@ -1890,6 +2019,34 @@ int main(int argc, char **argv)
else
printf("skipped\n");
+ printf("%-40s", "Testing movd %ebx,%mm4...");
+ if ( stack_exec && cpu_has_mmx )
+ {
+ decl_insn(movd_from_reg);
+
+ /* See comment next to movd above. */
+ asm volatile ( "pcmpgtb %%mm4, %%mm4\n"
+ put_insn(movd_from_reg, "movd %%ebx, %%mm4")
+ :: );
+
+ set_insn(movd_from_reg);
+ rc = x86_emulate(&ctxt, &emulops);
+ if ( (rc != X86EMUL_OKAY) || !check_eip(movd_from_reg) )
+ goto fail;
+ asm ( "pxor %%mm2,%%mm2\n\t"
+ "pcmpeqb %%mm4, %%mm2\n\t"
+ "pmovmskb %%mm2, %0" : "=r" (rc) );
+ if ( rc != 0xf0 )
+ goto fail;
+ asm ( "pcmpeqb %%mm4, %%mm3\n\t"
+ "pmovmskb %%mm3, %0" : "=r" (rc) );
+ if ( rc != 0x0f )
+ goto fail;
+ printf("okay\n");
+ }
+ else
+ printf("skipped\n");
+
printf("%-40s", "Testing movd %xmm2,%ebx...");
if ( stack_exec && cpu_has_sse2 )
{
@@ -1915,6 +2072,35 @@ int main(int argc, char **argv)
else
printf("skipped\n");
+ printf("%-40s", "Testing movd %ebx,%xmm3...");
+ if ( stack_exec && cpu_has_sse2 )
+ {
+ decl_insn(movd_from_reg2);
+
+ /* See comment next to movd above. */
+ asm volatile ( "pcmpgtb %%xmm3, %%xmm3\n"
+ put_insn(movd_from_reg2, "movd %%ebx, %%xmm3")
+ :: );
+
+ set_insn(movd_from_reg2);
+ rc = x86_emulate(&ctxt, &emulops);
+ if ( (rc != X86EMUL_OKAY) || !check_eip(movd_from_reg2) )
+ goto fail;
+ asm ( "pxor %%xmm1,%%xmm1\n\t"
+ "pcmpeqb %%xmm3, %%xmm1\n\t"
+ "pmovmskb %%xmm1, %0" : "=r" (rc) );
+ if ( rc != 0xfff0 )
+ goto fail;
+ asm ( "pcmpeqb %%xmm2, %%xmm2\n\t"
+ "pcmpeqb %%xmm3, %%xmm2\n\t"
+ "pmovmskb %%xmm2, %0" : "=r" (rc) );
+ if ( rc != 0x000f )
+ goto fail;
+ printf("okay\n");
+ }
+ else
+ printf("skipped\n");
+
printf("%-40s", "Testing vmovd %xmm1,%ebx...");
if ( stack_exec && cpu_has_avx )
{
@@ -1937,6 +2123,35 @@ int main(int argc, char **argv)
goto fail;
printf("okay\n");
}
+ else
+ printf("skipped\n");
+
+ printf("%-40s", "Testing vmovd %ebx,%xmm2...");
+ if ( stack_exec && cpu_has_avx )
+ {
+ decl_insn(vmovd_from_reg);
+
+ /* See comment next to movd above. */
+ asm volatile ( "pcmpgtb %%xmm2, %%xmm2\n"
+ put_insn(vmovd_from_reg, "vmovd %%ebx, %%xmm2")
+ :: );
+
+ set_insn(vmovd_from_reg);
+ rc = x86_emulate(&ctxt, &emulops);
+ if ( (rc != X86EMUL_OKAY) || !check_eip(vmovd_from_reg) )
+ goto fail;
+ asm ( "pxor %%xmm0,%%xmm0\n\t"
+ "pcmpeqb %%xmm2, %%xmm0\n\t"
+ "pmovmskb %%xmm0, %0" : "=r" (rc) );
+ if ( rc != 0xfff0 )
+ goto fail;
+ asm ( "pcmpeqb %%xmm1, %%xmm1\n\t"
+ "pcmpeqb %%xmm2, %%xmm1\n\t"
+ "pmovmskb %%xmm1, %0" : "=r" (rc) );
+ if ( rc != 0x000f )
+ goto fail;
+ printf("okay\n");
+ }
else
printf("skipped\n");
--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -236,11 +236,11 @@ static const struct {
[0x0f] = { ModRM|SrcImmByte },
[0x10] = { DstImplicit|SrcMem|ModRM|Mov, simd_any_fp },
[0x11] = { DstMem|SrcImplicit|ModRM|Mov, simd_any_fp },
- [0x12] = { ImplicitOps|ModRM },
- [0x13] = { ImplicitOps|ModRM },
+ [0x12] = { DstImplicit|SrcMem|ModRM|Mov, simd_other },
+ [0x13] = { DstMem|SrcImplicit|ModRM|Mov, simd_other },
[0x14 ... 0x15] = { DstImplicit|SrcMem|ModRM, simd_packed_fp },
- [0x16] = { ImplicitOps|ModRM },
- [0x17] = { ImplicitOps|ModRM },
+ [0x16] = { DstImplicit|SrcMem|ModRM|Mov, simd_other },
+ [0x17] = { DstMem|SrcImplicit|ModRM|Mov, simd_other },
[0x18] = { ImplicitOps|ModRM },
[0x19] = { ImplicitOps|ModRM },
[0x1a] = { ImplicitOps|ModRM },
@@ -267,7 +267,7 @@ static const struct {
[0x38] = { DstReg|SrcMem|ModRM },
[0x3a] = { DstReg|SrcImmByte|ModRM },
[0x40 ... 0x4f] = { DstReg|SrcMem|ModRM|Mov },
- [0x50] = { ModRM },
+ [0x50] = { DstReg|SrcImplicit|ModRM|Mov },
[0x51] = { DstImplicit|SrcMem|ModRM|TwoOp, simd_any_fp },
[0x52 ... 0x53] = { DstImplicit|SrcMem|ModRM|TwoOp, simd_single_fp },
[0x54 ... 0x57] = { DstImplicit|SrcMem|ModRM, simd_packed_fp },
@@ -280,7 +280,8 @@ static const struct {
[0x68 ... 0x6a] = { DstImplicit|SrcMem|ModRM, simd_other },
[0x6b] = { DstImplicit|SrcMem|ModRM, simd_packed_int },
[0x6c ... 0x6d] = { DstImplicit|SrcMem|ModRM, simd_packed_int },
- [0x6e ... 0x6f] = { ImplicitOps|ModRM },
+ [0x6e] = { DstImplicit|SrcMem|ModRM|Mov },
+ [0x6f] = { DstImplicit|SrcMem|ModRM|Mov, simd_packed_int },
[0x70] = { SrcImmByte|ModRM|TwoOp, simd_other },
[0x71 ... 0x73] = { SrcImmByte|ModRM },
[0x74 ... 0x76] = { DstImplicit|SrcMem|ModRM, simd_packed_int },
@@ -289,7 +290,8 @@ static const struct {
[0x79] = { ModRM },
[0x7c] = { DstImplicit|SrcMem|ModRM, simd_other },
[0x7d] = { DstImplicit|SrcMem|ModRM, simd_other },
- [0x7e ... 0x7f] = { ImplicitOps|ModRM },
+ [0x7e] = { DstMem|SrcImplicit|ModRM|Mov },
+ [0x7f] = { DstMem|SrcImplicit|ModRM|Mov, simd_packed_int },
[0x80 ... 0x8f] = { DstImplicit|SrcImm },
[0x90 ... 0x9f] = { ByteOp|DstMem|SrcNone|ModRM|Mov },
[0xa0 ... 0xa1] = { ImplicitOps|Mov },
@@ -332,19 +334,19 @@ static const struct {
[0xd0] = { DstImplicit|SrcMem|ModRM, simd_other },
[0xd1 ... 0xd3] = { DstImplicit|SrcMem|ModRM, simd_other },
[0xd4 ... 0xd5] = { DstImplicit|SrcMem|ModRM, simd_packed_int },
- [0xd6] = { ImplicitOps|ModRM },
- [0xd7] = { ModRM },
+ [0xd6] = { DstMem|SrcImplicit|ModRM|Mov, simd_other },
+ [0xd7] = { DstReg|SrcImplicit|ModRM|Mov },
[0xd8 ... 0xdf] = { DstImplicit|SrcMem|ModRM, simd_packed_int },
[0xe0] = { DstImplicit|SrcMem|ModRM, simd_packed_int },
[0xe1 ... 0xe2] = { DstImplicit|SrcMem|ModRM, simd_other },
[0xe3 ... 0xe5] = { DstImplicit|SrcMem|ModRM, simd_packed_int },
[0xe6] = { ModRM },
- [0xe7] = { ImplicitOps|ModRM },
+ [0xe7] = { DstMem|SrcImplicit|ModRM|Mov, simd_packed_int },
[0xe8 ... 0xef] = { DstImplicit|SrcMem|ModRM, simd_packed_int },
[0xf0] = { DstImplicit|SrcMem|ModRM|Mov, simd_other },
[0xf1 ... 0xf3] = { DstImplicit|SrcMem|ModRM, simd_other },
[0xf4 ... 0xf6] = { DstImplicit|SrcMem|ModRM, simd_packed_int },
- [0xf7] = { ModRM },
+ [0xf7] = { DstMem|SrcMem|ModRM|Mov, simd_packed_int },
[0xf8 ... 0xfe] = { DstImplicit|SrcMem|ModRM, simd_packed_int },
[0xff] = { ModRM }
};
@@ -380,11 +382,6 @@ enum vex_pfx {
static const uint8_t sse_prefix[] = { 0x66, 0xf3, 0xf2 };
-#define SET_SSE_PREFIX(dst, vex_pfx) do { \
- if ( vex_pfx ) \
- (dst) = sse_prefix[(vex_pfx) - 1]; \
-} while (0)
-
union vex {
uint8_t raw[2];
struct {
@@ -399,15 +396,35 @@ union vex {
};
};
+#ifdef __x86_64__
+# define PFX2 REX_PREFIX
+#else
+# define PFX2 0x3e
+#endif
+#define PFX_BYTES 3
+#define init_prefixes(stub) ({ \
+ uint8_t *buf_ = get_stub(stub); \
+ buf_[0] = 0x3e; \
+ buf_[1] = PFX2; \
+ buf_[2] = 0x0f; \
+ buf_ + 3; \
+})
+
#define copy_REX_VEX(ptr, rex, vex) do { \
if ( (vex).opcx != vex_none ) \
{ \
if ( !mode_64bit() ) \
vex.reg |= 8; \
- ptr[0] = 0xc4, ptr[1] = (vex).raw[0], ptr[2] = (vex).raw[1]; \
+ (ptr)[0 - PFX_BYTES] = 0xc4; \
+ (ptr)[1 - PFX_BYTES] = (vex).raw[0]; \
+ (ptr)[2 - PFX_BYTES] = (vex).raw[1]; \
+ } \
+ else \
+ { \
+ if ( (vex).pfx ) \
+ (ptr)[0 - PFX_BYTES] = sse_prefix[(vex).pfx - 1]; \
+ (ptr)[1 - PFX_BYTES] |= rex; \
} \
- else if ( mode_64bit() ) \
- ptr[1] = rex | REX_PREFIX; \
} while (0)
union evex {
@@ -2180,7 +2197,8 @@ x86_decode_twobyte(
case 0x10 ... 0x18:
case 0x28 ... 0x2f:
case 0x50 ... 0x77:
- case 0x79 ... 0x7f:
+ case 0x79 ... 0x7d:
+ case 0x7f:
case 0xae:
case 0xc2 ... 0xc3:
case 0xc5 ... 0xc6:
@@ -2200,6 +2218,18 @@ x86_decode_twobyte(
op_bytes = mode_64bit() ? 8 : 4;
break;
+ case 0x7e:
+ ctxt->opcode |= MASK_INSR(vex.pfx, X86EMUL_OPC_PFX_MASK);
+ if ( vex.pfx == vex_f3 ) /* movq xmm/m64,xmm */
+ {
+ case X86EMUL_OPC_VEX_F3(0, 0x7e): /* vmovq xmm/m64,xmm */
+ state->desc = DstImplicit | SrcMem | ModRM | Mov;
+ state->simd_size = simd_other;
+ /* Avoid the state->desc adjustment below. */
+ return X86EMUL_OKAY;
+ }
+ break;
+
case 0xb8: /* jmpe / popcnt */
if ( rep_prefix() )
ctxt->opcode |= MASK_INSR(vex.pfx, X86EMUL_OPC_PFX_MASK);
@@ -2797,7 +2827,7 @@ x86_emulate(
struct cpu_user_regs _regs = *ctxt->regs;
struct x86_emulate_state state;
int rc;
- uint8_t b, d;
+ uint8_t b, d, *opc = NULL;
bool singlestep = (_regs._eflags & EFLG_TF) && !is_branch_step(ctxt, ops);
bool sfence = false;
struct operand src = { .reg = PTR_POISON };
@@ -5276,6 +5306,7 @@ x86_emulate(
CASE_SIMD_ALL_FP(_VEX, 0x0f, 0x5e): /* vdiv{p,s}{s,d} {x,y}mm/mem,{x,y}mm,{x,y}mm */
CASE_SIMD_ALL_FP(, 0x0f, 0x5f): /* max{p,s}{s,d} xmm/mem,xmm */
CASE_SIMD_ALL_FP(_VEX, 0x0f, 0x5f): /* vmax{p,s}{s,d} {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ simd_0f_fp:
if ( vex.opcx == vex_none )
{
if ( vex.pfx & VEX_PREFIX_DOUBLE_MASK )
@@ -5291,24 +5322,62 @@ x86_emulate(
get_fpu(X86EMUL_FPU_ymm, &fic);
}
simd_0f_common:
- {
- uint8_t *buf = get_stub(stub);
-
- buf[0] = 0x3e;
- buf[1] = 0x3e;
- buf[2] = 0x0f;
- buf[3] = b;
- buf[4] = modrm;
+ opc = init_prefixes(stub);
+ opc[0] = b;
+ opc[1] = modrm;
if ( ea.type == OP_MEM )
{
/* convert memory operand to (%rAX) */
rex_prefix &= ~REX_B;
vex.b = 1;
- buf[4] &= 0x38;
+ opc[1] &= 0x38;
}
- fic.insn_bytes = 5;
+ fic.insn_bytes = PFX_BYTES + 2;
break;
- }
+
+ case X86EMUL_OPC_66(0x0f, 0x12): /* movlpd m64,xmm */
+ case X86EMUL_OPC_VEX_66(0x0f, 0x12): /* vmovlpd m64,xmm,xmm */
+ CASE_SIMD_PACKED_FP(, 0x0f, 0x13): /* movlp{s,d} xmm,m64 */
+ CASE_SIMD_PACKED_FP(_VEX, 0x0f, 0x13): /* vmovlp{s,d} xmm,m64 */
+ case X86EMUL_OPC_66(0x0f, 0x16): /* movhpd m64,xmm */
+ case X86EMUL_OPC_VEX_66(0x0f, 0x16): /* vmovhpd m64,xmm,xmm */
+ CASE_SIMD_PACKED_FP(, 0x0f, 0x17): /* movhp{s,d} xmm,m64 */
+ CASE_SIMD_PACKED_FP(_VEX, 0x0f, 0x17): /* vmovhp{s,d} xmm,m64 */
+ generate_exception_if(ea.type != OP_MEM, EXC_UD);
+ /* fall through */
+ case X86EMUL_OPC(0x0f, 0x12): /* movlps m64,xmm */
+ /* movhlps xmm,xmm */
+ case X86EMUL_OPC_VEX(0x0f, 0x12): /* vmovlps m64,xmm,xmm */
+ /* vmovhlps xmm,xmm,xmm */
+ case X86EMUL_OPC(0x0f, 0x16): /* movhps m64,xmm */
+ /* movlhps xmm,xmm */
+ case X86EMUL_OPC_VEX(0x0f, 0x16): /* vmovhps m64,xmm,xmm */
+ /* vmovlhps xmm,xmm,xmm */
+ generate_exception_if(vex.l, EXC_UD);
+ d &= ~TwoOp;
+ op_bytes = 8;
+ goto simd_0f_fp;
+
+ case X86EMUL_OPC_F3(0x0f, 0x12): /* movsldup xmm/m128,xmm */
+ case X86EMUL_OPC_VEX_F3(0x0f, 0x12): /* vmovsldup {x,y}mm/mem,{x,y}mm */
+ case X86EMUL_OPC_F2(0x0f, 0x12): /* movddup xmm/m64,xmm */
+ case X86EMUL_OPC_VEX_F2(0x0f, 0x12): /* vmovddup {x,y}mm/mem,{x,y}mm */
+ case X86EMUL_OPC_F3(0x0f, 0x16): /* movshdup xmm/m128,xmm */
+ case X86EMUL_OPC_VEX_F3(0x0f, 0x16): /* vmovshdup {x,y}mm/mem,{x,y}mm */
+ d |= TwoOp;
+ op_bytes = !(vex.pfx & VEX_PREFIX_DOUBLE_MASK) || vex.l
+ ? 16 << vex.l : 8;
+ if ( vex.opcx == vex_none )
+ {
+ host_and_vcpu_must_have(sse3);
+ get_fpu(X86EMUL_FPU_xmm, &fic);
+ }
+ else
+ {
+ host_and_vcpu_must_have(avx);
+ get_fpu(X86EMUL_FPU_ymm, &fic);
+ }
+ goto simd_0f_common;
case X86EMUL_OPC(0x0f, 0x20): /* mov cr,reg */
case X86EMUL_OPC(0x0f, 0x21): /* mov dr,reg */
@@ -5469,6 +5538,57 @@ x86_emulate(
break;
}
+ CASE_SIMD_PACKED_FP(, 0x0f, 0x50): /* movmskp{s,d} xmm,reg */
+ CASE_SIMD_PACKED_FP(_VEX, 0x0f, 0x50): /* vmovmskp{s,d} {x,y}mm,reg */
+ CASE_SIMD_PACKED_INT(0x0f, 0xd7): /* pmovmskb {,x}mm,reg */
+ case X86EMUL_OPC_VEX_66(0x0f, 0xd7): /* vpmovmskb {x,y}mm,reg */
+ generate_exception_if(ea.type != OP_REG, EXC_UD);
+
+ if ( vex.opcx == vex_none )
+ {
+ if ( vex.pfx & VEX_PREFIX_DOUBLE_MASK )
+ vcpu_must_have(sse2);
+ else
+ {
+ if ( b != 0x50 )
+ host_and_vcpu_must_have(mmx);
+ vcpu_must_have(sse);
+ }
+ if ( b == 0x50 || (vex.pfx & VEX_PREFIX_DOUBLE_MASK) )
+ get_fpu(X86EMUL_FPU_xmm, &fic);
+ else
+ get_fpu(X86EMUL_FPU_mmx, &fic);
+ }
+ else
+ {
+ generate_exception_if(vex.reg != 0xf, EXC_UD);
+ if ( b == 0x50 || !vex.l )
+ host_and_vcpu_must_have(avx);
+ else
+ host_and_vcpu_must_have(avx2);
+ get_fpu(X86EMUL_FPU_ymm, &fic);
+ }
+
+ opc = init_prefixes(stub);
+ opc[0] = b;
+ /* Convert GPR destination to %rAX. */
+ rex_prefix &= ~REX_R;
+ vex.r = 1;
+ if ( !mode_64bit() )
+ vex.w = 0;
+ opc[1] = modrm & 0xc7;
+ fic.insn_bytes = PFX_BYTES + 2;
+ opc[2] = 0xc3;
+
+ copy_REX_VEX(opc, rex_prefix, vex);
+ invoke_stub("", "", "=a" (dst.val) : [dummy] "i" (0));
+
+ put_stub(stub);
+ put_fpu(&fic);
+
+ dst.bytes = 4;
+ break;
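+
+ /*
+ * Example (illustrative comment, not part of the original patch):
+ * movmskps %xmm2,%ebx has ModRM da; clearing the reg field yields c2,
+ * so the stub actually executes movmskps %xmm2,%eax and the mask comes
+ * back in dst.val via the "=a" constraint, to be committed to %ebx by
+ * the common writeback code (dst.bytes = 4).
+ */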
+
CASE_SIMD_PACKED_INT(0x0f, 0x60): /* punpcklbw {,x}mm/mem,{,x}mm */
case X86EMUL_OPC_VEX_66(0x0f, 0x60): /* vpunpcklbw {x,y}mm/mem,{x,y}mm,{x,y}mm */
[-- Attachment #2: x86emul-SSE-AVX-0f-mov.patch --]
[-- Type: text/plain, Size: 38982 bytes --]
x86emul: support MMX/SSE/SSE2 moves
Previously supported insns are being converted to the new model, and
several new ones are being added.
To keep the stub handling reasonably simple, integrate SET_SSE_PREFIX()
into copy_REX_VEX(), at once switching the stubs to use an empty REX
prefix instead of a pair of DS: overrides: no byte registers are being
accessed, so an empty REX prefix has no effect. The exception is (of
course) the 32-bit test harness build, which has no REX prefixes and
hence keeps using DS:.
Signed-off-by: Jan Beulich <jbeulich@suse.com>
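
As a rough illustration of the resulting stub layout, the following
standalone sketch (not part of the patch; the buffer is a plain array
rather than the emulator's real stub mapping) mimics what
init_prefixes() and copy_REX_VEX() produce for a 66-prefixed, non-VEX
movdqa:

#include <stdint.h>
#include <stdio.h>

#define PFX_BYTES  3
#define REX_PREFIX 0x40   /* empty REX prefix; meaningful in 64-bit mode only */

int main(void)
{
    uint8_t stub[16];
    uint8_t *opc;
    uint8_t rex = 0;      /* REX bits collected during decode; none here */

    /* init_prefixes(): fixed three-byte prefix area, opcode bytes follow. */
    stub[0] = 0x3e;       /* DS: override, possibly replaced by an SSE prefix */
    stub[1] = REX_PREFIX; /* empty REX (a second 0x3e in a 32-bit build) */
    stub[2] = 0x0f;       /* two-byte opcode escape */
    opc = stub + PFX_BYTES;

    opc[0] = 0x6f;        /* movdqa */
    opc[1] = 0x00;        /* ModRM: (%rAX), as produced by "opc[1] &= 0x38" */
    opc[2] = 0xc3;        /* ret */

    /* copy_REX_VEX(), non-VEX path: merge the SSE prefix and the REX bits
     * into the prefix area via negative offsets from the opcode pointer. */
    opc[0 - PFX_BYTES] = 0x66;    /* sse_prefix[vex_66 - 1] */
    opc[1 - PFX_BYTES] |= rex;

    for ( unsigned int i = 0; i < PFX_BYTES + 3; ++i )
        printf("%02x ", stub[i]); /* 66 40 0f 6f 00 c3 */
    printf("\n");
    return 0;
}
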
--- a/tools/tests/x86_emulator/test_x86_emulator.c
+++ b/tools/tests/x86_emulator/test_x86_emulator.c
@@ -1557,6 +1557,29 @@ int main(int argc, char **argv)
else
printf("skipped\n");
+ printf("%-40s", "Testing movq 32(%ecx),%xmm1...");
+ if ( stack_exec && cpu_has_sse2 )
+ {
+ decl_insn(movq_from_mem2);
+
+ asm volatile ( "pcmpeqb %%xmm1, %%xmm1\n"
+ put_insn(movq_from_mem2, "movq 32(%0), %%xmm1")
+ :: "c" (NULL) );
+
+ set_insn(movq_from_mem2);
+ rc = x86_emulate(&ctxt, &emulops);
+ if ( rc != X86EMUL_OKAY || !check_eip(movq_from_mem2) )
+ goto fail;
+ asm ( "pcmpgtb %%xmm0, %%xmm0\n\t"
+ "pcmpeqb %%xmm1, %%xmm0\n\t"
+ "pmovmskb %%xmm0, %0" : "=r" (rc) );
+ if ( rc != 0xffff )
+ goto fail;
+ printf("okay\n");
+ }
+ else
+ printf("skipped\n");
+
printf("%-40s", "Testing vmovq %xmm1,32(%edx)...");
if ( stack_exec && cpu_has_avx )
{
@@ -1581,6 +1604,29 @@ int main(int argc, char **argv)
else
printf("skipped\n");
+ printf("%-40s", "Testing vmovq 32(%edx),%xmm0...");
+ if ( stack_exec && cpu_has_avx )
+ {
+ decl_insn(vmovq_from_mem);
+
+ asm volatile ( "pcmpeqb %%xmm0, %%xmm0\n"
+ put_insn(vmovq_from_mem, "vmovq 32(%0), %%xmm0")
+ :: "d" (NULL) );
+
+ set_insn(vmovq_from_mem);
+ rc = x86_emulate(&ctxt, &emulops);
+ if ( rc != X86EMUL_OKAY || !check_eip(vmovq_from_mem) )
+ goto fail;
+ asm ( "pcmpgtb %%xmm1, %%xmm1\n\t"
+ "pcmpeqb %%xmm0, %%xmm1\n\t"
+ "pmovmskb %%xmm1, %0" : "=r" (rc) );
+ if ( rc != 0xffff )
+ goto fail;
+ printf("okay\n");
+ }
+ else
+ printf("skipped\n");
+
printf("%-40s", "Testing movdqu %xmm2,(%ecx)...");
if ( stack_exec && cpu_has_sse2 )
{
@@ -1812,6 +1858,33 @@ int main(int argc, char **argv)
else
printf("skipped\n");
+ printf("%-40s", "Testing movd 32(%ecx),%mm4...");
+ if ( stack_exec && cpu_has_mmx )
+ {
+ decl_insn(movd_from_mem);
+
+ asm volatile ( "pcmpgtb %%mm4, %%mm4\n"
+ put_insn(movd_from_mem, "movd 32(%0), %%mm4")
+ :: "c" (NULL) );
+
+ set_insn(movd_from_mem);
+ rc = x86_emulate(&ctxt, &emulops);
+ if ( rc != X86EMUL_OKAY || !check_eip(movd_from_mem) )
+ goto fail;
+ asm ( "pxor %%mm2,%%mm2\n\t"
+ "pcmpeqb %%mm4, %%mm2\n\t"
+ "pmovmskb %%mm2, %0" : "=r" (rc) );
+ if ( rc != 0xf0 )
+ goto fail;
+ asm ( "pcmpeqb %%mm4, %%mm3\n\t"
+ "pmovmskb %%mm3, %0" : "=r" (rc) );
+ if ( rc != 0x0f )
+ goto fail;
+ printf("okay\n");
+ }
+ else
+ printf("skipped\n");
+
printf("%-40s", "Testing movd %xmm2,32(%edx)...");
if ( stack_exec && cpu_has_sse2 )
{
@@ -1836,6 +1909,34 @@ int main(int argc, char **argv)
else
printf("skipped\n");
+ printf("%-40s", "Testing movd 32(%edx),%xmm3...");
+ if ( stack_exec && cpu_has_sse2 )
+ {
+ decl_insn(movd_from_mem2);
+
+ asm volatile ( "pcmpeqb %%xmm3, %%xmm3\n"
+ put_insn(movd_from_mem2, "movd 32(%0), %%xmm3")
+ :: "d" (NULL) );
+
+ set_insn(movd_from_mem2);
+ rc = x86_emulate(&ctxt, &emulops);
+ if ( rc != X86EMUL_OKAY || !check_eip(movd_from_mem2) )
+ goto fail;
+ asm ( "pxor %%xmm1,%%xmm1\n\t"
+ "pcmpeqb %%xmm3, %%xmm1\n\t"
+ "pmovmskb %%xmm1, %0" : "=r" (rc) );
+ if ( rc != 0xfff0 )
+ goto fail;
+ asm ( "pcmpeqb %%xmm2, %%xmm2\n\t"
+ "pcmpeqb %%xmm3, %%xmm2\n\t"
+ "pmovmskb %%xmm2, %0" : "=r" (rc) );
+ if ( rc != 0x000f )
+ goto fail;
+ printf("okay\n");
+ }
+ else
+ printf("skipped\n");
+
printf("%-40s", "Testing vmovd %xmm1,32(%ecx)...");
if ( stack_exec && cpu_has_avx )
{
@@ -1860,6 +1961,34 @@ int main(int argc, char **argv)
else
printf("skipped\n");
+ printf("%-40s", "Testing vmovd 32(%ecx),%xmm2...");
+ if ( stack_exec && cpu_has_avx )
+ {
+ decl_insn(vmovd_from_mem);
+
+ asm volatile ( "pcmpeqb %%xmm2, %%xmm2\n"
+ put_insn(vmovd_from_mem, "vmovd 32(%0), %%xmm2")
+ :: "c" (NULL) );
+
+ set_insn(vmovd_from_mem);
+ rc = x86_emulate(&ctxt, &emulops);
+ if ( rc != X86EMUL_OKAY || !check_eip(vmovd_from_mem) )
+ goto fail;
+ asm ( "pxor %%xmm0,%%xmm0\n\t"
+ "pcmpeqb %%xmm2, %%xmm0\n\t"
+ "pmovmskb %%xmm0, %0" : "=r" (rc) );
+ if ( rc != 0xfff0 )
+ goto fail;
+ asm ( "pcmpeqb %%xmm1, %%xmm1\n\t"
+ "pcmpeqb %%xmm2, %%xmm1\n\t"
+ "pmovmskb %%xmm1, %0" : "=r" (rc) );
+ if ( rc != 0x000f )
+ goto fail;
+ printf("okay\n");
+ }
+ else
+ printf("skipped\n");
+
printf("%-40s", "Testing movd %mm3,%ebx...");
if ( stack_exec && cpu_has_mmx )
{
@@ -1890,6 +2019,34 @@ int main(int argc, char **argv)
else
printf("skipped\n");
+ printf("%-40s", "Testing movd %ebx,%mm4...");
+ if ( stack_exec && cpu_has_mmx )
+ {
+ decl_insn(movd_from_reg);
+
+ /* See comment next to movd above. */
+ asm volatile ( "pcmpgtb %%mm4, %%mm4\n"
+ put_insn(movd_from_reg, "movd %%ebx, %%mm4")
+ :: );
+
+ set_insn(movd_from_reg);
+ rc = x86_emulate(&ctxt, &emulops);
+ if ( (rc != X86EMUL_OKAY) || !check_eip(movd_from_reg) )
+ goto fail;
+ asm ( "pxor %%mm2,%%mm2\n\t"
+ "pcmpeqb %%mm4, %%mm2\n\t"
+ "pmovmskb %%mm2, %0" : "=r" (rc) );
+ if ( rc != 0xf0 )
+ goto fail;
+ asm ( "pcmpeqb %%mm4, %%mm3\n\t"
+ "pmovmskb %%mm3, %0" : "=r" (rc) );
+ if ( rc != 0x0f )
+ goto fail;
+ printf("okay\n");
+ }
+ else
+ printf("skipped\n");
+
printf("%-40s", "Testing movd %xmm2,%ebx...");
if ( stack_exec && cpu_has_sse2 )
{
@@ -1915,6 +2072,35 @@ int main(int argc, char **argv)
else
printf("skipped\n");
+ printf("%-40s", "Testing movd %ebx,%xmm3...");
+ if ( stack_exec && cpu_has_sse2 )
+ {
+ decl_insn(movd_from_reg2);
+
+ /* See comment next to movd above. */
+ asm volatile ( "pcmpgtb %%xmm3, %%xmm3\n"
+ put_insn(movd_from_reg2, "movd %%ebx, %%xmm3")
+ :: );
+
+ set_insn(movd_from_reg2);
+ rc = x86_emulate(&ctxt, &emulops);
+ if ( (rc != X86EMUL_OKAY) || !check_eip(movd_from_reg2) )
+ goto fail;
+ asm ( "pxor %%xmm1,%%xmm1\n\t"
+ "pcmpeqb %%xmm3, %%xmm1\n\t"
+ "pmovmskb %%xmm1, %0" : "=r" (rc) );
+ if ( rc != 0xfff0 )
+ goto fail;
+ asm ( "pcmpeqb %%xmm2, %%xmm2\n\t"
+ "pcmpeqb %%xmm3, %%xmm2\n\t"
+ "pmovmskb %%xmm2, %0" : "=r" (rc) );
+ if ( rc != 0x000f )
+ goto fail;
+ printf("okay\n");
+ }
+ else
+ printf("skipped\n");
+
printf("%-40s", "Testing vmovd %xmm1,%ebx...");
if ( stack_exec && cpu_has_avx )
{
@@ -1937,6 +2123,35 @@ int main(int argc, char **argv)
goto fail;
printf("okay\n");
}
+ else
+ printf("skipped\n");
+
+ printf("%-40s", "Testing vmovd %ebx,%xmm2...");
+ if ( stack_exec && cpu_has_avx )
+ {
+ decl_insn(vmovd_from_reg);
+
+ /* See comment next to movd above. */
+ asm volatile ( "pcmpgtb %%xmm2, %%xmm2\n"
+ put_insn(vmovd_from_reg, "vmovd %%ebx, %%xmm2")
+ :: );
+
+ set_insn(vmovd_from_reg);
+ rc = x86_emulate(&ctxt, &emulops);
+ if ( (rc != X86EMUL_OKAY) || !check_eip(vmovd_from_reg) )
+ goto fail;
+ asm ( "pxor %%xmm0,%%xmm0\n\t"
+ "pcmpeqb %%xmm2, %%xmm0\n\t"
+ "pmovmskb %%xmm0, %0" : "=r" (rc) );
+ if ( rc != 0xfff0 )
+ goto fail;
+ asm ( "pcmpeqb %%xmm1, %%xmm1\n\t"
+ "pcmpeqb %%xmm2, %%xmm1\n\t"
+ "pmovmskb %%xmm1, %0" : "=r" (rc) );
+ if ( rc != 0x000f )
+ goto fail;
+ printf("okay\n");
+ }
else
printf("skipped\n");
--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -236,11 +236,11 @@ static const struct {
[0x0f] = { ModRM|SrcImmByte },
[0x10] = { DstImplicit|SrcMem|ModRM|Mov, simd_any_fp },
[0x11] = { DstMem|SrcImplicit|ModRM|Mov, simd_any_fp },
- [0x12] = { ImplicitOps|ModRM },
- [0x13] = { ImplicitOps|ModRM },
+ [0x12] = { DstImplicit|SrcMem|ModRM|Mov, simd_other },
+ [0x13] = { DstMem|SrcImplicit|ModRM|Mov, simd_other },
[0x14 ... 0x15] = { DstImplicit|SrcMem|ModRM, simd_packed_fp },
- [0x16] = { ImplicitOps|ModRM },
- [0x17] = { ImplicitOps|ModRM },
+ [0x16] = { DstImplicit|SrcMem|ModRM|Mov, simd_other },
+ [0x17] = { DstMem|SrcImplicit|ModRM|Mov, simd_other },
[0x18] = { ImplicitOps|ModRM },
[0x19] = { ImplicitOps|ModRM },
[0x1a] = { ImplicitOps|ModRM },
@@ -267,7 +267,7 @@ static const struct {
[0x38] = { DstReg|SrcMem|ModRM },
[0x3a] = { DstReg|SrcImmByte|ModRM },
[0x40 ... 0x4f] = { DstReg|SrcMem|ModRM|Mov },
- [0x50] = { ModRM },
+ [0x50] = { DstReg|SrcImplicit|ModRM|Mov },
[0x51] = { DstImplicit|SrcMem|ModRM|TwoOp, simd_any_fp },
[0x52 ... 0x53] = { DstImplicit|SrcMem|ModRM|TwoOp, simd_single_fp },
[0x54 ... 0x57] = { DstImplicit|SrcMem|ModRM, simd_packed_fp },
@@ -280,7 +280,8 @@ static const struct {
[0x68 ... 0x6a] = { DstImplicit|SrcMem|ModRM, simd_other },
[0x6b] = { DstImplicit|SrcMem|ModRM, simd_packed_int },
[0x6c ... 0x6d] = { DstImplicit|SrcMem|ModRM, simd_packed_int },
- [0x6e ... 0x6f] = { ImplicitOps|ModRM },
+ [0x6e] = { DstImplicit|SrcMem|ModRM|Mov },
+ [0x6f] = { DstImplicit|SrcMem|ModRM|Mov, simd_packed_int },
[0x70] = { SrcImmByte|ModRM|TwoOp, simd_other },
[0x71 ... 0x73] = { SrcImmByte|ModRM },
[0x74 ... 0x76] = { DstImplicit|SrcMem|ModRM, simd_packed_int },
@@ -289,7 +290,8 @@ static const struct {
[0x79] = { ModRM },
[0x7c] = { DstImplicit|SrcMem|ModRM, simd_other },
[0x7d] = { DstImplicit|SrcMem|ModRM, simd_other },
- [0x7e ... 0x7f] = { ImplicitOps|ModRM },
+ [0x7e] = { DstMem|SrcImplicit|ModRM|Mov },
+ [0x7f] = { DstMem|SrcImplicit|ModRM|Mov, simd_packed_int },
[0x80 ... 0x8f] = { DstImplicit|SrcImm },
[0x90 ... 0x9f] = { ByteOp|DstMem|SrcNone|ModRM|Mov },
[0xa0 ... 0xa1] = { ImplicitOps|Mov },
@@ -332,19 +334,19 @@ static const struct {
[0xd0] = { DstImplicit|SrcMem|ModRM, simd_other },
[0xd1 ... 0xd3] = { DstImplicit|SrcMem|ModRM, simd_other },
[0xd4 ... 0xd5] = { DstImplicit|SrcMem|ModRM, simd_packed_int },
- [0xd6] = { ImplicitOps|ModRM },
- [0xd7] = { ModRM },
+ [0xd6] = { DstMem|SrcImplicit|ModRM|Mov, simd_other },
+ [0xd7] = { DstReg|SrcImplicit|ModRM|Mov },
[0xd8 ... 0xdf] = { DstImplicit|SrcMem|ModRM, simd_packed_int },
[0xe0] = { DstImplicit|SrcMem|ModRM, simd_packed_int },
[0xe1 ... 0xe2] = { DstImplicit|SrcMem|ModRM, simd_other },
[0xe3 ... 0xe5] = { DstImplicit|SrcMem|ModRM, simd_packed_int },
[0xe6] = { ModRM },
- [0xe7] = { ImplicitOps|ModRM },
+ [0xe7] = { DstMem|SrcImplicit|ModRM|Mov, simd_packed_int },
[0xe8 ... 0xef] = { DstImplicit|SrcMem|ModRM, simd_packed_int },
[0xf0] = { DstImplicit|SrcMem|ModRM|Mov, simd_other },
[0xf1 ... 0xf3] = { DstImplicit|SrcMem|ModRM, simd_other },
[0xf4 ... 0xf6] = { DstImplicit|SrcMem|ModRM, simd_packed_int },
- [0xf7] = { ModRM },
+ [0xf7] = { DstMem|SrcMem|ModRM|Mov, simd_packed_int },
[0xf8 ... 0xfe] = { DstImplicit|SrcMem|ModRM, simd_packed_int },
[0xff] = { ModRM }
};
@@ -380,11 +382,6 @@ enum vex_pfx {
static const uint8_t sse_prefix[] = { 0x66, 0xf3, 0xf2 };
-#define SET_SSE_PREFIX(dst, vex_pfx) do { \
- if ( vex_pfx ) \
- (dst) = sse_prefix[(vex_pfx) - 1]; \
-} while (0)
-
union vex {
uint8_t raw[2];
struct {
@@ -399,15 +396,35 @@ union vex {
};
};
+#ifdef __x86_64__
+# define PFX2 REX_PREFIX
+#else
+# define PFX2 0x3e
+#endif
+#define PFX_BYTES 3
+#define init_prefixes(stub) ({ \
+ uint8_t *buf_ = get_stub(stub); \
+ buf_[0] = 0x3e; \
+ buf_[1] = PFX2; \
+ buf_[2] = 0x0f; \
+ buf_ + 3; \
+})
+
#define copy_REX_VEX(ptr, rex, vex) do { \
if ( (vex).opcx != vex_none ) \
{ \
if ( !mode_64bit() ) \
vex.reg |= 8; \
- ptr[0] = 0xc4, ptr[1] = (vex).raw[0], ptr[2] = (vex).raw[1]; \
+ (ptr)[0 - PFX_BYTES] = 0xc4; \
+ (ptr)[1 - PFX_BYTES] = (vex).raw[0]; \
+ (ptr)[2 - PFX_BYTES] = (vex).raw[1]; \
+ } \
+ else \
+ { \
+ if ( (vex).pfx ) \
+ (ptr)[0 - PFX_BYTES] = sse_prefix[(vex).pfx - 1]; \
+ (ptr)[1 - PFX_BYTES] |= rex; \
} \
- else if ( mode_64bit() ) \
- ptr[1] = rex | REX_PREFIX; \
} while (0)
union evex {
@@ -2180,7 +2197,8 @@ x86_decode_twobyte(
case 0x10 ... 0x18:
case 0x28 ... 0x2f:
case 0x50 ... 0x77:
- case 0x79 ... 0x7f:
+ case 0x79 ... 0x7d:
+ case 0x7f:
case 0xae:
case 0xc2 ... 0xc3:
case 0xc5 ... 0xc6:
@@ -2200,6 +2218,18 @@ x86_decode_twobyte(
op_bytes = mode_64bit() ? 8 : 4;
break;
+ case 0x7e:
+ ctxt->opcode |= MASK_INSR(vex.pfx, X86EMUL_OPC_PFX_MASK);
+ if ( vex.pfx == vex_f3 ) /* movq xmm/m64,xmm */
+ {
+ case X86EMUL_OPC_VEX_F3(0, 0x7e): /* vmovq xmm/m64,xmm */
+ state->desc = DstImplicit | SrcMem | ModRM | Mov;
+ state->simd_size = simd_other;
+ /* Avoid the state->desc adjustment below. */
+ return X86EMUL_OKAY;
+ }
+ break;
+
case 0xb8: /* jmpe / popcnt */
if ( rep_prefix() )
ctxt->opcode |= MASK_INSR(vex.pfx, X86EMUL_OPC_PFX_MASK);
@@ -2797,7 +2827,7 @@ x86_emulate(
struct cpu_user_regs _regs = *ctxt->regs;
struct x86_emulate_state state;
int rc;
- uint8_t b, d;
+ uint8_t b, d, *opc = NULL;
bool singlestep = (_regs._eflags & EFLG_TF) && !is_branch_step(ctxt, ops);
bool sfence = false;
struct operand src = { .reg = PTR_POISON };
@@ -5276,6 +5306,7 @@ x86_emulate(
CASE_SIMD_ALL_FP(_VEX, 0x0f, 0x5e): /* vdiv{p,s}{s,d} {x,y}mm/mem,{x,y}mm,{x,y}mm */
CASE_SIMD_ALL_FP(, 0x0f, 0x5f): /* max{p,s}{s,d} xmm/mem,xmm */
CASE_SIMD_ALL_FP(_VEX, 0x0f, 0x5f): /* vmax{p,s}{s,d} {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ simd_0f_fp:
if ( vex.opcx == vex_none )
{
if ( vex.pfx & VEX_PREFIX_DOUBLE_MASK )
@@ -5291,24 +5322,62 @@ x86_emulate(
get_fpu(X86EMUL_FPU_ymm, &fic);
}
simd_0f_common:
- {
- uint8_t *buf = get_stub(stub);
-
- buf[0] = 0x3e;
- buf[1] = 0x3e;
- buf[2] = 0x0f;
- buf[3] = b;
- buf[4] = modrm;
+ opc = init_prefixes(stub);
+ opc[0] = b;
+ opc[1] = modrm;
if ( ea.type == OP_MEM )
{
/* convert memory operand to (%rAX) */
rex_prefix &= ~REX_B;
vex.b = 1;
- buf[4] &= 0x38;
+ opc[1] &= 0x38;
}
- fic.insn_bytes = 5;
+ fic.insn_bytes = PFX_BYTES + 2;
break;
- }
+
+ case X86EMUL_OPC_66(0x0f, 0x12): /* movlpd m64,xmm */
+ case X86EMUL_OPC_VEX_66(0x0f, 0x12): /* vmovlpd m64,xmm,xmm */
+ CASE_SIMD_PACKED_FP(, 0x0f, 0x13): /* movlp{s,d} xmm,m64 */
+ CASE_SIMD_PACKED_FP(_VEX, 0x0f, 0x13): /* vmovlp{s,d} xmm,m64 */
+ case X86EMUL_OPC_66(0x0f, 0x16): /* movhpd m64,xmm */
+ case X86EMUL_OPC_VEX_66(0x0f, 0x16): /* vmovhpd m64,xmm,xmm */
+ CASE_SIMD_PACKED_FP(, 0x0f, 0x17): /* movhp{s,d} xmm,m64 */
+ CASE_SIMD_PACKED_FP(_VEX, 0x0f, 0x17): /* vmovhp{s,d} xmm,m64 */
+ generate_exception_if(ea.type != OP_MEM, EXC_UD);
+ /* fall through */
+ case X86EMUL_OPC(0x0f, 0x12): /* movlps m64,xmm */
+ /* movhlps xmm,xmm */
+ case X86EMUL_OPC_VEX(0x0f, 0x12): /* vmovlps m64,xmm,xmm */
+ /* vmovhlps xmm,xmm,xmm */
+ case X86EMUL_OPC(0x0f, 0x16): /* movhps m64,xmm */
+ /* movlhps xmm,xmm */
+ case X86EMUL_OPC_VEX(0x0f, 0x16): /* vmovhps m64,xmm,xmm */
+ /* vmovlhps xmm,xmm,xmm */
+ generate_exception_if(vex.l, EXC_UD);
+ d &= ~TwoOp;
+ op_bytes = 8;
+ goto simd_0f_fp;
+
+ case X86EMUL_OPC_F3(0x0f, 0x12): /* movsldup xmm/m128,xmm */
+ case X86EMUL_OPC_VEX_F3(0x0f, 0x12): /* vmovsldup {x,y}mm/mem,{x,y}mm */
+ case X86EMUL_OPC_F2(0x0f, 0x12): /* movddup xmm/m64,xmm */
+ case X86EMUL_OPC_VEX_F2(0x0f, 0x12): /* vmovddup {x,y}mm/mem,{x,y}mm */
+ case X86EMUL_OPC_F3(0x0f, 0x16): /* movshdup xmm/m128,xmm */
+ case X86EMUL_OPC_VEX_F3(0x0f, 0x16): /* vmovshdup {x,y}mm/mem,{x,y}mm */
+ d |= TwoOp;
+ op_bytes = !(vex.pfx & VEX_PREFIX_DOUBLE_MASK) || vex.l
+ ? 16 << vex.l : 8;
+ if ( vex.opcx == vex_none )
+ {
+ host_and_vcpu_must_have(sse3);
+ get_fpu(X86EMUL_FPU_xmm, &fic);
+ }
+ else
+ {
+ host_and_vcpu_must_have(avx);
+ get_fpu(X86EMUL_FPU_ymm, &fic);
+ }
+ goto simd_0f_common;
case X86EMUL_OPC(0x0f, 0x20): /* mov cr,reg */
case X86EMUL_OPC(0x0f, 0x21): /* mov dr,reg */
@@ -5469,6 +5538,57 @@ x86_emulate(
break;
}
+ CASE_SIMD_PACKED_FP(, 0x0f, 0x50): /* movmskp{s,d} xmm,reg */
+ CASE_SIMD_PACKED_FP(_VEX, 0x0f, 0x50): /* vmovmskp{s,d} {x,y}mm,reg */
+ CASE_SIMD_PACKED_INT(0x0f, 0xd7): /* pmovmskb {,x}mm,reg */
+ case X86EMUL_OPC_VEX_66(0x0f, 0xd7): /* vpmovmskb {x,y}mm,reg */
+ generate_exception_if(ea.type != OP_REG, EXC_UD);
+
+ if ( vex.opcx == vex_none )
+ {
+ if ( vex.pfx & VEX_PREFIX_DOUBLE_MASK )
+ vcpu_must_have(sse2);
+ else
+ {
+ if ( b != 0x50 )
+ host_and_vcpu_must_have(mmx);
+ vcpu_must_have(sse);
+ }
+ if ( b == 0x50 || (vex.pfx & VEX_PREFIX_DOUBLE_MASK) )
+ get_fpu(X86EMUL_FPU_xmm, &fic);
+ else
+ get_fpu(X86EMUL_FPU_mmx, &fic);
+ }
+ else
+ {
+ generate_exception_if(vex.reg != 0xf, EXC_UD);
+ if ( b == 0x50 || !vex.l )
+ host_and_vcpu_must_have(avx);
+ else
+ host_and_vcpu_must_have(avx2);
+ get_fpu(X86EMUL_FPU_ymm, &fic);
+ }
+
+ opc = init_prefixes(stub);
+ opc[0] = b;
+ /* Convert GPR destination to %rAX. */
+ rex_prefix &= ~REX_R;
+ vex.r = 1;
+ if ( !mode_64bit() )
+ vex.w = 0;
+ opc[1] = modrm & 0xc7;
+ fic.insn_bytes = PFX_BYTES + 2;
+ opc[2] = 0xc3;
+
+ copy_REX_VEX(opc, rex_prefix, vex);
+ invoke_stub("", "", "=a" (dst.val) : [dummy] "i" (0));
+
+ put_stub(stub);
+ put_fpu(&fic);
+
+ dst.bytes = 4;
+ break;
+
CASE_SIMD_PACKED_INT(0x0f, 0x60): /* punpcklbw {,x}mm/mem,{,x}mm */
case X86EMUL_OPC_VEX_66(0x0f, 0x60): /* vpunpcklbw {x,y}mm/mem,{x,y}mm,{x,y}mm */
CASE_SIMD_PACKED_INT(0x0f, 0x61): /* punpcklwd {,x}mm/mem,{,x}mm */
@@ -5588,134 +5708,72 @@ x86_emulate(
}
goto simd_0f_common;
- case X86EMUL_OPC(0x0f, 0xe7): /* movntq mm,m64 */
- case X86EMUL_OPC_66(0x0f, 0xe7): /* movntdq xmm,m128 */
- case X86EMUL_OPC_VEX_66(0x0f, 0xe7): /* vmovntdq xmm,m128 */
- /* vmovntdq ymm,m256 */
- fail_if(ea.type != OP_MEM);
- /* fall through */
- case X86EMUL_OPC(0x0f, 0x6f): /* movq mm/m64,mm */
- case X86EMUL_OPC_66(0x0f, 0x6f): /* movdqa xmm/m128,xmm */
- case X86EMUL_OPC_F3(0x0f, 0x6f): /* movdqu xmm/m128,xmm */
- case X86EMUL_OPC_VEX_66(0x0f, 0x6f): /* vmovdqa xmm/m128,xmm */
- /* vmovdqa ymm/m256,ymm */
- case X86EMUL_OPC_VEX_F3(0x0f, 0x6f): /* vmovdqu xmm/m128,xmm */
- /* vmovdqu ymm/m256,ymm */
- case X86EMUL_OPC(0x0f, 0x7e): /* movd mm,r/m32 */
- /* movq mm,r/m64 */
- case X86EMUL_OPC_66(0x0f, 0x7e): /* movd xmm,r/m32 */
- /* movq xmm,r/m64 */
- case X86EMUL_OPC_VEX_66(0x0f, 0x7e): /* vmovd xmm,r/m32 */
- /* vmovq xmm,r/m64 */
- case X86EMUL_OPC(0x0f, 0x7f): /* movq mm,mm/m64 */
- case X86EMUL_OPC_66(0x0f, 0x7f): /* movdqa xmm,xmm/m128 */
- case X86EMUL_OPC_VEX_66(0x0f, 0x7f): /* vmovdqa xmm,xmm/m128 */
- /* vmovdqa ymm,ymm/m256 */
- case X86EMUL_OPC_F3(0x0f, 0x7f): /* movdqu xmm,xmm/m128 */
- case X86EMUL_OPC_VEX_F3(0x0f, 0x7f): /* vmovdqu xmm,xmm/m128 */
- /* vmovdqu ymm,ymm/m256 */
- case X86EMUL_OPC_66(0x0f, 0xd6): /* movq xmm,xmm/m64 */
- case X86EMUL_OPC_VEX_66(0x0f, 0xd6): /* vmovq xmm,xmm/m64 */
- {
- uint8_t *buf = get_stub(stub);
-
- fic.insn_bytes = 5;
- buf[0] = 0x3e;
- buf[1] = 0x3e;
- buf[2] = 0x0f;
- buf[3] = b;
- buf[4] = modrm;
- buf[5] = 0xc3;
- if ( vex.opcx == vex_none )
- {
- switch ( vex.pfx )
- {
- case vex_66:
- case vex_f3:
- vcpu_must_have(sse2);
- /* Converting movdqu to movdqa here: Our buffer is aligned. */
- buf[0] = 0x66;
- get_fpu(X86EMUL_FPU_xmm, &fic);
- ea.bytes = 16;
- break;
- case vex_none:
- if ( b != 0xe7 )
- host_and_vcpu_must_have(mmx);
- else
- vcpu_must_have(sse);
- get_fpu(X86EMUL_FPU_mmx, &fic);
- ea.bytes = 8;
- break;
- default:
- goto cannot_emulate;
- }
- }
- else
+ CASE_SIMD_PACKED_INT(0x0f, 0x6e): /* mov{d,q} r/m,{,x}mm */
+ case X86EMUL_OPC_VEX_66(0x0f, 0x6e): /* vmov{d,q} r/m,xmm */
+ CASE_SIMD_PACKED_INT(0x0f, 0x7e): /* mov{d,q} {,x}mm,r/m */
+ case X86EMUL_OPC_VEX_66(0x0f, 0x7e): /* vmov{d,q} xmm,r/m */
+ if ( vex.opcx != vex_none )
{
- fail_if(vex.reg != 0xf);
+ generate_exception_if(vex.l || vex.reg != 0xf, EXC_UD);
host_and_vcpu_must_have(avx);
get_fpu(X86EMUL_FPU_ymm, &fic);
- ea.bytes = 16 << vex.l;
}
- switch ( b )
+ else if ( vex.pfx )
{
- case 0x7e:
- generate_exception_if(vex.l, EXC_UD);
- ea.bytes = op_bytes;
- break;
- case 0xd6:
- generate_exception_if(vex.l, EXC_UD);
- ea.bytes = 8;
- break;
+ vcpu_must_have(sse2);
+ get_fpu(X86EMUL_FPU_xmm, &fic);
}
- if ( ea.type == OP_MEM )
+ else
{
- uint32_t mxcsr = 0;
-
- if ( ea.bytes < 16 || vex.pfx == vex_f3 )
- mxcsr = MXCSR_MM;
- else if ( vcpu_has_misalignsse() )
- asm ( "stmxcsr %0" : "=m" (mxcsr) );
- generate_exception_if(!(mxcsr & MXCSR_MM) &&
- !is_aligned(ea.mem.seg, ea.mem.off, ea.bytes,
- ctxt, ops),
- EXC_GP, 0);
- if ( b == 0x6f )
- rc = ops->read(ea.mem.seg, ea.mem.off+0, mmvalp,
- ea.bytes, ctxt);
- else
- fail_if(!ops->write); /* Check before running the stub. */
+ host_and_vcpu_must_have(mmx);
+ get_fpu(X86EMUL_FPU_mmx, &fic);
}
- if ( ea.type == OP_MEM || b == 0x7e )
+
+ opc = init_prefixes(stub);
+ opc[0] = b;
+ /* Convert memory/GPR operand to (%rAX). */
+ rex_prefix &= ~REX_B;
+ vex.b = 1;
+ if ( !mode_64bit() )
+ vex.w = 0;
+ opc[1] = modrm & 0x38;
+ fic.insn_bytes = PFX_BYTES + 2;
+ opc[2] = 0xc3;
+
+ copy_REX_VEX(opc, rex_prefix, vex);
+ invoke_stub("", "", "+m" (src.val) : "a" (&src.val));
+ dst.val = src.val;
+
+ put_stub(stub);
+ put_fpu(&fic);
+ break;
+
+ case X86EMUL_OPC_66(0x0f, 0xe7): /* movntdq xmm,m128 */
+ case X86EMUL_OPC_VEX_66(0x0f, 0xe7): /* vmovntdq {x,y}mm,mem */
+ generate_exception_if(ea.type != OP_MEM, EXC_UD);
+ sfence = true;
+ /* fall through */
+ case X86EMUL_OPC_66(0x0f, 0x6f): /* movdqa xmm/m128,xmm */
+ case X86EMUL_OPC_VEX_66(0x0f, 0x6f): /* vmovdqa {x,y}mm/mem,{x,y}mm */
+ case X86EMUL_OPC_F3(0x0f, 0x6f): /* movdqu xmm/m128,xmm */
+ case X86EMUL_OPC_VEX_F3(0x0f, 0x6f): /* vmovdqu {x,y}mm/mem,{x,y}mm */
+ case X86EMUL_OPC_66(0x0f, 0x7f): /* movdqa xmm,xmm/m128 */
+ case X86EMUL_OPC_VEX_66(0x0f, 0x7f): /* vmovdqa {x,y}mm,{x,y}mm/mem */
+ case X86EMUL_OPC_F3(0x0f, 0x7f): /* movdqu xmm,xmm/m128 */
+ case X86EMUL_OPC_VEX_F3(0x0f, 0x7f): /* vmovdqu {x,y}mm,{x,y}mm/mem */
+ if ( vex.opcx != vex_none )
{
- /* Convert memory operand or GPR destination to (%rAX) */
- rex_prefix &= ~REX_B;
- vex.b = 1;
- buf[4] &= 0x38;
- if ( ea.type == OP_MEM )
- ea.reg = (void *)mmvalp;
- else /* Ensure zero-extension of a 32-bit result. */
- *ea.reg = 0;
- }
- if ( !rc )
- {
- copy_REX_VEX(buf, rex_prefix, vex);
- asm volatile ( "call *%0" : : "r" (stub.func), "a" (ea.reg)
- : "memory" );
+ host_and_vcpu_must_have(avx);
+ get_fpu(X86EMUL_FPU_ymm, &fic);
}
- put_fpu(&fic);
- put_stub(stub);
- if ( !rc && (b != 0x6f) && (ea.type == OP_MEM) )
+ else
{
- ASSERT(ops->write); /* See the fail_if() above. */
- rc = ops->write(ea.mem.seg, ea.mem.off, mmvalp,
- ea.bytes, ctxt);
+ vcpu_must_have(sse2);
+ get_fpu(X86EMUL_FPU_xmm, &fic);
}
- if ( rc )
- goto done;
- dst.type = OP_NONE;
- break;
- }
+ d |= TwoOp;
+ op_bytes = 16 << vex.l;
+ goto simd_0f_common;
CASE_SIMD_PACKED_INT(0x0f, 0x70): /* pshuf{w,d} $imm8,{,x}mm/mem,{,x}mm */
case X86EMUL_OPC_VEX_66(0x0f, 0x70): /* vpshufd $imm8,{x,y}mm/mem,{x,y}mm */
@@ -5744,25 +5802,25 @@ x86_emulate(
get_fpu(X86EMUL_FPU_mmx, &fic);
}
simd_0f_imm8:
- {
- uint8_t *buf = get_stub(stub);
-
- buf[0] = 0x3e;
- buf[1] = 0x3e;
- buf[2] = 0x0f;
- buf[3] = b;
- buf[4] = modrm;
+ opc = init_prefixes(stub);
+ opc[0] = b;
+ opc[1] = modrm;
if ( ea.type == OP_MEM )
{
/* Convert memory operand to (%rAX). */
rex_prefix &= ~REX_B;
vex.b = 1;
- buf[4] &= 0x38;
+ opc[1] &= 0x38;
}
- buf[5] = imm1;
- fic.insn_bytes = 6;
+ opc[2] = imm1;
+ fic.insn_bytes = PFX_BYTES + 3;
break;
- }
+
+ case X86EMUL_OPC_F3(0x0f, 0x7e): /* movq xmm/m64,xmm */
+ case X86EMUL_OPC_VEX_F3(0x0f, 0x7e): /* vmovq xmm/m64,xmm */
+ generate_exception_if(vex.l, EXC_UD);
+ op_bytes = 8;
+ goto simd_0f_int;
case X86EMUL_OPC_F2(0x0f, 0xf0): /* lddqu m128,xmm */
case X86EMUL_OPC_VEX_F2(0x0f, 0xf0): /* vlddqu mem,{x,y}mm */
@@ -6335,6 +6393,27 @@ x86_emulate(
get_fpu(X86EMUL_FPU_mmx, &fic);
goto simd_0f_common;
+ case X86EMUL_OPC_VEX_66(0x0f, 0xd6): /* vmovq xmm,xmm/m64 */
+ generate_exception_if(vex.l, EXC_UD);
+ /* fall through */
+ case X86EMUL_OPC_66(0x0f, 0xd6): /* movq xmm,xmm/m64 */
+ case X86EMUL_OPC(0x0f, 0x6f): /* movq mm/m64,mm */
+ case X86EMUL_OPC(0x0f, 0x7f): /* movq mm,mm/m64 */
+ op_bytes = 8;
+ d |= TwoOp;
+ goto simd_0f_int;
+
+ case X86EMUL_OPC_F3(0x0f, 0xd6): /* movq2dq mm,xmm */
+ case X86EMUL_OPC_F2(0x0f, 0xd6): /* movdq2q xmm,mm */
+ generate_exception_if(ea.type != OP_REG, EXC_UD);
+ op_bytes = 8;
+ host_and_vcpu_must_have(mmx);
+ goto simd_0f_int;
+
+ case X86EMUL_OPC(0x0f, 0xe7): /* movntq mm,m64 */
+ generate_exception_if(ea.type != OP_MEM, EXC_UD);
+ sfence = true;
+ /* fall through */
case X86EMUL_OPC(0x0f, 0xda): /* pminub mm/m64,mm */
case X86EMUL_OPC(0x0f, 0xde): /* pmaxub mm/m64,mm */
case X86EMUL_OPC(0x0f, 0xea): /* pminsw mm/m64,mm */
@@ -6348,6 +6427,73 @@ x86_emulate(
get_fpu(X86EMUL_FPU_mmx, &fic);
goto simd_0f_common;
+ CASE_SIMD_PACKED_INT(0x0f, 0xf7): /* maskmov{q,dqu} {,x}mm,{,x}mm */
+ case X86EMUL_OPC_VEX_66(0x0f, 0xf7): /* vmaskmovdqu xmm,xmm */
+ generate_exception_if(ea.type != OP_REG, EXC_UD);
+ if ( vex.opcx != vex_none )
+ {
+ generate_exception_if(vex.l || vex.reg != 0xf, EXC_UD);
+ host_and_vcpu_must_have(avx);
+ get_fpu(X86EMUL_FPU_ymm, &fic);
+ }
+ else if ( vex.pfx )
+ {
+ vcpu_must_have(sse2);
+ get_fpu(X86EMUL_FPU_xmm, &fic);
+ }
+ else
+ {
+ host_and_vcpu_must_have(mmx);
+ vcpu_must_have(sse);
+ get_fpu(X86EMUL_FPU_mmx, &fic);
+ }
+
+ /*
+ * While we can't reasonably provide fully correct behavior here
+ * (in particular avoiding the memory read in anticipation of all
+ * bytes in the range eventually being written), we can (and should)
+ * still suppress the memory access if all mask bits are clear. Read
+ * the mask bits via {,v}pmovmskb for that purpose.
+ */
+ opc = init_prefixes(stub);
+ opc[0] = 0xd7; /* {,v}pmovmskb */
+ /* (Ab)use "sfence" for latching the original REX.R / VEX.R. */
+ sfence = rex_prefix & REX_R;
+ /* Convert GPR destination to %rAX. */
+ rex_prefix &= ~REX_R;
+ vex.r = 1;
+ if ( !mode_64bit() )
+ vex.w = 0;
+ opc[1] = modrm & 0xc7;
+ fic.insn_bytes = PFX_BYTES + 2;
+ opc[2] = 0xc3;
+
+ copy_REX_VEX(opc, rex_prefix, vex);
+ invoke_stub("", "", "=a" (ea.val) : [dummy] "i" (0));
+
+ put_stub(stub);
+ if ( !ea.val )
+ {
+ put_fpu(&fic);
+ goto complete_insn;
+ }
+
+ opc = init_prefixes(stub);
+ opc[0] = b;
+ opc[1] = modrm;
+ /* Restore high bit of XMM destination. */
+ if ( sfence )
+ {
+ rex_prefix |= REX_R;
+ vex.r = 0;
+ }
+
+ d |= TwoOp;
+ ea.type = OP_MEM;
+ ea.mem.off = truncate_ea(_regs.r(di));
+ sfence = true;
+ break;
+
case X86EMUL_OPC(0x0f38, 0xf0): /* movbe m,r */
case X86EMUL_OPC(0x0f38, 0xf1): /* movbe r,m */
vcpu_must_have(movbe);
@@ -6611,23 +6757,14 @@ x86_emulate(
if ( state->simd_size )
{
-#ifdef __XEN__
- uint8_t *buf = stub.ptr;
-#else
- uint8_t *buf = get_stub(stub);
-#endif
-
generate_exception_if(!op_bytes, EXC_UD);
generate_exception_if(vex.opcx && (d & TwoOp) && vex.reg != 0xf,
EXC_UD);
- if ( !buf )
+ if ( !opc )
BUG();
- if ( vex.opcx == vex_none )
- SET_SSE_PREFIX(buf[0], vex.pfx);
-
- buf[fic.insn_bytes] = 0xc3;
- copy_REX_VEX(buf, rex_prefix, vex);
+ opc[fic.insn_bytes - PFX_BYTES] = 0xc3;
+ copy_REX_VEX(opc, rex_prefix, vex);
if ( ea.type == OP_MEM )
{
@@ -6635,10 +6772,16 @@ x86_emulate(
if ( op_bytes < 16 ||
(vex.opcx
- ? /* vmov{a,nt}p{s,d} are exceptions. */
- ext != ext_0f || ((b | 1) != 0x29 && b != 0x2b)
- : /* movup{s,d} and lddqu are exceptions. */
- ext == ext_0f && ((b | 1) == 0x11 || b == 0xf0)) )
+ ? /* vmov{{a,nt}p{s,d},dqa,ntdq} are exceptions. */
+ ext != ext_0f ||
+ ((b | 1) != 0x29 && b != 0x2b &&
+ ((b | 0x10) != 0x7f || vex.pfx != vex_66) &&
+ b != 0xe7)
+ : /* movup{s,d}, {,mask}movdqu, and lddqu are exceptions. */
+ ext == ext_0f &&
+ ((b | 1) == 0x11 ||
+ ((b | 0x10) == 0x7f && vex.pfx == vex_f3) ||
+ b == 0xf7 || b == 0xf0)) )
mxcsr = MXCSR_MM;
else if ( vcpu_has_misalignsse() )
asm ( "stmxcsr %0" : "=m" (mxcsr) );
@@ -6646,14 +6789,25 @@ x86_emulate(
!is_aligned(ea.mem.seg, ea.mem.off, op_bytes,
ctxt, ops),
EXC_GP, 0);
- if ( (d & SrcMask) == SrcMem )
+ switch ( d & SrcMask )
{
+ case SrcMem:
rc = ops->read(ea.mem.seg, ea.mem.off, mmvalp, op_bytes, ctxt);
if ( rc != X86EMUL_OKAY )
goto done;
+ /* fall through */
+ case SrcMem16:
dst.type = OP_NONE;
+ break;
+ default:
+ if ( (d & DstMask) != DstMem )
+ {
+ ASSERT_UNREACHABLE();
+ return X86EMUL_UNHANDLEABLE;
+ }
+ break;
}
- else if ( (d & DstMask) == DstMem )
+ if ( (d & DstMask) == DstMem )
{
fail_if(!ops->write); /* Check before running the stub. */
ASSERT(d & Mov);
@@ -6661,13 +6815,6 @@ x86_emulate(
dst.bytes = op_bytes;
dst.mem = ea.mem;
}
- else if ( (d & SrcMask) == SrcMem16 )
- dst.type = OP_NONE;
- else
- {
- ASSERT_UNREACHABLE();
- return X86EMUL_UNHANDLEABLE;
- }
}
else
dst.type = OP_NONE;
@@ -6928,6 +7075,8 @@ x86_insn_is_mem_access(const struct x86_
case 0xa4 ... 0xa7: /* MOVS / CMPS */
case 0xaa ... 0xaf: /* STOS / LODS / SCAS */
case 0xd7: /* XLAT */
+ CASE_SIMD_PACKED_INT(0x0f, 0xf7): /* MASKMOV{Q,DQU} */
+ case X86EMUL_OPC_VEX_66(0x0f, 0xf7): /* VMASKMOVDQU */
return true;
case X86EMUL_OPC(0x0f, 0x01):
@@ -6945,7 +7094,8 @@ x86_insn_is_mem_write(const struct x86_e
switch ( state->desc & DstMask )
{
case DstMem:
- return state->modrm_mod != 3;
+ /* The SrcMem check is to cover {,V}MASKMOV{Q,DQU}. */
+ return state->modrm_mod != 3 || (state->desc & SrcMask) == SrcMem;
case DstBitBase:
case DstImplicit:
@@ -6965,22 +7115,9 @@ x86_insn_is_mem_write(const struct x86_e
case 0x6c: case 0x6d: /* INS */
case 0xa4: case 0xa5: /* MOVS */
case 0xaa: case 0xab: /* STOS */
- case X86EMUL_OPC(0x0f, 0x7e): /* MOVD/MOVQ */
- case X86EMUL_OPC_66(0x0f, 0x7e): /* MOVD/MOVQ */
- case X86EMUL_OPC_VEX_66(0x0f, 0x7e): /* VMOVD/VMOVQ */
- case X86EMUL_OPC(0x0f, 0x7f): /* VMOVQ */
- case X86EMUL_OPC_66(0x0f, 0x7f): /* MOVDQA */
- case X86EMUL_OPC_VEX_66(0x0f, 0x7f): /* VMOVDQA */
- case X86EMUL_OPC_F3(0x0f, 0x7f): /* MOVDQU */
- case X86EMUL_OPC_VEX_F3(0x0f, 0x7f): /* VMOVDQU */
case X86EMUL_OPC(0x0f, 0xab): /* BTS */
case X86EMUL_OPC(0x0f, 0xb3): /* BTR */
case X86EMUL_OPC(0x0f, 0xbb): /* BTC */
- case X86EMUL_OPC_66(0x0f, 0xd6): /* MOVQ */
- case X86EMUL_OPC_VEX_66(0x0f, 0xd6): /* VMOVQ */
- case X86EMUL_OPC(0x0f, 0xe7): /* MOVNTQ */
- case X86EMUL_OPC_66(0x0f, 0xe7): /* MOVNTDQ */
- case X86EMUL_OPC_VEX_66(0x0f, 0xe7): /* VMOVNTDQ */
return true;
case 0xd9:
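
The mask-check logic added for maskmov{q,dqu} above can be illustrated
with a short standalone program. This is a sketch of the emulation
strategy only, using SSE2 intrinsics instead of the emulator's stub
machinery, and is not code from the patch:

#include <emmintrin.h>
#include <stdint.h>
#include <stdio.h>

static void emulate_maskmovdqu(__m128i data, __m128i mask, uint8_t *dst)
{
    /* Mirror of the {,v}pmovmskb step the emulator runs in a stub first. */
    if ( !_mm_movemask_epi8(mask) )
        return; /* all mask bits clear: suppress the memory access entirely */

    /* Otherwise perform the masked store (here via the real insn). */
    _mm_maskmoveu_si128(data, mask, (char *)dst);
}

int main(void)
{
    uint8_t buf[16] = { 0 };
    __m128i data = _mm_set1_epi8(0x5a);

    emulate_maskmovdqu(data, _mm_setzero_si128(), buf); /* no access */
    emulate_maskmovdqu(data, _mm_set1_epi8(-1), buf);   /* full store */
    printf("%02x %02x\n", buf[0], buf[15]);             /* 5a 5a */
    return 0;
}

Reading the mask via {,v}pmovmskb first is what lets the emulator skip
both the read and the write when no bytes would be modified.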
[-- Attachment #3: Type: text/plain, Size: 127 bytes --]
_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
https://lists.xen.org/xen-devel