From: "Jan Beulich" <JBeulich@suse.com>
To: xen-devel <xen-devel@lists.xenproject.org>
Cc: Andrew Cooper <andrew.cooper3@citrix.com>
Subject: [PATCH 2/8] x86emul: support most memory accessing MMX/SSE/SSE2 insns
Date: Wed, 25 Jan 2017 08:04:21 -0700
Message-ID: <5888CC850200007800133DAB@prv-mh.provo.novell.com>
In-Reply-To: <5888C9110200007800133D98@prv-mh.provo.novell.com>

This aims at covering most MMX/SSEn/AVX instructions in the 0x0f escape
space that take memory operands. Not covered here are the irregular moves,
the conversions, and {,U}COMIS{S,D} (which modify EFLAGS).

Note that the distinction between the simd_*_fp entries isn't strictly
needed, but I've kept them as separate entries since an earlier version
required them to be separate, and the distinction may well prove useful
down the road.

Also take the opportunity to adjust the vmovdqu test case which the new
LDDQU one here has been cloned from: to zero a ymm register we don't need
to jump through hoops, since 128-bit AVX insns zero the upper portion of
the destination register, and the disabled AVX2 variant of the check was
using the wrong YMM register.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
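
As a quick standalone illustration of the zeroing behaviour relied upon
above (my own sketch, not part of the patch; it assumes an AVX-capable CPU
and GCC-style inline asm, as already used by the test harness): a single
128-bit VEX-encoded vpxor clears the full 256-bit register, so no
vinsertf128 dance is needed.

    #include <stdio.h>
    #include <string.h>

    int main(void)
    {
        unsigned char ones[32], out[32];
        unsigned int i;

        memset(ones, 0xff, sizeof(ones));
        asm volatile ( "vmovdqu %1, %%ymm4\n\t"           /* ymm4 = all ones */
                       "vpxor %%xmm4, %%xmm4, %%xmm4\n\t" /* 128-bit op also clears bits 255:128 */
                       "vmovdqu %%ymm4, %0"
                       : "=m" (out) : "m" (ones) : "xmm4" );

        for ( i = 0; i < sizeof(out); ++i )
            if ( out[i] )
            {
                puts("upper half not cleared");
                return 1;
            }
        puts("single 128-bit vpxor zeroed all of ymm4");
        return 0;
    }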

--- a/tools/tests/x86_emulator/test_x86_emulator.c
+++ b/tools/tests/x86_emulator/test_x86_emulator.c
@@ -1656,12 +1656,7 @@ int main(int argc, char **argv)
     {
         decl_insn(vmovdqu_from_mem);
 
-#if 0 /* Don't use AVX2 instructions for now */
-        asm volatile ( "vpcmpgtb %%ymm4, %%ymm4, %%ymm4\n"
-#else
-        asm volatile ( "vpcmpgtb %%xmm4, %%xmm4, %%xmm4\n\t"
-                       "vinsertf128 $1, %%xmm4, %%ymm4, %%ymm4\n"
-#endif
+        asm volatile ( "vpxor %%xmm4, %%xmm4, %%xmm4\n"
                        put_insn(vmovdqu_from_mem, "vmovdqu (%0), %%ymm4")
                        :: "d" (NULL) );
 
@@ -1675,7 +1670,7 @@ int main(int argc, char **argv)
 #if 0 /* Don't use AVX2 instructions for now */
         asm ( "vpcmpeqb %%ymm2, %%ymm2, %%ymm2\n\t"
               "vpcmpeqb %%ymm4, %%ymm2, %%ymm0\n\t"
-              "vpmovmskb %%ymm1, %0" : "=r" (rc) );
+              "vpmovmskb %%ymm0, %0" : "=r" (rc) );
 #else
         asm ( "vextractf128 $1, %%ymm4, %%xmm3\n\t"
               "vpcmpeqb %%xmm2, %%xmm2, %%xmm2\n\t"
@@ -2083,6 +2078,67 @@ int main(int argc, char **argv)
         printf("skipped\n");
 #endif
 
+    printf("%-40s", "Testing lddqu 4(%edx),%xmm4...");
+    if ( stack_exec && cpu_has_sse3 )
+    {
+        decl_insn(lddqu);
+
+        asm volatile ( "pcmpgtb %%xmm4, %%xmm4\n"
+                       put_insn(lddqu, "lddqu 4(%0), %%xmm4")
+                       :: "d" (NULL) );
+
+        set_insn(lddqu);
+        memset(res, 0x55, 64);
+        memset(res + 1, 0xff, 16);
+        regs.edx = (unsigned long)res;
+        rc = x86_emulate(&ctxt, &emulops);
+        if ( rc != X86EMUL_OKAY || !check_eip(lddqu) )
+            goto fail;
+        asm ( "pcmpeqb %%xmm2, %%xmm2\n\t"
+              "pcmpeqb %%xmm4, %%xmm2\n\t"
+              "pmovmskb %%xmm2, %0" : "=r" (rc) );
+        if ( rc != 0xffff )
+            goto fail;
+        printf("okay\n");
+    }
+    else
+        printf("skipped\n");
+
+    printf("%-40s", "Testing vlddqu (%ecx),%ymm4...");
+    if ( stack_exec && cpu_has_avx )
+    {
+        decl_insn(vlddqu);
+
+        asm volatile ( "vpxor %%xmm4, %%xmm4, %%xmm4\n"
+                       put_insn(vlddqu, "vlddqu (%0), %%ymm4")
+                       :: "c" (NULL) );
+
+        set_insn(vlddqu);
+        memset(res + 1, 0xff, 32);
+        regs.ecx = (unsigned long)(res + 1);
+        rc = x86_emulate(&ctxt, &emulops);
+        if ( rc != X86EMUL_OKAY || !check_eip(vlddqu) )
+            goto fail;
+#if 0 /* Don't use AVX2 instructions for now */
+        asm ( "vpcmpeqb %%ymm2, %%ymm2, %%ymm2\n\t"
+              "vpcmpeqb %%ymm4, %%ymm2, %%ymm0\n\t"
+              "vpmovmskb %%ymm0, %0" : "=r" (rc) );
+#else
+        asm ( "vextractf128 $1, %%ymm4, %%xmm3\n\t"
+              "vpcmpeqb %%xmm2, %%xmm2, %%xmm2\n\t"
+              "vpcmpeqb %%xmm4, %%xmm2, %%xmm0\n\t"
+              "vpcmpeqb %%xmm3, %%xmm2, %%xmm1\n\t"
+              "vpmovmskb %%xmm0, %0\n\t"
+              "vpmovmskb %%xmm1, %1" : "=r" (rc), "=r" (i) );
+        rc |= i << 16;
+#endif
+        if ( ~rc )
+            goto fail;
+        printf("okay\n");
+    }
+    else
+        printf("skipped\n");
+
 #undef decl_insn
 #undef put_insn
 #undef set_insn
--- a/tools/tests/x86_emulator/x86_emulate.h
+++ b/tools/tests/x86_emulator/x86_emulate.h
@@ -81,6 +81,12 @@ static inline uint64_t xgetbv(uint32_t x
     (res.d & (1U << 26)) != 0; \
 })
 
+#define cpu_has_sse3 ({ \
+    struct cpuid_leaf res; \
+    emul_test_cpuid(1, 0, &res, NULL); \
+    (res.c & (1U << 0)) != 0; \
+})
+
 #define cpu_has_popcnt ({ \
     struct cpuid_leaf res; \
     emul_test_cpuid(1, 0, &res, NULL); \
--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -45,6 +45,8 @@
 #define ModRM       (1<<6)
 /* Destination is only written; never read. */
 #define Mov         (1<<7)
+/* VEX/EVEX (SIMD only): 2nd source operand unused (must be all ones) */
+#define TwoOp       Mov
 /* All operands are implicit in the opcode. */
 #define ImplicitOps (DstImplicit|SrcImplicit)
 
@@ -180,104 +182,171 @@ static const opcode_desc_t opcode_table[
     ImplicitOps, ImplicitOps, ByteOp|DstMem|SrcNone|ModRM, DstMem|SrcNone|ModRM
 };
 
-static const opcode_desc_t twobyte_table[256] = {
-    /* 0x00 - 0x07 */
-    ModRM, ImplicitOps|ModRM, DstReg|SrcMem16|ModRM, DstReg|SrcMem16|ModRM,
-    0, ImplicitOps, ImplicitOps, ImplicitOps,
-    /* 0x08 - 0x0F */
-    ImplicitOps, ImplicitOps, 0, ImplicitOps,
-    0, ImplicitOps|ModRM, ImplicitOps, ModRM|SrcImmByte,
-    /* 0x10 - 0x17 */
-    ImplicitOps|ModRM, ImplicitOps|ModRM, ImplicitOps|ModRM, ImplicitOps|ModRM,
-    ImplicitOps|ModRM, ImplicitOps|ModRM, ImplicitOps|ModRM, ImplicitOps|ModRM,
-    /* 0x18 - 0x1F */
-    ImplicitOps|ModRM, ImplicitOps|ModRM, ImplicitOps|ModRM, ImplicitOps|ModRM,
-    ImplicitOps|ModRM, ImplicitOps|ModRM, ImplicitOps|ModRM, ImplicitOps|ModRM,
-    /* 0x20 - 0x27 */
-    DstMem|SrcImplicit|ModRM, DstMem|SrcImplicit|ModRM,
-    DstImplicit|SrcMem|ModRM, DstImplicit|SrcMem|ModRM,
-    0, 0, 0, 0,
-    /* 0x28 - 0x2F */
-    ImplicitOps|ModRM, ImplicitOps|ModRM, ImplicitOps|ModRM, ImplicitOps|ModRM,
-    ImplicitOps|ModRM, ImplicitOps|ModRM, ImplicitOps|ModRM, ImplicitOps|ModRM,
-    /* 0x30 - 0x37 */
-    ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
-    ImplicitOps, ImplicitOps, 0, ImplicitOps,
-    /* 0x38 - 0x3F */
-    DstReg|SrcMem|ModRM, 0, DstReg|SrcImmByte|ModRM, 0, 0, 0, 0, 0,
-    /* 0x40 - 0x47 */
-    DstReg|SrcMem|ModRM|Mov, DstReg|SrcMem|ModRM|Mov,
-    DstReg|SrcMem|ModRM|Mov, DstReg|SrcMem|ModRM|Mov,
-    DstReg|SrcMem|ModRM|Mov, DstReg|SrcMem|ModRM|Mov,
-    DstReg|SrcMem|ModRM|Mov, DstReg|SrcMem|ModRM|Mov,
-    /* 0x48 - 0x4F */
-    DstReg|SrcMem|ModRM|Mov, DstReg|SrcMem|ModRM|Mov,
-    DstReg|SrcMem|ModRM|Mov, DstReg|SrcMem|ModRM|Mov,
-    DstReg|SrcMem|ModRM|Mov, DstReg|SrcMem|ModRM|Mov,
-    DstReg|SrcMem|ModRM|Mov, DstReg|SrcMem|ModRM|Mov,
-    /* 0x50 - 0x5F */
-    ModRM, ModRM, ModRM, ModRM, ModRM, ModRM, ModRM, ModRM,
-    ModRM, ModRM, ModRM, ModRM, ModRM, ModRM, ModRM, ModRM,
-    /* 0x60 - 0x6F */
-    ModRM, ModRM, ModRM, ModRM, ModRM, ModRM, ModRM, ModRM,
-    ModRM, ModRM, ModRM, ModRM, ModRM, ModRM, ModRM, ImplicitOps|ModRM,
-    /* 0x70 - 0x7F */
-    SrcImmByte|ModRM, SrcImmByte|ModRM, SrcImmByte|ModRM, SrcImmByte|ModRM,
-    ModRM, ModRM, ModRM, ImplicitOps,
-    ModRM, ModRM, 0, 0, ModRM, ModRM, ImplicitOps|ModRM, ImplicitOps|ModRM,
-    /* 0x80 - 0x87 */
-    DstImplicit|SrcImm, DstImplicit|SrcImm,
-    DstImplicit|SrcImm, DstImplicit|SrcImm,
-    DstImplicit|SrcImm, DstImplicit|SrcImm,
-    DstImplicit|SrcImm, DstImplicit|SrcImm,
-    /* 0x88 - 0x8F */
-    DstImplicit|SrcImm, DstImplicit|SrcImm,
-    DstImplicit|SrcImm, DstImplicit|SrcImm,
-    DstImplicit|SrcImm, DstImplicit|SrcImm,
-    DstImplicit|SrcImm, DstImplicit|SrcImm,
-    /* 0x90 - 0x97 */
-    ByteOp|DstMem|SrcNone|ModRM|Mov, ByteOp|DstMem|SrcNone|ModRM|Mov,
-    ByteOp|DstMem|SrcNone|ModRM|Mov, ByteOp|DstMem|SrcNone|ModRM|Mov,
-    ByteOp|DstMem|SrcNone|ModRM|Mov, ByteOp|DstMem|SrcNone|ModRM|Mov,
-    ByteOp|DstMem|SrcNone|ModRM|Mov, ByteOp|DstMem|SrcNone|ModRM|Mov,
-    /* 0x98 - 0x9F */
-    ByteOp|DstMem|SrcNone|ModRM|Mov, ByteOp|DstMem|SrcNone|ModRM|Mov,
-    ByteOp|DstMem|SrcNone|ModRM|Mov, ByteOp|DstMem|SrcNone|ModRM|Mov,
-    ByteOp|DstMem|SrcNone|ModRM|Mov, ByteOp|DstMem|SrcNone|ModRM|Mov,
-    ByteOp|DstMem|SrcNone|ModRM|Mov, ByteOp|DstMem|SrcNone|ModRM|Mov,
-    /* 0xA0 - 0xA7 */
-    ImplicitOps|Mov, ImplicitOps|Mov, ImplicitOps, DstBitBase|SrcReg|ModRM,
-    DstMem|SrcImmByte|ModRM, DstMem|SrcReg|ModRM, ModRM, ModRM,
-    /* 0xA8 - 0xAF */
-    ImplicitOps|Mov, ImplicitOps|Mov, ImplicitOps, DstBitBase|SrcReg|ModRM,
-    DstMem|SrcImmByte|ModRM, DstMem|SrcReg|ModRM,
-    ImplicitOps|ModRM, DstReg|SrcMem|ModRM,
-    /* 0xB0 - 0xB7 */
-    ByteOp|DstMem|SrcReg|ModRM, DstMem|SrcReg|ModRM,
-    DstReg|SrcMem|ModRM|Mov, DstBitBase|SrcReg|ModRM,
-    DstReg|SrcMem|ModRM|Mov, DstReg|SrcMem|ModRM|Mov,
-    ByteOp|DstReg|SrcMem|ModRM|Mov, DstReg|SrcMem16|ModRM|Mov,
-    /* 0xB8 - 0xBF */
-    DstReg|SrcMem|ModRM, ModRM,
-    DstBitBase|SrcImmByte|ModRM, DstBitBase|SrcReg|ModRM,
-    DstReg|SrcMem|ModRM, DstReg|SrcMem|ModRM,
-    ByteOp|DstReg|SrcMem|ModRM|Mov, DstReg|SrcMem16|ModRM|Mov,
-    /* 0xC0 - 0xC7 */
-    ByteOp|DstMem|SrcReg|ModRM, DstMem|SrcReg|ModRM,
-    SrcImmByte|ModRM, DstMem|SrcReg|ModRM|Mov,
-    SrcImmByte|ModRM, SrcImmByte|ModRM, SrcImmByte|ModRM, ImplicitOps|ModRM,
-    /* 0xC8 - 0xCF */
-    ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
-    ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
-    /* 0xD0 - 0xDF */
-    ModRM, ModRM, ModRM, ModRM, ModRM, ModRM, ImplicitOps|ModRM, ModRM,
-    ModRM, ModRM, ModRM, ModRM, ModRM, ModRM, ModRM, ModRM,
-    /* 0xE0 - 0xEF */
-    ModRM, ModRM, ModRM, ModRM, ModRM, ModRM, ModRM, ImplicitOps|ModRM,
-    ModRM, ModRM, ModRM, ModRM, ModRM, ModRM, ModRM, ModRM,
-    /* 0xF0 - 0xFF */
-    ModRM, ModRM, ModRM, ModRM, ModRM, ModRM, ModRM, ModRM,
-    ModRM, ModRM, ModRM, ModRM, ModRM, ModRM, ModRM, ModRM
+enum simd_opsize {
+    simd_none,
+    /*
+     * Ordinary packed integers:
+     * - 64 bits without prefix 66 (MMX)
+     * - 128 bits with prefix 66 (SSEn)
+     * - 128/256 bits depending on VEX.L (AVX)
+     */
+    simd_packed_int,
+    /*
+     * Ordinary packed/scalar floating point:
+     * - 128 bits without prefix or with prefix 66 (SSEn)
+     * - 128/256 bits depending on VEX.L (AVX)
+     * - 32 bits with prefix F3 (scalar single)
+     * - 64 bits with prefix F2 (scalar double)
+     */
+    simd_any_fp,
+    /*
+     * Packed floating point:
+     * - 128 bits without prefix or with prefix 66 (SSEn)
+     * - 128/256 bits depending on VEX.L (AVX)
+     */
+    simd_packed_fp,
+    /*
+     * Single precision packed/scalar floating point:
+     * - 128 bits without prefix (SSEn)
+     * - 128/256 bits depending on VEX.L, no prefix (AVX)
+     * - 32 bits with prefix F3 (scalar)
+     */
+    simd_single_fp,
+    /* Operand size encoded in non-standard way. */
+    simd_other
+};
+typedef uint8_t simd_opsize_t;
+
+static const struct {
+    opcode_desc_t desc;
+    simd_opsize_t size;
+} twobyte_table[256] = {
+    [0x00] = { ModRM },
+    [0x01] = { ImplicitOps|ModRM },
+    [0x02] = { DstReg|SrcMem16|ModRM },
+    [0x03] = { DstReg|SrcMem16|ModRM },
+    [0x05] = { ImplicitOps },
+    [0x06] = { ImplicitOps },
+    [0x07] = { ImplicitOps },
+    [0x08] = { ImplicitOps },
+    [0x09] = { ImplicitOps },
+    [0x0b] = { ImplicitOps },
+    [0x0d] = { ImplicitOps|ModRM },
+    [0x0e] = { ImplicitOps },
+    [0x0f] = { ModRM|SrcImmByte },
+    [0x10] = { DstImplicit|SrcMem|ModRM|Mov, simd_any_fp },
+    [0x11] = { DstMem|SrcImplicit|ModRM|Mov, simd_any_fp },
+    [0x12] = { ImplicitOps|ModRM },
+    [0x13] = { ImplicitOps|ModRM },
+    [0x14 ... 0x15] = { DstImplicit|SrcMem|ModRM, simd_packed_fp },
+    [0x16] = { ImplicitOps|ModRM },
+    [0x17] = { ImplicitOps|ModRM },
+    [0x18] = { ImplicitOps|ModRM },
+    [0x19] = { ImplicitOps|ModRM },
+    [0x1a] = { ImplicitOps|ModRM },
+    [0x1b] = { ImplicitOps|ModRM },
+    [0x1c] = { ImplicitOps|ModRM },
+    [0x1d] = { ImplicitOps|ModRM },
+    [0x1e] = { ImplicitOps|ModRM },
+    [0x1f] = { ImplicitOps|ModRM },
+    [0x20 ... 0x21] = { DstMem|SrcImplicit|ModRM },
+    [0x22 ... 0x23] = { DstImplicit|SrcMem|ModRM },
+    [0x28] = { DstImplicit|SrcMem|ModRM|Mov, simd_packed_fp },
+    [0x29] = { DstMem|SrcImplicit|ModRM|Mov, simd_packed_fp },
+    [0x2a] = { ImplicitOps|ModRM },
+    [0x2b] = { DstMem|SrcImplicit|ModRM|Mov, simd_any_fp },
+    [0x2c ... 0x2d] = { ImplicitOps|ModRM },
+    [0x2e ... 0x2f] = { ImplicitOps|ModRM },
+    [0x30] = { ImplicitOps },
+    [0x31] = { ImplicitOps },
+    [0x32] = { ImplicitOps },
+    [0x33] = { ImplicitOps },
+    [0x34] = { ImplicitOps },
+    [0x35] = { ImplicitOps },
+    [0x37] = { ImplicitOps },
+    [0x38] = { DstReg|SrcMem|ModRM },
+    [0x3a] = { DstReg|SrcImmByte|ModRM },
+    [0x40 ... 0x4f] = { DstReg|SrcMem|ModRM|Mov },
+    [0x50] = { ModRM },
+    [0x51] = { DstImplicit|SrcMem|ModRM|TwoOp, simd_any_fp },
+    [0x52 ... 0x53] = { DstImplicit|SrcMem|ModRM|TwoOp, simd_single_fp },
+    [0x54 ... 0x57] = { DstImplicit|SrcMem|ModRM, simd_packed_fp },
+    [0x58 ... 0x59] = { DstImplicit|SrcMem|ModRM, simd_any_fp },
+    [0x5a] = { ModRM },
+    [0x5b] = { ModRM },
+    [0x5c ... 0x5f] = { DstImplicit|SrcMem|ModRM, simd_any_fp },
+    [0x60 ... 0x62] = { DstImplicit|SrcMem|ModRM, simd_other },
+    [0x63 ... 0x67] = { DstImplicit|SrcMem|ModRM, simd_packed_int },
+    [0x68 ... 0x6a] = { DstImplicit|SrcMem|ModRM, simd_other },
+    [0x6b] = { DstImplicit|SrcMem|ModRM, simd_packed_int },
+    [0x6c ... 0x6d] = { DstImplicit|SrcMem|ModRM, simd_packed_int },
+    [0x6e ... 0x6f] = { ImplicitOps|ModRM },
+    [0x70] = { SrcImmByte|ModRM|TwoOp, simd_other },
+    [0x71 ... 0x73] = { SrcImmByte|ModRM },
+    [0x74 ... 0x76] = { DstImplicit|SrcMem|ModRM, simd_packed_int },
+    [0x77] = { DstImplicit|SrcNone },
+    [0x78] = { ModRM },
+    [0x79] = { ModRM },
+    [0x7c] = { DstImplicit|SrcMem|ModRM, simd_other },
+    [0x7d] = { DstImplicit|SrcMem|ModRM, simd_other },
+    [0x7e ... 0x7f] = { ImplicitOps|ModRM },
+    [0x80 ... 0x8f] = { DstImplicit|SrcImm },
+    [0x90 ... 0x9f] = { ByteOp|DstMem|SrcNone|ModRM|Mov },
+    [0xa0 ... 0xa1] = { ImplicitOps|Mov },
+    [0xa2] = { ImplicitOps },
+    [0xa3] = { DstBitBase|SrcReg|ModRM },
+    [0xa4] = { DstMem|SrcImmByte|ModRM },
+    [0xa5] = { DstMem|SrcReg|ModRM },
+    [0xa6] = { ModRM },
+    [0xa7] = { ModRM },
+    [0xa8 ... 0xa9] = { ImplicitOps|Mov },
+    [0xaa] = { ImplicitOps },
+    [0xab] = { DstBitBase|SrcReg|ModRM },
+    [0xac] = { DstMem|SrcImmByte|ModRM },
+    [0xad] = { DstMem|SrcReg|ModRM },
+    [0xae] = { ImplicitOps|ModRM },
+    [0xaf] = { DstReg|SrcMem|ModRM },
+    [0xb0] = { ByteOp|DstMem|SrcReg|ModRM },
+    [0xb1] = { DstMem|SrcReg|ModRM },
+    [0xb2] = { DstReg|SrcMem|ModRM|Mov },
+    [0xb3] = { DstBitBase|SrcReg|ModRM },
+    [0xb4 ... 0xb5] = { DstReg|SrcMem|ModRM|Mov },
+    [0xb6] = { ByteOp|DstReg|SrcMem|ModRM|Mov },
+    [0xb7] = { DstReg|SrcMem16|ModRM|Mov },
+    [0xb8] = { DstReg|SrcMem|ModRM },
+    [0xb9] = { ModRM },
+    [0xba] = { DstBitBase|SrcImmByte|ModRM },
+    [0xbb] = { DstBitBase|SrcReg|ModRM },
+    [0xbc ... 0xbd] = { DstReg|SrcMem|ModRM },
+    [0xbe] = { ByteOp|DstReg|SrcMem|ModRM|Mov },
+    [0xbf] = { DstReg|SrcMem16|ModRM|Mov },
+    [0xc0] = { ByteOp|DstMem|SrcReg|ModRM },
+    [0xc1] = { DstMem|SrcReg|ModRM },
+    [0xc2] = { DstImplicit|SrcImmByte|ModRM, simd_any_fp },
+    [0xc3] = { DstMem|SrcReg|ModRM|Mov },
+    [0xc4] = { DstReg|SrcImmByte|ModRM, simd_packed_int },
+    [0xc5] = { SrcImmByte|ModRM },
+    [0xc6] = { DstImplicit|SrcImmByte|ModRM, simd_packed_fp },
+    [0xc7] = { ImplicitOps|ModRM },
+    [0xc8 ... 0xcf] = { ImplicitOps },
+    [0xd0] = { DstImplicit|SrcMem|ModRM, simd_other },
+    [0xd1 ... 0xd3] = { DstImplicit|SrcMem|ModRM, simd_other },
+    [0xd4 ... 0xd5] = { DstImplicit|SrcMem|ModRM, simd_packed_int },
+    [0xd6] = { ImplicitOps|ModRM },
+    [0xd7] = { ModRM },
+    [0xd8 ... 0xdf] = { DstImplicit|SrcMem|ModRM, simd_packed_int },
+    [0xe0] = { DstImplicit|SrcMem|ModRM, simd_packed_int },
+    [0xe1 ... 0xe2] = { DstImplicit|SrcMem|ModRM, simd_other },
+    [0xe3 ... 0xe5] = { DstImplicit|SrcMem|ModRM, simd_packed_int },
+    [0xe6] = { ModRM },
+    [0xe7] = { ImplicitOps|ModRM },
+    [0xe8 ... 0xef] = { DstImplicit|SrcMem|ModRM, simd_packed_int },
+    [0xf0] = { DstImplicit|SrcMem|ModRM|Mov, simd_other },
+    [0xf1 ... 0xf3] = { DstImplicit|SrcMem|ModRM, simd_other },
+    [0xf4 ... 0xf6] = { DstImplicit|SrcMem|ModRM, simd_packed_int },
+    [0xf7] = { ModRM },
+    [0xf8 ... 0xfe] = { DstImplicit|SrcMem|ModRM, simd_packed_int },
+    [0xff] = { ModRM }
 };
 
 static const opcode_desc_t xop_table[] = {
@@ -1372,10 +1441,12 @@ static bool vcpu_has(
 #define vcpu_has_lahf_lm()     vcpu_has(0x80000001, ECX,  0, ctxt, ops)
 #define vcpu_has_cr8_legacy()  vcpu_has(0x80000001, ECX,  4, ctxt, ops)
 #define vcpu_has_lzcnt()       vcpu_has(0x80000001, ECX,  5, ctxt, ops)
+#define vcpu_has_sse4a()       vcpu_has(0x80000001, ECX,  6, ctxt, ops)
 #define vcpu_has_misalignsse() vcpu_has(0x80000001, ECX,  7, ctxt, ops)
 #define vcpu_has_tbm()         vcpu_has(0x80000001, ECX, 21, ctxt, ops)
 #define vcpu_has_bmi1()        vcpu_has(         7, EBX,  3, ctxt, ops)
 #define vcpu_has_hle()         vcpu_has(         7, EBX,  4, ctxt, ops)
+#define vcpu_has_avx2()        vcpu_has(         7, EBX,  5, ctxt, ops)
 #define vcpu_has_bmi2()        vcpu_has(         7, EBX,  8, ctxt, ops)
 #define vcpu_has_rtm()         vcpu_has(         7, EBX, 11, ctxt, ops)
 #define vcpu_has_mpx()         vcpu_has(         7, EBX, 14, ctxt, ops)
@@ -1975,6 +2046,7 @@ struct x86_emulate_state {
     opcode_desc_t desc;
     union vex vex;
     union evex evex;
+    enum simd_opsize simd_size;
 
     /*
      * Data operand effective address (usually computed from ModRM).
@@ -2110,7 +2182,8 @@ x86_decode_twobyte(
     case 0x50 ... 0x77:
     case 0x79 ... 0x7f:
     case 0xae:
-    case 0xc2 ... 0xc6:
+    case 0xc2 ... 0xc3:
+    case 0xc5 ... 0xc6:
     case 0xd0 ... 0xfe:
         ctxt->opcode |= MASK_INSR(vex.pfx, X86EMUL_OPC_PFX_MASK);
         break;
@@ -2137,8 +2210,23 @@ x86_decode_twobyte(
     case 0xbd: bsr / lzcnt
          * They're being dealt with in the execution phase (if at all).
          */
+
+    case 0xc4: /* pinsrw */
+        ctxt->opcode |= MASK_INSR(vex.pfx, X86EMUL_OPC_PFX_MASK);
+        /* fall through */
+    case X86EMUL_OPC_VEX_66(0, 0xc4): /* vpinsrw */
+        state->desc = DstReg | SrcMem16 | ModRM;
+        break;
     }
 
+    /*
+     * Register forms of most VEX-encoded TwoOp instructions have
+     * three operands.
+     */
+    if ( state->simd_size && ea.type != OP_MEM &&
+         vex.opcx && (vex.pfx & VEX_PREFIX_SCALAR_MASK) )
+        state->desc &= ~TwoOp;
+
  done:
     return rc;
 }
@@ -2270,12 +2358,13 @@ x86_decode(
     {
         /* Two-byte opcode. */
         b = insn_fetch_type(uint8_t);
-        d = twobyte_table[b];
+        d = twobyte_table[b].desc;
         switch ( b )
         {
         default:
             opcode = b | MASK_INSR(0x0f, X86EMUL_OPC_EXT_MASK);
             ext = ext_0f;
+            state->simd_size = twobyte_table[b].size;
             break;
         case 0x38:
             b = insn_fetch_type(uint8_t);
@@ -2381,15 +2470,16 @@ x86_decode(
                     {
                     case vex_0f:
                         opcode |= MASK_INSR(0x0f, X86EMUL_OPC_EXT_MASK);
-                        d = twobyte_table[b];
+                        d = twobyte_table[b].desc;
+                        state->simd_size = twobyte_table[b].size;
                         break;
                     case vex_0f38:
                         opcode |= MASK_INSR(0x0f38, X86EMUL_OPC_EXT_MASK);
-                        d = twobyte_table[0x38];
+                        d = twobyte_table[0x38].desc;
                         break;
                     case vex_0f3a:
                         opcode |= MASK_INSR(0x0f3a, X86EMUL_OPC_EXT_MASK);
-                        d = twobyte_table[0x3a];
+                        d = twobyte_table[0x3a].desc;
                         break;
                     default:
                         rc = X86EMUL_UNHANDLEABLE;
@@ -2639,13 +2729,53 @@ x86_decode(
         ea.mem.off = truncate_ea(ea.mem.off);
     }
 
-    /*
-     * When prefix 66 has a meaning different from operand-size override,
-     * operand size defaults to 4 and can't be overridden to 2.
-     */
-    if ( op_bytes == 2 &&
-         (ctxt->opcode & X86EMUL_OPC_PFX_MASK) == X86EMUL_OPC_66(0, 0) )
-        op_bytes = 4;
+    switch ( state->simd_size )
+    {
+    case simd_none:
+        /*
+         * When prefix 66 has a meaning different from operand-size override,
+         * operand size defaults to 4 and can't be overridden to 2.
+         */
+        if ( op_bytes == 2 &&
+             (ctxt->opcode & X86EMUL_OPC_PFX_MASK) == X86EMUL_OPC_66(0, 0) )
+            op_bytes = 4;
+        break;
+
+    case simd_packed_int:
+        switch ( vex.pfx )
+        {
+        case vex_none: op_bytes = 8;           break;
+        case vex_66:   op_bytes = 16 << vex.l; break;
+        default:       op_bytes = 0;           break;
+        }
+        break;
+
+    case simd_single_fp:
+        if ( vex.pfx & VEX_PREFIX_DOUBLE_MASK )
+        {
+            op_bytes = 0;
+            break;
+    case simd_packed_fp:
+            if ( vex.pfx & VEX_PREFIX_SCALAR_MASK )
+            {
+                op_bytes = 0;
+                break;
+            }
+        }
+        /* fall through */
+    case simd_any_fp:
+        switch ( vex.pfx )
+        {
+        default:     op_bytes = 16 << vex.l; break;
+        case vex_f3: op_bytes = 4;           break;
+        case vex_f2: op_bytes = 8;           break;
+        }
+        break;
+
+    default:
+        op_bytes = 0;
+        break;
+    }
 
  done:
     return rc;
@@ -2669,8 +2799,10 @@ x86_emulate(
     int rc;
     uint8_t b, d;
     bool singlestep = (_regs._eflags & EFLG_TF) && !is_branch_step(ctxt, ops);
+    bool sfence = false;
     struct operand src = { .reg = PTR_POISON };
     struct operand dst = { .reg = PTR_POISON };
+    unsigned long cr4;
     enum x86_swint_type swint_type;
     struct fpu_insn_ctxt fic;
     struct x86_emulate_stub stub = {};
@@ -2737,6 +2869,8 @@ x86_emulate(
         ea.bytes = 2;
         goto srcmem_common;
     case SrcMem:
+        if ( state->simd_size )
+            break;
         ea.bytes = (d & ByteOp) ? 1 : op_bytes;
     srcmem_common:
         src = ea;
@@ -2837,6 +2971,11 @@ x86_emulate(
         d = (d & ~DstMask) | DstMem;
         /* Becomes a normal DstMem operation from here on. */
     case DstMem:
+        if ( state->simd_size )
+        {
+            generate_exception_if(lock_prefix, EXC_UD);
+            break;
+        }
         ea.bytes = (d & ByteOp) ? 1 : op_bytes;
         dst = ea;
         if ( dst.type == OP_REG )
@@ -2871,7 +3010,6 @@ x86_emulate(
     {
         enum x86_segment seg;
         struct segment_register cs, sreg;
-        unsigned long cr4;
         struct cpuid_leaf cpuid_leaf;
 
     case 0x00 ... 0x05: add: /* add */
@@ -5066,116 +5204,109 @@ x86_emulate(
     case X86EMUL_OPC(0x0f, 0x19) ... X86EMUL_OPC(0x0f, 0x1f): /* nop */
         break;
 
-    case X86EMUL_OPC(0x0f, 0x2b):        /* movntps xmm,m128 */
-    case X86EMUL_OPC_VEX(0x0f, 0x2b):    /* vmovntps xmm,m128 */
-                                         /* vmovntps ymm,m256 */
-    case X86EMUL_OPC_66(0x0f, 0x2b):     /* movntpd xmm,m128 */
-    case X86EMUL_OPC_VEX_66(0x0f, 0x2b): /* vmovntpd xmm,m128 */
-                                         /* vmovntpd ymm,m256 */
-        fail_if(ea.type != OP_MEM);
-        /* fall through */
-    case X86EMUL_OPC(0x0f, 0x28):        /* movaps xmm/m128,xmm */
-    case X86EMUL_OPC_VEX(0x0f, 0x28):    /* vmovaps xmm/m128,xmm */
-                                         /* vmovaps ymm/m256,ymm */
-    case X86EMUL_OPC_66(0x0f, 0x28):     /* movapd xmm/m128,xmm */
-    case X86EMUL_OPC_VEX_66(0x0f, 0x28): /* vmovapd xmm/m128,xmm */
-                                         /* vmovapd ymm/m256,ymm */
-    case X86EMUL_OPC(0x0f, 0x29):        /* movaps xmm,xmm/m128 */
-    case X86EMUL_OPC_VEX(0x0f, 0x29):    /* vmovaps xmm,xmm/m128 */
-                                         /* vmovaps ymm,ymm/m256 */
-    case X86EMUL_OPC_66(0x0f, 0x29):     /* movapd xmm,xmm/m128 */
-    case X86EMUL_OPC_VEX_66(0x0f, 0x29): /* vmovapd xmm,xmm/m128 */
-                                         /* vmovapd ymm,ymm/m256 */
-    case X86EMUL_OPC(0x0f, 0x10):        /* movups xmm/m128,xmm */
-    case X86EMUL_OPC_VEX(0x0f, 0x10):    /* vmovups xmm/m128,xmm */
-                                         /* vmovups ymm/m256,ymm */
-    case X86EMUL_OPC_66(0x0f, 0x10):     /* movupd xmm/m128,xmm */
-    case X86EMUL_OPC_VEX_66(0x0f, 0x10): /* vmovupd xmm/m128,xmm */
-                                         /* vmovupd ymm/m256,ymm */
-    case X86EMUL_OPC_F3(0x0f, 0x10):     /* movss xmm/m32,xmm */
-    case X86EMUL_OPC_VEX_F3(0x0f, 0x10): /* vmovss xmm/m32,xmm */
-    case X86EMUL_OPC_F2(0x0f, 0x10):     /* movsd xmm/m64,xmm */
-    case X86EMUL_OPC_VEX_F2(0x0f, 0x10): /* vmovsd xmm/m64,xmm */
-    case X86EMUL_OPC(0x0f, 0x11):        /* movups xmm,xmm/m128 */
-    case X86EMUL_OPC_VEX(0x0f, 0x11):    /* vmovups xmm,xmm/m128 */
-                                         /* vmovups ymm,ymm/m256 */
-    case X86EMUL_OPC_66(0x0f, 0x11):     /* movupd xmm,xmm/m128 */
-    case X86EMUL_OPC_VEX_66(0x0f, 0x11): /* vmovupd xmm,xmm/m128 */
-                                         /* vmovupd ymm,ymm/m256 */
-    case X86EMUL_OPC_F3(0x0f, 0x11):     /* movss xmm,xmm/m32 */
-    case X86EMUL_OPC_VEX_F3(0x0f, 0x11): /* vmovss xmm,xmm/m32 */
-    case X86EMUL_OPC_F2(0x0f, 0x11):     /* movsd xmm,xmm/m64 */
-    case X86EMUL_OPC_VEX_F2(0x0f, 0x11): /* vmovsd xmm,xmm/m64 */
-    {
-        uint8_t *buf = get_stub(stub);
+#define CASE_SIMD_PACKED_INT(pfx, opc)       \
+    case X86EMUL_OPC(pfx, opc):              \
+    case X86EMUL_OPC_66(pfx, opc)
+#define CASE_SIMD_SINGLE_FP(kind, pfx, opc)  \
+    case X86EMUL_OPC##kind(pfx, opc):        \
+    case X86EMUL_OPC##kind##_F3(pfx, opc)
+#define CASE_SIMD_DOUBLE_FP(kind, pfx, opc)  \
+    case X86EMUL_OPC##kind##_66(pfx, opc):   \
+    case X86EMUL_OPC##kind##_F2(pfx, opc)
+#define CASE_SIMD_ALL_FP(kind, pfx, opc)     \
+    CASE_SIMD_SINGLE_FP(kind, pfx, opc):     \
+    CASE_SIMD_DOUBLE_FP(kind, pfx, opc)
+#define CASE_SIMD_PACKED_FP(kind, pfx, opc)  \
+    case X86EMUL_OPC##kind(pfx, opc):        \
+    case X86EMUL_OPC##kind##_66(pfx, opc)
+#define CASE_SIMD_SCALAR_FP(kind, pfx, opc)  \
+    case X86EMUL_OPC##kind##_F3(pfx, opc):   \
+    case X86EMUL_OPC##kind##_F2(pfx, opc)
 
-        fic.insn_bytes = 5;
-        buf[0] = 0x3e;
-        buf[1] = 0x3e;
-        buf[2] = 0x0f;
-        buf[3] = b;
-        buf[4] = modrm;
-        buf[5] = 0xc3;
+    CASE_SIMD_SCALAR_FP(, 0x0f, 0x2b):     /* movnts{s,d} xmm,mem */
+        host_and_vcpu_must_have(sse4a);
+        /* fall through */
+    CASE_SIMD_PACKED_FP(, 0x0f, 0x2b):     /* movntp{s,d} xmm,m128 */
+    CASE_SIMD_PACKED_FP(_VEX, 0x0f, 0x2b): /* vmovntp{s,d} {x,y}mm,mem */
+        generate_exception_if(ea.type != OP_MEM, EXC_UD);
+        sfence = true;
+        /* fall through */
+    CASE_SIMD_ALL_FP(, 0x0f, 0x10):        /* mov{up,s}{s,d} xmm/mem,xmm */
+    CASE_SIMD_ALL_FP(, 0x0f, 0x11):        /* mov{up,s}{s,d} xmm,xmm/mem */
+    CASE_SIMD_PACKED_FP(_VEX, 0x0f, 0x10): /* vmovup{s,d} {x,y}mm/mem,{x,y}mm */
+    CASE_SIMD_SCALAR_FP(_VEX, 0x0f, 0x10): /* vmovs{s,d} mem,xmm */
+                                           /* vmovs{s,d} xmm,xmm,xmm */
+    CASE_SIMD_PACKED_FP(_VEX, 0x0f, 0x11): /* vmovup{s,d} {x,y}mm,{x,y}mm/mem */
+    CASE_SIMD_SCALAR_FP(_VEX, 0x0f, 0x11): /* vmovs{s,d} xmm,mem */
+                                           /* vmovs{s,d} xmm,xmm,xmm */
+    CASE_SIMD_PACKED_FP(, 0x0f, 0x14):     /* unpcklp{s,d} xmm/m128,xmm */
+    CASE_SIMD_PACKED_FP(_VEX, 0x0f, 0x14): /* vunpcklp{s,d} {x,y}mm/mem,{x,y}mm,{x,y}mm */
+    CASE_SIMD_PACKED_FP(, 0x0f, 0x15):     /* unpckhp{s,d} xmm/m128,xmm */
+    CASE_SIMD_PACKED_FP(_VEX, 0x0f, 0x15): /* vunpckhp{s,d} {x,y}mm/mem,{x,y}mm,{x,y}mm */
+    CASE_SIMD_PACKED_FP(, 0x0f, 0x28):     /* movap{s,d} xmm/m128,xmm */
+    CASE_SIMD_PACKED_FP(_VEX, 0x0f, 0x28): /* vmovap{s,d} {x,y}mm/mem,{x,y}mm */
+    CASE_SIMD_PACKED_FP(, 0x0f, 0x29):     /* movap{s,d} xmm,xmm/m128 */
+    CASE_SIMD_PACKED_FP(_VEX, 0x0f, 0x29): /* vmovap{s,d} {x,y}mm,{x,y}mm/mem */
+    CASE_SIMD_ALL_FP(, 0x0f, 0x51):        /* sqrt{p,s}{s,d} xmm/mem,xmm */
+    CASE_SIMD_ALL_FP(_VEX, 0x0f, 0x51):    /* vsqrtp{s,d} {x,y}mm/mem,{x,y}mm */
+                                           /* vsqrts{s,d} xmm,xmm,xmm */
+    CASE_SIMD_SINGLE_FP(, 0x0f, 0x52):     /* rsqrt{p,s}s xmm/mem,xmm */
+    CASE_SIMD_SINGLE_FP(_VEX, 0x0f, 0x52): /* vrsqrtps {x,y}mm/mem,{x,y}mm */
+                                           /* vrsqrtss xmm,xmm,xmm */
+    CASE_SIMD_SINGLE_FP(, 0x0f, 0x53):     /* rcp{p,s}s xmm/mem,xmm */
+    CASE_SIMD_SINGLE_FP(_VEX, 0x0f, 0x53): /* vrcpps {x,y}mm/mem,{x,y}mm */
+                                           /* vrcpss xmm,xmm,xmm */
+    CASE_SIMD_PACKED_FP(, 0x0f, 0x54):     /* andp{s,d} xmm/m128,xmm */
+    CASE_SIMD_PACKED_FP(_VEX, 0x0f, 0x54): /* vandp{s,d} {x,y}mm/mem,{x,y}mm,{x,y}mm */
+    CASE_SIMD_PACKED_FP(, 0x0f, 0x55):     /* andnp{s,d} xmm/m128,xmm */
+    CASE_SIMD_PACKED_FP(_VEX, 0x0f, 0x55): /* vandnp{s,d} {x,y}mm/mem,{x,y}mm,{x,y}mm */
+    CASE_SIMD_PACKED_FP(, 0x0f, 0x56):     /* orp{s,d} xmm/m128,xmm */
+    CASE_SIMD_PACKED_FP(_VEX, 0x0f, 0x56): /* vorp{s,d} {x,y}mm/mem,{x,y}mm,{x,y}mm */
+    CASE_SIMD_PACKED_FP(, 0x0f, 0x57):     /* xorp{s,d} xmm/m128,xmm */
+    CASE_SIMD_PACKED_FP(_VEX, 0x0f, 0x57): /* vxorp{s,d} {x,y}mm/mem,{x,y}mm,{x,y}mm */
+    CASE_SIMD_ALL_FP(, 0x0f, 0x58):        /* add{p,s}{s,d} xmm/mem,xmm */
+    CASE_SIMD_ALL_FP(_VEX, 0x0f, 0x58):    /* vadd{p,s}{s,d} {x,y}mm/mem,{x,y}mm,{x,y}mm */
+    CASE_SIMD_ALL_FP(, 0x0f, 0x59):        /* mul{p,s}{s,d} xmm/mem,xmm */
+    CASE_SIMD_ALL_FP(_VEX, 0x0f, 0x59):    /* vmul{p,s}{s,d} {x,y}mm/mem,{x,y}mm,{x,y}mm */
+    CASE_SIMD_ALL_FP(, 0x0f, 0x5c):        /* sub{p,s}{s,d} xmm/mem,xmm */
+    CASE_SIMD_ALL_FP(_VEX, 0x0f, 0x5c):    /* vsub{p,s}{s,d} {x,y}mm/mem,{x,y}mm,{x,y}mm */
+    CASE_SIMD_ALL_FP(, 0x0f, 0x5d):        /* min{p,s}{s,d} xmm/mem,xmm */
+    CASE_SIMD_ALL_FP(_VEX, 0x0f, 0x5d):    /* vmin{p,s}{s,d} {x,y}mm/mem,{x,y}mm,{x,y}mm */
+    CASE_SIMD_ALL_FP(, 0x0f, 0x5e):        /* div{p,s}{s,d} xmm/mem,xmm */
+    CASE_SIMD_ALL_FP(_VEX, 0x0f, 0x5e):    /* vdiv{p,s}{s,d} {x,y}mm/mem,{x,y}mm,{x,y}mm */
+    CASE_SIMD_ALL_FP(, 0x0f, 0x5f):        /* max{p,s}{s,d} xmm/mem,xmm */
+    CASE_SIMD_ALL_FP(_VEX, 0x0f, 0x5f):    /* vmax{p,s}{s,d} {x,y}mm/mem,{x,y}mm,{x,y}mm */
         if ( vex.opcx == vex_none )
         {
             if ( vex.pfx & VEX_PREFIX_DOUBLE_MASK )
                 vcpu_must_have(sse2);
             else
                 vcpu_must_have(sse);
-            ea.bytes = 16;
-            SET_SSE_PREFIX(buf[0], vex.pfx);
             get_fpu(X86EMUL_FPU_xmm, &fic);
         }
         else
         {
-            fail_if((vex.reg != 0xf) &&
-                    ((ea.type == OP_MEM) ||
-                     !(vex.pfx & VEX_PREFIX_SCALAR_MASK)));
             host_and_vcpu_must_have(avx);
+            fail_if((vex.pfx & VEX_PREFIX_SCALAR_MASK) && vex.l);
             get_fpu(X86EMUL_FPU_ymm, &fic);
-            ea.bytes = 16 << vex.l;
         }
-        if ( vex.pfx & VEX_PREFIX_SCALAR_MASK )
-            ea.bytes = vex.pfx & VEX_PREFIX_DOUBLE_MASK ? 8 : 4;
+    simd_0f_common:
+    {
+        uint8_t *buf = get_stub(stub);
+
+        buf[0] = 0x3e;
+        buf[1] = 0x3e;
+        buf[2] = 0x0f;
+        buf[3] = b;
+        buf[4] = modrm;
         if ( ea.type == OP_MEM )
         {
-            uint32_t mxcsr = 0;
-
-            if ( b < 0x28 )
-                mxcsr = MXCSR_MM;
-            else if ( vcpu_has_misalignsse() )
-                asm ( "stmxcsr %0" : "=m" (mxcsr) );
-            generate_exception_if(!(mxcsr & MXCSR_MM) &&
-                                  !is_aligned(ea.mem.seg, ea.mem.off, ea.bytes,
-                                              ctxt, ops),
-                                  EXC_GP, 0);
-            if ( !(b & 1) )
-                rc = ops->read(ea.mem.seg, ea.mem.off+0, mmvalp,
-                               ea.bytes, ctxt);
-            else
-                fail_if(!ops->write); /* Check before running the stub. */
             /* convert memory operand to (%rAX) */
             rex_prefix &= ~REX_B;
             vex.b = 1;
             buf[4] &= 0x38;
         }
-        if ( !rc )
-        {
-           copy_REX_VEX(buf, rex_prefix, vex);
-           asm volatile ( "call *%0" : : "r" (stub.func), "a" (mmvalp)
-                                     : "memory" );
-        }
-        put_fpu(&fic);
-        put_stub(stub);
-        if ( !rc && (b & 1) && (ea.type == OP_MEM) )
-        {
-            ASSERT(ops->write); /* See the fail_if() above. */
-            rc = ops->write(ea.mem.seg, ea.mem.off, mmvalp,
-                            ea.bytes, ctxt);
-        }
-        if ( rc )
-            goto done;
-        dst.type = OP_NONE;
+        fic.insn_bytes = 5;
         break;
     }
 
@@ -5338,6 +5469,125 @@ x86_emulate(
         break;
     }
 
+    CASE_SIMD_PACKED_INT(0x0f, 0x60):    /* punpcklbw {,x}mm/mem,{,x}mm */
+    case X86EMUL_OPC_VEX_66(0x0f, 0x60): /* vpunpcklbw {x,y}mm/mem,{x,y}mm,{x,y}mm */
+    CASE_SIMD_PACKED_INT(0x0f, 0x61):    /* punpcklwd {,x}mm/mem,{,x}mm */
+    case X86EMUL_OPC_VEX_66(0x0f, 0x61): /* vpunpcklwd {x,y}mm/mem,{x,y}mm,{x,y}mm */
+    CASE_SIMD_PACKED_INT(0x0f, 0x62):    /* punpckldq {,x}mm/mem,{,x}mm */
+    case X86EMUL_OPC_VEX_66(0x0f, 0x62): /* vpunpckldq {x,y}mm/mem,{x,y}mm,{x,y}mm */
+    CASE_SIMD_PACKED_INT(0x0f, 0x68):    /* punpckhbw {,x}mm/mem,{,x}mm */
+    case X86EMUL_OPC_VEX_66(0x0f, 0x68): /* vpunpckhbw {x,y}mm/mem,{x,y}mm,{x,y}mm */
+    CASE_SIMD_PACKED_INT(0x0f, 0x69):    /* punpckhwd {,x}mm/mem,{,x}mm */
+    case X86EMUL_OPC_VEX_66(0x0f, 0x69): /* vpunpckhwd {x,y}mm/mem,{x,y}mm,{x,y}mm */
+    CASE_SIMD_PACKED_INT(0x0f, 0x6a):    /* punpckhdq {,x}mm/mem,{,x}mm */
+    case X86EMUL_OPC_VEX_66(0x0f, 0x6a): /* vpunpckhdq {x,y}mm/mem,{x,y}mm,{x,y}mm */
+        op_bytes = vex.pfx ? 16 << vex.l : 4;
+        /* fall through */
+    CASE_SIMD_PACKED_INT(0x0f, 0x63):    /* packsswb {,x}mm/mem,{,x}mm */
+    case X86EMUL_OPC_VEX_66(0x0f, 0x63): /* vpacksswb {x,y}mm/mem,{x,y}mm,{x,y}mm */
+    CASE_SIMD_PACKED_INT(0x0f, 0x64):    /* pcmpgtb {,x}mm/mem,{,x}mm */
+    case X86EMUL_OPC_VEX_66(0x0f, 0x64): /* vpcmpgtb {x,y}mm/mem,{x,y}mm,{x,y}mm */
+    CASE_SIMD_PACKED_INT(0x0f, 0x65):    /* pcmpgtw {,x}mm/mem,{,x}mm */
+    case X86EMUL_OPC_VEX_66(0x0f, 0x65): /* vpcmpgtw {x,y}mm/mem,{x,y}mm,{x,y}mm */
+    CASE_SIMD_PACKED_INT(0x0f, 0x66):    /* pcmpgtd {,x}mm/mem,{,x}mm */
+    case X86EMUL_OPC_VEX_66(0x0f, 0x66): /* vpcmpgtd {x,y}mm/mem,{x,y}mm,{x,y}mm */
+    CASE_SIMD_PACKED_INT(0x0f, 0x67):    /* packuswb {,x}mm/mem,{,x}mm */
+    case X86EMUL_OPC_VEX_66(0x0f, 0x67): /* vpackuswb {x,y}mm/mem,{x,y}mm,{x,y}mm */
+    CASE_SIMD_PACKED_INT(0x0f, 0x6b):    /* packssdw {,x}mm/mem,{,x}mm */
+    case X86EMUL_OPC_VEX_66(0x0f, 0x6b): /* vpackssdw {x,y}mm/mem,{x,y}mm,{x,y}mm */
+    case X86EMUL_OPC_66(0x0f, 0x6c):     /* punpcklqdq xmm/m128,xmm */
+    case X86EMUL_OPC_VEX_66(0x0f, 0x6c): /* vpunpcklqdq {x,y}mm/mem,{x,y}mm,{x,y}mm */
+    case X86EMUL_OPC_66(0x0f, 0x6d):     /* punpckhqdq xmm/m128,xmm */
+    case X86EMUL_OPC_VEX_66(0x0f, 0x6d): /* vpunpckhqdq {x,y}mm/mem,{x,y}mm,{x,y}mm */
+    CASE_SIMD_PACKED_INT(0x0f, 0x74):    /* pcmpeqb {,x}mm/mem,{,x}mm */
+    case X86EMUL_OPC_VEX_66(0x0f, 0x74): /* vpcmpeqb {x,y}mm/mem,{x,y}mm,{x,y}mm */
+    CASE_SIMD_PACKED_INT(0x0f, 0x75):    /* pcmpeqw {,x}mm/mem,{,x}mm */
+    case X86EMUL_OPC_VEX_66(0x0f, 0x75): /* vpcmpeqw {x,y}mm/mem,{x,y}mm,{x,y}mm */
+    CASE_SIMD_PACKED_INT(0x0f, 0x76):    /* pcmpeqd {,x}mm/mem,{,x}mm */
+    case X86EMUL_OPC_VEX_66(0x0f, 0x76): /* vpcmpeqd {x,y}mm/mem,{x,y}mm,{x,y}mm */
+    case X86EMUL_OPC_66(0x0f, 0xd4):     /* paddq xmm/m128,xmm */
+    case X86EMUL_OPC_VEX_66(0x0f, 0xd4): /* vpaddq {x,y}mm/mem,{x,y}mm,{x,y}mm */
+    CASE_SIMD_PACKED_INT(0x0f, 0xd5):    /* pmullw {,x}mm/mem,{,x}mm */
+    case X86EMUL_OPC_VEX_66(0x0f, 0xd5): /* vpmullw {x,y}mm/mem,{x,y}mm,{x,y}mm */
+    CASE_SIMD_PACKED_INT(0x0f, 0xd8):    /* psubusb {,x}mm/mem,{,x}mm */
+    case X86EMUL_OPC_VEX_66(0x0f, 0xd8): /* vpsubusb {x,y}mm/mem,{x,y}mm,{x,y}mm */
+    CASE_SIMD_PACKED_INT(0x0f, 0xd9):    /* psubusw {,x}mm/mem,{,x}mm */
+    case X86EMUL_OPC_VEX_66(0x0f, 0xd9): /* vpsubusw {x,y}mm/mem,{x,y}mm,{x,y}mm */
+    case X86EMUL_OPC_66(0x0f, 0xda):     /* pminub xmm/m128,xmm */
+    case X86EMUL_OPC_VEX_66(0x0f, 0xda): /* vpminub {x,y}mm/mem,{x,y}mm,{x,y}mm */
+    CASE_SIMD_PACKED_INT(0x0f, 0xdb):    /* pand {,x}mm/mem,{,x}mm */
+    case X86EMUL_OPC_VEX_66(0x0f, 0xdb): /* vpand {x,y}mm/mem,{x,y}mm,{x,y}mm */
+    CASE_SIMD_PACKED_INT(0x0f, 0xdc):    /* paddusb {,x}mm/mem,{,x}mm */
+    case X86EMUL_OPC_VEX_66(0x0f, 0xdc): /* vpaddusb {x,y}mm/mem,{x,y}mm,{x,y}mm */
+    CASE_SIMD_PACKED_INT(0x0f, 0xdd):    /* paddusw {,x}mm/mem,{,x}mm */
+    case X86EMUL_OPC_VEX_66(0x0f, 0xdd): /* vpaddusw {x,y}mm/mem,{x,y}mm,{x,y}mm */
+    case X86EMUL_OPC_66(0x0f, 0xde):     /* pmaxub xmm/m128,xmm */
+    case X86EMUL_OPC_VEX_66(0x0f, 0xde): /* vpmaxub {x,y}mm/mem,{x,y}mm,{x,y}mm */
+    CASE_SIMD_PACKED_INT(0x0f, 0xdf):    /* pandn {,x}mm/mem,{,x}mm */
+    case X86EMUL_OPC_VEX_66(0x0f, 0xdf): /* vpandn {x,y}mm/mem,{x,y}mm,{x,y}mm */
+    case X86EMUL_OPC_66(0x0f, 0xe0):     /* pavgb xmm/m128,xmm */
+    case X86EMUL_OPC_VEX_66(0x0f, 0xe0): /* vpavgb {x,y}mm/mem,{x,y}mm,{x,y}mm */
+    case X86EMUL_OPC_66(0x0f, 0xe3):     /* pavgw xmm/m128,xmm */
+    case X86EMUL_OPC_VEX_66(0x0f, 0xe3): /* vpavgw {x,y}mm/mem,{x,y}mm,{x,y}mm */
+    case X86EMUL_OPC_66(0x0f, 0xe4):     /* pmulhuw xmm/m128,xmm */
+    case X86EMUL_OPC_VEX_66(0x0f, 0xe4): /* vpmulhuw {x,y}mm/mem,{x,y}mm,{x,y}mm */
+    CASE_SIMD_PACKED_INT(0x0f, 0xe5):    /* pmulhw {,x}mm/mem,{,x}mm */
+    case X86EMUL_OPC_VEX_66(0x0f, 0xe5): /* vpmulhw {x,y}mm/mem,{x,y}mm,{x,y}mm */
+    CASE_SIMD_PACKED_INT(0x0f, 0xe8):    /* psubsb {,x}mm/mem,{,x}mm */
+    case X86EMUL_OPC_VEX_66(0x0f, 0xe8): /* vpsubsb {x,y}mm/mem,{x,y}mm,{x,y}mm */
+    CASE_SIMD_PACKED_INT(0x0f, 0xe9):    /* psubsw {,x}mm/mem,{,x}mm */
+    case X86EMUL_OPC_VEX_66(0x0f, 0xe9): /* vpsubsw {x,y}mm/mem,{x,y}mm,{x,y}mm */
+    case X86EMUL_OPC_66(0x0f, 0xea):     /* pminsw xmm/m128,xmm */
+    case X86EMUL_OPC_VEX_66(0x0f, 0xea): /* vpminsw {x,y}mm/mem,{x,y}mm,{x,y}mm */
+    CASE_SIMD_PACKED_INT(0x0f, 0xeb):    /* por {,x}mm/mem,{,x}mm */
+    case X86EMUL_OPC_VEX_66(0x0f, 0xeb): /* vpor {x,y}mm/mem,{x,y}mm,{x,y}mm */
+    CASE_SIMD_PACKED_INT(0x0f, 0xec):    /* paddsb {,x}mm/mem,{,x}mm */
+    case X86EMUL_OPC_VEX_66(0x0f, 0xec): /* vpaddsb {x,y}mm/mem,{x,y}mm,{x,y}mm */
+    CASE_SIMD_PACKED_INT(0x0f, 0xed):    /* paddsw {,x}mm/mem,{,x}mm */
+    case X86EMUL_OPC_VEX_66(0x0f, 0xed): /* vpaddsw {x,y}mm/mem,{x,y}mm,{x,y}mm */
+    case X86EMUL_OPC_66(0x0f, 0xee):     /* pmaxsw xmm/m128,xmm */
+    case X86EMUL_OPC_VEX_66(0x0f, 0xee): /* vpmaxsw {x,y}mm/mem,{x,y}mm,{x,y}mm */
+    CASE_SIMD_PACKED_INT(0x0f, 0xef):    /* pxor {,x}mm/mem,{,x}mm */
+    case X86EMUL_OPC_VEX_66(0x0f, 0xef): /* vpxor {x,y}mm/mem,{x,y}mm,{x,y}mm */
+    case X86EMUL_OPC_66(0x0f, 0xf4):     /* pmuludq xmm/m128,xmm */
+    case X86EMUL_OPC_VEX_66(0x0f, 0xf4): /* vpmuludq {x,y}mm/mem,{x,y}mm,{x,y}mm */
+    case X86EMUL_OPC_66(0x0f, 0xf6):     /* psadbw xmm/m128,xmm */
+    case X86EMUL_OPC_VEX_66(0x0f, 0xf6): /* vpsadbw {x,y}mm/mem,{x,y}mm,{x,y}mm */
+    CASE_SIMD_PACKED_INT(0x0f, 0xf8):    /* psubb {,x}mm/mem,{,x}mm */
+    case X86EMUL_OPC_VEX_66(0x0f, 0xf8): /* vpsubb {x,y}mm/mem,{x,y}mm,{x,y}mm */
+    CASE_SIMD_PACKED_INT(0x0f, 0xf9):    /* psubw {,x}mm/mem,{,x}mm */
+    case X86EMUL_OPC_VEX_66(0x0f, 0xf9): /* vpsubw {x,y}mm/mem,{x,y}mm,{x,y}mm */
+    CASE_SIMD_PACKED_INT(0x0f, 0xfa):    /* psubd {,x}mm/mem,{,x}mm */
+    case X86EMUL_OPC_VEX_66(0x0f, 0xfa): /* vpsubd {x,y}mm/mem,{x,y}mm,{x,y}mm */
+    case X86EMUL_OPC_66(0x0f, 0xfb):     /* psubq xmm/m128,xmm */
+    case X86EMUL_OPC_VEX_66(0x0f, 0xfb): /* vpsubq {x,y}mm/mem,{x,y}mm,{x,y}mm */
+    CASE_SIMD_PACKED_INT(0x0f, 0xfc):    /* paddb {,x}mm/mem,{,x}mm */
+    case X86EMUL_OPC_VEX_66(0x0f, 0xfc): /* vpaddb {x,y}mm/mem,{x,y}mm,{x,y}mm */
+    CASE_SIMD_PACKED_INT(0x0f, 0xfd):    /* paddw {,x}mm/mem,{,x}mm */
+    case X86EMUL_OPC_VEX_66(0x0f, 0xfd): /* vpaddw {x,y}mm/mem,{x,y}mm,{x,y}mm */
+    CASE_SIMD_PACKED_INT(0x0f, 0xfe):    /* paddd {,x}mm/mem,{,x}mm */
+    case X86EMUL_OPC_VEX_66(0x0f, 0xfe): /* vpaddd {x,y}mm/mem,{x,y}mm,{x,y}mm */
+    simd_0f_int:
+        if ( vex.opcx != vex_none )
+        {
+            if ( vex.l )
+                host_and_vcpu_must_have(avx2);
+            else
+                host_and_vcpu_must_have(avx);
+            get_fpu(X86EMUL_FPU_ymm, &fic);
+        }
+        else if ( vex.pfx )
+        {
+            vcpu_must_have(sse2);
+            get_fpu(X86EMUL_FPU_xmm, &fic);
+        }
+        else
+        {
+            host_and_vcpu_must_have(mmx);
+            get_fpu(X86EMUL_FPU_mmx, &fic);
+        }
+        goto simd_0f_common;
+
     case X86EMUL_OPC(0x0f, 0xe7):        /* movntq mm,m64 */
     case X86EMUL_OPC_66(0x0f, 0xe7):     /* movntdq xmm,m128 */
     case X86EMUL_OPC_VEX_66(0x0f, 0xe7): /* vmovntdq xmm,m128 */
@@ -5467,6 +5717,82 @@ x86_emulate(
         break;
     }
 
+    CASE_SIMD_PACKED_INT(0x0f, 0x70):    /* pshuf{w,d} $imm8,{,x}mm/mem,{,x}mm */
+    case X86EMUL_OPC_VEX_66(0x0f, 0x70): /* vpshufd $imm8,{x,y}mm/mem,{x,y}mm */
+    case X86EMUL_OPC_VEX_F3(0x0f, 0x70): /* vpshufhw $imm8,{x,y}mm/mem,{x,y}mm */
+    case X86EMUL_OPC_VEX_F2(0x0f, 0x70): /* vpshuflw $imm8,{x,y}mm/mem,{x,y}mm */
+        d = (d & ~SrcMask) | SrcMem | TwoOp;
+        op_bytes = vex.pfx ? 16 << vex.l : 8;
+    simd_0f_int_imm8:
+        if ( vex.opcx != vex_none )
+        {
+            if ( vex.l )
+                host_and_vcpu_must_have(avx2);
+            else
+                host_and_vcpu_must_have(avx);
+            get_fpu(X86EMUL_FPU_ymm, &fic);
+        }
+        else if ( vex.pfx )
+        {
+            vcpu_must_have(sse2);
+            get_fpu(X86EMUL_FPU_xmm, &fic);
+        }
+        else
+        {
+            host_and_vcpu_must_have(mmx);
+            vcpu_must_have(sse);
+            get_fpu(X86EMUL_FPU_mmx, &fic);
+        }
+    simd_0f_imm8:
+    {
+        uint8_t *buf = get_stub(stub);
+
+        buf[0] = 0x3e;
+        buf[1] = 0x3e;
+        buf[2] = 0x0f;
+        buf[3] = b;
+        buf[4] = modrm;
+        if ( ea.type == OP_MEM )
+        {
+            /* Convert memory operand to (%rAX). */
+            rex_prefix &= ~REX_B;
+            vex.b = 1;
+            buf[4] &= 0x38;
+        }
+        buf[5] = imm1;
+        fic.insn_bytes = 6;
+        break;
+    }
+
+    case X86EMUL_OPC_F2(0x0f, 0xf0):     /* lddqu m128,xmm */
+    case X86EMUL_OPC_VEX_F2(0x0f, 0xf0): /* vlddqu mem,{x,y}mm */
+        generate_exception_if(ea.type != OP_MEM, EXC_UD);
+        /* fall through */
+    case X86EMUL_OPC_66(0x0f, 0x7c):     /* haddpd xmm/m128,xmm */
+    case X86EMUL_OPC_F2(0x0f, 0x7c):     /* haddps xmm/m128,xmm */
+    case X86EMUL_OPC_VEX_66(0x0f, 0x7c): /* vhaddpd {x,y}mm/mem,{x,y}mm,{x,y}mm */
+    case X86EMUL_OPC_VEX_F2(0x0f, 0x7c): /* vhaddps {x,y}mm/mem,{x,y}mm,{x,y}mm */
+    case X86EMUL_OPC_66(0x0f, 0x7d):     /* hsubpd xmm/m128,xmm */
+    case X86EMUL_OPC_F2(0x0f, 0x7d):     /* hsubps xmm/m128,xmm */
+    case X86EMUL_OPC_VEX_66(0x0f, 0x7d): /* vhsubpd {x,y}mm/mem,{x,y}mm,{x,y}mm */
+    case X86EMUL_OPC_VEX_F2(0x0f, 0x7d): /* vhsubps {x,y}mm/mem,{x,y}mm,{x,y}mm */
+    case X86EMUL_OPC_66(0x0f, 0xd0):     /* addsubpd xmm/m128,xmm */
+    case X86EMUL_OPC_F2(0x0f, 0xd0):     /* addsubps xmm/m128,xmm */
+    case X86EMUL_OPC_VEX_66(0x0f, 0xd0): /* vaddsubpd {x,y}mm/mem,{x,y}mm,{x,y}mm */
+    case X86EMUL_OPC_VEX_F2(0x0f, 0xd0): /* vaddsubps {x,y}mm/mem,{x,y}mm,{x,y}mm */
+        op_bytes = 16 << vex.l;
+        if ( vex.opcx != vex_none )
+        {
+            host_and_vcpu_must_have(avx);
+            get_fpu(X86EMUL_FPU_ymm, &fic);
+        }
+        else
+        {
+            host_and_vcpu_must_have(sse3);
+            get_fpu(X86EMUL_FPU_xmm, &fic);
+        }
+        goto simd_0f_common;
+
     case X86EMUL_OPC(0x0f, 0x80) ... X86EMUL_OPC(0x0f, 0x8f): /* jcc (near) */
         if ( test_cc(b, _regs._eflags) )
             jmp_rel((int32_t)src.val);
@@ -5767,12 +6093,41 @@ x86_emulate(
         }
         goto add;
 
+    CASE_SIMD_ALL_FP(, 0x0f, 0xc2):        /* cmp{p,s}{s,d} $imm8,xmm/mem,xmm */
+    CASE_SIMD_ALL_FP(_VEX, 0x0f, 0xc2):    /* vcmp{p,s}{s,d} $imm8,{x,y}mm/mem,{x,y}mm */
+    CASE_SIMD_PACKED_FP(, 0x0f, 0xc6):     /* shufp{s,d} $imm8,xmm/mem,xmm */
+    CASE_SIMD_PACKED_FP(_VEX, 0x0f, 0xc6): /* vshufp{s,d} $imm8,{x,y}mm/mem,{x,y}mm */
+        d = (d & ~SrcMask) | SrcMem;
+        if ( vex.opcx == vex_none )
+        {
+            if ( vex.pfx & VEX_PREFIX_DOUBLE_MASK )
+                vcpu_must_have(sse2);
+            else
+                vcpu_must_have(sse);
+            get_fpu(X86EMUL_FPU_xmm, &fic);
+        }
+        else
+        {
+            host_and_vcpu_must_have(avx);
+            fail_if((vex.pfx & VEX_PREFIX_SCALAR_MASK) && vex.l);
+            get_fpu(X86EMUL_FPU_ymm, &fic);
+        }
+        goto simd_0f_imm8;
+
     case X86EMUL_OPC(0x0f, 0xc3): /* movnti */
         /* Ignore the non-temporal hint for now. */
         vcpu_must_have(sse2);
         dst.val = src.val;
+        sfence = true;
         break;
 
+    CASE_SIMD_PACKED_INT(0x0f, 0xc4):      /* pinsrw $imm8,r32/m16,{,x}mm */
+    case X86EMUL_OPC_VEX_66(0x0f, 0xc4):   /* vpinsrw $imm8,r32/m16,xmm,xmm */
+        generate_exception_if(vex.l, EXC_UD);
+        memcpy(mmvalp, &src.val, 2);
+        ea.type = OP_MEM;
+        goto simd_0f_int_imm8;
+
     case X86EMUL_OPC(0x0f, 0xc7): /* Grp9 */
     {
         union {
@@ -5953,6 +6308,46 @@ x86_emulate(
         }
         break;
 
+    CASE_SIMD_PACKED_INT(0x0f, 0xd1):    /* psrlw {,x}mm/mem,{,x}mm */
+    case X86EMUL_OPC_VEX_66(0x0f, 0xd1): /* vpsrlw xmm/m128,{x,y}mm,{x,y}mm */
+    CASE_SIMD_PACKED_INT(0x0f, 0xd2):    /* psrld {,x}mm/mem,{,x}mm */
+    case X86EMUL_OPC_VEX_66(0x0f, 0xd2): /* vpsrld xmm/m128,{x,y}mm,{x,y}mm */
+    CASE_SIMD_PACKED_INT(0x0f, 0xd3):    /* psrlq {,x}mm/mem,{,x}mm */
+    case X86EMUL_OPC_VEX_66(0x0f, 0xd3): /* vpsrlq xmm/m128,{x,y}mm,{x,y}mm */
+    CASE_SIMD_PACKED_INT(0x0f, 0xe1):    /* psraw {,x}mm/mem,{,x}mm */
+    case X86EMUL_OPC_VEX_66(0x0f, 0xe1): /* vpsraw xmm/m128,{x,y}mm,{x,y}mm */
+    CASE_SIMD_PACKED_INT(0x0f, 0xe2):    /* psrad {,x}mm/mem,{,x}mm */
+    case X86EMUL_OPC_VEX_66(0x0f, 0xe2): /* vpsrad xmm/m128,{x,y}mm,{x,y}mm */
+    CASE_SIMD_PACKED_INT(0x0f, 0xf1):    /* psllw {,x}mm/mem,{,x}mm */
+    case X86EMUL_OPC_VEX_66(0x0f, 0xf1): /* vpsllw xmm/m128,{x,y}mm,{x,y}mm */
+    CASE_SIMD_PACKED_INT(0x0f, 0xf2):    /* pslld {,x}mm/mem,{,x}mm */
+    case X86EMUL_OPC_VEX_66(0x0f, 0xf2): /* vpslld xmm/m128,{x,y}mm,{x,y}mm */
+    CASE_SIMD_PACKED_INT(0x0f, 0xf3):    /* psllq {,x}mm/mem,{,x}mm */
+    case X86EMUL_OPC_VEX_66(0x0f, 0xf3): /* vpsllq xmm/m128,{x,y}mm,{x,y}mm */
+        op_bytes = vex.pfx ? 16 : 4;
+        goto simd_0f_int;
+
+    case X86EMUL_OPC(0x0f, 0xd4):        /* paddq mm/m64,mm */
+    case X86EMUL_OPC(0x0f, 0xf4):        /* pmuludq mm/m64,mm */
+    case X86EMUL_OPC(0x0f, 0xfb):        /* psubq mm/m64,mm */
+        host_and_vcpu_must_have(mmx);
+        vcpu_must_have(sse2);
+        get_fpu(X86EMUL_FPU_mmx, &fic);
+        goto simd_0f_common;
+
+    case X86EMUL_OPC(0x0f, 0xda):        /* pminub mm/m64,mm */
+    case X86EMUL_OPC(0x0f, 0xde):        /* pmaxub mm/m64,mm */
+    case X86EMUL_OPC(0x0f, 0xea):        /* pminsw mm/m64,mm */
+    case X86EMUL_OPC(0x0f, 0xee):        /* pmaxsw mm/m64,mm */
+    case X86EMUL_OPC(0x0f, 0xe0):        /* pavgb mm/m64,mm */
+    case X86EMUL_OPC(0x0f, 0xe3):        /* pavgw mm/m64,mm */
+    case X86EMUL_OPC(0x0f, 0xe4):        /* pmulhuw mm/m64,mm */
+    case X86EMUL_OPC(0x0f, 0xf6):        /* psadbw mm/m64,mm */
+        host_and_vcpu_must_have(mmx);
+        vcpu_must_have(sse);
+        get_fpu(X86EMUL_FPU_mmx, &fic);
+        goto simd_0f_common;
+
     case X86EMUL_OPC(0x0f38, 0xf0): /* movbe m,r */
     case X86EMUL_OPC(0x0f38, 0xf1): /* movbe r,m */
         vcpu_must_have(movbe);
@@ -6214,6 +6609,75 @@ x86_emulate(
         goto cannot_emulate;
     }
 
+    if ( state->simd_size )
+    {
+#ifdef __XEN__
+        uint8_t *buf = stub.ptr;
+#else
+        uint8_t *buf = get_stub(stub);
+#endif
+
+        generate_exception_if(!op_bytes, EXC_UD);
+        generate_exception_if(vex.opcx && (d & TwoOp) && vex.reg != 0xf,
+                              EXC_UD);
+
+        if ( !buf )
+            BUG();
+        if ( vex.opcx == vex_none )
+            SET_SSE_PREFIX(buf[0], vex.pfx);
+
+        buf[fic.insn_bytes] = 0xc3;
+        copy_REX_VEX(buf, rex_prefix, vex);
+
+        if ( ea.type == OP_MEM )
+        {
+            uint32_t mxcsr = 0;
+
+            if ( op_bytes < 16 ||
+                 (vex.opcx
+                  ? /* vmov{a,nt}p{s,d} are exceptions. */
+                    ext != ext_0f || ((b | 1) != 0x29 && b != 0x2b)
+                  : /* movup{s,d} and lddqu are exceptions. */
+                    ext == ext_0f && ((b | 1) == 0x11 || b == 0xf0)) )
+                mxcsr = MXCSR_MM;
+            else if ( vcpu_has_misalignsse() )
+                asm ( "stmxcsr %0" : "=m" (mxcsr) );
+            generate_exception_if(!(mxcsr & MXCSR_MM) &&
+                                  !is_aligned(ea.mem.seg, ea.mem.off, op_bytes,
+                                              ctxt, ops),
+                                  EXC_GP, 0);
+            if ( (d & SrcMask) == SrcMem )
+            {
+                rc = ops->read(ea.mem.seg, ea.mem.off, mmvalp, op_bytes, ctxt);
+                if ( rc != X86EMUL_OKAY )
+                    goto done;
+                dst.type = OP_NONE;
+            }
+            else if ( (d & DstMask) == DstMem )
+            {
+                fail_if(!ops->write); /* Check before running the stub. */
+                ASSERT(d & Mov);
+                dst.type = OP_MEM;
+                dst.bytes = op_bytes;
+                dst.mem = ea.mem;
+            }
+            else if ( (d & SrcMask) == SrcMem16 )
+                dst.type = OP_NONE;
+            else
+            {
+                ASSERT_UNREACHABLE();
+                return X86EMUL_UNHANDLEABLE;
+            }
+        }
+        else
+            dst.type = OP_NONE;
+
+        invoke_stub("", "", "+m" (*mmvalp) : "a" (mmvalp));
+
+        put_stub(stub);
+        put_fpu(&fic);
+    }
+
     switch ( dst.type )
     {
     case OP_REG:
@@ -6240,8 +6704,11 @@ x86_emulate(
         else
         {
             fail_if(!ops->write);
-            rc = ops->write(
-                dst.mem.seg, dst.mem.off, &dst.val, dst.bytes, ctxt);
+            rc = ops->write(dst.mem.seg, dst.mem.off,
+                            !state->simd_size ? &dst.val : (void *)mmvalp,
+                            dst.bytes, ctxt);
+            if ( sfence )
+                asm volatile ( "sfence" ::: "memory" );
         }
         if ( rc != 0 )
             goto done;
@@ -6498,22 +6965,6 @@ x86_insn_is_mem_write(const struct x86_e
     case 0x6c: case 0x6d:                /* INS */
     case 0xa4: case 0xa5:                /* MOVS */
     case 0xaa: case 0xab:                /* STOS */
-    case X86EMUL_OPC(0x0f, 0x11):        /* MOVUPS */
-    case X86EMUL_OPC_VEX(0x0f, 0x11):    /* VMOVUPS */
-    case X86EMUL_OPC_66(0x0f, 0x11):     /* MOVUPD */
-    case X86EMUL_OPC_VEX_66(0x0f, 0x11): /* VMOVUPD */
-    case X86EMUL_OPC_F3(0x0f, 0x11):     /* MOVSS */
-    case X86EMUL_OPC_VEX_F3(0x0f, 0x11): /* VMOVSS */
-    case X86EMUL_OPC_F2(0x0f, 0x11):     /* MOVSD */
-    case X86EMUL_OPC_VEX_F2(0x0f, 0x11): /* VMOVSD */
-    case X86EMUL_OPC(0x0f, 0x29):        /* MOVAPS */
-    case X86EMUL_OPC_VEX(0x0f, 0x29):    /* VMOVAPS */
-    case X86EMUL_OPC_66(0x0f, 0x29):     /* MOVAPD */
-    case X86EMUL_OPC_VEX_66(0x0f, 0x29): /* VMOVAPD */
-    case X86EMUL_OPC(0x0f, 0x2b):        /* MOVNTPS */
-    case X86EMUL_OPC_VEX(0x0f, 0x2b):    /* VMOVNTPS */
-    case X86EMUL_OPC_66(0x0f, 0x2b):     /* MOVNTPD */
-    case X86EMUL_OPC_VEX_66(0x0f, 0x2b): /* VMOVNTPD */
     case X86EMUL_OPC(0x0f, 0x7e):        /* MOVD/MOVQ */
     case X86EMUL_OPC_66(0x0f, 0x7e):     /* MOVD/MOVQ */
     case X86EMUL_OPC_VEX_66(0x0f, 0x7e): /* VMOVD/VMOVQ */
--- a/xen/include/asm-x86/cpufeature.h
+++ b/xen/include/asm-x86/cpufeature.h
@@ -71,12 +71,14 @@
 #define cpu_has_xsavec		boot_cpu_has(X86_FEATURE_XSAVEC)
 #define cpu_has_xgetbv1		boot_cpu_has(X86_FEATURE_XGETBV1)
 #define cpu_has_xsaves		boot_cpu_has(X86_FEATURE_XSAVES)
+#define cpu_has_avx2		boot_cpu_has(X86_FEATURE_AVX2)
 #define cpu_has_monitor		boot_cpu_has(X86_FEATURE_MONITOR)
 #define cpu_has_eist		boot_cpu_has(X86_FEATURE_EIST)
 #define cpu_has_hypervisor	boot_cpu_has(X86_FEATURE_HYPERVISOR)
 #define cpu_has_rdrand		boot_cpu_has(X86_FEATURE_RDRAND)
 #define cpu_has_rdseed		boot_cpu_has(X86_FEATURE_RDSEED)
 #define cpu_has_cmp_legacy	boot_cpu_has(X86_FEATURE_CMP_LEGACY)
+#define cpu_has_sse4a		boot_cpu_has(X86_FEATURE_SSE4A)
 #define cpu_has_tbm		boot_cpu_has(X86_FEATURE_TBM)
 
 enum _cache_type {
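
For reference, the operand-size rule that the new simd_packed_int handling
in x86_decode() implements can be summarised by this small standalone
sketch (my own illustration; the names are not the emulator's):

    #include <stdio.h>

    /* Packed-integer operand size as a function of SIMD prefix and VEX.L. */
    enum sse_pfx { pfx_none, pfx_66, pfx_f3, pfx_f2 };

    static unsigned int packed_int_bytes(enum sse_pfx pfx, unsigned int vex_l)
    {
        switch ( pfx )
        {
        case pfx_none: return 8;            /* MMX form */
        case pfx_66:   return 16 << vex_l;  /* SSE xmm, or AVX ymm when VEX.L is set */
        default:       return 0;            /* no valid packed-int encoding */
        }
    }

    int main(void)
    {
        printf("pxor  mm/m64,mm    -> %u bytes\n", packed_int_bytes(pfx_none, 0));
        printf("pxor  xmm/m128,xmm -> %u bytes\n", packed_int_bytes(pfx_66, 0));
        printf("vpxor ymm/m256,... -> %u bytes\n", packed_int_bytes(pfx_66, 1));
        return 0;
    }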



[-- Attachment #2: x86emul-SSE-AVX-0f-mem.patch --]
[-- Type: text/plain, Size: 55443 bytes --]

x86emul: support most memory accessing MMX/SSE/SSE2 insns

This aims at covering most MMX/SSEn/AVX instructions in the 0x0f-escape
space with memory operands. Not covered here are irregular moves,
converts, and {,U}COMIS{S,D} (modifying EFLAGS).

Note that the distinction between simd_*_fp isn't strictly needed, but
I've kept them as separate entries since in an earlier version I needed
them to be separate, and we may well find it useful down the road to
have that distinction.

Also take the opportunity and adjust the vmovdqu test case the new
LDDQU one here has been cloned from: To zero a ymm register we don't
need to go through hoops, as 128-bit AVX insns zero the upper portion
of the destination register, and in the disabled AVX2 code there was a
wrong YMM register used.

Signed-off-by: Jan Beulich <jbeulich@suse.com>

--- a/tools/tests/x86_emulator/test_x86_emulator.c
+++ b/tools/tests/x86_emulator/test_x86_emulator.c
@@ -1656,12 +1656,7 @@ int main(int argc, char **argv)
     {
         decl_insn(vmovdqu_from_mem);
 
-#if 0 /* Don't use AVX2 instructions for now */
-        asm volatile ( "vpcmpgtb %%ymm4, %%ymm4, %%ymm4\n"
-#else
-        asm volatile ( "vpcmpgtb %%xmm4, %%xmm4, %%xmm4\n\t"
-                       "vinsertf128 $1, %%xmm4, %%ymm4, %%ymm4\n"
-#endif
+        asm volatile ( "vpxor %%xmm4, %%xmm4, %%xmm4\n"
                        put_insn(vmovdqu_from_mem, "vmovdqu (%0), %%ymm4")
                        :: "d" (NULL) );
 
@@ -1675,7 +1670,7 @@ int main(int argc, char **argv)
 #if 0 /* Don't use AVX2 instructions for now */
         asm ( "vpcmpeqb %%ymm2, %%ymm2, %%ymm2\n\t"
               "vpcmpeqb %%ymm4, %%ymm2, %%ymm0\n\t"
-              "vpmovmskb %%ymm1, %0" : "=r" (rc) );
+              "vpmovmskb %%ymm0, %0" : "=r" (rc) );
 #else
         asm ( "vextractf128 $1, %%ymm4, %%xmm3\n\t"
               "vpcmpeqb %%xmm2, %%xmm2, %%xmm2\n\t"
@@ -2083,6 +2078,67 @@ int main(int argc, char **argv)
         printf("skipped\n");
 #endif
 
+    printf("%-40s", "Testing lddqu 4(%edx),%xmm4...");
+    if ( stack_exec && cpu_has_sse3 )
+    {
+        decl_insn(lddqu);
+
+        asm volatile ( "pcmpgtb %%xmm4, %%xmm4\n"
+                       put_insn(lddqu, "lddqu 4(%0), %%xmm4")
+                       :: "d" (NULL) );
+
+        set_insn(lddqu);
+        memset(res, 0x55, 64);
+        memset(res + 1, 0xff, 16);
+        regs.edx = (unsigned long)res;
+        rc = x86_emulate(&ctxt, &emulops);
+        if ( rc != X86EMUL_OKAY || !check_eip(lddqu) )
+            goto fail;
+        asm ( "pcmpeqb %%xmm2, %%xmm2\n\t"
+              "pcmpeqb %%xmm4, %%xmm2\n\t"
+              "pmovmskb %%xmm2, %0" : "=r" (rc) );
+        if ( rc != 0xffff )
+            goto fail;
+        printf("okay\n");
+    }
+    else
+        printf("skipped\n");
+
+    printf("%-40s", "Testing vlddqu (%ecx),%ymm4...");
+    if ( stack_exec && cpu_has_avx )
+    {
+        decl_insn(vlddqu);
+
+        asm volatile ( "vpxor %%xmm4, %%xmm4, %%xmm4\n"
+                       put_insn(vlddqu, "vlddqu (%0), %%ymm4")
+                       :: "c" (NULL) );
+
+        set_insn(vlddqu);
+        memset(res + 1, 0xff, 32);
+        regs.ecx = (unsigned long)(res + 1);
+        rc = x86_emulate(&ctxt, &emulops);
+        if ( rc != X86EMUL_OKAY || !check_eip(vlddqu) )
+            goto fail;
+#if 0 /* Don't use AVX2 instructions for now */
+        asm ( "vpcmpeqb %%ymm2, %%ymm2, %%ymm2\n\t"
+              "vpcmpeqb %%ymm4, %%ymm2, %%ymm0\n\t"
+              "vpmovmskb %%ymm0, %0" : "=r" (rc) );
+#else
+        asm ( "vextractf128 $1, %%ymm4, %%xmm3\n\t"
+              "vpcmpeqb %%xmm2, %%xmm2, %%xmm2\n\t"
+              "vpcmpeqb %%xmm4, %%xmm2, %%xmm0\n\t"
+              "vpcmpeqb %%xmm3, %%xmm2, %%xmm1\n\t"
+              "vpmovmskb %%xmm0, %0\n\t"
+              "vpmovmskb %%xmm1, %1" : "=r" (rc), "=r" (i) );
+        rc |= i << 16;
+#endif
+        if ( ~rc )
+            goto fail;
+        printf("okay\n");
+    }
+    else
+        printf("skipped\n");
+
 #undef decl_insn
 #undef put_insn
 #undef set_insn
--- a/tools/tests/x86_emulator/x86_emulate.h
+++ b/tools/tests/x86_emulator/x86_emulate.h
@@ -81,6 +81,12 @@ static inline uint64_t xgetbv(uint32_t x
     (res.d & (1U << 26)) != 0; \
 })
 
+#define cpu_has_sse3 ({ \
+    struct cpuid_leaf res; \
+    emul_test_cpuid(1, 0, &res, NULL); \
+    (res.c & (1U << 0)) != 0; \
+})
+
 #define cpu_has_popcnt ({ \
     struct cpuid_leaf res; \
     emul_test_cpuid(1, 0, &res, NULL); \
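
(Usage note, not part of the patch: further feature checks in the test
harness would follow the same CPUID pattern; a hypothetical SSSE3 check,
for instance, would read leaf 1, ECX bit 9.)

    #define cpu_has_ssse3 ({ \
        struct cpuid_leaf res; \
        emul_test_cpuid(1, 0, &res, NULL); \
        (res.c & (1U << 9)) != 0; \
    })
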
--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -45,6 +45,8 @@
 #define ModRM       (1<<6)
 /* Destination is only written; never read. */
 #define Mov         (1<<7)
+/* VEX/EVEX (SIMD only): 2nd source operand unused (must be all ones) */
+#define TwoOp       Mov
 /* All operands are implicit in the opcode. */
 #define ImplicitOps (DstImplicit|SrcImplicit)
 
@@ -180,104 +182,171 @@ static const opcode_desc_t opcode_table[
     ImplicitOps, ImplicitOps, ByteOp|DstMem|SrcNone|ModRM, DstMem|SrcNone|ModRM
 };
 
-static const opcode_desc_t twobyte_table[256] = {
-    /* 0x00 - 0x07 */
-    ModRM, ImplicitOps|ModRM, DstReg|SrcMem16|ModRM, DstReg|SrcMem16|ModRM,
-    0, ImplicitOps, ImplicitOps, ImplicitOps,
-    /* 0x08 - 0x0F */
-    ImplicitOps, ImplicitOps, 0, ImplicitOps,
-    0, ImplicitOps|ModRM, ImplicitOps, ModRM|SrcImmByte,
-    /* 0x10 - 0x17 */
-    ImplicitOps|ModRM, ImplicitOps|ModRM, ImplicitOps|ModRM, ImplicitOps|ModRM,
-    ImplicitOps|ModRM, ImplicitOps|ModRM, ImplicitOps|ModRM, ImplicitOps|ModRM,
-    /* 0x18 - 0x1F */
-    ImplicitOps|ModRM, ImplicitOps|ModRM, ImplicitOps|ModRM, ImplicitOps|ModRM,
-    ImplicitOps|ModRM, ImplicitOps|ModRM, ImplicitOps|ModRM, ImplicitOps|ModRM,
-    /* 0x20 - 0x27 */
-    DstMem|SrcImplicit|ModRM, DstMem|SrcImplicit|ModRM,
-    DstImplicit|SrcMem|ModRM, DstImplicit|SrcMem|ModRM,
-    0, 0, 0, 0,
-    /* 0x28 - 0x2F */
-    ImplicitOps|ModRM, ImplicitOps|ModRM, ImplicitOps|ModRM, ImplicitOps|ModRM,
-    ImplicitOps|ModRM, ImplicitOps|ModRM, ImplicitOps|ModRM, ImplicitOps|ModRM,
-    /* 0x30 - 0x37 */
-    ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
-    ImplicitOps, ImplicitOps, 0, ImplicitOps,
-    /* 0x38 - 0x3F */
-    DstReg|SrcMem|ModRM, 0, DstReg|SrcImmByte|ModRM, 0, 0, 0, 0, 0,
-    /* 0x40 - 0x47 */
-    DstReg|SrcMem|ModRM|Mov, DstReg|SrcMem|ModRM|Mov,
-    DstReg|SrcMem|ModRM|Mov, DstReg|SrcMem|ModRM|Mov,
-    DstReg|SrcMem|ModRM|Mov, DstReg|SrcMem|ModRM|Mov,
-    DstReg|SrcMem|ModRM|Mov, DstReg|SrcMem|ModRM|Mov,
-    /* 0x48 - 0x4F */
-    DstReg|SrcMem|ModRM|Mov, DstReg|SrcMem|ModRM|Mov,
-    DstReg|SrcMem|ModRM|Mov, DstReg|SrcMem|ModRM|Mov,
-    DstReg|SrcMem|ModRM|Mov, DstReg|SrcMem|ModRM|Mov,
-    DstReg|SrcMem|ModRM|Mov, DstReg|SrcMem|ModRM|Mov,
-    /* 0x50 - 0x5F */
-    ModRM, ModRM, ModRM, ModRM, ModRM, ModRM, ModRM, ModRM,
-    ModRM, ModRM, ModRM, ModRM, ModRM, ModRM, ModRM, ModRM,
-    /* 0x60 - 0x6F */
-    ModRM, ModRM, ModRM, ModRM, ModRM, ModRM, ModRM, ModRM,
-    ModRM, ModRM, ModRM, ModRM, ModRM, ModRM, ModRM, ImplicitOps|ModRM,
-    /* 0x70 - 0x7F */
-    SrcImmByte|ModRM, SrcImmByte|ModRM, SrcImmByte|ModRM, SrcImmByte|ModRM,
-    ModRM, ModRM, ModRM, ImplicitOps,
-    ModRM, ModRM, 0, 0, ModRM, ModRM, ImplicitOps|ModRM, ImplicitOps|ModRM,
-    /* 0x80 - 0x87 */
-    DstImplicit|SrcImm, DstImplicit|SrcImm,
-    DstImplicit|SrcImm, DstImplicit|SrcImm,
-    DstImplicit|SrcImm, DstImplicit|SrcImm,
-    DstImplicit|SrcImm, DstImplicit|SrcImm,
-    /* 0x88 - 0x8F */
-    DstImplicit|SrcImm, DstImplicit|SrcImm,
-    DstImplicit|SrcImm, DstImplicit|SrcImm,
-    DstImplicit|SrcImm, DstImplicit|SrcImm,
-    DstImplicit|SrcImm, DstImplicit|SrcImm,
-    /* 0x90 - 0x97 */
-    ByteOp|DstMem|SrcNone|ModRM|Mov, ByteOp|DstMem|SrcNone|ModRM|Mov,
-    ByteOp|DstMem|SrcNone|ModRM|Mov, ByteOp|DstMem|SrcNone|ModRM|Mov,
-    ByteOp|DstMem|SrcNone|ModRM|Mov, ByteOp|DstMem|SrcNone|ModRM|Mov,
-    ByteOp|DstMem|SrcNone|ModRM|Mov, ByteOp|DstMem|SrcNone|ModRM|Mov,
-    /* 0x98 - 0x9F */
-    ByteOp|DstMem|SrcNone|ModRM|Mov, ByteOp|DstMem|SrcNone|ModRM|Mov,
-    ByteOp|DstMem|SrcNone|ModRM|Mov, ByteOp|DstMem|SrcNone|ModRM|Mov,
-    ByteOp|DstMem|SrcNone|ModRM|Mov, ByteOp|DstMem|SrcNone|ModRM|Mov,
-    ByteOp|DstMem|SrcNone|ModRM|Mov, ByteOp|DstMem|SrcNone|ModRM|Mov,
-    /* 0xA0 - 0xA7 */
-    ImplicitOps|Mov, ImplicitOps|Mov, ImplicitOps, DstBitBase|SrcReg|ModRM,
-    DstMem|SrcImmByte|ModRM, DstMem|SrcReg|ModRM, ModRM, ModRM,
-    /* 0xA8 - 0xAF */
-    ImplicitOps|Mov, ImplicitOps|Mov, ImplicitOps, DstBitBase|SrcReg|ModRM,
-    DstMem|SrcImmByte|ModRM, DstMem|SrcReg|ModRM,
-    ImplicitOps|ModRM, DstReg|SrcMem|ModRM,
-    /* 0xB0 - 0xB7 */
-    ByteOp|DstMem|SrcReg|ModRM, DstMem|SrcReg|ModRM,
-    DstReg|SrcMem|ModRM|Mov, DstBitBase|SrcReg|ModRM,
-    DstReg|SrcMem|ModRM|Mov, DstReg|SrcMem|ModRM|Mov,
-    ByteOp|DstReg|SrcMem|ModRM|Mov, DstReg|SrcMem16|ModRM|Mov,
-    /* 0xB8 - 0xBF */
-    DstReg|SrcMem|ModRM, ModRM,
-    DstBitBase|SrcImmByte|ModRM, DstBitBase|SrcReg|ModRM,
-    DstReg|SrcMem|ModRM, DstReg|SrcMem|ModRM,
-    ByteOp|DstReg|SrcMem|ModRM|Mov, DstReg|SrcMem16|ModRM|Mov,
-    /* 0xC0 - 0xC7 */
-    ByteOp|DstMem|SrcReg|ModRM, DstMem|SrcReg|ModRM,
-    SrcImmByte|ModRM, DstMem|SrcReg|ModRM|Mov,
-    SrcImmByte|ModRM, SrcImmByte|ModRM, SrcImmByte|ModRM, ImplicitOps|ModRM,
-    /* 0xC8 - 0xCF */
-    ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
-    ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
-    /* 0xD0 - 0xDF */
-    ModRM, ModRM, ModRM, ModRM, ModRM, ModRM, ImplicitOps|ModRM, ModRM,
-    ModRM, ModRM, ModRM, ModRM, ModRM, ModRM, ModRM, ModRM,
-    /* 0xE0 - 0xEF */
-    ModRM, ModRM, ModRM, ModRM, ModRM, ModRM, ModRM, ImplicitOps|ModRM,
-    ModRM, ModRM, ModRM, ModRM, ModRM, ModRM, ModRM, ModRM,
-    /* 0xF0 - 0xFF */
-    ModRM, ModRM, ModRM, ModRM, ModRM, ModRM, ModRM, ModRM,
-    ModRM, ModRM, ModRM, ModRM, ModRM, ModRM, ModRM, ModRM
+enum simd_opsize {
+    simd_none,
+    /*
+     * Ordinary packed integers:
+     * - 64 bits without prefix 66 (MMX)
+     * - 128 bits with prefix 66 (SSEn)
+     * - 128/256 bits depending on VEX.L (AVX)
+     */
+    simd_packed_int,
+    /*
+     * Ordinary packed/scalar floating point:
+     * - 128 bits without prefix or with prefix 66 (SSEn)
+     * - 128/256 bits depending on VEX.L (AVX)
+     * - 32 bits with prefix F3 (scalar single)
+     * - 64 bits with prefix F2 (scalar double)
+     */
+    simd_any_fp,
+    /*
+     * Packed floating point:
+     * - 128 bits without prefix or with prefix 66 (SSEn)
+     * - 128/256 bits depending on VEX.L (AVX)
+     */
+    simd_packed_fp,
+    /*
+     * Single precision packed/scalar floating point:
+     * - 128 bits without prefix (SSEn)
+     * - 128/256 bits depending on VEX.L, no prefix (AVX)
+     * - 32 bits with prefix F3 (scalar)
+     */
+    simd_single_fp,
+    /* Operand size encoded in non-standard way. */
+    simd_other
+};
+typedef uint8_t simd_opsize_t;
+
+static const struct {
+    opcode_desc_t desc;
+    simd_opsize_t size;
+} twobyte_table[256] = {
+    [0x00] = { ModRM },
+    [0x01] = { ImplicitOps|ModRM },
+    [0x02] = { DstReg|SrcMem16|ModRM },
+    [0x03] = { DstReg|SrcMem16|ModRM },
+    [0x05] = { ImplicitOps },
+    [0x06] = { ImplicitOps },
+    [0x07] = { ImplicitOps },
+    [0x08] = { ImplicitOps },
+    [0x09] = { ImplicitOps },
+    [0x0b] = { ImplicitOps },
+    [0x0d] = { ImplicitOps|ModRM },
+    [0x0e] = { ImplicitOps },
+    [0x0f] = { ModRM|SrcImmByte },
+    [0x10] = { DstImplicit|SrcMem|ModRM|Mov, simd_any_fp },
+    [0x11] = { DstMem|SrcImplicit|ModRM|Mov, simd_any_fp },
+    [0x12] = { ImplicitOps|ModRM },
+    [0x13] = { ImplicitOps|ModRM },
+    [0x14 ... 0x15] = { DstImplicit|SrcMem|ModRM, simd_packed_fp },
+    [0x16] = { ImplicitOps|ModRM },
+    [0x17] = { ImplicitOps|ModRM },
+    [0x18] = { ImplicitOps|ModRM },
+    [0x19] = { ImplicitOps|ModRM },
+    [0x1a] = { ImplicitOps|ModRM },
+    [0x1b] = { ImplicitOps|ModRM },
+    [0x1c] = { ImplicitOps|ModRM },
+    [0x1d] = { ImplicitOps|ModRM },
+    [0x1e] = { ImplicitOps|ModRM },
+    [0x1f] = { ImplicitOps|ModRM },
+    [0x20 ... 0x21] = { DstMem|SrcImplicit|ModRM },
+    [0x22 ... 0x23] = { DstImplicit|SrcMem|ModRM },
+    [0x28] = { DstImplicit|SrcMem|ModRM|Mov, simd_packed_fp },
+    [0x29] = { DstMem|SrcImplicit|ModRM|Mov, simd_packed_fp },
+    [0x2a] = { ImplicitOps|ModRM },
+    [0x2b] = { DstMem|SrcImplicit|ModRM|Mov, simd_any_fp },
+    [0x2c ... 0x2d] = { ImplicitOps|ModRM },
+    [0x2e ... 0x2f] = { ImplicitOps|ModRM },
+    [0x30] = { ImplicitOps },
+    [0x31] = { ImplicitOps },
+    [0x32] = { ImplicitOps },
+    [0x33] = { ImplicitOps },
+    [0x34] = { ImplicitOps },
+    [0x35] = { ImplicitOps },
+    [0x37] = { ImplicitOps },
+    [0x38] = { DstReg|SrcMem|ModRM },
+    [0x3a] = { DstReg|SrcImmByte|ModRM },
+    [0x40 ... 0x4f] = { DstReg|SrcMem|ModRM|Mov },
+    [0x50] = { ModRM },
+    [0x51] = { DstImplicit|SrcMem|ModRM|TwoOp, simd_any_fp },
+    [0x52 ... 0x53] = { DstImplicit|SrcMem|ModRM|TwoOp, simd_single_fp },
+    [0x54 ... 0x57] = { DstImplicit|SrcMem|ModRM, simd_packed_fp },
+    [0x58 ... 0x59] = { DstImplicit|SrcMem|ModRM, simd_any_fp },
+    [0x5a] = { ModRM },
+    [0x5b] = { ModRM },
+    [0x5c ... 0x5f] = { DstImplicit|SrcMem|ModRM, simd_any_fp },
+    [0x60 ... 0x62] = { DstImplicit|SrcMem|ModRM, simd_other },
+    [0x63 ... 0x67] = { DstImplicit|SrcMem|ModRM, simd_packed_int },
+    [0x68 ... 0x6a] = { DstImplicit|SrcMem|ModRM, simd_other },
+    [0x6b] = { DstImplicit|SrcMem|ModRM, simd_packed_int },
+    [0x6c ... 0x6d] = { DstImplicit|SrcMem|ModRM, simd_packed_int },
+    [0x6e ... 0x6f] = { ImplicitOps|ModRM },
+    [0x70] = { SrcImmByte|ModRM|TwoOp, simd_other },
+    [0x71 ... 0x73] = { SrcImmByte|ModRM },
+    [0x74 ... 0x76] = { DstImplicit|SrcMem|ModRM, simd_packed_int },
+    [0x77] = { DstImplicit|SrcNone },
+    [0x78] = { ModRM },
+    [0x79] = { ModRM },
+    [0x7c] = { DstImplicit|SrcMem|ModRM, simd_other },
+    [0x7d] = { DstImplicit|SrcMem|ModRM, simd_other },
+    [0x7e ... 0x7f] = { ImplicitOps|ModRM },
+    [0x80 ... 0x8f] = { DstImplicit|SrcImm },
+    [0x90 ... 0x9f] = { ByteOp|DstMem|SrcNone|ModRM|Mov },
+    [0xa0 ... 0xa1] = { ImplicitOps|Mov },
+    [0xa2] = { ImplicitOps },
+    [0xa3] = { DstBitBase|SrcReg|ModRM },
+    [0xa4] = { DstMem|SrcImmByte|ModRM },
+    [0xa5] = { DstMem|SrcReg|ModRM },
+    [0xa6] = { ModRM },
+    [0xa7] = { ModRM },
+    [0xa8 ... 0xa9] = { ImplicitOps|Mov },
+    [0xaa] = { ImplicitOps },
+    [0xab] = { DstBitBase|SrcReg|ModRM },
+    [0xac] = { DstMem|SrcImmByte|ModRM },
+    [0xad] = { DstMem|SrcReg|ModRM },
+    [0xae] = { ImplicitOps|ModRM },
+    [0xaf] = { DstReg|SrcMem|ModRM },
+    [0xb0] = { ByteOp|DstMem|SrcReg|ModRM },
+    [0xb1] = { DstMem|SrcReg|ModRM },
+    [0xb2] = { DstReg|SrcMem|ModRM|Mov },
+    [0xb3] = { DstBitBase|SrcReg|ModRM },
+    [0xb4 ... 0xb5] = { DstReg|SrcMem|ModRM|Mov },
+    [0xb6] = { ByteOp|DstReg|SrcMem|ModRM|Mov },
+    [0xb7] = { DstReg|SrcMem16|ModRM|Mov },
+    [0xb8] = { DstReg|SrcMem|ModRM },
+    [0xb9] = { ModRM },
+    [0xba] = { DstBitBase|SrcImmByte|ModRM },
+    [0xbb] = { DstBitBase|SrcReg|ModRM },
+    [0xbc ... 0xbd] = { DstReg|SrcMem|ModRM },
+    [0xbe] = { ByteOp|DstReg|SrcMem|ModRM|Mov },
+    [0xbf] = { DstReg|SrcMem16|ModRM|Mov },
+    [0xc0] = { ByteOp|DstMem|SrcReg|ModRM },
+    [0xc1] = { DstMem|SrcReg|ModRM },
+    [0xc2] = { DstImplicit|SrcImmByte|ModRM, simd_any_fp },
+    [0xc3] = { DstMem|SrcReg|ModRM|Mov },
+    [0xc4] = { DstReg|SrcImmByte|ModRM, simd_packed_int },
+    [0xc5] = { SrcImmByte|ModRM },
+    [0xc6] = { DstImplicit|SrcImmByte|ModRM, simd_packed_fp },
+    [0xc7] = { ImplicitOps|ModRM },
+    [0xc8 ... 0xcf] = { ImplicitOps },
+    [0xd0] = { DstImplicit|SrcMem|ModRM, simd_other },
+    [0xd1 ... 0xd3] = { DstImplicit|SrcMem|ModRM, simd_other },
+    [0xd4 ... 0xd5] = { DstImplicit|SrcMem|ModRM, simd_packed_int },
+    [0xd6] = { ImplicitOps|ModRM },
+    [0xd7] = { ModRM },
+    [0xd8 ... 0xdf] = { DstImplicit|SrcMem|ModRM, simd_packed_int },
+    [0xe0] = { DstImplicit|SrcMem|ModRM, simd_packed_int },
+    [0xe1 ... 0xe2] = { DstImplicit|SrcMem|ModRM, simd_other },
+    [0xe3 ... 0xe5] = { DstImplicit|SrcMem|ModRM, simd_packed_int },
+    [0xe6] = { ModRM },
+    [0xe7] = { ImplicitOps|ModRM },
+    [0xe8 ... 0xef] = { DstImplicit|SrcMem|ModRM, simd_packed_int },
+    [0xf0] = { DstImplicit|SrcMem|ModRM|Mov, simd_other },
+    [0xf1 ... 0xf3] = { DstImplicit|SrcMem|ModRM, simd_other },
+    [0xf4 ... 0xf6] = { DstImplicit|SrcMem|ModRM, simd_packed_int },
+    [0xf7] = { ModRM },
+    [0xf8 ... 0xfe] = { DstImplicit|SrcMem|ModRM, simd_packed_int },
+    [0xff] = { ModRM }
 };
 
 static const opcode_desc_t xop_table[] = {
@@ -1372,10 +1441,12 @@ static bool vcpu_has(
 #define vcpu_has_lahf_lm()     vcpu_has(0x80000001, ECX,  0, ctxt, ops)
 #define vcpu_has_cr8_legacy()  vcpu_has(0x80000001, ECX,  4, ctxt, ops)
 #define vcpu_has_lzcnt()       vcpu_has(0x80000001, ECX,  5, ctxt, ops)
+#define vcpu_has_sse4a()       vcpu_has(0x80000001, ECX,  6, ctxt, ops)
 #define vcpu_has_misalignsse() vcpu_has(0x80000001, ECX,  7, ctxt, ops)
 #define vcpu_has_tbm()         vcpu_has(0x80000001, ECX, 21, ctxt, ops)
 #define vcpu_has_bmi1()        vcpu_has(         7, EBX,  3, ctxt, ops)
 #define vcpu_has_hle()         vcpu_has(         7, EBX,  4, ctxt, ops)
+#define vcpu_has_avx2()        vcpu_has(         7, EBX,  5, ctxt, ops)
 #define vcpu_has_bmi2()        vcpu_has(         7, EBX,  8, ctxt, ops)
 #define vcpu_has_rtm()         vcpu_has(         7, EBX, 11, ctxt, ops)
 #define vcpu_has_mpx()         vcpu_has(         7, EBX, 14, ctxt, ops)
@@ -1975,6 +2046,7 @@ struct x86_emulate_state {
     opcode_desc_t desc;
     union vex vex;
     union evex evex;
+    enum simd_opsize simd_size;
 
     /*
      * Data operand effective address (usually computed from ModRM).
@@ -2110,7 +2182,8 @@ x86_decode_twobyte(
     case 0x50 ... 0x77:
     case 0x79 ... 0x7f:
     case 0xae:
-    case 0xc2 ... 0xc6:
+    case 0xc2 ... 0xc3:
+    case 0xc5 ... 0xc6:
     case 0xd0 ... 0xfe:
         ctxt->opcode |= MASK_INSR(vex.pfx, X86EMUL_OPC_PFX_MASK);
         break;
@@ -2137,8 +2210,23 @@ x86_decode_twobyte(
          * 0xbd: bsr / lzcnt
          * They're being dealt with in the execution phase (if at all).
          */
+
+    case 0xc4: /* pinsrw */
+        ctxt->opcode |= MASK_INSR(vex.pfx, X86EMUL_OPC_PFX_MASK);
+        /* fall through */
+    case X86EMUL_OPC_VEX_66(0, 0xc4): /* vpinsrw */
+        state->desc = DstReg | SrcMem16 | ModRM;
+        break;
     }
 
+    /*
+     * Register forms of VEX-encoded scalar TwoOp instructions have
+     * three operands.
+     */
+    if ( state->simd_size && ea.type != OP_MEM &&
+         vex.opcx && (vex.pfx & VEX_PREFIX_SCALAR_MASK) )
+        state->desc &= ~TwoOp;
+
  done:
     return rc;
 }
@@ -2270,12 +2358,13 @@ x86_decode(
     {
         /* Two-byte opcode. */
         b = insn_fetch_type(uint8_t);
-        d = twobyte_table[b];
+        d = twobyte_table[b].desc;
         switch ( b )
         {
         default:
             opcode = b | MASK_INSR(0x0f, X86EMUL_OPC_EXT_MASK);
             ext = ext_0f;
+            state->simd_size = twobyte_table[b].size;
             break;
         case 0x38:
             b = insn_fetch_type(uint8_t);
@@ -2381,15 +2470,16 @@ x86_decode(
                     {
                     case vex_0f:
                         opcode |= MASK_INSR(0x0f, X86EMUL_OPC_EXT_MASK);
-                        d = twobyte_table[b];
+                        d = twobyte_table[b].desc;
+                        state->simd_size = twobyte_table[b].size;
                         break;
                     case vex_0f38:
                         opcode |= MASK_INSR(0x0f38, X86EMUL_OPC_EXT_MASK);
-                        d = twobyte_table[0x38];
+                        d = twobyte_table[0x38].desc;
                         break;
                     case vex_0f3a:
                         opcode |= MASK_INSR(0x0f3a, X86EMUL_OPC_EXT_MASK);
-                        d = twobyte_table[0x3a];
+                        d = twobyte_table[0x3a].desc;
                         break;
                     default:
                         rc = X86EMUL_UNHANDLEABLE;
@@ -2639,13 +2729,53 @@ x86_decode(
         ea.mem.off = truncate_ea(ea.mem.off);
     }
 
-    /*
-     * When prefix 66 has a meaning different from operand-size override,
-     * operand size defaults to 4 and can't be overridden to 2.
-     */
-    if ( op_bytes == 2 &&
-         (ctxt->opcode & X86EMUL_OPC_PFX_MASK) == X86EMUL_OPC_66(0, 0) )
-        op_bytes = 4;
+    switch ( state->simd_size )
+    {
+    case simd_none:
+        /*
+         * When prefix 66 has a meaning different from operand-size override,
+         * operand size defaults to 4 and can't be overridden to 2.
+         */
+        if ( op_bytes == 2 &&
+             (ctxt->opcode & X86EMUL_OPC_PFX_MASK) == X86EMUL_OPC_66(0, 0) )
+            op_bytes = 4;
+        break;
+
+    case simd_packed_int:
+        switch ( vex.pfx )
+        {
+        case vex_none: op_bytes = 8;           break;
+        case vex_66:   op_bytes = 16 << vex.l; break;
+        default:       op_bytes = 0;           break;
+        }
+        break;
+
+    case simd_single_fp:
+        if ( vex.pfx & VEX_PREFIX_DOUBLE_MASK )
+        {
+            op_bytes = 0;
+            break;
+    case simd_packed_fp:
+            if ( vex.pfx & VEX_PREFIX_SCALAR_MASK )
+            {
+                op_bytes = 0;
+                break;
+            }
+        }
+        /* fall through */
+    case simd_any_fp:
+        switch ( vex.pfx )
+        {
+        default:     op_bytes = 16 << vex.l; break;
+        case vex_f3: op_bytes = 4;           break;
+        case vex_f2: op_bytes = 8;           break;
+        }
+        break;
+
+    default:
+        op_bytes = 0;
+        break;
+    }
 
  done:
     return rc;
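
(Aside, not part of the patch: the operand-size rules encoded by the
switch above can be read as a standalone helper; the helper and its
signature below are invented for this sketch, while the vex_none /
vex_66 / vex_f3 / vex_f2 values are the ones used in the hunk.)

    /* Sketch of the simd_opsize -> operand size mapping. */
    static unsigned int simd_op_bytes(enum simd_opsize size,
                                      enum vex_pfx pfx, bool vex_l)
    {
        switch ( size )
        {
        case simd_packed_int:  /* MMX: 8 bytes; 66-prefixed SSE/AVX: 16/32 */
            return pfx == vex_none ? 8
                   : pfx == vex_66 ? 16 << vex_l : 0;
        case simd_packed_fp:   /* no scalar (F3/F2) forms */
            return pfx == vex_none || pfx == vex_66 ? 16 << vex_l : 0;
        case simd_single_fp:   /* no double precision (66/F2) forms */
            return pfx == vex_none ? 16 << vex_l
                   : pfx == vex_f3 ? 4 : 0;
        case simd_any_fp:      /* packed or scalar, single or double */
            return pfx == vex_f3 ? 4
                   : pfx == vex_f2 ? 8 : 16 << vex_l;
        default:               /* simd_none / simd_other: decided elsewhere */
            return 0;
        }
    }
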
@@ -2669,8 +2799,10 @@ x86_emulate(
     int rc;
     uint8_t b, d;
     bool singlestep = (_regs._eflags & EFLG_TF) && !is_branch_step(ctxt, ops);
+    bool sfence = false;
     struct operand src = { .reg = PTR_POISON };
     struct operand dst = { .reg = PTR_POISON };
+    unsigned long cr4;
     enum x86_swint_type swint_type;
     struct fpu_insn_ctxt fic;
     struct x86_emulate_stub stub = {};
@@ -2737,6 +2869,8 @@ x86_emulate(
         ea.bytes = 2;
         goto srcmem_common;
     case SrcMem:
+        if ( state->simd_size )
+            break;
         ea.bytes = (d & ByteOp) ? 1 : op_bytes;
     srcmem_common:
         src = ea;
@@ -2837,6 +2971,11 @@ x86_emulate(
         d = (d & ~DstMask) | DstMem;
         /* Becomes a normal DstMem operation from here on. */
     case DstMem:
+        if ( state->simd_size )
+        {
+            generate_exception_if(lock_prefix, EXC_UD);
+            break;
+        }
         ea.bytes = (d & ByteOp) ? 1 : op_bytes;
         dst = ea;
         if ( dst.type == OP_REG )
@@ -2871,7 +3010,6 @@ x86_emulate(
     {
         enum x86_segment seg;
         struct segment_register cs, sreg;
-        unsigned long cr4;
         struct cpuid_leaf cpuid_leaf;
 
     case 0x00 ... 0x05: add: /* add */
@@ -5066,116 +5204,109 @@ x86_emulate(
     case X86EMUL_OPC(0x0f, 0x19) ... X86EMUL_OPC(0x0f, 0x1f): /* nop */
         break;
 
-    case X86EMUL_OPC(0x0f, 0x2b):        /* movntps xmm,m128 */
-    case X86EMUL_OPC_VEX(0x0f, 0x2b):    /* vmovntps xmm,m128 */
-                                         /* vmovntps ymm,m256 */
-    case X86EMUL_OPC_66(0x0f, 0x2b):     /* movntpd xmm,m128 */
-    case X86EMUL_OPC_VEX_66(0x0f, 0x2b): /* vmovntpd xmm,m128 */
-                                         /* vmovntpd ymm,m256 */
-        fail_if(ea.type != OP_MEM);
-        /* fall through */
-    case X86EMUL_OPC(0x0f, 0x28):        /* movaps xmm/m128,xmm */
-    case X86EMUL_OPC_VEX(0x0f, 0x28):    /* vmovaps xmm/m128,xmm */
-                                         /* vmovaps ymm/m256,ymm */
-    case X86EMUL_OPC_66(0x0f, 0x28):     /* movapd xmm/m128,xmm */
-    case X86EMUL_OPC_VEX_66(0x0f, 0x28): /* vmovapd xmm/m128,xmm */
-                                         /* vmovapd ymm/m256,ymm */
-    case X86EMUL_OPC(0x0f, 0x29):        /* movaps xmm,xmm/m128 */
-    case X86EMUL_OPC_VEX(0x0f, 0x29):    /* vmovaps xmm,xmm/m128 */
-                                         /* vmovaps ymm,ymm/m256 */
-    case X86EMUL_OPC_66(0x0f, 0x29):     /* movapd xmm,xmm/m128 */
-    case X86EMUL_OPC_VEX_66(0x0f, 0x29): /* vmovapd xmm,xmm/m128 */
-                                         /* vmovapd ymm,ymm/m256 */
-    case X86EMUL_OPC(0x0f, 0x10):        /* movups xmm/m128,xmm */
-    case X86EMUL_OPC_VEX(0x0f, 0x10):    /* vmovups xmm/m128,xmm */
-                                         /* vmovups ymm/m256,ymm */
-    case X86EMUL_OPC_66(0x0f, 0x10):     /* movupd xmm/m128,xmm */
-    case X86EMUL_OPC_VEX_66(0x0f, 0x10): /* vmovupd xmm/m128,xmm */
-                                         /* vmovupd ymm/m256,ymm */
-    case X86EMUL_OPC_F3(0x0f, 0x10):     /* movss xmm/m32,xmm */
-    case X86EMUL_OPC_VEX_F3(0x0f, 0x10): /* vmovss xmm/m32,xmm */
-    case X86EMUL_OPC_F2(0x0f, 0x10):     /* movsd xmm/m64,xmm */
-    case X86EMUL_OPC_VEX_F2(0x0f, 0x10): /* vmovsd xmm/m64,xmm */
-    case X86EMUL_OPC(0x0f, 0x11):        /* movups xmm,xmm/m128 */
-    case X86EMUL_OPC_VEX(0x0f, 0x11):    /* vmovups xmm,xmm/m128 */
-                                         /* vmovups ymm,ymm/m256 */
-    case X86EMUL_OPC_66(0x0f, 0x11):     /* movupd xmm,xmm/m128 */
-    case X86EMUL_OPC_VEX_66(0x0f, 0x11): /* vmovupd xmm,xmm/m128 */
-                                         /* vmovupd ymm,ymm/m256 */
-    case X86EMUL_OPC_F3(0x0f, 0x11):     /* movss xmm,xmm/m32 */
-    case X86EMUL_OPC_VEX_F3(0x0f, 0x11): /* vmovss xmm,xmm/m32 */
-    case X86EMUL_OPC_F2(0x0f, 0x11):     /* movsd xmm,xmm/m64 */
-    case X86EMUL_OPC_VEX_F2(0x0f, 0x11): /* vmovsd xmm,xmm/m64 */
-    {
-        uint8_t *buf = get_stub(stub);
+#define CASE_SIMD_PACKED_INT(pfx, opc)       \
+    case X86EMUL_OPC(pfx, opc):              \
+    case X86EMUL_OPC_66(pfx, opc)
+#define CASE_SIMD_SINGLE_FP(kind, pfx, opc)  \
+    case X86EMUL_OPC##kind(pfx, opc):        \
+    case X86EMUL_OPC##kind##_F3(pfx, opc)
+#define CASE_SIMD_DOUBLE_FP(kind, pfx, opc)  \
+    case X86EMUL_OPC##kind##_66(pfx, opc):   \
+    case X86EMUL_OPC##kind##_F2(pfx, opc)
+#define CASE_SIMD_ALL_FP(kind, pfx, opc)     \
+    CASE_SIMD_SINGLE_FP(kind, pfx, opc):     \
+    CASE_SIMD_DOUBLE_FP(kind, pfx, opc)
+#define CASE_SIMD_PACKED_FP(kind, pfx, opc)  \
+    case X86EMUL_OPC##kind(pfx, opc):        \
+    case X86EMUL_OPC##kind##_66(pfx, opc)
+#define CASE_SIMD_SCALAR_FP(kind, pfx, opc)  \
+    case X86EMUL_OPC##kind##_F3(pfx, opc):   \
+    case X86EMUL_OPC##kind##_F2(pfx, opc)
 
-        fic.insn_bytes = 5;
-        buf[0] = 0x3e;
-        buf[1] = 0x3e;
-        buf[2] = 0x0f;
-        buf[3] = b;
-        buf[4] = modrm;
-        buf[5] = 0xc3;
+    CASE_SIMD_SCALAR_FP(, 0x0f, 0x2b):     /* movnts{s,d} xmm,mem */
+        host_and_vcpu_must_have(sse4a);
+        /* fall through */
+    CASE_SIMD_PACKED_FP(, 0x0f, 0x2b):     /* movntp{s,d} xmm,m128 */
+    CASE_SIMD_PACKED_FP(_VEX, 0x0f, 0x2b): /* vmovntp{s,d} {x,y}mm,mem */
+        generate_exception_if(ea.type != OP_MEM, EXC_UD);
+        sfence = true;
+        /* fall through */
+    CASE_SIMD_ALL_FP(, 0x0f, 0x10):        /* mov{up,s}{s,d} xmm/mem,xmm */
+    CASE_SIMD_ALL_FP(, 0x0f, 0x11):        /* mov{up,s}{s,d} xmm,xmm/mem */
+    CASE_SIMD_PACKED_FP(_VEX, 0x0f, 0x10): /* vmovup{s,d} {x,y}mm/mem,{x,y}mm */
+    CASE_SIMD_SCALAR_FP(_VEX, 0x0f, 0x10): /* vmovs{s,d} mem,xmm */
+                                           /* vmovs{s,d} xmm,xmm,xmm */
+    CASE_SIMD_PACKED_FP(_VEX, 0x0f, 0x11): /* vmovup{s,d} {x,y}mm,{x,y}mm/mem */
+    CASE_SIMD_SCALAR_FP(_VEX, 0x0f, 0x11): /* vmovs{s,d} xmm,mem */
+                                           /* vmovs{s,d} xmm,xmm,xmm */
+    CASE_SIMD_PACKED_FP(, 0x0f, 0x14):     /* unpcklp{s,d} xmm/m128,xmm */
+    CASE_SIMD_PACKED_FP(_VEX, 0x0f, 0x14): /* vunpcklp{s,d} {x,y}mm/mem,{x,y}mm,{x,y}mm */
+    CASE_SIMD_PACKED_FP(, 0x0f, 0x15):     /* unpckhp{s,d} xmm/m128,xmm */
+    CASE_SIMD_PACKED_FP(_VEX, 0x0f, 0x15): /* vunpckhp{s,d} {x,y}mm/mem,{x,y}mm,{x,y}mm */
+    CASE_SIMD_PACKED_FP(, 0x0f, 0x28):     /* movap{s,d} xmm/m128,xmm */
+    CASE_SIMD_PACKED_FP(_VEX, 0x0f, 0x28): /* vmovap{s,d} {x,y}mm/mem,{x,y}mm */
+    CASE_SIMD_PACKED_FP(, 0x0f, 0x29):     /* movap{s,d} xmm,xmm/m128 */
+    CASE_SIMD_PACKED_FP(_VEX, 0x0f, 0x29): /* vmovap{s,d} {x,y}mm,{x,y}mm/mem */
+    CASE_SIMD_ALL_FP(, 0x0f, 0x51):        /* sqrt{p,s}{s,d} xmm/mem,xmm */
+    CASE_SIMD_ALL_FP(_VEX, 0x0f, 0x51):    /* vsqrtp{s,d} {x,y}mm/mem,{x,y}mm */
+                                           /* vsqrts{s,d} xmm,xmm,xmm */
+    CASE_SIMD_SINGLE_FP(, 0x0f, 0x52):     /* rsqrt{p,s}s xmm/mem,xmm */
+    CASE_SIMD_SINGLE_FP(_VEX, 0x0f, 0x52): /* vrsqrtps {x,y}mm/mem,{x,y}mm */
+                                           /* vrsqrtss xmm,xmm,xmm */
+    CASE_SIMD_SINGLE_FP(, 0x0f, 0x53):     /* rcp{p,s}s xmm/mem,xmm */
+    CASE_SIMD_SINGLE_FP(_VEX, 0x0f, 0x53): /* vrcpps {x,y}mm/mem,{x,y}mm */
+                                           /* vrcpss xmm,xmm,xmm */
+    CASE_SIMD_PACKED_FP(, 0x0f, 0x54):     /* andp{s,d} xmm/m128,xmm */
+    CASE_SIMD_PACKED_FP(_VEX, 0x0f, 0x54): /* vandp{s,d} {x,y}mm/mem,{x,y}mm,{x,y}mm */
+    CASE_SIMD_PACKED_FP(, 0x0f, 0x55):     /* andnp{s,d} xmm/m128,xmm */
+    CASE_SIMD_PACKED_FP(_VEX, 0x0f, 0x55): /* vandnp{s,d} {x,y}mm/mem,{x,y}mm,{x,y}mm */
+    CASE_SIMD_PACKED_FP(, 0x0f, 0x56):     /* orp{s,d} xmm/m128,xmm */
+    CASE_SIMD_PACKED_FP(_VEX, 0x0f, 0x56): /* vorp{s,d} {x,y}mm/mem,{x,y}mm,{x,y}mm */
+    CASE_SIMD_PACKED_FP(, 0x0f, 0x57):     /* xorp{s,d} xmm/m128,xmm */
+    CASE_SIMD_PACKED_FP(_VEX, 0x0f, 0x57): /* vxorp{s,d} {x,y}mm/mem,{x,y}mm,{x,y}mm */
+    CASE_SIMD_ALL_FP(, 0x0f, 0x58):        /* add{p,s}{s,d} xmm/mem,xmm */
+    CASE_SIMD_ALL_FP(_VEX, 0x0f, 0x58):    /* vadd{p,s}{s,d} {x,y}mm/mem,{x,y}mm,{x,y}mm */
+    CASE_SIMD_ALL_FP(, 0x0f, 0x59):        /* mul{p,s}{s,d} xmm/mem,xmm */
+    CASE_SIMD_ALL_FP(_VEX, 0x0f, 0x59):    /* vmul{p,s}{s,d} {x,y}mm/mem,{x,y}mm,{x,y}mm */
+    CASE_SIMD_ALL_FP(, 0x0f, 0x5c):        /* sub{p,s}{s,d} xmm/mem,xmm */
+    CASE_SIMD_ALL_FP(_VEX, 0x0f, 0x5c):    /* vsub{p,s}{s,d} {x,y}mm/mem,{x,y}mm,{x,y}mm */
+    CASE_SIMD_ALL_FP(, 0x0f, 0x5d):        /* min{p,s}{s,d} xmm/mem,xmm */
+    CASE_SIMD_ALL_FP(_VEX, 0x0f, 0x5d):    /* vmin{p,s}{s,d} {x,y}mm/mem,{x,y}mm,{x,y}mm */
+    CASE_SIMD_ALL_FP(, 0x0f, 0x5e):        /* div{p,s}{s,d} xmm/mem,xmm */
+    CASE_SIMD_ALL_FP(_VEX, 0x0f, 0x5e):    /* vdiv{p,s}{s,d} {x,y}mm/mem,{x,y}mm,{x,y}mm */
+    CASE_SIMD_ALL_FP(, 0x0f, 0x5f):        /* max{p,s}{s,d} xmm/mem,xmm */
+    CASE_SIMD_ALL_FP(_VEX, 0x0f, 0x5f):    /* vmax{p,s}{s,d} {x,y}mm/mem,{x,y}mm,{x,y}mm */
         if ( vex.opcx == vex_none )
         {
             if ( vex.pfx & VEX_PREFIX_DOUBLE_MASK )
                 vcpu_must_have(sse2);
             else
                 vcpu_must_have(sse);
-            ea.bytes = 16;
-            SET_SSE_PREFIX(buf[0], vex.pfx);
             get_fpu(X86EMUL_FPU_xmm, &fic);
         }
         else
         {
-            fail_if((vex.reg != 0xf) &&
-                    ((ea.type == OP_MEM) ||
-                     !(vex.pfx & VEX_PREFIX_SCALAR_MASK)));
             host_and_vcpu_must_have(avx);
+            fail_if((vex.pfx & VEX_PREFIX_SCALAR_MASK) && vex.l);
             get_fpu(X86EMUL_FPU_ymm, &fic);
-            ea.bytes = 16 << vex.l;
         }
-        if ( vex.pfx & VEX_PREFIX_SCALAR_MASK )
-            ea.bytes = vex.pfx & VEX_PREFIX_DOUBLE_MASK ? 8 : 4;
+    simd_0f_common:
+    {
+        uint8_t *buf = get_stub(stub);
+
+        buf[0] = 0x3e;
+        buf[1] = 0x3e;
+        buf[2] = 0x0f;
+        buf[3] = b;
+        buf[4] = modrm;
         if ( ea.type == OP_MEM )
         {
-            uint32_t mxcsr = 0;
-
-            if ( b < 0x28 )
-                mxcsr = MXCSR_MM;
-            else if ( vcpu_has_misalignsse() )
-                asm ( "stmxcsr %0" : "=m" (mxcsr) );
-            generate_exception_if(!(mxcsr & MXCSR_MM) &&
-                                  !is_aligned(ea.mem.seg, ea.mem.off, ea.bytes,
-                                              ctxt, ops),
-                                  EXC_GP, 0);
-            if ( !(b & 1) )
-                rc = ops->read(ea.mem.seg, ea.mem.off+0, mmvalp,
-                               ea.bytes, ctxt);
-            else
-                fail_if(!ops->write); /* Check before running the stub. */
             /* convert memory operand to (%rAX) */
             rex_prefix &= ~REX_B;
             vex.b = 1;
             buf[4] &= 0x38;
         }
-        if ( !rc )
-        {
-           copy_REX_VEX(buf, rex_prefix, vex);
-           asm volatile ( "call *%0" : : "r" (stub.func), "a" (mmvalp)
-                                     : "memory" );
-        }
-        put_fpu(&fic);
-        put_stub(stub);
-        if ( !rc && (b & 1) && (ea.type == OP_MEM) )
-        {
-            ASSERT(ops->write); /* See the fail_if() above. */
-            rc = ops->write(ea.mem.seg, ea.mem.off, mmvalp,
-                            ea.bytes, ctxt);
-        }
-        if ( rc )
-            goto done;
-        dst.type = OP_NONE;
+        fic.insn_bytes = 5;
         break;
     }
 
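
(Aside, not part of the patch: as an example of how the new case label
macros read, a use like CASE_SIMD_ALL_FP(_VEX, 0x0f, 0x58) expands to
the four VADD* labels below, with the final colon supplied at the use
site.)

    case X86EMUL_OPC_VEX(0x0f, 0x58):    /* vaddps */
    case X86EMUL_OPC_VEX_F3(0x0f, 0x58): /* vaddss */
    case X86EMUL_OPC_VEX_66(0x0f, 0x58): /* vaddpd */
    case X86EMUL_OPC_VEX_F2(0x0f, 0x58)  /* vaddsd */
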
@@ -5338,6 +5469,125 @@ x86_emulate(
         break;
     }
 
+    CASE_SIMD_PACKED_INT(0x0f, 0x60):    /* punpcklbw {,x}mm/mem,{,x}mm */
+    case X86EMUL_OPC_VEX_66(0x0f, 0x60): /* vpunpcklbw {x,y}mm/mem,{x,y}mm,{x,y}mm */
+    CASE_SIMD_PACKED_INT(0x0f, 0x61):    /* punpcklwd {,x}mm/mem,{,x}mm */
+    case X86EMUL_OPC_VEX_66(0x0f, 0x61): /* vpunpcklwd {x,y}mm/mem,{x,y}mm,{x,y}mm */
+    CASE_SIMD_PACKED_INT(0x0f, 0x62):    /* punpckldq {,x}mm/mem,{,x}mm */
+    case X86EMUL_OPC_VEX_66(0x0f, 0x62): /* vpunpckldq {x,y}mm/mem,{x,y}mm,{x,y}mm */
+    CASE_SIMD_PACKED_INT(0x0f, 0x68):    /* punpckhbw {,x}mm/mem,{,x}mm */
+    case X86EMUL_OPC_VEX_66(0x0f, 0x68): /* vpunpckhbw {x,y}mm/mem,{x,y}mm,{x,y}mm */
+    CASE_SIMD_PACKED_INT(0x0f, 0x69):    /* punpckhwd {,x}mm/mem,{,x}mm */
+    case X86EMUL_OPC_VEX_66(0x0f, 0x69): /* vpunpckhwd {x,y}mm/mem,{x,y}mm,{x,y}mm */
+    CASE_SIMD_PACKED_INT(0x0f, 0x6a):    /* punpckhdq {,x}mm/mem,{,x}mm */
+    case X86EMUL_OPC_VEX_66(0x0f, 0x6a): /* vpunpckhdq {x,y}mm/mem,{x,y}mm,{x,y}mm */
+        op_bytes = vex.pfx ? 16 << vex.l : 4;
+        /* fall through */
+    CASE_SIMD_PACKED_INT(0x0f, 0x63):    /* packsswb {,x}mm/mem,{,x}mm */
+    case X86EMUL_OPC_VEX_66(0x0f, 0x63): /* vpacksswb {x,y}mm/mem,{x,y}mm,{x,y}mm */
+    CASE_SIMD_PACKED_INT(0x0f, 0x64):    /* pcmpgtb {,x}mm/mem,{,x}mm */
+    case X86EMUL_OPC_VEX_66(0x0f, 0x64): /* vpcmpgtb {x,y}mm/mem,{x,y}mm,{x,y}mm */
+    CASE_SIMD_PACKED_INT(0x0f, 0x65):    /* pcmpgtw {,x}mm/mem,{,x}mm */
+    case X86EMUL_OPC_VEX_66(0x0f, 0x65): /* vpcmpgtw {x,y}mm/mem,{x,y}mm,{x,y}mm */
+    CASE_SIMD_PACKED_INT(0x0f, 0x66):    /* pcmpgtd {,x}mm/mem,{,x}mm */
+    case X86EMUL_OPC_VEX_66(0x0f, 0x66): /* vpcmpgtd {x,y}mm/mem,{x,y}mm,{x,y}mm */
+    CASE_SIMD_PACKED_INT(0x0f, 0x67):    /* packuswb {,x}mm/mem,{,x}mm */
+    case X86EMUL_OPC_VEX_66(0x0f, 0x67): /* vpackuswb {x,y}mm/mem,{x,y}mm,{x,y}mm */
+    CASE_SIMD_PACKED_INT(0x0f, 0x6b):    /* packssdw {,x}mm/mem,{,x}mm */
+    case X86EMUL_OPC_VEX_66(0x0f, 0x6b): /* vpackssdw {x,y}mm/mem,{x,y}mm,{x,y}mm */
+    case X86EMUL_OPC_66(0x0f, 0x6c):     /* punpcklqdq xmm/m128,xmm */
+    case X86EMUL_OPC_VEX_66(0x0f, 0x6c): /* vpunpcklqdq {x,y}mm/mem,{x,y}mm,{x,y}mm */
+    case X86EMUL_OPC_66(0x0f, 0x6d):     /* punpckhqdq xmm/m128,xmm */
+    case X86EMUL_OPC_VEX_66(0x0f, 0x6d): /* vpunpckhqdq {x,y}mm/mem,{x,y}mm,{x,y}mm */
+    CASE_SIMD_PACKED_INT(0x0f, 0x74):    /* pcmpeqb {,x}mm/mem,{,x}mm */
+    case X86EMUL_OPC_VEX_66(0x0f, 0x74): /* vpcmpeqb {x,y}mm/mem,{x,y}mm,{x,y}mm */
+    CASE_SIMD_PACKED_INT(0x0f, 0x75):    /* pcmpeqw {,x}mm/mem,{,x}mm */
+    case X86EMUL_OPC_VEX_66(0x0f, 0x75): /* vpcmpeqw {x,y}mm/mem,{x,y}mm,{x,y}mm */
+    CASE_SIMD_PACKED_INT(0x0f, 0x76):    /* pcmpeqd {,x}mm/mem,{,x}mm */
+    case X86EMUL_OPC_VEX_66(0x0f, 0x76): /* vpcmpeqd {x,y}mm/mem,{x,y}mm,{x,y}mm */
+    case X86EMUL_OPC_66(0x0f, 0xd4):     /* paddq xmm/m128,xmm */
+    case X86EMUL_OPC_VEX_66(0x0f, 0xd4): /* vpaddq {x,y}mm/mem,{x,y}mm,{x,y}mm */
+    CASE_SIMD_PACKED_INT(0x0f, 0xd5):    /* pmullw {,x}mm/mem,{,x}mm */
+    case X86EMUL_OPC_VEX_66(0x0f, 0xd5): /* vpmullw {x,y}mm/mem,{x,y}mm,{x,y}mm */
+    CASE_SIMD_PACKED_INT(0x0f, 0xd8):    /* psubusb {,x}mm/mem,{,x}mm */
+    case X86EMUL_OPC_VEX_66(0x0f, 0xd8): /* vpsubusb {x,y}mm/mem,{x,y}mm,{x,y}mm */
+    CASE_SIMD_PACKED_INT(0x0f, 0xd9):    /* psubusw {,x}mm/mem,{,x}mm */
+    case X86EMUL_OPC_VEX_66(0x0f, 0xd9): /* vpsubusw {x,y}mm/mem,{x,y}mm,{x,y}mm */
+    case X86EMUL_OPC_66(0x0f, 0xda):     /* pminub xmm/m128,xmm */
+    case X86EMUL_OPC_VEX_66(0x0f, 0xda): /* vpminub {x,y}mm/mem,{x,y}mm,{x,y}mm */
+    CASE_SIMD_PACKED_INT(0x0f, 0xdb):    /* pand {,x}mm/mem,{,x}mm */
+    case X86EMUL_OPC_VEX_66(0x0f, 0xdb): /* vpand {x,y}mm/mem,{x,y}mm,{x,y}mm */
+    CASE_SIMD_PACKED_INT(0x0f, 0xdc):    /* paddusb {,x}mm/mem,{,x}mm */
+    case X86EMUL_OPC_VEX_66(0x0f, 0xdc): /* vpaddusb {x,y}mm/mem,{x,y}mm,{x,y}mm */
+    CASE_SIMD_PACKED_INT(0x0f, 0xdd):    /* paddusw {,x}mm/mem,{,x}mm */
+    case X86EMUL_OPC_VEX_66(0x0f, 0xdd): /* vpaddusw {x,y}mm/mem,{x,y}mm,{x,y}mm */
+    case X86EMUL_OPC_66(0x0f, 0xde):     /* pmaxub xmm/m128,xmm */
+    case X86EMUL_OPC_VEX_66(0x0f, 0xde): /* vpmaxub {x,y}mm/mem,{x,y}mm,{x,y}mm */
+    CASE_SIMD_PACKED_INT(0x0f, 0xdf):    /* pandn {,x}mm/mem,{,x}mm */
+    case X86EMUL_OPC_VEX_66(0x0f, 0xdf): /* vpandn {x,y}mm/mem,{x,y}mm,{x,y}mm */
+    case X86EMUL_OPC_66(0x0f, 0xe0):     /* pavgb xmm/m128,xmm */
+    case X86EMUL_OPC_VEX_66(0x0f, 0xe0): /* vpavgb {x,y}mm/mem,{x,y}mm,{x,y}mm */
+    case X86EMUL_OPC_66(0x0f, 0xe3):     /* pavgw xmm/m128,xmm */
+    case X86EMUL_OPC_VEX_66(0x0f, 0xe3): /* vpavgw {x,y}mm/mem,{x,y}mm,{x,y}mm */
+    case X86EMUL_OPC_66(0x0f, 0xe4):     /* pmulhuw xmm/m128,xmm */
+    case X86EMUL_OPC_VEX_66(0x0f, 0xe4): /* vpmulhuw {x,y}mm/mem,{x,y}mm,{x,y}mm */
+    CASE_SIMD_PACKED_INT(0x0f, 0xe5):    /* pmulhw {,x}mm/mem,{,x}mm */
+    case X86EMUL_OPC_VEX_66(0x0f, 0xe5): /* vpmulhw {x,y}mm/mem,{x,y}mm,{x,y}mm */
+    CASE_SIMD_PACKED_INT(0x0f, 0xe8):    /* psubsb {,x}mm/mem,{,x}mm */
+    case X86EMUL_OPC_VEX_66(0x0f, 0xe8): /* vpsubsb {x,y}mm/mem,{x,y}mm,{x,y}mm */
+    CASE_SIMD_PACKED_INT(0x0f, 0xe9):    /* psubsw {,x}mm/mem,{,x}mm */
+    case X86EMUL_OPC_VEX_66(0x0f, 0xe9): /* vpsubsw {x,y}mm/mem,{x,y}mm,{x,y}mm */
+    case X86EMUL_OPC_66(0x0f, 0xea):     /* pminsw xmm/m128,xmm */
+    case X86EMUL_OPC_VEX_66(0x0f, 0xea): /* vpminsw {x,y}mm/mem,{x,y}mm,{x,y}mm */
+    CASE_SIMD_PACKED_INT(0x0f, 0xeb):    /* por {,x}mm/mem,{,x}mm */
+    case X86EMUL_OPC_VEX_66(0x0f, 0xeb): /* vpor {x,y}mm/mem,{x,y}mm,{x,y}mm */
+    CASE_SIMD_PACKED_INT(0x0f, 0xec):    /* paddsb {,x}mm/mem,{,x}mm */
+    case X86EMUL_OPC_VEX_66(0x0f, 0xec): /* vpaddsb {x,y}mm/mem,{x,y}mm,{x,y}mm */
+    CASE_SIMD_PACKED_INT(0x0f, 0xed):    /* paddsw {,x}mm/mem,{,x}mm */
+    case X86EMUL_OPC_VEX_66(0x0f, 0xed): /* vpaddsw {x,y}mm/mem,{x,y}mm,{x,y}mm */
+    case X86EMUL_OPC_66(0x0f, 0xee):     /* pmaxsw xmm/m128,xmm */
+    case X86EMUL_OPC_VEX_66(0x0f, 0xee): /* vpmaxsw {x,y}mm/mem,{x,y}mm,{x,y}mm */
+    CASE_SIMD_PACKED_INT(0x0f, 0xef):    /* pxor {,x}mm/mem,{,x}mm */
+    case X86EMUL_OPC_VEX_66(0x0f, 0xef): /* vpxor {x,y}mm/mem,{x,y}mm,{x,y}mm */
+    case X86EMUL_OPC_66(0x0f, 0xf4):     /* pmuludq xmm/m128,xmm */
+    case X86EMUL_OPC_VEX_66(0x0f, 0xf4): /* vpmuludq {x,y}mm/mem,{x,y}mm,{x,y}mm */
+    case X86EMUL_OPC_66(0x0f, 0xf6):     /* psadbw xmm/m128,xmm */
+    case X86EMUL_OPC_VEX_66(0x0f, 0xf6): /* vpsadbw {x,y}mm/mem,{x,y}mm,{x,y}mm */
+    CASE_SIMD_PACKED_INT(0x0f, 0xf8):    /* psubb {,x}mm/mem,{,x}mm */
+    case X86EMUL_OPC_VEX_66(0x0f, 0xf8): /* vpsubb {x,y}mm/mem,{x,y}mm,{x,y}mm */
+    CASE_SIMD_PACKED_INT(0x0f, 0xf9):    /* psubw {,x}mm/mem,{,x}mm */
+    case X86EMUL_OPC_VEX_66(0x0f, 0xf9): /* vpsubw {x,y}mm/mem,{x,y}mm,{x,y}mm */
+    CASE_SIMD_PACKED_INT(0x0f, 0xfa):    /* psubd {,x}mm/mem,{,x}mm */
+    case X86EMUL_OPC_VEX_66(0x0f, 0xfa): /* vpsubd {x,y}mm/mem,{x,y}mm,{x,y}mm */
+    case X86EMUL_OPC_66(0x0f, 0xfb):     /* psubq xmm/m128,xmm */
+    case X86EMUL_OPC_VEX_66(0x0f, 0xfb): /* vpsubq {x,y}mm/mem,{x,y}mm,{x,y}mm */
+    CASE_SIMD_PACKED_INT(0x0f, 0xfc):    /* paddb {,x}mm/mem,{,x}mm */
+    case X86EMUL_OPC_VEX_66(0x0f, 0xfc): /* vpaddb {x,y}mm/mem,{x,y}mm,{x,y}mm */
+    CASE_SIMD_PACKED_INT(0x0f, 0xfd):    /* paddw {,x}mm/mem,{,x}mm */
+    case X86EMUL_OPC_VEX_66(0x0f, 0xfd): /* vpaddw {x,y}mm/mem,{x,y}mm,{x,y}mm */
+    CASE_SIMD_PACKED_INT(0x0f, 0xfe):    /* paddd {,x}mm/mem,{,x}mm */
+    case X86EMUL_OPC_VEX_66(0x0f, 0xfe): /* vpaddd {x,y}mm/mem,{x,y}mm,{x,y}mm */
+    simd_0f_int:
+        if ( vex.opcx != vex_none )
+        {
+            if ( vex.l )
+                host_and_vcpu_must_have(avx2);
+            else
+                host_and_vcpu_must_have(avx);
+            get_fpu(X86EMUL_FPU_ymm, &fic);
+        }
+        else if ( vex.pfx )
+        {
+            vcpu_must_have(sse2);
+            get_fpu(X86EMUL_FPU_xmm, &fic);
+        }
+        else
+        {
+            host_and_vcpu_must_have(mmx);
+            get_fpu(X86EMUL_FPU_mmx, &fic);
+        }
+        goto simd_0f_common;
+
     case X86EMUL_OPC(0x0f, 0xe7):        /* movntq mm,m64 */
     case X86EMUL_OPC_66(0x0f, 0xe7):     /* movntdq xmm,m128 */
     case X86EMUL_OPC_VEX_66(0x0f, 0xe7): /* vmovntdq xmm,m128 */
@@ -5467,6 +5717,82 @@ x86_emulate(
         break;
     }
 
+    CASE_SIMD_PACKED_INT(0x0f, 0x70):    /* pshuf{w,d} $imm8,{,x}mm/mem,{,x}mm */
+    case X86EMUL_OPC_VEX_66(0x0f, 0x70): /* vpshufd $imm8,{x,y}mm/mem,{x,y}mm */
+    case X86EMUL_OPC_VEX_F3(0x0f, 0x70): /* vpshufhw $imm8,{x,y}mm/mem,{x,y}mm */
+    case X86EMUL_OPC_VEX_F2(0x0f, 0x70): /* vpshuflw $imm8,{x,y}mm/mem,{x,y}mm */
+        d = (d & ~SrcMask) | SrcMem | TwoOp;
+        op_bytes = vex.pfx ? 16 << vex.l : 8;
+    simd_0f_int_imm8:
+        if ( vex.opcx != vex_none )
+        {
+            if ( vex.l )
+                host_and_vcpu_must_have(avx2);
+            else
+                host_and_vcpu_must_have(avx);
+            get_fpu(X86EMUL_FPU_ymm, &fic);
+        }
+        else if ( vex.pfx )
+        {
+            vcpu_must_have(sse2);
+            get_fpu(X86EMUL_FPU_xmm, &fic);
+        }
+        else
+        {
+            host_and_vcpu_must_have(mmx);
+            vcpu_must_have(sse);
+            get_fpu(X86EMUL_FPU_mmx, &fic);
+        }
+    simd_0f_imm8:
+    {
+        uint8_t *buf = get_stub(stub);
+
+        buf[0] = 0x3e;
+        buf[1] = 0x3e;
+        buf[2] = 0x0f;
+        buf[3] = b;
+        buf[4] = modrm;
+        if ( ea.type == OP_MEM )
+        {
+            /* Convert memory operand to (%rAX). */
+            rex_prefix &= ~REX_B;
+            vex.b = 1;
+            buf[4] &= 0x38;
+        }
+        buf[5] = imm1;
+        fic.insn_bytes = 6;
+        break;
+    }
+
+    case X86EMUL_OPC_F2(0x0f, 0xf0):     /* lddqu m128,xmm */
+    case X86EMUL_OPC_VEX_F2(0x0f, 0xf0): /* vlddqu mem,{x,y}mm */
+        generate_exception_if(ea.type != OP_MEM, EXC_UD);
+        /* fall through */
+    case X86EMUL_OPC_66(0x0f, 0x7c):     /* haddpd xmm/m128,xmm */
+    case X86EMUL_OPC_F2(0x0f, 0x7c):     /* haddps xmm/m128,xmm */
+    case X86EMUL_OPC_VEX_66(0x0f, 0x7c): /* vhaddpd {x,y}mm/mem,{x,y}mm,{x,y}mm */
+    case X86EMUL_OPC_VEX_F2(0x0f, 0x7c): /* vhaddps {x,y}mm/mem,{x,y}mm,{x,y}mm */
+    case X86EMUL_OPC_66(0x0f, 0x7d):     /* hsubpd xmm/m128,xmm */
+    case X86EMUL_OPC_F2(0x0f, 0x7d):     /* hsubps xmm/m128,xmm */
+    case X86EMUL_OPC_VEX_66(0x0f, 0x7d): /* vhsubpd {x,y}mm/mem,{x,y}mm,{x,y}mm */
+    case X86EMUL_OPC_VEX_F2(0x0f, 0x7d): /* vhsubps {x,y}mm/mem,{x,y}mm,{x,y}mm */
+    case X86EMUL_OPC_66(0x0f, 0xd0):     /* addsubpd xmm/m128,xmm */
+    case X86EMUL_OPC_F2(0x0f, 0xd0):     /* addsubps xmm/m128,xmm */
+    case X86EMUL_OPC_VEX_66(0x0f, 0xd0): /* vaddsubpd {x,y}mm/mem,{x,y}mm,{x,y}mm */
+    case X86EMUL_OPC_VEX_F2(0x0f, 0xd0): /* vaddsubps {x,y}mm/mem,{x,y}mm,{x,y}mm */
+        op_bytes = 16 << vex.l;
+        if ( vex.opcx != vex_none )
+        {
+            host_and_vcpu_must_have(avx);
+            get_fpu(X86EMUL_FPU_ymm, &fic);
+        }
+        else
+        {
+            host_and_vcpu_must_have(sse3);
+            get_fpu(X86EMUL_FPU_xmm, &fic);
+        }
+        goto simd_0f_common;
+
     case X86EMUL_OPC(0x0f, 0x80) ... X86EMUL_OPC(0x0f, 0x8f): /* jcc (near) */
         if ( test_cc(b, _regs._eflags) )
             jmp_rel((int32_t)src.val);
@@ -5767,12 +6093,41 @@ x86_emulate(
         }
         goto add;
 
+    CASE_SIMD_ALL_FP(, 0x0f, 0xc2):        /* cmp{p,s}{s,d} $imm8,xmm/mem,xmm */
+    CASE_SIMD_ALL_FP(_VEX, 0x0f, 0xc2):    /* vcmp{p,s}{s,d} $imm8,{x,y}mm/mem,{x,y}mm */
+    CASE_SIMD_PACKED_FP(, 0x0f, 0xc6):     /* shufp{s,d} $imm8,xmm/mem,xmm */
+    CASE_SIMD_PACKED_FP(_VEX, 0x0f, 0xc6): /* vshufp{s,d} $imm8,{x,y}mm/mem,{x,y}mm */
+        d = (d & ~SrcMask) | SrcMem;
+        if ( vex.opcx == vex_none )
+        {
+            if ( vex.pfx & VEX_PREFIX_DOUBLE_MASK )
+                vcpu_must_have(sse2);
+            else
+                vcpu_must_have(sse);
+            get_fpu(X86EMUL_FPU_xmm, &fic);
+        }
+        else
+        {
+            host_and_vcpu_must_have(avx);
+            fail_if((vex.pfx & VEX_PREFIX_SCALAR_MASK) && vex.l);
+            get_fpu(X86EMUL_FPU_ymm, &fic);
+        }
+        goto simd_0f_imm8;
+
     case X86EMUL_OPC(0x0f, 0xc3): /* movnti */
         /* Ignore the non-temporal hint for now. */
         vcpu_must_have(sse2);
         dst.val = src.val;
+        sfence = true;
         break;
 
+    CASE_SIMD_PACKED_INT(0x0f, 0xc4):      /* pinsrw $imm8,r32/m16,{,x}mm */
+    case X86EMUL_OPC_VEX_66(0x0f, 0xc4):   /* vpinsrw $imm8,r32/m16,xmm,xmm */
+        generate_exception_if(vex.l, EXC_UD);
+        memcpy(mmvalp, &src.val, 2);
+        ea.type = OP_MEM;
+        goto simd_0f_int_imm8;
+
     case X86EMUL_OPC(0x0f, 0xc7): /* Grp9 */
     {
         union {
@@ -5953,6 +6308,46 @@ x86_emulate(
         }
         break;
 
+    CASE_SIMD_PACKED_INT(0x0f, 0xd1):    /* psrlw {,x}mm/mem,{,x}mm */
+    case X86EMUL_OPC_VEX_66(0x0f, 0xd1): /* vpsrlw xmm/m128,{x,y}mm,{x,y}mm */
+    CASE_SIMD_PACKED_INT(0x0f, 0xd2):    /* psrld {,x}mm/mem,{,x}mm */
+    case X86EMUL_OPC_VEX_66(0x0f, 0xd2): /* vpsrld xmm/m128,{x,y}mm,{x,y}mm */
+    CASE_SIMD_PACKED_INT(0x0f, 0xd3):    /* psrlq {,x}mm/mem,{,x}mm */
+    case X86EMUL_OPC_VEX_66(0x0f, 0xd3): /* vpsrlq xmm/m128,{x,y}mm,{x,y}mm */
+    CASE_SIMD_PACKED_INT(0x0f, 0xe1):    /* psraw {,x}mm/mem,{,x}mm */
+    case X86EMUL_OPC_VEX_66(0x0f, 0xe1): /* vpsraw xmm/m128,{x,y}mm,{x,y}mm */
+    CASE_SIMD_PACKED_INT(0x0f, 0xe2):    /* psrad {,x}mm/mem,{,x}mm */
+    case X86EMUL_OPC_VEX_66(0x0f, 0xe2): /* vpsrad xmm/m128,{x,y}mm,{x,y}mm */
+    CASE_SIMD_PACKED_INT(0x0f, 0xf1):    /* psllw {,x}mm/mem,{,x}mm */
+    case X86EMUL_OPC_VEX_66(0x0f, 0xf1): /* vpsllw xmm/m128,{x,y}mm,{x,y}mm */
+    CASE_SIMD_PACKED_INT(0x0f, 0xf2):    /* pslld {,x}mm/mem,{,x}mm */
+    case X86EMUL_OPC_VEX_66(0x0f, 0xf2): /* vpslld xmm/m128,{x,y}mm,{x,y}mm */
+    CASE_SIMD_PACKED_INT(0x0f, 0xf3):    /* psllq {,x}mm/mem,{,x}mm */
+    case X86EMUL_OPC_VEX_66(0x0f, 0xf3): /* vpsllq xmm/m128,{x,y}mm,{x,y}mm */
+        op_bytes = vex.pfx ? 16 : 8;
+        goto simd_0f_int;
+
+    case X86EMUL_OPC(0x0f, 0xd4):        /* paddq mm/m64,mm */
+    case X86EMUL_OPC(0x0f, 0xf4):        /* pmuludq mm/m64,mm */
+    case X86EMUL_OPC(0x0f, 0xfb):        /* psubq mm/m64,mm */
+        host_and_vcpu_must_have(mmx);
+        vcpu_must_have(sse2);
+        get_fpu(X86EMUL_FPU_mmx, &fic);
+        goto simd_0f_common;
+
+    case X86EMUL_OPC(0x0f, 0xda):        /* pminub mm/m64,mm */
+    case X86EMUL_OPC(0x0f, 0xde):        /* pmaxub mm/m64,mm */
+    case X86EMUL_OPC(0x0f, 0xea):        /* pminsw mm/m64,mm */
+    case X86EMUL_OPC(0x0f, 0xee):        /* pmaxsw mm/m64,mm */
+    case X86EMUL_OPC(0x0f, 0xe0):        /* pavgb mm/m64,mm */
+    case X86EMUL_OPC(0x0f, 0xe3):        /* pavgw mm/m64,mm */
+    case X86EMUL_OPC(0x0f, 0xe4):        /* pmulhuw mm/m64,mm */
+    case X86EMUL_OPC(0x0f, 0xf6):        /* psadbw mm/m64,mm */
+        host_and_vcpu_must_have(mmx);
+        vcpu_must_have(sse);
+        get_fpu(X86EMUL_FPU_mmx, &fic);
+        goto simd_0f_common;
+
     case X86EMUL_OPC(0x0f38, 0xf0): /* movbe m,r */
     case X86EMUL_OPC(0x0f38, 0xf1): /* movbe r,m */
         vcpu_must_have(movbe);
@@ -6214,6 +6609,75 @@ x86_emulate(
         goto cannot_emulate;
     }
 
+    if ( state->simd_size )
+    {
+#ifdef __XEN__
+        uint8_t *buf = stub.ptr;
+#else
+        uint8_t *buf = get_stub(stub);
+#endif
+
+        generate_exception_if(!op_bytes, EXC_UD);
+        generate_exception_if(vex.opcx && (d & TwoOp) && vex.reg != 0xf,
+                              EXC_UD);
+
+        if ( !buf )
+            BUG();
+        if ( vex.opcx == vex_none )
+            SET_SSE_PREFIX(buf[0], vex.pfx);
+
+        buf[fic.insn_bytes] = 0xc3;
+        copy_REX_VEX(buf, rex_prefix, vex);
+
+        if ( ea.type == OP_MEM )
+        {
+            uint32_t mxcsr = 0;
+
+            if ( op_bytes < 16 ||
+                 (vex.opcx
+                  ? /* vmov{a,nt}p{s,d} are exceptions. */
+                    ext != ext_0f || ((b | 1) != 0x29 && b != 0x2b)
+                  : /* movup{s,d} and lddqu are exceptions. */
+                    ext == ext_0f && ((b | 1) == 0x11 || b == 0xf0)) )
+                mxcsr = MXCSR_MM;
+            else if ( vcpu_has_misalignsse() )
+                asm ( "stmxcsr %0" : "=m" (mxcsr) );
+            generate_exception_if(!(mxcsr & MXCSR_MM) &&
+                                  !is_aligned(ea.mem.seg, ea.mem.off, op_bytes,
+                                              ctxt, ops),
+                                  EXC_GP, 0);
+            if ( (d & SrcMask) == SrcMem )
+            {
+                rc = ops->read(ea.mem.seg, ea.mem.off, mmvalp, op_bytes, ctxt);
+                if ( rc != X86EMUL_OKAY )
+                    goto done;
+                dst.type = OP_NONE;
+            }
+            else if ( (d & DstMask) == DstMem )
+            {
+                fail_if(!ops->write); /* Check before running the stub. */
+                ASSERT(d & Mov);
+                dst.type = OP_MEM;
+                dst.bytes = op_bytes;
+                dst.mem = ea.mem;
+            }
+            else if ( (d & SrcMask) == SrcMem16 )
+                dst.type = OP_NONE;
+            else
+            {
+                ASSERT_UNREACHABLE();
+                return X86EMUL_UNHANDLEABLE;
+            }
+        }
+        else
+            dst.type = OP_NONE;
+
+        invoke_stub("", "", "+m" (*mmvalp) : "a" (mmvalp));
+
+        put_stub(stub);
+        put_fpu(&fic);
+    }
+
     switch ( dst.type )
     {
     case OP_REG:
@@ -6240,8 +6704,11 @@ x86_emulate(
         else
         {
             fail_if(!ops->write);
-            rc = ops->write(
-                dst.mem.seg, dst.mem.off, &dst.val, dst.bytes, ctxt);
+            rc = ops->write(dst.mem.seg, dst.mem.off,
+                            !state->simd_size ? &dst.val : (void *)mmvalp,
+                            dst.bytes, ctxt);
+            if ( sfence )
+                asm volatile ( "sfence" ::: "memory" );
         }
         if ( rc != 0 )
             goto done;
@@ -6498,22 +6965,6 @@ x86_insn_is_mem_write(const struct x86_e
     case 0x6c: case 0x6d:                /* INS */
     case 0xa4: case 0xa5:                /* MOVS */
     case 0xaa: case 0xab:                /* STOS */
-    case X86EMUL_OPC(0x0f, 0x11):        /* MOVUPS */
-    case X86EMUL_OPC_VEX(0x0f, 0x11):    /* VMOVUPS */
-    case X86EMUL_OPC_66(0x0f, 0x11):     /* MOVUPD */
-    case X86EMUL_OPC_VEX_66(0x0f, 0x11): /* VMOVUPD */
-    case X86EMUL_OPC_F3(0x0f, 0x11):     /* MOVSS */
-    case X86EMUL_OPC_VEX_F3(0x0f, 0x11): /* VMOVSS */
-    case X86EMUL_OPC_F2(0x0f, 0x11):     /* MOVSD */
-    case X86EMUL_OPC_VEX_F2(0x0f, 0x11): /* VMOVSD */
-    case X86EMUL_OPC(0x0f, 0x29):        /* MOVAPS */
-    case X86EMUL_OPC_VEX(0x0f, 0x29):    /* VMOVAPS */
-    case X86EMUL_OPC_66(0x0f, 0x29):     /* MOVAPD */
-    case X86EMUL_OPC_VEX_66(0x0f, 0x29): /* VMOVAPD */
-    case X86EMUL_OPC(0x0f, 0x2b):        /* MOVNTPS */
-    case X86EMUL_OPC_VEX(0x0f, 0x2b):    /* VMOVNTPS */
-    case X86EMUL_OPC_66(0x0f, 0x2b):     /* MOVNTPD */
-    case X86EMUL_OPC_VEX_66(0x0f, 0x2b): /* VMOVNTPD */
     case X86EMUL_OPC(0x0f, 0x7e):        /* MOVD/MOVQ */
     case X86EMUL_OPC_66(0x0f, 0x7e):     /* MOVD/MOVQ */
     case X86EMUL_OPC_VEX_66(0x0f, 0x7e): /* VMOVD/VMOVQ */
--- a/xen/include/asm-x86/cpufeature.h
+++ b/xen/include/asm-x86/cpufeature.h
@@ -71,12 +71,14 @@
 #define cpu_has_xsavec		boot_cpu_has(X86_FEATURE_XSAVEC)
 #define cpu_has_xgetbv1		boot_cpu_has(X86_FEATURE_XGETBV1)
 #define cpu_has_xsaves		boot_cpu_has(X86_FEATURE_XSAVES)
+#define cpu_has_avx2		boot_cpu_has(X86_FEATURE_AVX2)
 #define cpu_has_monitor		boot_cpu_has(X86_FEATURE_MONITOR)
 #define cpu_has_eist		boot_cpu_has(X86_FEATURE_EIST)
 #define cpu_has_hypervisor	boot_cpu_has(X86_FEATURE_HYPERVISOR)
 #define cpu_has_rdrand		boot_cpu_has(X86_FEATURE_RDRAND)
 #define cpu_has_rdseed		boot_cpu_has(X86_FEATURE_RDSEED)
 #define cpu_has_cmp_legacy	boot_cpu_has(X86_FEATURE_CMP_LEGACY)
+#define cpu_has_sse4a		boot_cpu_has(X86_FEATURE_SSE4A)
 #define cpu_has_tbm		boot_cpu_has(X86_FEATURE_TBM)
 
 enum _cache_type {

[-- Attachment #3: Type: text/plain, Size: 127 bytes --]

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
https://lists.xen.org/xen-devel


Thread overview: 9+ messages
2017-01-25 14:49 [PATCH 0/8] x86emul: MMX/SSE/SSE2 support Jan Beulich
2017-01-25 15:03 ` [PATCH 1/8] x86emul: catch exceptions occurring in stubs Jan Beulich
2017-01-25 15:04 ` Jan Beulich [this message]
2017-01-25 15:04 ` [PATCH 3/8] x86emul: support MMX/SSE/SSE2 moves Jan Beulich
2017-01-25 15:05 ` [PATCH 4/8] x86emul: support MMX/SSE/SSE2 converts Jan Beulich
2017-01-25 15:05 ` [PATCH 5/8] x86emul: support {,V}{,U}COMIS{S,D} Jan Beulich
2017-01-25 15:06 ` [PATCH 6/8] x86emul: support MMX/SSE/SSE2 insns with only register operands Jan Beulich
2017-01-25 15:06 ` [PATCH 7/8] x86emul: support {,V}{LD,ST}MXCSR Jan Beulich
2017-01-25 15:07 ` [PATCH 8/8] x86emul: support {,V}MOVNTDQA Jan Beulich
