From: "Jan Beulich" <JBeulich@suse.com>
To: xen-devel <xen-devel@lists.xenproject.org>
Cc: George Dunlap <George.Dunlap@eu.citrix.com>,
	Andrew Cooper <andrew.cooper3@citrix.com>,
	Wei Liu <wei.liu2@citrix.com>,
	Roger Pau Monne <roger.pau@citrix.com>
Subject: [PATCH v5 02/47] x86emul: support basic AVX512 moves
Date: Mon, 19 Nov 2018 03:13:50 -0700
Message-ID: <5BF28CDE02000078001FD402@prv1-mh.provo.novell.com>
In-Reply-To: <5BF289D802000078001FD3DF@prv1-mh.provo.novell.com>

Note: SDM Vol 2 rev 067 is not really consistent about EVEX.L'L for LIG
      insns - the only place where this is made explicit is a table in
      the section titled "Vector Length Orthogonality": while such insns
      tolerate L'L values of 0, 1, and 2, a value of 3 uniformly leads
      to #UD.
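
      (Illustrative sketch only, not part of the patch: the check the
      note describes, with made-up names; it mirrors the
      avx512_vlen_check() macro added further down.)

      #include <stdbool.h>

      static bool evex_ll_ok(unsigned int ll /* EVEX.L'L */, bool lig,
                             bool has_avx512vl)
      {
          if ( ll == 3 )
              return false;        /* uniformly #UD */
          if ( lig || ll == 2 )
              return true;         /* LIG or 512-bit: plain AVX512F suffices */
          return has_avx512vl;     /* 128-/256-bit forms need AVX512VL */
      }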

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v5: Use IMPOSSIBLE() to guard against division by zero. Correct style.
    Re-base.
v4: Introduce d8s_dq64 to deal with 32-bit mode VMOVD with EVEX.W set.
    Adjust a comment.
v3: Restrict k-reg reading to insns with memory operand. Shrink scope of
    "disp8scale".
v2: Move "full" into a narrower scope.
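
For context (not part of the patch): EVEX encodings compress 8-bit
displacements by the size of the memory access, which the new
"disp8scale" handling below undoes when fetching the displacement.
A hedged arithmetic sketch, using the vmovdqu32 64(%edx),%zmm2 test
case added below (full 64-byte vector, so the assembler emits the
displacement 64 as disp8 = 1 with a scale of log2(64) = 6):

    #include <stdint.h>

    /* disp8scale is the log2 of the scaling factor N, as computed by
     * the new decode_disp8scale() helper. */
    static int32_t scale_disp8(int8_t disp8, unsigned int disp8scale)
    {
        return (int32_t)disp8 << disp8scale;  /* 1 << 6 == 64 here */
    }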

--- a/tools/tests/x86_emulator/test_x86_emulator.c
+++ b/tools/tests/x86_emulator/test_x86_emulator.c
@@ -1985,6 +1985,53 @@ int main(int argc, char **argv)
     else
         printf("skipped\n");
 
+    printf("%-40s", "Testing {evex} vmovq %xmm1,32(%edx)...");
+    if ( stack_exec && cpu_has_avx512f )
+    {
+        decl_insn(evex_vmovq_to_mem);
+
+        asm volatile ( "pcmpgtb %%xmm1, %%xmm1\n"
+                       put_insn(evex_vmovq_to_mem, "%{evex%} vmovq %%xmm1, 32(%0)")
+                       :: "d" (NULL) );
+
+        memset(res, 0xdb, 64);
+        set_insn(evex_vmovq_to_mem);
+        regs.ecx = 0;
+        regs.edx = (unsigned long)res;
+        rc = x86_emulate(&ctxt, &emulops);
+        if ( rc != X86EMUL_OKAY || !check_eip(evex_vmovq_to_mem) ||
+             *((uint64_t *)res + 4) ||
+             memcmp(res, res + 10, 24) ||
+             memcmp(res, res + 6, 8) )
+            goto fail;
+        printf("okay\n");
+    }
+    else
+        printf("skipped\n");
+
+    printf("%-40s", "Testing {evex} vmovq 32(%edx),%xmm0...");
+    if ( stack_exec && cpu_has_avx512f )
+    {
+        decl_insn(evex_vmovq_from_mem);
+
+        asm volatile ( "pcmpeqb %%xmm0, %%xmm0\n"
+                       put_insn(evex_vmovq_from_mem, "%{evex%} vmovq 32(%0), %%xmm0")
+                       :: "d" (NULL) );
+
+        set_insn(evex_vmovq_from_mem);
+        rc = x86_emulate(&ctxt, &emulops);
+        if ( rc != X86EMUL_OKAY || !check_eip(evex_vmovq_from_mem) )
+            goto fail;
+        asm ( "vmovq %1, %%xmm1\n\t"
+              "vpcmpeqq %%zmm0, %%zmm1, %%k0\n"
+              "kmovw %%k0, %0" : "=r" (rc) : "m" (res[8]) );
+        if ( rc != 0xff )
+            goto fail;
+        printf("okay\n");
+    }
+    else
+        printf("skipped\n");
+
     printf("%-40s", "Testing movdqu %xmm2,(%ecx)...");
     if ( stack_exec && cpu_has_sse2 )
     {
@@ -2085,6 +2132,118 @@ int main(int argc, char **argv)
     else
         printf("skipped\n");
 
+    printf("%-40s", "Testing vmovdqu32 %zmm2,(%ecx){%k1}...");
+    if ( stack_exec && cpu_has_avx512f )
+    {
+        decl_insn(vmovdqu32_to_mem);
+
+        memset(res, 0x55, 128);
+
+        asm volatile ( "vpcmpeqd %%ymm2, %%ymm2, %%ymm2\n\t"
+                       "kmovw %1,%%k1\n"
+                       put_insn(vmovdqu32_to_mem,
+                                "vmovdqu32 %%zmm2, (%0)%{%%k1%}")
+                       :: "c" (NULL), "rm" (res[0]) );
+        set_insn(vmovdqu32_to_mem);
+
+        regs.ecx = (unsigned long)res;
+        rc = x86_emulate(&ctxt, &emulops);
+        if ( (rc != X86EMUL_OKAY) || memcmp(res + 16, res + 24, 32) ||
+             !check_eip(vmovdqu32_to_mem) )
+            goto fail;
+
+        res[16] = ~0; res[18] = ~0; res[20] = ~0; res[22] = ~0;
+        res[24] =  0; res[26] =  0; res[28] =  0; res[30] =  0;
+        if ( memcmp(res, res + 16, 64) )
+            goto fail;
+
+        printf("okay\n");
+    }
+    else
+        printf("skipped\n");
+
+    printf("%-40s", "Testing vmovdqu32 64(%edx),%zmm2{%k2}...");
+    if ( stack_exec && cpu_has_avx512f )
+    {
+        decl_insn(vmovdqu32_from_mem);
+
+        asm volatile ( "knotw %%k1, %%k2\n"
+                       put_insn(vmovdqu32_from_mem,
+                                "vmovdqu32 64(%0), %%zmm2%{%%k2%}")
+                       :: "d" (NULL) );
+
+        set_insn(vmovdqu32_from_mem);
+        regs.ecx = 0;
+        regs.edx = (unsigned long)res;
+        rc = x86_emulate(&ctxt, &emulops);
+        if ( rc != X86EMUL_OKAY || !check_eip(vmovdqu32_from_mem) )
+            goto fail;
+        asm ( "vpcmpeqd %1, %%zmm2, %%k0\n\t"
+              "kmovw %%k0, %0" : "=r" (rc) : "m" (res[0]) );
+        if ( rc != 0xffff )
+            goto fail;
+        printf("okay\n");
+    }
+    else
+        printf("skipped\n");
+
+    printf("%-40s", "Testing vmovdqu16 %zmm3,(%ecx){%k1}...");
+    if ( stack_exec && cpu_has_avx512bw )
+    {
+        decl_insn(vmovdqu16_to_mem);
+
+        memset(res, 0x55, 128);
+
+        asm volatile ( "vpcmpeqw %%ymm3, %%ymm3, %%ymm3\n\t"
+                       "kmovd %1,%%k1\n"
+                       put_insn(vmovdqu16_to_mem,
+                                "vmovdqu16 %%zmm3, (%0)%{%%k1%}")
+                       :: "c" (NULL), "rm" (res[0]) );
+        set_insn(vmovdqu16_to_mem);
+
+        regs.ecx = (unsigned long)res;
+        rc = x86_emulate(&ctxt, &emulops);
+        if ( (rc != X86EMUL_OKAY) || memcmp(res + 16, res + 24, 32) ||
+             !check_eip(vmovdqu16_to_mem) )
+            goto fail;
+
+        for ( i = 16; i < 24; ++i )
+            res[i] |= 0x0000ffff;
+        for ( ; i < 32; ++i )
+            res[i] &= 0xffff0000;
+        if ( memcmp(res, res + 16, 64) )
+            goto fail;
+
+        printf("okay\n");
+    }
+    else
+        printf("skipped\n");
+
+    printf("%-40s", "Testing vmovdqu16 64(%edx),%zmm3{%k2}...");
+    if ( stack_exec && cpu_has_avx512bw )
+    {
+        decl_insn(vmovdqu16_from_mem);
+
+        asm volatile ( "knotd %%k1, %%k2\n"
+                       put_insn(vmovdqu16_from_mem,
+                                "vmovdqu16 64(%0), %%zmm3%{%%k2%}")
+                       :: "d" (NULL) );
+
+        set_insn(vmovdqu16_from_mem);
+        regs.ecx = 0;
+        regs.edx = (unsigned long)res;
+        rc = x86_emulate(&ctxt, &emulops);
+        if ( rc != X86EMUL_OKAY || !check_eip(vmovdqu16_from_mem) )
+            goto fail;
+        asm ( "vpcmpeqw %1, %%zmm3, %%k0\n\t"
+              "kmovd %%k0, %0" : "=r" (rc) : "m" (res[0]) );
+        if ( rc != 0xffffffff )
+            goto fail;
+        printf("okay\n");
+    }
+    else
+        printf("skipped\n");
+
     printf("%-40s", "Testing movsd %xmm5,(%ecx)...");
     memset(res, 0x77, 64);
     memset(res + 10, 0x66, 8);
@@ -2186,6 +2345,71 @@ int main(int argc, char **argv)
     else
         printf("skipped\n");
 
+    printf("%-40s", "Testing vmovsd %xmm5,16(%ecx){%k3}...");
+    memset(res, 0x88, 128);
+    memset(res + 20, 0x77, 8);
+    if ( stack_exec && cpu_has_avx512f )
+    {
+        decl_insn(vmovsd_masked_to_mem);
+
+        asm volatile ( "vbroadcastsd %0, %%ymm5\n\t"
+                       "kxorw %%k3, %%k3, %%k3\n"
+                       put_insn(vmovsd_masked_to_mem,
+                                "vmovsd %%xmm5, 16(%1)%{%%k3%}")
+                       :: "m" (res[20]), "c" (NULL) );
+
+        set_insn(vmovsd_masked_to_mem);
+        regs.ecx = 0;
+        regs.edx = 0;
+        rc = x86_emulate(&ctxt, &emulops);
+        if ( (rc != X86EMUL_OKAY) || !check_eip(vmovsd_masked_to_mem) )
+            goto fail;
+
+        asm volatile ( "kmovw %0, %%k3\n" :: "m" (res[20]) );
+
+        set_insn(vmovsd_masked_to_mem);
+        regs.ecx = (unsigned long)res;
+        rc = x86_emulate(&ctxt, &emulops);
+        if ( (rc != X86EMUL_OKAY) || !check_eip(vmovsd_masked_to_mem) ||
+             memcmp(res, res + 16, 64) )
+            goto fail;
+
+        printf("okay\n");
+    }
+    else
+    {
+        printf("skipped\n");
+        memset(res + 4, 0x77, 8);
+    }
+
+    printf("%-40s", "Testing vmovaps (%edx),%zmm7{%k3}{z}...");
+    if ( stack_exec && cpu_has_avx512f )
+    {
+        decl_insn(vmovaps_masked_from_mem);
+
+        asm volatile ( "vpcmpeqd %%xmm7, %%xmm7, %%xmm7\n\t"
+                       "vbroadcastss %%xmm7, %%zmm7\n"
+                       put_insn(vmovaps_masked_from_mem,
+                                "vmovaps (%0), %%zmm7%{%%k3%}%{z%}")
+                       :: "d" (NULL) );
+
+        set_insn(vmovaps_masked_from_mem);
+        regs.edx = (unsigned long)res;
+        rc = x86_emulate(&ctxt, &emulops);
+        if ( rc != X86EMUL_OKAY || !check_eip(vmovaps_masked_from_mem) )
+            goto fail;
+        asm ( "vcmpeqps %1, %%zmm7, %%k0\n\t"
+              "vxorps %%xmm0, %%xmm0, %%xmm0\n\t"
+              "vcmpeqps %%zmm0, %%zmm7, %%k1\n\t"
+              "kxorw %%k1, %%k0, %%k0\n\t"
+              "kmovw %%k0, %0" : "=r" (rc) : "m" (res[16]) );
+        if ( rc != 0xffff )
+            goto fail;
+        printf("okay\n");
+    }
+    else
+        printf("skipped\n");
+
     printf("%-40s", "Testing movd %mm3,32(%ecx)...");
     if ( stack_exec && cpu_has_mmx )
     {
@@ -2341,6 +2565,55 @@ int main(int argc, char **argv)
     else
         printf("skipped\n");
 
+    printf("%-40s", "Testing {evex} vmovd %xmm3,32(%ecx)...");
+    if ( stack_exec && cpu_has_avx512f )
+    {
+        decl_insn(evex_vmovd_to_mem);
+
+        asm volatile ( "pcmpeqb %%xmm3, %%xmm3\n"
+                       put_insn(evex_vmovd_to_mem,
+                                "%{evex%} vmovd %%xmm3, 32(%0)")
+                       :: "c" (NULL) );
+
+        memset(res, 0xbd, 64);
+        set_insn(evex_vmovd_to_mem);
+        regs.ecx = (unsigned long)res;
+        regs.edx = 0;
+        rc = x86_emulate(&ctxt, &emulops);
+        if ( rc != X86EMUL_OKAY || !check_eip(evex_vmovd_to_mem) ||
+             res[8] + 1 ||
+             memcmp(res, res + 9, 28) ||
+             memcmp(res, res + 6, 8) )
+            goto fail;
+        printf("okay\n");
+    }
+    else
+        printf("skipped\n");
+
+    printf("%-40s", "Testing {evex} vmovd 32(%ecx),%xmm4...");
+    if ( stack_exec && cpu_has_avx512f )
+    {
+        decl_insn(evex_vmovd_from_mem);
+
+        asm volatile ( "pcmpeqb %%xmm4, %%xmm4\n"
+                       put_insn(evex_vmovd_from_mem,
+                                "%{evex%} vmovd 32(%0), %%xmm4")
+                       :: "c" (NULL) );
+
+        set_insn(evex_vmovd_from_mem);
+        rc = x86_emulate(&ctxt, &emulops);
+        if ( rc != X86EMUL_OKAY || !check_eip(evex_vmovd_from_mem) )
+            goto fail;
+        asm ( "vmovd %1, %%xmm0\n\t"
+              "vpcmpeqd %%zmm4, %%zmm0, %%k0\n\t"
+              "kmovw %%k0, %0" : "=r" (rc) : "m" (res[8]) );
+        if ( rc != 0xffff )
+            goto fail;
+        printf("okay\n");
+    }
+    else
+        printf("skipped\n");
+
     printf("%-40s", "Testing movd %mm3,%ebx...");
     if ( stack_exec && cpu_has_mmx )
     {
@@ -2507,6 +2780,57 @@ int main(int argc, char **argv)
     else
         printf("skipped\n");
 
+    printf("%-40s", "Testing {evex} vmovd %xmm2,%ebx...");
+    if ( stack_exec && cpu_has_avx512f )
+    {
+        decl_insn(evex_vmovd_to_reg);
+
+        /* See comment next to movd above. */
+        asm volatile ( "pcmpeqb %%xmm2, %%xmm2\n"
+                       put_insn(evex_vmovd_to_reg,
+                                "%{evex%} vmovd %%xmm2, %%ebx")
+                       :: );
+
+        set_insn(evex_vmovd_to_reg);
+#ifdef __x86_64__
+        regs.rbx = 0xbdbdbdbdbdbdbdbdUL;
+#else
+        regs.ebx = 0xbdbdbdbdUL;
+#endif
+        rc = x86_emulate(&ctxt, &emulops);
+        if ( (rc != X86EMUL_OKAY) || !check_eip(evex_vmovd_to_reg) ||
+             regs.ebx != 0xffffffff )
+            goto fail;
+        printf("okay\n");
+    }
+    else
+        printf("skipped\n");
+
+    printf("%-40s", "Testing {evex} vmovd %ebx,%xmm1...");
+    if ( stack_exec && cpu_has_avx512f )
+    {
+        decl_insn(evex_vmovd_from_reg);
+
+        /* See comment next to movd above. */
+        asm volatile ( "pcmpgtb %%xmm1, %%xmm1\n"
+                       put_insn(evex_vmovd_from_reg,
+                                "%{evex%} vmovd %%ebx, %%xmm1")
+                       :: );
+
+        set_insn(evex_vmovd_from_reg);
+        rc = x86_emulate(&ctxt, &emulops);
+        if ( (rc != X86EMUL_OKAY) || !check_eip(evex_vmovd_from_reg) )
+            goto fail;
+        asm ( "vmovd %1, %%xmm0\n\t"
+              "vpcmpeqd %%zmm1, %%zmm0, %%k0\n\t"
+              "kmovw %%k0, %0" : "=r" (rc) : "m" (res[8]) );
+        if ( rc != 0xffff )
+            goto fail;
+        printf("okay\n");
+    }
+    else
+        printf("skipped\n");
+
 #ifdef __x86_64__
     printf("%-40s", "Testing movq %mm3,32(%ecx)...");
     if ( stack_exec && cpu_has_mmx )
@@ -2584,6 +2908,36 @@ int main(int argc, char **argv)
     else
         printf("skipped\n");
 
+    printf("%-40s", "Testing {evex} vmovq %xmm11,32(%ecx)...");
+    if ( stack_exec && cpu_has_avx512f )
+    {
+        decl_insn(evex_vmovq_to_mem2);
+
+        asm volatile ( "pcmpeqb %%xmm11, %%xmm11\n"
+#if 0 /* This may not work, as the assembler might pick opcode D6. */
+                       put_insn(evex_vmovq_to_mem2,
+                                "{evex} vmovq %%xmm11, 32(%0)")
+#else
+                       put_insn(evex_vmovq_to_mem2,
+                                ".byte 0x62, 0xf1, 0xfd, 0x08, 0x7e, 0x49, 0x04")
+#endif
+                       :: "c" (NULL) );
+
+        memset(res, 0xbd, 64);
+        set_insn(evex_vmovq_to_mem2);
+        regs.ecx = (unsigned long)res;
+        regs.edx = 0;
+        rc = x86_emulate(&ctxt, &emulops);
+        if ( rc != X86EMUL_OKAY || !check_eip(evex_vmovq_to_mem2) ||
+             *((long *)res + 4) + 1 ||
+             memcmp(res, res + 10, 24) ||
+             memcmp(res, res + 6, 8) )
+            goto fail;
+        printf("okay\n");
+    }
+    else
+        printf("skipped\n");
+
     printf("%-40s", "Testing movq %mm3,%rbx...");
     if ( stack_exec && cpu_has_mmx )
     {
@@ -2643,6 +2997,28 @@ int main(int argc, char **argv)
     }
     else
         printf("skipped\n");
+
+    printf("%-40s", "Testing vmovq %xmm22,%rbx...");
+    if ( stack_exec && cpu_has_avx512f )
+    {
+        decl_insn(evex_vmovq_to_reg);
+
+        /* See comment next to movd above. */
+        asm volatile ( "pcmpeqq %%xmm2, %%xmm2\n\t"
+                       "vmovq %%xmm2, %%xmm22\n"
+                       put_insn(evex_vmovq_to_reg, "vmovq %%xmm22, %%rbx")
+                       :: );
+
+        set_insn(evex_vmovq_to_reg);
+        regs.rbx = 0xbdbdbdbdbdbdbdbdUL;
+        rc = x86_emulate(&ctxt, &emulops);
+        if ( rc != X86EMUL_OKAY || !check_eip(evex_vmovq_to_reg) ||
+             regs.rbx + 1 )
+            goto fail;
+        printf("okay\n");
+    }
+    else
+        printf("skipped\n");
 #endif
 
     printf("%-40s", "Testing maskmovq %mm4,%mm4...");
@@ -2812,6 +3188,32 @@ int main(int argc, char **argv)
             goto fail;
         printf("okay\n");
     }
+    else
+        printf("skipped\n");
+
+    printf("%-40s", "Testing vmovntdqa 64(%ecx),%zmm4...");
+    if ( stack_exec && cpu_has_avx512f )
+    {
+        decl_insn(evex_vmovntdqa);
+
+        asm volatile ( "vpxor %%xmm4, %%xmm4, %%xmm4\n"
+                       put_insn(evex_vmovntdqa, "vmovntdqa 64(%0), %%zmm4")
+                       :: "c" (NULL) );
+
+        set_insn(evex_vmovntdqa);
+        memset(res, 0x55, 192);
+        memset(res + 16, 0xff, 64);
+        regs.ecx = (unsigned long)res;
+        rc = x86_emulate(&ctxt, &emulops);
+        if ( rc != X86EMUL_OKAY || !check_eip(evex_vmovntdqa) )
+            goto fail;
+        asm ( "vpbroadcastd %1, %%zmm2\n\t"
+              "vpcmpeqd %%zmm4, %%zmm2, %%k0\n\t"
+              "kmovw %%k0, %0" : "=r" (rc) : "0" (~0) );
+        if ( rc != 0xffff )
+            goto fail;
+        printf("okay\n");
+    }
     else
         printf("skipped\n");
 
--- a/tools/tests/x86_emulator/x86-emulate.c
+++ b/tools/tests/x86_emulator/x86-emulate.c
@@ -222,6 +222,7 @@ int emul_test_get_fpu(
         if ( cpu_has_avx )
             break;
     case X86EMUL_FPU_opmask:
+    case X86EMUL_FPU_zmm:
         if ( cpu_has_avx512f )
             break;
     default:
--- a/tools/tests/x86_emulator/x86-emulate.h
+++ b/tools/tests/x86_emulator/x86-emulate.h
@@ -132,6 +132,7 @@ static inline bool xcr0_mask(uint64_t ma
 #define cpu_has_avx512f   (cp.feat.avx512f  && xcr0_mask(0xe6))
 #define cpu_has_avx512dq  (cp.feat.avx512dq && xcr0_mask(0xe6))
 #define cpu_has_avx512bw  (cp.feat.avx512bw && xcr0_mask(0xe6))
+#define cpu_has_avx512vl  (cp.feat.avx512vl && xcr0_mask(0xe6))
 
 #define cpu_has_xgetbv1   (cpu_has_xsave && cp.xstate.xgetbv1)
 
--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -243,9 +243,27 @@ enum simd_opsize {
 };
 typedef uint8_t simd_opsize_t;
 
+enum disp8scale {
+    /* Values 0 ... 4 are explicit sizes. */
+    d8s_bw = 5,
+    d8s_dq,
+    /* EVEX.W ignored outside of 64-bit mode */
+    d8s_dq64,
+    /*
+     * All further values must strictly be last and in the order
+     * given so that arithmetic on the values works.
+     */
+    d8s_vl,
+    d8s_vl_by_2,
+    d8s_vl_by_4,
+    d8s_vl_by_8,
+};
+typedef uint8_t disp8scale_t;
+
 static const struct twobyte_table {
     opcode_desc_t desc;
-    simd_opsize_t size;
+    simd_opsize_t size:4;
+    disp8scale_t d8s:4;
 } twobyte_table[256] = {
     [0x00] = { ModRM },
     [0x01] = { ImplicitOps|ModRM },
@@ -260,8 +278,8 @@ static const struct twobyte_table {
     [0x0d] = { ImplicitOps|ModRM },
     [0x0e] = { ImplicitOps },
     [0x0f] = { ModRM|SrcImmByte },
-    [0x10] = { DstImplicit|SrcMem|ModRM|Mov, simd_any_fp },
-    [0x11] = { DstMem|SrcImplicit|ModRM|Mov, simd_any_fp },
+    [0x10] = { DstImplicit|SrcMem|ModRM|Mov, simd_any_fp, d8s_vl },
+    [0x11] = { DstMem|SrcImplicit|ModRM|Mov, simd_any_fp, d8s_vl },
     [0x12] = { DstImplicit|SrcMem|ModRM|Mov, simd_other },
     [0x13] = { DstMem|SrcImplicit|ModRM|Mov, simd_other },
     [0x14 ... 0x15] = { DstImplicit|SrcMem|ModRM, simd_packed_fp },
@@ -270,10 +288,10 @@ static const struct twobyte_table {
     [0x18 ... 0x1f] = { ImplicitOps|ModRM },
     [0x20 ... 0x21] = { DstMem|SrcImplicit|ModRM },
     [0x22 ... 0x23] = { DstImplicit|SrcMem|ModRM },
-    [0x28] = { DstImplicit|SrcMem|ModRM|Mov, simd_packed_fp },
-    [0x29] = { DstMem|SrcImplicit|ModRM|Mov, simd_packed_fp },
+    [0x28] = { DstImplicit|SrcMem|ModRM|Mov, simd_packed_fp, d8s_vl },
+    [0x29] = { DstMem|SrcImplicit|ModRM|Mov, simd_packed_fp, d8s_vl },
     [0x2a] = { DstImplicit|SrcMem|ModRM|Mov, simd_other },
-    [0x2b] = { DstMem|SrcImplicit|ModRM|Mov, simd_any_fp },
+    [0x2b] = { DstMem|SrcImplicit|ModRM|Mov, simd_any_fp, d8s_vl },
     [0x2c ... 0x2d] = { DstImplicit|SrcMem|ModRM|Mov, simd_other },
     [0x2e ... 0x2f] = { ImplicitOps|ModRM|TwoOp },
     [0x30 ... 0x35] = { ImplicitOps },
@@ -292,8 +310,8 @@ static const struct twobyte_table {
     [0x63 ... 0x67] = { DstImplicit|SrcMem|ModRM, simd_packed_int },
     [0x68 ... 0x6a] = { DstImplicit|SrcMem|ModRM, simd_other },
     [0x6b ... 0x6d] = { DstImplicit|SrcMem|ModRM, simd_packed_int },
-    [0x6e] = { DstImplicit|SrcMem|ModRM|Mov },
-    [0x6f] = { DstImplicit|SrcMem|ModRM|Mov, simd_packed_int },
+    [0x6e] = { DstImplicit|SrcMem|ModRM|Mov, simd_none, d8s_dq64 },
+    [0x6f] = { DstImplicit|SrcMem|ModRM|Mov, simd_packed_int, d8s_vl },
     [0x70] = { SrcImmByte|ModRM|TwoOp, simd_other },
     [0x71 ... 0x73] = { DstImplicit|SrcImmByte|ModRM },
     [0x74 ... 0x76] = { DstImplicit|SrcMem|ModRM, simd_packed_int },
@@ -301,8 +319,8 @@ static const struct twobyte_table {
     [0x78] = { ImplicitOps|ModRM },
     [0x79] = { DstReg|SrcMem|ModRM, simd_packed_int },
     [0x7c ... 0x7d] = { DstImplicit|SrcMem|ModRM, simd_other },
-    [0x7e] = { DstMem|SrcImplicit|ModRM|Mov },
-    [0x7f] = { DstMem|SrcImplicit|ModRM|Mov, simd_packed_int },
+    [0x7e] = { DstMem|SrcImplicit|ModRM|Mov, simd_none, d8s_dq64 },
+    [0x7f] = { DstMem|SrcImplicit|ModRM|Mov, simd_packed_int, d8s_vl },
     [0x80 ... 0x8f] = { DstImplicit|SrcImm },
     [0x90 ... 0x9f] = { ByteOp|DstMem|SrcNone|ModRM|Mov },
     [0xa0 ... 0xa1] = { ImplicitOps|Mov },
@@ -344,14 +362,14 @@ static const struct twobyte_table {
     [0xd0] = { DstImplicit|SrcMem|ModRM, simd_other },
     [0xd1 ... 0xd3] = { DstImplicit|SrcMem|ModRM, simd_other },
     [0xd4 ... 0xd5] = { DstImplicit|SrcMem|ModRM, simd_packed_int },
-    [0xd6] = { DstMem|SrcImplicit|ModRM|Mov, simd_other },
+    [0xd6] = { DstMem|SrcImplicit|ModRM|Mov, simd_other, 3 },
     [0xd7] = { DstReg|SrcImplicit|ModRM|Mov },
     [0xd8 ... 0xdf] = { DstImplicit|SrcMem|ModRM, simd_packed_int },
     [0xe0] = { DstImplicit|SrcMem|ModRM, simd_packed_int },
     [0xe1 ... 0xe2] = { DstImplicit|SrcMem|ModRM, simd_other },
     [0xe3 ... 0xe5] = { DstImplicit|SrcMem|ModRM, simd_packed_int },
     [0xe6] = { DstImplicit|SrcMem|ModRM|Mov, simd_other },
-    [0xe7] = { DstMem|SrcImplicit|ModRM|Mov, simd_packed_int },
+    [0xe7] = { DstMem|SrcImplicit|ModRM|Mov, simd_packed_int, d8s_vl },
     [0xe8 ... 0xef] = { DstImplicit|SrcMem|ModRM, simd_packed_int },
     [0xf0] = { DstImplicit|SrcMem|ModRM|Mov, simd_other },
     [0xf1 ... 0xf3] = { DstImplicit|SrcMem|ModRM, simd_other },
@@ -406,6 +424,7 @@ static const struct ext0f38_table {
     uint8_t to_mem:1;
     uint8_t two_op:1;
     uint8_t vsib:1;
+    disp8scale_t d8s:4;
 } ext0f38_table[256] = {
     [0x00 ... 0x0b] = { .simd_size = simd_packed_int },
     [0x0c ... 0x0f] = { .simd_size = simd_packed_fp },
@@ -418,7 +437,7 @@ static const struct ext0f38_table {
     [0x1c ... 0x1e] = { .simd_size = simd_packed_int, .two_op = 1 },
     [0x20 ... 0x25] = { .simd_size = simd_other, .two_op = 1 },
     [0x28 ... 0x29] = { .simd_size = simd_packed_int },
-    [0x2a] = { .simd_size = simd_packed_int, .two_op = 1 },
+    [0x2a] = { .simd_size = simd_packed_int, .two_op = 1, .d8s = d8s_vl },
     [0x2b] = { .simd_size = simd_packed_int },
     [0x2c ... 0x2d] = { .simd_size = simd_other },
     [0x2e ... 0x2f] = { .simd_size = simd_other, .to_mem = 1 },
@@ -656,6 +675,22 @@ union evex {
     };
 };
 
+#define EVEX_PFX_BYTES 4
+#define init_evex(stub) ({ \
+    uint8_t *buf_ = get_stub(stub); \
+    buf_[0] = 0x62; \
+    buf_ + EVEX_PFX_BYTES; \
+})
+
+#define copy_EVEX(ptr, evex) ({ \
+    if ( !mode_64bit() ) \
+        (evex).reg |= 8; \
+    (ptr)[1 - EVEX_PFX_BYTES] = (evex).raw[0]; \
+    (ptr)[2 - EVEX_PFX_BYTES] = (evex).raw[1]; \
+    (ptr)[3 - EVEX_PFX_BYTES] = (evex).raw[2]; \
+    container_of((ptr) + 1 - EVEX_PFX_BYTES, typeof(evex), raw[0]); \
+})
+
 #define rep_prefix()   (vex.pfx >= vex_f3)
 #define repe_prefix()  (vex.pfx == vex_f3)
 #define repne_prefix() (vex.pfx == vex_f2)
@@ -768,6 +803,7 @@ typedef union {
     uint64_t mmx;
     uint64_t __attribute__ ((aligned(16))) xmm[2];
     uint64_t __attribute__ ((aligned(32))) ymm[4];
+    uint64_t __attribute__ ((aligned(64))) zmm[8];
 } mmval_t;
 
 /*
@@ -1201,6 +1237,11 @@ static int _get_fpu(
 
     switch ( type )
     {
+    case X86EMUL_FPU_zmm:
+        if ( !(xcr0 & X86_XCR0_ZMM) || !(xcr0 & X86_XCR0_HI_ZMM) ||
+             !(xcr0 & X86_XCR0_OPMASK) )
+            return X86EMUL_UNHANDLEABLE;
+        /* fall through */
     case X86EMUL_FPU_ymm:
         if ( !(xcr0 & X86_XCR0_SSE) || !(xcr0 & X86_XCR0_YMM) )
             return X86EMUL_UNHANDLEABLE;
@@ -1787,6 +1828,7 @@ static bool vcpu_has(
 #define vcpu_has_clwb()        vcpu_has(         7, EBX, 24, ctxt, ops)
 #define vcpu_has_sha()         vcpu_has(         7, EBX, 29, ctxt, ops)
 #define vcpu_has_avx512bw()    vcpu_has(         7, EBX, 30, ctxt, ops)
+#define vcpu_has_avx512vl()    vcpu_has(         7, EBX, 31, ctxt, ops)
 #define vcpu_has_rdpid()       vcpu_has(         7, ECX, 22, ctxt, ops)
 #define vcpu_has_clzero()      vcpu_has(0x80000008, EBX,  0, ctxt, ops)
 
@@ -2160,6 +2202,65 @@ static unsigned long *decode_vex_gpr(
     return decode_gpr(regs, ~vex_reg & (mode_64bit() ? 0xf : 7));
 }
 
+static unsigned int decode_disp8scale(enum disp8scale scale,
+                                      const struct x86_emulate_state *state)
+{
+    switch ( scale )
+    {
+    case d8s_bw:
+        return state->evex.w;
+
+    default:
+        if ( scale < d8s_vl )
+            return scale;
+        if ( state->evex.br )
+        {
+    case d8s_dq:
+            return 2 + state->evex.w;
+        }
+        break;
+
+    case d8s_dq64:
+        return 2 + (state->op_bytes == 8);
+    }
+
+    switch ( state->simd_size )
+    {
+    case simd_any_fp:
+    case simd_single_fp:
+        if ( !(state->evex.pfx & VEX_PREFIX_SCALAR_MASK) )
+            break;
+        /* fall through */
+    case simd_scalar_opc:
+    case simd_scalar_vexw:
+        return 2 + state->evex.w;
+
+    case simd_128:
+        /* These should have an explicit size specified. */
+        ASSERT_UNREACHABLE();
+        return 4;
+
+    default:
+        break;
+    }
+
+    return 4 + state->evex.lr - (scale - d8s_vl);
+}
+
+#define avx512_vlen_check(lig) do { \
+    switch ( evex.lr ) \
+    { \
+    default: \
+        generate_exception(EXC_UD); \
+    case 2: \
+        break; \
+    case 0: case 1: \
+        if ( !(lig) ) \
+            host_and_vcpu_must_have(avx512vl); \
+        break; \
+    } \
+} while ( false )
+
 static bool is_aligned(enum x86_segment seg, unsigned long offs,
                        unsigned int size, struct x86_emulate_ctxt *ctxt,
                        const struct x86_emulate_ops *ops)
@@ -2406,6 +2507,7 @@ x86_decode_twobyte(
         if ( vex.pfx == vex_f3 ) /* movq xmm/m64,xmm */
         {
     case X86EMUL_OPC_VEX_F3(0, 0x7e): /* vmovq xmm/m64,xmm */
+    case X86EMUL_OPC_EVEX_F3(0, 0x7e): /* vmovq xmm/m64,xmm */
             state->desc = DstImplicit | SrcMem | TwoOp;
             state->simd_size = simd_other;
             /* Avoid the state->desc clobbering of TwoOp below. */
@@ -2476,7 +2578,7 @@ x86_decode_twobyte(
     }
 
     /*
-     * Scalar forms of most VEX-encoded TwoOp instructions have
+     * Scalar forms of most VEX-/EVEX-encoded TwoOp instructions have
      * three operands.  Those which do really have two operands
      * should have exited earlier.
      */
@@ -2841,6 +2943,8 @@ x86_decode(
 
     if ( d & ModRM )
     {
+        unsigned int disp8scale = 0;
+
         d &= ~ModRM;
 #undef ModRM /* Only its aliases are valid to use from here on. */
         modrm_reg = ((rex_prefix & 4) << 1) | ((modrm & 0x38) >> 3);
@@ -2883,6 +2987,9 @@ x86_decode(
             break;
 
         case ext_0f:
+            if ( evex_encoded() )
+                disp8scale = decode_disp8scale(twobyte_table[b].d8s, state);
+
             switch ( b )
             {
             case 0x20: /* mov cr,reg */
@@ -2896,6 +3003,11 @@ x86_decode(
                  */
                 modrm_mod = 3;
                 break;
+
+            case 0x7e: /* vmovq xmm/m64,xmm needs special casing */
+                if ( disp8scale == 2 && evex.pfx == vex_f3 )
+                    disp8scale = 3;
+                break;
             }
             break;
 
@@ -2907,6 +3019,8 @@ x86_decode(
             if ( ext0f38_table[b].vsib )
                 d |= vSIB;
             state->simd_size = ext0f38_table[b].simd_size;
+            if ( evex_encoded() )
+                disp8scale = decode_disp8scale(ext0f38_table[b].d8s, state);
             break;
 
         case ext_8f09:
@@ -2975,7 +3089,7 @@ x86_decode(
                     ea.mem.off = insn_fetch_type(int16_t);
                 break;
             case 1:
-                ea.mem.off += insn_fetch_type(int8_t);
+                ea.mem.off += insn_fetch_type(int8_t) << disp8scale;
                 break;
             case 2:
                 ea.mem.off += insn_fetch_type(int16_t);
@@ -3034,7 +3148,7 @@ x86_decode(
                 pc_rel = mode_64bit();
                 break;
             case 1:
-                ea.mem.off += insn_fetch_type(int8_t);
+                ea.mem.off += insn_fetch_type(int8_t) << disp8scale;
                 break;
             case 2:
                 ea.mem.off += insn_fetch_type(int32_t);
@@ -3235,10 +3349,11 @@ x86_emulate(
     struct x86_emulate_state state;
     int rc, cr4_rc;
     uint8_t b, d, *opc = NULL;
-    unsigned int first_byte = 0, insn_bytes = 0;
+    unsigned int first_byte = 0, elem_bytes, insn_bytes = 0;
+    uint64_t op_mask = ~0ULL;
     bool singlestep = (_regs.eflags & X86_EFLAGS_TF) &&
 	    !is_branch_step(ctxt, ops);
-    bool sfence = false;
+    bool sfence = false, fault_suppression = false;
     struct operand src = { .reg = PTR_POISON };
     struct operand dst = { .reg = PTR_POISON };
     unsigned long cr4 = 0;
@@ -3286,6 +3401,7 @@ x86_emulate(
     b = ctxt->opcode;
     d = state.desc;
 #define state (&state)
+    elem_bytes = 4 << evex.w;
 
     generate_exception_if(state->not_64bit && mode_64bit(), EXC_UD);
 
@@ -3360,6 +3476,28 @@ x86_emulate(
         break;
     }
 
+    /* With a memory operand, fetch the mask register in use (if any). */
+    if ( ea.type == OP_MEM && evex.opmsk )
+    {
+        uint8_t *stb = get_stub(stub);
+
+        /* KMOV{W,Q} %k<n>, (%rax) */
+        stb[0] = 0xc4;
+        stb[1] = 0xe1;
+        stb[2] = cpu_has_avx512bw ? 0xf8 : 0x78;
+        stb[3] = 0x91;
+        stb[4] = evex.opmsk << 3;
+        insn_bytes = 5;
+        stb[5] = 0xc3;
+
+        invoke_stub("", "", "+m" (op_mask) : "a" (&op_mask));
+
+        insn_bytes = 0;
+        put_stub(stub);
+
+        fault_suppression = true;
+    }
+
     /* Decode (but don't fetch) the destination operand: register or memory. */
     switch ( d & DstMask )
     {
@@ -5716,6 +5854,41 @@ x86_emulate(
         insn_bytes = PFX_BYTES + 2;
         break;
 
+    CASE_SIMD_PACKED_FP(_EVEX, 0x0f, 0x2b): /* vmovntp{s,d} [xyz]mm,mem */
+        generate_exception_if(ea.type != OP_MEM || evex.opmsk, EXC_UD);
+        sfence = true;
+        fault_suppression = false;
+        /* fall through */
+    CASE_SIMD_PACKED_FP(_EVEX, 0x0f, 0x10): /* vmovup{s,d} [xyz]mm/mem,[xyz]mm{k} */
+    CASE_SIMD_SCALAR_FP(_EVEX, 0x0f, 0x10): /* vmovs{s,d} mem,xmm{k} */
+                                            /* vmovs{s,d} xmm,xmm,xmm{k} */
+    CASE_SIMD_PACKED_FP(_EVEX, 0x0f, 0x11): /* vmovup{s,d} [xyz]mm,[xyz]mm/mem{k} */
+    CASE_SIMD_SCALAR_FP(_EVEX, 0x0f, 0x11): /* vmovs{s,d} xmm,mem{k} */
+                                            /* vmovs{s,d} xmm,xmm,xmm{k} */
+    CASE_SIMD_PACKED_FP(_EVEX, 0x0f, 0x28): /* vmovap{s,d} [xyz]mm/mem,[xyz]mm{k} */
+    CASE_SIMD_PACKED_FP(_EVEX, 0x0f, 0x29): /* vmovap{s,d} [xyz]mm,[xyz]mm/mem{k} */
+        /* vmovs{s,d} to/from memory have only two operands. */
+        if ( (b & ~1) == 0x10 && ea.type == OP_MEM )
+            d |= TwoOp;
+        generate_exception_if(evex.br, EXC_UD);
+        generate_exception_if(evex.w != (evex.pfx & VEX_PREFIX_DOUBLE_MASK),
+                              EXC_UD);
+        host_and_vcpu_must_have(avx512f);
+        avx512_vlen_check(evex.pfx & VEX_PREFIX_SCALAR_MASK);
+    simd_zmm:
+        get_fpu(X86EMUL_FPU_zmm);
+        opc = init_evex(stub);
+        opc[0] = b;
+        opc[1] = modrm;
+        if ( ea.type == OP_MEM )
+        {
+            /* convert memory operand to (%rAX) */
+            evex.b = 1;
+            opc[1] &= 0x38;
+        }
+        insn_bytes = EVEX_PFX_BYTES + 2;
+        break;
+
     case X86EMUL_OPC_66(0x0f, 0x12):       /* movlpd m64,xmm */
     case X86EMUL_OPC_VEX_66(0x0f, 0x12):   /* vmovlpd m64,xmm,xmm */
     CASE_SIMD_PACKED_FP(, 0x0f, 0x13):     /* movlp{s,d} xmm,m64 */
@@ -6355,6 +6528,41 @@ x86_emulate(
         ASSERT(!state->simd_size);
         break;
 
+    case X86EMUL_OPC_EVEX_66(0x0f, 0x6e): /* vmov{d,q} r/m,xmm */
+    case X86EMUL_OPC_EVEX_66(0x0f, 0x7e): /* vmov{d,q} xmm,r/m */
+        generate_exception_if((evex.lr || evex.opmsk || evex.br ||
+                               evex.reg != 0xf || !evex.RX),
+                              EXC_UD);
+        host_and_vcpu_must_have(avx512f);
+        get_fpu(X86EMUL_FPU_zmm);
+
+        opc = init_evex(stub);
+        opc[0] = b;
+        /* Convert memory/GPR operand to (%rAX). */
+        evex.b = 1;
+        if ( !mode_64bit() )
+            evex.w = 0;
+        opc[1] = modrm & 0x38;
+        insn_bytes = EVEX_PFX_BYTES + 2;
+        opc[2] = 0xc3;
+
+        copy_EVEX(opc, evex);
+        invoke_stub("", "", "+m" (src.val) : "a" (&src.val));
+        dst.val = src.val;
+
+        put_stub(stub);
+        ASSERT(!state->simd_size);
+        break;
+
+    case X86EMUL_OPC_EVEX_F3(0x0f, 0x7e): /* vmovq xmm/m64,xmm */
+    case X86EMUL_OPC_EVEX_66(0x0f, 0xd6): /* vmovq xmm,xmm/m64 */
+        generate_exception_if(evex.lr || !evex.w || evex.opmsk || evex.br,
+                              EXC_UD);
+        host_and_vcpu_must_have(avx512f);
+        d |= TwoOp;
+        op_bytes = 8;
+        goto simd_zmm;
+
     case X86EMUL_OPC_66(0x0f, 0xe7):     /* movntdq xmm,m128 */
     case X86EMUL_OPC_VEX_66(0x0f, 0xe7): /* vmovntdq {x,y}mm,mem */
         generate_exception_if(ea.type != OP_MEM, EXC_UD);
@@ -6375,6 +6583,30 @@ x86_emulate(
             goto simd_0f_avx;
         goto simd_0f_sse2;
 
+    case X86EMUL_OPC_EVEX_66(0x0f, 0xe7): /* vmovntdq [xyz]mm,mem */
+        generate_exception_if(ea.type != OP_MEM || evex.opmsk || evex.w,
+                              EXC_UD);
+        sfence = true;
+        fault_suppression = false;
+        /* fall through */
+    case X86EMUL_OPC_EVEX_66(0x0f, 0x6f): /* vmovdqa{32,64} [xyz]mm/mem,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_F3(0x0f, 0x6f): /* vmovdqu{32,64} [xyz]mm/mem,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f, 0x7f): /* vmovdqa{32,64} [xyz]mm,[xyz]mm/mem{k} */
+    case X86EMUL_OPC_EVEX_F3(0x0f, 0x7f): /* vmovdqu{32,64} [xyz]mm,[xyz]mm/mem{k} */
+    vmovdqa:
+        generate_exception_if(evex.br, EXC_UD);
+        host_and_vcpu_must_have(avx512f);
+        avx512_vlen_check(false);
+        d |= TwoOp;
+        op_bytes = 16 << evex.lr;
+        goto simd_zmm;
+
+    case X86EMUL_OPC_EVEX_F2(0x0f, 0x6f): /* vmovdqu{8,16} [xyz]mm/mem,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_F2(0x0f, 0x7f): /* vmovdqu{8,16} [xyz]mm,[xyz]mm/mem{k} */
+        host_and_vcpu_must_have(avx512bw);
+        elem_bytes = 1 << evex.w;
+        goto vmovdqa;
+
     case X86EMUL_OPC_VEX_66(0x0f, 0xd6): /* vmovq xmm,xmm/m64 */
         generate_exception_if(vex.l, EXC_UD);
         d |= TwoOp;
@@ -7739,6 +7971,15 @@ x86_emulate(
         }
         goto movdqa;
 
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0x2a): /* vmovntdqa mem,[xyz]mm */
+        generate_exception_if(ea.type != OP_MEM || evex.opmsk || evex.w,
+                              EXC_UD);
+        /* Ignore the non-temporal hint for now, using vmovdqa32 instead. */
+        asm volatile ( "mfence" ::: "memory" );
+        b = 0x6f;
+        evex.opcx = vex_0f;
+        goto vmovdqa;
+
     case X86EMUL_OPC_VEX_66(0x0f38, 0x2c): /* vmaskmovps mem,{x,y}mm,{x,y}mm */
     case X86EMUL_OPC_VEX_66(0x0f38, 0x2d): /* vmaskmovpd mem,{x,y}mm,{x,y}mm */
     case X86EMUL_OPC_VEX_66(0x0f38, 0x2e): /* vmaskmovps {x,y}mm,{x,y}mm,mem */
@@ -8792,17 +9033,27 @@ x86_emulate(
     else if ( state->simd_size )
     {
         generate_exception_if(!op_bytes, EXC_UD);
-        generate_exception_if(vex.opcx && (d & TwoOp) && vex.reg != 0xf,
+        generate_exception_if((vex.opcx && (d & TwoOp) &&
+                               (vex.reg != 0xf || (evex_encoded() && !evex.RX))),
                               EXC_UD);
 
         if ( !opc )
             BUG();
-        opc[insn_bytes - PFX_BYTES] = 0xc3;
-        copy_REX_VEX(opc, rex_prefix, vex);
+        if ( evex_encoded() )
+        {
+            opc[insn_bytes - EVEX_PFX_BYTES] = 0xc3;
+            copy_EVEX(opc, evex);
+        }
+        else
+        {
+            opc[insn_bytes - PFX_BYTES] = 0xc3;
+            copy_REX_VEX(opc, rex_prefix, vex);
+        }
 
         if ( ea.type == OP_MEM )
         {
             uint32_t mxcsr = 0;
+            uint64_t full = 0;
 
             if ( op_bytes < 16 ||
                  (vex.opcx
@@ -8824,6 +9075,45 @@ x86_emulate(
                                   !is_aligned(ea.mem.seg, ea.mem.off, op_bytes,
                                               ctxt, ops),
                                   EXC_GP, 0);
+
+            IMPOSSIBLE(elem_bytes <= 0);
+            if ( evex.br )
+            {
+                ASSERT((d & DstMask) != DstMem);
+                op_bytes = elem_bytes;
+            }
+            if ( evex.opmsk )
+            {
+                ASSERT(!(op_bytes % elem_bytes));
+                full = ~0ULL >> (64 - op_bytes / elem_bytes);
+                op_mask &= full;
+            }
+            if ( fault_suppression )
+            {
+                if ( !op_mask )
+                    goto simd_no_mem;
+                if ( !evex.br )
+                {
+                    first_byte = __builtin_ctzll(op_mask);
+                    op_mask >>= first_byte;
+                    full >>= first_byte;
+                    first_byte *= elem_bytes;
+                    op_bytes = (64 - __builtin_clzll(op_mask)) * elem_bytes;
+                }
+            }
+            /*
+             * Independent of fault suppression we may need to read (parts of)
+             * the memory operand for the purpose of merging without splitting
+             * the write below into multiple ones. Note that the EVEX.Z check
+             * here isn't strictly needed, due to there not currently being
+             * any instructions allowing zeroing-merging on memory writes (and
+             * we raise #UD during DstMem processing far above in this case),
+             * yet conceptually the read is then unnecessary.
+             */
+            if ( evex.opmsk && !evex.z && (d & DstMask) == DstMem &&
+                 op_mask != full )
+                d = (d & ~SrcMask) | SrcMem;
+
             switch ( d & SrcMask )
             {
             case SrcMem:
@@ -8865,7 +9155,10 @@ x86_emulate(
             }
         }
         else
+        {
+        simd_no_mem:
             dst.type = OP_NONE;
+        }
 
         /* {,v}maskmov{q,dqu}, as an exception, uses rDI. */
         if ( likely((ctxt->opcode & ~(X86EMUL_OPC_PFX_MASK |
--- a/xen/arch/x86/x86_emulate/x86_emulate.h
+++ b/xen/arch/x86/x86_emulate/x86_emulate.h
@@ -171,6 +171,7 @@ enum x86_emulate_fpu_type {
     X86EMUL_FPU_xmm, /* SSE instruction set (%xmm0-%xmm7/15) */
     X86EMUL_FPU_ymm, /* AVX/XOP instruction set (%ymm0-%ymm7/15) */
     X86EMUL_FPU_opmask, /* AVX512 opmask instruction set (%k0-%k7) */
+    X86EMUL_FPU_zmm, /* AVX512 instruction set (%zmm0-%zmm7/31) */
     /* This sentinel will never be passed to ->get_fpu(). */
     X86EMUL_FPU_none
 };
--- a/xen/include/asm-x86/cpufeature.h
+++ b/xen/include/asm-x86/cpufeature.h
@@ -105,6 +105,7 @@
 #define cpu_has_smap            boot_cpu_has(X86_FEATURE_SMAP)
 #define cpu_has_sha             boot_cpu_has(X86_FEATURE_SHA)
 #define cpu_has_avx512bw        boot_cpu_has(X86_FEATURE_AVX512BW)
+#define cpu_has_avx512vl        boot_cpu_has(X86_FEATURE_AVX512VL)
 
 /* CPUID level 0x80000007.edx */
 #define cpu_has_itsc            boot_cpu_has(X86_FEATURE_ITSC)





2018-12-19 14:34   ` Jan Beulich
2018-12-19 14:35   ` [PATCH v7 01/49] x86emul: rename evex.br to evex.brs Jan Beulich
2018-12-19 15:14     ` Andrew Cooper
2018-12-19 14:36   ` [PATCH v7 02/49] x86emul: support AVX512{F, BW} shift/rotate insns Jan Beulich
2018-12-19 16:00     ` Andrew Cooper
2018-12-19 14:36   ` [PATCH v7 03/49] x86emul: support AVX512{F, BW, DQ} extract insns Jan Beulich
2018-12-19 18:20     ` Andrew Cooper
2018-12-20  7:49       ` Jan Beulich
2018-12-19 14:37   ` [PATCH v7 04/49] x86emul: support AVX512{F, BW, DQ} insert insns Jan Beulich
2019-03-14 15:35     ` Andrew Cooper
2018-12-19 14:37   ` [PATCH v7 05/49] x86emul: basic AVX512F testing Jan Beulich
2019-03-14 15:42     ` Andrew Cooper
2018-12-19 14:38   ` [PATCH v7 06/49] x86emul: support AVX512{F, BW, DQ} integer broadcast insns Jan Beulich
2019-03-14 16:38     ` Andrew Cooper
2019-03-14 17:15       ` Jan Beulich
2019-03-15 16:39         ` Andrew Cooper
2019-03-18  9:45           ` Jan Beulich
2018-12-19 14:38   ` [PATCH v7 07/49] x86emul: basic AVX512VL testing Jan Beulich
2019-03-14 16:39     ` Andrew Cooper
2018-12-19 14:41   ` [PATCH v7 08/49] x86emul: support AVX512{F, BW} zero- and sign-extending moves Jan Beulich
2018-12-19 14:41   ` [PATCH v7 09/49] x86emul: support AVX512{F, BW} down conversion moves Jan Beulich
2018-12-19 14:42   ` [PATCH v7 10/49] x86emul: support AVX512{F, BW} integer unpack insns Jan Beulich
2018-12-19 14:42   ` [PATCH v7 11/49] x86emul: support AVX512{F, BW, _VBMI} full permute insns Jan Beulich
2018-12-19 14:43   ` [PATCH v7 12/49] x86emul: support AVX512{F, BW} integer shuffle insns Jan Beulich
2018-12-19 14:43   ` [PATCH v7 13/49] x86emul: support AVX512{BW, DQ} mask move insns Jan Beulich
2018-12-19 14:44   ` [PATCH v7 14/49] x86emul: basic AVX512BW testing Jan Beulich
2018-12-19 14:46   ` [PATCH v7 15/49] x86emul: basic AVX512DQ testing Jan Beulich
2018-12-19 14:46   ` [PATCH v7 16/49] x86emul: support AVX512F move high/low insns Jan Beulich
2018-12-19 14:47   ` [PATCH v7 17/49] x86emul: support AVX512F move duplicate insns Jan Beulich
2018-12-19 14:47   ` [PATCH v7 18/49] x86emul: support AVX512{F, BW, _VBMI} permute insns Jan Beulich
2018-12-19 14:48   ` [PATCH v7 19/49] x86emul: support AVX512BW pack insns Jan Beulich
2018-12-19 14:48   ` [PATCH v7 20/49] x86emul: support AVX512F floating-point conversion insns Jan Beulich
2018-12-19 14:48   ` [PATCH v7 21/49] x86emul: support AVX512F legacy-equivalent packed int/FP " Jan Beulich
2018-12-19 14:51   ` [PATCH v7 22/49] x86emul: support AVX512F legacy-equivalent scalar " Jan Beulich
2018-12-19 14:51   ` [PATCH v7 23/49] x86emul: support AVX512DQ packed quad-int/FP " Jan Beulich
2018-12-19 14:52   ` [PATCH v7 24/49] x86emul: support AVX512{F, DQ} uint-to-FP " Jan Beulich
2018-12-19 14:52   ` [PATCH v7 25/49] x86emul: support AVX512{F, DQ} FP-to-uint " Jan Beulich
2018-12-19 14:53   ` [PATCH v7 26/49] x86emul: support remaining AVX512F legacy-equivalent insns Jan Beulich
2018-12-19 14:53   ` [PATCH v7 27/49] x86emul: support remaining AVX512BW " Jan Beulich
2018-12-19 14:54   ` [PATCH v7 28/49] x86emul: support AVX512{F, ER} reciprocal insns Jan Beulich
2018-12-19 14:55   ` [PATCH v7 29/49] x86emul: support AVX512F floating point manipulation insns Jan Beulich
2018-12-19 14:55   ` [PATCH v7 30/49] x86emul: support AVX512DQ " Jan Beulich
2018-12-19 14:56   ` [PATCH v7 31/49] x86emul: support AVX512{F, _VBMI2} compress/expand insns Jan Beulich
2018-12-19 14:56   ` [PATCH v7 32/49] x86emul: support remaining misc AVX512{F, BW} insns Jan Beulich
2018-12-19 14:57   ` [PATCH v7 33/49] x86emul: support AVX512F gather insns Jan Beulich
2018-12-19 14:57   ` [PATCH v7 34/49] x86emul: add high register S/G test cases Jan Beulich
2018-12-19 14:58   ` [PATCH v7 35/49] x86emul: support AVX512F scatter insns Jan Beulich
2018-12-19 14:59   ` [PATCH v7 36/49] x86emul: support AVX512PF insns Jan Beulich
2018-12-19 14:59   ` [PATCH v7 37/49] x86emul: support AVX512CD insns Jan Beulich
2018-12-19 15:00   ` [PATCH v7 38/49] x86emul: complete support of AVX512_VBMI insns Jan Beulich
2018-12-19 15:00   ` [PATCH v7 39/49] x86emul: support of AVX512* population count insns Jan Beulich
2018-12-19 15:01   ` [PATCH v7 40/49] x86emul: support of AVX512_IFMA insns Jan Beulich
2018-12-19 15:01   ` [PATCH v7 41/49] x86emul: support remaining AVX512_VBMI2 insns Jan Beulich
2018-12-19 15:02   ` [PATCH v7 42/49] x86emul: support AVX512_4FMAPS insns Jan Beulich
2018-12-19 15:05   ` [PATCH v7 43/49] x86emul: support AVX512_4VNNIW insns Jan Beulich
2018-12-19 15:06   ` [PATCH v7 44/49] x86emul: support AVX512_VNNI insns Jan Beulich
2018-12-19 15:06   ` [PATCH v7 45/49] x86emul: support VPCLMULQDQ insns Jan Beulich
2018-12-19 15:07   ` [PATCH v7 46/49] x86emul: support VAES insns Jan Beulich
2018-12-19 15:07   ` [PATCH v7 47/49] x86emul: support GFNI insns Jan Beulich
2018-12-19 15:07   ` [PATCH v7 48/49] x86emul: restore ordering within main switch statement Jan Beulich
2018-12-19 15:08   ` [PATCH v7 49/49] tools: re-sync CPUID leaf 7 tables Jan Beulich
2019-03-14 11:07     ` Andrew Cooper
2019-03-15 10:30 ` [PATCH v8 00/50] x86emul: remaining AVX512 support Jan Beulich
2019-03-15 10:36   ` [PATCH v8 01/50] x86emul: no need to set fault_suppression to false for VMOVNT* Jan Beulich
2019-03-15 16:52     ` Andrew Cooper
2019-03-15 10:36   ` [PATCH v8 02/50] x86emul: support AVX512{F, BW, DQ} extract insns Jan Beulich
2019-03-15 17:51     ` Andrew Cooper
2019-03-15 10:37   ` [PATCH v8 03/50] x86emul: support AVX512{F, BW, DQ} insert insns Jan Beulich
2019-03-15 10:38   ` [PATCH v8 04/50] x86emul: basic AVX512F testing Jan Beulich
2019-03-15 10:39   ` [PATCH v8 05/50] x86emul: support AVX512{F, BW, DQ} integer broadcast insns Jan Beulich
2019-03-15 10:39   ` [PATCH v8 06/50] x86emul: basic AVX512VL testing Jan Beulich
2019-03-15 10:40   ` [PATCH v8 07/50] x86emul: support AVX512{F, BW} zero- and sign-extending moves Jan Beulich
2019-03-15 18:02     ` Andrew Cooper
2019-03-15 10:40   ` [PATCH v8 08/50] x86emul: support AVX512{F, BW} down conversion moves Jan Beulich
2019-03-15 18:10     ` Andrew Cooper
2019-03-15 10:41   ` [PATCH v8 09/50] x86emul: support AVX512{F, BW} integer unpack insns Jan Beulich
2019-03-15 18:21     ` Andrew Cooper
2019-03-18  9:55       ` Jan Beulich
2019-05-20 12:11         ` Andrew Cooper
2019-05-20 12:11           ` [Xen-devel] " Andrew Cooper
2019-03-15 10:41   ` [PATCH v8 10/50] x86emul: support AVX512{F, BW, _VBMI} full permute insns Jan Beulich
2019-05-17 16:50     ` Andrew Cooper
2019-05-17 16:50       ` [Xen-devel] " Andrew Cooper
2019-05-20  6:55       ` Jan Beulich
2019-05-20  6:55         ` [Xen-devel] " Jan Beulich
2019-05-20 12:10         ` Andrew Cooper
2019-05-20 12:10           ` [Xen-devel] " Andrew Cooper
2019-03-15 10:43   ` [PATCH v8 11/50] x86emul: support AVX512{F, BW} integer shuffle insns Jan Beulich
2019-05-17 17:01     ` Andrew Cooper
2019-05-17 17:01       ` [Xen-devel] " Andrew Cooper
2019-03-15 10:43   ` [PATCH v8 12/50] x86emul: support AVX512{BW, DQ} mask move insns Jan Beulich
2019-05-17 17:02     ` Andrew Cooper
2019-05-17 17:02       ` [Xen-devel] " Andrew Cooper
2019-03-15 10:43   ` [PATCH v8 13/50] x86emul: basic AVX512BW testing Jan Beulich
2019-05-17 17:03     ` Andrew Cooper
2019-05-17 17:03       ` [Xen-devel] " Andrew Cooper
2019-03-15 10:44   ` [PATCH v8 14/50] x86emul: basic AVX512DQ testing Jan Beulich
2019-05-17 17:03     ` Andrew Cooper
2019-05-17 17:03       ` [Xen-devel] " Andrew Cooper
2019-03-15 10:44   ` [PATCH v8 15/50] x86emul: support AVX512F move high/low insns Jan Beulich
2019-05-21 10:59     ` Andrew Cooper
2019-05-21 10:59       ` [Xen-devel] " Andrew Cooper
2019-03-15 10:45   ` [PATCH v8 16/50] x86emul: support AVX512F move duplicate insns Jan Beulich
2019-05-21 11:09     ` Andrew Cooper
2019-05-21 11:09       ` [Xen-devel] " Andrew Cooper
2019-03-15 10:46   ` [PATCH v8 17/50] x86emul: support AVX512{F, BW, _VBMI} permute insns Jan Beulich
2019-05-21 11:24     ` Andrew Cooper
2019-05-21 11:24       ` [Xen-devel] " Andrew Cooper
2019-03-15 10:46   ` [PATCH v8 18/50] x86emul: support AVX512BW pack insns Jan Beulich
2019-05-21 11:26     ` Andrew Cooper
2019-05-21 11:26       ` [Xen-devel] " Andrew Cooper
2019-03-15 10:47   ` [PATCH v8 19/50] x86emul: support AVX512F floating-point conversion insns Jan Beulich
2019-05-21 11:33     ` Andrew Cooper
2019-05-21 11:33       ` [Xen-devel] " Andrew Cooper
2019-05-21 15:46       ` Jan Beulich
2019-05-21 15:46         ` [Xen-devel] " Jan Beulich
2019-05-23 16:08         ` Andrew Cooper
2019-05-23 16:08           ` [Xen-devel] " Andrew Cooper
2019-03-15 10:47   ` [PATCH v8 20/50] x86emul: support AVX512F legacy-equivalent packed int/FP " Jan Beulich
2019-05-21 11:37     ` Andrew Cooper
2019-05-21 11:37       ` [Xen-devel] " Andrew Cooper
2019-03-15 10:52   ` [PATCH v8 21/50] x86emul: support AVX512F legacy-equivalent scalar " Jan Beulich
2019-05-21 11:44     ` Andrew Cooper
2019-05-21 11:44       ` [Xen-devel] " Andrew Cooper
2019-03-15 10:52   ` [PATCH v8 22/50] x86emul: support AVX512DQ packed quad-int/FP " Jan Beulich
2019-05-21 11:53     ` Andrew Cooper
2019-05-21 11:53       ` [Xen-devel] " Andrew Cooper
2019-03-15 10:53   ` [PATCH v8 23/50] x86emul: support AVX512{F, DQ} uint-to-FP " Jan Beulich
2019-05-21 11:58     ` Andrew Cooper
2019-05-21 11:58       ` [Xen-devel] " Andrew Cooper
2019-03-15 10:54   ` [PATCH v8 24/50] x86emul: support AVX512{F, DQ} FP-to-uint " Jan Beulich
2019-05-21 12:09     ` Andrew Cooper
2019-05-21 12:09       ` [Xen-devel] " Andrew Cooper
2019-03-15 10:54   ` [PATCH v8 25/50] x86emul: support remaining AVX512F legacy-equivalent insns Jan Beulich
2019-05-21 13:06     ` Andrew Cooper
2019-05-21 13:06       ` [Xen-devel] " Andrew Cooper
2019-03-15 10:54   ` [PATCH v8 26/50] x86emul: support remaining AVX512BW " Jan Beulich
2019-05-21 13:08     ` Andrew Cooper
2019-05-21 13:08       ` [Xen-devel] " Andrew Cooper
2019-05-21 13:34       ` Jan Beulich
2019-05-21 13:34         ` [Xen-devel] " Jan Beulich
2019-05-23 16:10     ` Andrew Cooper
2019-05-23 16:10       ` [Xen-devel] " Andrew Cooper
2019-03-15 10:55   ` [PATCH v8 27/50] x86emul: support AVX512{F, ER} reciprocal insns Jan Beulich
2019-05-23 16:15     ` Andrew Cooper
2019-05-23 16:15       ` [Xen-devel] " Andrew Cooper
2019-05-24  6:43       ` Jan Beulich
2019-05-24  6:43         ` [Xen-devel] " Jan Beulich
2019-05-24 20:48         ` Andrew Cooper
2019-05-24 20:48           ` [Xen-devel] " Andrew Cooper
2019-05-27  8:02           ` Jan Beulich
2019-05-27  8:02             ` [Xen-devel] " Jan Beulich
2019-05-29 10:00             ` Andrew Cooper
2019-05-29 10:00               ` [Xen-devel] " Andrew Cooper
2019-03-15 10:56   ` [PATCH v8 28/50] x86emul: support AVX512F floating point manipulation insns Jan Beulich
2019-05-29 12:51     ` Andrew Cooper
2019-05-29 12:51       ` [Xen-devel] " Andrew Cooper
2019-05-29 13:15       ` Jan Beulich
2019-05-29 13:15         ` [Xen-devel] " Jan Beulich
2019-06-10 14:01         ` Andrew Cooper
2019-06-10 14:03     ` Andrew Cooper
2019-03-15 10:56   ` [PATCH v8 29/50] x86emul: support AVX512DQ " Jan Beulich
2019-06-10 14:06     ` [Xen-devel] " Andrew Cooper
2019-03-15 10:56   ` [PATCH v8 30/50] x86emul: support AVX512{F, _VBMI2} compress/expand insns Jan Beulich
2019-06-10 14:51     ` [Xen-devel] " Andrew Cooper
2019-06-11 10:20       ` Jan Beulich
2019-06-18 16:24         ` Andrew Cooper
2019-06-19  6:38           ` Jan Beulich
2019-03-15 10:58   ` [PATCH v8 31/50] x86emul: support remaining misc AVX512{F, BW} insns Jan Beulich
2019-06-18 16:42     ` [Xen-devel] " Andrew Cooper
2019-06-19  6:44       ` Jan Beulich
2019-03-15 10:58   ` [PATCH v8 32/50] x86emul: support AVX512F gather insns Jan Beulich
2019-06-19 12:05     ` [Xen-devel] " Andrew Cooper
2019-06-19 12:43       ` Jan Beulich
2019-03-15 10:59   ` [PATCH v8 33/50] x86emul: add high register S/G test cases Jan Beulich
2019-06-19 12:07     ` [Xen-devel] " Andrew Cooper
2019-03-15 10:59   ` [PATCH v8 34/50] x86emul: support AVX512F scatter insns Jan Beulich
2019-03-15 11:00   ` [PATCH v8 35/50] x86emul: support AVX512PF insns Jan Beulich
2019-03-15 11:00   ` [PATCH v8 36/50] x86emul: support AVX512CD insns Jan Beulich
2019-06-19 12:13     ` [Xen-devel] " Andrew Cooper
2019-03-15 11:01   ` [PATCH v8 37/50] x86emul: complete support of AVX512_VBMI insns Jan Beulich
2019-06-19 12:16     ` [Xen-devel] " Andrew Cooper
2019-03-15 11:01   ` [PATCH v8 38/50] x86emul: support of AVX512* population count insns Jan Beulich
2019-06-19 12:22     ` [Xen-devel] " Andrew Cooper
2019-06-19 12:48       ` Jan Beulich
2019-03-15 11:02   ` [PATCH v8 39/50] x86emul: support of AVX512_IFMA insns Jan Beulich
2019-06-19 12:23     ` [Xen-devel] " Andrew Cooper
2019-03-15 11:02   ` [PATCH v8 40/50] x86emul: support remaining AVX512_VBMI2 insns Jan Beulich
2019-06-19 12:25     ` [Xen-devel] " Andrew Cooper
2019-03-15 11:04   ` [PATCH v8 41/50] x86emul: support AVX512_4FMAPS insns Jan Beulich
2019-06-19 14:58     ` [Xen-devel] " Andrew Cooper
2019-06-21  6:50       ` Jan Beulich
2019-03-15 11:04   ` [PATCH v8 42/50] x86emul: support AVX512_4VNNIW insns Jan Beulich
2019-06-19 14:58     ` [Xen-devel] " Andrew Cooper
2019-03-15 11:04   ` [PATCH v8 43/50] x86emul: support AVX512_VNNI insns Jan Beulich
2019-06-19 15:01     ` [Xen-devel] " Andrew Cooper
2019-06-21  6:55       ` Jan Beulich
2019-03-15 11:05   ` [PATCH v8 44/50] x86emul: support VPCLMULQDQ insns Jan Beulich
2019-06-21 12:52     ` [Xen-devel] " Andrew Cooper
2019-06-21 13:44       ` Jan Beulich
2019-03-15 11:06   ` [PATCH v8 45/50] x86emul: support VAES insns Jan Beulich
2019-06-21 12:57     ` [Xen-devel] " Andrew Cooper
2019-03-15 11:06   ` [PATCH v8 46/50] x86emul: support GFNI insns Jan Beulich
2019-06-21 13:19     ` [Xen-devel] " Andrew Cooper
2019-06-21 13:33       ` Andrew Cooper
2019-06-21 14:00       ` Jan Beulich
2019-06-21 14:20         ` Andrew Cooper
2019-06-21 15:02           ` Jan Beulich
2019-06-25  6:48           ` Jan Beulich
2019-03-15 11:07   ` [PATCH v8 47/50] x86emul: restore ordering within main switch statement Jan Beulich
2019-06-21 13:20     ` [Xen-devel] " Andrew Cooper
2019-03-15 11:07   ` [PATCH v8 48/50] x86emul: add an AES/VAES test case to the harness Jan Beulich
2019-06-21 13:36     ` [Xen-devel] " Andrew Cooper
2019-06-21 14:04       ` Jan Beulich
2019-06-21 14:20         ` Andrew Cooper
2019-03-15 11:08   ` [PATCH v8 49/50] x86emul: add a SHA " Jan Beulich
2019-06-21 13:51     ` [Xen-devel] " Andrew Cooper
2019-06-21 14:10       ` Jan Beulich
2019-06-21 14:23         ` Andrew Cooper
2019-03-15 11:08   ` [PATCH v8 50/50] x86emul: add a PCLMUL/VPCLMUL " Jan Beulich
2019-06-21 13:58     ` [Xen-devel] " Andrew Cooper

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox (see the example sketch after
  this list)

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=5BF28CDE02000078001FD402@prv1-mh.provo.novell.com \
    --to=jbeulich@suse.com \
    --cc=George.Dunlap@eu.citrix.com \
    --cc=andrew.cooper3@citrix.com \
    --cc=roger.pau@citrix.com \
    --cc=wei.liu2@citrix.com \
    --cc=xen-devel@lists.xenproject.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line
before the message body.
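
One way to follow the mbox route above, as a sketch only: "thread.mbox"
is a placeholder file name, and any mail client able to open an mbox
file works just as well.

  # open the saved thread read-only in mutt; 'g' (group-reply) replies to all
  mutt -R -f thread.mbox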

This is a public inbox; see the mirroring instructions for how to clone
and mirror all data and code used for this inbox, as well as URLs for
NNTP newsgroup(s).
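
A minimal cloning sketch following the usual public-inbox workflow; the
URLs below are placeholders and the epoch layout (/0, /1, ...) is an
assumption, so consult the archive's own mirroring page for the real
addresses:

  # mirror the first git epoch of the archive (URL is hypothetical)
  git clone --mirror https://example.org/xen-devel/0 xen-devel/git/0.git

  # initialise and index a local public-inbox from the mirrored epoch(s)
  public-inbox-init -V2 xen-devel ./xen-devel \
    https://example.org/xen-devel xen-devel@lists.xenproject.org
  public-inbox-index ./xen-devel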