Xen-Devel Archive on lore.kernel.org
 help / color / Atom feed
* [PATCH 0/6] x86emul: fixes, improvements, and beginnings of AVX512 support
@ 2018-08-09  8:15 Jan Beulich
  2018-08-09  8:23 ` [PATCH 1/6] x86emul: fix FMA scalar operand sizes Jan Beulich
                   ` (12 more replies)
  0 siblings, 13 replies; 465+ messages in thread
From: Jan Beulich @ 2018-08-09  8:15 UTC (permalink / raw)
  To: xen-devel; +Cc: Andrew Cooper

1: fix FMA scalar operand sizes
2: extend MASKMOV{Q,DQU} tests
3: support AVX512 opmask insns
4: clean up AVX2 insn use in test harness
5: correct EVEX decoding
6: generalize vector length handling for AVX512/EVEX

While I also have a patch ready that emulates the basic AVX512 moves,
its prerequisite - widening the respective fields in HVM code to allow
for 64-byte operand sizes - depends on the "x86/HVM: implement memory
read caching" series, which looks to be stalled, and I prefer to avoid
posting further patches with dependencies on stalled series.

Some support beyond moves is also mostly ready, but it's not quite
enough yet to enable testing that code in the harness, and I'd rather
not post code that I haven't pushed through the harness yet. The
partial testing I've been doing means, however, that in particular the
last two patches here have been tested already, even if no such
testing would be possible for others until further patches actually get
posted.

Jan



_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

^ permalink raw reply	[flat|nested] 465+ messages in thread

* [PATCH 1/6] x86emul: fix FMA scalar operand sizes
  2018-08-09  8:15 [PATCH 0/6] x86emul: fixes, improvements, and beginnings of AVX512 support Jan Beulich
@ 2018-08-09  8:23 ` Jan Beulich
  2018-08-09  8:24 ` [PATCH 2/6] x86emul: extend MASKMOV{Q,DQU} tests Jan Beulich
                   ` (11 subsequent siblings)
  12 siblings, 0 replies; 465+ messages in thread
From: Jan Beulich @ 2018-08-09  8:23 UTC (permalink / raw)
  To: xen-devel; +Cc: Andrew Cooper

FMA insns, unlike the earlier AVX additions, don't use the low
opcode bit to distinguish between single and double vector elements.
While the difference is benign for packed flavors, the scalar ones
need to use VEX.W here. Oddly enough the table entries didn't even use
simd_scalar_fp, but uniformly used simd_packed_fp (implying the
distinction was by [VEX-encoded] opcode prefix).

Split simd_scalar_fp into simd_scalar_opc and simd_scalar_vexw, and
correct the FMA scalar insn table entries accordingly.

Also correct the scalar insn comments (they only ever use XMM registers
as operands).

Signed-off-by: Jan Beulich <jbeulich@suse.com>

--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -224,7 +224,13 @@ enum simd_opsize {
      * - 32 bits with low opcode bit clear (scalar single)
      * - 64 bits with low opcode bit set (scalar double)
      */
-    simd_scalar_fp,
+    simd_scalar_opc,
+
+    /*
+     * Scalar floating point:
+     * - 32/64 bits depending on VEX.W
+     */
+    simd_scalar_vexw,
 
     /*
      * 128 bits of integer or floating point data, with no further
@@ -407,7 +413,7 @@ static const struct ext0f38_table {
     [0x13] = { .simd_size = simd_other, .two_op = 1 },
     [0x14 ... 0x16] = { .simd_size = simd_packed_fp },
     [0x17] = { .simd_size = simd_packed_int, .two_op = 1 },
-    [0x18 ... 0x19] = { .simd_size = simd_scalar_fp, .two_op = 1 },
+    [0x18 ... 0x19] = { .simd_size = simd_scalar_opc, .two_op = 1 },
     [0x1a] = { .simd_size = simd_128, .two_op = 1 },
     [0x1c ... 0x1e] = { .simd_size = simd_packed_int, .two_op = 1 },
     [0x20 ... 0x25] = { .simd_size = simd_other, .two_op = 1 },
@@ -427,9 +433,30 @@ static const struct ext0f38_table {
     [0x8c] = { .simd_size = simd_other },
     [0x8e] = { .simd_size = simd_other, .to_mem = 1 },
     [0x90 ... 0x93] = { .simd_size = simd_other, .vsib = 1 },
-    [0x96 ... 0x9f] = { .simd_size = simd_packed_fp },
-    [0xa6 ... 0xaf] = { .simd_size = simd_packed_fp },
-    [0xb6 ... 0xbf] = { .simd_size = simd_packed_fp },
+    [0x96 ... 0x98] = { .simd_size = simd_packed_fp },
+    [0x99] = { .simd_size = simd_scalar_vexw },
+    [0x9a] = { .simd_size = simd_packed_fp },
+    [0x9b] = { .simd_size = simd_scalar_vexw },
+    [0x9c] = { .simd_size = simd_packed_fp },
+    [0x9d] = { .simd_size = simd_scalar_vexw },
+    [0x9e] = { .simd_size = simd_packed_fp },
+    [0x9f] = { .simd_size = simd_scalar_vexw },
+    [0xa6 ... 0xa8] = { .simd_size = simd_packed_fp },
+    [0xa9] = { .simd_size = simd_scalar_vexw },
+    [0xaa] = { .simd_size = simd_packed_fp },
+    [0xab] = { .simd_size = simd_scalar_vexw },
+    [0xac] = { .simd_size = simd_packed_fp },
+    [0xad] = { .simd_size = simd_scalar_vexw },
+    [0xae] = { .simd_size = simd_packed_fp },
+    [0xaf] = { .simd_size = simd_scalar_vexw },
+    [0xb6 ... 0xb8] = { .simd_size = simd_packed_fp },
+    [0xb9] = { .simd_size = simd_scalar_vexw },
+    [0xba] = { .simd_size = simd_packed_fp },
+    [0xbb] = { .simd_size = simd_scalar_vexw },
+    [0xbc] = { .simd_size = simd_packed_fp },
+    [0xbd] = { .simd_size = simd_scalar_vexw },
+    [0xbe] = { .simd_size = simd_packed_fp },
+    [0xbf] = { .simd_size = simd_scalar_vexw },
     [0xc8 ... 0xcd] = { .simd_size = simd_other },
     [0xdb] = { .simd_size = simd_packed_int, .two_op = 1 },
     [0xdc ... 0xdf] = { .simd_size = simd_packed_int },
@@ -454,7 +481,7 @@ static const struct ext0f3a_table {
     [0x04 ... 0x05] = { .simd_size = simd_packed_fp, .two_op = 1 },
     [0x06] = { .simd_size = simd_packed_fp },
     [0x08 ... 0x09] = { .simd_size = simd_packed_fp, .two_op = 1 },
-    [0x0a ... 0x0b] = { .simd_size = simd_scalar_fp },
+    [0x0a ... 0x0b] = { .simd_size = simd_scalar_opc },
     [0x0c ... 0x0d] = { .simd_size = simd_packed_fp },
     [0x0e ... 0x0f] = { .simd_size = simd_packed_int },
     [0x14 ... 0x17] = { .simd_size = simd_none, .to_mem = 1, .two_op = 1 },
@@ -476,13 +503,13 @@ static const struct ext0f3a_table {
     [0x5c ... 0x5f] = { .simd_size = simd_packed_fp, .four_op = 1 },
     [0x60 ... 0x63] = { .simd_size = simd_packed_int, .two_op = 1 },
     [0x68 ... 0x69] = { .simd_size = simd_packed_fp, .four_op = 1 },
-    [0x6a ... 0x6b] = { .simd_size = simd_scalar_fp, .four_op = 1 },
+    [0x6a ... 0x6b] = { .simd_size = simd_scalar_opc, .four_op = 1 },
     [0x6c ... 0x6d] = { .simd_size = simd_packed_fp, .four_op = 1 },
-    [0x6e ... 0x6f] = { .simd_size = simd_scalar_fp, .four_op = 1 },
+    [0x6e ... 0x6f] = { .simd_size = simd_scalar_opc, .four_op = 1 },
     [0x78 ... 0x79] = { .simd_size = simd_packed_fp, .four_op = 1 },
-    [0x7a ... 0x7b] = { .simd_size = simd_scalar_fp, .four_op = 1 },
+    [0x7a ... 0x7b] = { .simd_size = simd_scalar_opc, .four_op = 1 },
     [0x7c ... 0x7d] = { .simd_size = simd_packed_fp, .four_op = 1 },
-    [0x7e ... 0x7f] = { .simd_size = simd_scalar_fp, .four_op = 1 },
+    [0x7e ... 0x7f] = { .simd_size = simd_scalar_opc, .four_op = 1 },
     [0xcc] = { .simd_size = simd_other },
     [0xdf] = { .simd_size = simd_packed_int, .two_op = 1 },
     [0xf0] = {},
@@ -518,7 +545,7 @@ static const struct ext8f09_table {
 } ext8f09_table[256] = {
     [0x01 ... 0x02] = { .two_op = 1 },
     [0x80 ... 0x81] = { .simd_size = simd_packed_fp, .two_op = 1 },
-    [0x82 ... 0x83] = { .simd_size = simd_scalar_fp, .two_op = 1 },
+    [0x82 ... 0x83] = { .simd_size = simd_scalar_opc, .two_op = 1 },
     [0x90 ... 0x9b] = { .simd_size = simd_packed_int },
     [0xc1 ... 0xc3] = { .simd_size = simd_packed_int, .two_op = 1 },
     [0xc6 ... 0xc7] = { .simd_size = simd_packed_int, .two_op = 1 },
@@ -3132,10 +3159,14 @@ x86_decode(
         }
         break;
 
-    case simd_scalar_fp:
+    case simd_scalar_opc:
         op_bytes = 4 << (ctxt->opcode & 1);
         break;
 
+    case simd_scalar_vexw:
+        op_bytes = 4 << vex.w;
+        break;
+
     case simd_128:
         op_bytes = 16;
         break;
@@ -7747,33 +7778,33 @@ x86_emulate(
     case X86EMUL_OPC_VEX_66(0x0f38, 0x96): /* vfmaddsub132p{s,d} {x,y}mm/mem,{x,y}mm,{x,y}mm */
     case X86EMUL_OPC_VEX_66(0x0f38, 0x97): /* vfmsubadd132p{s,d} {x,y}mm/mem,{x,y}mm,{x,y}mm */
     case X86EMUL_OPC_VEX_66(0x0f38, 0x98): /* vfmadd132p{s,d} {x,y}mm/mem,{x,y}mm,{x,y}mm */
-    case X86EMUL_OPC_VEX_66(0x0f38, 0x99): /* vfmadd132s{s,d} {x,y}mm/mem,{x,y}mm,{x,y}mm */
+    case X86EMUL_OPC_VEX_66(0x0f38, 0x99): /* vfmadd132s{s,d} xmm/mem,xmm,xmm */
     case X86EMUL_OPC_VEX_66(0x0f38, 0x9a): /* vfmsub132p{s,d} {x,y}mm/mem,{x,y}mm,{x,y}mm */
-    case X86EMUL_OPC_VEX_66(0x0f38, 0x9b): /* vfmsub132s{s,d} {x,y}mm/mem,{x,y}mm,{x,y}mm */
+    case X86EMUL_OPC_VEX_66(0x0f38, 0x9b): /* vfmsub132s{s,d} xmm/mem,xmm,xmm */
     case X86EMUL_OPC_VEX_66(0x0f38, 0x9c): /* vfnmadd132p{s,d} {x,y}mm/mem,{x,y}mm,{x,y}mm */
-    case X86EMUL_OPC_VEX_66(0x0f38, 0x9d): /* vfnmadd132s{s,d} {x,y}mm/mem,{x,y}mm,{x,y}mm */
+    case X86EMUL_OPC_VEX_66(0x0f38, 0x9d): /* vfnmadd132s{s,d} xmm/mem,xmm,xmm */
     case X86EMUL_OPC_VEX_66(0x0f38, 0x9e): /* vfnmsub132p{s,d} {x,y}mm/mem,{x,y}mm,{x,y}mm */
-    case X86EMUL_OPC_VEX_66(0x0f38, 0x9f): /* vfnmsub132s{s,d} {x,y}mm/mem,{x,y}mm,{x,y}mm */
+    case X86EMUL_OPC_VEX_66(0x0f38, 0x9f): /* vfnmsub132s{s,d} xmm/mem,xmm,xmm */
     case X86EMUL_OPC_VEX_66(0x0f38, 0xa6): /* vfmaddsub213p{s,d} {x,y}mm/mem,{x,y}mm,{x,y}mm */
     case X86EMUL_OPC_VEX_66(0x0f38, 0xa7): /* vfmsubadd213p{s,d} {x,y}mm/mem,{x,y}mm,{x,y}mm */
     case X86EMUL_OPC_VEX_66(0x0f38, 0xa8): /* vfmadd213p{s,d} {x,y}mm/mem,{x,y}mm,{x,y}mm */
-    case X86EMUL_OPC_VEX_66(0x0f38, 0xa9): /* vfmadd213s{s,d} {x,y}mm/mem,{x,y}mm,{x,y}mm */
+    case X86EMUL_OPC_VEX_66(0x0f38, 0xa9): /* vfmadd213s{s,d} xmm/mem,xmm,xmm */
     case X86EMUL_OPC_VEX_66(0x0f38, 0xaa): /* vfmsub213p{s,d} {x,y}mm/mem,{x,y}mm,{x,y}mm */
-    case X86EMUL_OPC_VEX_66(0x0f38, 0xab): /* vfmsub213s{s,d} {x,y}mm/mem,{x,y}mm,{x,y}mm */
+    case X86EMUL_OPC_VEX_66(0x0f38, 0xab): /* vfmsub213s{s,d} xmm/mem,xmm,xmm */
     case X86EMUL_OPC_VEX_66(0x0f38, 0xac): /* vfnmadd213p{s,d} {x,y}mm/mem,{x,y}mm,{x,y}mm */
-    case X86EMUL_OPC_VEX_66(0x0f38, 0xad): /* vfnmadd213s{s,d} {x,y}mm/mem,{x,y}mm,{x,y}mm */
+    case X86EMUL_OPC_VEX_66(0x0f38, 0xad): /* vfnmadd213s{s,d} xmm/mem,xmm,xmm */
     case X86EMUL_OPC_VEX_66(0x0f38, 0xae): /* vfnmsub213p{s,d} {x,y}mm/mem,{x,y}mm,{x,y}mm */
-    case X86EMUL_OPC_VEX_66(0x0f38, 0xaf): /* vfnmsub213s{s,d} {x,y}mm/mem,{x,y}mm,{x,y}mm */
+    case X86EMUL_OPC_VEX_66(0x0f38, 0xaf): /* vfnmsub213s{s,d} xmm/mem,xmm,xmm */
     case X86EMUL_OPC_VEX_66(0x0f38, 0xb6): /* vfmaddsub231p{s,d} {x,y}mm/mem,{x,y}mm,{x,y}mm */
     case X86EMUL_OPC_VEX_66(0x0f38, 0xb7): /* vfmsubadd231p{s,d} {x,y}mm/mem,{x,y}mm,{x,y}mm */
     case X86EMUL_OPC_VEX_66(0x0f38, 0xb8): /* vfmadd231p{s,d} {x,y}mm/mem,{x,y}mm,{x,y}mm */
-    case X86EMUL_OPC_VEX_66(0x0f38, 0xb9): /* vfmadd231s{s,d} {x,y}mm/mem,{x,y}mm,{x,y}mm */
+    case X86EMUL_OPC_VEX_66(0x0f38, 0xb9): /* vfmadd231s{s,d} xmm/mem,xmm,xmm */
     case X86EMUL_OPC_VEX_66(0x0f38, 0xba): /* vfmsub231p{s,d} {x,y}mm/mem,{x,y}mm,{x,y}mm */
-    case X86EMUL_OPC_VEX_66(0x0f38, 0xbb): /* vfmsub231s{s,d} {x,y}mm/mem,{x,y}mm,{x,y}mm */
+    case X86EMUL_OPC_VEX_66(0x0f38, 0xbb): /* vfmsub231s{s,d} xmm/mem,xmm,xmm */
     case X86EMUL_OPC_VEX_66(0x0f38, 0xbc): /* vfnmadd231p{s,d} {x,y}mm/mem,{x,y}mm,{x,y}mm */
-    case X86EMUL_OPC_VEX_66(0x0f38, 0xbd): /* vfnmadd231s{s,d} {x,y}mm/mem,{x,y}mm,{x,y}mm */
+    case X86EMUL_OPC_VEX_66(0x0f38, 0xbd): /* vfnmadd231s{s,d} xmm/mem,xmm,xmm */
     case X86EMUL_OPC_VEX_66(0x0f38, 0xbe): /* vfnmsub231p{s,d} {x,y}mm/mem,{x,y}mm,{x,y}mm */
-    case X86EMUL_OPC_VEX_66(0x0f38, 0xbf): /* vfnmsub231s{s,d} {x,y}mm/mem,{x,y}mm,{x,y}mm */
+    case X86EMUL_OPC_VEX_66(0x0f38, 0xbf): /* vfnmsub231s{s,d} xmm/mem,xmm,xmm */
         host_and_vcpu_must_have(fma);
         goto simd_0f_ymm;
 




_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

^ permalink raw reply	[flat|nested] 465+ messages in thread

* [PATCH 2/6] x86emul: extend MASKMOV{Q,DQU} tests
  2018-08-09  8:15 [PATCH 0/6] x86emul: fixes, improvements, and beginnings of AVX512 support Jan Beulich
  2018-08-09  8:23 ` [PATCH 1/6] x86emul: fix FMA scalar operand sizes Jan Beulich
@ 2018-08-09  8:24 ` Jan Beulich
  2018-08-09  8:24 ` [PATCH 3/6] x86emul: support AVX512 opmask insns Jan Beulich
                   ` (10 subsequent siblings)
  12 siblings, 0 replies; 465+ messages in thread
From: Jan Beulich @ 2018-08-09  8:24 UTC (permalink / raw)
  To: xen-devel; +Cc: Andrew Cooper

While deriving the first AVX512 pieces from existing code I got the
(ultimately wrong) impression that the emulation of these insns was
broken. Besides testing that the instructions act as no-ops when the
controlling mask bits are all zero, add tests to also check that the
data merging actually works.

Signed-off-by: Jan Beulich <jbeulich@suse.com>

--- a/tools/tests/x86_emulator/test_x86_emulator.c
+++ b/tools/tests/x86_emulator/test_x86_emulator.c
@@ -2626,7 +2626,7 @@ int main(int argc, char **argv)
         printf("skipped\n");
 #endif
 
-    printf("%-40s", "Testing maskmovq (zero mask)...");
+    printf("%-40s", "Testing maskmovq %mm4,%mm4...");
     if ( stack_exec && cpu_has_sse )
     {
         decl_insn(maskmovq);
@@ -2639,12 +2639,25 @@ int main(int argc, char **argv)
         rc = x86_emulate(&ctxt, &emulops);
         if ( rc != X86EMUL_OKAY || !check_eip(maskmovq) )
             goto fail;
+
+        asm volatile ( "pcmpeqb %mm3, %mm3\n\t"
+                       "punpcklbw %mm3, %mm4\n" );
+        memset(res, 0x55, 24);
+
+        set_insn(maskmovq);
+        regs.edi = (unsigned long)(res + 2);
+        rc = x86_emulate(&ctxt, &emulops);
+        if ( rc != X86EMUL_OKAY || !check_eip(maskmovq) ||
+             memcmp(res, res + 4, 8) ||
+             res[2] != 0xff55ff55 || res[3] != 0xff55ff55 )
+            goto fail;
+
         printf("okay\n");
     }
     else
         printf("skipped\n");
 
-    printf("%-40s", "Testing maskmovdqu (zero mask)...");
+    printf("%-40s", "Testing maskmovdqu %xmm3,%xmm3...");
     if ( stack_exec && cpu_has_sse2 )
     {
         decl_insn(maskmovdqu);
@@ -2653,9 +2666,24 @@ int main(int argc, char **argv)
                        put_insn(maskmovdqu, "maskmovdqu %xmm3, %xmm3") );
 
         set_insn(maskmovdqu);
+        regs.edi = 0;
         rc = x86_emulate(&ctxt, &emulops);
         if ( rc != X86EMUL_OKAY || !check_eip(maskmovdqu) )
             goto fail;
+
+        asm volatile ( "pcmpeqb %xmm4, %xmm4\n\t"
+                       "punpcklbw %xmm4, %xmm3\n" );
+        memset(res, 0x55, 48);
+
+        set_insn(maskmovdqu);
+        regs.edi = (unsigned long)(res + 4);
+        rc = x86_emulate(&ctxt, &emulops);
+        if ( rc != X86EMUL_OKAY || !check_eip(maskmovdqu) ||
+             memcmp(res, res + 8, 16) ||
+             res[4] != 0xff55ff55 || res[5] != 0xff55ff55 ||
+             res[6] != 0xff55ff55 || res[7] != 0xff55ff55 )
+            goto fail;
+
         printf("okay\n");
     }
     else





_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

^ permalink raw reply	[flat|nested] 465+ messages in thread

* [PATCH 3/6] x86emul: support AVX512 opmask insns
  2018-08-09  8:15 [PATCH 0/6] x86emul: fixes, improvements, and beginnings of AVX512 support Jan Beulich
  2018-08-09  8:23 ` [PATCH 1/6] x86emul: fix FMA scalar operand sizes Jan Beulich
  2018-08-09  8:24 ` [PATCH 2/6] x86emul: extend MASKMOV{Q,DQU} tests Jan Beulich
@ 2018-08-09  8:24 ` Jan Beulich
  2018-08-09  8:25 ` [PATCH 4/6] x86emul: clean up AVX2 insn use in test harness Jan Beulich
                   ` (9 subsequent siblings)
  12 siblings, 0 replies; 465+ messages in thread
From: Jan Beulich @ 2018-08-09  8:24 UTC (permalink / raw)
  To: xen-devel; +Cc: Andrew Cooper

These are all VEX encoded, so the EVEX decoding logic continues to
remain unused at this point.

The new testcase is deliberately coded in assembly, as a C one would
have become almost unreadable due to the overwhelming amount of
__builtin_...() that would need to be used. After all the compiler has
no underlying type (yet) that could be operated on without builtins,
other than the vector types used for "normal" SIMD insns.

Signed-off-by: Jan Beulich <jbeulich@suse.com>

--- a/tools/tests/x86_emulator/Makefile
+++ b/tools/tests/x86_emulator/Makefile
@@ -16,6 +16,8 @@ FMA := fma4 fma
 SG := avx2-sg
 TESTCASES := blowfish $(SIMD) $(FMA) $(SG)
 
+OPMASK := avx512f avx512dq avx512bw
+
 blowfish-cflags := ""
 blowfish-cflags-x86_32 := "-mno-accumulate-outgoing-args -Dstatic="
 
@@ -51,6 +53,10 @@ xop-vecs := $(avx-vecs)
 xop-ints := 1 2 4 8
 xop-flts := $(avx-flts)
 
+avx512f-opmask-vecs := 2
+avx512dq-opmask-vecs := 1
+avx512bw-opmask-vecs := 4 8
+
 # For AVX and later, have the compiler avoid XMM0 to widen coverage of
 # the VEX.vvvv checks in the emulator.  For 3DNow!, however, force SSE
 # use for floating point operations, to avoid mixing MMX and FPU register
@@ -80,9 +86,13 @@ $(1)-cflags := \
 	   $(foreach flt,$($(1)-flts), \
 	     "-D_$(vec)x$(idx)f$(flt) -m$(1:-sg=) $(call non-sse,$(1)) -Os -DVEC_MAX=$(vec) -DIDX_SIZE=$(idx) -DFLOAT_SIZE=$(flt)")))
 endef
+define opmask-defs
+$(1)-opmask-cflags := $(foreach vec,$($(1)-opmask-vecs), "-D_$(vec) -m$(1) -Os -DSIZE=$(vec)")
+endef
 
 $(foreach flavor,$(SIMD) $(FMA),$(eval $(call simd-defs,$(flavor))))
 $(foreach flavor,$(SG),$(eval $(call simd-sg-defs,$(flavor))))
+$(foreach flavor,$(OPMASK),$(eval $(call opmask-defs,$(flavor))))
 
 $(addsuffix .h,$(TESTCASES)): %.h: %.c testcase.mk Makefile
 	rm -f $@.new $*.bin
@@ -100,6 +110,22 @@ $(addsuffix .h,$(TESTCASES)): %.h: %.c t
 	)
 	mv $@.new $@
 
+$(addsuffix -opmask.h,$(OPMASK)): %.h: opmask.S testcase.mk Makefile
+	rm -f $@.new $*.bin
+	$(foreach arch,$(filter-out $(XEN_COMPILE_ARCH),x86_32) $(XEN_COMPILE_ARCH), \
+	    for cflags in $($*-cflags) $($*-cflags-$(arch)); do \
+		$(MAKE) -f testcase.mk TESTCASE=$* XEN_TARGET_ARCH=$(arch) $*-cflags="$$cflags" all; \
+		prefix=$(shell echo $(subst -,_,$*) | sed -e 's,^\([0-9]\),_\1,'); \
+		flavor=$$(echo $${cflags} | sed -e 's, .*,,' -e 'y,-=,__,') ; \
+		(echo 'static const unsigned int __attribute__((section(".test, \"ax\", @progbits #")))' \
+		      "$${prefix}_$(arch)$${flavor}[] = {"; \
+		 od -v -t x $*.bin | sed -e 's/^[0-9]* /0x/' -e 's/ /, 0x/g' -e 's/$$/,/'; \
+		 echo "};") >>$@.new; \
+		rm -f $*.bin; \
+	    done; \
+	)
+	mv $@.new $@
+
 $(addsuffix .c,$(SIMD)):
 	ln -sf simd.c $@
 
@@ -145,4 +171,4 @@ x86-emulate.o test_x86_emulator.o wrappe
 x86-emulate.o: x86_emulate/x86_emulate.c
 x86-emulate.o: HOSTCFLAGS += -D__XEN_TOOLS__
 
-test_x86_emulator.o: $(addsuffix .h,$(TESTCASES))
+test_x86_emulator.o: $(addsuffix .h,$(TESTCASES)) $(addsuffix -opmask.h,$(OPMASK))
--- /dev/null
+++ b/tools/tests/x86_emulator/opmask.S
@@ -0,0 +1,144 @@
+#ifdef __i386__
+# define R(x) e##x
+# define DATA(x) x
+#else
+# if SIZE == 8
+#  define R(x) r##x
+# else
+#  define R(x) e##x
+# endif
+# define DATA(x) x(%rip)
+#endif
+
+#if SIZE == 1
+# define _(x) x##b
+#elif SIZE == 2
+# define _(x) x##w
+# define WIDEN(x) x##bw
+#elif SIZE == 4
+# define _(x) x##d
+# define WIDEN(x) x##wd
+#elif SIZE == 8
+# define _(x) x##q
+# define WIDEN(x) x##dq
+#endif
+
+    .macro check res1:req, res2:req, line:req
+    _(kmov)       %\res1, DATA(out)
+#if SIZE < 8 || !defined(__i386__)
+    _(kmov)       %\res2, %R(dx)
+    cmp           DATA(out), %R(dx)
+#else
+    sub           $8, %esp
+    kmovq         %\res2, (%esp)
+    pop           %ecx
+    pop           %edx
+    cmp           DATA(out), %ecx
+    jne           0f
+    cmp           DATA(out+4), %edx
+0:
+#endif
+    je            1f
+    mov           $\line, %eax
+    ret
+1:
+    .endm
+
+    .text
+    .globl _start
+_start:
+    _(kmov)       DATA(in1), %k1
+#if SIZE < 8 || !defined(__i386__)
+    mov           DATA(in2), %R(ax)
+    _(kmov)       %R(ax), %k2
+#else
+    _(kmov)       DATA(in2), %k2
+#endif
+
+    _(kor)        %k1, %k2, %k3
+    _(kand)       %k1, %k2, %k4
+    _(kandn)      %k3, %k4, %k5
+    _(kxor)       %k1, %k2, %k6
+    check         k5, k6, __LINE__
+
+    _(knot)       %k6, %k3
+    _(kxnor)      %k1, %k2, %k4
+    check         k3, k4, __LINE__
+
+    _(kshiftl)    $1, %k1, %k3
+    _(kshiftl)    $2, %k3, %k4
+    _(kshiftl)    $3, %k1, %k5
+    check         k4, k5, __LINE__
+
+    _(kshiftr)    $1, %k1, %k3
+    _(kshiftr)    $2, %k3, %k4
+    _(kshiftr)    $3, %k1, %k5
+    check         k4, k5, __LINE__
+
+    _(kortest)    %k6, %k6
+    jnbe          1f
+    mov           $__LINE__, %eax
+    ret
+1:
+
+    _(kxor)       %k0, %k0, %k3
+    _(kortest)    %k3, %k3
+    jz            1f
+    mov           $__LINE__, %eax
+    ret
+1:
+
+    _(kxnor)      %k0, %k0, %k3
+    _(kortest)    %k3, %k3
+    jc            1f
+    mov           $__LINE__, %eax
+    ret
+1:
+
+#if SIZE > 1
+
+    _(kshiftr)    $SIZE*4, %k3, %k4
+    WIDEN(kunpck) %k4, %k4, %k5
+    check         k3, k5, __LINE__
+
+#endif
+
+#if SIZE != 2 || defined(__AVX512DQ__)
+
+    _(kadd)       %k1, %k1, %k3
+    _(kshiftl)    $1, %k1, %k4
+    check         k3, k4, __LINE__
+
+    _(ktest)      %k2, %k1
+    jnbe          1f
+    mov           $__LINE__, %eax
+    ret
+1:
+
+    _(kxor)       %k0, %k0, %k3
+    _(ktest)      %k0, %k3
+    jz            1f
+    mov           $__LINE__, %eax
+    ret
+1:
+
+    _(kxnor)      %k0, %k0, %k4
+    _(ktest)      %k0, %k4
+    jc            1f
+    mov           $__LINE__, %eax
+    ret
+1:
+
+#endif
+
+    xor           %eax, %eax
+    ret
+
+    .section .rodata, "a", @progbits
+    .balign 8
+in1: .byte 0b10110011, 0b10001111, 0b00001111, 0b10000011, 0b11110000, 0b00111111, 0b10000000, 0b11111111
+in2: .byte 0b11111111, 0b00000001, 0b11111100, 0b00001111, 0b11000001, 0b11110000, 0b11110001, 0b11001101
+
+    .data
+    .balign 8
+out: .quad 0
--- a/tools/tests/x86_emulator/test_x86_emulator.c
+++ b/tools/tests/x86_emulator/test_x86_emulator.c
@@ -18,6 +18,9 @@ asm ( ".pushsection .test, \"ax\", @prog
 #include "avx2.h"
 #include "avx2-sg.h"
 #include "xop.h"
+#include "avx512f-opmask.h"
+#include "avx512dq-opmask.h"
+#include "avx512bw-opmask.h"
 
 #define verbose false /* Switch to true for far more logging. */
 
@@ -78,6 +81,24 @@ static bool simd_check_xop(void)
     return cpu_has_xop;
 }
 
+static bool simd_check_avx512f(void)
+{
+    return cpu_has_avx512f;
+}
+#define simd_check_avx512f_opmask simd_check_avx512f
+
+static bool simd_check_avx512dq(void)
+{
+    return cpu_has_avx512dq;
+}
+#define simd_check_avx512dq_opmask simd_check_avx512dq
+
+static bool simd_check_avx512bw(void)
+{
+    return cpu_has_avx512bw;
+}
+#define simd_check_avx512bw_opmask simd_check_avx512bw
+
 static void simd_set_regs(struct cpu_user_regs *regs)
 {
     if ( cpu_has_mmx )
@@ -223,6 +244,10 @@ static const struct {
     SIMD(XOP i16x16,              xop,      32i2),
     SIMD(XOP i32x8,               xop,      32i4),
     SIMD(XOP i64x4,               xop,      32i8),
+    SIMD(OPMASK/w,     avx512f_opmask,         2),
+    SIMD(OPMASK/b,    avx512dq_opmask,         1),
+    SIMD(OPMASK/d,    avx512bw_opmask,         4),
+    SIMD(OPMASK/q,    avx512bw_opmask,         8),
 #undef SIMD_
 #undef SIMD
 };
@@ -3469,8 +3494,8 @@ int main(int argc, char **argv)
             rc = x86_emulate(&ctxt, &emulops);
             if ( rc != X86EMUL_OKAY )
             {
-                printf("failed at %%eip == %08lx (opcode %08x)\n",
-                       (unsigned long)regs.eip, ctxt.opcode);
+                printf("failed (%d) at %%eip == %08lx (opcode %08x)\n",
+                       rc, (unsigned long)regs.eip, ctxt.opcode);
                 return 1;
             }
         }
--- a/tools/tests/x86_emulator/testcase.mk
+++ b/tools/tests/x86_emulator/testcase.mk
@@ -14,3 +14,9 @@ all: $(TESTCASE).bin
 	$(LD) $(LDFLAGS_DIRECT) -N -Ttext 0x100000 -o $*.tmp $*.o
 	$(OBJCOPY) -O binary $*.tmp $@
 	rm -f $*.tmp
+
+%-opmask.bin: opmask.S
+	$(CC) $(filter-out -M% .%,$(CFLAGS)) -c $< -o $*.o
+	$(LD) $(LDFLAGS_DIRECT) -N -Ttext 0x100000 -o $*.tmp $*.o
+	$(OBJCOPY) -O binary $*.tmp $@
+	rm -f $*.tmp
--- a/tools/tests/x86_emulator/x86-emulate.c
+++ b/tools/tests/x86_emulator/x86-emulate.c
@@ -209,6 +209,9 @@ int emul_test_get_fpu(
     case X86EMUL_FPU_ymm:
         if ( cpu_has_avx )
             break;
+    case X86EMUL_FPU_opmask:
+        if ( cpu_has_avx512f )
+            break;
     default:
         return X86EMUL_UNHANDLEABLE;
     }
--- a/tools/tests/x86_emulator/x86-emulate.h
+++ b/tools/tests/x86_emulator/x86-emulate.h
@@ -236,6 +236,36 @@ static inline uint64_t xgetbv(uint32_t x
     (res.c & (1U << 21)) != 0; \
 })
 
+#define cpu_has_avx512f ({ \
+    struct cpuid_leaf res; \
+    emul_test_cpuid(1, 0, &res, NULL); \
+    if ( !(res.c & (1U << 27)) || ((xgetbv(0) & 0xe6) != 0xe6) ) \
+        res.b = 0; \
+    else \
+        emul_test_cpuid(7, 0, &res, NULL); \
+    (res.b & (1U << 16)) != 0; \
+})
+
+#define cpu_has_avx512dq ({ \
+    struct cpuid_leaf res; \
+    emul_test_cpuid(1, 0, &res, NULL); \
+    if ( !(res.c & (1U << 27)) || ((xgetbv(0) & 0xe6) != 0xe6) ) \
+        res.b = 0; \
+    else \
+        emul_test_cpuid(7, 0, &res, NULL); \
+    (res.b & (1U << 17)) != 0; \
+})
+
+#define cpu_has_avx512bw ({ \
+    struct cpuid_leaf res; \
+    emul_test_cpuid(1, 0, &res, NULL); \
+    if ( !(res.c & (1U << 27)) || ((xgetbv(0) & 0xe6) != 0xe6) ) \
+        res.b = 0; \
+    else \
+        emul_test_cpuid(7, 0, &res, NULL); \
+    (res.b & (1U << 30)) != 0; \
+})
+
 int emul_test_cpuid(
     uint32_t leaf,
     uint32_t subleaf,
--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -491,6 +491,7 @@ static const struct ext0f3a_table {
     [0x20] = { .simd_size = simd_none },
     [0x21] = { .simd_size = simd_other },
     [0x22] = { .simd_size = simd_none },
+    [0x30 ... 0x33] = { .simd_size = simd_other, .two_op = 1 },
     [0x38] = { .simd_size = simd_128 },
     [0x39] = { .simd_size = simd_128, .to_mem = 1, .two_op = 1 },
     [0x40 ... 0x41] = { .simd_size = simd_packed_fp },
@@ -1187,6 +1188,11 @@ static int _get_fpu(
             return X86EMUL_UNHANDLEABLE;
         break;
 
+    case X86EMUL_FPU_opmask:
+        if ( !(xcr0 & X86_XCR0_SSE) || !(xcr0 & X86_XCR0_OPMASK) )
+            return X86EMUL_UNHANDLEABLE;
+        break;
+
     default:
         break;
     }
@@ -1762,12 +1768,15 @@ static bool vcpu_has(
 #define vcpu_has_bmi2()        vcpu_has(         7, EBX,  8, ctxt, ops)
 #define vcpu_has_rtm()         vcpu_has(         7, EBX, 11, ctxt, ops)
 #define vcpu_has_mpx()         vcpu_has(         7, EBX, 14, ctxt, ops)
+#define vcpu_has_avx512f()     vcpu_has(         7, EBX, 16, ctxt, ops)
+#define vcpu_has_avx512dq()    vcpu_has(         7, EBX, 17, ctxt, ops)
 #define vcpu_has_rdseed()      vcpu_has(         7, EBX, 18, ctxt, ops)
 #define vcpu_has_adx()         vcpu_has(         7, EBX, 19, ctxt, ops)
 #define vcpu_has_smap()        vcpu_has(         7, EBX, 20, ctxt, ops)
 #define vcpu_has_clflushopt()  vcpu_has(         7, EBX, 23, ctxt, ops)
 #define vcpu_has_clwb()        vcpu_has(         7, EBX, 24, ctxt, ops)
 #define vcpu_has_sha()         vcpu_has(         7, EBX, 29, ctxt, ops)
+#define vcpu_has_avx512bw()    vcpu_has(         7, EBX, 30, ctxt, ops)
 #define vcpu_has_rdpid()       vcpu_has(         7, ECX, 22, ctxt, ops)
 #define vcpu_has_clzero()      vcpu_has(0x80000008, EBX,  0, ctxt, ops)
 
@@ -2396,6 +2405,18 @@ x86_decode_twobyte(
         }
         break;
 
+    case X86EMUL_OPC_VEX(0, 0x90):    /* kmov{w,q} */
+    case X86EMUL_OPC_VEX_66(0, 0x90): /* kmov{b,d} */
+        state->desc = DstReg | SrcMem | Mov;
+        state->simd_size = simd_other;
+        break;
+
+    case X86EMUL_OPC_VEX(0, 0x91):    /* kmov{w,q} */
+    case X86EMUL_OPC_VEX_66(0, 0x91): /* kmov{b,d} */
+        state->desc = DstMem | SrcReg | Mov;
+        state->simd_size = simd_other;
+        break;
+
     case 0xae:
         ctxt->opcode |= MASK_INSR(vex.pfx, X86EMUL_OPC_PFX_MASK);
         /* fall through */
@@ -6002,6 +6023,60 @@ x86_emulate(
             dst.val = src.val;
         break;
 
+    case X86EMUL_OPC_VEX(0x0f, 0x4a):    /* kadd{w,q} k,k,k */
+        if ( !vex.w )
+            host_and_vcpu_must_have(avx512dq);
+        /* fall through */
+    case X86EMUL_OPC_VEX(0x0f, 0x41):    /* kand{w,q} k,k,k */
+    case X86EMUL_OPC_VEX_66(0x0f, 0x41): /* kand{b,d} k,k,k */
+    case X86EMUL_OPC_VEX(0x0f, 0x42):    /* kandn{w,q} k,k,k */
+    case X86EMUL_OPC_VEX_66(0x0f, 0x42): /* kandn{b,d} k,k,k */
+    case X86EMUL_OPC_VEX(0x0f, 0x45):    /* kor{w,q} k,k,k */
+    case X86EMUL_OPC_VEX_66(0x0f, 0x45): /* kor{b,d} k,k,k */
+    case X86EMUL_OPC_VEX(0x0f, 0x46):    /* kxnor{w,q} k,k,k */
+    case X86EMUL_OPC_VEX_66(0x0f, 0x46): /* kxnor{b,d} k,k,k */
+    case X86EMUL_OPC_VEX(0x0f, 0x47):    /* kxor{w,q} k,k,k */
+    case X86EMUL_OPC_VEX_66(0x0f, 0x47): /* kxor{b,d} k,k,k */
+    case X86EMUL_OPC_VEX_66(0x0f, 0x4a): /* kadd{b,d} k,k,k */
+        generate_exception_if(!vex.l, EXC_UD);
+    opmask_basic:
+        if ( vex.w )
+            host_and_vcpu_must_have(avx512bw);
+        else if ( vex.pfx )
+            host_and_vcpu_must_have(avx512dq);
+    opmask_common:
+        host_and_vcpu_must_have(avx512f);
+        generate_exception_if(!vex.r || (mode_64bit() && !(vex.reg & 8)) ||
+                              ea.type != OP_REG, EXC_UD);
+
+        vex.reg |= 8;
+        d &= ~TwoOp;
+
+        get_fpu(X86EMUL_FPU_opmask);
+
+        opc = init_prefixes(stub);
+        opc[0] = b;
+        opc[1] = modrm;
+        insn_bytes = PFX_BYTES + 2;
+
+        state->simd_size = simd_other;
+        op_bytes = 1; /* Any non-zero value will do. */
+        break;
+
+    case X86EMUL_OPC_VEX(0x0f, 0x44):    /* knot{w,q} k,k */
+    case X86EMUL_OPC_VEX_66(0x0f, 0x44): /* knot{b,d} k,k */
+        generate_exception_if(vex.l || vex.reg != 0xf, EXC_UD);
+        goto opmask_basic;
+
+    case X86EMUL_OPC_VEX(0x0f, 0x4b):    /* kunpck{w,d}{d,q} k,k,k */
+        generate_exception_if(!vex.l, EXC_UD);
+        host_and_vcpu_must_have(avx512bw);
+        goto opmask_common;
+
+    case X86EMUL_OPC_VEX_66(0x0f, 0x4b): /* kunpckbw k,k,k */
+        generate_exception_if(!vex.l || vex.w, EXC_UD);
+        goto opmask_common;
+
     CASE_SIMD_PACKED_FP(, 0x0f, 0x50):     /* movmskp{s,d} xmm,reg */
     CASE_SIMD_PACKED_FP(_VEX, 0x0f, 0x50): /* vmovmskp{s,d} {x,y}mm,reg */
     CASE_SIMD_PACKED_INT(0x0f, 0xd7):      /* pmovmskb {,x}mm,reg */
@@ -6552,6 +6627,154 @@ x86_emulate(
         dst.val = test_cc(b, _regs.eflags);
         break;
 
+    case X86EMUL_OPC_VEX(0x0f, 0x91):    /* kmov{w,q} k,mem */
+    case X86EMUL_OPC_VEX_66(0x0f, 0x91): /* kmov{b,d} k,mem */
+        generate_exception_if(ea.type != OP_MEM, EXC_UD);
+        /* fall through */
+    case X86EMUL_OPC_VEX(0x0f, 0x90):    /* kmov{w,q} k/mem,k */
+    case X86EMUL_OPC_VEX_66(0x0f, 0x90): /* kmov{b,d} k/mem,k */
+        generate_exception_if(vex.l || !vex.r, EXC_UD);
+        host_and_vcpu_must_have(avx512f);
+        if ( vex.w )
+        {
+            host_and_vcpu_must_have(avx512bw);
+            op_bytes = 4 << !vex.pfx;
+        }
+        else if ( vex.pfx )
+        {
+            host_and_vcpu_must_have(avx512dq);
+            op_bytes = 1;
+        }
+        else
+            op_bytes = 2;
+
+        get_fpu(X86EMUL_FPU_opmask);
+
+        opc = init_prefixes(stub);
+        opc[0] = b;
+        opc[1] = modrm;
+        if ( ea.type == OP_MEM )
+        {
+            /* convert memory operand to (%rAX) */
+            vex.b = 1;
+            opc[1] &= 0x38;
+        }
+        insn_bytes = PFX_BYTES + 2;
+        break;
+
+    case X86EMUL_OPC_VEX(0x0f, 0x92):    /* kmovw r32,k */
+    case X86EMUL_OPC_VEX_66(0x0f, 0x92): /* kmovb r32,k */
+    case X86EMUL_OPC_VEX_F2(0x0f, 0x92): /* kmov{d,q} reg,k */
+        generate_exception_if(vex.l || !vex.r || vex.reg != 0xf ||
+                              ea.type != OP_REG, EXC_UD);
+
+        host_and_vcpu_must_have(avx512f);
+        if ( vex.pfx == vex_f2 )
+            host_and_vcpu_must_have(avx512bw);
+        else
+        {
+            generate_exception_if(vex.w, EXC_UD);
+            if ( vex.pfx )
+                host_and_vcpu_must_have(avx512dq);
+        }
+
+        get_fpu(X86EMUL_FPU_opmask);
+
+        opc = init_prefixes(stub);
+        opc[0] = b;
+        /* Convert GPR source to %rAX. */
+        vex.b = 1;
+        if ( !mode_64bit() )
+            vex.w = 0;
+        opc[1] = modrm & 0xf8;
+        opc[2] = 0xc3;
+
+        copy_VEX(opc, vex);
+        ea.reg = decode_gpr(&_regs, modrm_rm);
+        invoke_stub("", "", "=m" (dummy) : "a" (*ea.reg));
+
+        put_stub(stub);
+
+        ASSERT(!state->simd_size);
+        dst.type = OP_NONE;
+        break;
+
+    case X86EMUL_OPC_VEX(0x0f, 0x93):    /* kmovw k,r32 */
+    case X86EMUL_OPC_VEX_66(0x0f, 0x93): /* kmovb k,r32 */
+    case X86EMUL_OPC_VEX_F2(0x0f, 0x93): /* kmov{d,q} k,reg */
+        generate_exception_if(vex.l || vex.reg != 0xf || ea.type != OP_REG,
+                              EXC_UD);
+        dst = ea;
+        dst.reg = decode_gpr(&_regs, modrm_reg);
+
+        host_and_vcpu_must_have(avx512f);
+        if ( vex.pfx == vex_f2 )
+        {
+            host_and_vcpu_must_have(avx512bw);
+            dst.bytes = 4 << (mode_64bit() && vex.w);
+        }
+        else
+        {
+            generate_exception_if(vex.w, EXC_UD);
+            dst.bytes = 4;
+            if ( vex.pfx )
+                host_and_vcpu_must_have(avx512dq);
+        }
+
+        get_fpu(X86EMUL_FPU_opmask);
+
+        opc = init_prefixes(stub);
+        opc[0] = b;
+        /* Convert GPR destination to %rAX. */
+        vex.r = 1;
+        if ( !mode_64bit() )
+            vex.w = 0;
+        opc[1] = modrm & 0xc7;
+        opc[2] = 0xc3;
+
+        copy_VEX(opc, vex);
+        invoke_stub("", "", "=a" (dst.val) : [dummy] "i" (0));
+
+        put_stub(stub);
+
+        ASSERT(!state->simd_size);
+        break;
+
+    case X86EMUL_OPC_VEX(0x0f, 0x99):    /* ktest{w,q} k,k */
+        if ( !vex.w )
+            host_and_vcpu_must_have(avx512dq);
+        /* fall through */
+    case X86EMUL_OPC_VEX(0x0f, 0x98):    /* kortest{w,q} k,k */
+    case X86EMUL_OPC_VEX_66(0x0f, 0x98): /* kortest{b,d} k,k */
+    case X86EMUL_OPC_VEX_66(0x0f, 0x99): /* ktest{b,d} k,k */
+        generate_exception_if(vex.l || !vex.r || vex.reg != 0xf ||
+                              ea.type != OP_REG, EXC_UD);
+        host_and_vcpu_must_have(avx512f);
+        if ( vex.w )
+            host_and_vcpu_must_have(avx512bw);
+        else if ( vex.pfx )
+            host_and_vcpu_must_have(avx512dq);
+
+        get_fpu(X86EMUL_FPU_opmask);
+
+        opc = init_prefixes(stub);
+        opc[0] = b;
+        opc[1] = modrm;
+        opc[2] = 0xc3;
+
+        copy_VEX(opc, vex);
+        invoke_stub(_PRE_EFLAGS("[eflags]", "[mask]", "[tmp]"),
+                    _POST_EFLAGS("[eflags]", "[mask]", "[tmp]"),
+                    [eflags] "+g" (_regs.eflags),
+                    "=a" (dst.val), [tmp] "=&r" (dummy)
+                    : [mask] "i" (EFLAGS_MASK));
+
+        put_stub(stub);
+
+        ASSERT(!state->simd_size);
+        dst.type = OP_NONE;
+        break;
+
     case X86EMUL_OPC(0x0f, 0xa2): /* cpuid */
         msr_val = 0;
         fail_if(ops->cpuid == NULL);
@@ -8170,6 +8393,23 @@ x86_emulate(
         generate_exception_if(vex.l, EXC_UD);
         goto simd_0f_imm8_avx;
 
+    case X86EMUL_OPC_VEX_66(0x0f3a, 0x30): /* kshiftr{b,w} $imm8,k,k */
+    case X86EMUL_OPC_VEX_66(0x0f3a, 0x32): /* kshiftl{b,w} $imm8,k,k */
+        if ( !vex.w )
+            host_and_vcpu_must_have(avx512dq);
+    opmask_shift_imm:
+        generate_exception_if(vex.l || !vex.r || vex.reg != 0xf ||
+                              ea.type != OP_REG, EXC_UD);
+        host_and_vcpu_must_have(avx512f);
+        get_fpu(X86EMUL_FPU_opmask);
+        op_bytes = 1; /* Any non-zero value will do. */
+        goto simd_0f_imm8;
+
+    case X86EMUL_OPC_VEX_66(0x0f3a, 0x31): /* kshiftr{d,q} $imm8,k,k */
+    case X86EMUL_OPC_VEX_66(0x0f3a, 0x33): /* kshiftl{d,q} $imm8,k,k */
+        host_and_vcpu_must_have(avx512bw);
+        goto opmask_shift_imm;
+
     case X86EMUL_OPC_66(0x0f3a, 0x44):     /* pclmulqdq $imm8,xmm/m128,xmm */
     case X86EMUL_OPC_VEX_66(0x0f3a, 0x44): /* vpclmulqdq $imm8,xmm/m128,xmm,xmm */
         host_and_vcpu_must_have(pclmulqdq);
--- a/xen/arch/x86/x86_emulate/x86_emulate.h
+++ b/xen/arch/x86/x86_emulate/x86_emulate.h
@@ -170,6 +170,7 @@ enum x86_emulate_fpu_type {
     X86EMUL_FPU_mmx, /* MMX instruction set (%mm0-%mm7) */
     X86EMUL_FPU_xmm, /* SSE instruction set (%xmm0-%xmm7/15) */
     X86EMUL_FPU_ymm, /* AVX/XOP instruction set (%ymm0-%ymm7/15) */
+    X86EMUL_FPU_opmask, /* AVX512 opmask instruction set (%k0-%k7) */
     /* This sentinel will never be passed to ->get_fpu(). */
     X86EMUL_FPU_none
 };
--- a/xen/include/asm-x86/cpufeature.h
+++ b/xen/include/asm-x86/cpufeature.h
@@ -99,9 +99,12 @@
 #define cpu_has_rtm             boot_cpu_has(X86_FEATURE_RTM)
 #define cpu_has_fpu_sel         (!boot_cpu_has(X86_FEATURE_NO_FPU_SEL))
 #define cpu_has_mpx             boot_cpu_has(X86_FEATURE_MPX)
+#define cpu_has_avx512f         boot_cpu_has(X86_FEATURE_AVX512F)
+#define cpu_has_avx512dq        boot_cpu_has(X86_FEATURE_AVX512DQ)
 #define cpu_has_rdseed          boot_cpu_has(X86_FEATURE_RDSEED)
 #define cpu_has_smap            boot_cpu_has(X86_FEATURE_SMAP)
 #define cpu_has_sha             boot_cpu_has(X86_FEATURE_SHA)
+#define cpu_has_avx512bw        boot_cpu_has(X86_FEATURE_AVX512BW)
 
 /* CPUID level 0x80000007.edx */
 #define cpu_has_itsc            boot_cpu_has(X86_FEATURE_ITSC)




_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

^ permalink raw reply	[flat|nested] 465+ messages in thread

* [PATCH 4/6] x86emul: clean up AVX2 insn use in test harness
  2018-08-09  8:15 [PATCH 0/6] x86emul: fixes, improvements, and beginnings of AVX512 support Jan Beulich
                   ` (2 preceding siblings ...)
  2018-08-09  8:24 ` [PATCH 3/6] x86emul: support AVX512 opmask insns Jan Beulich
@ 2018-08-09  8:25 ` Jan Beulich
  2018-08-09  8:25 ` [PATCH 5/6] x86emul: correct EVEX decoding Jan Beulich
                   ` (8 subsequent siblings)
  12 siblings, 0 replies; 465+ messages in thread
From: Jan Beulich @ 2018-08-09  8:25 UTC (permalink / raw)
  To: xen-devel; +Cc: Andrew Cooper

Drop the pretty pointless conditionals from code testing AVX insns and
properly use AVX2 mnemonics in code testing AVX2 insns (the test harness
is already requiring sufficiently new a compiler/assembler).

Signed-off-by: Jan Beulich <jbeulich@suse.com>

--- a/tools/tests/x86_emulator/test_x86_emulator.c
+++ b/tools/tests/x86_emulator/test_x86_emulator.c
@@ -2071,11 +2071,6 @@ int main(int argc, char **argv)
         rc = x86_emulate(&ctxt, &emulops);
         if ( rc != X86EMUL_OKAY || !check_eip(vmovdqu_from_mem) )
             goto fail;
-#if 0 /* Don't use AVX2 instructions for now */
-        asm ( "vpcmpeqb %%ymm2, %%ymm2, %%ymm2\n\t"
-              "vpcmpeqb %%ymm4, %%ymm2, %%ymm0\n\t"
-              "vpmovmskb %%ymm0, %0" : "=r" (rc) );
-#else
         asm ( "vextractf128 $1, %%ymm4, %%xmm3\n\t"
               "vpcmpeqb %%xmm2, %%xmm2, %%xmm2\n\t"
               "vpcmpeqb %%xmm4, %%xmm2, %%xmm0\n\t"
@@ -2083,7 +2078,6 @@ int main(int argc, char **argv)
               "vpmovmskb %%xmm0, %0\n\t"
               "vpmovmskb %%xmm1, %1" : "=r" (rc), "=r" (i) );
         rc |= i << 16;
-#endif
         if ( rc != 0xffffffff )
             goto fail;
         printf("okay\n");
@@ -2755,11 +2749,6 @@ int main(int argc, char **argv)
         rc = x86_emulate(&ctxt, &emulops);
         if ( rc != X86EMUL_OKAY || !check_eip(vlddqu) )
             goto fail;
-#if 0 /* Don't use AVX2 instructions for now */
-        asm ( "vpcmpeqb %%ymm2, %%ymm2, %%ymm2\n\t"
-              "vpcmpeqb %%ymm4, %%ymm2, %%ymm0\n\t"
-              "vpmovmskb %%ymm0, %0" : "=r" (rc) );
-#else
         asm ( "vextractf128 $1, %%ymm4, %%xmm3\n\t"
               "vpcmpeqb %%xmm2, %%xmm2, %%xmm2\n\t"
               "vpcmpeqb %%xmm4, %%xmm2, %%xmm0\n\t"
@@ -2767,7 +2756,6 @@ int main(int argc, char **argv)
               "vpmovmskb %%xmm0, %0\n\t"
               "vpmovmskb %%xmm1, %1" : "=r" (rc), "=r" (i) );
         rc |= i << 16;
-#endif
         if ( ~rc )
             goto fail;
         printf("okay\n");
@@ -2806,15 +2794,9 @@ int main(int argc, char **argv)
     {
         decl_insn(vmovntdqa);
 
-#if 0 /* Don't use AVX2 instructions for now */
         asm volatile ( "vpxor %%ymm4, %%ymm4, %%ymm4\n"
                        put_insn(vmovntdqa, "vmovntdqa (%0), %%ymm4")
                        :: "c" (NULL) );
-#else
-        asm volatile ( "vpxor %xmm4, %xmm4, %xmm4\n"
-                       put_insn(vmovntdqa,
-                                ".byte 0xc4, 0xe2, 0x7d, 0x2a, 0x21") );
-#endif
 
         set_insn(vmovntdqa);
         memset(res, 0x55, 96);
@@ -2823,19 +2805,9 @@ int main(int argc, char **argv)
         rc = x86_emulate(&ctxt, &emulops);
         if ( rc != X86EMUL_OKAY || !check_eip(vmovntdqa) )
             goto fail;
-#if 0 /* Don't use AVX2 instructions for now */
         asm ( "vpcmpeqb %%ymm2, %%ymm2, %%ymm2\n\t"
               "vpcmpeqb %%ymm4, %%ymm2, %%ymm0\n\t"
               "vpmovmskb %%ymm0, %0" : "=r" (rc) );
-#else
-        asm ( "vextractf128 $1, %%ymm4, %%xmm3\n\t"
-              "vpcmpeqb %%xmm2, %%xmm2, %%xmm2\n\t"
-              "vpcmpeqb %%xmm4, %%xmm2, %%xmm0\n\t"
-              "vpcmpeqb %%xmm3, %%xmm2, %%xmm1\n\t"
-              "vpmovmskb %%xmm0, %0\n\t"
-              "vpmovmskb %%xmm1, %1" : "=r" (rc), "=r" (i) );
-        rc |= i << 16;
-#endif
         if ( ~rc )
             goto fail;
         printf("okay\n");
@@ -3161,12 +3133,7 @@ int main(int argc, char **argv)
 
         asm volatile ( "vpxor %%xmm1, %%xmm1, %%xmm1\n\t"
                        "vpinsrd $0b00, %1, %%xmm1, %%xmm2\n\t"
-#if 0 /* Don't use AVX2 instructions for now */
                        put_insn(vpmaskmovd, "vpmaskmovd %%xmm1, %%xmm2, (%0)")
-#else
-                       put_insn(vpmaskmovd,
-                                ".byte 0xc4, 0xe2, 0x69, 0x8e, 0x0a")
-#endif
                        :: "d" (NULL), "r" (~0) );
 
         memset(res + MMAP_SZ / sizeof(*res) - 8, 0xdb, 32);
@@ -3200,14 +3167,8 @@ int main(int argc, char **argv)
 
         asm volatile ( "vpxor %%xmm1, %%xmm1, %%xmm1\n\t"
                        "vpcmpeqd %%xmm0, %%xmm0, %%xmm0\n\t"
-#if 0 /* Don't use AVX2 instructions for now */
                        "vpblendd $0b0011, %%xmm0, %%xmm1, %%xmm2\n\t"
                        put_insn(vpmaskmovq, "vpmaskmovq %%xmm1, %%xmm2, (%0)")
-#else
-                       ".byte 0xc4, 0xe3, 0x71, 0x02, 0xd0, 0b0011\n\t"
-                       put_insn(vpmaskmovq,
-                                ".byte 0xc4, 0xe2, 0xe9, 0x8e, 0x0a")
-#endif
                        :: "d" (NULL) );
 
         memset(res + MMAP_SZ / sizeof(*res) - 8, 0xdb, 32);
@@ -3221,11 +3182,7 @@ int main(int argc, char **argv)
                     res + MMAP_SZ / sizeof(*res) - 4, 8) )
             goto fail;
 
-#if 0 /* Don't use AVX2 instructions for now */
         asm volatile ( "vpermq $0b00000001, %ymm2, %ymm2" );
-#else
-        asm volatile ( ".byte 0xc4, 0xe3, 0xfd, 0x00, 0xd2, 0b00000001" );
-#endif
         memset(res, 0xdb, 32);
         set_insn(vpmaskmovq);
         regs.edx = (unsigned long)(res - 2);




_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

^ permalink raw reply	[flat|nested] 465+ messages in thread

* [PATCH 5/6] x86emul: correct EVEX decoding
  2018-08-09  8:15 [PATCH 0/6] x86emul: fixes, improvements, and beginnings of AVX512 support Jan Beulich
                   ` (3 preceding siblings ...)
  2018-08-09  8:25 ` [PATCH 4/6] x86emul: clean up AVX2 insn use in test harness Jan Beulich
@ 2018-08-09  8:25 ` Jan Beulich
  2018-08-09  8:26 ` [PATCH 6/6] x86emul: generalize vector length handling for AVX512/EVEX Jan Beulich
                   ` (7 subsequent siblings)
  12 siblings, 0 replies; 465+ messages in thread
From: Jan Beulich @ 2018-08-09  8:25 UTC (permalink / raw)
  To: xen-devel; +Cc: Andrew Cooper

Fix an inverted pair of checks, drop an incorrect instance of #UD
raising for non-64-bit mode, and add further generic checks.

Note: Other than SDM Vol 2 rev 067 states, EVEX.V' is _not_ ignored
      outside of 64-bit mode when the field does not encode a register.
      Just like EVEX.VVVV is required to be 0b1111 in that case, EVEX.V'
      is required to be 1 there.

Also rename the bcst field to br, as #UD generation for individual insns
will need to consider both of its possible meanings.

Signed-off-by: Jan Beulich <jbeulich@suse.com>

--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -650,7 +650,7 @@ union evex {
         uint8_t w:1;
         uint8_t opmsk:3;
         uint8_t RX:1;
-        uint8_t bcst:1;
+        uint8_t br:1;
         uint8_t lr:2;
         uint8_t z:1;
     };
@@ -2760,13 +2760,11 @@ x86_decode(
                         evex.raw[1] = vex.raw[1];
                         evex.raw[2] = insn_fetch_type(uint8_t);
 
-                        generate_exception_if(evex.mbs || !evex.mbz, EXC_UD);
+                        generate_exception_if(!evex.mbs || evex.mbz, EXC_UD);
+                        generate_exception_if(!evex.opmsk && evex.z, EXC_UD);
 
                         if ( !mode_64bit() )
-                        {
-                            generate_exception_if(!evex.RX, EXC_UD);
                             evex.R = 1;
-                        }
 
                         vex.opcx = evex.opcx;
                         break;
@@ -3404,6 +3402,7 @@ x86_emulate(
         d = (d & ~DstMask) | DstMem;
         /* Becomes a normal DstMem operation from here on. */
     case DstMem:
+        generate_exception_if(ea.type == OP_MEM && evex.z, EXC_UD);
         if ( state->simd_size )
         {
             generate_exception_if(lock_prefix, EXC_UD);





_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

^ permalink raw reply	[flat|nested] 465+ messages in thread

* [PATCH 6/6] x86emul: generalize vector length handling for AVX512/EVEX
  2018-08-09  8:15 [PATCH 0/6] x86emul: fixes, improvements, and beginnings of AVX512 support Jan Beulich
                   ` (4 preceding siblings ...)
  2018-08-09  8:25 ` [PATCH 5/6] x86emul: correct EVEX decoding Jan Beulich
@ 2018-08-09  8:26 ` Jan Beulich
  2018-08-29 14:20 ` [PATCH v2 0/6] x86emul: fixes, improvements, and beginnings of AVX512 support Jan Beulich
                   ` (6 subsequent siblings)
  12 siblings, 0 replies; 465+ messages in thread
From: Jan Beulich @ 2018-08-09  8:26 UTC (permalink / raw)
  To: xen-devel; +Cc: Andrew Cooper

To allow for some code sharing where possible, copy VEX.L into EVEX.LR
even for VEX (or XOP) encoded insns. Make operand size determination
use this right away, at the same time adding consistency checks for the
EVEX scalar insn cases (the non-scalar ones aren't uniform enough for
the checking to be done in a central place like this).

Note that the broadcast case is not handled here, but will be taken care
of elsewhere (in just a single place rather than at least two).

Signed-off-by: Jan Beulich <jbeulich@suse.com>

--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -191,14 +191,14 @@ enum simd_opsize {
      * Ordinary packed integers:
      * - 64 bits without prefix 66 (MMX)
      * - 128 bits with prefix 66 (SSEn)
-     * - 128/256 bits depending on VEX.L (AVX)
+     * - 128/256/512 bits depending on VEX.L/EVEX.LR (AVX+)
      */
     simd_packed_int,
 
     /*
      * Ordinary packed/scalar floating point:
      * - 128 bits without prefix or with prefix 66 (SSEn)
-     * - 128/256 bits depending on VEX.L (AVX)
+     * - 128/256/512 bits depending on VEX.L/EVEX.LR (AVX+)
      * - 32 bits with prefix F3 (scalar single)
      * - 64 bits with prefix F2 (scalar doubgle)
      */
@@ -207,14 +207,14 @@ enum simd_opsize {
     /*
      * Packed floating point:
      * - 128 bits without prefix or with prefix 66 (SSEn)
-     * - 128/256 bits depending on VEX.L (AVX)
+     * - 128/256/512 bits depending on VEX.L/EVEX.LR (AVX+)
      */
     simd_packed_fp,
 
     /*
      * Single precision packed/scalar floating point:
      * - 128 bits without prefix (SSEn)
-     * - 128/256 bits depending on VEX.L, no prefix (AVX)
+     * - 128/256/512 bits depending on VEX.L/EVEX.LR (AVX+)
      * - 32 bits with prefix F3 (scalar)
      */
     simd_single_fp,
@@ -228,7 +228,7 @@ enum simd_opsize {
 
     /*
      * Scalar floating point:
-     * - 32/64 bits depending on VEX.W
+     * - 32/64 bits depending on VEX.W/EVEX.W
      */
     simd_scalar_vexw,
 
@@ -2818,6 +2818,9 @@ x86_decode(
 
                 opcode |= b | MASK_INSR(vex.pfx, X86EMUL_OPC_PFX_MASK);
 
+                if ( !evex.mbs )
+                    evex.lr = vex.l;
+
                 if ( !(d & ModRM) )
                     break;
 
@@ -3148,7 +3151,7 @@ x86_decode(
             }
             /* fall through */
         case vex_66:
-            op_bytes = 16 << vex.l;
+            op_bytes = 16 << evex.lr;
             break;
         default:
             op_bytes = 0;
@@ -3172,13 +3175,23 @@ x86_decode(
     case simd_any_fp:
         switch ( vex.pfx )
         {
-        default:     op_bytes = 16 << vex.l; break;
-        case vex_f3: op_bytes = 4;           break;
-        case vex_f2: op_bytes = 8;           break;
+        default:
+            op_bytes = 16 << evex.lr;
+            break;
+        case vex_f3:
+            generate_exception_if(evex.mbs && evex.w, EXC_UD);
+            op_bytes = 4;
+            break;
+        case vex_f2:
+            generate_exception_if(evex.mbs && !evex.w, EXC_UD);
+            op_bytes = 8;
+            break;
         }
         break;
 
     case simd_scalar_opc:
+        generate_exception_if(evex.mbs && (evex.w != (ctxt->opcode & 1)),
+                              EXC_UD);
         op_bytes = 4 << (ctxt->opcode & 1);
         break;
 





_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

^ permalink raw reply	[flat|nested] 465+ messages in thread

* [PATCH v2 0/6] x86emul: fixes, improvements, and beginnings of AVX512 support
  2018-08-09  8:15 [PATCH 0/6] x86emul: fixes, improvements, and beginnings of AVX512 support Jan Beulich
                   ` (5 preceding siblings ...)
  2018-08-09  8:26 ` [PATCH 6/6] x86emul: generalize vector length handling for AVX512/EVEX Jan Beulich
@ 2018-08-29 14:20 ` Jan Beulich
  2018-08-29 14:23   ` [PATCH v2 1/6] x86emul: fix FMA scalar operand sizes Jan Beulich
                     ` (5 more replies)
  2018-09-18 11:46 ` [PATCH v3 00/34] x86emul: fixes, improvements, and beginnings of AVX512 support Jan Beulich
                   ` (5 subsequent siblings)
  12 siblings, 6 replies; 465+ messages in thread
From: Jan Beulich @ 2018-08-29 14:20 UTC (permalink / raw)
  To: xen-devel; +Cc: Andrew Cooper

1: fix FMA scalar operand sizes
2: extend MASKMOV{Q,DQU} tests
3: support AVX512 opmask insns
4: clean up AVX2 insn use in test harness
5: correct EVEX decoding
6: generalize vector length handling for AVX512/EVEX

While I also have ready a patch emulating the basic AVX512 moves,
its prereq to widen respective fields in HVM code to allow for 64 byte
operand sizes has a dependency on the "x86/HVM: implement memory
read caching" series, which looks to be stalled, and I prefer to avoid
posting further patches with dependencies on stalled series.

Some support beyond moves is also mostly ready, but it's not quite
enough yet to enable testing that code in the harness, and I'd rather
not post code that I haven't pushed through the harness yet. The
partial testing I've been doing means, however, that in particular the
last two patches here have been tested already, even if no such
testing would be possible for others until further patches actually get
posted.

Jan




_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

^ permalink raw reply	[flat|nested] 465+ messages in thread

* [PATCH v2 1/6] x86emul: fix FMA scalar operand sizes
  2018-08-29 14:20 ` [PATCH v2 0/6] x86emul: fixes, improvements, and beginnings of AVX512 support Jan Beulich
@ 2018-08-29 14:23   ` Jan Beulich
  2018-09-03 16:43     ` Andrew Cooper
  2018-08-29 14:23   ` [PATCH v2 2/6] x86emul: extend MASKMOV{Q,DQU} tests Jan Beulich
                     ` (4 subsequent siblings)
  5 siblings, 1 reply; 465+ messages in thread
From: Jan Beulich @ 2018-08-29 14:23 UTC (permalink / raw)
  To: xen-devel; +Cc: Andrew Cooper

FMA insns, other than the earlier AVX additions, don't use the low
opcode bit to distinguish between single and double vector elements.
While the difference is benign for packed flavors, the scalar ones
need to use VEX.W here. Oddly enough the table entries didn't even use
simd_scalar_fp, but uniformly used simd_packed_fp (implying the
distinction was by [VEX-encoded] opcode prefix).

Split simd_scalar_fp into simd_scalar_opc and simd_scalar_vexw, and
correct the scalar FMA table entries to use the latter.

Also correct the scalar insn comments (they only ever use XMM registers
as operands).

Signed-off-by: Jan Beulich <jbeulich@suse.com>

--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -224,7 +224,13 @@ enum simd_opsize {
      * - 32 bits with low opcode bit clear (scalar single)
      * - 64 bits with low opcode bit set (scalar double)
      */
-    simd_scalar_fp,
+    simd_scalar_opc,
+
+    /*
+     * Scalar floating point:
+     * - 32/64 bits depending on VEX.W
+     */
+    simd_scalar_vexw,
 
     /*
      * 128 bits of integer or floating point data, with no further
@@ -407,7 +413,7 @@ static const struct ext0f38_table {
     [0x13] = { .simd_size = simd_other, .two_op = 1 },
     [0x14 ... 0x16] = { .simd_size = simd_packed_fp },
     [0x17] = { .simd_size = simd_packed_int, .two_op = 1 },
-    [0x18 ... 0x19] = { .simd_size = simd_scalar_fp, .two_op = 1 },
+    [0x18 ... 0x19] = { .simd_size = simd_scalar_opc, .two_op = 1 },
     [0x1a] = { .simd_size = simd_128, .two_op = 1 },
     [0x1c ... 0x1e] = { .simd_size = simd_packed_int, .two_op = 1 },
     [0x20 ... 0x25] = { .simd_size = simd_other, .two_op = 1 },
@@ -427,9 +433,30 @@ static const struct ext0f38_table {
     [0x8c] = { .simd_size = simd_other },
     [0x8e] = { .simd_size = simd_other, .to_mem = 1 },
     [0x90 ... 0x93] = { .simd_size = simd_other, .vsib = 1 },
-    [0x96 ... 0x9f] = { .simd_size = simd_packed_fp },
-    [0xa6 ... 0xaf] = { .simd_size = simd_packed_fp },
-    [0xb6 ... 0xbf] = { .simd_size = simd_packed_fp },
+    [0x96 ... 0x98] = { .simd_size = simd_packed_fp },
+    [0x99] = { .simd_size = simd_scalar_vexw },
+    [0x9a] = { .simd_size = simd_packed_fp },
+    [0x9b] = { .simd_size = simd_scalar_vexw },
+    [0x9c] = { .simd_size = simd_packed_fp },
+    [0x9d] = { .simd_size = simd_scalar_vexw },
+    [0x9e] = { .simd_size = simd_packed_fp },
+    [0x9f] = { .simd_size = simd_scalar_vexw },
+    [0xa6 ... 0xa8] = { .simd_size = simd_packed_fp },
+    [0xa9] = { .simd_size = simd_scalar_vexw },
+    [0xaa] = { .simd_size = simd_packed_fp },
+    [0xab] = { .simd_size = simd_scalar_vexw },
+    [0xac] = { .simd_size = simd_packed_fp },
+    [0xad] = { .simd_size = simd_scalar_vexw },
+    [0xae] = { .simd_size = simd_packed_fp },
+    [0xaf] = { .simd_size = simd_scalar_vexw },
+    [0xb6 ... 0xb8] = { .simd_size = simd_packed_fp },
+    [0xb9] = { .simd_size = simd_scalar_vexw },
+    [0xba] = { .simd_size = simd_packed_fp },
+    [0xbb] = { .simd_size = simd_scalar_vexw },
+    [0xbc] = { .simd_size = simd_packed_fp },
+    [0xbd] = { .simd_size = simd_scalar_vexw },
+    [0xbe] = { .simd_size = simd_packed_fp },
+    [0xbf] = { .simd_size = simd_scalar_vexw },
     [0xc8 ... 0xcd] = { .simd_size = simd_other },
     [0xdb] = { .simd_size = simd_packed_int, .two_op = 1 },
     [0xdc ... 0xdf] = { .simd_size = simd_packed_int },
@@ -454,7 +481,7 @@ static const struct ext0f3a_table {
     [0x04 ... 0x05] = { .simd_size = simd_packed_fp, .two_op = 1 },
     [0x06] = { .simd_size = simd_packed_fp },
     [0x08 ... 0x09] = { .simd_size = simd_packed_fp, .two_op = 1 },
-    [0x0a ... 0x0b] = { .simd_size = simd_scalar_fp },
+    [0x0a ... 0x0b] = { .simd_size = simd_scalar_opc },
     [0x0c ... 0x0d] = { .simd_size = simd_packed_fp },
     [0x0e ... 0x0f] = { .simd_size = simd_packed_int },
     [0x14 ... 0x17] = { .simd_size = simd_none, .to_mem = 1, .two_op = 1 },
@@ -476,13 +503,13 @@ static const struct ext0f3a_table {
     [0x5c ... 0x5f] = { .simd_size = simd_packed_fp, .four_op = 1 },
     [0x60 ... 0x63] = { .simd_size = simd_packed_int, .two_op = 1 },
     [0x68 ... 0x69] = { .simd_size = simd_packed_fp, .four_op = 1 },
-    [0x6a ... 0x6b] = { .simd_size = simd_scalar_fp, .four_op = 1 },
+    [0x6a ... 0x6b] = { .simd_size = simd_scalar_opc, .four_op = 1 },
     [0x6c ... 0x6d] = { .simd_size = simd_packed_fp, .four_op = 1 },
-    [0x6e ... 0x6f] = { .simd_size = simd_scalar_fp, .four_op = 1 },
+    [0x6e ... 0x6f] = { .simd_size = simd_scalar_opc, .four_op = 1 },
     [0x78 ... 0x79] = { .simd_size = simd_packed_fp, .four_op = 1 },
-    [0x7a ... 0x7b] = { .simd_size = simd_scalar_fp, .four_op = 1 },
+    [0x7a ... 0x7b] = { .simd_size = simd_scalar_opc, .four_op = 1 },
     [0x7c ... 0x7d] = { .simd_size = simd_packed_fp, .four_op = 1 },
-    [0x7e ... 0x7f] = { .simd_size = simd_scalar_fp, .four_op = 1 },
+    [0x7e ... 0x7f] = { .simd_size = simd_scalar_opc, .four_op = 1 },
     [0xcc] = { .simd_size = simd_other },
     [0xdf] = { .simd_size = simd_packed_int, .two_op = 1 },
     [0xf0] = {},
@@ -518,7 +545,7 @@ static const struct ext8f09_table {
 } ext8f09_table[256] = {
     [0x01 ... 0x02] = { .two_op = 1 },
     [0x80 ... 0x81] = { .simd_size = simd_packed_fp, .two_op = 1 },
-    [0x82 ... 0x83] = { .simd_size = simd_scalar_fp, .two_op = 1 },
+    [0x82 ... 0x83] = { .simd_size = simd_scalar_opc, .two_op = 1 },
     [0x90 ... 0x9b] = { .simd_size = simd_packed_int },
     [0xc1 ... 0xc3] = { .simd_size = simd_packed_int, .two_op = 1 },
     [0xc6 ... 0xc7] = { .simd_size = simd_packed_int, .two_op = 1 },
@@ -3132,10 +3159,14 @@ x86_decode(
         }
         break;
 
-    case simd_scalar_fp:
+    case simd_scalar_opc:
         op_bytes = 4 << (ctxt->opcode & 1);
         break;
 
+    case simd_scalar_vexw:
+        op_bytes = 4 << vex.w;
+        break;
+
     case simd_128:
         op_bytes = 16;
         break;
@@ -7747,33 +7778,33 @@ x86_emulate(
     case X86EMUL_OPC_VEX_66(0x0f38, 0x96): /* vfmaddsub132p{s,d} {x,y}mm/mem,{x,y}mm,{x,y}mm */
     case X86EMUL_OPC_VEX_66(0x0f38, 0x97): /* vfmsubadd132p{s,d} {x,y}mm/mem,{x,y}mm,{x,y}mm */
     case X86EMUL_OPC_VEX_66(0x0f38, 0x98): /* vfmadd132p{s,d} {x,y}mm/mem,{x,y}mm,{x,y}mm */
-    case X86EMUL_OPC_VEX_66(0x0f38, 0x99): /* vfmadd132s{s,d} {x,y}mm/mem,{x,y}mm,{x,y}mm */
+    case X86EMUL_OPC_VEX_66(0x0f38, 0x99): /* vfmadd132s{s,d} xmm/mem,xmm,xmm */
     case X86EMUL_OPC_VEX_66(0x0f38, 0x9a): /* vfmsub132p{s,d} {x,y}mm/mem,{x,y}mm,{x,y}mm */
-    case X86EMUL_OPC_VEX_66(0x0f38, 0x9b): /* vfmsub132s{s,d} {x,y}mm/mem,{x,y}mm,{x,y}mm */
+    case X86EMUL_OPC_VEX_66(0x0f38, 0x9b): /* vfmsub132s{s,d} xmm/mem,xmm,xmm */
     case X86EMUL_OPC_VEX_66(0x0f38, 0x9c): /* vfnmadd132p{s,d} {x,y}mm/mem,{x,y}mm,{x,y}mm */
-    case X86EMUL_OPC_VEX_66(0x0f38, 0x9d): /* vfnmadd132s{s,d} {x,y}mm/mem,{x,y}mm,{x,y}mm */
+    case X86EMUL_OPC_VEX_66(0x0f38, 0x9d): /* vfnmadd132s{s,d} xmm/mem,xmm,xmm */
     case X86EMUL_OPC_VEX_66(0x0f38, 0x9e): /* vfnmsub132p{s,d} {x,y}mm/mem,{x,y}mm,{x,y}mm */
-    case X86EMUL_OPC_VEX_66(0x0f38, 0x9f): /* vfnmsub132s{s,d} {x,y}mm/mem,{x,y}mm,{x,y}mm */
+    case X86EMUL_OPC_VEX_66(0x0f38, 0x9f): /* vfnmsub132s{s,d} xmm/mem,xmm,xmm */
     case X86EMUL_OPC_VEX_66(0x0f38, 0xa6): /* vfmaddsub213p{s,d} {x,y}mm/mem,{x,y}mm,{x,y}mm */
     case X86EMUL_OPC_VEX_66(0x0f38, 0xa7): /* vfmsubadd213p{s,d} {x,y}mm/mem,{x,y}mm,{x,y}mm */
     case X86EMUL_OPC_VEX_66(0x0f38, 0xa8): /* vfmadd213p{s,d} {x,y}mm/mem,{x,y}mm,{x,y}mm */
-    case X86EMUL_OPC_VEX_66(0x0f38, 0xa9): /* vfmadd213s{s,d} {x,y}mm/mem,{x,y}mm,{x,y}mm */
+    case X86EMUL_OPC_VEX_66(0x0f38, 0xa9): /* vfmadd213s{s,d} xmm/mem,xmm,xmm */
     case X86EMUL_OPC_VEX_66(0x0f38, 0xaa): /* vfmsub213p{s,d} {x,y}mm/mem,{x,y}mm,{x,y}mm */
-    case X86EMUL_OPC_VEX_66(0x0f38, 0xab): /* vfmsub213s{s,d} {x,y}mm/mem,{x,y}mm,{x,y}mm */
+    case X86EMUL_OPC_VEX_66(0x0f38, 0xab): /* vfmsub213s{s,d} xmm/mem,xmm,xmm */
     case X86EMUL_OPC_VEX_66(0x0f38, 0xac): /* vfnmadd213p{s,d} {x,y}mm/mem,{x,y}mm,{x,y}mm */
-    case X86EMUL_OPC_VEX_66(0x0f38, 0xad): /* vfnmadd213s{s,d} {x,y}mm/mem,{x,y}mm,{x,y}mm */
+    case X86EMUL_OPC_VEX_66(0x0f38, 0xad): /* vfnmadd213s{s,d} xmm/mem,xmm,xmm */
     case X86EMUL_OPC_VEX_66(0x0f38, 0xae): /* vfnmsub213p{s,d} {x,y}mm/mem,{x,y}mm,{x,y}mm */
-    case X86EMUL_OPC_VEX_66(0x0f38, 0xaf): /* vfnmsub213s{s,d} {x,y}mm/mem,{x,y}mm,{x,y}mm */
+    case X86EMUL_OPC_VEX_66(0x0f38, 0xaf): /* vfnmsub213s{s,d} xmm/mem,xmm,xmm */
     case X86EMUL_OPC_VEX_66(0x0f38, 0xb6): /* vfmaddsub231p{s,d} {x,y}mm/mem,{x,y}mm,{x,y}mm */
     case X86EMUL_OPC_VEX_66(0x0f38, 0xb7): /* vfmsubadd231p{s,d} {x,y}mm/mem,{x,y}mm,{x,y}mm */
     case X86EMUL_OPC_VEX_66(0x0f38, 0xb8): /* vfmadd231p{s,d} {x,y}mm/mem,{x,y}mm,{x,y}mm */
-    case X86EMUL_OPC_VEX_66(0x0f38, 0xb9): /* vfmadd231s{s,d} {x,y}mm/mem,{x,y}mm,{x,y}mm */
+    case X86EMUL_OPC_VEX_66(0x0f38, 0xb9): /* vfmadd231s{s,d} xmm/mem,xmm,xmm */
     case X86EMUL_OPC_VEX_66(0x0f38, 0xba): /* vfmsub231p{s,d} {x,y}mm/mem,{x,y}mm,{x,y}mm */
-    case X86EMUL_OPC_VEX_66(0x0f38, 0xbb): /* vfmsub231s{s,d} {x,y}mm/mem,{x,y}mm,{x,y}mm */
+    case X86EMUL_OPC_VEX_66(0x0f38, 0xbb): /* vfmsub231s{s,d} xmm/mem,xmm,xmm */
     case X86EMUL_OPC_VEX_66(0x0f38, 0xbc): /* vfnmadd231p{s,d} {x,y}mm/mem,{x,y}mm,{x,y}mm */
-    case X86EMUL_OPC_VEX_66(0x0f38, 0xbd): /* vfnmadd231s{s,d} {x,y}mm/mem,{x,y}mm,{x,y}mm */
+    case X86EMUL_OPC_VEX_66(0x0f38, 0xbd): /* vfnmadd231s{s,d} xmm/mem,xmm,xmm */
     case X86EMUL_OPC_VEX_66(0x0f38, 0xbe): /* vfnmsub231p{s,d} {x,y}mm/mem,{x,y}mm,{x,y}mm */
-    case X86EMUL_OPC_VEX_66(0x0f38, 0xbf): /* vfnmsub231s{s,d} {x,y}mm/mem,{x,y}mm,{x,y}mm */
+    case X86EMUL_OPC_VEX_66(0x0f38, 0xbf): /* vfnmsub231s{s,d} xmm/mem,xmm,xmm */
         host_and_vcpu_must_have(fma);
         goto simd_0f_ymm;
 




_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

^ permalink raw reply	[flat|nested] 465+ messages in thread

* [PATCH v2 2/6] x86emul: extend MASKMOV{Q,DQU} tests
  2018-08-29 14:20 ` [PATCH v2 0/6] x86emul: fixes, improvements, and beginnings of AVX512 support Jan Beulich
  2018-08-29 14:23   ` [PATCH v2 1/6] x86emul: fix FMA scalar operand sizes Jan Beulich
@ 2018-08-29 14:23   ` Jan Beulich
  2018-09-03 16:44     ` [PATCH v2 2/6] x86emul: extend MASKMOV{Q, DQU} tests Andrew Cooper
  2018-08-29 14:24   ` [PATCH v2 3/6] x86emul: support AVX512 opmask insns Jan Beulich
                     ` (3 subsequent siblings)
  5 siblings, 1 reply; 465+ messages in thread
From: Jan Beulich @ 2018-08-29 14:23 UTC (permalink / raw)
  To: xen-devel; +Cc: Andrew Cooper

While deriving the first AVX512 pieces from existing code I got the
(in the end wrong) impression that the emulation of these insns was
broken. Besides testing that the instructions act as no-ops when the
controlling mask bits are all zero, add tests to also check that the
data merging actually works.

Signed-off-by: Jan Beulich <jbeulich@suse.com>

--- a/tools/tests/x86_emulator/test_x86_emulator.c
+++ b/tools/tests/x86_emulator/test_x86_emulator.c
@@ -2626,7 +2626,7 @@ int main(int argc, char **argv)
         printf("skipped\n");
 #endif
 
-    printf("%-40s", "Testing maskmovq (zero mask)...");
+    printf("%-40s", "Testing maskmovq %mm4,%mm4...");
     if ( stack_exec && cpu_has_sse )
     {
         decl_insn(maskmovq);
@@ -2639,12 +2639,25 @@ int main(int argc, char **argv)
         rc = x86_emulate(&ctxt, &emulops);
         if ( rc != X86EMUL_OKAY || !check_eip(maskmovq) )
             goto fail;
+
+        asm volatile ( "pcmpeqb %mm3, %mm3\n\t"
+                       "punpcklbw %mm3, %mm4\n" );
+        memset(res, 0x55, 24);
+
+        set_insn(maskmovq);
+        regs.edi = (unsigned long)(res + 2);
+        rc = x86_emulate(&ctxt, &emulops);
+        if ( rc != X86EMUL_OKAY || !check_eip(maskmovq) ||
+             memcmp(res, res + 4, 8) ||
+             res[2] != 0xff55ff55 || res[3] != 0xff55ff55 )
+            goto fail;
+
         printf("okay\n");
     }
     else
         printf("skipped\n");
 
-    printf("%-40s", "Testing maskmovdqu (zero mask)...");
+    printf("%-40s", "Testing maskmovdqu %xmm3,%xmm3...");
     if ( stack_exec && cpu_has_sse2 )
     {
         decl_insn(maskmovdqu);
@@ -2653,9 +2666,24 @@ int main(int argc, char **argv)
                        put_insn(maskmovdqu, "maskmovdqu %xmm3, %xmm3") );
 
         set_insn(maskmovdqu);
+        regs.edi = 0;
         rc = x86_emulate(&ctxt, &emulops);
         if ( rc != X86EMUL_OKAY || !check_eip(maskmovdqu) )
             goto fail;
+
+        asm volatile ( "pcmpeqb %xmm4, %xmm4\n\t"
+                       "punpcklbw %xmm4, %xmm3\n" );
+        memset(res, 0x55, 48);
+
+        set_insn(maskmovdqu);
+        regs.edi = (unsigned long)(res + 4);
+        rc = x86_emulate(&ctxt, &emulops);
+        if ( rc != X86EMUL_OKAY || !check_eip(maskmovdqu) ||
+             memcmp(res, res + 8, 16) ||
+             res[4] != 0xff55ff55 || res[5] != 0xff55ff55 ||
+             res[6] != 0xff55ff55 || res[7] != 0xff55ff55 )
+            goto fail;
+
         printf("okay\n");
     }
     else





_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

^ permalink raw reply	[flat|nested] 465+ messages in thread

* [PATCH v2 3/6] x86emul: support AVX512 opmask insns
  2018-08-29 14:20 ` [PATCH v2 0/6] x86emul: fixes, improvements, and beginnings of AVX512 support Jan Beulich
  2018-08-29 14:23   ` [PATCH v2 1/6] x86emul: fix FMA scalar operand sizes Jan Beulich
  2018-08-29 14:23   ` [PATCH v2 2/6] x86emul: extend MASKMOV{Q,DQU} tests Jan Beulich
@ 2018-08-29 14:24   ` Jan Beulich
  2018-09-03 17:57     ` Andrew Cooper
  2018-08-29 14:24   ` [PATCH v2 4/6] x86emul: clean up AVX2 insn use in test harness Jan Beulich
                     ` (2 subsequent siblings)
  5 siblings, 1 reply; 465+ messages in thread
From: Jan Beulich @ 2018-08-29 14:24 UTC (permalink / raw)
  To: xen-devel; +Cc: Andrew Cooper

These are all VEX encoded, so the EVEX decoding logic remains unused
at this point.

The new testcase is deliberately coded in assembly, as a C one would
have become almost unreadable due to the overwhelming number of
__builtin_...() invocations that would need to be used. After all, the
compiler has no underlying type (yet) that could be operated on without
builtins, other than the vector types used for "normal" SIMD insns.

Signed-off-by: Jan Beulich <jbeulich@suse.com>

--- a/tools/tests/x86_emulator/Makefile
+++ b/tools/tests/x86_emulator/Makefile
@@ -16,6 +16,8 @@ FMA := fma4 fma
 SG := avx2-sg
 TESTCASES := blowfish $(SIMD) $(FMA) $(SG)
 
+OPMASK := avx512f avx512dq avx512bw
+
 blowfish-cflags := ""
 blowfish-cflags-x86_32 := "-mno-accumulate-outgoing-args -Dstatic="
 
@@ -51,6 +53,10 @@ xop-vecs := $(avx-vecs)
 xop-ints := 1 2 4 8
 xop-flts := $(avx-flts)
 
+avx512f-opmask-vecs := 2
+avx512dq-opmask-vecs := 1
+avx512bw-opmask-vecs := 4 8
+
 # For AVX and later, have the compiler avoid XMM0 to widen coverage of
 # the VEX.vvvv checks in the emulator.  For 3DNow!, however, force SSE
 # use for floating point operations, to avoid mixing MMX and FPU register
@@ -80,9 +86,13 @@ $(1)-cflags := \
 	   $(foreach flt,$($(1)-flts), \
 	     "-D_$(vec)x$(idx)f$(flt) -m$(1:-sg=) $(call non-sse,$(1)) -Os -DVEC_MAX=$(vec) -DIDX_SIZE=$(idx) -DFLOAT_SIZE=$(flt)")))
 endef
+define opmask-defs
+$(1)-opmask-cflags := $(foreach vec,$($(1)-opmask-vecs), "-D_$(vec) -m$(1) -Os -DSIZE=$(vec)")
+endef
 
 $(foreach flavor,$(SIMD) $(FMA),$(eval $(call simd-defs,$(flavor))))
 $(foreach flavor,$(SG),$(eval $(call simd-sg-defs,$(flavor))))
+$(foreach flavor,$(OPMASK),$(eval $(call opmask-defs,$(flavor))))
 
 $(addsuffix .h,$(TESTCASES)): %.h: %.c testcase.mk Makefile
 	rm -f $@.new $*.bin
@@ -100,6 +110,22 @@ $(addsuffix .h,$(TESTCASES)): %.h: %.c t
 	)
 	mv $@.new $@
 
+$(addsuffix -opmask.h,$(OPMASK)): %.h: opmask.S testcase.mk Makefile
+	rm -f $@.new $*.bin
+	$(foreach arch,$(filter-out $(XEN_COMPILE_ARCH),x86_32) $(XEN_COMPILE_ARCH), \
+	    for cflags in $($*-cflags) $($*-cflags-$(arch)); do \
+		$(MAKE) -f testcase.mk TESTCASE=$* XEN_TARGET_ARCH=$(arch) $*-cflags="$$cflags" all; \
+		prefix=$(shell echo $(subst -,_,$*) | sed -e 's,^\([0-9]\),_\1,'); \
+		flavor=$$(echo $${cflags} | sed -e 's, .*,,' -e 'y,-=,__,') ; \
+		(echo 'static const unsigned int __attribute__((section(".test, \"ax\", @progbits #")))' \
+		      "$${prefix}_$(arch)$${flavor}[] = {"; \
+		 od -v -t x $*.bin | sed -e 's/^[0-9]* /0x/' -e 's/ /, 0x/g' -e 's/$$/,/'; \
+		 echo "};") >>$@.new; \
+		rm -f $*.bin; \
+	    done; \
+	)
+	mv $@.new $@
+
 $(addsuffix .c,$(SIMD)):
 	ln -sf simd.c $@
 
@@ -145,4 +171,4 @@ x86-emulate.o test_x86_emulator.o wrappe
 x86-emulate.o: x86_emulate/x86_emulate.c
 x86-emulate.o: HOSTCFLAGS += -D__XEN_TOOLS__
 
-test_x86_emulator.o: $(addsuffix .h,$(TESTCASES))
+test_x86_emulator.o: $(addsuffix .h,$(TESTCASES)) $(addsuffix -opmask.h,$(OPMASK))
--- /dev/null
+++ b/tools/tests/x86_emulator/opmask.S
@@ -0,0 +1,144 @@
+#ifdef __i386__
+# define R(x) e##x
+# define DATA(x) x
+#else
+# if SIZE == 8
+#  define R(x) r##x
+# else
+#  define R(x) e##x
+# endif
+# define DATA(x) x(%rip)
+#endif
+
+#if SIZE == 1
+# define _(x) x##b
+#elif SIZE == 2
+# define _(x) x##w
+# define WIDEN(x) x##bw
+#elif SIZE == 4
+# define _(x) x##d
+# define WIDEN(x) x##wd
+#elif SIZE == 8
+# define _(x) x##q
+# define WIDEN(x) x##dq
+#endif
+
+    .macro check res1:req, res2:req, line:req
+    _(kmov)       %\res1, DATA(out)
+#if SIZE < 8 || !defined(__i386__)
+    _(kmov)       %\res2, %R(dx)
+    cmp           DATA(out), %R(dx)
+#else
+    sub           $8, %esp
+    kmovq         %\res2, (%esp)
+    pop           %ecx
+    pop           %edx
+    cmp           DATA(out), %ecx
+    jne           0f
+    cmp           DATA(out+4), %edx
+0:
+#endif
+    je            1f
+    mov           $\line, %eax
+    ret
+1:
+    .endm
+
+    .text
+    .globl _start
+_start:
+    _(kmov)       DATA(in1), %k1
+#if SIZE < 8 || !defined(__i386__)
+    mov           DATA(in2), %R(ax)
+    _(kmov)       %R(ax), %k2
+#else
+    _(kmov)       DATA(in2), %k2
+#endif
+
+    _(kor)        %k1, %k2, %k3
+    _(kand)       %k1, %k2, %k4
+    _(kandn)      %k3, %k4, %k5
+    _(kxor)       %k1, %k2, %k6
+    check         k5, k6, __LINE__
+
+    _(knot)       %k6, %k3
+    _(kxnor)      %k1, %k2, %k4
+    check         k3, k4, __LINE__
+
+    _(kshiftl)    $1, %k1, %k3
+    _(kshiftl)    $2, %k3, %k4
+    _(kshiftl)    $3, %k1, %k5
+    check         k4, k5, __LINE__
+
+    _(kshiftr)    $1, %k1, %k3
+    _(kshiftr)    $2, %k3, %k4
+    _(kshiftr)    $3, %k1, %k5
+    check         k4, k5, __LINE__
+
+    _(kortest)    %k6, %k6
+    jnbe          1f
+    mov           $__LINE__, %eax
+    ret
+1:
+
+    _(kxor)       %k0, %k0, %k3
+    _(kortest)    %k3, %k3
+    jz            1f
+    mov           $__LINE__, %eax
+    ret
+1:
+
+    _(kxnor)      %k0, %k0, %k3
+    _(kortest)    %k3, %k3
+    jc            1f
+    mov           $__LINE__, %eax
+    ret
+1:
+
+#if SIZE > 1
+
+    _(kshiftr)    $SIZE*4, %k3, %k4
+    WIDEN(kunpck) %k4, %k4, %k5
+    check         k3, k5, __LINE__
+
+#endif
+
+#if SIZE != 2 || defined(__AVX512DQ__)
+
+    _(kadd)       %k1, %k1, %k3
+    _(kshiftl)    $1, %k1, %k4
+    check         k3, k4, __LINE__
+
+    _(ktest)      %k2, %k1
+    jnbe          1f
+    mov           $__LINE__, %eax
+    ret
+1:
+
+    _(kxor)       %k0, %k0, %k3
+    _(ktest)      %k0, %k3
+    jz            1f
+    mov           $__LINE__, %eax
+    ret
+1:
+
+    _(kxnor)      %k0, %k0, %k4
+    _(ktest)      %k0, %k4
+    jc            1f
+    mov           $__LINE__, %eax
+    ret
+1:
+
+#endif
+
+    xor           %eax, %eax
+    ret
+
+    .section .rodata, "a", @progbits
+    .balign 8
+in1: .byte 0b10110011, 0b10001111, 0b00001111, 0b10000011, 0b11110000, 0b00111111, 0b10000000, 0b11111111
+in2: .byte 0b11111111, 0b00000001, 0b11111100, 0b00001111, 0b11000001, 0b11110000, 0b11110001, 0b11001101
+
+    .data
+    .balign 8
+out: .quad 0
--- a/tools/tests/x86_emulator/test_x86_emulator.c
+++ b/tools/tests/x86_emulator/test_x86_emulator.c
@@ -18,6 +18,9 @@ asm ( ".pushsection .test, \"ax\", @prog
 #include "avx2.h"
 #include "avx2-sg.h"
 #include "xop.h"
+#include "avx512f-opmask.h"
+#include "avx512dq-opmask.h"
+#include "avx512bw-opmask.h"
 
 #define verbose false /* Switch to true for far more logging. */
 
@@ -78,6 +81,24 @@ static bool simd_check_xop(void)
     return cpu_has_xop;
 }
 
+static bool simd_check_avx512f(void)
+{
+    return cpu_has_avx512f;
+}
+#define simd_check_avx512f_opmask simd_check_avx512f
+
+static bool simd_check_avx512dq(void)
+{
+    return cpu_has_avx512dq;
+}
+#define simd_check_avx512dq_opmask simd_check_avx512dq
+
+static bool simd_check_avx512bw(void)
+{
+    return cpu_has_avx512bw;
+}
+#define simd_check_avx512bw_opmask simd_check_avx512bw
+
 static void simd_set_regs(struct cpu_user_regs *regs)
 {
     if ( cpu_has_mmx )
@@ -223,6 +244,10 @@ static const struct {
     SIMD(XOP i16x16,              xop,      32i2),
     SIMD(XOP i32x8,               xop,      32i4),
     SIMD(XOP i64x4,               xop,      32i8),
+    SIMD(OPMASK/w,     avx512f_opmask,         2),
+    SIMD(OPMASK/b,    avx512dq_opmask,         1),
+    SIMD(OPMASK/d,    avx512bw_opmask,         4),
+    SIMD(OPMASK/q,    avx512bw_opmask,         8),
 #undef SIMD_
 #undef SIMD
 };
@@ -3469,8 +3494,8 @@ int main(int argc, char **argv)
             rc = x86_emulate(&ctxt, &emulops);
             if ( rc != X86EMUL_OKAY )
             {
-                printf("failed at %%eip == %08lx (opcode %08x)\n",
-                       (unsigned long)regs.eip, ctxt.opcode);
+                printf("failed (%d) at %%eip == %08lx (opcode %08x)\n",
+                       rc, (unsigned long)regs.eip, ctxt.opcode);
                 return 1;
             }
         }
--- a/tools/tests/x86_emulator/testcase.mk
+++ b/tools/tests/x86_emulator/testcase.mk
@@ -14,3 +14,9 @@ all: $(TESTCASE).bin
 	$(LD) $(LDFLAGS_DIRECT) -N -Ttext 0x100000 -o $*.tmp $*.o
 	$(OBJCOPY) -O binary $*.tmp $@
 	rm -f $*.tmp
+
+%-opmask.bin: opmask.S
+	$(CC) $(filter-out -M% .%,$(CFLAGS)) -c $< -o $*.o
+	$(LD) $(LDFLAGS_DIRECT) -N -Ttext 0x100000 -o $*.tmp $*.o
+	$(OBJCOPY) -O binary $*.tmp $@
+	rm -f $*.tmp
--- a/tools/tests/x86_emulator/x86-emulate.c
+++ b/tools/tests/x86_emulator/x86-emulate.c
@@ -209,6 +209,9 @@ int emul_test_get_fpu(
     case X86EMUL_FPU_ymm:
         if ( cpu_has_avx )
             break;
+    case X86EMUL_FPU_opmask:
+        if ( cpu_has_avx512f )
+            break;
     default:
         return X86EMUL_UNHANDLEABLE;
     }
--- a/tools/tests/x86_emulator/x86-emulate.h
+++ b/tools/tests/x86_emulator/x86-emulate.h
@@ -236,6 +236,36 @@ static inline uint64_t xgetbv(uint32_t x
     (res.c & (1U << 21)) != 0; \
 })
 
+#define cpu_has_avx512f ({ \
+    struct cpuid_leaf res; \
+    emul_test_cpuid(1, 0, &res, NULL); \
+    if ( !(res.c & (1U << 27)) || ((xgetbv(0) & 0xe6) != 0xe6) ) \
+        res.b = 0; \
+    else \
+        emul_test_cpuid(7, 0, &res, NULL); \
+    (res.b & (1U << 16)) != 0; \
+})
+
+#define cpu_has_avx512dq ({ \
+    struct cpuid_leaf res; \
+    emul_test_cpuid(1, 0, &res, NULL); \
+    if ( !(res.c & (1U << 27)) || ((xgetbv(0) & 0xe6) != 0xe6) ) \
+        res.b = 0; \
+    else \
+        emul_test_cpuid(7, 0, &res, NULL); \
+    (res.b & (1U << 17)) != 0; \
+})
+
+#define cpu_has_avx512bw ({ \
+    struct cpuid_leaf res; \
+    emul_test_cpuid(1, 0, &res, NULL); \
+    if ( !(res.c & (1U << 27)) || ((xgetbv(0) & 0xe6) != 0xe6) ) \
+        res.b = 0; \
+    else \
+        emul_test_cpuid(7, 0, &res, NULL); \
+    (res.b & (1U << 30)) != 0; \
+})
+
 int emul_test_cpuid(
     uint32_t leaf,
     uint32_t subleaf,
--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -491,6 +491,7 @@ static const struct ext0f3a_table {
     [0x20] = { .simd_size = simd_none },
     [0x21] = { .simd_size = simd_other },
     [0x22] = { .simd_size = simd_none },
+    [0x30 ... 0x33] = { .simd_size = simd_other, .two_op = 1 },
     [0x38] = { .simd_size = simd_128 },
     [0x39] = { .simd_size = simd_128, .to_mem = 1, .two_op = 1 },
     [0x40 ... 0x41] = { .simd_size = simd_packed_fp },
@@ -1187,6 +1188,11 @@ static int _get_fpu(
             return X86EMUL_UNHANDLEABLE;
         break;
 
+    case X86EMUL_FPU_opmask:
+        if ( !(xcr0 & X86_XCR0_SSE) || !(xcr0 & X86_XCR0_OPMASK) )
+            return X86EMUL_UNHANDLEABLE;
+        break;
+
     default:
         break;
     }
@@ -1762,12 +1768,15 @@ static bool vcpu_has(
 #define vcpu_has_bmi2()        vcpu_has(         7, EBX,  8, ctxt, ops)
 #define vcpu_has_rtm()         vcpu_has(         7, EBX, 11, ctxt, ops)
 #define vcpu_has_mpx()         vcpu_has(         7, EBX, 14, ctxt, ops)
+#define vcpu_has_avx512f()     vcpu_has(         7, EBX, 16, ctxt, ops)
+#define vcpu_has_avx512dq()    vcpu_has(         7, EBX, 17, ctxt, ops)
 #define vcpu_has_rdseed()      vcpu_has(         7, EBX, 18, ctxt, ops)
 #define vcpu_has_adx()         vcpu_has(         7, EBX, 19, ctxt, ops)
 #define vcpu_has_smap()        vcpu_has(         7, EBX, 20, ctxt, ops)
 #define vcpu_has_clflushopt()  vcpu_has(         7, EBX, 23, ctxt, ops)
 #define vcpu_has_clwb()        vcpu_has(         7, EBX, 24, ctxt, ops)
 #define vcpu_has_sha()         vcpu_has(         7, EBX, 29, ctxt, ops)
+#define vcpu_has_avx512bw()    vcpu_has(         7, EBX, 30, ctxt, ops)
 #define vcpu_has_rdpid()       vcpu_has(         7, ECX, 22, ctxt, ops)
 #define vcpu_has_clzero()      vcpu_has(0x80000008, EBX,  0, ctxt, ops)
 
@@ -2396,6 +2405,18 @@ x86_decode_twobyte(
         }
         break;
 
+    case X86EMUL_OPC_VEX(0, 0x90):    /* kmov{w,q} */
+    case X86EMUL_OPC_VEX_66(0, 0x90): /* kmov{b,d} */
+        state->desc = DstReg | SrcMem | Mov;
+        state->simd_size = simd_other;
+        break;
+
+    case X86EMUL_OPC_VEX(0, 0x91):    /* kmov{w,q} */
+    case X86EMUL_OPC_VEX_66(0, 0x91): /* kmov{b,d} */
+        state->desc = DstMem | SrcReg | Mov;
+        state->simd_size = simd_other;
+        break;
+
     case 0xae:
         ctxt->opcode |= MASK_INSR(vex.pfx, X86EMUL_OPC_PFX_MASK);
         /* fall through */
@@ -6002,6 +6023,60 @@ x86_emulate(
             dst.val = src.val;
         break;
 
+    case X86EMUL_OPC_VEX(0x0f, 0x4a):    /* kadd{w,q} k,k,k */
+        if ( !vex.w )
+            host_and_vcpu_must_have(avx512dq);
+        /* fall through */
+    case X86EMUL_OPC_VEX(0x0f, 0x41):    /* kand{w,q} k,k,k */
+    case X86EMUL_OPC_VEX_66(0x0f, 0x41): /* kand{b,d} k,k,k */
+    case X86EMUL_OPC_VEX(0x0f, 0x42):    /* kandn{w,q} k,k,k */
+    case X86EMUL_OPC_VEX_66(0x0f, 0x42): /* kandn{b,d} k,k,k */
+    case X86EMUL_OPC_VEX(0x0f, 0x45):    /* kor{w,q} k,k,k */
+    case X86EMUL_OPC_VEX_66(0x0f, 0x45): /* kor{b,d} k,k,k */
+    case X86EMUL_OPC_VEX(0x0f, 0x46):    /* kxnor{w,q} k,k,k */
+    case X86EMUL_OPC_VEX_66(0x0f, 0x46): /* kxnor{b,d} k,k,k */
+    case X86EMUL_OPC_VEX(0x0f, 0x47):    /* kxor{w,q} k,k,k */
+    case X86EMUL_OPC_VEX_66(0x0f, 0x47): /* kxor{b,d} k,k,k */
+    case X86EMUL_OPC_VEX_66(0x0f, 0x4a): /* kadd{b,d} k,k,k */
+        generate_exception_if(!vex.l, EXC_UD);
+    opmask_basic:
+        if ( vex.w )
+            host_and_vcpu_must_have(avx512bw);
+        else if ( vex.pfx )
+            host_and_vcpu_must_have(avx512dq);
+    opmask_common:
+        host_and_vcpu_must_have(avx512f);
+        generate_exception_if(!vex.r || (mode_64bit() && !(vex.reg & 8)) ||
+                              ea.type != OP_REG, EXC_UD);
+
+        vex.reg |= 8;
+        d &= ~TwoOp;
+
+        get_fpu(X86EMUL_FPU_opmask);
+
+        opc = init_prefixes(stub);
+        opc[0] = b;
+        opc[1] = modrm;
+        insn_bytes = PFX_BYTES + 2;
+
+        state->simd_size = simd_other;
+        op_bytes = 1; /* Any non-zero value will do. */
+        break;
+
+    case X86EMUL_OPC_VEX(0x0f, 0x44):    /* knot{w,q} k,k */
+    case X86EMUL_OPC_VEX_66(0x0f, 0x44): /* knot{b,d} k,k */
+        generate_exception_if(vex.l || vex.reg != 0xf, EXC_UD);
+        goto opmask_basic;
+
+    case X86EMUL_OPC_VEX(0x0f, 0x4b):    /* kunpck{w,d}{d,q} k,k,k */
+        generate_exception_if(!vex.l, EXC_UD);
+        host_and_vcpu_must_have(avx512bw);
+        goto opmask_common;
+
+    case X86EMUL_OPC_VEX_66(0x0f, 0x4b): /* kunpckbw k,k,k */
+        generate_exception_if(!vex.l || vex.w, EXC_UD);
+        goto opmask_common;
+
     CASE_SIMD_PACKED_FP(, 0x0f, 0x50):     /* movmskp{s,d} xmm,reg */
     CASE_SIMD_PACKED_FP(_VEX, 0x0f, 0x50): /* vmovmskp{s,d} {x,y}mm,reg */
     CASE_SIMD_PACKED_INT(0x0f, 0xd7):      /* pmovmskb {,x}mm,reg */
@@ -6552,6 +6627,154 @@ x86_emulate(
         dst.val = test_cc(b, _regs.eflags);
         break;
 
+    case X86EMUL_OPC_VEX(0x0f, 0x91):    /* kmov{w,q} k,mem */
+    case X86EMUL_OPC_VEX_66(0x0f, 0x91): /* kmov{b,d} k,mem */
+        generate_exception_if(ea.type != OP_MEM, EXC_UD);
+        /* fall through */
+    case X86EMUL_OPC_VEX(0x0f, 0x90):    /* kmov{w,q} k/mem,k */
+    case X86EMUL_OPC_VEX_66(0x0f, 0x90): /* kmov{b,d} k/mem,k */
+        generate_exception_if(vex.l || !vex.r, EXC_UD);
+        host_and_vcpu_must_have(avx512f);
+        if ( vex.w )
+        {
+            host_and_vcpu_must_have(avx512bw);
+            op_bytes = 4 << !vex.pfx;
+        }
+        else if ( vex.pfx )
+        {
+            host_and_vcpu_must_have(avx512dq);
+            op_bytes = 1;
+        }
+        else
+            op_bytes = 2;
+
+        get_fpu(X86EMUL_FPU_opmask);
+
+        opc = init_prefixes(stub);
+        opc[0] = b;
+        opc[1] = modrm;
+        if ( ea.type == OP_MEM )
+        {
+            /* convert memory operand to (%rAX) */
+            vex.b = 1;
+            opc[1] &= 0x38;
+        }
+        insn_bytes = PFX_BYTES + 2;
+        break;
+
+    case X86EMUL_OPC_VEX(0x0f, 0x92):    /* kmovw r32,k */
+    case X86EMUL_OPC_VEX_66(0x0f, 0x92): /* kmovb r32,k */
+    case X86EMUL_OPC_VEX_F2(0x0f, 0x92): /* kmov{d,q} reg,k */
+        generate_exception_if(vex.l || !vex.r || vex.reg != 0xf ||
+                              ea.type != OP_REG, EXC_UD);
+
+        host_and_vcpu_must_have(avx512f);
+        if ( vex.pfx == vex_f2 )
+            host_and_vcpu_must_have(avx512bw);
+        else
+        {
+            generate_exception_if(vex.w, EXC_UD);
+            if ( vex.pfx )
+                host_and_vcpu_must_have(avx512dq);
+        }
+
+        get_fpu(X86EMUL_FPU_opmask);
+
+        opc = init_prefixes(stub);
+        opc[0] = b;
+        /* Convert GPR source to %rAX. */
+        vex.b = 1;
+        if ( !mode_64bit() )
+            vex.w = 0;
+        opc[1] = modrm & 0xf8;
+        opc[2] = 0xc3;
+
+        copy_VEX(opc, vex);
+        ea.reg = decode_gpr(&_regs, modrm_rm);
+        invoke_stub("", "", "=m" (dummy) : "a" (*ea.reg));
+
+        put_stub(stub);
+
+        ASSERT(!state->simd_size);
+        dst.type = OP_NONE;
+        break;
+
+    case X86EMUL_OPC_VEX(0x0f, 0x93):    /* kmovw k,r32 */
+    case X86EMUL_OPC_VEX_66(0x0f, 0x93): /* kmovb k,r32 */
+    case X86EMUL_OPC_VEX_F2(0x0f, 0x93): /* kmov{d,q} k,reg */
+        generate_exception_if(vex.l || vex.reg != 0xf || ea.type != OP_REG,
+                              EXC_UD);
+        dst = ea;
+        dst.reg = decode_gpr(&_regs, modrm_reg);
+
+        host_and_vcpu_must_have(avx512f);
+        if ( vex.pfx == vex_f2 )
+        {
+            host_and_vcpu_must_have(avx512bw);
+            dst.bytes = 4 << (mode_64bit() && vex.w);
+        }
+        else
+        {
+            generate_exception_if(vex.w, EXC_UD);
+            dst.bytes = 4;
+            if ( vex.pfx )
+                host_and_vcpu_must_have(avx512dq);
+        }
+
+        get_fpu(X86EMUL_FPU_opmask);
+
+        opc = init_prefixes(stub);
+        opc[0] = b;
+        /* Convert GPR destination to %rAX. */
+        vex.r = 1;
+        if ( !mode_64bit() )
+            vex.w = 0;
+        opc[1] = modrm & 0xc7;
+        opc[2] = 0xc3;
+
+        copy_VEX(opc, vex);
+        invoke_stub("", "", "=a" (dst.val) : [dummy] "i" (0));
+
+        put_stub(stub);
+
+        ASSERT(!state->simd_size);
+        break;
+
+    case X86EMUL_OPC_VEX(0x0f, 0x99):    /* ktest{w,q} k,k */
+        if ( !vex.w )
+            host_and_vcpu_must_have(avx512dq);
+        /* fall through */
+    case X86EMUL_OPC_VEX(0x0f, 0x98):    /* kortest{w,q} k,k */
+    case X86EMUL_OPC_VEX_66(0x0f, 0x98): /* kortest{b,d} k,k */
+    case X86EMUL_OPC_VEX_66(0x0f, 0x99): /* ktest{b,d} k,k */
+        generate_exception_if(vex.l || !vex.r || vex.reg != 0xf ||
+                              ea.type != OP_REG, EXC_UD);
+        host_and_vcpu_must_have(avx512f);
+        if ( vex.w )
+            host_and_vcpu_must_have(avx512bw);
+        else if ( vex.pfx )
+            host_and_vcpu_must_have(avx512dq);
+
+        get_fpu(X86EMUL_FPU_opmask);
+
+        opc = init_prefixes(stub);
+        opc[0] = b;
+        opc[1] = modrm;
+        opc[2] = 0xc3;
+
+        copy_VEX(opc, vex);
+        invoke_stub(_PRE_EFLAGS("[eflags]", "[mask]", "[tmp]"),
+                    _POST_EFLAGS("[eflags]", "[mask]", "[tmp]"),
+                    [eflags] "+g" (_regs.eflags),
+                    "=a" (dst.val), [tmp] "=&r" (dummy)
+                    : [mask] "i" (EFLAGS_MASK));
+
+        put_stub(stub);
+
+        ASSERT(!state->simd_size);
+        dst.type = OP_NONE;
+        break;
+
     case X86EMUL_OPC(0x0f, 0xa2): /* cpuid */
         msr_val = 0;
         fail_if(ops->cpuid == NULL);
@@ -8170,6 +8393,23 @@ x86_emulate(
         generate_exception_if(vex.l, EXC_UD);
         goto simd_0f_imm8_avx;
 
+    case X86EMUL_OPC_VEX_66(0x0f3a, 0x30): /* kshiftr{b,w} $imm8,k,k */
+    case X86EMUL_OPC_VEX_66(0x0f3a, 0x32): /* kshiftl{b,w} $imm8,k,k */
+        if ( !vex.w )
+            host_and_vcpu_must_have(avx512dq);
+    opmask_shift_imm:
+        generate_exception_if(vex.l || !vex.r || vex.reg != 0xf ||
+                              ea.type != OP_REG, EXC_UD);
+        host_and_vcpu_must_have(avx512f);
+        get_fpu(X86EMUL_FPU_opmask);
+        op_bytes = 1; /* Any non-zero value will do. */
+        goto simd_0f_imm8;
+
+    case X86EMUL_OPC_VEX_66(0x0f3a, 0x31): /* kshiftr{d,q} $imm8,k,k */
+    case X86EMUL_OPC_VEX_66(0x0f3a, 0x33): /* kshiftl{d,q} $imm8,k,k */
+        host_and_vcpu_must_have(avx512bw);
+        goto opmask_shift_imm;
+
     case X86EMUL_OPC_66(0x0f3a, 0x44):     /* pclmulqdq $imm8,xmm/m128,xmm */
     case X86EMUL_OPC_VEX_66(0x0f3a, 0x44): /* vpclmulqdq $imm8,xmm/m128,xmm,xmm */
         host_and_vcpu_must_have(pclmulqdq);
--- a/xen/arch/x86/x86_emulate/x86_emulate.h
+++ b/xen/arch/x86/x86_emulate/x86_emulate.h
@@ -170,6 +170,7 @@ enum x86_emulate_fpu_type {
     X86EMUL_FPU_mmx, /* MMX instruction set (%mm0-%mm7) */
     X86EMUL_FPU_xmm, /* SSE instruction set (%xmm0-%xmm7/15) */
     X86EMUL_FPU_ymm, /* AVX/XOP instruction set (%ymm0-%ymm7/15) */
+    X86EMUL_FPU_opmask, /* AVX512 opmask instruction set (%k0-%k7) */
     /* This sentinel will never be passed to ->get_fpu(). */
     X86EMUL_FPU_none
 };
--- a/xen/include/asm-x86/cpufeature.h
+++ b/xen/include/asm-x86/cpufeature.h
@@ -99,9 +99,12 @@
 #define cpu_has_rtm             boot_cpu_has(X86_FEATURE_RTM)
 #define cpu_has_fpu_sel         (!boot_cpu_has(X86_FEATURE_NO_FPU_SEL))
 #define cpu_has_mpx             boot_cpu_has(X86_FEATURE_MPX)
+#define cpu_has_avx512f         boot_cpu_has(X86_FEATURE_AVX512F)
+#define cpu_has_avx512dq        boot_cpu_has(X86_FEATURE_AVX512DQ)
 #define cpu_has_rdseed          boot_cpu_has(X86_FEATURE_RDSEED)
 #define cpu_has_smap            boot_cpu_has(X86_FEATURE_SMAP)
 #define cpu_has_sha             boot_cpu_has(X86_FEATURE_SHA)
+#define cpu_has_avx512bw        boot_cpu_has(X86_FEATURE_AVX512BW)
 
 /* CPUID level 0x80000007.edx */
 #define cpu_has_itsc            boot_cpu_has(X86_FEATURE_ITSC)




_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

^ permalink raw reply	[flat|nested] 465+ messages in thread

* [PATCH v2 4/6] x86emul: clean up AVX2 insn use in test harness
  2018-08-29 14:20 ` [PATCH v2 0/6] x86emul: fixes, improvements, and beginnings of AVX512 support Jan Beulich
                     ` (2 preceding siblings ...)
  2018-08-29 14:24   ` [PATCH v2 3/6] x86emul: support AVX512 opmask insns Jan Beulich
@ 2018-08-29 14:24   ` Jan Beulich
  2018-09-03 18:04     ` Andrew Cooper
  2018-08-29 14:25   ` [PATCH v2 5/6] x86emul: correct EVEX decoding Jan Beulich
  2018-08-29 14:25   ` [PATCH v2 6/6] x86emul: generalize vector length handling for AVX512/EVEX Jan Beulich
  5 siblings, 1 reply; 465+ messages in thread
From: Jan Beulich @ 2018-08-29 14:24 UTC (permalink / raw)
  To: xen-devel; +Cc: Andrew Cooper

Drop the pretty pointless conditionals from code testing AVX insns and
properly use AVX2 mnemonics in code testing AVX2 insns (the test harness
already requires a sufficiently new compiler/assembler).

Signed-off-by: Jan Beulich <jbeulich@suse.com>

--- a/tools/tests/x86_emulator/test_x86_emulator.c
+++ b/tools/tests/x86_emulator/test_x86_emulator.c
@@ -2071,11 +2071,6 @@ int main(int argc, char **argv)
         rc = x86_emulate(&ctxt, &emulops);
         if ( rc != X86EMUL_OKAY || !check_eip(vmovdqu_from_mem) )
             goto fail;
-#if 0 /* Don't use AVX2 instructions for now */
-        asm ( "vpcmpeqb %%ymm2, %%ymm2, %%ymm2\n\t"
-              "vpcmpeqb %%ymm4, %%ymm2, %%ymm0\n\t"
-              "vpmovmskb %%ymm0, %0" : "=r" (rc) );
-#else
         asm ( "vextractf128 $1, %%ymm4, %%xmm3\n\t"
               "vpcmpeqb %%xmm2, %%xmm2, %%xmm2\n\t"
               "vpcmpeqb %%xmm4, %%xmm2, %%xmm0\n\t"
@@ -2083,7 +2078,6 @@ int main(int argc, char **argv)
               "vpmovmskb %%xmm0, %0\n\t"
               "vpmovmskb %%xmm1, %1" : "=r" (rc), "=r" (i) );
         rc |= i << 16;
-#endif
         if ( rc != 0xffffffff )
             goto fail;
         printf("okay\n");
@@ -2755,11 +2749,6 @@ int main(int argc, char **argv)
         rc = x86_emulate(&ctxt, &emulops);
         if ( rc != X86EMUL_OKAY || !check_eip(vlddqu) )
             goto fail;
-#if 0 /* Don't use AVX2 instructions for now */
-        asm ( "vpcmpeqb %%ymm2, %%ymm2, %%ymm2\n\t"
-              "vpcmpeqb %%ymm4, %%ymm2, %%ymm0\n\t"
-              "vpmovmskb %%ymm0, %0" : "=r" (rc) );
-#else
         asm ( "vextractf128 $1, %%ymm4, %%xmm3\n\t"
               "vpcmpeqb %%xmm2, %%xmm2, %%xmm2\n\t"
               "vpcmpeqb %%xmm4, %%xmm2, %%xmm0\n\t"
@@ -2767,7 +2756,6 @@ int main(int argc, char **argv)
               "vpmovmskb %%xmm0, %0\n\t"
               "vpmovmskb %%xmm1, %1" : "=r" (rc), "=r" (i) );
         rc |= i << 16;
-#endif
         if ( ~rc )
             goto fail;
         printf("okay\n");
@@ -2806,15 +2794,9 @@ int main(int argc, char **argv)
     {
         decl_insn(vmovntdqa);
 
-#if 0 /* Don't use AVX2 instructions for now */
         asm volatile ( "vpxor %%ymm4, %%ymm4, %%ymm4\n"
                        put_insn(vmovntdqa, "vmovntdqa (%0), %%ymm4")
                        :: "c" (NULL) );
-#else
-        asm volatile ( "vpxor %xmm4, %xmm4, %xmm4\n"
-                       put_insn(vmovntdqa,
-                                ".byte 0xc4, 0xe2, 0x7d, 0x2a, 0x21") );
-#endif
 
         set_insn(vmovntdqa);
         memset(res, 0x55, 96);
@@ -2823,19 +2805,9 @@ int main(int argc, char **argv)
         rc = x86_emulate(&ctxt, &emulops);
         if ( rc != X86EMUL_OKAY || !check_eip(vmovntdqa) )
             goto fail;
-#if 0 /* Don't use AVX2 instructions for now */
         asm ( "vpcmpeqb %%ymm2, %%ymm2, %%ymm2\n\t"
               "vpcmpeqb %%ymm4, %%ymm2, %%ymm0\n\t"
               "vpmovmskb %%ymm0, %0" : "=r" (rc) );
-#else
-        asm ( "vextractf128 $1, %%ymm4, %%xmm3\n\t"
-              "vpcmpeqb %%xmm2, %%xmm2, %%xmm2\n\t"
-              "vpcmpeqb %%xmm4, %%xmm2, %%xmm0\n\t"
-              "vpcmpeqb %%xmm3, %%xmm2, %%xmm1\n\t"
-              "vpmovmskb %%xmm0, %0\n\t"
-              "vpmovmskb %%xmm1, %1" : "=r" (rc), "=r" (i) );
-        rc |= i << 16;
-#endif
         if ( ~rc )
             goto fail;
         printf("okay\n");
@@ -3161,12 +3133,7 @@ int main(int argc, char **argv)
 
         asm volatile ( "vpxor %%xmm1, %%xmm1, %%xmm1\n\t"
                        "vpinsrd $0b00, %1, %%xmm1, %%xmm2\n\t"
-#if 0 /* Don't use AVX2 instructions for now */
                        put_insn(vpmaskmovd, "vpmaskmovd %%xmm1, %%xmm2, (%0)")
-#else
-                       put_insn(vpmaskmovd,
-                                ".byte 0xc4, 0xe2, 0x69, 0x8e, 0x0a")
-#endif
                        :: "d" (NULL), "r" (~0) );
 
         memset(res + MMAP_SZ / sizeof(*res) - 8, 0xdb, 32);
@@ -3200,14 +3167,8 @@ int main(int argc, char **argv)
 
         asm volatile ( "vpxor %%xmm1, %%xmm1, %%xmm1\n\t"
                        "vpcmpeqd %%xmm0, %%xmm0, %%xmm0\n\t"
-#if 0 /* Don't use AVX2 instructions for now */
                        "vpblendd $0b0011, %%xmm0, %%xmm1, %%xmm2\n\t"
                        put_insn(vpmaskmovq, "vpmaskmovq %%xmm1, %%xmm2, (%0)")
-#else
-                       ".byte 0xc4, 0xe3, 0x71, 0x02, 0xd0, 0b0011\n\t"
-                       put_insn(vpmaskmovq,
-                                ".byte 0xc4, 0xe2, 0xe9, 0x8e, 0x0a")
-#endif
                        :: "d" (NULL) );
 
         memset(res + MMAP_SZ / sizeof(*res) - 8, 0xdb, 32);
@@ -3221,11 +3182,7 @@ int main(int argc, char **argv)
                     res + MMAP_SZ / sizeof(*res) - 4, 8) )
             goto fail;
 
-#if 0 /* Don't use AVX2 instructions for now */
         asm volatile ( "vpermq $0b00000001, %ymm2, %ymm2" );
-#else
-        asm volatile ( ".byte 0xc4, 0xe3, 0xfd, 0x00, 0xd2, 0b00000001" );
-#endif
         memset(res, 0xdb, 32);
         set_insn(vpmaskmovq);
         regs.edx = (unsigned long)(res - 2);




_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

^ permalink raw reply	[flat|nested] 465+ messages in thread

* [PATCH v2 5/6] x86emul: correct EVEX decoding
  2018-08-29 14:20 ` [PATCH v2 0/6] x86emul: fixes, improvements, and beginnings of AVX512 support Jan Beulich
                     ` (3 preceding siblings ...)
  2018-08-29 14:24   ` [PATCH v2 4/6] x86emul: clean up AVX2 insn use in test harness Jan Beulich
@ 2018-08-29 14:25   ` Jan Beulich
  2018-09-04 10:48     ` Andrew Cooper
  2018-08-29 14:25   ` [PATCH v2 6/6] x86emul: generalize vector length handling for AVX512/EVEX Jan Beulich
  5 siblings, 1 reply; 465+ messages in thread
From: Jan Beulich @ 2018-08-29 14:25 UTC (permalink / raw)
  To: xen-devel; +Cc: Andrew Cooper

Fix an inverted pair of checks, drop an incorrect instance of #UD
raising for non-64-bit mode, and add further generic checks.

Note: Unlike what SDM Vol 2 rev 067 states, EVEX.V' is _not_ ignored
      outside of 64-bit mode when the field does not encode a register.
      Just like EVEX.VVVV is required to be 0b1111 in that case, EVEX.V'
      is required to be 1 there.

Also rename the bcst field to br, as #UD generation for individual insns
will need to consider both of its possible meanings.

Signed-off-by: Jan Beulich <jbeulich@suse.com>

--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -650,7 +650,7 @@ union evex {
         uint8_t w:1;
         uint8_t opmsk:3;
         uint8_t RX:1;
-        uint8_t bcst:1;
+        uint8_t br:1;
         uint8_t lr:2;
         uint8_t z:1;
     };
@@ -2760,13 +2760,11 @@ x86_decode(
                         evex.raw[1] = vex.raw[1];
                         evex.raw[2] = insn_fetch_type(uint8_t);
 
-                        generate_exception_if(evex.mbs || !evex.mbz, EXC_UD);
+                        generate_exception_if(!evex.mbs || evex.mbz, EXC_UD);
+                        generate_exception_if(!evex.opmsk && evex.z, EXC_UD);
 
                         if ( !mode_64bit() )
-                        {
-                            generate_exception_if(!evex.RX, EXC_UD);
                             evex.R = 1;
-                        }
 
                         vex.opcx = evex.opcx;
                         break;
@@ -3404,6 +3402,7 @@ x86_emulate(
         d = (d & ~DstMask) | DstMem;
         /* Becomes a normal DstMem operation from here on. */
     case DstMem:
+        generate_exception_if(ea.type == OP_MEM && evex.z, EXC_UD);
         if ( state->simd_size )
         {
             generate_exception_if(lock_prefix, EXC_UD);





_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

^ permalink raw reply	[flat|nested] 465+ messages in thread

* [PATCH v2 6/6] x86emul: generalize vector length handling for AVX512/EVEX
  2018-08-29 14:20 ` [PATCH v2 0/6] x86emul: fixes, improvements, and beginnings of AVX512 support Jan Beulich
                     ` (4 preceding siblings ...)
  2018-08-29 14:25   ` [PATCH v2 5/6] x86emul: correct EVEX decoding Jan Beulich
@ 2018-08-29 14:25   ` Jan Beulich
  2018-09-04 11:02     ` Andrew Cooper
  5 siblings, 1 reply; 465+ messages in thread
From: Jan Beulich @ 2018-08-29 14:25 UTC (permalink / raw)
  To: xen-devel; +Cc: Andrew Cooper

To allow for some code sharing where possible, copy VEX.L into EVEX.LR
even for VEX (or XOP) encoded insns. Make operand size determination
use this right away, at the same time adding consistency checks for the
EVEX scalar insn cases (the non-scalar ones aren't uniform enough for
the checking to be done in a central place like this).

Note that the broadcast case is not handled here, but will be taken care
of elsewhere (in just a single place rather than at least two).

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v2: Don't raise #UD in simd_scalar_opc case when EVEX.W != low-opcode-bit.

--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -191,14 +191,14 @@ enum simd_opsize {
      * Ordinary packed integers:
      * - 64 bits without prefix 66 (MMX)
      * - 128 bits with prefix 66 (SSEn)
-     * - 128/256 bits depending on VEX.L (AVX)
+     * - 128/256/512 bits depending on VEX.L/EVEX.LR (AVX+)
      */
     simd_packed_int,
 
     /*
      * Ordinary packed/scalar floating point:
      * - 128 bits without prefix or with prefix 66 (SSEn)
-     * - 128/256 bits depending on VEX.L (AVX)
+     * - 128/256/512 bits depending on VEX.L/EVEX.LR (AVX+)
      * - 32 bits with prefix F3 (scalar single)
      * - 64 bits with prefix F2 (scalar doubgle)
      */
@@ -207,14 +207,14 @@ enum simd_opsize {
     /*
      * Packed floating point:
      * - 128 bits without prefix or with prefix 66 (SSEn)
-     * - 128/256 bits depending on VEX.L (AVX)
+     * - 128/256/512 bits depending on VEX.L/EVEX.LR (AVX+)
      */
     simd_packed_fp,
 
     /*
      * Single precision packed/scalar floating point:
      * - 128 bits without prefix (SSEn)
-     * - 128/256 bits depending on VEX.L, no prefix (AVX)
+     * - 128/256/512 bits depending on VEX.L/EVEX.LR (AVX+)
      * - 32 bits with prefix F3 (scalar)
      */
     simd_single_fp,
@@ -228,7 +228,7 @@ enum simd_opsize {
 
     /*
      * Scalar floating point:
-     * - 32/64 bits depending on VEX.W
+     * - 32/64 bits depending on VEX.W/EVEX.W
      */
     simd_scalar_vexw,
 
@@ -2818,6 +2818,9 @@ x86_decode(
 
                 opcode |= b | MASK_INSR(vex.pfx, X86EMUL_OPC_PFX_MASK);
 
+                if ( !evex.mbs )
+                    evex.lr = vex.l;
+
                 if ( !(d & ModRM) )
                     break;
 
@@ -3148,7 +3151,7 @@ x86_decode(
             }
             /* fall through */
         case vex_66:
-            op_bytes = 16 << vex.l;
+            op_bytes = 16 << evex.lr;
             break;
         default:
             op_bytes = 0;
@@ -3172,9 +3175,17 @@ x86_decode(
     case simd_any_fp:
         switch ( vex.pfx )
         {
-        default:     op_bytes = 16 << vex.l; break;
-        case vex_f3: op_bytes = 4;           break;
-        case vex_f2: op_bytes = 8;           break;
+        default:
+            op_bytes = 16 << evex.lr;
+            break;
+        case vex_f3:
+            generate_exception_if(evex.mbs && evex.w, EXC_UD);
+            op_bytes = 4;
+            break;
+        case vex_f2:
+            generate_exception_if(evex.mbs && !evex.w, EXC_UD);
+            op_bytes = 8;
+            break;
         }
         break;
 





_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

^ permalink raw reply	[flat|nested] 465+ messages in thread

* Re: [PATCH v2 1/6] x86emul: fix FMA scalar operand sizes
  2018-08-29 14:23   ` [PATCH v2 1/6] x86emul: fix FMA scalar operand sizes Jan Beulich
@ 2018-09-03 16:43     ` Andrew Cooper
  2018-09-04  7:52       ` Jan Beulich
  0 siblings, 1 reply; 465+ messages in thread
From: Andrew Cooper @ 2018-09-03 16:43 UTC (permalink / raw)
  To: Jan Beulich, xen-devel

On 29/08/18 15:23, Jan Beulich wrote:
> FMA insns, other than the earlier AVX additions, don't use the low
> opcode bit to distinguish between single and double vector elements.

I think I've worked out why "other than the" is so weird to read as a
native speaker here.  I think you mean "unlike the" in this context.

> While the difference is benign for packed flavors, the scalar ones
> need to use VEX.W here. Oddly enough the table entries didn't even use
> simd_scalar_fp, but uniformly used simd_packed_fp (implying the
> distinction was by [VEX-encoded] opcode prefix).

Was this a bug in the FMA patch then?

>
> Split simd_scalar_fp into simd_scalar_opc and simd_scalar_vexw, and
> correct 

Missing the rest of the sentence?  (v1 was similar)

>
> Also correct the scalar insn comments (they only ever use XMM registers
> as operands).
>
> Signed-off-by: Jan Beulich <jbeulich@suse.com>

As for the code content, Reviewed-by: Andrew Cooper
<andrew.cooper3@citrix.com>

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

^ permalink raw reply	[flat|nested] 465+ messages in thread

* Re: [PATCH v2 2/6] x86emul: extend MASKMOV{Q, DQU} tests
  2018-08-29 14:23   ` [PATCH v2 2/6] x86emul: extend MASKMOV{Q,DQU} tests Jan Beulich
@ 2018-09-03 16:44     ` Andrew Cooper
  0 siblings, 0 replies; 465+ messages in thread
From: Andrew Cooper @ 2018-09-03 16:44 UTC (permalink / raw)
  To: Jan Beulich, xen-devel

On 29/08/18 15:23, Jan Beulich wrote:
> While deriving the first AVX512 pieces from existing code I've got the
> (in the end wrong) impression that the emulation of these insns would be
> broken. Besides testing that the instructions act as no-ops when the
> controlling mask bits are all zero, add ones to also check that the data
> merging actually works.
>
> Signed-off-by: Jan Beulich <jbeulich@suse.com>

Acked-by: Andrew Cooper <andrew.cooper3@citrix.com>

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

^ permalink raw reply	[flat|nested] 465+ messages in thread

* Re: [PATCH v2 3/6] x86emul: support AVX512 opmask insns
  2018-08-29 14:24   ` [PATCH v2 3/6] x86emul: support AVX512 opmask insns Jan Beulich
@ 2018-09-03 17:57     ` Andrew Cooper
  2018-09-04  7:58       ` Jan Beulich
  0 siblings, 1 reply; 465+ messages in thread
From: Andrew Cooper @ 2018-09-03 17:57 UTC (permalink / raw)
  To: Jan Beulich, xen-devel

On 29/08/18 15:24, Jan Beulich wrote:
> These are all VEX encoded, so the EVEX decoding logic continues to
> remain unused at this point.
>
> The new testcase is deliberately coded in assembly, as a C one would
> have become almost unreadable due to the overwhelming amount of
> __builtin_...() that would need to be used. After all the compiler has
> no underlying type (yet) that could be operated on without builtins,
> other than the vector types used for "normal" SIMD insns.
>
> Signed-off-by: Jan Beulich <jbeulich@suse.com>
>
> --- a/xen/arch/x86/x86_emulate/x86_emulate.c
> +++ b/xen/arch/x86/x86_emulate/x86_emulate.c
> @@ -6002,6 +6023,60 @@ x86_emulate(
>              dst.val = src.val;
>          break;
>  
> +    case X86EMUL_OPC_VEX(0x0f, 0x4a):    /* kadd{w,q} k,k,k */
> +        if ( !vex.w )
> +            host_and_vcpu_must_have(avx512dq);

Why is this kadd handled differently?  As far as I can tell from the
manual, its encoding looks to be consistent.

I'm afraid that I'm going to have to stare at the manual a bit more
before I can review the rest of this patch.

~Andrew

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

^ permalink raw reply	[flat|nested] 465+ messages in thread

* Re: [PATCH v2 4/6] x86emul: clean up AVX2 insn use in test harness
  2018-08-29 14:24   ` [PATCH v2 4/6] x86emul: clean up AVX2 insn use in test harness Jan Beulich
@ 2018-09-03 18:04     ` Andrew Cooper
  0 siblings, 0 replies; 465+ messages in thread
From: Andrew Cooper @ 2018-09-03 18:04 UTC (permalink / raw)
  To: Jan Beulich, xen-devel

On 29/08/18 15:24, Jan Beulich wrote:
> Drop the pretty pointless conditionals from code testing AVX insns and
> properly use AVX2 mnemonics in code testing AVX2 insns (the test harness
> is already requiring sufficiently new a compiler/assembler).
>
> Signed-off-by: Jan Beulich <jbeulich@suse.com>

Acked-by: Andrew Cooper <andrew.cooper3@citrix.com>

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

^ permalink raw reply	[flat|nested] 465+ messages in thread

* Re: [PATCH v2 1/6] x86emul: fix FMA scalar operand sizes
  2018-09-03 16:43     ` Andrew Cooper
@ 2018-09-04  7:52       ` Jan Beulich
  0 siblings, 0 replies; 465+ messages in thread
From: Jan Beulich @ 2018-09-04  7:52 UTC (permalink / raw)
  To: Andrew Cooper; +Cc: xen-devel

>>> On 03.09.18 at 18:43, <andrew.cooper3@citrix.com> wrote:
> On 29/08/18 15:23, Jan Beulich wrote:
>> FMA insns, other than the earlier AVX additions, don't use the low
>> opcode bit to distinguish between single and double vector elements.
> 
> I think I've worked out why "other than the" is so weird to read as a
> native speaker here.  I think you mean "unlike the" in this context.

Changed; I'll try to remember this.

>> While the difference is benign for packed flavors, the scalar ones
>> need to use VEX.W here. Oddly enough the table entries didn't even use
>> simd_scalar_fp, but uniformly used simd_packed_fp (implying the
>> distinction was by [VEX-encoded] opcode prefix).
> 
> Was this a bug in the FMA patch then?

Yes.

>> Split simd_scalar_fp into simd_scalar_opc and simd_scalar_vexw, and
>> correct 
> 
> Missing the rest of the sentence?  (v1 was similar)

Oops: "...and correct FMA scalar table entries to use the latter."

>> Also correct the scalar insn comments (they only ever use XMM registers
>> as operands).
>>
>> Signed-off-by: Jan Beulich <jbeulich@suse.com>
> 
> As for the code content, Reviewed-by: Andrew Cooper
> <andrew.cooper3@citrix.com>

Thanks.

Jan



_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

^ permalink raw reply	[flat|nested] 465+ messages in thread

* Re: [PATCH v2 3/6] x86emul: support AVX512 opmask insns
  2018-09-03 17:57     ` Andrew Cooper
@ 2018-09-04  7:58       ` Jan Beulich
  0 siblings, 0 replies; 465+ messages in thread
From: Jan Beulich @ 2018-09-04  7:58 UTC (permalink / raw)
  To: Andrew Cooper; +Cc: xen-devel

>>> On 03.09.18 at 19:57, <andrew.cooper3@citrix.com> wrote:
> On 29/08/18 15:24, Jan Beulich wrote:
>> These are all VEX encoded, so the EVEX decoding logic continues to
>> remain unused at this point.
>>
>> The new testcase is deliberately coded in assembly, as a C one would
>> have become almost unreadable due to the overwhelming amount of
>> __builtin_...() that would need to be used. After all the compiler has
>> no underlying type (yet) that could be operated on without builtins,
>> other than the vector types used for "normal" SIMD insns.
>>
>> Signed-off-by: Jan Beulich <jbeulich@suse.com>
>>
>> --- a/xen/arch/x86/x86_emulate/x86_emulate.c
>> +++ b/xen/arch/x86/x86_emulate/x86_emulate.c
>> @@ -6002,6 +6023,60 @@ x86_emulate(
>>              dst.val = src.val;
>>          break;
>>  
>> +    case X86EMUL_OPC_VEX(0x0f, 0x4a):    /* kadd{w,q} k,k,k */
>> +        if ( !vex.w )
>> +            host_and_vcpu_must_have(avx512dq);
> 
> Why is this kadd handled differently?  As far as I can tell from the
> manual, its encoding looks to be consistent.

It's not the encoding, but the AVX512DQ property of kaddw
which other k<op>w insns don't have (those all are AVX512F).

Jan



_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

^ permalink raw reply	[flat|nested] 465+ messages in thread

* Re: [PATCH v2 5/6] x86emul: correct EVEX decoding
  2018-08-29 14:25   ` [PATCH v2 5/6] x86emul: correct EVEX decoding Jan Beulich
@ 2018-09-04 10:48     ` Andrew Cooper
  2018-09-04 12:48       ` Jan Beulich
  0 siblings, 1 reply; 465+ messages in thread
From: Andrew Cooper @ 2018-09-04 10:48 UTC (permalink / raw)
  To: Jan Beulich, xen-devel

On 29/08/18 15:25, Jan Beulich wrote:
> Fix an inverted pair of checks, drop an incorrect instance of #UD
> raising for non-64-bit mode, and add further generic checks.
>
> Note: Other than SDM Vol 2 rev 067 states, EVEX.V' is _not_ ignored
>       outside of 64-bit mode when the field does not encode a register.
>       Just like EVEX.VVVV is required to be 0b1111 in that case, EVEX.V'
>       is required to be 1 there.
>
> Also rename the bcst field to br, as #UD generation for individual insns
> will need to consider both of its possible meanings.
>
> Signed-off-by: Jan Beulich <jbeulich@suse.com>
>
> --- a/xen/arch/x86/x86_emulate/x86_emulate.c
> +++ b/xen/arch/x86/x86_emulate/x86_emulate.c
> @@ -650,7 +650,7 @@ union evex {
>          uint8_t w:1;
>          uint8_t opmsk:3;
>          uint8_t RX:1;
> -        uint8_t bcst:1;
> +        uint8_t br:1;
>          uint8_t lr:2;
>          uint8_t z:1;

I'm afraid that some of the choices of field naming in here make the
code impossible to follow, due to their differences from the manual.
Particularly, the tail end of the structure would be easier to follow if
it were:

diff --git a/xen/arch/x86/x86_emulate/x86_emulate.c
b/xen/arch/x86/x86_emulate/x86_emulate.c
index f38c73b..bc0d39b 100644
--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -648,10 +648,10 @@ union evex {
         uint8_t mbs:1;
         uint8_t reg:4;
         uint8_t w:1;
-        uint8_t opmsk:3;
-        uint8_t RX:1;
-        uint8_t br:1;
-        uint8_t lr:2;
+        uint8_t aaa:3;
+        uint8_t V:1;
+        uint8_t b:1;
+        uint8_t l:2;
         uint8_t z:1;
     };
 };

The manual refers to EVEX.RX as the combination of the R and X fields in
the first byte, whereas the field currently named RX in Xen is V' in the
manual.  It is unfortunate that Intel chose L'L for the vector notation,
but l on its own is clearer than lr.

>      };
> @@ -2760,13 +2760,11 @@ x86_decode(
>                          evex.raw[1] = vex.raw[1];
>                          evex.raw[2] = insn_fetch_type(uint8_t);
>  
> -                        generate_exception_if(evex.mbs || !evex.mbz, EXC_UD);
> +                        generate_exception_if(!evex.mbs || evex.mbz, EXC_UD);
> +                        generate_exception_if(!evex.opmsk && evex.z, EXC_UD);

Where does this check derive from?  I presume you've calculated it from
Table 2-40 in the manual, but I don't see anything there which suggests
the restriction applies universally.  Every check in that table is
specific to certain classes of instruction.

>  
>                          if ( !mode_64bit() )
> -                        {
> -                            generate_exception_if(!evex.RX, EXC_UD);
>                              evex.R = 1;
> -                        }
>  
>                          vex.opcx = evex.opcx;
>                          break;
> @@ -3404,6 +3402,7 @@ x86_emulate(
>          d = (d & ~DstMask) | DstMem;
>          /* Becomes a normal DstMem operation from here on. */
>      case DstMem:
> +        generate_exception_if(ea.type == OP_MEM && evex.z, EXC_UD);

I can't find any statement that all DstMem prohibit zero-masking.

There is a statement saying that the subset of DstMem instructions which
require an encoded k register may not use zero-masking.

~Andrew

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

^ permalink raw reply	[flat|nested] 465+ messages in thread

* Re: [PATCH v2 6/6] x86emul: generalize vector length handling for AVX512/EVEX
  2018-08-29 14:25   ` [PATCH v2 6/6] x86emul: generalize vector length handling for AVX512/EVEX Jan Beulich
@ 2018-09-04 11:02     ` Andrew Cooper
  2018-09-04 12:50       ` Jan Beulich
  0 siblings, 1 reply; 465+ messages in thread
From: Andrew Cooper @ 2018-09-04 11:02 UTC (permalink / raw)
  To: Jan Beulich, xen-devel

On 29/08/18 15:25, Jan Beulich wrote:
> @@ -2818,6 +2818,9 @@ x86_decode(
>  
>                  opcode |= b | MASK_INSR(vex.pfx, X86EMUL_OPC_PFX_MASK);
>  
> +                if ( !evex.mbs )

This use of mbs is very confusing to read.  How about:

#define evex_encoded evex.mbs

which at least gives a semantic name to how you are using the mbs bit?

~Andrew


_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

^ permalink raw reply	[flat|nested] 465+ messages in thread

* Re: [PATCH v2 5/6] x86emul: correct EVEX decoding
  2018-09-04 10:48     ` Andrew Cooper
@ 2018-09-04 12:48       ` Jan Beulich
  0 siblings, 0 replies; 465+ messages in thread
From: Jan Beulich @ 2018-09-04 12:48 UTC (permalink / raw)
  To: Andrew Cooper; +Cc: xen-devel

>>> On 04.09.18 at 12:48, <andrew.cooper3@citrix.com> wrote:
> On 29/08/18 15:25, Jan Beulich wrote:
>> --- a/xen/arch/x86/x86_emulate/x86_emulate.c
>> +++ b/xen/arch/x86/x86_emulate/x86_emulate.c
>> @@ -650,7 +650,7 @@ union evex {
>>          uint8_t w:1;
>>          uint8_t opmsk:3;
>>          uint8_t RX:1;
>> -        uint8_t bcst:1;
>> +        uint8_t br:1;
>>          uint8_t lr:2;
>>          uint8_t z:1;
> 
> I'm afraid that some of the choices of field naming in here makes the
> code impossible to follow, due to their differences from the manual. 
> Particularly, the tail end of the structure would be easier to follow if
> it were:
> 
> diff --git a/xen/arch/x86/x86_emulate/x86_emulate.c
> b/xen/arch/x86/x86_emulate/x86_emulate.c
> index f38c73b..bc0d39b 100644
> --- a/xen/arch/x86/x86_emulate/x86_emulate.c
> +++ b/xen/arch/x86/x86_emulate/x86_emulate.c
> @@ -648,10 +648,10 @@ union evex {
>          uint8_t mbs:1;
>          uint8_t reg:4;
>          uint8_t w:1;
> -        uint8_t opmsk:3;
> -        uint8_t RX:1;
> -        uint8_t br:1;
> -        uint8_t lr:2;
> +        uint8_t aaa:3;
> +        uint8_t V:1;
> +        uint8_t b:1;
> +        uint8_t l:2;
>          uint8_t z:1;
>      };
>  };
> 
> The manual refers to EVEX.RX as the combination of the R and X fields in
> the first byte, whereas the field currently named RX in Xen is V' in the
> manual.  It is unfortunate that Intel chose L'L for the vector notation,
> but l on its own is clearer than lr.

The naming isn't very good, yes, but what you suggest won't work:
What you name b and l are dual purpose fields, hence I really want
to retain their names (standing for "broadcast or rounding" and
"length or rounding" respectively). Furthermore you'll notice there
is a field named b already. And l alone would further risk mixing up
with VEX.L.

As much as I avoided introducing a field named vvvv, I also don't
view it sensible to introduce a field named aaa. If Intel considers
these reasonable names in their manuals - so be it. They aren't
reasonable at all imo in code.

V as a name would make sense only together with vvvv, but the
bit again has dual use, and in its secondary use V is misleading
rather than helpful.

Besides all of this I now have almost 20 more patches on top of this.
I really don't see myself renaming all those field references. The
patch here has been available for long enough to give such a
comment / make such suggestions, even more so that I'm changing
a single field here only anyway.

Best I can offer is to attach comments to the fields pointing out
their SDM names. But even that I would view as slightly odd a
request _here_, as both EVEX and VEX unions have been around
in our code for quite some time.

>> @@ -2760,13 +2760,11 @@ x86_decode(
>>                          evex.raw[1] = vex.raw[1];
>>                          evex.raw[2] = insn_fetch_type(uint8_t);
>>  
>> -                        generate_exception_if(evex.mbs || !evex.mbz, EXC_UD);
>> +                        generate_exception_if(!evex.mbs || evex.mbz, EXC_UD);
>> +                        generate_exception_if(!evex.opmsk && evex.z, EXC_UD);
> 
> Where does this check derive from?  I presume you've calculated it from
> Table 2-40 in the manual, but I don't see anything there which suggests
> the restriction applies universally.  Every check in that table is
> specific to certain classes of instruction.

The check doesn't derive from any tables, but from the fact that
"zeroing-masking" makes no sense without "masking", and from the
observed hardware behavior.

>> @@ -3404,6 +3402,7 @@ x86_emulate(
>>          d = (d & ~DstMask) | DstMem;
>>          /* Becomes a normal DstMem operation from here on. */
>>      case DstMem:
>> +        generate_exception_if(ea.type == OP_MEM && evex.z, EXC_UD);
> 
> I can't find any statement that all DstMem prohibit zero-masking.
> 
> There is a statement saying that the subset of DstMem instructions which
> require an encoded k register may not use zero-masking.

Not sure what you mean by "encoded k register". Various EVEX and
ModRM fields can encode a k register.

No current instruction allows zeroing-masking on a memory destination
(nor on a k-register destination, if that's what you mean, but that's not
the same as DstMem), and I can't currently foresee this to change
without a CPUID bit telling us, if even the most obvious candidates
(VMOVAP{S,D} and VMOVDQA{32,64}) don't allow this. Hence I prefer the
check to be in a central place instead of getting repeated in a number
of places.

Note that "generic checks" in the description is not meant to imply that
these exclusively follow what Intel lists in their "generic" tables. I've done
quite a bit of prereq work and classification over the last year, and this
is one of the patterns that resulted from that work without the SDM
explicitly stating it.

Jan


_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

^ permalink raw reply	[flat|nested] 465+ messages in thread

* Re: [PATCH v2 6/6] x86emul: generalize vector length handling for AVX512/EVEX
  2018-09-04 11:02     ` Andrew Cooper
@ 2018-09-04 12:50       ` Jan Beulich
  0 siblings, 0 replies; 465+ messages in thread
From: Jan Beulich @ 2018-09-04 12:50 UTC (permalink / raw)
  To: Andrew Cooper; +Cc: xen-devel

>>> On 04.09.18 at 13:02, <andrew.cooper3@citrix.com> wrote:
> On 29/08/18 15:25, Jan Beulich wrote:
>> @@ -2818,6 +2818,9 @@ x86_decode(
>>  
>>                  opcode |= b | MASK_INSR(vex.pfx, X86EMUL_OPC_PFX_MASK);
>>  
>> +                if ( !evex.mbs )
> 
> This use of mbs is very confusing to read.  How about:
> 
> #define evex_encoded evex.mbs
> 
> which at least gives a semantic name to the how you are using the mbs bit?

As you seem to think it helps, I can do that. To me the present form was
obvious enough.

Jan



_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

^ permalink raw reply	[flat|nested] 465+ messages in thread

* [PATCH v3 00/34] x86emul: fixes, improvements, and beginnings of AVX512 support
  2018-08-09  8:15 [PATCH 0/6] x86emul: fixes, improvements, and beginnings of AVX512 support Jan Beulich
                   ` (6 preceding siblings ...)
  2018-08-29 14:20 ` [PATCH v2 0/6] x86emul: fixes, improvements, and beginnings of AVX512 support Jan Beulich
@ 2018-09-18 11:46 ` Jan Beulich
  2018-09-18 11:53   ` [PATCH v3 01/34] x86emul: support AVX512 opmask insns Jan Beulich
                     ` (33 more replies)
  2018-09-25 13:14 ` [PATCH v4 00/44] x86emul: fixes, improvements, and beginnings of AVX512 support Jan Beulich
                   ` (4 subsequent siblings)
  12 siblings, 34 replies; 465+ messages in thread
From: Jan Beulich @ 2018-09-18 11:46 UTC (permalink / raw)
  To: xen-devel; +Cc: George Dunlap, Andrew Cooper, Wei Liu

01: support AVX512 opmask insns
02: x86/HVM: grow MMIO cache data size to 64 bytes
03: correct EVEX decoding
04: generalize vector length handling for AVX512/EVEX
05: support basic AVX512 moves
06: test for correct EVEX Disp8 scaling
07: use AVX512 logic for emulating V{,P}MASKMOV*
08: support AVX512F legacy-equivalent arithmetic FP insns
09: support AVX512DQ logic FP insns
10: support AVX512F "normal" FP compare insns
11: support AVX512F misc legacy-equivalent FP insns
12: support AVX512F fused-multiply-add insns
13: support AVX512F legacy-equivalent logic insns
14: support AVX512{F,DQ} FP broadcast insns
15: support AVX512F v{,u}comis{d,s} insns
16: test: introduce eq()
17: support AVX512{F,BW} packed integer compare insns
18: support AVX512{F,BW} packed integer arithmetic insns
19: use simd_128 also for legacy vector shift insns
20: support AVX512{F,BW} shift/rotate insns
21: support AVX512{F,BW,DQ} extract insns
22: support AVX512{F,BW,DQ} insert insns
23: basic AVX512F testing
24: support AVX512{F,BW,DQ} integer broadcast insns
25: basic AVX512VL testing
26: support AVX512{F,BW} zero- and sign-extending moves
27: support AVX512{F,BW} down conversion moves
28: support AVX512{F,BW} integer unpack insns
29: support AVX512{F,BW,_VBMI} full permute insns
30: support AVX512{F,BW} integer shuffle insns
31: support AVX512{BW,DQ} mask move insns
32: basic AVX512BW testing
33: basic AVX512DQ testing
34: also allow running the 32-bit harness on a 64-bit distro

The main goal of this series is to support enough of the instructions
such that basic AVX512F, AVX512BW, AVX512DQ, and AVX512VL
tests can be run (this set is relevant as a basis in particular due to
it together mostly [entirely?] covering the legacy-equivalent AVX512
insns). Later additions then may simply enable further of the
(conditional) tests in simd*.c (or by other means).

Jan



_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

^ permalink raw reply	[flat|nested] 465+ messages in thread

* [PATCH v3 01/34] x86emul: support AVX512 opmask insns
  2018-09-18 11:46 ` [PATCH v3 00/34] x86emul: fixes, improvements, and beginnings of AVX512 support Jan Beulich
@ 2018-09-18 11:53   ` Jan Beulich
  2018-10-25 18:32     ` Andrew Cooper
  2018-09-18 11:53   ` [PATCH v3 02/34] x86/HVM: grow MMIO cache data size to 64 bytes Jan Beulich
                     ` (32 subsequent siblings)
  33 siblings, 1 reply; 465+ messages in thread
From: Jan Beulich @ 2018-09-18 11:53 UTC (permalink / raw)
  To: xen-devel; +Cc: George Dunlap, Andrew Cooper, Wei Liu

These are all VEX encoded, so the EVEX decoding logic continues to
remain unused at this point.

The new testcase is deliberately coded in assembly, as a C one would
have become almost unreadable due to the overwhelming amount of
__builtin_...() that would need to be used. After all the compiler has
no underlying type (yet) that could be operated on without builtins,
other than the vector types used for "normal" SIMD insns.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v3: Use distinct temporary file names in testcase.mk. Additions to clean
    target.

--- a/tools/tests/x86_emulator/Makefile
+++ b/tools/tests/x86_emulator/Makefile
@@ -16,6 +16,8 @@ FMA := fma4 fma
 SG := avx2-sg
 TESTCASES := blowfish $(SIMD) $(FMA) $(SG)
 
+OPMASK := avx512f avx512dq avx512bw
+
 blowfish-cflags := ""
 blowfish-cflags-x86_32 := "-mno-accumulate-outgoing-args -Dstatic="
 
@@ -51,6 +53,10 @@ xop-vecs := $(avx-vecs)
 xop-ints := 1 2 4 8
 xop-flts := $(avx-flts)
 
+avx512f-opmask-vecs := 2
+avx512dq-opmask-vecs := 1
+avx512bw-opmask-vecs := 4 8
+
 # For AVX and later, have the compiler avoid XMM0 to widen coverage of
 # the VEX.vvvv checks in the emulator.  For 3DNow!, however, force SSE
 # use for floating point operations, to avoid mixing MMX and FPU register
@@ -80,9 +86,13 @@ $(1)-cflags := \
 	   $(foreach flt,$($(1)-flts), \
 	     "-D_$(vec)x$(idx)f$(flt) -m$(1:-sg=) $(call non-sse,$(1)) -Os -DVEC_MAX=$(vec) -DIDX_SIZE=$(idx) -DFLOAT_SIZE=$(flt)")))
 endef
+define opmask-defs
+$(1)-opmask-cflags := $(foreach vec,$($(1)-opmask-vecs), "-D_$(vec) -m$(1) -Os -DSIZE=$(vec)")
+endef
 
 $(foreach flavor,$(SIMD) $(FMA),$(eval $(call simd-defs,$(flavor))))
 $(foreach flavor,$(SG),$(eval $(call simd-sg-defs,$(flavor))))
+$(foreach flavor,$(OPMASK),$(eval $(call opmask-defs,$(flavor))))
 
 $(addsuffix .h,$(TESTCASES)): %.h: %.c testcase.mk Makefile
 	rm -f $@.new $*.bin
@@ -100,6 +110,22 @@ $(addsuffix .h,$(TESTCASES)): %.h: %.c t
 	)
 	mv $@.new $@
 
+$(addsuffix -opmask.h,$(OPMASK)): %.h: opmask.S testcase.mk Makefile
+	rm -f $@.new $*.bin
+	$(foreach arch,$(filter-out $(XEN_COMPILE_ARCH),x86_32) $(XEN_COMPILE_ARCH), \
+	    for cflags in $($*-cflags) $($*-cflags-$(arch)); do \
+		$(MAKE) -f testcase.mk TESTCASE=$* XEN_TARGET_ARCH=$(arch) $*-cflags="$$cflags" all; \
+		prefix=$(shell echo $(subst -,_,$*) | sed -e 's,^\([0-9]\),_\1,'); \
+		flavor=$$(echo $${cflags} | sed -e 's, .*,,' -e 'y,-=,__,') ; \
+		(echo 'static const unsigned int __attribute__((section(".test, \"ax\", @progbits #")))' \
+		      "$${prefix}_$(arch)$${flavor}[] = {"; \
+		 od -v -t x $*.bin | sed -e 's/^[0-9]* /0x/' -e 's/ /, 0x/g' -e 's/$$/,/'; \
+		 echo "};") >>$@.new; \
+		rm -f $*.bin; \
+	    done; \
+	)
+	mv $@.new $@
+
 $(addsuffix .c,$(SIMD)):
 	ln -sf simd.c $@
 
@@ -118,7 +144,8 @@ $(TARGET): x86-emulate.o test_x86_emulat
 
 .PHONY: clean
 clean:
-	rm -rf $(TARGET) *.o *~ core $(addsuffix .h,$(TESTCASES)) *.bin x86_emulate
+	rm -rf $(TARGET) *.o *~ core *.bin x86_emulate
+	rm -rf $(TARGET) $(addsuffix .h,$(TESTCASES)) $(addsuffix -opmask.h,$(OPMASK))
 
 .PHONY: distclean
 distclean: clean
@@ -145,4 +172,4 @@ x86-emulate.o test_x86_emulator.o wrappe
 x86-emulate.o: x86_emulate/x86_emulate.c
 x86-emulate.o: HOSTCFLAGS += -D__XEN_TOOLS__
 
-test_x86_emulator.o: $(addsuffix .h,$(TESTCASES))
+test_x86_emulator.o: $(addsuffix .h,$(TESTCASES)) $(addsuffix -opmask.h,$(OPMASK))
--- /dev/null
+++ b/tools/tests/x86_emulator/opmask.S
@@ -0,0 +1,144 @@
+#ifdef __i386__
+# define R(x) e##x
+# define DATA(x) x
+#else
+# if SIZE == 8
+#  define R(x) r##x
+# else
+#  define R(x) e##x
+# endif
+# define DATA(x) x(%rip)
+#endif
+
+#if SIZE == 1
+# define _(x) x##b
+#elif SIZE == 2
+# define _(x) x##w
+# define WIDEN(x) x##bw
+#elif SIZE == 4
+# define _(x) x##d
+# define WIDEN(x) x##wd
+#elif SIZE == 8
+# define _(x) x##q
+# define WIDEN(x) x##dq
+#endif
+
+    .macro check res1:req, res2:req, line:req
+    _(kmov)       %\res1, DATA(out)
+#if SIZE < 8 || !defined(__i386__)
+    _(kmov)       %\res2, %R(dx)
+    cmp           DATA(out), %R(dx)
+#else
+    sub           $8, %esp
+    kmovq         %\res2, (%esp)
+    pop           %ecx
+    pop           %edx
+    cmp           DATA(out), %ecx
+    jne           0f
+    cmp           DATA(out+4), %edx
+0:
+#endif
+    je            1f
+    mov           $\line, %eax
+    ret
+1:
+    .endm
+
+    .text
+    .globl _start
+_start:
+    _(kmov)       DATA(in1), %k1
+#if SIZE < 8 || !defined(__i386__)
+    mov           DATA(in2), %R(ax)
+    _(kmov)       %R(ax), %k2
+#else
+    _(kmov)       DATA(in2), %k2
+#endif
+
+    _(kor)        %k1, %k2, %k3
+    _(kand)       %k1, %k2, %k4
+    _(kandn)      %k3, %k4, %k5
+    _(kxor)       %k1, %k2, %k6
+    check         k5, k6, __LINE__
+
+    _(knot)       %k6, %k3
+    _(kxnor)      %k1, %k2, %k4
+    check         k3, k4, __LINE__
+
+    _(kshiftl)    $1, %k1, %k3
+    _(kshiftl)    $2, %k3, %k4
+    _(kshiftl)    $3, %k1, %k5
+    check         k4, k5, __LINE__
+
+    _(kshiftr)    $1, %k1, %k3
+    _(kshiftr)    $2, %k3, %k4
+    _(kshiftr)    $3, %k1, %k5
+    check         k4, k5, __LINE__
+
+    _(kortest)    %k6, %k6
+    jnbe          1f
+    mov           $__LINE__, %eax
+    ret
+1:
+
+    _(kxor)       %k0, %k0, %k3
+    _(kortest)    %k3, %k3
+    jz            1f
+    mov           $__LINE__, %eax
+    ret
+1:
+
+    _(kxnor)      %k0, %k0, %k3
+    _(kortest)    %k3, %k3
+    jc            1f
+    mov           $__LINE__, %eax
+    ret
+1:
+
+#if SIZE > 1
+
+    _(kshiftr)    $SIZE*4, %k3, %k4
+    WIDEN(kunpck) %k4, %k4, %k5
+    check         k3, k5, __LINE__
+
+#endif
+
+#if SIZE != 2 || defined(__AVX512DQ__)
+
+    _(kadd)       %k1, %k1, %k3
+    _(kshiftl)    $1, %k1, %k4
+    check         k3, k4, __LINE__
+
+    _(ktest)      %k2, %k1
+    jnbe          1f
+    mov           $__LINE__, %eax
+    ret
+1:
+
+    _(kxor)       %k0, %k0, %k3
+    _(ktest)      %k0, %k3
+    jz            1f
+    mov           $__LINE__, %eax
+    ret
+1:
+
+    _(kxnor)      %k0, %k0, %k4
+    _(ktest)      %k0, %k4
+    jc            1f
+    mov           $__LINE__, %eax
+    ret
+1:
+
+#endif
+
+    xor           %eax, %eax
+    ret
+
+    .section .rodata, "a", @progbits
+    .balign 8
+in1: .byte 0b10110011, 0b10001111, 0b00001111, 0b10000011, 0b11110000, 0b00111111, 0b10000000, 0b11111111
+in2: .byte 0b11111111, 0b00000001, 0b11111100, 0b00001111, 0b11000001, 0b11110000, 0b11110001, 0b11001101
+
+    .data
+    .balign 8
+out: .quad 0
--- a/tools/tests/x86_emulator/test_x86_emulator.c
+++ b/tools/tests/x86_emulator/test_x86_emulator.c
@@ -18,6 +18,9 @@ asm ( ".pushsection .test, \"ax\", @prog
 #include "avx2.h"
 #include "avx2-sg.h"
 #include "xop.h"
+#include "avx512f-opmask.h"
+#include "avx512dq-opmask.h"
+#include "avx512bw-opmask.h"
 
 #define verbose false /* Switch to true for far more logging. */
 
@@ -78,6 +81,24 @@ static bool simd_check_xop(void)
     return cpu_has_xop;
 }
 
+static bool simd_check_avx512f(void)
+{
+    return cpu_has_avx512f;
+}
+#define simd_check_avx512f_opmask simd_check_avx512f
+
+static bool simd_check_avx512dq(void)
+{
+    return cpu_has_avx512dq;
+}
+#define simd_check_avx512dq_opmask simd_check_avx512dq
+
+static bool simd_check_avx512bw(void)
+{
+    return cpu_has_avx512bw;
+}
+#define simd_check_avx512bw_opmask simd_check_avx512bw
+
 static void simd_set_regs(struct cpu_user_regs *regs)
 {
     if ( cpu_has_mmx )
@@ -223,6 +244,10 @@ static const struct {
     SIMD(XOP i16x16,              xop,      32i2),
     SIMD(XOP i32x8,               xop,      32i4),
     SIMD(XOP i64x4,               xop,      32i8),
+    SIMD(OPMASK/w,     avx512f_opmask,         2),
+    SIMD(OPMASK/b,    avx512dq_opmask,         1),
+    SIMD(OPMASK/d,    avx512bw_opmask,         4),
+    SIMD(OPMASK/q,    avx512bw_opmask,         8),
 #undef SIMD_
 #undef SIMD
 };
@@ -3426,8 +3451,8 @@ int main(int argc, char **argv)
             rc = x86_emulate(&ctxt, &emulops);
             if ( rc != X86EMUL_OKAY )
             {
-                printf("failed at %%eip == %08lx (opcode %08x)\n",
-                       (unsigned long)regs.eip, ctxt.opcode);
+                printf("failed (%d) at %%eip == %08lx (opcode %08x)\n",
+                       rc, (unsigned long)regs.eip, ctxt.opcode);
                 return 1;
             }
         }
--- a/tools/tests/x86_emulator/testcase.mk
+++ b/tools/tests/x86_emulator/testcase.mk
@@ -14,3 +14,9 @@ all: $(TESTCASE).bin
 	$(LD) $(LDFLAGS_DIRECT) -N -Ttext 0x100000 -o $*.tmp $*.o
 	$(OBJCOPY) -O binary $*.tmp $@
 	rm -f $*.tmp
+
+%-opmask.bin: opmask.S
+	$(CC) $(filter-out -M% .%,$(CFLAGS)) -c $< -o $(basename $@).o
+	$(LD) $(LDFLAGS_DIRECT) -N -Ttext 0x100000 -o $(basename $@).tmp $(basename $@).o
+	$(OBJCOPY) -O binary $(basename $@).tmp $@
+	rm -f $(basename $@).tmp
--- a/tools/tests/x86_emulator/x86-emulate.c
+++ b/tools/tests/x86_emulator/x86-emulate.c
@@ -209,6 +209,9 @@ int emul_test_get_fpu(
     case X86EMUL_FPU_ymm:
         if ( cpu_has_avx )
             break;
+    case X86EMUL_FPU_opmask:
+        if ( cpu_has_avx512f )
+            break;
     default:
         return X86EMUL_UNHANDLEABLE;
     }
--- a/tools/tests/x86_emulator/x86-emulate.h
+++ b/tools/tests/x86_emulator/x86-emulate.h
@@ -236,6 +236,36 @@ static inline uint64_t xgetbv(uint32_t x
     (res.c & (1U << 21)) != 0; \
 })
 
+#define cpu_has_avx512f ({ \
+    struct cpuid_leaf res; \
+    emul_test_cpuid(1, 0, &res, NULL); \
+    if ( !(res.c & (1U << 27)) || ((xgetbv(0) & 0xe6) != 0xe6) ) \
+        res.b = 0; \
+    else \
+        emul_test_cpuid(7, 0, &res, NULL); \
+    (res.b & (1U << 16)) != 0; \
+})
+
+#define cpu_has_avx512dq ({ \
+    struct cpuid_leaf res; \
+    emul_test_cpuid(1, 0, &res, NULL); \
+    if ( !(res.c & (1U << 27)) || ((xgetbv(0) & 0xe6) != 0xe6) ) \
+        res.b = 0; \
+    else \
+        emul_test_cpuid(7, 0, &res, NULL); \
+    (res.b & (1U << 17)) != 0; \
+})
+
+#define cpu_has_avx512bw ({ \
+    struct cpuid_leaf res; \
+    emul_test_cpuid(1, 0, &res, NULL); \
+    if ( !(res.c & (1U << 27)) || ((xgetbv(0) & 0xe6) != 0xe6) ) \
+        res.b = 0; \
+    else \
+        emul_test_cpuid(7, 0, &res, NULL); \
+    (res.b & (1U << 30)) != 0; \
+})
+
 int emul_test_cpuid(
     uint32_t leaf,
     uint32_t subleaf,
--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -491,6 +491,7 @@ static const struct ext0f3a_table {
     [0x20] = { .simd_size = simd_none },
     [0x21] = { .simd_size = simd_other },
     [0x22] = { .simd_size = simd_none },
+    [0x30 ... 0x33] = { .simd_size = simd_other, .two_op = 1 },
     [0x38] = { .simd_size = simd_128 },
     [0x39] = { .simd_size = simd_128, .to_mem = 1, .two_op = 1 },
     [0x40 ... 0x41] = { .simd_size = simd_packed_fp },
@@ -1187,6 +1188,11 @@ static int _get_fpu(
             return X86EMUL_UNHANDLEABLE;
         break;
 
+    case X86EMUL_FPU_opmask:
+        if ( !(xcr0 & X86_XCR0_SSE) || !(xcr0 & X86_XCR0_OPMASK) )
+            return X86EMUL_UNHANDLEABLE;
+        break;
+
     default:
         break;
     }
@@ -1762,12 +1768,15 @@ static bool vcpu_has(
 #define vcpu_has_bmi2()        vcpu_has(         7, EBX,  8, ctxt, ops)
 #define vcpu_has_rtm()         vcpu_has(         7, EBX, 11, ctxt, ops)
 #define vcpu_has_mpx()         vcpu_has(         7, EBX, 14, ctxt, ops)
+#define vcpu_has_avx512f()     vcpu_has(         7, EBX, 16, ctxt, ops)
+#define vcpu_has_avx512dq()    vcpu_has(         7, EBX, 17, ctxt, ops)
 #define vcpu_has_rdseed()      vcpu_has(         7, EBX, 18, ctxt, ops)
 #define vcpu_has_adx()         vcpu_has(         7, EBX, 19, ctxt, ops)
 #define vcpu_has_smap()        vcpu_has(         7, EBX, 20, ctxt, ops)
 #define vcpu_has_clflushopt()  vcpu_has(         7, EBX, 23, ctxt, ops)
 #define vcpu_has_clwb()        vcpu_has(         7, EBX, 24, ctxt, ops)
 #define vcpu_has_sha()         vcpu_has(         7, EBX, 29, ctxt, ops)
+#define vcpu_has_avx512bw()    vcpu_has(         7, EBX, 30, ctxt, ops)
 #define vcpu_has_rdpid()       vcpu_has(         7, ECX, 22, ctxt, ops)
 #define vcpu_has_clzero()      vcpu_has(0x80000008, EBX,  0, ctxt, ops)
 
@@ -2396,6 +2405,18 @@ x86_decode_twobyte(
         }
         break;
 
+    case X86EMUL_OPC_VEX(0, 0x90):    /* kmov{w,q} */
+    case X86EMUL_OPC_VEX_66(0, 0x90): /* kmov{b,d} */
+        state->desc = DstReg | SrcMem | Mov;
+        state->simd_size = simd_other;
+        break;
+
+    case X86EMUL_OPC_VEX(0, 0x91):    /* kmov{w,q} */
+    case X86EMUL_OPC_VEX_66(0, 0x91): /* kmov{b,d} */
+        state->desc = DstMem | SrcReg | Mov;
+        state->simd_size = simd_other;
+        break;
+
     case 0xae:
         ctxt->opcode |= MASK_INSR(vex.pfx, X86EMUL_OPC_PFX_MASK);
         /* fall through */
@@ -6002,6 +6023,60 @@ x86_emulate(
             dst.val = src.val;
         break;
 
+    case X86EMUL_OPC_VEX(0x0f, 0x4a):    /* kadd{w,q} k,k,k */
+        if ( !vex.w )
+            host_and_vcpu_must_have(avx512dq);
+        /* fall through */
+    case X86EMUL_OPC_VEX(0x0f, 0x41):    /* kand{w,q} k,k,k */
+    case X86EMUL_OPC_VEX_66(0x0f, 0x41): /* kand{b,d} k,k,k */
+    case X86EMUL_OPC_VEX(0x0f, 0x42):    /* kandn{w,q} k,k,k */
+    case X86EMUL_OPC_VEX_66(0x0f, 0x42): /* kandn{b,d} k,k,k */
+    case X86EMUL_OPC_VEX(0x0f, 0x45):    /* kor{w,q} k,k,k */
+    case X86EMUL_OPC_VEX_66(0x0f, 0x45): /* kor{b,d} k,k,k */
+    case X86EMUL_OPC_VEX(0x0f, 0x46):    /* kxnor{w,q} k,k,k */
+    case X86EMUL_OPC_VEX_66(0x0f, 0x46): /* kxnor{b,d} k,k,k */
+    case X86EMUL_OPC_VEX(0x0f, 0x47):    /* kxor{w,q} k,k,k */
+    case X86EMUL_OPC_VEX_66(0x0f, 0x47): /* kxor{b,d} k,k,k */
+    case X86EMUL_OPC_VEX_66(0x0f, 0x4a): /* kadd{b,d} k,k,k */
+        generate_exception_if(!vex.l, EXC_UD);
+    opmask_basic:
+        if ( vex.w )
+            host_and_vcpu_must_have(avx512bw);
+        else if ( vex.pfx )
+            host_and_vcpu_must_have(avx512dq);
+    opmask_common:
+        host_and_vcpu_must_have(avx512f);
+        generate_exception_if(!vex.r || (mode_64bit() && !(vex.reg & 8)) ||
+                              ea.type != OP_REG, EXC_UD);
+
+        vex.reg |= 8;
+        d &= ~TwoOp;
+
+        get_fpu(X86EMUL_FPU_opmask);
+
+        opc = init_prefixes(stub);
+        opc[0] = b;
+        opc[1] = modrm;
+        insn_bytes = PFX_BYTES + 2;
+
+        state->simd_size = simd_other;
+        op_bytes = 1; /* Any non-zero value will do. */
+        break;
+
+    case X86EMUL_OPC_VEX(0x0f, 0x44):    /* knot{w,q} k,k */
+    case X86EMUL_OPC_VEX_66(0x0f, 0x44): /* knot{b,d} k,k */
+        generate_exception_if(vex.l || vex.reg != 0xf, EXC_UD);
+        goto opmask_basic;
+
+    case X86EMUL_OPC_VEX(0x0f, 0x4b):    /* kunpck{w,d}{d,q} k,k,k */
+        generate_exception_if(!vex.l, EXC_UD);
+        host_and_vcpu_must_have(avx512bw);
+        goto opmask_common;
+
+    case X86EMUL_OPC_VEX_66(0x0f, 0x4b): /* kunpckbw k,k,k */
+        generate_exception_if(!vex.l || vex.w, EXC_UD);
+        goto opmask_common;
+
     CASE_SIMD_PACKED_FP(, 0x0f, 0x50):     /* movmskp{s,d} xmm,reg */
     CASE_SIMD_PACKED_FP(_VEX, 0x0f, 0x50): /* vmovmskp{s,d} {x,y}mm,reg */
     CASE_SIMD_PACKED_INT(0x0f, 0xd7):      /* pmovmskb {,x}mm,reg */
@@ -6552,6 +6627,154 @@ x86_emulate(
         dst.val = test_cc(b, _regs.eflags);
         break;
 
+    case X86EMUL_OPC_VEX(0x0f, 0x91):    /* kmov{w,q} k,mem */
+    case X86EMUL_OPC_VEX_66(0x0f, 0x91): /* kmov{b,d} k,mem */
+        generate_exception_if(ea.type != OP_MEM, EXC_UD);
+        /* fall through */
+    case X86EMUL_OPC_VEX(0x0f, 0x90):    /* kmov{w,q} k/mem,k */
+    case X86EMUL_OPC_VEX_66(0x0f, 0x90): /* kmov{b,d} k/mem,k */
+        generate_exception_if(vex.l || !vex.r, EXC_UD);
+        host_and_vcpu_must_have(avx512f);
+        if ( vex.w )
+        {
+            host_and_vcpu_must_have(avx512bw);
+            op_bytes = 4 << !vex.pfx;
+        }
+        else if ( vex.pfx )
+        {
+            host_and_vcpu_must_have(avx512dq);
+            op_bytes = 1;
+        }
+        else
+            op_bytes = 2;
+
+        get_fpu(X86EMUL_FPU_opmask);
+
+        opc = init_prefixes(stub);
+        opc[0] = b;
+        opc[1] = modrm;
+        if ( ea.type == OP_MEM )
+        {
+            /* convert memory operand to (%rAX) */
+            vex.b = 1;
+            opc[1] &= 0x38;
+        }
+        insn_bytes = PFX_BYTES + 2;
+        break;
+
+    case X86EMUL_OPC_VEX(0x0f, 0x92):    /* kmovw r32,k */
+    case X86EMUL_OPC_VEX_66(0x0f, 0x92): /* kmovb r32,k */
+    case X86EMUL_OPC_VEX_F2(0x0f, 0x92): /* kmov{d,q} reg,k */
+        generate_exception_if(vex.l || !vex.r || vex.reg != 0xf ||
+                              ea.type != OP_REG, EXC_UD);
+
+        host_and_vcpu_must_have(avx512f);
+        if ( vex.pfx == vex_f2 )
+            host_and_vcpu_must_have(avx512bw);
+        else
+        {
+            generate_exception_if(vex.w, EXC_UD);
+            if ( vex.pfx )
+                host_and_vcpu_must_have(avx512dq);
+        }
+
+        get_fpu(X86EMUL_FPU_opmask);
+
+        opc = init_prefixes(stub);
+        opc[0] = b;
+        /* Convert GPR source to %rAX. */
+        vex.b = 1;
+        if ( !mode_64bit() )
+            vex.w = 0;
+        opc[1] = modrm & 0xf8;
+        opc[2] = 0xc3;
+
+        copy_VEX(opc, vex);
+        ea.reg = decode_gpr(&_regs, modrm_rm);
+        invoke_stub("", "", "=m" (dummy) : "a" (*ea.reg));
+
+        put_stub(stub);
+
+        ASSERT(!state->simd_size);
+        dst.type = OP_NONE;
+        break;
+
+    case X86EMUL_OPC_VEX(0x0f, 0x93):    /* kmovw k,r32 */
+    case X86EMUL_OPC_VEX_66(0x0f, 0x93): /* kmovb k,r32 */
+    case X86EMUL_OPC_VEX_F2(0x0f, 0x93): /* kmov{d,q} k,reg */
+        generate_exception_if(vex.l || vex.reg != 0xf || ea.type != OP_REG,
+                              EXC_UD);
+        dst = ea;
+        dst.reg = decode_gpr(&_regs, modrm_reg);
+
+        host_and_vcpu_must_have(avx512f);
+        if ( vex.pfx == vex_f2 )
+        {
+            host_and_vcpu_must_have(avx512bw);
+            dst.bytes = 4 << (mode_64bit() && vex.w);
+        }
+        else
+        {
+            generate_exception_if(vex.w, EXC_UD);
+            dst.bytes = 4;
+            if ( vex.pfx )
+                host_and_vcpu_must_have(avx512dq);
+        }
+
+        get_fpu(X86EMUL_FPU_opmask);
+
+        opc = init_prefixes(stub);
+        opc[0] = b;
+        /* Convert GPR destination to %rAX. */
+        vex.r = 1;
+        if ( !mode_64bit() )
+            vex.w = 0;
+        opc[1] = modrm & 0xc7;
+        opc[2] = 0xc3;
+
+        copy_VEX(opc, vex);
+        invoke_stub("", "", "=a" (dst.val) : [dummy] "i" (0));
+
+        put_stub(stub);
+
+        ASSERT(!state->simd_size);
+        break;
+
+    case X86EMUL_OPC_VEX(0x0f, 0x99):    /* ktest{w,q} k,k */
+        if ( !vex.w )
+            host_and_vcpu_must_have(avx512dq);
+        /* fall through */
+    case X86EMUL_OPC_VEX(0x0f, 0x98):    /* kortest{w,q} k,k */
+    case X86EMUL_OPC_VEX_66(0x0f, 0x98): /* kortest{b,d} k,k */
+    case X86EMUL_OPC_VEX_66(0x0f, 0x99): /* ktest{b,d} k,k */
+        generate_exception_if(vex.l || !vex.r || vex.reg != 0xf ||
+                              ea.type != OP_REG, EXC_UD);
+        host_and_vcpu_must_have(avx512f);
+        if ( vex.w )
+            host_and_vcpu_must_have(avx512bw);
+        else if ( vex.pfx )
+            host_and_vcpu_must_have(avx512dq);
+
+        get_fpu(X86EMUL_FPU_opmask);
+
+        opc = init_prefixes(stub);
+        opc[0] = b;
+        opc[1] = modrm;
+        opc[2] = 0xc3;
+
+        copy_VEX(opc, vex);
+        invoke_stub(_PRE_EFLAGS("[eflags]", "[mask]", "[tmp]"),
+                    _POST_EFLAGS("[eflags]", "[mask]", "[tmp]"),
+                    [eflags] "+g" (_regs.eflags),
+                    "=a" (dst.val), [tmp] "=&r" (dummy)
+                    : [mask] "i" (EFLAGS_MASK));
+
+        put_stub(stub);
+
+        ASSERT(!state->simd_size);
+        dst.type = OP_NONE;
+        break;
+
     case X86EMUL_OPC(0x0f, 0xa2): /* cpuid */
         msr_val = 0;
         fail_if(ops->cpuid == NULL);
@@ -8170,6 +8393,23 @@ x86_emulate(
         generate_exception_if(vex.l, EXC_UD);
         goto simd_0f_imm8_avx;
 
+    case X86EMUL_OPC_VEX_66(0x0f3a, 0x30): /* kshiftr{b,w} $imm8,k,k */
+    case X86EMUL_OPC_VEX_66(0x0f3a, 0x32): /* kshiftl{b,w} $imm8,k,k */
+        if ( !vex.w )
+            host_and_vcpu_must_have(avx512dq);
+    opmask_shift_imm:
+        generate_exception_if(vex.l || !vex.r || vex.reg != 0xf ||
+                              ea.type != OP_REG, EXC_UD);
+        host_and_vcpu_must_have(avx512f);
+        get_fpu(X86EMUL_FPU_opmask);
+        op_bytes = 1; /* Any non-zero value will do. */
+        goto simd_0f_imm8;
+
+    case X86EMUL_OPC_VEX_66(0x0f3a, 0x31): /* kshiftr{d,q} $imm8,k,k */
+    case X86EMUL_OPC_VEX_66(0x0f3a, 0x33): /* kshiftl{d,q} $imm8,k,k */
+        host_and_vcpu_must_have(avx512bw);
+        goto opmask_shift_imm;
+
     case X86EMUL_OPC_66(0x0f3a, 0x44):     /* pclmulqdq $imm8,xmm/m128,xmm */
     case X86EMUL_OPC_VEX_66(0x0f3a, 0x44): /* vpclmulqdq $imm8,xmm/m128,xmm,xmm */
         host_and_vcpu_must_have(pclmulqdq);
--- a/xen/arch/x86/x86_emulate/x86_emulate.h
+++ b/xen/arch/x86/x86_emulate/x86_emulate.h
@@ -170,6 +170,7 @@ enum x86_emulate_fpu_type {
     X86EMUL_FPU_mmx, /* MMX instruction set (%mm0-%mm7) */
     X86EMUL_FPU_xmm, /* SSE instruction set (%xmm0-%xmm7/15) */
     X86EMUL_FPU_ymm, /* AVX/XOP instruction set (%ymm0-%ymm7/15) */
+    X86EMUL_FPU_opmask, /* AVX512 opmask instruction set (%k0-%k7) */
     /* This sentinel will never be passed to ->get_fpu(). */
     X86EMUL_FPU_none
 };
--- a/xen/include/asm-x86/cpufeature.h
+++ b/xen/include/asm-x86/cpufeature.h
@@ -99,9 +99,12 @@
 #define cpu_has_rtm             boot_cpu_has(X86_FEATURE_RTM)
 #define cpu_has_fpu_sel         (!boot_cpu_has(X86_FEATURE_NO_FPU_SEL))
 #define cpu_has_mpx             boot_cpu_has(X86_FEATURE_MPX)
+#define cpu_has_avx512f         boot_cpu_has(X86_FEATURE_AVX512F)
+#define cpu_has_avx512dq        boot_cpu_has(X86_FEATURE_AVX512DQ)
 #define cpu_has_rdseed          boot_cpu_has(X86_FEATURE_RDSEED)
 #define cpu_has_smap            boot_cpu_has(X86_FEATURE_SMAP)
 #define cpu_has_sha             boot_cpu_has(X86_FEATURE_SHA)
+#define cpu_has_avx512bw        boot_cpu_has(X86_FEATURE_AVX512BW)
 
 /* CPUID level 0x80000007.edx */
 #define cpu_has_itsc            boot_cpu_has(X86_FEATURE_ITSC)




_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

^ permalink raw reply	[flat|nested] 465+ messages in thread

* [PATCH v3 02/34] x86/HVM: grow MMIO cache data size to 64 bytes
  2018-09-18 11:46 ` [PATCH v3 00/34] x86emul: fixes, improvements, and beginnings of AVX512 support Jan Beulich
  2018-09-18 11:53   ` [PATCH v3 01/34] x86emul: support AVX512 opmask insns Jan Beulich
@ 2018-09-18 11:53   ` Jan Beulich
  2018-09-18 16:05     ` Paul Durrant
  2018-10-25 18:36     ` Andrew Cooper
  2018-09-18 11:55   ` [PATCH v3 03/34] x86emul: correct EVEX decoding Jan Beulich
                     ` (31 subsequent siblings)
  33 siblings, 2 replies; 465+ messages in thread
From: Jan Beulich @ 2018-09-18 11:53 UTC (permalink / raw)
  To: xen-devel; +Cc: George Dunlap, Andrew Cooper, Paul Durrant, Wei Liu

This is needed before enabling any AVX512 insns in the emulator. Change
the way alignment is enforced at the same time.

Add a check that the buffer won't actually overflow, and while at it
also convert the BUG_ON() checking that accesses don't cross page
boundaries into ASSERT_UNREACHABLE() plus an error return.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v3: New.

--- a/xen/arch/x86/hvm/emulate.c
+++ b/xen/arch/x86/hvm/emulate.c
@@ -866,7 +866,18 @@ static int hvmemul_phys_mmio_access(
     int rc = X86EMUL_OKAY;
 
     /* Accesses must fall within a page. */
-    BUG_ON((gpa & ~PAGE_MASK) + size > PAGE_SIZE);
+    if ( (gpa & ~PAGE_MASK) + size > PAGE_SIZE )
+    {
+        ASSERT_UNREACHABLE();
+        return X86EMUL_UNHANDLEABLE;
+    }
+
+    /* Accesses must not overflow the cache's buffer. */
+    if ( size > sizeof(cache->buffer) )
+    {
+        ASSERT_UNREACHABLE();
+        return X86EMUL_UNHANDLEABLE;
+    }
 
     /*
      * hvmemul_do_io() cannot handle non-power-of-2 accesses or
--- a/xen/include/asm-x86/hvm/vcpu.h
+++ b/xen/include/asm-x86/hvm/vcpu.h
@@ -42,15 +42,14 @@ struct hvm_vcpu_asid {
 };
 
 /*
- * We may read or write up to m256 as a number of device-model
+ * We may read or write up to m512 as a number of device-model
  * transactions.
  */
 struct hvm_mmio_cache {
     unsigned long gla;
     unsigned int size;
     uint8_t dir;
-    uint8_t pad[3]; /* make buffer[] long-aligned */
-    uint8_t buffer[32];
+    uint8_t buffer[64] __aligned(sizeof(long));
 };
 
 struct hvm_vcpu_io {





_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

^ permalink raw reply	[flat|nested] 465+ messages in thread

* [PATCH v3 03/34] x86emul: correct EVEX decoding
  2018-09-18 11:46 ` [PATCH v3 00/34] x86emul: fixes, improvements, and beginnings of AVX512 support Jan Beulich
  2018-09-18 11:53   ` [PATCH v3 01/34] x86emul: support AVX512 opmask insns Jan Beulich
  2018-09-18 11:53   ` [PATCH v3 02/34] x86/HVM: grow MMIO cache data size to 64 bytes Jan Beulich
@ 2018-09-18 11:55   ` Jan Beulich
  2018-09-18 11:55   ` [PATCH v3 04/34] x86emul: generalize vector length handling for AVX512/EVEX Jan Beulich
                     ` (30 subsequent siblings)
  33 siblings, 0 replies; 465+ messages in thread
From: Jan Beulich @ 2018-09-18 11:55 UTC (permalink / raw)
  To: xen-devel; +Cc: George Dunlap, Andrew Cooper, Wei Liu

Fix an inverted pair of checks, drop an incorrect instance of #UD
raising for non-64-bit mode, and add further generic checks.

Note: Contrary to what SDM Vol 2 rev 067 states, EVEX.V' is _not_ ignored
      outside of 64-bit mode when the field does not encode a register.
      Just like EVEX.VVVV is required to be 0b1111 in that case, EVEX.V'
      is required to be 1 there.

Also rename the bcst field to br, as #UD generation for individual insns
will need to consider both of its possible meanings.

Signed-off-by: Jan Beulich <jbeulich@suse.com>

--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -650,7 +650,7 @@ union evex {
         uint8_t w:1;
         uint8_t opmsk:3;
         uint8_t RX:1;
-        uint8_t bcst:1;
+        uint8_t br:1;
         uint8_t lr:2;
         uint8_t z:1;
     };
@@ -2760,13 +2760,11 @@ x86_decode(
                         evex.raw[1] = vex.raw[1];
                         evex.raw[2] = insn_fetch_type(uint8_t);
 
-                        generate_exception_if(evex.mbs || !evex.mbz, EXC_UD);
+                        generate_exception_if(!evex.mbs || evex.mbz, EXC_UD);
+                        generate_exception_if(!evex.opmsk && evex.z, EXC_UD);
 
                         if ( !mode_64bit() )
-                        {
-                            generate_exception_if(!evex.RX, EXC_UD);
                             evex.R = 1;
-                        }
 
                         vex.opcx = evex.opcx;
                         break;
@@ -3404,6 +3402,7 @@ x86_emulate(
         d = (d & ~DstMask) | DstMem;
         /* Becomes a normal DstMem operation from here on. */
     case DstMem:
+        generate_exception_if(ea.type == OP_MEM && evex.z, EXC_UD);
         if ( state->simd_size )
         {
             generate_exception_if(lock_prefix, EXC_UD);





_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

^ permalink raw reply	[flat|nested] 465+ messages in thread

* [PATCH v3 04/34] x86emul: generalize vector length handling for AVX512/EVEX
  2018-09-18 11:46 ` [PATCH v3 00/34] x86emul: fixes, improvements, and beginnings of AVX512 support Jan Beulich
                     ` (2 preceding siblings ...)
  2018-09-18 11:55   ` [PATCH v3 03/34] x86emul: correct EVEX decoding Jan Beulich
@ 2018-09-18 11:55   ` Jan Beulich
  2018-09-18 11:56   ` [PATCH v3 05/34] x86emul: support basic AVX512 moves Jan Beulich
                     ` (29 subsequent siblings)
  33 siblings, 0 replies; 465+ messages in thread
From: Jan Beulich @ 2018-09-18 11:55 UTC (permalink / raw)
  To: xen-devel; +Cc: George Dunlap, Andrew Cooper, Wei Liu

To allow for some code sharing where possible, copy VEX.L into EVEX.LR
even for VEX (or XOP) encoded insns. Make operand size determination
use this right away, at the same time adding consistency checks for the
EVEX scalar insn cases (the non-scalar ones aren't uniform enough for
the checking to be done in a central place like this).

Note that the broadcast case is not handled here, but will be taken care
of elsewhere (in just a single place rather than at least two).

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v3: Introduce evex_encoded() to replace open-coded evex.mbs checks.
v2: Don't raise #UD in simd_scalar_opc case when EVEX.W != low-opcode-bit.

--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -191,14 +191,14 @@ enum simd_opsize {
      * Ordinary packed integers:
      * - 64 bits without prefix 66 (MMX)
      * - 128 bits with prefix 66 (SSEn)
-     * - 128/256 bits depending on VEX.L (AVX)
+     * - 128/256/512 bits depending on VEX.L/EVEX.LR (AVX+)
      */
     simd_packed_int,
 
     /*
      * Ordinary packed/scalar floating point:
      * - 128 bits without prefix or with prefix 66 (SSEn)
-     * - 128/256 bits depending on VEX.L (AVX)
+     * - 128/256/512 bits depending on VEX.L/EVEX.LR (AVX+)
      * - 32 bits with prefix F3 (scalar single)
      * - 64 bits with prefix F2 (scalar doubgle)
      */
@@ -207,14 +207,14 @@ enum simd_opsize {
     /*
      * Packed floating point:
      * - 128 bits without prefix or with prefix 66 (SSEn)
-     * - 128/256 bits depending on VEX.L (AVX)
+     * - 128/256/512 bits depending on VEX.L/EVEX.LR (AVX+)
      */
     simd_packed_fp,
 
     /*
      * Single precision packed/scalar floating point:
      * - 128 bits without prefix (SSEn)
-     * - 128/256 bits depending on VEX.L, no prefix (AVX)
+     * - 128/256/512 bits depending on VEX.L/EVEX.LR (AVX+)
      * - 32 bits with prefix F3 (scalar)
      */
     simd_single_fp,
@@ -228,7 +228,7 @@ enum simd_opsize {
 
     /*
      * Scalar floating point:
-     * - 32/64 bits depending on VEX.W
+     * - 32/64 bits depending on VEX.W/EVEX.W
      */
     simd_scalar_vexw,
 
@@ -2249,6 +2249,7 @@ int x86emul_unhandleable_rw(
 #define lock_prefix (state->lock_prefix)
 #define vex (state->vex)
 #define evex (state->evex)
+#define evex_encoded() (evex.mbs)
 #define ea (state->ea)
 
 static int
@@ -2818,6 +2819,9 @@ x86_decode(
 
                 opcode |= b | MASK_INSR(vex.pfx, X86EMUL_OPC_PFX_MASK);
 
+                if ( !evex_encoded() )
+                    evex.lr = vex.l;
+
                 if ( !(d & ModRM) )
                     break;
 
@@ -3148,7 +3152,7 @@ x86_decode(
             }
             /* fall through */
         case vex_66:
-            op_bytes = 16 << vex.l;
+            op_bytes = 16 << evex.lr;
             break;
         default:
             op_bytes = 0;
@@ -3172,9 +3176,17 @@ x86_decode(
     case simd_any_fp:
         switch ( vex.pfx )
         {
-        default:     op_bytes = 16 << vex.l; break;
-        case vex_f3: op_bytes = 4;           break;
-        case vex_f2: op_bytes = 8;           break;
+        default:
+            op_bytes = 16 << evex.lr;
+            break;
+        case vex_f3:
+            generate_exception_if(evex_encoded() && evex.w, EXC_UD);
+            op_bytes = 4;
+            break;
+        case vex_f2:
+            generate_exception_if(evex_encoded() && !evex.w, EXC_UD);
+            op_bytes = 8;
+            break;
         }
         break;
 





_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

^ permalink raw reply	[flat|nested] 465+ messages in thread

* [PATCH v3 05/34] x86emul: support basic AVX512 moves
  2018-09-18 11:46 ` [PATCH v3 00/34] x86emul: fixes, improvements, and beginnings of AVX512 support Jan Beulich
                     ` (3 preceding siblings ...)
  2018-09-18 11:55   ` [PATCH v3 04/34] x86emul: generalize vector length handling for AVX512/EVEX Jan Beulich
@ 2018-09-18 11:56   ` Jan Beulich
  2018-09-18 11:57   ` [PATCH v3 06/34] x86emul: test for correct EVEX Disp8 scaling Jan Beulich
                     ` (28 subsequent siblings)
  33 siblings, 0 replies; 465+ messages in thread
From: Jan Beulich @ 2018-09-18 11:56 UTC (permalink / raw)
  To: xen-devel; +Cc: George Dunlap, Andrew Cooper, Wei Liu

Note: SDM Vol 2 rev 067 is not really consistent about EVEX.L'L for LIG
      insns - the only place where this is made explicit is a table in
      the section titled "Vector Length Orthogonality": While such insns
      tolerate 0, 1, and 2, a value of 3 uniformly leads to #UD.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v3: Restrict k-reg reading to insns with memory operand. Shrink scope of
    "disp8scale".
v2: Move "full" into more narrow scope.

--- a/tools/tests/x86_emulator/test_x86_emulator.c
+++ b/tools/tests/x86_emulator/test_x86_emulator.c
@@ -1985,6 +1985,53 @@ int main(int argc, char **argv)
     else
         printf("skipped\n");
 
+    printf("%-40s", "Testing {evex} vmovq %xmm1,32(%edx)...");
+    if ( stack_exec && cpu_has_avx512f )
+    {
+        decl_insn(evex_vmovq_to_mem);
+
+        asm volatile ( "pcmpgtb %%xmm1, %%xmm1\n"
+                       put_insn(evex_vmovq_to_mem, "%{evex%} vmovq %%xmm1, 32(%0)")
+                       :: "d" (NULL) );
+
+        memset(res, 0xdb, 64);
+        set_insn(evex_vmovq_to_mem);
+        regs.ecx = 0;
+        regs.edx = (unsigned long)res;
+        rc = x86_emulate(&ctxt, &emulops);
+        if ( rc != X86EMUL_OKAY || !check_eip(evex_vmovq_to_mem) ||
+             *((uint64_t *)res + 4) ||
+             memcmp(res, res + 10, 24) ||
+             memcmp(res, res + 6, 8) )
+            goto fail;
+        printf("okay\n");
+    }
+    else
+        printf("skipped\n");
+
+    printf("%-40s", "Testing {evex} vmovq 32(%edx),%xmm0...");
+    if ( stack_exec && cpu_has_avx512f )
+    {
+        decl_insn(evex_vmovq_from_mem);
+
+        asm volatile ( "pcmpeqb %%xmm0, %%xmm0\n"
+                       put_insn(evex_vmovq_from_mem, "%{evex%} vmovq 32(%0), %%xmm0")
+                       :: "d" (NULL) );
+
+        set_insn(evex_vmovq_from_mem);
+        rc = x86_emulate(&ctxt, &emulops);
+        if ( rc != X86EMUL_OKAY || !check_eip(evex_vmovq_from_mem) )
+            goto fail;
+        asm ( "vmovq %1, %%xmm1\n\t"
+              "vpcmpeqq %%zmm0, %%zmm1, %%k0\n"
+              "kmovw %%k0, %0" : "=r" (rc) : "m" (res[8]) );
+        if ( rc != 0xff )
+            goto fail;
+        printf("okay\n");
+    }
+    else
+        printf("skipped\n");
+
     printf("%-40s", "Testing movdqu %xmm2,(%ecx)...");
     if ( stack_exec && cpu_has_sse2 )
     {
@@ -2085,6 +2132,118 @@ int main(int argc, char **argv)
     else
         printf("skipped\n");
 
+    printf("%-40s", "Testing vmovdqu32 %zmm2,(%ecx){%k1}...");
+    if ( stack_exec && cpu_has_avx512f )
+    {
+        decl_insn(vmovdqu32_to_mem);
+
+        memset(res, 0x55, 128);
+
+        asm volatile ( "vpcmpeqd %%ymm2, %%ymm2, %%ymm2\n\t"
+                       "kmovw %1,%%k1\n"
+                       put_insn(vmovdqu32_to_mem,
+                                "vmovdqu32 %%zmm2, (%0)%{%%k1%}")
+                       :: "c" (NULL), "rm" (res[0]) );
+        set_insn(vmovdqu32_to_mem);
+
+        regs.ecx = (unsigned long)res;
+        rc = x86_emulate(&ctxt, &emulops);
+        if ( (rc != X86EMUL_OKAY) || memcmp(res + 16, res + 24, 32) ||
+             !check_eip(vmovdqu32_to_mem) )
+            goto fail;
+
+        res[16] = ~0; res[18] = ~0; res[20] = ~0; res[22] = ~0;
+        res[24] =  0; res[26] =  0; res[28] =  0; res[30] =  0;
+        if ( memcmp(res, res + 16, 64) )
+            goto fail;
+
+        printf("okay\n");
+    }
+    else
+        printf("skipped\n");
+
+    printf("%-40s", "Testing vmovdqu32 64(%edx),%zmm2{%k2}...");
+    if ( stack_exec && cpu_has_avx512f )
+    {
+        decl_insn(vmovdqu32_from_mem);
+
+        asm volatile ( "knotw %%k1, %%k2\n"
+                       put_insn(vmovdqu32_from_mem,
+                                "vmovdqu32 64(%0), %%zmm2%{%%k2%}")
+                       :: "d" (NULL) );
+
+        set_insn(vmovdqu32_from_mem);
+        regs.ecx = 0;
+        regs.edx = (unsigned long)res;
+        rc = x86_emulate(&ctxt, &emulops);
+        if ( rc != X86EMUL_OKAY || !check_eip(vmovdqu32_from_mem) )
+            goto fail;
+        asm ( "vpcmpeqd %1, %%zmm2, %%k0\n\t"
+              "kmovw %%k0, %0" : "=r" (rc) : "m" (res[0]) );
+        if ( rc != 0xffff )
+            goto fail;
+        printf("okay\n");
+    }
+    else
+        printf("skipped\n");
+
+    printf("%-40s", "Testing vmovdqu16 %zmm3,(%ecx){%k1}...");
+    if ( stack_exec && cpu_has_avx512bw )
+    {
+        decl_insn(vmovdqu16_to_mem);
+
+        memset(res, 0x55, 128);
+
+        asm volatile ( "vpcmpeqw %%ymm3, %%ymm3, %%ymm3\n\t"
+                       "kmovd %1,%%k1\n"
+                       put_insn(vmovdqu16_to_mem,
+                                "vmovdqu16 %%zmm3, (%0)%{%%k1%}")
+                       :: "c" (NULL), "rm" (res[0]) );
+        set_insn(vmovdqu16_to_mem);
+
+        regs.ecx = (unsigned long)res;
+        rc = x86_emulate(&ctxt, &emulops);
+        if ( (rc != X86EMUL_OKAY) || memcmp(res + 16, res + 24, 32) ||
+             !check_eip(vmovdqu16_to_mem) )
+            goto fail;
+
+        for ( i = 16; i < 24; ++i )
+            res[i] |= 0x0000ffff;
+        for ( ; i < 32; ++i )
+            res[i] &= 0xffff0000;
+        if ( memcmp(res, res + 16, 64) )
+            goto fail;
+
+        printf("okay\n");
+    }
+    else
+        printf("skipped\n");
+
+    printf("%-40s", "Testing vmovdqu16 64(%edx),%zmm3{%k2}...");
+    if ( stack_exec && cpu_has_avx512bw )
+    {
+        decl_insn(vmovdqu16_from_mem);
+
+        asm volatile ( "knotd %%k1, %%k2\n"
+                       put_insn(vmovdqu16_from_mem,
+                                "vmovdqu16 64(%0), %%zmm3%{%%k2%}")
+                       :: "d" (NULL) );
+
+        set_insn(vmovdqu16_from_mem);
+        regs.ecx = 0;
+        regs.edx = (unsigned long)res;
+        rc = x86_emulate(&ctxt, &emulops);
+        if ( rc != X86EMUL_OKAY || !check_eip(vmovdqu16_from_mem) )
+            goto fail;
+        asm ( "vpcmpeqw %1, %%zmm3, %%k0\n\t"
+              "kmovd %%k0, %0" : "=r" (rc) : "m" (res[0]) );
+        if ( rc != 0xffffffff )
+            goto fail;
+        printf("okay\n");
+    }
+    else
+        printf("skipped\n");
+
     printf("%-40s", "Testing movsd %xmm5,(%ecx)...");
     memset(res, 0x77, 64);
     memset(res + 10, 0x66, 8);
@@ -2186,6 +2345,71 @@ int main(int argc, char **argv)
     else
         printf("skipped\n");
 
+    printf("%-40s", "Testing vmovsd %xmm5,16(%ecx){%k3}...");
+    memset(res, 0x88, 128);
+    memset(res + 20, 0x77, 8);
+    if ( stack_exec && cpu_has_avx512f )
+    {
+        decl_insn(vmovsd_masked_to_mem);
+
+        asm volatile ( "vbroadcastsd %0, %%ymm5\n\t"
+                       "kxorw %%k3, %%k3, %%k3\n"
+                       put_insn(vmovsd_masked_to_mem,
+                                "vmovsd %%xmm5, 16(%1)%{%%k3%}")
+                       :: "m" (res[20]), "c" (NULL) );
+
+        set_insn(vmovsd_masked_to_mem);
+        regs.ecx = 0;
+        regs.edx = 0;
+        rc = x86_emulate(&ctxt, &emulops);
+        if ( (rc != X86EMUL_OKAY) || !check_eip(vmovsd_masked_to_mem) )
+            goto fail;
+
+        asm volatile ( "kmovw %0, %%k3\n" :: "m" (res[20]) );
+
+        set_insn(vmovsd_masked_to_mem);
+        regs.ecx = (unsigned long)res;
+        rc = x86_emulate(&ctxt, &emulops);
+        if ( (rc != X86EMUL_OKAY) || !check_eip(vmovsd_masked_to_mem) ||
+             memcmp(res, res + 16, 64) )
+            goto fail;
+
+        printf("okay\n");
+    }
+    else
+    {
+        printf("skipped\n");
+        memset(res + 4, 0x77, 8);
+    }
+
+    printf("%-40s", "Testing vmovaps (%edx),%zmm7{%k3}{z}...");
+    if ( stack_exec && cpu_has_avx512f )
+    {
+        decl_insn(vmovaps_masked_from_mem);
+
+        asm volatile ( "vpcmpeqd %%xmm7, %%xmm7, %%xmm7\n\t"
+                       "vbroadcastss %%xmm7, %%zmm7\n"
+                       put_insn(vmovaps_masked_from_mem,
+                                "vmovaps (%0), %%zmm7%{%%k3%}%{z%}")
+                       :: "d" (NULL) );
+
+        set_insn(vmovaps_masked_from_mem);
+        regs.edx = (unsigned long)res;
+        rc = x86_emulate(&ctxt, &emulops);
+        if ( rc != X86EMUL_OKAY || !check_eip(vmovaps_masked_from_mem) )
+            goto fail;
+        asm ( "vcmpeqps %1, %%zmm7, %%k0\n\t"
+              "vxorps %%xmm0, %%xmm0, %%xmm0\n\t"
+              "vcmpeqps %%zmm0, %%zmm7, %%k1\n\t"
+              "kxorw %%k1, %%k0, %%k0\n\t"
+              "kmovw %%k0, %0" : "=r" (rc) : "m" (res[16]) );
+        if ( rc != 0xffff )
+            goto fail;
+        printf("okay\n");
+    }
+    else
+        printf("skipped\n");
+
     printf("%-40s", "Testing movd %mm3,32(%ecx)...");
     if ( stack_exec && cpu_has_mmx )
     {
@@ -2341,6 +2565,55 @@ int main(int argc, char **argv)
     else
         printf("skipped\n");
 
+    printf("%-40s", "Testing {evex} vmovd %xmm3,32(%ecx)...");
+    if ( stack_exec && cpu_has_avx512f )
+    {
+        decl_insn(evex_vmovd_to_mem);
+
+        asm volatile ( "pcmpeqb %%xmm3, %%xmm3\n"
+                       put_insn(evex_vmovd_to_mem,
+                                "%{evex%} vmovd %%xmm3, 32(%0)")
+                       :: "c" (NULL) );
+
+        memset(res, 0xbd, 64);
+        set_insn(evex_vmovd_to_mem);
+        regs.ecx = (unsigned long)res;
+        regs.edx = 0;
+        rc = x86_emulate(&ctxt, &emulops);
+        if ( rc != X86EMUL_OKAY || !check_eip(evex_vmovd_to_mem) ||
+             res[8] + 1 ||
+             memcmp(res, res + 9, 28) ||
+             memcmp(res, res + 6, 8) )
+            goto fail;
+        printf("okay\n");
+    }
+    else
+        printf("skipped\n");
+
+    printf("%-40s", "Testing {evex} vmovd 32(%ecx),%xmm4...");
+    if ( stack_exec && cpu_has_avx512f )
+    {
+        decl_insn(evex_vmovd_from_mem);
+
+        asm volatile ( "pcmpeqb %%xmm4, %%xmm4\n"
+                       put_insn(evex_vmovd_from_mem,
+                                "%{evex%} vmovd 32(%0), %%xmm4")
+                       :: "c" (NULL) );
+
+        set_insn(evex_vmovd_from_mem);
+        rc = x86_emulate(&ctxt, &emulops);
+        if ( rc != X86EMUL_OKAY || !check_eip(evex_vmovd_from_mem) )
+            goto fail;
+        asm ( "vmovd %1, %%xmm0\n\t"
+              "vpcmpeqd %%zmm4, %%zmm0, %%k0\n\t"
+              "kmovw %%k0, %0" : "=r" (rc) : "m" (res[8]) );
+        if ( rc != 0xffff )
+            goto fail;
+        printf("okay\n");
+    }
+    else
+        printf("skipped\n");
+
     printf("%-40s", "Testing movd %mm3,%ebx...");
     if ( stack_exec && cpu_has_mmx )
     {
@@ -2507,6 +2780,57 @@ int main(int argc, char **argv)
     else
         printf("skipped\n");
 
+    printf("%-40s", "Testing {evex} vmovd %xmm2,%ebx...");
+    if ( stack_exec && cpu_has_avx512f )
+    {
+        decl_insn(evex_vmovd_to_reg);
+
+        /* See comment next to movd above. */
+        asm volatile ( "pcmpeqb %%xmm2, %%xmm2\n"
+                       put_insn(evex_vmovd_to_reg,
+                                "%{evex%} vmovd %%xmm2, %%ebx")
+                       :: );
+
+        set_insn(evex_vmovd_to_reg);
+#ifdef __x86_64__
+        regs.rbx = 0xbdbdbdbdbdbdbdbdUL;
+#else
+        regs.ebx = 0xbdbdbdbdUL;
+#endif
+        rc = x86_emulate(&ctxt, &emulops);
+        if ( (rc != X86EMUL_OKAY) || !check_eip(evex_vmovd_to_reg) ||
+             regs.ebx != 0xffffffff )
+            goto fail;
+        printf("okay\n");
+    }
+    else
+        printf("skipped\n");
+
+    printf("%-40s", "Testing {evex} vmovd %ebx,%xmm1...");
+    if ( stack_exec && cpu_has_avx512f )
+    {
+        decl_insn(evex_vmovd_from_reg);
+
+        /* See comment next to movd above. */
+        asm volatile ( "pcmpgtb %%xmm1, %%xmm1\n"
+                       put_insn(evex_vmovd_from_reg,
+                                "%{evex%} vmovd %%ebx, %%xmm1")
+                       :: );
+
+        set_insn(evex_vmovd_from_reg);
+        rc = x86_emulate(&ctxt, &emulops);
+        if ( (rc != X86EMUL_OKAY) || !check_eip(evex_vmovd_from_reg) )
+            goto fail;
+        asm ( "vmovd %1, %%xmm0\n\t"
+              "vpcmpeqd %%zmm1, %%zmm0, %%k0\n\t"
+              "kmovw %%k0, %0" : "=r" (rc) : "m" (res[8]) );
+        if ( rc != 0xffff )
+            goto fail;
+        printf("okay\n");
+    }
+    else
+        printf("skipped\n");
+
 #ifdef __x86_64__
     printf("%-40s", "Testing movq %mm3,32(%ecx)...");
     if ( stack_exec && cpu_has_mmx )
@@ -2584,6 +2908,36 @@ int main(int argc, char **argv)
     else
         printf("skipped\n");
 
+    printf("%-40s", "Testing {evex} vmovq %xmm11,32(%ecx)...");
+    if ( stack_exec && cpu_has_avx512f )
+    {
+        decl_insn(evex_vmovq_to_mem2);
+
+        asm volatile ( "pcmpeqb %%xmm11, %%xmm11\n"
+#if 0 /* This may not work, as the assembler might pick opcode D6. */
+                       put_insn(evex_vmovq_to_mem2,
+                                "{evex} vmovq %%xmm11, 32(%0)")
+#else
+                       put_insn(evex_vmovq_to_mem2,
+                                ".byte 0x62, 0xf1, 0xfd, 0x08, 0x7e, 0x49, 0x04")
+#endif
+                       :: "c" (NULL) );
+
+        memset(res, 0xbd, 64);
+        set_insn(evex_vmovq_to_mem2);
+        regs.ecx = (unsigned long)res;
+        regs.edx = 0;
+        rc = x86_emulate(&ctxt, &emulops);
+        if ( rc != X86EMUL_OKAY || !check_eip(evex_vmovq_to_mem2) ||
+             *((long *)res + 4) + 1 ||
+             memcmp(res, res + 10, 24) ||
+             memcmp(res, res + 6, 8) )
+            goto fail;
+        printf("okay\n");
+    }
+    else
+        printf("skipped\n");
+
     printf("%-40s", "Testing movq %mm3,%rbx...");
     if ( stack_exec && cpu_has_mmx )
     {
@@ -2643,6 +2997,28 @@ int main(int argc, char **argv)
     }
     else
         printf("skipped\n");
+
+    printf("%-40s", "Testing vmovq %xmm22,%rbx...");
+    if ( stack_exec && cpu_has_avx512f )
+    {
+        decl_insn(evex_vmovq_to_reg);
+
+        /* See comment next to movd above. */
+        asm volatile ( "pcmpeqq %%xmm2, %%xmm2\n\t"
+                       "vmovq %%xmm2, %%xmm22\n"
+                       put_insn(evex_vmovq_to_reg, "vmovq %%xmm22, %%rbx")
+                       :: );
+
+        set_insn(evex_vmovq_to_reg);
+        regs.rbx = 0xbdbdbdbdbdbdbdbdUL;
+        rc = x86_emulate(&ctxt, &emulops);
+        if ( rc != X86EMUL_OKAY || !check_eip(evex_vmovq_to_reg) ||
+             regs.rbx + 1 )
+            goto fail;
+        printf("okay\n");
+    }
+    else
+        printf("skipped\n");
 #endif
 
     printf("%-40s", "Testing maskmovq %mm4,%mm4...");
@@ -2812,6 +3188,32 @@ int main(int argc, char **argv)
             goto fail;
         printf("okay\n");
     }
+    else
+        printf("skipped\n");
+
+    printf("%-40s", "Testing vmovntdqa 64(%ecx),%zmm4...");
+    if ( stack_exec && cpu_has_avx512f )
+    {
+        decl_insn(evex_vmovntdqa);
+
+        asm volatile ( "vpxor %%xmm4, %%xmm4, %%xmm4\n"
+                       put_insn(evex_vmovntdqa, "vmovntdqa 64(%0), %%zmm4")
+                       :: "c" (NULL) );
+
+        set_insn(evex_vmovntdqa);
+        memset(res, 0x55, 192);
+        memset(res + 16, 0xff, 64);
+        regs.ecx = (unsigned long)res;
+        rc = x86_emulate(&ctxt, &emulops);
+        if ( rc != X86EMUL_OKAY || !check_eip(evex_vmovntdqa) )
+            goto fail;
+        asm ( "vpbroadcastd %1, %%zmm2\n\t"
+              "vpcmpeqd %%zmm4, %%zmm2, %%k0\n\t"
+              "kmovw %%k0, %0" : "=r" (rc) : "0" (~0) );
+        if ( rc != 0xffff )
+            goto fail;
+        printf("okay\n");
+    }
     else
         printf("skipped\n");
 
--- a/tools/tests/x86_emulator/x86-emulate.c
+++ b/tools/tests/x86_emulator/x86-emulate.c
@@ -210,6 +210,7 @@ int emul_test_get_fpu(
         if ( cpu_has_avx )
             break;
     case X86EMUL_FPU_opmask:
+    case X86EMUL_FPU_zmm:
         if ( cpu_has_avx512f )
             break;
     default:
--- a/tools/tests/x86_emulator/x86-emulate.h
+++ b/tools/tests/x86_emulator/x86-emulate.h
@@ -266,6 +266,16 @@ static inline uint64_t xgetbv(uint32_t x
     (res.b & (1U << 30)) != 0; \
 })
 
+#define cpu_has_avx512vl ({ \
+    struct cpuid_leaf res; \
+    emul_test_cpuid(1, 0, &res, NULL); \
+    if ( !(res.c & (1U << 27)) || ((xgetbv(0) & 0xe6) != 0xe6) ) \
+        res.b = 0; \
+    else \
+        emul_test_cpuid(7, 0, &res, NULL); \
+    (res.b & (1U << 31)) != 0; \
+})
+
 int emul_test_cpuid(
     uint32_t leaf,
     uint32_t subleaf,
--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -243,9 +243,25 @@ enum simd_opsize {
 };
 typedef uint8_t simd_opsize_t;
 
+enum disp8scale {
+    /* Values 0 ... 4 are explicit sizes. */
+    d8s_bw = 5,
+    d8s_dq,
+    /*
+     * All further values must strictly be last and in the order
+     * given so that arithmetic on the values works.
+     */
+    d8s_vl,
+    d8s_vl_by_2,
+    d8s_vl_by_4,
+    d8s_vl_by_8,
+};
+typedef uint8_t disp8scale_t;
+
 static const struct twobyte_table {
     opcode_desc_t desc;
-    simd_opsize_t size;
+    simd_opsize_t size:4;
+    disp8scale_t d8s:4;
 } twobyte_table[256] = {
     [0x00] = { ModRM },
     [0x01] = { ImplicitOps|ModRM },
@@ -260,8 +276,8 @@ static const struct twobyte_table {
     [0x0d] = { ImplicitOps|ModRM },
     [0x0e] = { ImplicitOps },
     [0x0f] = { ModRM|SrcImmByte },
-    [0x10] = { DstImplicit|SrcMem|ModRM|Mov, simd_any_fp },
-    [0x11] = { DstMem|SrcImplicit|ModRM|Mov, simd_any_fp },
+    [0x10] = { DstImplicit|SrcMem|ModRM|Mov, simd_any_fp, d8s_vl },
+    [0x11] = { DstMem|SrcImplicit|ModRM|Mov, simd_any_fp, d8s_vl },
     [0x12] = { DstImplicit|SrcMem|ModRM|Mov, simd_other },
     [0x13] = { DstMem|SrcImplicit|ModRM|Mov, simd_other },
     [0x14 ... 0x15] = { DstImplicit|SrcMem|ModRM, simd_packed_fp },
@@ -270,10 +286,10 @@ static const struct twobyte_table {
     [0x18 ... 0x1f] = { ImplicitOps|ModRM },
     [0x20 ... 0x21] = { DstMem|SrcImplicit|ModRM },
     [0x22 ... 0x23] = { DstImplicit|SrcMem|ModRM },
-    [0x28] = { DstImplicit|SrcMem|ModRM|Mov, simd_packed_fp },
-    [0x29] = { DstMem|SrcImplicit|ModRM|Mov, simd_packed_fp },
+    [0x28] = { DstImplicit|SrcMem|ModRM|Mov, simd_packed_fp, d8s_vl },
+    [0x29] = { DstMem|SrcImplicit|ModRM|Mov, simd_packed_fp, d8s_vl },
     [0x2a] = { DstImplicit|SrcMem|ModRM|Mov, simd_other },
-    [0x2b] = { DstMem|SrcImplicit|ModRM|Mov, simd_any_fp },
+    [0x2b] = { DstMem|SrcImplicit|ModRM|Mov, simd_any_fp, d8s_vl },
     [0x2c ... 0x2d] = { DstImplicit|SrcMem|ModRM|Mov, simd_other },
     [0x2e ... 0x2f] = { ImplicitOps|ModRM|TwoOp },
     [0x30 ... 0x35] = { ImplicitOps },
@@ -292,8 +308,8 @@ static const struct twobyte_table {
     [0x63 ... 0x67] = { DstImplicit|SrcMem|ModRM, simd_packed_int },
     [0x68 ... 0x6a] = { DstImplicit|SrcMem|ModRM, simd_other },
     [0x6b ... 0x6d] = { DstImplicit|SrcMem|ModRM, simd_packed_int },
-    [0x6e] = { DstImplicit|SrcMem|ModRM|Mov },
-    [0x6f] = { DstImplicit|SrcMem|ModRM|Mov, simd_packed_int },
+    [0x6e] = { DstImplicit|SrcMem|ModRM|Mov, simd_none, d8s_dq },
+    [0x6f] = { DstImplicit|SrcMem|ModRM|Mov, simd_packed_int, d8s_vl },
     [0x70] = { SrcImmByte|ModRM|TwoOp, simd_other },
     [0x71 ... 0x73] = { DstImplicit|SrcImmByte|ModRM },
     [0x74 ... 0x76] = { DstImplicit|SrcMem|ModRM, simd_packed_int },
@@ -301,8 +317,8 @@ static const struct twobyte_table {
     [0x78] = { ImplicitOps|ModRM },
     [0x79] = { DstReg|SrcMem|ModRM, simd_packed_int },
     [0x7c ... 0x7d] = { DstImplicit|SrcMem|ModRM, simd_other },
-    [0x7e] = { DstMem|SrcImplicit|ModRM|Mov },
-    [0x7f] = { DstMem|SrcImplicit|ModRM|Mov, simd_packed_int },
+    [0x7e] = { DstMem|SrcImplicit|ModRM|Mov, simd_none, d8s_dq },
+    [0x7f] = { DstMem|SrcImplicit|ModRM|Mov, simd_packed_int, d8s_vl },
     [0x80 ... 0x8f] = { DstImplicit|SrcImm },
     [0x90 ... 0x9f] = { ByteOp|DstMem|SrcNone|ModRM|Mov },
     [0xa0 ... 0xa1] = { ImplicitOps|Mov },
@@ -344,14 +360,14 @@ static const struct twobyte_table {
     [0xd0] = { DstImplicit|SrcMem|ModRM, simd_other },
     [0xd1 ... 0xd3] = { DstImplicit|SrcMem|ModRM, simd_other },
     [0xd4 ... 0xd5] = { DstImplicit|SrcMem|ModRM, simd_packed_int },
-    [0xd6] = { DstMem|SrcImplicit|ModRM|Mov, simd_other },
+    [0xd6] = { DstMem|SrcImplicit|ModRM|Mov, simd_other, 3 },
     [0xd7] = { DstReg|SrcImplicit|ModRM|Mov },
     [0xd8 ... 0xdf] = { DstImplicit|SrcMem|ModRM, simd_packed_int },
     [0xe0] = { DstImplicit|SrcMem|ModRM, simd_packed_int },
     [0xe1 ... 0xe2] = { DstImplicit|SrcMem|ModRM, simd_other },
     [0xe3 ... 0xe5] = { DstImplicit|SrcMem|ModRM, simd_packed_int },
     [0xe6] = { DstImplicit|SrcMem|ModRM|Mov, simd_other },
-    [0xe7] = { DstMem|SrcImplicit|ModRM|Mov, simd_packed_int },
+    [0xe7] = { DstMem|SrcImplicit|ModRM|Mov, simd_packed_int, d8s_vl },
     [0xe8 ... 0xef] = { DstImplicit|SrcMem|ModRM, simd_packed_int },
     [0xf0] = { DstImplicit|SrcMem|ModRM|Mov, simd_other },
     [0xf1 ... 0xf3] = { DstImplicit|SrcMem|ModRM, simd_other },
@@ -406,6 +422,7 @@ static const struct ext0f38_table {
     uint8_t to_mem:1;
     uint8_t two_op:1;
     uint8_t vsib:1;
+    disp8scale_t d8s:4;
 } ext0f38_table[256] = {
     [0x00 ... 0x0b] = { .simd_size = simd_packed_int },
     [0x0c ... 0x0f] = { .simd_size = simd_packed_fp },
@@ -418,7 +435,7 @@ static const struct ext0f38_table {
     [0x1c ... 0x1e] = { .simd_size = simd_packed_int, .two_op = 1 },
     [0x20 ... 0x25] = { .simd_size = simd_other, .two_op = 1 },
     [0x28 ... 0x29] = { .simd_size = simd_packed_int },
-    [0x2a] = { .simd_size = simd_packed_int, .two_op = 1 },
+    [0x2a] = { .simd_size = simd_packed_int, .two_op = 1, .d8s = d8s_vl },
     [0x2b] = { .simd_size = simd_packed_int },
     [0x2c ... 0x2d] = { .simd_size = simd_other },
     [0x2e ... 0x2f] = { .simd_size = simd_other, .to_mem = 1 },
@@ -656,6 +673,22 @@ union evex {
     };
 };
 
+#define EVEX_PFX_BYTES 4
+#define init_evex(stub) ({ \
+    uint8_t *buf_ = get_stub(stub); \
+    buf_[0] = 0x62; \
+    buf_ + EVEX_PFX_BYTES; \
+})
+
+#define copy_EVEX(ptr, evex) ({ \
+    if ( !mode_64bit() ) \
+        (evex).reg |= 8; \
+    (ptr)[1 - EVEX_PFX_BYTES] = (evex).raw[0]; \
+    (ptr)[2 - EVEX_PFX_BYTES] = (evex).raw[1]; \
+    (ptr)[3 - EVEX_PFX_BYTES] = (evex).raw[2]; \
+    container_of((ptr) + 1 - EVEX_PFX_BYTES, typeof(evex), raw[0]); \
+})
+
 #define rep_prefix()   (vex.pfx >= vex_f3)
 #define repe_prefix()  (vex.pfx == vex_f3)
 #define repne_prefix() (vex.pfx == vex_f2)
@@ -768,6 +801,7 @@ typedef union {
     uint64_t mmx;
     uint64_t __attribute__ ((aligned(16))) xmm[2];
     uint64_t __attribute__ ((aligned(32))) ymm[4];
+    uint64_t __attribute__ ((aligned(64))) zmm[8];
 } mmval_t;
 
 /*
@@ -1183,6 +1217,11 @@ static int _get_fpu(
 
     switch ( type )
     {
+    case X86EMUL_FPU_zmm:
+        if ( !(xcr0 & X86_XCR0_ZMM) || !(xcr0 & X86_XCR0_HI_ZMM) ||
+             !(xcr0 & X86_XCR0_OPMASK) )
+            return X86EMUL_UNHANDLEABLE;
+        /* fall through */
     case X86EMUL_FPU_ymm:
         if ( !(xcr0 & X86_XCR0_SSE) || !(xcr0 & X86_XCR0_YMM) )
             return X86EMUL_UNHANDLEABLE;
@@ -1777,6 +1816,7 @@ static bool vcpu_has(
 #define vcpu_has_clwb()        vcpu_has(         7, EBX, 24, ctxt, ops)
 #define vcpu_has_sha()         vcpu_has(         7, EBX, 29, ctxt, ops)
 #define vcpu_has_avx512bw()    vcpu_has(         7, EBX, 30, ctxt, ops)
+#define vcpu_has_avx512vl()    vcpu_has(         7, EBX, 31, ctxt, ops)
 #define vcpu_has_rdpid()       vcpu_has(         7, ECX, 22, ctxt, ops)
 #define vcpu_has_clzero()      vcpu_has(0x80000008, EBX,  0, ctxt, ops)
 
@@ -2150,6 +2190,62 @@ static unsigned long *decode_vex_gpr(
     return decode_gpr(regs, ~vex_reg & (mode_64bit() ? 0xf : 7));
 }
 
+static unsigned int decode_disp8scale(enum disp8scale scale,
+                                      const struct x86_emulate_state *state)
+{
+    switch ( scale )
+    {
+    case d8s_bw:
+        return state->evex.w;
+
+    default:
+        if ( scale < d8s_vl )
+            return scale;
+        if ( state->evex.br )
+        {
+    case d8s_dq:
+            return 2 + state->evex.w;
+        }
+        break;
+    }
+
+    switch ( state->simd_size )
+    {
+    case simd_any_fp:
+    case simd_single_fp:
+        if ( !(state->evex.pfx & VEX_PREFIX_SCALAR_MASK) )
+            break;
+        /* fall through */
+    case simd_scalar_opc:
+    case simd_scalar_vexw:
+        return 2 + state->evex.w;
+
+    case simd_128:
+        /* These should have an explicit size specified. */
+        ASSERT_UNREACHABLE();
+        return 4;
+
+    default:
+        break;
+    }
+
+    return 4 + state->evex.lr - (scale - d8s_vl);
+}
+
+#define avx512_vlen_check(lig) do { \
+    switch ( evex.lr ) \
+    { \
+    default: \
+        generate_exception(EXC_UD); \
+    case 2: \
+        break; \
+    case 0: case 1: \
+        if (!(lig)) \
+            host_and_vcpu_must_have(avx512vl); \
+        break; \
+    } \
+} while ( false )
+
 static bool is_aligned(enum x86_segment seg, unsigned long offs,
                        unsigned int size, struct x86_emulate_ctxt *ctxt,
                        const struct x86_emulate_ops *ops)
@@ -2399,6 +2495,7 @@ x86_decode_twobyte(
         if ( vex.pfx == vex_f3 ) /* movq xmm/m64,xmm */
         {
     case X86EMUL_OPC_VEX_F3(0, 0x7e): /* vmovq xmm/m64,xmm */
+    case X86EMUL_OPC_EVEX_F3(0, 0x7e): /* vmovq xmm/m64,xmm */
             state->desc = DstImplicit | SrcMem | TwoOp;
             state->simd_size = simd_other;
             /* Avoid the state->desc clobbering of TwoOp below. */
@@ -2834,6 +2931,8 @@ x86_decode(
 
     if ( d & ModRM )
     {
+        unsigned int disp8scale = 0;
+
         d &= ~ModRM;
 #undef ModRM /* Only its aliases are valid to use from here on. */
         modrm_reg = ((rex_prefix & 4) << 1) | ((modrm & 0x38) >> 3);
@@ -2876,6 +2975,9 @@ x86_decode(
             break;
 
         case ext_0f:
+            if ( evex_encoded() )
+                disp8scale = decode_disp8scale(twobyte_table[b].d8s, state);
+
             switch ( b )
             {
             case 0x20: /* mov cr,reg */
@@ -2900,6 +3002,8 @@ x86_decode(
             if ( ext0f38_table[b].vsib )
                 d |= vSIB;
             state->simd_size = ext0f38_table[b].simd_size;
+            if ( evex_encoded() )
+                disp8scale = decode_disp8scale(ext0f38_table[b].d8s, state);
             break;
 
         case ext_8f09:
@@ -2968,7 +3072,7 @@ x86_decode(
                     ea.mem.off = insn_fetch_type(int16_t);
                 break;
             case 1:
-                ea.mem.off += insn_fetch_type(int8_t);
+                ea.mem.off += insn_fetch_type(int8_t) << disp8scale;
                 break;
             case 2:
                 ea.mem.off += insn_fetch_type(int16_t);
@@ -3027,7 +3131,7 @@ x86_decode(
                 pc_rel = mode_64bit();
                 break;
             case 1:
-                ea.mem.off += insn_fetch_type(int8_t);
+                ea.mem.off += insn_fetch_type(int8_t) << disp8scale;
                 break;
             case 2:
                 ea.mem.off += insn_fetch_type(int32_t);
@@ -3228,10 +3332,11 @@ x86_emulate(
     struct x86_emulate_state state;
     int rc;
     uint8_t b, d, *opc = NULL;
-    unsigned int first_byte = 0, insn_bytes = 0;
+    unsigned int first_byte = 0, elem_bytes, insn_bytes = 0;
+    uint64_t op_mask = ~0ULL;
     bool singlestep = (_regs.eflags & X86_EFLAGS_TF) &&
 	    !is_branch_step(ctxt, ops);
-    bool sfence = false;
+    bool sfence = false, fault_suppression = false;
     struct operand src = { .reg = PTR_POISON };
     struct operand dst = { .reg = PTR_POISON };
     unsigned long cr4;
@@ -3272,6 +3377,7 @@ x86_emulate(
     b = ctxt->opcode;
     d = state.desc;
 #define state (&state)
+    elem_bytes = 4 << evex.w;
 
     generate_exception_if(state->not_64bit && mode_64bit(), EXC_UD);
 
@@ -3346,6 +3452,28 @@ x86_emulate(
         break;
     }
 
+    /* With a memory operand, fetch the mask register in use (if any). */
+    if ( ea.type == OP_MEM && evex.opmsk )
+    {
+        uint8_t *stb = get_stub(stub);
+
+        /* KMOV{W,Q} %k<n>, (%rax) */
+        stb[0] = 0xc4;
+        stb[1] = 0xe1;
+        stb[2] = cpu_has_avx512bw ? 0xf8 : 0x78;
+        stb[3] = 0x91;
+        stb[4] = evex.opmsk << 3;
+        insn_bytes = 5;
+        stb[5] = 0xc3;
+
+        invoke_stub("", "", "+m" (op_mask) : "a" (&op_mask));
+
+        insn_bytes = 0;
+        put_stub(stub);
+
+        fault_suppression = true;
+    }
+
     /* Decode (but don't fetch) the destination operand: register or memory. */
     switch ( d & DstMask )
     {
@@ -5708,6 +5836,41 @@ x86_emulate(
         insn_bytes = PFX_BYTES + 2;
         break;
 
+    CASE_SIMD_PACKED_FP(_EVEX, 0x0f, 0x2b): /* vmovntp{s,d} [xyz]mm,mem */
+        generate_exception_if(ea.type != OP_MEM || evex.opmsk, EXC_UD);
+        sfence = true;
+        fault_suppression = false;
+        /* fall through */
+    CASE_SIMD_PACKED_FP(_EVEX, 0x0f, 0x10): /* vmovup{s,d} [xyz]mm/mem,[xyz]mm{k} */
+    CASE_SIMD_SCALAR_FP(_EVEX, 0x0f, 0x10): /* vmovs{s,d} mem,xmm{k} */
+                                            /* vmovs{s,d} xmm,xmm,xmm{k} */
+    CASE_SIMD_PACKED_FP(_EVEX, 0x0f, 0x11): /* vmovup{s,d} [xyz]mm,[xyz]mm/mem{k} */
+    CASE_SIMD_SCALAR_FP(_EVEX, 0x0f, 0x11): /* vmovs{s,d} xmm,mem{k} */
+                                            /* vmovs{s,d} xmm,xmm,xmm{k} */
+    CASE_SIMD_PACKED_FP(_EVEX, 0x0f, 0x28): /* vmovap{s,d} [xyz]mm/mem,[xyz]mm{k} */
+    CASE_SIMD_PACKED_FP(_EVEX, 0x0f, 0x29): /* vmovap{s,d} [xyz]mm,[xyz]mm/mem{k} */
+        /* vmovs{s,d} to/from memory have only two operands. */
+        if ( (b & ~1) == 0x10 && ea.type == OP_MEM )
+            d |= TwoOp;
+        generate_exception_if(evex.br, EXC_UD);
+        generate_exception_if(evex.w != (evex.pfx & VEX_PREFIX_DOUBLE_MASK),
+                              EXC_UD);
+        host_and_vcpu_must_have(avx512f);
+        avx512_vlen_check(evex.pfx & VEX_PREFIX_SCALAR_MASK);
+    simd_zmm:
+        get_fpu(X86EMUL_FPU_zmm);
+        opc = init_evex(stub);
+        opc[0] = b;
+        opc[1] = modrm;
+        if ( ea.type == OP_MEM )
+        {
+            /* convert memory operand to (%rAX) */
+            evex.b = 1;
+            opc[1] &= 0x38;
+        }
+        insn_bytes = EVEX_PFX_BYTES + 2;
+        break;
+
     case X86EMUL_OPC_66(0x0f, 0x12):       /* movlpd m64,xmm */
     case X86EMUL_OPC_VEX_66(0x0f, 0x12):   /* vmovlpd m64,xmm,xmm */
     CASE_SIMD_PACKED_FP(, 0x0f, 0x13):     /* movlp{s,d} xmm,m64 */
@@ -6348,6 +6511,41 @@ x86_emulate(
         ASSERT(!state->simd_size);
         break;
 
+    case X86EMUL_OPC_EVEX_66(0x0f, 0x6e): /* vmov{d,q} r/m,xmm */
+    case X86EMUL_OPC_EVEX_66(0x0f, 0x7e): /* vmov{d,q} xmm,r/m */
+        generate_exception_if((evex.lr || evex.opmsk || evex.br ||
+                               evex.reg != 0xf || !evex.RX),
+                              EXC_UD);
+        host_and_vcpu_must_have(avx512f);
+        get_fpu(X86EMUL_FPU_zmm);
+
+        opc = init_evex(stub);
+        opc[0] = b;
+        /* Convert memory/GPR operand to (%rAX). */
+        evex.b = 1;
+        if ( !mode_64bit() )
+            evex.w = 0;
+        opc[1] = modrm & 0x38;
+        insn_bytes = EVEX_PFX_BYTES + 2;
+        opc[2] = 0xc3;
+
+        copy_EVEX(opc, evex);
+        invoke_stub("", "", "+m" (src.val) : "a" (&src.val));
+        dst.val = src.val;
+
+        put_stub(stub);
+        ASSERT(!state->simd_size);
+        break;
+
+    case X86EMUL_OPC_EVEX_F3(0x0f, 0x7e): /* vmovq xmm/m64,xmm */
+    case X86EMUL_OPC_EVEX_66(0x0f, 0xd6): /* vmovq xmm,xmm/m64 */
+        generate_exception_if(evex.lr || !evex.w || evex.opmsk || evex.br,
+                              EXC_UD);
+        host_and_vcpu_must_have(avx512f);
+        d |= TwoOp;
+        op_bytes = 8;
+        goto simd_zmm;
+
     case X86EMUL_OPC_66(0x0f, 0xe7):     /* movntdq xmm,m128 */
     case X86EMUL_OPC_VEX_66(0x0f, 0xe7): /* vmovntdq {x,y}mm,mem */
         generate_exception_if(ea.type != OP_MEM, EXC_UD);
@@ -6368,6 +6566,30 @@ x86_emulate(
             goto simd_0f_avx;
         goto simd_0f_sse2;
 
+    case X86EMUL_OPC_EVEX_66(0x0f, 0xe7): /* vmovntdq [xyz]mm,mem */
+        generate_exception_if(ea.type != OP_MEM || evex.opmsk || evex.w,
+                              EXC_UD);
+        sfence = true;
+        fault_suppression = false;
+        /* fall through */
+    case X86EMUL_OPC_EVEX_66(0x0f, 0x6f): /* vmovdqa{32,64} [xyz]mm/mem,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_F3(0x0f, 0x6f): /* vmovdqu{32,64} [xyz]mm/mem,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f, 0x7f): /* vmovdqa{32,64} [xyz]mm,[xyz]mm/mem{k} */
+    case X86EMUL_OPC_EVEX_F3(0x0f, 0x7f): /* vmovdqu{32,64} [xyz]mm,[xyz]mm/mem{k} */
+    vmovdqa:
+        generate_exception_if(evex.br, EXC_UD);
+        host_and_vcpu_must_have(avx512f);
+        avx512_vlen_check(false);
+        d |= TwoOp;
+        op_bytes = 16 << evex.lr;
+        goto simd_zmm;
+
+    case X86EMUL_OPC_EVEX_F2(0x0f, 0x6f): /* vmovdqu{8,16} [xyz]mm/mem,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_F2(0x0f, 0x7f): /* vmovdqu{8,16} [xyz]mm,[xyz]mm/mem{k} */
+        host_and_vcpu_must_have(avx512bw);
+        elem_bytes = 1 << evex.w;
+        goto vmovdqa;
+
     case X86EMUL_OPC_VEX_66(0x0f, 0xd6): /* vmovq xmm,xmm/m64 */
         generate_exception_if(vex.l, EXC_UD);
         d |= TwoOp;
@@ -7734,6 +7956,15 @@ x86_emulate(
         }
         goto movdqa;
 
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0x2a): /* vmovntdqa mem,[xyz]mm */
+        generate_exception_if(ea.type != OP_MEM || evex.opmsk || evex.w,
+                              EXC_UD);
+        /* Ignore the non-temporal hint for now, using vmovdqa32 instead. */
+        asm volatile ( "mfence" ::: "memory" );
+        b = 0x6f;
+        evex.opcx = vex_0f;
+        goto vmovdqa;
+
     case X86EMUL_OPC_VEX_66(0x0f38, 0x2c): /* vmaskmovps mem,{x,y}mm,{x,y}mm */
     case X86EMUL_OPC_VEX_66(0x0f38, 0x2d): /* vmaskmovpd mem,{x,y}mm,{x,y}mm */
     case X86EMUL_OPC_VEX_66(0x0f38, 0x2e): /* vmaskmovps {x,y}mm,{x,y}mm,mem */
@@ -8787,17 +9018,27 @@ x86_emulate(
     else if ( state->simd_size )
     {
         generate_exception_if(!op_bytes, EXC_UD);
-        generate_exception_if(vex.opcx && (d & TwoOp) && vex.reg != 0xf,
+        generate_exception_if((vex.opcx && (d & TwoOp) &&
+                               (vex.reg != 0xf || (evex_encoded() && !evex.RX))),
                               EXC_UD);
 
         if ( !opc )
             BUG();
-        opc[insn_bytes - PFX_BYTES] = 0xc3;
-        copy_REX_VEX(opc, rex_prefix, vex);
+        if ( evex_encoded() )
+        {
+            opc[insn_bytes - EVEX_PFX_BYTES] = 0xc3;
+            copy_EVEX(opc, evex);
+        }
+        else
+        {
+            opc[insn_bytes - PFX_BYTES] = 0xc3;
+            copy_REX_VEX(opc, rex_prefix, vex);
+        }
 
         if ( ea.type == OP_MEM )
         {
             uint32_t mxcsr = 0;
+            uint64_t full = 0;
 
             if ( op_bytes < 16 ||
                  (vex.opcx
@@ -8819,6 +9060,44 @@ x86_emulate(
                                   !is_aligned(ea.mem.seg, ea.mem.off, op_bytes,
                                               ctxt, ops),
                                   EXC_GP, 0);
+
+            if ( evex.br )
+            {
+                ASSERT((d & DstMask) != DstMem);
+                op_bytes = elem_bytes;
+            }
+            if ( evex.opmsk )
+            {
+                ASSERT(!(op_bytes % elem_bytes));
+                full = ~0ULL >> (64 - op_bytes / elem_bytes);
+                op_mask &= full;
+            }
+            if ( fault_suppression )
+            {
+                if ( !op_mask )
+                    goto simd_no_mem;
+                if ( !evex.br )
+                {
+                    first_byte = __builtin_ctzll(op_mask);
+                    op_mask >>= first_byte;
+                    full >>= first_byte;
+                    first_byte *= elem_bytes;
+                    op_bytes = (64 - __builtin_clzll(op_mask)) * elem_bytes;
+                }
+            }
+            /*
+             * Independent of fault suppression we may need to read (parts of)
+             * the memory operand for the purpose of merging without splitting
+             * the write below into multiple ones. Note that the EVEX.Z check
+             * here isn't strictly needed, due to there not currently being
+             * any instructions allowing zeroing-masking on memory writes (and
+             * we raise #UD during DstMem processing far above in this case),
+             * yet conceptually the read is then unnecessary.
+             */
+            if ( evex.opmsk && !evex.z && (d & DstMask) == DstMem &&
+                 op_mask != full )
+                d = (d & ~SrcMask) | SrcMem;
+
             switch ( d & SrcMask )
             {
             case SrcMem:
@@ -8865,7 +9144,10 @@ x86_emulate(
             }
         }
         else
+        {
+        simd_no_mem:
             dst.type = OP_NONE;
+        }
 
         /* {,v}maskmov{q,dqu}, as an exception, uses rDI. */
         if ( likely((ctxt->opcode & ~(X86EMUL_OPC_PFX_MASK |
--- a/xen/arch/x86/x86_emulate/x86_emulate.h
+++ b/xen/arch/x86/x86_emulate/x86_emulate.h
@@ -171,6 +171,7 @@ enum x86_emulate_fpu_type {
     X86EMUL_FPU_xmm, /* SSE instruction set (%xmm0-%xmm7/15) */
     X86EMUL_FPU_ymm, /* AVX/XOP instruction set (%ymm0-%ymm7/15) */
     X86EMUL_FPU_opmask, /* AVX512 opmask instruction set (%k0-%k7) */
+    X86EMUL_FPU_zmm, /* AVX512 instruction set (%zmm0-%zmm7/31) */
     /* This sentinel will never be passed to ->get_fpu(). */
     X86EMUL_FPU_none
 };
--- a/xen/include/asm-x86/cpufeature.h
+++ b/xen/include/asm-x86/cpufeature.h
@@ -105,6 +105,7 @@
 #define cpu_has_smap            boot_cpu_has(X86_FEATURE_SMAP)
 #define cpu_has_sha             boot_cpu_has(X86_FEATURE_SHA)
 #define cpu_has_avx512bw        boot_cpu_has(X86_FEATURE_AVX512BW)
+#define cpu_has_avx512vl        boot_cpu_has(X86_FEATURE_AVX512VL)
 
 /* CPUID level 0x80000007.edx */
 #define cpu_has_itsc            boot_cpu_has(X86_FEATURE_ITSC)




_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

^ permalink raw reply	[flat|nested] 465+ messages in thread

* [PATCH v3 06/34] x86emul: test for correct EVEX Disp8 scaling
  2018-09-18 11:46 ` [PATCH v3 00/34] x86emul: fixes, improvements, and beginnings of AVX512 support Jan Beulich
                     ` (4 preceding siblings ...)
  2018-09-18 11:56   ` [PATCH v3 05/34] x86emul: support basic AVX512 moves Jan Beulich
@ 2018-09-18 11:57   ` Jan Beulich
  2018-09-18 11:57   ` [PATCH v3 07/34] x86emul: use AVX512 logic for emulating V{, P}MASKMOV* Jan Beulich
                     ` (27 subsequent siblings)
  33 siblings, 0 replies; 465+ messages in thread
From: Jan Beulich @ 2018-09-18 11:57 UTC (permalink / raw)
  To: xen-devel; +Cc: George Dunlap, Andrew Cooper, Wei Liu

Besides the already existing tests (which are going to be extended once
the respective ISA extension support is complete), let's also ensure for
every individual insn that its Disp8 scaling (and memory access width)
is correct.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v3: New.

--- a/tools/tests/x86_emulator/Makefile
+++ b/tools/tests/x86_emulator/Makefile
@@ -139,7 +139,7 @@ $(addsuffix .h,$(SIMD) $(FMA) $(SG)): si
 
 xop.h: simd-fma.c
 
-$(TARGET): x86-emulate.o test_x86_emulator.o wrappers.o
+$(TARGET): x86-emulate.o test_x86_emulator.o evex-disp8.o wrappers.o
 	$(HOSTCC) $(HOSTCFLAGS) -o $@ $^
 
 .PHONY: clean
@@ -166,7 +166,7 @@ x86.h := $(addprefix $(XEN_ROOT)/tools/i
                      x86-vendors.h x86-defns.h msr-index.h)
 x86_emulate.h := x86-emulate.h x86_emulate/x86_emulate.h $(x86.h)
 
-x86-emulate.o test_x86_emulator.o wrappers.o: %.o: %.c $(x86_emulate.h)
+x86-emulate.o test_x86_emulator.o evex-disp8.o wrappers.o: %.o: %.c $(x86_emulate.h)
 	$(HOSTCC) $(HOSTCFLAGS) -c -g -o $@ $<
 
 x86-emulate.o: x86_emulate/x86_emulate.c
--- /dev/null
+++ b/tools/tests/x86_emulator/evex-disp8.c
@@ -0,0 +1,437 @@
+#include <stdarg.h>
+#include <stdio.h>
+
+#include "x86-emulate.h"
+
+struct test {
+    const char *mnemonic;
+    unsigned int opc:8;
+    unsigned int spc:2;
+    unsigned int pfx:2;
+    unsigned int vsz:3;
+    unsigned int esz:4;
+    unsigned int scale:1;
+    unsigned int ext:3;
+};
+
+enum spc {
+    SPC_invalid,
+    SPC_0f,
+    SPC_0f38,
+    SPC_0f3a,
+};
+
+enum pfx {
+    PFX_,
+    PFX_66,
+    PFX_f3,
+    PFX_f2
+};
+
+enum vl {
+    VL_128,
+    VL_256,
+    VL_512,
+};
+
+enum scale {
+    SC_vl,
+    SC_el,
+};
+
+enum vsz {
+    VSZ_vl,
+    VSZ_vl_2, /* VL / 2 */
+    VSZ_vl_4, /* VL / 4 */
+    VSZ_vl_8, /* VL / 8 */
+    /* "no broadcast" implied from here on. */
+    VSZ_el,
+    VSZ_el_2, /* EL * 2 */
+    VSZ_el_4, /* EL * 4 */
+    VSZ_el_8, /* EL * 8 */
+};
+
+enum esz {
+    ESZ_d,
+    ESZ_q,
+    ESZ_dq,
+    ESZ_sd,
+    ESZ_d_nb,
+    ESZ_q_nb,
+    /* "no broadcast" implied from here on. */
+    ESZ_b,
+    ESZ_w,
+    ESZ_bw,
+};
+
+#ifndef __i386__
+# define ESZ_dq64 ESZ_dq
+#else
+# define ESZ_dq64 ESZ_d
+#endif
+
+#define INSNX(m, p, sp, o, e, vs, es, sc) { \
+    .mnemonic = #m, .opc = 0x##o, .spc = SPC_##sp, .pfx = PFX_##p, \
+    .vsz = VSZ_##vs, .esz = ESZ_##es, .scale = SC_##sc, .ext = 0##e \
+}
+#define INSN(m, p, sp, o, vs, es, sc) INSNX(m, p, sp, o, 0, vs, es, sc)
+#define INSN_PFP(m, sp, o) \
+    INSN(m##pd, 66, sp, o, vl, q, vl), \
+    INSN(m##ps,   , sp, o, vl, d, vl)
+#define INSN_PFP_NB(m, sp, o) \
+    INSN(m##pd, 66, sp, o, vl, q_nb, vl), \
+    INSN(m##ps,   , sp, o, vl, d_nb, vl)
+#define INSN_SFP(m, sp, o) \
+    INSN(m##sd, f2, sp, o, el, q, el), \
+    INSN(m##ss, f3, sp, o, el, d, el)
+
+#define INSN_FP(m, sp, o) \
+    INSN_PFP(m, sp, o), \
+    INSN_SFP(m, sp, o)
+
+static const struct test avx512f_all[] = {
+    INSN_SFP(mov,            0f, 10),
+    INSN_SFP(mov,            0f, 11),
+    INSN_PFP_NB(mova,        0f, 28),
+    INSN_PFP_NB(mova,        0f, 29),
+    INSN(movdqa32,     66,   0f, 6f,    vl,   d_nb, vl),
+    INSN(movdqa32,     66,   0f, 7f,    vl,   d_nb, vl),
+    INSN(movdqa64,     66,   0f, 6f,    vl,   q_nb, vl),
+    INSN(movdqa64,     66,   0f, 7f,    vl,   q_nb, vl),
+    INSN(movdqu32,     f3,   0f, 6f,    vl,   d_nb, vl),
+    INSN(movdqu32,     f3,   0f, 7f,    vl,   d_nb, vl),
+    INSN(movdqu64,     f3,   0f, 6f,    vl,   q_nb, vl),
+    INSN(movdqu64,     f3,   0f, 7f,    vl,   q_nb, vl),
+    INSN(movntdq,      66,   0f, e7,    vl,   d_nb, vl),
+    INSN(movntdqa,     66, 0f38, 2a,    vl,   d_nb, vl),
+    INSN_PFP_NB(movnt,       0f, 2b),
+    INSN_PFP_NB(movu,        0f, 10),
+    INSN_PFP_NB(movu,        0f, 11),
+};
+
+static const struct test avx512f_128[] = {
+    INSN(mov,       66,   0f, 6e, el, dq64, el),
+    INSN(mov,       66,   0f, 7e, el, dq64, el),
+    INSN(movq,      f3,   0f, 7e, el,    q, el),
+    INSN(movq,      66,   0f, d6, el,    q, el),
+};
+
+static const struct test avx512bw_all[] = {
+    INSN(movdqu8,     f2,   0f, 6f,    vl,   b, vl),
+    INSN(movdqu8,     f2,   0f, 7f,    vl,   b, vl),
+    INSN(movdqu16,    f2,   0f, 6f,    vl,   w, vl),
+    INSN(movdqu16,    f2,   0f, 7f,    vl,   w, vl),
+};
+
+static const unsigned char vl_all[] = { VL_512, VL_128, VL_256 };
+static const unsigned char vl_128[] = { VL_128 };
+
+/*
+ * This table, indicating the presence of an immediate (byte) for an opcode
+ * space 0f major opcode, is indexed by high major opcode byte nibble, with
+ * each table element then bit-indexed by low major opcode byte nibble.
+ */
+static const uint16_t imm0f[16] = {
+    [0x7] = (1 << 0x0) /* vpshuf* */ |
+            (1 << 0x1) /* vps{ll,ra,rl}w */ |
+            (1 << 0x2) /* vps{l,r}ld, vp{rol,ror,sra}{d,q} */ |
+            (1 << 0x3) /* vps{l,r}l{,d}q */,
+    [0xc] = (1 << 0x2) /* vcmp{p,s}{d,s} */ |
+            (1 << 0x4) /* vpinsrw */ |
+            (1 << 0x5) /* vpextrw */ |
+            (1 << 0x6) /* vshufp{d,s} */,
+};
+
+static struct x86_emulate_ops emulops;
+
+static unsigned int accessed[3 * 64];
+
+static bool record_access(enum x86_segment seg, unsigned long offset,
+                          unsigned int bytes)
+{
+    while ( bytes-- )
+    {
+        if ( offset >= ARRAY_SIZE(accessed) )
+            return false;
+        ++accessed[offset++];
+    }
+
+    return true;
+}
+
+static int read(enum x86_segment seg, unsigned long offset, void *p_data,
+                unsigned int bytes, struct x86_emulate_ctxt *ctxt)
+{
+    if ( !record_access(seg, offset, bytes) )
+        return X86EMUL_UNHANDLEABLE;
+    memset(p_data, 0, bytes);
+    return X86EMUL_OKAY;
+}
+
+static int write(enum x86_segment seg, unsigned long offset, void *p_data,
+                 unsigned int bytes, struct x86_emulate_ctxt *ctxt)
+{
+    if ( !record_access(seg, offset, bytes) )
+        return X86EMUL_UNHANDLEABLE;
+    return X86EMUL_OKAY;
+}
+
+static void test_one(const struct test *test, enum vl vl,
+                     unsigned char *instr, struct x86_emulate_ctxt *ctxt)
+{
+    unsigned int vsz, esz, i;
+    int rc;
+    bool sg = strstr(test->mnemonic, "gather") ||
+              strstr(test->mnemonic, "scatter");
+    bool imm = test->spc == SPC_0f3a ||
+               (test->spc == SPC_0f &&
+                (imm0f[test->opc >> 4] & (1 << (test->opc & 0xf))));
+    union evex {
+        uint8_t raw[3];
+        struct {
+            uint8_t opcx:2;
+            uint8_t mbz:2;
+            uint8_t R:1;
+            uint8_t b:1;
+            uint8_t x:1;
+            uint8_t r:1;
+            uint8_t pfx:2;
+            uint8_t mbs:1;
+            uint8_t reg:4;
+            uint8_t w:1;
+            uint8_t opmsk:3;
+            uint8_t RX:1;
+            uint8_t bcst:1;
+            uint8_t lr:2;
+            uint8_t z:1;
+        };
+    } evex = {
+        .opcx = test->spc, .pfx = test->pfx, .lr = vl,
+        .R = 1, .b = 1, .x = 1, .r = 1, .mbs = 1,
+        .reg = 0xf, .RX = 1, .opmsk = sg,
+    };
+
+    switch ( test->esz )
+    {
+    case ESZ_b:
+        esz = 1;
+        break;
+
+    case ESZ_w:
+        esz = 2;
+        evex.w = 1;
+        break;
+
+    case ESZ_d: case ESZ_d_nb:
+        esz = 4;
+        break;
+
+    case ESZ_q: case ESZ_q_nb:
+        esz = 8;
+        evex.w = 1;
+        break;
+
+    default:
+        ASSERT_UNREACHABLE();
+    }
+
+    switch ( test->vsz )
+    {
+    case VSZ_vl:
+        vsz = 16 << vl;
+        break;
+
+    case VSZ_vl_2:
+        vsz = 8 << vl;
+        break;
+
+    case VSZ_vl_4:
+        vsz = 4 << vl;
+        break;
+
+    case VSZ_vl_8:
+        vsz = 2 << vl;
+        break;
+
+    case VSZ_el:
+        vsz = esz;
+        break;
+
+    case VSZ_el_2:
+        vsz = esz * 2;
+        break;
+
+    case VSZ_el_4:
+        vsz = esz * 4;
+        break;
+
+    case VSZ_el_8:
+        vsz = esz * 8;
+        break;
+
+    default:
+        ASSERT_UNREACHABLE();
+    }
+
+    /*
+     * Note: SIB addressing is used here, such that S/G insns can be handled
+     * without extra conditionals.
+     */
+    instr[0] = 0x62;
+    instr[1] = evex.raw[0];
+    instr[2] = evex.raw[1];
+    instr[3] = evex.raw[2];
+    instr[4] = test->opc;
+    instr[5] = 0x44 | (test->ext << 3); /* ModR/M */
+    instr[6] = 0x12; /* SIB: base rDX, index none / xMM4 */
+    instr[7] = 1; /* Disp8 */
+    instr[8] = 0; /* immediate, if any */
+
+    asm volatile ( "kxnorw %k1, %k1, %k1" );
+    asm volatile ( "vxorps %xmm4, %xmm4, %xmm4" );
+
+    ctxt->regs->eip = (unsigned long)&instr[0];
+    ctxt->regs->edx = 0;
+    memset(accessed, 0, sizeof(accessed));
+
+    rc = x86_emulate(ctxt, &emulops);
+    if ( rc != X86EMUL_OKAY ||
+         (ctxt->regs->eip != (unsigned long)&instr[8 + imm]) )
+        goto fail;
+
+    for ( i = 0; i < vsz; ++i )
+         if ( accessed[i] )
+             goto fail;
+    for ( ; i < (test->scale == SC_vl ? vsz : esz) + (sg ? esz : vsz); ++i )
+         if ( accessed[i] != (sg ? vsz / esz : 1) )
+             goto fail;
+    for ( ; i < ARRAY_SIZE(accessed); ++i )
+         if ( accessed[i] )
+             goto fail;
+
+    /* Also check the broadcast case, if available. */
+    if ( test->vsz >= VSZ_el || test->scale != SC_vl )
+        return;
+
+    switch ( test->esz )
+    {
+    case ESZ_d_nb: case ESZ_q_nb:
+    case ESZ_b: case ESZ_w: case ESZ_bw:
+        return;
+
+    case ESZ_d: case ESZ_q:
+        break;
+
+    default:
+        ASSERT_UNREACHABLE();
+    }
+
+    evex.bcst = 1;
+    instr[3] = evex.raw[2];
+
+    ctxt->regs->eip = (unsigned long)&instr[0];
+    memset(accessed, 0, sizeof(accessed));
+
+    rc = x86_emulate(ctxt, &emulops);
+    if ( rc != X86EMUL_OKAY ||
+         (ctxt->regs->eip != (unsigned long)&instr[8 + imm]) )
+        goto fail;
+
+    for ( i = 0; i < esz; ++i )
+         if ( accessed[i] )
+             goto fail;
+    for ( ; i < esz * 2; ++i )
+         if ( accessed[i] != 1 )
+             goto fail;
+    for ( ; i < ARRAY_SIZE(accessed); ++i )
+         if ( accessed[i] )
+             goto fail;
+
+    return;
+
+ fail:
+    printf("failed (v%s%s %u-bit)\n", test->mnemonic,
+           evex.bcst ? "/bcst" : "", 128 << vl);
+    exit(1);
+}
+
+static void test_pair(const struct test *tmpl, enum vl vl,
+                      enum esz esz1, const char *suffix1,
+                      enum esz esz2, const char *suffix2,
+                      unsigned char *instr, struct x86_emulate_ctxt *ctxt)
+{
+    struct test test = *tmpl;
+    char mnemonic[24];
+
+    test.esz = esz1;
+    snprintf(mnemonic, ARRAY_SIZE(mnemonic), "%s%s", tmpl->mnemonic, suffix1);
+    test.mnemonic = mnemonic;
+    test_one(&test, vl, instr, ctxt);
+
+    test.esz = esz2;
+    snprintf(mnemonic, ARRAY_SIZE(mnemonic), "%s%s", tmpl->mnemonic, suffix2);
+    test.mnemonic = mnemonic;
+    test_one(&test, vl, instr, ctxt);
+}
+
+static void test_group(const struct test tests[], unsigned int nr_test,
+                       const unsigned char vl[], unsigned int nr_vl,
+                       void *instr, struct x86_emulate_ctxt *ctxt)
+{
+    unsigned int i, j;
+
+    for ( i = 0; i < nr_test; ++i )
+    {
+        for ( j = 0; j < nr_vl; ++j )
+        {
+            if ( vl[0] == VL_512 && vl[j] != VL_512 && !cpu_has_avx512vl )
+                continue;
+
+            switch ( tests[i].esz )
+            {
+            default:
+                test_one(&tests[i], vl[j], instr, ctxt);
+                break;
+
+            case ESZ_bw:
+                test_pair(&tests[i], vl[j], ESZ_b, "b", ESZ_w, "w",
+                          instr, ctxt);
+                break;
+
+            case ESZ_dq:
+                test_pair(&tests[i], vl[j], ESZ_d, "d", ESZ_q, "q",
+                          instr, ctxt);
+                break;
+
+            case ESZ_sd:
+                test_pair(&tests[i], vl[j],
+                          ESZ_d, tests[i].vsz < VSZ_el ? "ps" : "ss",
+                          ESZ_q, tests[i].vsz < VSZ_el ? "pd" : "sd",
+                          instr, ctxt);
+                break;
+            }
+        }
+    }
+}
+
+void evex_disp8_test(void *instr, struct x86_emulate_ctxt *ctxt,
+                     const struct x86_emulate_ops *ops)
+{
+    emulops = *ops;
+    emulops.read = read;
+    emulops.write = write;
+
+#define RUN(feat, vl) do { \
+    if ( cpu_has_##feat ) \
+    { \
+        printf("%-40s", "Testing " #feat "/" #vl " disp8 handling..."); \
+        test_group(feat ## _ ## vl, ARRAY_SIZE(feat ## _ ## vl), \
+                   vl_ ## vl, ARRAY_SIZE(vl_ ## vl), instr, ctxt); \
+        printf("okay\n"); \
+    } \
+} while ( false )
+
+    RUN(avx512f, all);
+    RUN(avx512f, 128);
+    RUN(avx512bw, all);
+}
--- a/tools/tests/x86_emulator/test_x86_emulator.c
+++ b/tools/tests/x86_emulator/test_x86_emulator.c
@@ -3795,6 +3795,9 @@ int main(int argc, char **argv)
     else
         printf("skipped\n");
 
+    if ( stack_exec )
+        evex_disp8_test(instr, &ctxt, &emulops);
+
     for ( j = 0; j < ARRAY_SIZE(blobs); j++ )
     {
         if ( blobs[j].check_cpu && !blobs[j].check_cpu() )
--- a/tools/tests/x86_emulator/x86-emulate.h
+++ b/tools/tests/x86_emulator/x86-emulate.h
@@ -71,6 +71,9 @@ WRAP(puts);
 
 #include "x86_emulate/x86_emulate.h"
 
+void evex_disp8_test(void *instr, struct x86_emulate_ctxt *ctxt,
+                     const struct x86_emulate_ops *ops);
+
 static inline uint64_t xgetbv(uint32_t xcr)
 {
     uint32_t lo, hi;




_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

^ permalink raw reply	[flat|nested] 465+ messages in thread

* [PATCH v3 07/34] x86emul: use AVX512 logic for emulating V{, P}MASKMOV*
  2018-09-18 11:46 ` [PATCH v3 00/34] x86emul: fixes, improvements, and beginnings of AVX512 support Jan Beulich
                     ` (5 preceding siblings ...)
  2018-09-18 11:57   ` [PATCH v3 06/34] x86emul: test for correct EVEX Disp8 scaling Jan Beulich
@ 2018-09-18 11:57   ` Jan Beulich
  2018-09-18 11:58   ` [PATCH v3 08/34] x86emul: support AVX512F legacy-equivalent arithmetic FP insns Jan Beulich
                     ` (26 subsequent siblings)
  33 siblings, 0 replies; 465+ messages in thread
From: Jan Beulich @ 2018-09-18 11:57 UTC (permalink / raw)
  To: xen-devel; +Cc: George Dunlap, Andrew Cooper, Wei Liu

The more generic AVX512 implementation allows quite a bit of insn-
specific code to be dropped/shared.

Signed-off-by: Jan Beulich <jbeulich@suse.com>

--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -437,8 +437,8 @@ static const struct ext0f38_table {
     [0x28 ... 0x29] = { .simd_size = simd_packed_int },
     [0x2a] = { .simd_size = simd_packed_int, .two_op = 1, .d8s = d8s_vl },
     [0x2b] = { .simd_size = simd_packed_int },
-    [0x2c ... 0x2d] = { .simd_size = simd_other },
-    [0x2e ... 0x2f] = { .simd_size = simd_other, .to_mem = 1 },
+    [0x2c ... 0x2d] = { .simd_size = simd_packed_fp },
+    [0x2e ... 0x2f] = { .simd_size = simd_packed_fp, .to_mem = 1 },
     [0x30 ... 0x35] = { .simd_size = simd_other, .two_op = 1 },
     [0x36 ... 0x3f] = { .simd_size = simd_packed_int },
     [0x40] = { .simd_size = simd_packed_int },
@@ -447,8 +447,8 @@ static const struct ext0f38_table {
     [0x58 ... 0x59] = { .simd_size = simd_other, .two_op = 1 },
     [0x5a] = { .simd_size = simd_128, .two_op = 1 },
     [0x78 ... 0x79] = { .simd_size = simd_other, .two_op = 1 },
-    [0x8c] = { .simd_size = simd_other },
-    [0x8e] = { .simd_size = simd_other, .to_mem = 1 },
+    [0x8c] = { .simd_size = simd_packed_int },
+    [0x8e] = { .simd_size = simd_packed_int, .to_mem = 1 },
     [0x90 ... 0x93] = { .simd_size = simd_other, .vsib = 1 },
     [0x96 ... 0x98] = { .simd_size = simd_packed_fp },
     [0x99] = { .simd_size = simd_scalar_vexw },
@@ -7974,6 +7974,8 @@ x86_emulate(
 
         generate_exception_if(ea.type != OP_MEM || vex.w, EXC_UD);
         host_and_vcpu_must_have(avx);
+        elem_bytes = 4 << (b & 1);
+    vmaskmov:
         get_fpu(X86EMUL_FPU_ymm);
 
         /*
@@ -7988,7 +7990,7 @@ x86_emulate(
         opc = init_prefixes(stub);
         pvex = copy_VEX(opc, vex);
         pvex->opcx = vex_0f;
-        if ( !(b & 1) )
+        if ( elem_bytes == 4 )
             pvex->pfx = vex_none;
         opc[0] = 0x50; /* vmovmskp{s,d} */
         /* Use %rax as GPR destination and VEX.vvvv as source. */
@@ -8001,21 +8003,9 @@ x86_emulate(
         invoke_stub("", "", "=a" (ea.val) : [dummy] "i" (0));
         put_stub(stub);
 
-        if ( !ea.val )
-            goto complete_insn;
-
-        op_bytes = 4 << (b & 1);
-        first_byte = __builtin_ctz(ea.val);
-        ea.val >>= first_byte;
-        first_byte *= op_bytes;
-        op_bytes *= 32 - __builtin_clz(ea.val);
-
-        /*
-         * Even for the memory write variant a memory read is needed, unless
-         * all set mask bits are contiguous.
-         */
-        if ( ea.val & (ea.val + 1) )
-            d = (d & ~SrcMask) | SrcMem;
+        evex.opmsk = 1; /* fake */
+        op_mask = ea.val;
+        fault_suppression = true;
 
         opc = init_prefixes(stub);
         opc[0] = b;
@@ -8066,63 +8056,10 @@ x86_emulate(
 
     case X86EMUL_OPC_VEX_66(0x0f38, 0x8c): /* vpmaskmov{d,q} mem,{x,y}mm,{x,y}mm */
     case X86EMUL_OPC_VEX_66(0x0f38, 0x8e): /* vpmaskmov{d,q} {x,y}mm,{x,y}mm,mem */
-    {
-        typeof(vex) *pvex;
-        unsigned int mask = vex.w ? 0x80808080U : 0x88888888U;
-
         generate_exception_if(ea.type != OP_MEM, EXC_UD);
         host_and_vcpu_must_have(avx2);
-        get_fpu(X86EMUL_FPU_ymm);
-
-        /*
-         * While we can't reasonably provide fully correct behavior here
-         * (in particular, for writes, avoiding the memory read in anticipation
-         * of all elements in the range eventually being written), we can (and
-         * should) still limit the memory access to the smallest possible range
-         * (suppressing it altogether if all mask bits are clear), to provide
-         * correct faulting behavior. Read the mask bits via vmovmskp{s,d}
-         * for that purpose.
-         */
-        opc = init_prefixes(stub);
-        pvex = copy_VEX(opc, vex);
-        pvex->opcx = vex_0f;
-        opc[0] = 0xd7; /* vpmovmskb */
-        /* Use %rax as GPR destination and VEX.vvvv as source. */
-        pvex->r = 1;
-        pvex->b = !mode_64bit() || (vex.reg >> 3);
-        opc[1] = 0xc0 | (~vex.reg & 7);
-        pvex->reg = 0xf;
-        opc[2] = 0xc3;
-
-        invoke_stub("", "", "=a" (ea.val) : [dummy] "i" (0));
-        put_stub(stub);
-
-        /* Convert byte granular result to dword/qword granularity. */
-        ea.val &= mask;
-        if ( !ea.val )
-            goto complete_insn;
-
-        first_byte = __builtin_ctz(ea.val) & ~((4 << vex.w) - 1);
-        ea.val >>= first_byte;
-        op_bytes = 32 - __builtin_clz(ea.val);
-
-        /*
-         * Even for the memory write variant a memory read is needed, unless
-         * all set mask bits are contiguous.
-         */
-        if ( ea.val & (ea.val + ~mask + 1) )
-            d = (d & ~SrcMask) | SrcMem;
-
-        opc = init_prefixes(stub);
-        opc[0] = b;
-        /* Convert memory operand to (%rAX). */
-        rex_prefix &= ~REX_B;
-        vex.b = 1;
-        opc[1] = modrm & 0x38;
-        insn_bytes = PFX_BYTES + 2;
-
-        break;
-    }
+        elem_bytes = 4 << vex.w;
+        goto vmaskmov;
 
     case X86EMUL_OPC_VEX_66(0x0f38, 0x90): /* vpgatherd{d,q} {x,y}mm,mem,{x,y}mm */
     case X86EMUL_OPC_VEX_66(0x0f38, 0x91): /* vpgatherq{d,q} {x,y}mm,mem,{x,y}mm */




_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

^ permalink raw reply	[flat|nested] 465+ messages in thread

* [PATCH v3 08/34] x86emul: support AVX512F legacy-equivalent arithmetic FP insns
  2018-09-18 11:46 ` [PATCH v3 00/34] x86emul: fixes, improvements, and beginnings of AVX512 support Jan Beulich
                     ` (6 preceding siblings ...)
  2018-09-18 11:57   ` [PATCH v3 07/34] x86emul: use AVX512 logic for emulating V{, P}MASKMOV* Jan Beulich
@ 2018-09-18 11:58   ` Jan Beulich
  2018-09-18 11:59   ` [PATCH v3 09/34] x86emul: support AVX512DQ logic " Jan Beulich
                     ` (25 subsequent siblings)
  33 siblings, 0 replies; 465+ messages in thread
From: Jan Beulich @ 2018-09-18 11:58 UTC (permalink / raw)
  To: xen-devel; +Cc: George Dunlap, Andrew Cooper, Wei Liu

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v3: New.

--- a/tools/tests/x86_emulator/evex-disp8.c
+++ b/tools/tests/x86_emulator/evex-disp8.c
@@ -90,6 +90,10 @@ enum esz {
     INSN_SFP(m, sp, o)
 
 static const struct test avx512f_all[] = {
+    INSN_FP(add,             0f, 58),
+    INSN_FP(div,             0f, 5e),
+    INSN_FP(max,             0f, 5f),
+    INSN_FP(min,             0f, 5d),
     INSN_SFP(mov,            0f, 10),
     INSN_SFP(mov,            0f, 11),
     INSN_PFP_NB(mova,        0f, 28),
@@ -107,6 +111,9 @@ static const struct test avx512f_all[] =
     INSN_PFP_NB(movnt,       0f, 2b),
     INSN_PFP_NB(movu,        0f, 10),
     INSN_PFP_NB(movu,        0f, 11),
+    INSN_FP(mul,             0f, 59),
+    INSN_FP(sqrt,            0f, 51),
+    INSN_FP(sub,             0f, 5c),
 };
 
 static const struct test avx512f_128[] = {
--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -298,12 +298,12 @@ static const struct twobyte_table {
     [0x3a] = { DstReg|SrcImmByte|ModRM },
     [0x40 ... 0x4f] = { DstReg|SrcMem|ModRM|Mov },
     [0x50] = { DstReg|SrcImplicit|ModRM|Mov },
-    [0x51] = { DstImplicit|SrcMem|ModRM|TwoOp, simd_any_fp },
+    [0x51] = { DstImplicit|SrcMem|ModRM|TwoOp, simd_any_fp, d8s_vl },
     [0x52 ... 0x53] = { DstImplicit|SrcMem|ModRM|TwoOp, simd_single_fp },
     [0x54 ... 0x57] = { DstImplicit|SrcMem|ModRM, simd_packed_fp },
-    [0x58 ... 0x59] = { DstImplicit|SrcMem|ModRM, simd_any_fp },
+    [0x58 ... 0x59] = { DstImplicit|SrcMem|ModRM, simd_any_fp, d8s_vl },
     [0x5a ... 0x5b] = { DstImplicit|SrcMem|ModRM|Mov, simd_other },
-    [0x5c ... 0x5f] = { DstImplicit|SrcMem|ModRM, simd_any_fp },
+    [0x5c ... 0x5f] = { DstImplicit|SrcMem|ModRM, simd_any_fp, d8s_vl },
     [0x60 ... 0x62] = { DstImplicit|SrcMem|ModRM, simd_other },
     [0x63 ... 0x67] = { DstImplicit|SrcMem|ModRM, simd_packed_int },
     [0x68 ... 0x6a] = { DstImplicit|SrcMem|ModRM, simd_other },
@@ -5853,10 +5853,22 @@ x86_emulate(
         if ( (b & ~1) == 0x10 && ea.type == OP_MEM )
             d |= TwoOp;
         generate_exception_if(evex.br, EXC_UD);
-        generate_exception_if(evex.w != (evex.pfx & VEX_PREFIX_DOUBLE_MASK),
+        /* fall through */
+    CASE_SIMD_ALL_FP(_EVEX, 0x0f, 0x51):    /* vsqrtp{s,d} [xyz]mm/mem,[xyz]mm{k} */
+                                            /* vsqrts{s,d} xmm/m32,xmm,xmm{k} */
+    CASE_SIMD_ALL_FP(_EVEX, 0x0f, 0x58):    /* vadd{p,s}{s,d} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+    CASE_SIMD_ALL_FP(_EVEX, 0x0f, 0x59):    /* vmul{p,s}{s,d} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+    CASE_SIMD_ALL_FP(_EVEX, 0x0f, 0x5c):    /* vsub{p,s}{s,d} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+    CASE_SIMD_ALL_FP(_EVEX, 0x0f, 0x5d):    /* vmin{p,s}{s,d} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+    CASE_SIMD_ALL_FP(_EVEX, 0x0f, 0x5e):    /* vdiv{p,s}{s,d} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+    CASE_SIMD_ALL_FP(_EVEX, 0x0f, 0x5f):    /* vmax{p,s}{s,d} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+        generate_exception_if((evex.w != (evex.pfx & VEX_PREFIX_DOUBLE_MASK) ||
+                               (ea.type == OP_MEM && evex.br &&
+                                (evex.pfx & VEX_PREFIX_SCALAR_MASK))),
                               EXC_UD);
         host_and_vcpu_must_have(avx512f);
-        avx512_vlen_check(evex.pfx & VEX_PREFIX_SCALAR_MASK);
+        if ( ea.type == OP_MEM || !evex.br )
+            avx512_vlen_check(evex.pfx & VEX_PREFIX_SCALAR_MASK);
     simd_zmm:
         get_fpu(X86EMUL_FPU_zmm);
         opc = init_evex(stub);





_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

^ permalink raw reply	[flat|nested] 465+ messages in thread

* [PATCH v3 09/34] x86emul: support AVX512DQ logic FP insns
  2018-09-18 11:46 ` [PATCH v3 00/34] x86emul: fixes, improvements, and beginnings of AVX512 support Jan Beulich
                     ` (7 preceding siblings ...)
  2018-09-18 11:58   ` [PATCH v3 08/34] x86emul: support AVX512F legacy-equivalent arithmetic FP insns Jan Beulich
@ 2018-09-18 11:59   ` " Jan Beulich
  2018-09-18 11:59   ` [PATCH v3 10/34] x86emul: support AVX512F "normal" FP compare insns Jan Beulich
                     ` (24 subsequent siblings)
  33 siblings, 0 replies; 465+ messages in thread
From: Jan Beulich @ 2018-09-18 11:59 UTC (permalink / raw)
  To: xen-devel; +Cc: George Dunlap, Andrew Cooper, Wei Liu

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v3: New.

--- a/tools/tests/x86_emulator/evex-disp8.c
+++ b/tools/tests/x86_emulator/evex-disp8.c
@@ -130,6 +130,13 @@ static const struct test avx512bw_all[]
     INSN(movdqu16,    f2,   0f, 7f,    vl,   w, vl),
 };
 
+static const struct test avx512dq_all[] = {
+    INSN_PFP(and,              0f, 54),
+    INSN_PFP(andn,             0f, 55),
+    INSN_PFP(or,               0f, 56),
+    INSN_PFP(xor,              0f, 57),
+};
+
 static const unsigned char vl_all[] = { VL_512, VL_128, VL_256 };
 static const unsigned char vl_128[] = { VL_128 };
 
@@ -441,4 +448,5 @@ void evex_disp8_test(void *instr, struct
     RUN(avx512f, all);
     RUN(avx512f, 128);
     RUN(avx512bw, all);
+    RUN(avx512dq, all);
 }
--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -300,7 +300,7 @@ static const struct twobyte_table {
     [0x50] = { DstReg|SrcImplicit|ModRM|Mov },
     [0x51] = { DstImplicit|SrcMem|ModRM|TwoOp, simd_any_fp, d8s_vl },
     [0x52 ... 0x53] = { DstImplicit|SrcMem|ModRM|TwoOp, simd_single_fp },
-    [0x54 ... 0x57] = { DstImplicit|SrcMem|ModRM, simd_packed_fp },
+    [0x54 ... 0x57] = { DstImplicit|SrcMem|ModRM, simd_packed_fp, d8s_vl },
     [0x58 ... 0x59] = { DstImplicit|SrcMem|ModRM, simd_any_fp, d8s_vl },
     [0x5a ... 0x5b] = { DstImplicit|SrcMem|ModRM|Mov, simd_other },
     [0x5c ... 0x5f] = { DstImplicit|SrcMem|ModRM, simd_any_fp, d8s_vl },
@@ -6319,6 +6319,17 @@ x86_emulate(
         dst.bytes = 4;
         break;
 
+    CASE_SIMD_PACKED_FP(_EVEX, 0x0f, 0x54): /* vandp{s,d} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+    CASE_SIMD_PACKED_FP(_EVEX, 0x0f, 0x55): /* vandnp{s,d} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+    CASE_SIMD_PACKED_FP(_EVEX, 0x0f, 0x56): /* vorp{s,d} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+    CASE_SIMD_PACKED_FP(_EVEX, 0x0f, 0x57): /* vxorp{s,d} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+        generate_exception_if((evex.w != (evex.pfx & VEX_PREFIX_DOUBLE_MASK) ||
+                               (ea.type != OP_MEM && evex.br)),
+                              EXC_UD);
+        host_and_vcpu_must_have(avx512dq);
+        avx512_vlen_check(false);
+        goto simd_zmm;
+
     CASE_SIMD_ALL_FP(, 0x0f, 0x5a):        /* cvt{p,s}{s,d}2{p,s}{s,d} xmm/mem,xmm */
     CASE_SIMD_ALL_FP(_VEX, 0x0f, 0x5a):    /* vcvtp{s,d}2p{s,d} xmm/mem,xmm */
                                            /* vcvts{s,d}2s{s,d} xmm/mem,xmm,xmm */





_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

^ permalink raw reply	[flat|nested] 465+ messages in thread

* [PATCH v3 10/34] x86emul: support AVX512F "normal" FP compare insns
  2018-09-18 11:46 ` [PATCH v3 00/34] x86emul: fixes, improvements, and beginnings of AVX512 support Jan Beulich
                     ` (8 preceding siblings ...)
  2018-09-18 11:59   ` [PATCH v3 09/34] x86emul: support AVX512DQ logic " Jan Beulich
@ 2018-09-18 11:59   ` Jan Beulich
  2018-09-18 12:00   ` [PATCH v3 11/34] x86emul: support AVX512F misc legacy-equivalent FP insns Jan Beulich
                     ` (23 subsequent siblings)
  33 siblings, 0 replies; 465+ messages in thread
From: Jan Beulich @ 2018-09-18 11:59 UTC (permalink / raw)
  To: xen-devel; +Cc: George Dunlap, Andrew Cooper, Wei Liu

Also correct the AVX counterpart's comment.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v3: New.

--- a/tools/tests/x86_emulator/evex-disp8.c
+++ b/tools/tests/x86_emulator/evex-disp8.c
@@ -91,6 +91,7 @@ enum esz {
 
 static const struct test avx512f_all[] = {
     INSN_FP(add,             0f, 58),
+    INSN_FP(cmp,             0f, c2),
     INSN_FP(div,             0f, 5e),
     INSN_FP(max,             0f, 5f),
     INSN_FP(min,             0f, 5d),
--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -350,7 +350,7 @@ static const struct twobyte_table {
     [0xbf] = { DstReg|SrcMem16|ModRM|Mov },
     [0xc0] = { ByteOp|DstMem|SrcReg|ModRM },
     [0xc1] = { DstMem|SrcReg|ModRM },
-    [0xc2] = { DstImplicit|SrcImmByte|ModRM, simd_any_fp },
+    [0xc2] = { DstImplicit|SrcImmByte|ModRM, simd_any_fp, d8s_vl },
     [0xc3] = { DstMem|SrcReg|ModRM|Mov },
     [0xc4] = { DstReg|SrcImmByte|ModRM, simd_packed_int },
     [0xc5] = { DstReg|SrcImmByte|ModRM|Mov },
@@ -7428,7 +7428,7 @@ x86_emulate(
         goto add;
 
     CASE_SIMD_ALL_FP(, 0x0f, 0xc2):        /* cmp{p,s}{s,d} $imm8,xmm/mem,xmm */
-    CASE_SIMD_ALL_FP(_VEX, 0x0f, 0xc2):    /* vcmp{p,s}{s,d} $imm8,{x,y}mm/mem,{x,y}mm */
+    CASE_SIMD_ALL_FP(_VEX, 0x0f, 0xc2):    /* vcmp{p,s}{s,d} $imm8,{x,y}mm/mem,{x,y}mm,{x,y}mm */
     CASE_SIMD_PACKED_FP(, 0x0f, 0xc6):     /* shufp{s,d} $imm8,xmm/mem,xmm */
     CASE_SIMD_PACKED_FP(_VEX, 0x0f, 0xc6): /* vshufp{s,d} $imm8,{x,y}mm/mem,{x,y}mm */
         d = (d & ~SrcMask) | SrcMem;
@@ -7442,6 +7442,30 @@ x86_emulate(
         }
         goto simd_0f_imm8_avx;
 
+    CASE_SIMD_ALL_FP(_EVEX, 0x0f, 0xc2): /* vcmp{p,s}{s,d} $imm8,[xyz]mm/mem,[xyz]mm,k{k} */
+        generate_exception_if((evex.w != (evex.pfx & VEX_PREFIX_DOUBLE_MASK) ||
+                               (ea.type == OP_MEM && evex.br &&
+                                (evex.pfx & VEX_PREFIX_SCALAR_MASK)) ||
+                               !evex.r || !evex.R || evex.z),
+                              EXC_UD);
+        host_and_vcpu_must_have(avx512f);
+        if ( ea.type == OP_MEM || !evex.br )
+            avx512_vlen_check(evex.pfx & VEX_PREFIX_SCALAR_MASK);
+        d = (d & ~SrcMask) | SrcMem;
+        get_fpu(X86EMUL_FPU_zmm);
+        opc = init_evex(stub);
+        opc[0] = b;
+        opc[1] = modrm;
+        if ( ea.type == OP_MEM )
+        {
+            /* convert memory operand to (%rAX) */
+            evex.b = 1;
+            opc[1] &= 0x38;
+        }
+        opc[2] = imm1;
+        insn_bytes = EVEX_PFX_BYTES + 3;
+        break;
+
     case X86EMUL_OPC(0x0f, 0xc3): /* movnti */
         /* Ignore the non-temporal hint for now. */
         vcpu_must_have(sse2);





_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

^ permalink raw reply	[flat|nested] 465+ messages in thread

* [PATCH v3 11/34] x86emul: support AVX512F misc legacy-equivalent FP insns
  2018-09-18 11:46 ` [PATCH v3 00/34] x86emul: fixes, improvements, and beginnings of AVX512 support Jan Beulich
                     ` (9 preceding siblings ...)
  2018-09-18 11:59   ` [PATCH v3 10/34] x86emul: support AVX512F "normal" FP compare insns Jan Beulich
@ 2018-09-18 12:00   ` Jan Beulich
  2018-09-18 12:00   ` [PATCH v3 12/34] x86emul: support AVX512F fused-multiply-add insns Jan Beulich
                     ` (22 subsequent siblings)
  33 siblings, 0 replies; 465+ messages in thread
From: Jan Beulich @ 2018-09-18 12:00 UTC (permalink / raw)
  To: xen-devel; +Cc: George Dunlap, Andrew Cooper, Wei Liu

Also correct an AVX counterpart's comment.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v3: New.

--- a/tools/tests/x86_emulator/evex-disp8.c
+++ b/tools/tests/x86_emulator/evex-disp8.c
@@ -113,8 +113,11 @@ static const struct test avx512f_all[] =
     INSN_PFP_NB(movu,        0f, 10),
     INSN_PFP_NB(movu,        0f, 11),
     INSN_FP(mul,             0f, 59),
+    INSN_PFP(shuf,           0f, c6),
     INSN_FP(sqrt,            0f, 51),
     INSN_FP(sub,             0f, 5c),
+    INSN_PFP(unpckh,         0f, 15),
+    INSN_PFP(unpckl,         0f, 14),
 };
 
 static const struct test avx512f_128[] = {
--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -280,7 +280,7 @@ static const struct twobyte_table {
     [0x11] = { DstMem|SrcImplicit|ModRM|Mov, simd_any_fp, d8s_vl },
     [0x12] = { DstImplicit|SrcMem|ModRM|Mov, simd_other },
     [0x13] = { DstMem|SrcImplicit|ModRM|Mov, simd_other },
-    [0x14 ... 0x15] = { DstImplicit|SrcMem|ModRM, simd_packed_fp },
+    [0x14 ... 0x15] = { DstImplicit|SrcMem|ModRM, simd_packed_fp, d8s_vl },
     [0x16] = { DstImplicit|SrcMem|ModRM|Mov, simd_other },
     [0x17] = { DstMem|SrcImplicit|ModRM|Mov, simd_other },
     [0x18 ... 0x1f] = { ImplicitOps|ModRM },
@@ -354,7 +354,7 @@ static const struct twobyte_table {
     [0xc3] = { DstMem|SrcReg|ModRM|Mov },
     [0xc4] = { DstReg|SrcImmByte|ModRM, simd_packed_int },
     [0xc5] = { DstReg|SrcImmByte|ModRM|Mov },
-    [0xc6] = { DstImplicit|SrcImmByte|ModRM, simd_packed_fp },
+    [0xc6] = { DstImplicit|SrcImmByte|ModRM, simd_packed_fp, d8s_vl },
     [0xc7] = { ImplicitOps|ModRM },
     [0xc8 ... 0xcf] = { ImplicitOps },
     [0xd0] = { DstImplicit|SrcMem|ModRM, simd_other },
@@ -5922,6 +5922,17 @@ x86_emulate(
         host_and_vcpu_must_have(sse3);
         goto simd_0f_xmm;
 
+    CASE_SIMD_PACKED_FP(_EVEX, 0x0f, 0x14): /* vunpcklp{s,d} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+    CASE_SIMD_PACKED_FP(_EVEX, 0x0f, 0x15): /* vunpckhp{s,d} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+        generate_exception_if(evex.w != (evex.pfx & VEX_PREFIX_DOUBLE_MASK),
+                              EXC_UD);
+        fault_suppression = false;
+    avx512f_no_sae:
+        host_and_vcpu_must_have(avx512f);
+        generate_exception_if(ea.type != OP_MEM && evex.br, EXC_UD);
+        avx512_vlen_check(false);
+        goto simd_zmm;
+
     case X86EMUL_OPC(0x0f, 0x20): /* mov cr,reg */
     case X86EMUL_OPC(0x0f, 0x21): /* mov dr,reg */
     case X86EMUL_OPC(0x0f, 0x22): /* mov reg,cr */
@@ -6601,11 +6612,9 @@ x86_emulate(
     case X86EMUL_OPC_EVEX_F3(0x0f, 0x7f): /* vmovdqu{32,64} [xyz]mm,[xyz]mm/mem{k} */
     vmovdqa:
         generate_exception_if(evex.br, EXC_UD);
-        host_and_vcpu_must_have(avx512f);
-        avx512_vlen_check(false);
         d |= TwoOp;
         op_bytes = 16 << evex.lr;
-        goto simd_zmm;
+        goto avx512f_no_sae;
 
     case X86EMUL_OPC_EVEX_F2(0x0f, 0x6f): /* vmovdqu{8,16} [xyz]mm/mem,[xyz]mm{k} */
     case X86EMUL_OPC_EVEX_F2(0x0f, 0x7f): /* vmovdqu{8,16} [xyz]mm,[xyz]mm/mem{k} */
@@ -7430,7 +7439,7 @@ x86_emulate(
     CASE_SIMD_ALL_FP(, 0x0f, 0xc2):        /* cmp{p,s}{s,d} $imm8,xmm/mem,xmm */
     CASE_SIMD_ALL_FP(_VEX, 0x0f, 0xc2):    /* vcmp{p,s}{s,d} $imm8,{x,y}mm/mem,{x,y}mm,{x,y}mm */
     CASE_SIMD_PACKED_FP(, 0x0f, 0xc6):     /* shufp{s,d} $imm8,xmm/mem,xmm */
-    CASE_SIMD_PACKED_FP(_VEX, 0x0f, 0xc6): /* vshufp{s,d} $imm8,{x,y}mm/mem,{x,y}mm */
+    CASE_SIMD_PACKED_FP(_VEX, 0x0f, 0xc6): /* vshufp{s,d} $imm8,{x,y}mm/mem,{x,y}mm,{x,y}mm */
         d = (d & ~SrcMask) | SrcMem;
         if ( vex.opcx == vex_none )
         {
@@ -7451,7 +7460,9 @@ x86_emulate(
         host_and_vcpu_must_have(avx512f);
         if ( ea.type == OP_MEM || !evex.br )
             avx512_vlen_check(evex.pfx & VEX_PREFIX_SCALAR_MASK);
-        d = (d & ~SrcMask) | SrcMem;
+    simd_imm8_zmm:
+        if ( (d & SrcMask) == SrcImmByte )
+            d = (d & ~SrcMask) | SrcMem;
         get_fpu(X86EMUL_FPU_zmm);
         opc = init_evex(stub);
         opc[0] = b;
@@ -7495,6 +7506,15 @@ x86_emulate(
         insn_bytes = PFX_BYTES + 3;
         goto simd_0f_to_gpr;
 
+    CASE_SIMD_PACKED_FP(_EVEX, 0x0f, 0xc6): /* vshufp{s,d} $imm8,[xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+        fault_suppression = false;
+        generate_exception_if(evex.w != (evex.pfx & VEX_PREFIX_DOUBLE_MASK),
+                              EXC_UD);
+        host_and_vcpu_must_have(avx512f);
+        generate_exception_if(ea.type != OP_MEM && evex.br, EXC_UD);
+        avx512_vlen_check(false);
+        goto simd_imm8_zmm;
+
     case X86EMUL_OPC(0x0f, 0xc7): /* Grp9 */
     {
         union {




_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

^ permalink raw reply	[flat|nested] 465+ messages in thread

* [PATCH v3 12/34] x86emul: support AVX512F fused-multiply-add insns
  2018-09-18 11:46 ` [PATCH v3 00/34] x86emul: fixes, improvements, and beginnings of AVX512 support Jan Beulich
                     ` (10 preceding siblings ...)
  2018-09-18 12:00   ` [PATCH v3 11/34] x86emul: support AVX512F misc legacy-equivalent FP insns Jan Beulich
@ 2018-09-18 12:00   ` Jan Beulich
  2018-09-18 12:01   ` [PATCH v3 13/34] x86emul: support AVX512F legacy-equivalent logic insns Jan Beulich
                     ` (21 subsequent siblings)
  33 siblings, 0 replies; 465+ messages in thread
From: Jan Beulich @ 2018-09-18 12:00 UTC (permalink / raw)
  To: xen-devel; +Cc: George Dunlap, Andrew Cooper, Wei Liu

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v3: New.

--- a/tools/tests/x86_emulator/evex-disp8.c
+++ b/tools/tests/x86_emulator/evex-disp8.c
@@ -93,6 +93,36 @@ static const struct test avx512f_all[] =
     INSN_FP(add,             0f, 58),
     INSN_FP(cmp,             0f, c2),
     INSN_FP(div,             0f, 5e),
+    INSN(fmadd132,     66, 0f38, 98,    vl,     sd, vl),
+    INSN(fmadd132,     66, 0f38, 99,    el,     sd, el),
+    INSN(fmadd213,     66, 0f38, a8,    vl,     sd, vl),
+    INSN(fmadd213,     66, 0f38, a9,    el,     sd, el),
+    INSN(fmadd231,     66, 0f38, b8,    vl,     sd, vl),
+    INSN(fmadd231,     66, 0f38, b9,    el,     sd, el),
+    INSN(fmaddsub132,  66, 0f38, 96,    vl,     sd, vl),
+    INSN(fmaddsub213,  66, 0f38, a6,    vl,     sd, vl),
+    INSN(fmaddsub231,  66, 0f38, b6,    vl,     sd, vl),
+    INSN(fmsub132,     66, 0f38, 9a,    vl,     sd, vl),
+    INSN(fmsub132,     66, 0f38, 9b,    el,     sd, el),
+    INSN(fmsub213,     66, 0f38, aa,    vl,     sd, vl),
+    INSN(fmsub213,     66, 0f38, ab,    el,     sd, el),
+    INSN(fmsub231,     66, 0f38, ba,    vl,     sd, vl),
+    INSN(fmsub231,     66, 0f38, bb,    el,     sd, el),
+    INSN(fmsubadd132,  66, 0f38, 97,    vl,     sd, vl),
+    INSN(fmsubadd213,  66, 0f38, a7,    vl,     sd, vl),
+    INSN(fmsubadd231,  66, 0f38, b7,    vl,     sd, vl),
+    INSN(fnmadd132,    66, 0f38, 9c,    vl,     sd, vl),
+    INSN(fnmadd132,    66, 0f38, 9d,    el,     sd, el),
+    INSN(fnmadd213,    66, 0f38, ac,    vl,     sd, vl),
+    INSN(fnmadd213,    66, 0f38, ad,    el,     sd, el),
+    INSN(fnmadd231,    66, 0f38, bc,    vl,     sd, vl),
+    INSN(fnmadd231,    66, 0f38, bd,    el,     sd, el),
+    INSN(fnmsub132,    66, 0f38, 9e,    vl,     sd, vl),
+    INSN(fnmsub132,    66, 0f38, 9f,    el,     sd, el),
+    INSN(fnmsub213,    66, 0f38, ae,    vl,     sd, vl),
+    INSN(fnmsub213,    66, 0f38, af,    el,     sd, el),
+    INSN(fnmsub231,    66, 0f38, be,    vl,     sd, vl),
+    INSN(fnmsub231,    66, 0f38, bf,    el,     sd, el),
     INSN_FP(max,             0f, 5f),
     INSN_FP(min,             0f, 5d),
     INSN_SFP(mov,            0f, 10),
--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -450,30 +450,30 @@ static const struct ext0f38_table {
     [0x8c] = { .simd_size = simd_packed_int },
     [0x8e] = { .simd_size = simd_packed_int, .to_mem = 1 },
     [0x90 ... 0x93] = { .simd_size = simd_other, .vsib = 1 },
-    [0x96 ... 0x98] = { .simd_size = simd_packed_fp },
-    [0x99] = { .simd_size = simd_scalar_vexw },
-    [0x9a] = { .simd_size = simd_packed_fp },
-    [0x9b] = { .simd_size = simd_scalar_vexw },
-    [0x9c] = { .simd_size = simd_packed_fp },
-    [0x9d] = { .simd_size = simd_scalar_vexw },
-    [0x9e] = { .simd_size = simd_packed_fp },
-    [0x9f] = { .simd_size = simd_scalar_vexw },
-    [0xa6 ... 0xa8] = { .simd_size = simd_packed_fp },
-    [0xa9] = { .simd_size = simd_scalar_vexw },
-    [0xaa] = { .simd_size = simd_packed_fp },
-    [0xab] = { .simd_size = simd_scalar_vexw },
-    [0xac] = { .simd_size = simd_packed_fp },
-    [0xad] = { .simd_size = simd_scalar_vexw },
-    [0xae] = { .simd_size = simd_packed_fp },
-    [0xaf] = { .simd_size = simd_scalar_vexw },
-    [0xb6 ... 0xb8] = { .simd_size = simd_packed_fp },
-    [0xb9] = { .simd_size = simd_scalar_vexw },
-    [0xba] = { .simd_size = simd_packed_fp },
-    [0xbb] = { .simd_size = simd_scalar_vexw },
-    [0xbc] = { .simd_size = simd_packed_fp },
-    [0xbd] = { .simd_size = simd_scalar_vexw },
-    [0xbe] = { .simd_size = simd_packed_fp },
-    [0xbf] = { .simd_size = simd_scalar_vexw },
+    [0x96 ... 0x98] = { .simd_size = simd_packed_fp, .d8s = d8s_vl },
+    [0x99] = { .simd_size = simd_scalar_vexw, .d8s = d8s_dq },
+    [0x9a] = { .simd_size = simd_packed_fp, .d8s = d8s_vl },
+    [0x9b] = { .simd_size = simd_scalar_vexw, .d8s = d8s_dq },
+    [0x9c] = { .simd_size = simd_packed_fp, .d8s = d8s_vl },
+    [0x9d] = { .simd_size = simd_scalar_vexw, .d8s = d8s_dq },
+    [0x9e] = { .simd_size = simd_packed_fp, .d8s = d8s_vl },
+    [0x9f] = { .simd_size = simd_scalar_vexw, .d8s = d8s_dq },
+    [0xa6 ... 0xa8] = { .simd_size = simd_packed_fp, .d8s = d8s_vl },
+    [0xa9] = { .simd_size = simd_scalar_vexw, .d8s = d8s_dq },
+    [0xaa] = { .simd_size = simd_packed_fp, .d8s = d8s_vl },
+    [0xab] = { .simd_size = simd_scalar_vexw, .d8s = d8s_dq },
+    [0xac] = { .simd_size = simd_packed_fp, .d8s = d8s_vl },
+    [0xad] = { .simd_size = simd_scalar_vexw, .d8s = d8s_dq },
+    [0xae] = { .simd_size = simd_packed_fp, .d8s = d8s_vl },
+    [0xaf] = { .simd_size = simd_scalar_vexw, .d8s = d8s_dq },
+    [0xb6 ... 0xb8] = { .simd_size = simd_packed_fp, .d8s = d8s_vl },
+    [0xb9] = { .simd_size = simd_scalar_vexw, .d8s = d8s_dq },
+    [0xba] = { .simd_size = simd_packed_fp, .d8s = d8s_vl },
+    [0xbb] = { .simd_size = simd_scalar_vexw, .d8s = d8s_dq },
+    [0xbc] = { .simd_size = simd_packed_fp, .d8s = d8s_vl },
+    [0xbd] = { .simd_size = simd_scalar_vexw, .d8s = d8s_dq },
+    [0xbe] = { .simd_size = simd_packed_fp, .d8s = d8s_vl },
+    [0xbf] = { .simd_size = simd_scalar_vexw, .d8s = d8s_dq },
     [0xc8 ... 0xcd] = { .simd_size = simd_other },
     [0xdb] = { .simd_size = simd_packed_int, .two_op = 1 },
     [0xdc ... 0xdf] = { .simd_size = simd_packed_int },
@@ -8277,6 +8277,49 @@ x86_emulate(
         host_and_vcpu_must_have(fma);
         goto simd_0f_ymm;
 
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0x96): /* vfmaddsub132p{s,d} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0x97): /* vfmsubadd132p{s,d} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0x98): /* vfmadd132p{s,d} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0x9a): /* vfmsub132p{s,d} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0x9c): /* vfnmadd132p{s,d} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0x9e): /* vfnmsub132p{s,d} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0xa6): /* vfmaddsub213p{s,d} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0xa7): /* vfmsubadd213p{s,d} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0xa8): /* vfmadd213p{s,d} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0xaa): /* vfmsub213p{s,d} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0xac): /* vfnmadd213p{s,d} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0xae): /* vfnmsub213p{s,d} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0xb6): /* vfmaddsub231p{s,d} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0xb7): /* vfmsubadd231p{s,d} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0xb8): /* vfmadd231p{s,d} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0xba): /* vfmsub231p{s,d} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0xbc): /* vfnmadd231p{s,d} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0xbe): /* vfnmsub231p{s,d} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+        host_and_vcpu_must_have(avx512f);
+        if ( ea.type == OP_MEM || !evex.br )
+            avx512_vlen_check(false);
+        goto simd_zmm;
+
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0x99): /* vfmadd132s{s,d} xmm/mem,xmm,xmm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0x9b): /* vfmsub132s{s,d} xmm/mem,xmm,xmm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0x9d): /* vfnmadd132s{s,d} xmm/mem,xmm,xmm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0x9f): /* vfnmsub132s{s,d} xmm/mem,xmm,xmm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0xa9): /* vfmadd213s{s,d} xmm/mem,xmm,xmm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0xab): /* vfmsub213s{s,d} xmm/mem,xmm,xmm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0xad): /* vfnmadd213s{s,d} xmm/mem,xmm,xmm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0xaf): /* vfnmsub213s{s,d} xmm/mem,xmm,xmm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0xb9): /* vfmadd231s{s,d} xmm/mem,xmm,xmm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0xbb): /* vfmsub231s{s,d} xmm/mem,xmm,xmm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0xbd): /* vfnmadd231s{s,d} xmm/mem,xmm,xmm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0xbf): /* vfnmsub231s{s,d} xmm/mem,xmm,xmm{k} */
+        host_and_vcpu_must_have(avx512f);
+        if ( ea.type == OP_MEM )
+        {
+            generate_exception_if(evex.br, EXC_UD);
+            avx512_vlen_check(true);
+        }
+        goto simd_zmm;
+
     case X86EMUL_OPC(0x0f38, 0xc8):     /* sha1nexte xmm/m128,xmm */
     case X86EMUL_OPC(0x0f38, 0xc9):     /* sha1msg1 xmm/m128,xmm */
     case X86EMUL_OPC(0x0f38, 0xca):     /* sha1msg2 xmm/m128,xmm */




_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

^ permalink raw reply	[flat|nested] 465+ messages in thread

* [PATCH v3 13/34] x86emul: support AVX512F legacy-equivalent logic insns
  2018-09-18 11:46 ` [PATCH v3 00/34] x86emul: fixes, improvements, and beginnings of AVX512 support Jan Beulich
                     ` (11 preceding siblings ...)
  2018-09-18 12:00   ` [PATCH v3 12/34] x86emul: support AVX512F fused-multiply-add insns Jan Beulich
@ 2018-09-18 12:01   ` Jan Beulich
  2018-09-18 12:02   ` [PATCH v3 14/34] x86emul: support AVX512{F, DQ} FP broadcast insns Jan Beulich
                     ` (20 subsequent siblings)
  33 siblings, 0 replies; 465+ messages in thread
From: Jan Beulich @ 2018-09-18 12:01 UTC (permalink / raw)
  To: xen-devel; +Cc: George Dunlap, Andrew Cooper, Wei Liu

Also support vpternlog{d,q}, since it is used extensively by the compiler,
in order to facilitate enabling tests in the harness as soon as possible.
The twobyte_table[] entries for a few more insns also get their .d8s field
set right away, so that the groups need not be split and later re-combined.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v3: New.

--- a/tools/tests/x86_emulator/evex-disp8.c
+++ b/tools/tests/x86_emulator/evex-disp8.c
@@ -143,6 +143,11 @@ static const struct test avx512f_all[] =
     INSN_PFP_NB(movu,        0f, 10),
     INSN_PFP_NB(movu,        0f, 11),
     INSN_FP(mul,             0f, 59),
+    INSN(pand,         66,   0f, db,    vl,     dq, vl),
+    INSN(pandn,        66,   0f, df,    vl,     dq, vl),
+    INSN(por,          66,   0f, eb,    vl,     dq, vl),
+    INSN(pternlog,     66, 0f3a, 25,    vl,     dq, vl),
+    INSN(pxor,         66,   0f, ef,    vl,     dq, vl),
     INSN_PFP(shuf,           0f, c6),
     INSN_FP(sqrt,            0f, 51),
     INSN_FP(sub,             0f, 5c),
--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -362,13 +362,13 @@ static const struct twobyte_table {
     [0xd4 ... 0xd5] = { DstImplicit|SrcMem|ModRM, simd_packed_int },
     [0xd6] = { DstMem|SrcImplicit|ModRM|Mov, simd_other, 3 },
     [0xd7] = { DstReg|SrcImplicit|ModRM|Mov },
-    [0xd8 ... 0xdf] = { DstImplicit|SrcMem|ModRM, simd_packed_int },
+    [0xd8 ... 0xdf] = { DstImplicit|SrcMem|ModRM, simd_packed_int, d8s_vl },
     [0xe0] = { DstImplicit|SrcMem|ModRM, simd_packed_int },
     [0xe1 ... 0xe2] = { DstImplicit|SrcMem|ModRM, simd_other },
     [0xe3 ... 0xe5] = { DstImplicit|SrcMem|ModRM, simd_packed_int },
     [0xe6] = { DstImplicit|SrcMem|ModRM|Mov, simd_other },
     [0xe7] = { DstMem|SrcImplicit|ModRM|Mov, simd_packed_int, d8s_vl },
-    [0xe8 ... 0xef] = { DstImplicit|SrcMem|ModRM, simd_packed_int },
+    [0xe8 ... 0xef] = { DstImplicit|SrcMem|ModRM, simd_packed_int, d8s_vl },
     [0xf0] = { DstImplicit|SrcMem|ModRM|Mov, simd_other },
     [0xf1 ... 0xf3] = { DstImplicit|SrcMem|ModRM, simd_other },
     [0xf4 ... 0xf6] = { DstImplicit|SrcMem|ModRM, simd_packed_int },
@@ -491,6 +491,7 @@ static const struct ext0f3a_table {
     uint8_t to_mem:1;
     uint8_t two_op:1;
     uint8_t four_op:1;
+    disp8scale_t d8s:4;
 } ext0f3a_table[256] = {
     [0x00] = { .simd_size = simd_packed_int, .two_op = 1 },
     [0x01] = { .simd_size = simd_packed_fp, .two_op = 1 },
@@ -508,6 +509,7 @@ static const struct ext0f3a_table {
     [0x20] = { .simd_size = simd_none },
     [0x21] = { .simd_size = simd_other },
     [0x22] = { .simd_size = simd_none },
+    [0x25] = { .simd_size = simd_packed_int, .d8s = d8s_vl },
     [0x30 ... 0x33] = { .simd_size = simd_other, .two_op = 1 },
     [0x38] = { .simd_size = simd_128 },
     [0x39] = { .simd_size = simd_128, .to_mem = 1, .two_op = 1 },
@@ -3006,20 +3008,33 @@ x86_decode(
                 disp8scale = decode_disp8scale(ext0f38_table[b].d8s, state);
             break;
 
+        case ext_0f3a:
+            /*
+             * Cannot update d here yet, as the immediate operand still
+             * needs fetching.
+             */
+            state->simd_size = ext0f3a_table[b].simd_size;
+            if ( evex_encoded() )
+                disp8scale = decode_disp8scale(ext0f3a_table[b].d8s, state);
+            break;
+
         case ext_8f09:
             if ( ext8f09_table[b].two_op )
                 d |= TwoOp;
             state->simd_size = ext8f09_table[b].simd_size;
             break;
 
-        case ext_0f3a:
         case ext_8f08:
+        case ext_8f0a:
             /*
              * Cannot update d here yet, as the immediate operand still
              * needs fetching.
              */
-        default:
             break;
+
+        default:
+            ASSERT_UNREACHABLE();
+            return X86EMUL_UNIMPLEMENTED;
         }
 
         if ( modrm_mod == 3 )
@@ -3198,7 +3213,6 @@ x86_decode(
         else if ( ext0f3a_table[b].four_op && !mode_64bit() && vex.opcx )
             imm1 &= 0x7f;
         state->desc = d;
-        state->simd_size = ext0f3a_table[b].simd_size;
         rc = x86_decode_0f3a(state, ctxt, ops);
         break;
 
@@ -5927,6 +5941,11 @@ x86_emulate(
         generate_exception_if(evex.w != (evex.pfx & VEX_PREFIX_DOUBLE_MASK),
                               EXC_UD);
         fault_suppression = false;
+        /* fall through */
+    case X86EMUL_OPC_EVEX_66(0x0f, 0xdb): /* vpand{d,q} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f, 0xdf): /* vpandn{d,q} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f, 0xeb): /* vpor{d,q} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f, 0xef): /* vpxor{d,q} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
     avx512f_no_sae:
         host_and_vcpu_must_have(avx512f);
         generate_exception_if(ea.type != OP_MEM && evex.br, EXC_UD);
@@ -7510,6 +7529,8 @@ x86_emulate(
         fault_suppression = false;
         generate_exception_if(evex.w != (evex.pfx & VEX_PREFIX_DOUBLE_MASK),
                               EXC_UD);
+        /* fall through */
+    case X86EMUL_OPC_EVEX_66(0x0f3a, 0x25): /* vpternlog{d,q} $imm8,[xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
         host_and_vcpu_must_have(avx512f);
         generate_exception_if(ea.type != OP_MEM && evex.br, EXC_UD);
         avx512_vlen_check(false);




_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

^ permalink raw reply	[flat|nested] 465+ messages in thread

* [PATCH v3 14/34] x86emul: support AVX512{F, DQ} FP broadcast insns
  2018-09-18 11:46 ` [PATCH v3 00/34] x86emul: fixes, improvements, and beginnings of AVX512 support Jan Beulich
                     ` (12 preceding siblings ...)
  2018-09-18 12:01   ` [PATCH v3 13/34] x86emul: support AVX512F legacy-equivalent logic insns Jan Beulich
@ 2018-09-18 12:02   ` Jan Beulich
  2018-09-18 12:03   ` [PATCH v3 15/34] x86emul: support AVX512F v{, u}comis{d, s} insns Jan Beulich
                     ` (19 subsequent siblings)
  33 siblings, 0 replies; 465+ messages in thread
From: Jan Beulich @ 2018-09-18 12:02 UTC (permalink / raw)
  To: xen-devel; +Cc: George Dunlap, Andrew Cooper, Wei Liu

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v3: New.

--- a/tools/tests/x86_emulator/evex-disp8.c
+++ b/tools/tests/x86_emulator/evex-disp8.c
@@ -91,6 +91,7 @@ enum esz {
 
 static const struct test avx512f_all[] = {
     INSN_FP(add,             0f, 58),
+    INSN(broadcastss,  66, 0f38, 18,    el,      d, el),
     INSN_FP(cmp,             0f, c2),
     INSN_FP(div,             0f, 5e),
     INSN(fmadd132,     66, 0f38, 98,    vl,     sd, vl),
@@ -162,6 +163,15 @@ static const struct test avx512f_128[] =
     INSN(movq,      66,   0f, d6, el,    q, el),
 };
 
+static const struct test avx512f_no128[] = {
+    INSN(broadcastf32x4, 66, 0f38, 1a, el_4,  d, vl),
+    INSN(broadcastsd,    66, 0f38, 19, el,    q, el),
+};
+
+static const struct test avx512f_512[] = {
+    INSN(broadcastf64x4, 66, 0f38, 1b, el_4, q, vl),
+};
+
 static const struct test avx512bw_all[] = {
     INSN(movdqu8,     f2,   0f, 6f,    vl,   b, vl),
     INSN(movdqu8,     f2,   0f, 7f,    vl,   b, vl),
@@ -176,8 +186,19 @@ static const struct test avx512dq_all[]
     INSN_PFP(xor,              0f, 57),
 };
 
+static const struct test avx512dq_no128[] = {
+    INSN(broadcastf32x2, 66, 0f38, 19, el_2, d, vl),
+    INSN(broadcastf64x2, 66, 0f38, 1a, el_2, q, vl),
+};
+
+static const struct test avx512dq_512[] = {
+    INSN(broadcastf32x8, 66, 0f38, 1b, el_8, d, vl),
+};
+
 static const unsigned char vl_all[] = { VL_512, VL_128, VL_256 };
 static const unsigned char vl_128[] = { VL_128 };
+static const unsigned char vl_no128[] = { VL_512, VL_256 };
+static const unsigned char vl_512[] = { VL_512 };
 
 /*
  * This table, indicating the presence of an immediate (byte) for an opcode
@@ -486,6 +507,10 @@ void evex_disp8_test(void *instr, struct
 
     RUN(avx512f, all);
     RUN(avx512f, 128);
+    RUN(avx512f, no128);
+    RUN(avx512f, 512);
     RUN(avx512bw, all);
     RUN(avx512dq, all);
+    RUN(avx512dq, no128);
+    RUN(avx512dq, 512);
 }
--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -234,10 +234,16 @@ enum simd_opsize {
 
     /*
      * 128 bits of integer or floating point data, with no further
-     * formatting information.
+     * formatting information, or with it encoded by EVEX.W.
      */
     simd_128,
 
+    /*
+     * 256 bits of integer or floating point data, with formatting
+     * encoded by EVEX.W.
+     */
+    simd_256,
+
     /* Operand size encoded in non-standard way. */
     simd_other
 };
@@ -430,8 +436,10 @@ static const struct ext0f38_table {
     [0x13] = { .simd_size = simd_other, .two_op = 1 },
     [0x14 ... 0x16] = { .simd_size = simd_packed_fp },
     [0x17] = { .simd_size = simd_packed_int, .two_op = 1 },
-    [0x18 ... 0x19] = { .simd_size = simd_scalar_opc, .two_op = 1 },
-    [0x1a] = { .simd_size = simd_128, .two_op = 1 },
+    [0x18] = { .simd_size = simd_scalar_opc, .two_op = 1, .d8s = 2 },
+    [0x19] = { .simd_size = simd_scalar_opc, .two_op = 1, .d8s = 3 },
+    [0x1a] = { .simd_size = simd_128, .two_op = 1, .d8s = 4 },
+    [0x1b] = { .simd_size = simd_256, .two_op = 1, .d8s = d8s_vl_by_2 },
     [0x1c ... 0x1e] = { .simd_size = simd_packed_int, .two_op = 1 },
     [0x20 ... 0x25] = { .simd_size = simd_other, .two_op = 1 },
     [0x28 ... 0x29] = { .simd_size = simd_packed_int },
@@ -3320,6 +3328,10 @@ x86_decode(
         op_bytes = 16;
         break;
 
+    case simd_256:
+        op_bytes = 32;
+        break;
+
     default:
         op_bytes = 0;
         break;
@@ -7969,6 +7981,42 @@ x86_emulate(
         dst.type = OP_NONE;
         break;
 
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0x18): /* vbroadcastss xmm/m32,[xyz]mm{k} */
+        generate_exception_if(evex.w || evex.br, EXC_UD);
+    avx512_broadcast:
+        /*
+         * For the respective code below the main switch() to work we need to
+         * fold op_mask here: A source element gets read whenever any of its
+         * respective destination elements' mask bits is set.
+         */
+        if ( fault_suppression )
+        {
+            n = 1 << ((b & 3) - evex.w);
+            ASSERT(op_bytes == n * elem_bytes);
+            for ( i = n; i < (16 << evex.lr) / elem_bytes; i += n )
+                op_mask |= (op_mask >> i) & ((1 << n) - 1);
+        }
+        goto avx512f_no_sae;
+
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0x1b): /* vbroadcastf32x8 m256,zmm{k} */
+                                            /* vbroadcastf64x4 m256,zmm{k} */
+        generate_exception_if(ea.type != OP_MEM || evex.lr != 2, EXC_UD);
+        /* fall through */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0x19): /* vbroadcastsd xmm/m64,{y,z}mm{k} */
+                                            /* vbroadcastf32x2 xmm/m64,{y,z}mm{k} */
+        generate_exception_if(!evex.lr || evex.br, EXC_UD);
+        if ( !evex.w )
+            host_and_vcpu_must_have(avx512dq);
+        goto avx512_broadcast;
+
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0x1a): /* vbroadcastf32x4 m128,{y,z}mm{k} */
+                                            /* vbroadcastf64x2 m128,{y,z}mm{k} */
+        generate_exception_if(ea.type != OP_MEM || !evex.lr || evex.br,
+                              EXC_UD);
+        if ( evex.w )
+            host_and_vcpu_must_have(avx512dq);
+        goto avx512_broadcast;
+
     case X86EMUL_OPC_66(0x0f38, 0x20): /* pmovsxbw xmm/m64,xmm */
     case X86EMUL_OPC_66(0x0f38, 0x21): /* pmovsxbd xmm/m32,xmm */
     case X86EMUL_OPC_66(0x0f38, 0x22): /* pmovsxbq xmm/m16,xmm */




_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

^ permalink raw reply	[flat|nested] 465+ messages in thread

* [PATCH v3 15/34] x86emul: support AVX512F v{, u}comis{d, s} insns
  2018-09-18 11:46 ` [PATCH v3 00/34] x86emul: fixes, improvements, and beginnings of AVX512 support Jan Beulich
                     ` (13 preceding siblings ...)
  2018-09-18 12:02   ` [PATCH v3 14/34] x86emul: support AVX512{F, DQ} FP broadcast insns Jan Beulich
@ 2018-09-18 12:03   ` Jan Beulich
  2018-09-18 12:03   ` [PATCH v3 16/34] x86emul/test: introduce eq() Jan Beulich
                     ` (18 subsequent siblings)
  33 siblings, 0 replies; 465+ messages in thread
From: Jan Beulich @ 2018-09-18 12:03 UTC (permalink / raw)
  To: xen-devel; +Cc: George Dunlap, Andrew Cooper, Wei Liu

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v3: New.

--- a/tools/tests/x86_emulator/evex-disp8.c
+++ b/tools/tests/x86_emulator/evex-disp8.c
@@ -93,6 +93,8 @@ static const struct test avx512f_all[] =
     INSN_FP(add,             0f, 58),
     INSN(broadcastss,  66, 0f38, 18,    el,      d, el),
     INSN_FP(cmp,             0f, c2),
+    INSN(comisd,       66,   0f, 2f,    el,      q, el),
+    INSN(comiss,         ,   0f, 2f,    el,      d, el),
     INSN_FP(div,             0f, 5e),
     INSN(fmadd132,     66, 0f38, 98,    vl,     sd, vl),
     INSN(fmadd132,     66, 0f38, 99,    el,     sd, el),
@@ -152,6 +154,8 @@ static const struct test avx512f_all[] =
     INSN_PFP(shuf,           0f, c6),
     INSN_FP(sqrt,            0f, 51),
     INSN_FP(sub,             0f, 5c),
+    INSN(ucomisd,      66,   0f, 2e,    el,      q, el),
+    INSN(ucomiss,        ,   0f, 2e,    el,      d, el),
     INSN_PFP(unpckh,         0f, 15),
     INSN_PFP(unpckl,         0f, 14),
 };
--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -297,7 +297,7 @@ static const struct twobyte_table {
     [0x2a] = { DstImplicit|SrcMem|ModRM|Mov, simd_other },
     [0x2b] = { DstMem|SrcImplicit|ModRM|Mov, simd_any_fp, d8s_vl },
     [0x2c ... 0x2d] = { DstImplicit|SrcMem|ModRM|Mov, simd_other },
-    [0x2e ... 0x2f] = { ImplicitOps|ModRM|TwoOp },
+    [0x2e ... 0x2f] = { ImplicitOps|ModRM|TwoOp, simd_none, d8s_dq },
     [0x30 ... 0x35] = { ImplicitOps },
     [0x37] = { ImplicitOps },
     [0x38] = { DstReg|SrcMem|ModRM },
@@ -6101,24 +6101,34 @@ x86_emulate(
         }
 
         opc = init_prefixes(stub);
+        op_bytes = 4 << vex.pfx;
+    vcomi:
         opc[0] = b;
         opc[1] = modrm;
         if ( ea.type == OP_MEM )
         {
-            rc = ops->read(ea.mem.seg, ea.mem.off, mmvalp, vex.pfx ? 8 : 4,
-                           ctxt);
+            rc = ops->read(ea.mem.seg, ea.mem.off, mmvalp, op_bytes, ctxt);
             if ( rc != X86EMUL_OKAY )
                 goto done;
 
             /* Convert memory operand to (%rAX). */
             rex_prefix &= ~REX_B;
             vex.b = 1;
+            evex.b = 1;
             opc[1] &= 0x38;
         }
-        insn_bytes = PFX_BYTES + 2;
+        if ( evex_encoded() )
+        {
+            insn_bytes = EVEX_PFX_BYTES + 2;
+            copy_EVEX(opc, evex);
+        }
+        else
+        {
+            insn_bytes = PFX_BYTES + 2;
+            copy_REX_VEX(opc, rex_prefix, vex);
+        }
         opc[2] = 0xc3;
 
-        copy_REX_VEX(opc, rex_prefix, vex);
         invoke_stub(_PRE_EFLAGS("[eflags]", "[mask]", "[tmp]"),
                     _POST_EFLAGS("[eflags]", "[mask]", "[tmp]"),
                     [eflags] "+g" (_regs.eflags),
@@ -6129,6 +6139,19 @@ x86_emulate(
         ASSERT(!state->simd_size);
         break;
 
+    CASE_SIMD_PACKED_FP(_EVEX, 0x0f, 0x2e): /* vucomis{s,d} xmm/mem,xmm */
+    CASE_SIMD_PACKED_FP(_EVEX, 0x0f, 0x2f): /* vcomis{s,d} xmm/mem,xmm */
+        generate_exception_if((evex.reg != 0xf || !evex.RX || evex.opmsk ||
+                               (ea.type == OP_MEM && evex.br) ||
+                               evex.w != evex.pfx),
+                              EXC_UD);
+        host_and_vcpu_must_have(avx512f);
+        get_fpu(X86EMUL_FPU_zmm);
+
+        opc = init_evex(stub);
+        op_bytes = 4 << evex.w;
+        goto vcomi;
+
     case X86EMUL_OPC(0x0f, 0x30): /* wrmsr */
         generate_exception_if(!mode_ring0(), EXC_GP, 0);
         fail_if(ops->write_msr == NULL);





_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

^ permalink raw reply	[flat|nested] 465+ messages in thread

* [PATCH v3 16/34] x86emul/test: introduce eq()
  2018-09-18 11:46 ` [PATCH v3 00/34] x86emul: fixes, improvements, and beginnings of AVX512 support Jan Beulich
                     ` (14 preceding siblings ...)
  2018-09-18 12:03   ` [PATCH v3 15/34] x86emul: support AVX512F v{, u}comis{d, s} insns Jan Beulich
@ 2018-09-18 12:03   ` Jan Beulich
  2018-09-18 12:04   ` [PATCH v3 17/34] x86emul: support AVX512{F, BW} packed integer compare insns Jan Beulich
                     ` (17 subsequent siblings)
  33 siblings, 0 replies; 465+ messages in thread
From: Jan Beulich @ 2018-09-18 12:03 UTC (permalink / raw)
  To: xen-devel; +Cc: George Dunlap, Andrew Cooper, Wei Liu

In preparation for sensible to-boolean conversion on AVX512, wrap
another abstraction function around the present to_bool(<x> == <y>), to
get rid of the open-coded == (which would get in the way of using
built-in functions instead). For the future AVX512 use, scalar operands
can no longer be used: Use (vec_t){} when the operand is zero, and
broadcast (if available) otherwise (assume pre-AVX512 when broadcast
is not available, in which case a plain scalar is still fine).

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v3: New.

--- a/tools/tests/x86_emulator/simd.c
+++ b/tools/tests/x86_emulator/simd.c
@@ -46,6 +46,10 @@ static inline bool _to_bool(byte_vec_t b
 # define to_bool(cmp) _to_bool((byte_vec_t)(cmp))
 #endif
 
+#ifndef eq
+# define eq(x, y) to_bool((x) == (y))
+#endif
+
 #if VEC_SIZE == FLOAT_SIZE
 # define to_int(x) ((vec_t){ (int)(x)[0] })
 #elif VEC_SIZE == 8 && FLOAT_SIZE == 4 && defined(__3dNOW__)
@@ -605,18 +609,18 @@ int simd_test(void)
     touch(src);
     x = src;
     touch(x);
-    if ( !to_bool(x == src) ) return __LINE__;
+    if ( !eq(x, src) ) return __LINE__;
 
     touch(src);
     y = x + src;
     touch(src);
     touch(y);
-    if ( !to_bool(y == 2 * src) ) return __LINE__;
+    if ( !eq(y, 2 * src) ) return __LINE__;
 
     touch(src);
     z = y -= src;
     touch(z);
-    if ( !to_bool(x == z) ) return __LINE__;
+    if ( !eq(x, z) ) return __LINE__;
 
 #if defined(UINT_SIZE)
 
@@ -628,7 +632,7 @@ int simd_test(void)
     z ^= inv;
     touch(inv);
     touch(x);
-    if ( !to_bool((x & ~y) == z) ) return __LINE__;
+    if ( !eq(x & ~y, z) ) return __LINE__;
 
 #elif ELEM_SIZE > 1 || VEC_SIZE <= 8
 
@@ -639,7 +643,7 @@ int simd_test(void)
     z = src + inv;
     touch(inv);
     z *= (src - inv);
-    if ( !to_bool(x - y == z) ) return __LINE__;
+    if ( !eq(x - y, z) ) return __LINE__;
 
 #endif
 
@@ -648,10 +652,10 @@ int simd_test(void)
     x = src * alt;
     touch(alt);
     y = src / alt;
-    if ( !to_bool(x == y) ) return __LINE__;
+    if ( !eq(x, y) ) return __LINE__;
     touch(alt);
     touch(src);
-    if ( !to_bool(x * -alt == -src) ) return __LINE__;
+    if ( !eq(x * -alt, -src) ) return __LINE__;
 
 # if defined(recip) && defined(to_int)
 
@@ -659,16 +663,16 @@ int simd_test(void)
     x = recip(src);
     touch(src);
     touch(x);
-    if ( !to_bool(to_int(recip(x)) == src) ) return __LINE__;
+    if ( !eq(to_int(recip(x)), src) ) return __LINE__;
 
 #  ifdef rsqrt
     x = src * src;
     touch(x);
     y = rsqrt(x);
     touch(y);
-    if ( !to_bool(to_int(recip(y)) == src) ) return __LINE__;
+    if ( !eq(to_int(recip(y)), src) ) return __LINE__;
     touch(src);
-    if ( !to_bool(to_int(y) == to_int(recip(src))) ) return __LINE__;
+    if ( !eq(to_int(y), to_int(recip(src))) ) return __LINE__;
 #  endif
 
 # endif
@@ -676,7 +680,7 @@ int simd_test(void)
 # ifdef sqrt
     x = src * src;
     touch(x);
-    if ( !to_bool(sqrt(x) == src) ) return __LINE__;
+    if ( !eq(sqrt(x), src) ) return __LINE__;
 # endif
 
 # ifdef trunc
@@ -684,20 +688,20 @@ int simd_test(void)
     y = (vec_t){ 1 };
     touch(x);
     z = trunc(x);
-    if ( !to_bool(y == z) ) return __LINE__;
+    if ( !eq(y, z) ) return __LINE__;
 # endif
 
 # ifdef frac
     touch(src);
     x = frac(src);
     touch(src);
-    if ( !to_bool(x == 0) ) return __LINE__;
+    if ( !eq(x, (vec_t){}) ) return __LINE__;
 
     x = 1 / (src + 1);
     touch(x);
     y = frac(x);
     touch(x);
-    if ( !to_bool(x == y) ) return __LINE__;
+    if ( !eq(x, y) ) return __LINE__;
 # endif
 
 # if defined(trunc) && defined(frac)
@@ -707,7 +711,7 @@ int simd_test(void)
     touch(x);
     z = frac(x);
     touch(x);
-    if ( !to_bool(x == y + z) ) return __LINE__;
+    if ( !eq(x, y + z) ) return __LINE__;
 # endif
 
 #else
@@ -720,16 +724,16 @@ int simd_test(void)
     y[ELEM_COUNT - 1] = y[0] = j = ELEM_COUNT;
     for ( i = 1; i < ELEM_COUNT / 2; ++i )
         y[ELEM_COUNT - i - 1] = y[i] = y[i - 1] + (j -= 2);
-    if ( !to_bool(x == y) ) return __LINE__;
+    if ( !eq(x, y) ) return __LINE__;
 
 #  ifdef mul_hi
     touch(alt);
     x = mul_hi(src, alt);
     touch(alt);
 #   ifdef INT_SIZE
-    if ( !to_bool(x == (alt < 0)) ) return __LINE__;
+    if ( !eq(x, alt < 0) ) return __LINE__;
 #   else
-    if ( !to_bool(x == (src & alt) + alt) ) return __LINE__;
+    if ( !eq(x, (src & alt) + alt) ) return __LINE__;
 #   endif
 #  endif
 
@@ -745,7 +749,7 @@ int simd_test(void)
         z[i] = res;
         z[i + 1] = res >> (ELEM_SIZE << 3);
     }
-    if ( !to_bool(y == z) ) return __LINE__;
+    if ( !eq(y, z) ) return __LINE__;
 #  endif
 
     z = src;
@@ -757,12 +761,12 @@ int simd_test(void)
     touch(z);
     y = z << 2;
     touch(z);
-    if ( !to_bool(x == y + y) ) return __LINE__;
+    if ( !eq(x, y + y) ) return __LINE__;
 
     touch(x);
     z = x >> 2;
     touch(x);
-    if ( !to_bool(y == z + z) ) return __LINE__;
+    if ( !eq(y, z + z) ) return __LINE__;
 
     z = src;
 #  ifdef INT_SIZE
@@ -781,11 +785,11 @@ int simd_test(void)
     touch(j);
     y = z << j;
     touch(j);
-    if ( !to_bool(x == y + y) ) return __LINE__;
+    if ( !eq(x, y + y) ) return __LINE__;
 
     z = x >> j;
     touch(j);
-    if ( !to_bool(y == z + z) ) return __LINE__;
+    if ( !eq(y, z + z) ) return __LINE__;
 
 # endif
 
@@ -809,12 +813,12 @@ int simd_test(void)
     --sh;
     touch(sh);
     y = z << sh;
-    if ( !to_bool(x == y + y) ) return __LINE__;
+    if ( !eq(x, y + y) ) return __LINE__;
 
 #  if (defined(__AVX2__) && ELEM_SIZE >= 4) || defined(__XOP__)
     touch(sh);
     x = y >> sh;
-    if ( !to_bool(x == z) ) return __LINE__;
+    if ( !eq(x, z) ) return __LINE__;
 #  endif
 
 # endif
@@ -828,7 +832,7 @@ int simd_test(void)
     touch(inv);
     y = max(src, inv);
     touch(inv);
-    if ( !to_bool(x + y == src + inv) ) return __LINE__;
+    if ( !eq(x + y, src + inv) ) return __LINE__;
 # else
     x = src * alt;
     y = inv * alt;
@@ -837,33 +841,33 @@ int simd_test(void)
     touch(y);
     y = min(x, y);
     touch(y);
-    if ( !to_bool((y + z) * alt == src + inv) ) return __LINE__;
+    if ( !eq((y + z) * alt, src + inv) ) return __LINE__;
 # endif
 #endif
 
 #ifdef abs
     x = src * alt;
     touch(x);
-    if ( !to_bool(abs(x) == src) ) return __LINE__;
+    if ( !eq(abs(x), src) ) return __LINE__;
 #endif
 
 #ifdef copysignz
     touch(alt);
-    if ( !to_bool(copysignz((vec_t){} + 1, alt) == alt) ) return __LINE__;
+    if ( !eq(copysignz((vec_t){} + 1, alt), alt) ) return __LINE__;
 #endif
 
 #ifdef swap
     touch(src);
-    if ( !to_bool(swap(src) == inv) ) return __LINE__;
+    if ( !eq(swap(src), inv) ) return __LINE__;
 #endif
 
 #ifdef swap2
     touch(src);
-    if ( !to_bool(swap2(src) == inv) ) return __LINE__;
+    if ( !eq(swap2(src), inv) ) return __LINE__;
 #endif
 
 #if defined(broadcast)
-    if ( !to_bool(broadcast(ELEM_COUNT + 1) == src + inv) ) return __LINE__;
+    if ( !eq(broadcast(ELEM_COUNT + 1), src + inv) ) return __LINE__;
 #endif
 
 #if defined(interleave_lo) && defined(interleave_hi)
@@ -877,7 +881,11 @@ int simd_test(void)
 # else
     z = (x - y) * alt;
 # endif
-    if ( !to_bool(z == ELEM_COUNT / 2) ) return __LINE__;
+# ifdef broadcast
+    if ( !eq(z, broadcast(ELEM_COUNT / 2)) ) return __LINE__;
+# else
+    if ( !eq(z, ELEM_COUNT / 2) ) return __LINE__;
+# endif
 #endif
 
 #if defined(INT_SIZE) && defined(widen1) && defined(interleave_lo)
@@ -887,7 +895,7 @@ int simd_test(void)
     touch(x);
     z = widen1(x);
     touch(x);
-    if ( !to_bool(z == y) ) return __LINE__;
+    if ( !eq(z, y) ) return __LINE__;
 
 # ifdef widen2
     y = interleave_lo(alt < 0, alt < 0);
@@ -895,7 +903,7 @@ int simd_test(void)
     touch(x);
     z = widen2(x);
     touch(x);
-    if ( !to_bool(z == y) ) return __LINE__;
+    if ( !eq(z, y) ) return __LINE__;
 
 #  ifdef widen3
     y = interleave_lo(alt < 0, alt < 0);
@@ -904,7 +912,7 @@ int simd_test(void)
     touch(x);
     z = widen3(x);
     touch(x);
-    if ( !to_bool(z == y) ) return __LINE__;
+    if ( !eq(z, y) ) return __LINE__;
 #  endif
 # endif
 
@@ -919,21 +927,21 @@ int simd_test(void)
     touch(src);
     x = widen1(src);
     touch(src);
-    if ( !to_bool(x == y) ) return __LINE__;
+    if ( !eq(x, y) ) return __LINE__;
 # endif
 
 # ifdef widen2
     touch(src);
     x = widen2(src);
     touch(src);
-    if ( !to_bool(x == z) ) return __LINE__;
+    if ( !eq(x, z) ) return __LINE__;
 # endif
 
 # ifdef widen3
     touch(src);
     x = widen3(src);
     touch(src);
-    if ( !to_bool(x == interleave_lo(z, (vec_t){})) ) return __LINE__;
+    if ( !eq(x, interleave_lo(z, (vec_t){})) ) return __LINE__;
 # endif
 
 #endif
@@ -942,14 +950,14 @@ int simd_test(void)
     touch(src);
     x = dup_lo(src);
     touch(src);
-    if ( !to_bool(x - src == (alt - 1) / 2) ) return __LINE__;
+    if ( !eq(x - src, (alt - 1) / 2) ) return __LINE__;
 #endif
 
 #ifdef dup_hi
     touch(src);
     x = dup_hi(src);
     touch(src);
-    if ( !to_bool(x - src == (alt + 1) / 2) ) return __LINE__;
+    if ( !eq(x - src, (alt + 1) / 2) ) return __LINE__;
 #endif
 
     for ( i = 0; i < ELEM_COUNT; ++i )
@@ -961,7 +969,7 @@ int simd_test(void)
 # else
     select(&z, src, inv, alt > 0);
 # endif
-    if ( !to_bool(z == y) ) return __LINE__;
+    if ( !eq(z, y) ) return __LINE__;
 #endif
 
 #ifdef select2
@@ -970,14 +978,14 @@ int simd_test(void)
 # else
     select2(&z, src, inv, alt > 0);
 # endif
-    if ( !to_bool(z == y) ) return __LINE__;
+    if ( !eq(z, y) ) return __LINE__;
 #endif
 
 #ifdef mix
     touch(src);
     touch(inv);
     x = mix(src, inv);
-    if ( !to_bool(x == y) ) return __LINE__;
+    if ( !eq(x, y) ) return __LINE__;
 
 # ifdef addsub
     touch(src);
@@ -986,22 +994,22 @@ int simd_test(void)
     touch(src);
     touch(inv);
     y = mix(src - inv, src + inv);
-    if ( !to_bool(x == y) ) return __LINE__;
+    if ( !eq(x, y) ) return __LINE__;
 # endif
 #endif
 
 #ifdef rotr
     x = rotr(src, 1);
     y = (src & (ELEM_COUNT - 1)) + 1;
-    if ( !to_bool(x == y) ) return __LINE__;
+    if ( !eq(x, y) ) return __LINE__;
 #endif
 
 #ifdef dot_product
     touch(src);
     touch(inv);
     x = dot_product(src, inv);
-    if ( !to_bool(x == (vec_t){ (ELEM_COUNT * (ELEM_COUNT + 1) *
-                                 (ELEM_COUNT + 2)) / 6 }) ) return __LINE__;
+    if ( !eq(x, (vec_t){ (ELEM_COUNT * (ELEM_COUNT + 1) *
+                          (ELEM_COUNT + 2)) / 6 }) ) return __LINE__;
 #endif
 
 #ifdef hadd
@@ -1022,7 +1030,7 @@ int simd_test(void)
     x = hsub(src, inv);
     for ( i = ELEM_COUNT; i >>= 1; )
         x = hadd(x, (vec_t){});
-    if ( !to_bool(x == 0) ) return __LINE__;
+    if ( !eq(x, (vec_t){}) ) return __LINE__;
 # endif
 #endif
 
--- a/tools/tests/x86_emulator/simd-fma.c
+++ b/tools/tests/x86_emulator/simd-fma.c
@@ -20,6 +20,10 @@ ENTRY(fma_test);
 # endif
 #endif
 
+#ifndef eq
+# define eq(x, y) to_bool((x) == (y))
+#endif
+
 #if VEC_SIZE == 16
 # if FLOAT_SIZE == 4
 #  define addsub(x, y) __builtin_ia32_addsubps(x, y)
@@ -62,38 +66,38 @@ int fma_test(void)
     y = (src - one) * inv;
     touch(src);
     z = inv * src + inv;
-    if ( !to_bool(x == z) ) return __LINE__;
+    if ( !eq(x, z) ) return __LINE__;
 
     touch(src);
     z = -inv * src - inv;
-    if ( !to_bool(-x == z) ) return __LINE__;
+    if ( !eq(-x, z) ) return __LINE__;
 
     touch(src);
     z = inv * src - inv;
-    if ( !to_bool(y == z) ) return __LINE__;
+    if ( !eq(y, z) ) return __LINE__;
 
     touch(src);
     z = -inv * src + inv;
-    if ( !to_bool(-y == z) ) return __LINE__;
+    if ( !eq(-y, z) ) return __LINE__;
     touch(src);
 
     x = src + inv;
     y = src - inv;
     touch(inv);
     z = src * one + inv;
-    if ( !to_bool(x == z) ) return __LINE__;
+    if ( !eq(x, z) ) return __LINE__;
 
     touch(inv);
     z = -src * one - inv;
-    if ( !to_bool(-x == z) ) return __LINE__;
+    if ( !eq(-x, z) ) return __LINE__;
 
     touch(inv);
     z = src * one - inv;
-    if ( !to_bool(y == z) ) return __LINE__;
+    if ( !eq(y, z) ) return __LINE__;
 
     touch(inv);
     z = -src * one + inv;
-    if ( !to_bool(-y == z) ) return __LINE__;
+    if ( !eq(-y, z) ) return __LINE__;
     touch(inv);
 
 #if defined(addsub) && defined(fmaddsub)
@@ -101,21 +105,21 @@ int fma_test(void)
     y = addsub(src * inv, -one);
     touch(one);
     z = fmaddsub(src, inv, one);
-    if ( !to_bool(x == z) ) return __LINE__;
+    if ( !eq(x, z) ) return __LINE__;
 
     touch(one);
     z = fmaddsub(src, inv, -one);
-    if ( !to_bool(y == z) ) return __LINE__;
+    if ( !eq(y, z) ) return __LINE__;
     touch(one);
 
     x = addsub(src * inv, one);
     touch(inv);
     z = fmaddsub(src, inv, one);
-    if ( !to_bool(x == z) ) return __LINE__;
+    if ( !eq(x, z) ) return __LINE__;
 
     touch(inv);
     z = fmaddsub(src, inv, -one);
-    if ( !to_bool(y == z) ) return __LINE__;
+    if ( !eq(y, z) ) return __LINE__;
     touch(inv);
 #endif
 




_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

^ permalink raw reply	[flat|nested] 465+ messages in thread

* [PATCH v3 17/34] x86emul: support AVX512{F, BW} packed integer compare insns
  2018-09-18 11:46 ` [PATCH v3 00/34] x86emul: fixes, improvements, and beginnings of AVX512 support Jan Beulich
                     ` (15 preceding siblings ...)
  2018-09-18 12:03   ` [PATCH v3 16/34] x86emul/test: introduce eq() Jan Beulich
@ 2018-09-18 12:04   ` Jan Beulich
  2018-09-18 12:05   ` [PATCH v3 18/34] x86emul: support AVX512{F, BW} packed integer arithmetic insns Jan Beulich
                     ` (16 subsequent siblings)
  33 siblings, 0 replies; 465+ messages in thread
From: Jan Beulich @ 2018-09-18 12:04 UTC (permalink / raw)
  To: xen-devel; +Cc: George Dunlap, Andrew Cooper, Wei Liu

Include VPTEST{,N}M{B,D,Q,W}, as these may once again be used by the
compiler for comparisons against all-zero vectors.

Also table entries for a few more insns get their .d8s field set right
away, again in order to not split and later re-combine the groups.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v3: New.

--- a/tools/tests/x86_emulator/evex-disp8.c
+++ b/tools/tests/x86_emulator/evex-disp8.c
@@ -148,8 +148,16 @@ static const struct test avx512f_all[] =
     INSN_FP(mul,             0f, 59),
     INSN(pand,         66,   0f, db,    vl,     dq, vl),
     INSN(pandn,        66,   0f, df,    vl,     dq, vl),
+    INSN(pcmp,         66, 0f3a, 1f,    vl,     dq, vl),
+    INSN(pcmpeqd,      66,   0f, 76,    vl,      d, vl),
+    INSN(pcmpeqq,      66, 0f38, 29,    vl,      q, vl),
+    INSN(pcmpgtd,      66,   0f, 66,    vl,      d, vl),
+    INSN(pcmpgtq,      66, 0f38, 37,    vl,      q, vl),
+    INSN(pcmpu,        66, 0f3a, 1e,    vl,     dq, vl),
     INSN(por,          66,   0f, eb,    vl,     dq, vl),
     INSN(pternlog,     66, 0f3a, 25,    vl,     dq, vl),
+    INSN(ptestm,       66, 0f38, 27,    vl,     dq, vl),
+    INSN(ptestnm,      f3, 0f38, 27,    vl,     dq, vl),
     INSN(pxor,         66,   0f, ef,    vl,     dq, vl),
     INSN_PFP(shuf,           0f, c6),
     INSN_FP(sqrt,            0f, 51),
@@ -181,6 +189,14 @@ static const struct test avx512bw_all[]
     INSN(movdqu8,     f2,   0f, 7f,    vl,   b, vl),
     INSN(movdqu16,    f2,   0f, 6f,    vl,   w, vl),
     INSN(movdqu16,    f2,   0f, 7f,    vl,   w, vl),
+    INSN(pcmp,        66, 0f3a, 3f,    vl,  bw, vl),
+    INSN(pcmpeqb,     66,   0f, 74,    vl,   b, vl),
+    INSN(pcmpeqw,     66,   0f, 75,    vl,   w, vl),
+    INSN(pcmpgtb,     66,   0f, 64,    vl,   b, vl),
+    INSN(pcmpgtw,     66,   0f, 65,    vl,   w, vl),
+    INSN(pcmpu,       66, 0f3a, 3e,    vl,  bw, vl),
+    INSN(ptestm,      66, 0f38, 26,    vl,  bw, vl),
+    INSN(ptestnm,     f3, 0f38, 26,    vl,  bw, vl),
 };
 
 static const struct test avx512dq_all[] = {
--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -311,14 +311,14 @@ static const struct twobyte_table {
     [0x5a ... 0x5b] = { DstImplicit|SrcMem|ModRM|Mov, simd_other },
     [0x5c ... 0x5f] = { DstImplicit|SrcMem|ModRM, simd_any_fp, d8s_vl },
     [0x60 ... 0x62] = { DstImplicit|SrcMem|ModRM, simd_other },
-    [0x63 ... 0x67] = { DstImplicit|SrcMem|ModRM, simd_packed_int },
+    [0x63 ... 0x67] = { DstImplicit|SrcMem|ModRM, simd_packed_int, d8s_vl },
     [0x68 ... 0x6a] = { DstImplicit|SrcMem|ModRM, simd_other },
     [0x6b ... 0x6d] = { DstImplicit|SrcMem|ModRM, simd_packed_int },
     [0x6e] = { DstImplicit|SrcMem|ModRM|Mov, simd_none, d8s_dq },
     [0x6f] = { DstImplicit|SrcMem|ModRM|Mov, simd_packed_int, d8s_vl },
     [0x70] = { SrcImmByte|ModRM|TwoOp, simd_other },
     [0x71 ... 0x73] = { DstImplicit|SrcImmByte|ModRM },
-    [0x74 ... 0x76] = { DstImplicit|SrcMem|ModRM, simd_packed_int },
+    [0x74 ... 0x76] = { DstImplicit|SrcMem|ModRM, simd_packed_int, d8s_vl },
     [0x77] = { DstImplicit|SrcNone },
     [0x78] = { ImplicitOps|ModRM },
     [0x79] = { DstReg|SrcMem|ModRM, simd_packed_int },
@@ -442,13 +442,13 @@ static const struct ext0f38_table {
     [0x1b] = { .simd_size = simd_256, .two_op = 1, .d8s = d8s_vl_by_2 },
     [0x1c ... 0x1e] = { .simd_size = simd_packed_int, .two_op = 1 },
     [0x20 ... 0x25] = { .simd_size = simd_other, .two_op = 1 },
-    [0x28 ... 0x29] = { .simd_size = simd_packed_int },
+    [0x26 ... 0x29] = { .simd_size = simd_packed_int, .d8s = d8s_vl },
     [0x2a] = { .simd_size = simd_packed_int, .two_op = 1, .d8s = d8s_vl },
     [0x2b] = { .simd_size = simd_packed_int },
     [0x2c ... 0x2d] = { .simd_size = simd_packed_fp },
     [0x2e ... 0x2f] = { .simd_size = simd_packed_fp, .to_mem = 1 },
     [0x30 ... 0x35] = { .simd_size = simd_other, .two_op = 1 },
-    [0x36 ... 0x3f] = { .simd_size = simd_packed_int },
+    [0x36 ... 0x3f] = { .simd_size = simd_packed_int, .d8s = d8s_vl },
     [0x40] = { .simd_size = simd_packed_int },
     [0x41] = { .simd_size = simd_packed_int, .two_op = 1 },
     [0x45 ... 0x47] = { .simd_size = simd_packed_int },
@@ -514,6 +514,7 @@ static const struct ext0f3a_table {
     [0x18] = { .simd_size = simd_128 },
     [0x19] = { .simd_size = simd_128, .to_mem = 1, .two_op = 1 },
     [0x1d] = { .simd_size = simd_other, .to_mem = 1, .two_op = 1 },
+    [0x1e ... 0x1f] = { .simd_size = simd_packed_int, .d8s = d8s_vl },
     [0x20] = { .simd_size = simd_none },
     [0x21] = { .simd_size = simd_other },
     [0x22] = { .simd_size = simd_none },
@@ -521,6 +522,7 @@ static const struct ext0f3a_table {
     [0x30 ... 0x33] = { .simd_size = simd_other, .two_op = 1 },
     [0x38] = { .simd_size = simd_128 },
     [0x39] = { .simd_size = simd_128, .to_mem = 1, .two_op = 1 },
+    [0x3e ... 0x3f] = { .simd_size = simd_packed_int, .d8s = d8s_vl },
     [0x40 ... 0x41] = { .simd_size = simd_packed_fp },
     [0x42] = { .simd_size = simd_packed_int },
     [0x44] = { .simd_size = simd_packed_int },
@@ -6558,6 +6560,32 @@ x86_emulate(
         get_fpu(X86EMUL_FPU_mmx);
         goto simd_0f_common;
 
+    case X86EMUL_OPC_EVEX_F3(0x0f38, 0x26): /* vptestnm{b,w} [xyz]mm/mem,[xyz]mm,k{k} */
+    case X86EMUL_OPC_EVEX_F3(0x0f38, 0x27): /* vptestnm{d,q} [xyz]mm/mem,[xyz]mm,k{k} */
+        op_bytes = 16 << evex.lr;
+        /* fall through */
+    case X86EMUL_OPC_EVEX_66(0x0f,   0x64): /* vpcmpgtb [xyz]mm/mem,[xyz]mm,k{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f,   0x65): /* vpcmpgtw [xyz]mm/mem,[xyz]mm,k{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f,   0x66): /* vpcmpgtd [xyz]mm/mem,[xyz]mm,k{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f,   0x74): /* vpcmpeqb [xyz]mm/mem,[xyz]mm,k{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f,   0x75): /* vpcmpeqw [xyz]mm/mem,[xyz]mm,k{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f,   0x76): /* vpcmpeqd [xyz]mm/mem,[xyz]mm,k{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0x26): /* vptestm{b,w} [xyz]mm/mem,[xyz]mm,k{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0x27): /* vptestm{d,q} [xyz]mm/mem,[xyz]mm,k{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0x29): /* vpcmpeqq [xyz]mm/mem,[xyz]mm,k{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0x37): /* vpcmpgtq [xyz]mm/mem,[xyz]mm,k{k} */
+        generate_exception_if(!evex.r || !evex.R || evex.z, EXC_UD);
+        if ( b & (ext == ext_0f38 ? 1 : 2) )
+        {
+            generate_exception_if(b != 0x27 && evex.w != (b & 1), EXC_UD);
+            goto avx512f_no_sae;
+        }
+        host_and_vcpu_must_have(avx512bw);
+        generate_exception_if(evex.br, EXC_UD);
+        elem_bytes = 1 << (ext == ext_0f ? b & 1 : evex.w);
+        avx512_vlen_check(false);
+        goto simd_zmm;
+
     CASE_SIMD_PACKED_INT(0x0f, 0x6e):    /* mov{d,q} r/m,{,x}mm */
     case X86EMUL_OPC_VEX_66(0x0f, 0x6e): /* vmov{d,q} r/m,xmm */
     CASE_SIMD_PACKED_INT(0x0f, 0x7e):    /* mov{d,q} {,x}mm,r/m */
@@ -7566,6 +7594,7 @@ x86_emulate(
                               EXC_UD);
         /* fall through */
     case X86EMUL_OPC_EVEX_66(0x0f3a, 0x25): /* vpternlog{d,q} $imm8,[xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+    avx512f_imm_no_sae:
         host_and_vcpu_must_have(avx512f);
         generate_exception_if(ea.type != OP_MEM && evex.br, EXC_UD);
         avx512_vlen_check(false);
@@ -8739,6 +8768,19 @@ x86_emulate(
         break;
     }
 
+    case X86EMUL_OPC_EVEX_66(0x0f3a, 0x1e): /* vpcmpu{d,q} $imm8,[xyz]mm/mem,[xyz]mm,k{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f3a, 0x1f): /* vpcmp{d,q} $imm8,[xyz]mm/mem,[xyz]mm,k{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f3a, 0x3e): /* vpcmpu{b,w} $imm8,[xyz]mm/mem,[xyz]mm,k{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f3a, 0x3f): /* vpcmp{b,w} $imm8,[xyz]mm/mem,[xyz]mm,k{k} */
+        generate_exception_if(!evex.r || !evex.R || evex.z, EXC_UD);
+        if ( !(b & 0x20) )
+            goto avx512f_imm_no_sae;
+        host_and_vcpu_must_have(avx512bw);
+        generate_exception_if(evex.br, EXC_UD);
+        elem_bytes = 1 << evex.w;
+        avx512_vlen_check(false);
+        goto simd_imm8_zmm;
+
     case X86EMUL_OPC_66(0x0f3a, 0x20): /* pinsrb $imm8,r32/m8,xmm */
     case X86EMUL_OPC_66(0x0f3a, 0x22): /* pinsr{d,q} $imm8,r/m,xmm */
         host_and_vcpu_must_have(sse4_1);




_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

^ permalink raw reply	[flat|nested] 465+ messages in thread

* [PATCH v3 18/34] x86emul: support AVX512{F, BW} packed integer arithmetic insns
  2018-09-18 11:46 ` [PATCH v3 00/34] x86emul: fixes, improvements, and beginnings of AVX512 support Jan Beulich
                     ` (16 preceding siblings ...)
  2018-09-18 12:04   ` [PATCH v3 17/34] x86emul: support AVX512{F, BW} packed integer compare insns Jan Beulich
@ 2018-09-18 12:05   ` Jan Beulich
  2018-09-18 12:05   ` [PATCH v3 19/34] x86emul: use simd_128 also for legacy vector shift insns Jan Beulich
                     ` (15 subsequent siblings)
  33 siblings, 0 replies; 465+ messages in thread
From: Jan Beulich @ 2018-09-18 12:05 UTC (permalink / raw)
  To: xen-devel; +Cc: George Dunlap, Andrew Cooper, Wei Liu

Note: vpadd* / vpsub* et al are put at seemingly the wrong slot of the
big switch(). This is in anticipation of adding vpunpck* to those
groups (see the legacy/VEX encoded case labels nearby to support this).

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v3: New.

--- a/tools/tests/x86_emulator/evex-disp8.c
+++ b/tools/tests/x86_emulator/evex-disp8.c
@@ -146,6 +146,8 @@ static const struct test avx512f_all[] =
     INSN_PFP_NB(movu,        0f, 10),
     INSN_PFP_NB(movu,        0f, 11),
     INSN_FP(mul,             0f, 59),
+    INSN(paddd,        66,   0f, fe,    vl,      d, vl),
+    INSN(paddq,        66,   0f, d4,    vl,      q, vl),
     INSN(pand,         66,   0f, db,    vl,     dq, vl),
     INSN(pandn,        66,   0f, df,    vl,     dq, vl),
     INSN(pcmp,         66, 0f3a, 1f,    vl,     dq, vl),
@@ -154,7 +156,16 @@ static const struct test avx512f_all[] =
     INSN(pcmpgtd,      66,   0f, 66,    vl,      d, vl),
     INSN(pcmpgtq,      66, 0f38, 37,    vl,      q, vl),
     INSN(pcmpu,        66, 0f3a, 1e,    vl,     dq, vl),
+    INSN(pmaxs,        66, 0f38, 3d,    vl,     dq, vl),
+    INSN(pmaxu,        66, 0f38, 3f,    vl,     dq, vl),
+    INSN(pmins,        66, 0f38, 39,    vl,     dq, vl),
+    INSN(pminu,        66, 0f38, 3b,    vl,     dq, vl),
+    INSN(pmuldq,       66, 0f38, 28,    vl,      q, vl),
+    INSN(pmulld,       66, 0f38, 40,    vl,      d, vl),
+    INSN(pmuludq,      66,   0f, f4,    vl,      q, vl),
     INSN(por,          66,   0f, eb,    vl,     dq, vl),
+    INSN(psubd,        66,   0f, fa,    vl,      d, vl),
+    INSN(psubq,        66,   0f, fb,    vl,      q, vl),
     INSN(pternlog,     66, 0f3a, 25,    vl,     dq, vl),
     INSN(ptestm,       66, 0f38, 27,    vl,     dq, vl),
     INSN(ptestnm,      f3, 0f38, 27,    vl,     dq, vl),
@@ -189,12 +200,39 @@ static const struct test avx512bw_all[]
     INSN(movdqu8,     f2,   0f, 7f,    vl,   b, vl),
     INSN(movdqu16,    f2,   0f, 6f,    vl,   w, vl),
     INSN(movdqu16,    f2,   0f, 7f,    vl,   w, vl),
+    INSN(paddb,       66,   0f, fc,    vl,   b, vl),
+    INSN(paddsb,      66,   0f, ec,    vl,   b, vl),
+    INSN(paddsw,      66,   0f, ed,    vl,   w, vl),
+    INSN(paddusb,     66,   0f, dc,    vl,   b, vl),
+    INSN(paddusw,     66,   0f, dd,    vl,   w, vl),
+    INSN(paddw,       66,   0f, fd,    vl,   w, vl),
+    INSN(pavgb,       66,   0f, e0,    vl,   b, vl),
+    INSN(pavgw,       66,   0f, e3,    vl,   w, vl),
     INSN(pcmp,        66, 0f3a, 3f,    vl,  bw, vl),
     INSN(pcmpeqb,     66,   0f, 74,    vl,   b, vl),
     INSN(pcmpeqw,     66,   0f, 75,    vl,   w, vl),
     INSN(pcmpgtb,     66,   0f, 64,    vl,   b, vl),
     INSN(pcmpgtw,     66,   0f, 65,    vl,   w, vl),
     INSN(pcmpu,       66, 0f3a, 3e,    vl,  bw, vl),
+    INSN(pmaddwd,     66,   0f, f5,    vl,   w, vl),
+    INSN(pmaxsb,      66, 0f38, 3c,    vl,   b, vl),
+    INSN(pmaxsw,      66,   0f, ee,    vl,   w, vl),
+    INSN(pmaxub,      66,   0f, de,    vl,   b, vl),
+    INSN(pmaxuw,      66, 0f38, 3e,    vl,   w, vl),
+    INSN(pminsb,      66, 0f38, 38,    vl,   b, vl),
+    INSN(pminsw,      66,   0f, ea,    vl,   w, vl),
+    INSN(pminub,      66,   0f, da,    vl,   b, vl),
+    INSN(pminuw,      66, 0f38, 3a,    vl,   w, vl),
+    INSN(pmulhuw,     66,   0f, e4,    vl,   w, vl),
+    INSN(pmulhw,      66,   0f, e5,    vl,   w, vl),
+    INSN(pmullw,      66,   0f, d5,    vl,   w, vl),
+    INSN(psadbw,      66,   0f, f6,    vl,   b, vl),
+    INSN(psubb,       66,   0f, f8,    vl,   b, vl),
+    INSN(psubsb,      66,   0f, e8,    vl,   b, vl),
+    INSN(psubsw,      66,   0f, e9,    vl,   w, vl),
+    INSN(psubusb,     66,   0f, d8,    vl,   b, vl),
+    INSN(psubusw,     66,   0f, d9,    vl,   w, vl),
+    INSN(psubw,       66,   0f, f9,    vl,   w, vl),
     INSN(ptestm,      66, 0f38, 26,    vl,  bw, vl),
     INSN(ptestnm,     f3, 0f38, 26,    vl,  bw, vl),
 };
@@ -203,6 +241,7 @@ static const struct test avx512dq_all[]
     INSN_PFP(and,              0f, 54),
     INSN_PFP(andn,             0f, 55),
     INSN_PFP(or,               0f, 56),
+    INSN(pmullq,         66, 0f38, 40,   vl,  q, vl),
     INSN_PFP(xor,              0f, 57),
 };
 
--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -365,21 +365,21 @@ static const struct twobyte_table {
     [0xc8 ... 0xcf] = { ImplicitOps },
     [0xd0] = { DstImplicit|SrcMem|ModRM, simd_other },
     [0xd1 ... 0xd3] = { DstImplicit|SrcMem|ModRM, simd_other },
-    [0xd4 ... 0xd5] = { DstImplicit|SrcMem|ModRM, simd_packed_int },
+    [0xd4 ... 0xd5] = { DstImplicit|SrcMem|ModRM, simd_packed_int, d8s_vl },
     [0xd6] = { DstMem|SrcImplicit|ModRM|Mov, simd_other, 3 },
     [0xd7] = { DstReg|SrcImplicit|ModRM|Mov },
     [0xd8 ... 0xdf] = { DstImplicit|SrcMem|ModRM, simd_packed_int, d8s_vl },
-    [0xe0] = { DstImplicit|SrcMem|ModRM, simd_packed_int },
+    [0xe0] = { DstImplicit|SrcMem|ModRM, simd_packed_int, d8s_vl },
     [0xe1 ... 0xe2] = { DstImplicit|SrcMem|ModRM, simd_other },
-    [0xe3 ... 0xe5] = { DstImplicit|SrcMem|ModRM, simd_packed_int },
+    [0xe3 ... 0xe5] = { DstImplicit|SrcMem|ModRM, simd_packed_int, d8s_vl },
     [0xe6] = { DstImplicit|SrcMem|ModRM|Mov, simd_other },
     [0xe7] = { DstMem|SrcImplicit|ModRM|Mov, simd_packed_int, d8s_vl },
     [0xe8 ... 0xef] = { DstImplicit|SrcMem|ModRM, simd_packed_int, d8s_vl },
     [0xf0] = { DstImplicit|SrcMem|ModRM|Mov, simd_other },
     [0xf1 ... 0xf3] = { DstImplicit|SrcMem|ModRM, simd_other },
-    [0xf4 ... 0xf6] = { DstImplicit|SrcMem|ModRM, simd_packed_int },
+    [0xf4 ... 0xf6] = { DstImplicit|SrcMem|ModRM, simd_packed_int, d8s_vl },
     [0xf7] = { DstMem|SrcMem|ModRM|Mov, simd_packed_int },
-    [0xf8 ... 0xfe] = { DstImplicit|SrcMem|ModRM, simd_packed_int },
+    [0xf8 ... 0xfe] = { DstImplicit|SrcMem|ModRM, simd_packed_int, d8s_vl },
     [0xff] = { ModRM }
 };
 
@@ -449,7 +449,7 @@ static const struct ext0f38_table {
     [0x2e ... 0x2f] = { .simd_size = simd_packed_fp, .to_mem = 1 },
     [0x30 ... 0x35] = { .simd_size = simd_other, .two_op = 1 },
     [0x36 ... 0x3f] = { .simd_size = simd_packed_int, .d8s = d8s_vl },
-    [0x40] = { .simd_size = simd_packed_int },
+    [0x40] = { .simd_size = simd_packed_int, .d8s = d8s_vl },
     [0x41] = { .simd_size = simd_packed_int, .two_op = 1 },
     [0x45 ... 0x47] = { .simd_size = simd_packed_int },
     [0x58 ... 0x59] = { .simd_size = simd_other, .two_op = 1 },
@@ -5960,6 +5960,10 @@ x86_emulate(
     case X86EMUL_OPC_EVEX_66(0x0f, 0xdf): /* vpandn{d,q} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
     case X86EMUL_OPC_EVEX_66(0x0f, 0xeb): /* vpor{d,q} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
     case X86EMUL_OPC_EVEX_66(0x0f, 0xef): /* vpxor{d,q} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0x39): /* vpmins{d,q} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0x3b): /* vpminu{d,q} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0x3d): /* vpmaxs{d,q} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0x3f): /* vpmaxu{d,q} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
     avx512f_no_sae:
         host_and_vcpu_must_have(avx512f);
         generate_exception_if(ea.type != OP_MEM && evex.br, EXC_UD);
@@ -6560,6 +6564,37 @@ x86_emulate(
         get_fpu(X86EMUL_FPU_mmx);
         goto simd_0f_common;
 
+    case X86EMUL_OPC_EVEX_66(0x0f, 0xf5): /* vpmaddwd [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f, 0xf6): /* vpsadbw [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+        fault_suppression = false;
+        /* fall through */
+    case X86EMUL_OPC_EVEX_66(0x0f, 0xd5): /* vpmullw [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f, 0xd8): /* vpsubusb [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f, 0xd9): /* vpsubusw [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f, 0xdc): /* vpaddusb [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f, 0xdd): /* vpaddusw [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f, 0xe0): /* vpavgb [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f, 0xe3): /* vpavgw [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f, 0xe5): /* vpmulhw [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f, 0xe8): /* vpsubsb [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f, 0xe9): /* vpsubsw [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f, 0xec): /* vpaddsb [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f, 0xed): /* vpaddsw [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f, 0xf8): /* vpsubb [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f, 0xf9): /* vpsubw [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f, 0xfc): /* vpaddb [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f, 0xfd): /* vpaddw [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+        host_and_vcpu_must_have(avx512bw);
+        generate_exception_if(evex.br, EXC_UD);
+        elem_bytes = 1 << (b & 1);
+        goto avx512f_no_sae;
+
+    case X86EMUL_OPC_EVEX_66(0x0f, 0xfa): /* vpsubd [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f, 0xfb): /* vpsubq [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f, 0xfe): /* vpaddd [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+        generate_exception_if(evex.w != (b & 1), EXC_UD);
+        goto avx512f_no_sae;
+
     case X86EMUL_OPC_EVEX_F3(0x0f38, 0x26): /* vptestnm{b,w} [xyz]mm/mem,[xyz]mm,k{k} */
     case X86EMUL_OPC_EVEX_F3(0x0f38, 0x27): /* vptestnm{d,q} [xyz]mm/mem,[xyz]mm,k{k} */
         op_bytes = 16 << evex.lr;
@@ -6586,6 +6621,12 @@ x86_emulate(
         avx512_vlen_check(false);
         goto simd_zmm;
 
+    case X86EMUL_OPC_EVEX_66(0x0f, 0xd4): /* vpaddq [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f, 0xf4): /* vpmuludq [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0x28): /* vpmuldq [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+        generate_exception_if(!evex.w, EXC_UD);
+        goto avx512f_no_sae;
+
     CASE_SIMD_PACKED_INT(0x0f, 0x6e):    /* mov{d,q} r/m,{,x}mm */
     case X86EMUL_OPC_VEX_66(0x0f, 0x6e): /* vmov{d,q} r/m,xmm */
     CASE_SIMD_PACKED_INT(0x0f, 0x7e):    /* mov{d,q} {,x}mm,r/m */
@@ -7837,6 +7878,16 @@ x86_emulate(
         vcpu_must_have(mmxext);
         goto simd_0f_mmx;
 
+    case X86EMUL_OPC_EVEX_66(0x0f, 0xda): /* vpminub [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f, 0xde): /* vpmaxub [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f, 0xe4): /* vpmulhuw [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f, 0xea): /* vpminsw [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f, 0xee): /* vpmaxsw [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+        host_and_vcpu_must_have(avx512bw);
+        generate_exception_if(evex.br, EXC_UD);
+        elem_bytes = b & 0x10 ? 1 : 2;
+        goto avx512f_no_sae;
+
     case X86EMUL_OPC_66(0x0f, 0xe6):       /* cvttpd2dq xmm/mem,xmm */
     case X86EMUL_OPC_VEX_66(0x0f, 0xe6):   /* vcvttpd2dq {x,y}mm/mem,xmm */
     case X86EMUL_OPC_F3(0x0f, 0xe6):       /* cvtdq2pd xmm/mem,xmm */
@@ -8210,6 +8261,20 @@ x86_emulate(
         host_and_vcpu_must_have(sse4_2);
         goto simd_0f38_common;
 
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0x38): /* vpminsb [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0x3a): /* vpminuw [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0x3c): /* vpmaxsb [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0x3e): /* vpmaxuw [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+        host_and_vcpu_must_have(avx512bw);
+        generate_exception_if(evex.br, EXC_UD);
+        elem_bytes = b & 2 ?: 1;
+        goto avx512f_no_sae;
+
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0x40): /* vpmull{d,q} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+        if ( evex.w )
+            host_and_vcpu_must_have(avx512dq);
+        goto avx512f_no_sae;
+
     case X86EMUL_OPC_66(0x0f38, 0xdb):     /* aesimc xmm/m128,xmm */
     case X86EMUL_OPC_VEX_66(0x0f38, 0xdb): /* vaesimc xmm/m128,xmm */
     case X86EMUL_OPC_66(0x0f38, 0xdc):     /* aesenc xmm/m128,xmm,xmm */




_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

^ permalink raw reply	[flat|nested] 465+ messages in thread

* [PATCH v3 19/34] x86emul: use simd_128 also for legacy vector shift insns
  2018-09-18 11:46 ` [PATCH v3 00/34] x86emul: fixes, improvements, and beginnings of AVX512 support Jan Beulich
                     ` (17 preceding siblings ...)
  2018-09-18 12:05   ` [PATCH v3 18/34] x86emul: support AVX512{F, BW} packed integer arithmetic insns Jan Beulich
@ 2018-09-18 12:05   ` Jan Beulich
  2018-09-18 12:05   ` [PATCH v3 20/34] x86emul: support AVX512{F, BW} shift/rotate insns Jan Beulich
                     ` (14 subsequent siblings)
  33 siblings, 0 replies; 465+ messages in thread
From: Jan Beulich @ 2018-09-18 12:05 UTC (permalink / raw)
  To: xen-devel; +Cc: George Dunlap, Andrew Cooper, Wei Liu

This eliminates a separate case block here, and allows us to get away with
fewer new ones when adding AVX512 vector shifts.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v3: New.

--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -364,19 +364,19 @@ static const struct twobyte_table {
     [0xc7] = { ImplicitOps|ModRM },
     [0xc8 ... 0xcf] = { ImplicitOps },
     [0xd0] = { DstImplicit|SrcMem|ModRM, simd_other },
-    [0xd1 ... 0xd3] = { DstImplicit|SrcMem|ModRM, simd_other },
+    [0xd1 ... 0xd3] = { DstImplicit|SrcMem|ModRM, simd_128 },
     [0xd4 ... 0xd5] = { DstImplicit|SrcMem|ModRM, simd_packed_int, d8s_vl },
     [0xd6] = { DstMem|SrcImplicit|ModRM|Mov, simd_other, 3 },
     [0xd7] = { DstReg|SrcImplicit|ModRM|Mov },
     [0xd8 ... 0xdf] = { DstImplicit|SrcMem|ModRM, simd_packed_int, d8s_vl },
     [0xe0] = { DstImplicit|SrcMem|ModRM, simd_packed_int, d8s_vl },
-    [0xe1 ... 0xe2] = { DstImplicit|SrcMem|ModRM, simd_other },
+    [0xe1 ... 0xe2] = { DstImplicit|SrcMem|ModRM, simd_128 },
     [0xe3 ... 0xe5] = { DstImplicit|SrcMem|ModRM, simd_packed_int, d8s_vl },
     [0xe6] = { DstImplicit|SrcMem|ModRM|Mov, simd_other },
     [0xe7] = { DstMem|SrcImplicit|ModRM|Mov, simd_packed_int, d8s_vl },
     [0xe8 ... 0xef] = { DstImplicit|SrcMem|ModRM, simd_packed_int, d8s_vl },
     [0xf0] = { DstImplicit|SrcMem|ModRM|Mov, simd_other },
-    [0xf1 ... 0xf3] = { DstImplicit|SrcMem|ModRM, simd_other },
+    [0xf1 ... 0xf3] = { DstImplicit|SrcMem|ModRM, simd_128 },
     [0xf4 ... 0xf6] = { DstImplicit|SrcMem|ModRM, simd_packed_int, d8s_vl },
     [0xf7] = { DstMem|SrcMem|ModRM|Mov, simd_packed_int },
     [0xf8 ... 0xfe] = { DstImplicit|SrcMem|ModRM, simd_packed_int, d8s_vl },
@@ -3327,7 +3327,8 @@ x86_decode(
         break;
 
     case simd_128:
-        op_bytes = 16;
+        /* The special case here are MMX shift insns. */
+        op_bytes = vex.opcx || vex.pfx ? 16 : 8;
         break;
 
     case simd_256:
@@ -6455,6 +6456,12 @@ x86_emulate(
     case X86EMUL_OPC_VEX_66(0x0f, 0x75): /* vpcmpeqw {x,y}mm/mem,{x,y}mm,{x,y}mm */
     CASE_SIMD_PACKED_INT(0x0f, 0x76):    /* pcmpeqd {,x}mm/mem,{,x}mm */
     case X86EMUL_OPC_VEX_66(0x0f, 0x76): /* vpcmpeqd {x,y}mm/mem,{x,y}mm,{x,y}mm */
+    CASE_SIMD_PACKED_INT(0x0f, 0xd1):    /* psrlw {,x}mm/mem,{,x}mm */
+    case X86EMUL_OPC_VEX_66(0x0f, 0xd1): /* vpsrlw xmm/m128,{x,y}mm,{x,y}mm */
+    CASE_SIMD_PACKED_INT(0x0f, 0xd2):    /* psrld {,x}mm/mem,{,x}mm */
+    case X86EMUL_OPC_VEX_66(0x0f, 0xd2): /* vpsrld xmm/m128,{x,y}mm,{x,y}mm */
+    CASE_SIMD_PACKED_INT(0x0f, 0xd3):    /* psrlq {,x}mm/mem,{,x}mm */
+    case X86EMUL_OPC_VEX_66(0x0f, 0xd3): /* vpsrlq xmm/m128,{x,y}mm,{x,y}mm */
     case X86EMUL_OPC_66(0x0f, 0xd4):     /* paddq xmm/m128,xmm */
     case X86EMUL_OPC_VEX_66(0x0f, 0xd4): /* vpaddq {x,y}mm/mem,{x,y}mm,{x,y}mm */
     CASE_SIMD_PACKED_INT(0x0f, 0xd5):    /* pmullw {,x}mm/mem,{,x}mm */
@@ -6477,6 +6484,10 @@ x86_emulate(
     case X86EMUL_OPC_VEX_66(0x0f, 0xdf): /* vpandn {x,y}mm/mem,{x,y}mm,{x,y}mm */
     case X86EMUL_OPC_66(0x0f, 0xe0):     /* pavgb xmm/m128,xmm */
     case X86EMUL_OPC_VEX_66(0x0f, 0xe0): /* vpavgb {x,y}mm/mem,{x,y}mm,{x,y}mm */
+    CASE_SIMD_PACKED_INT(0x0f, 0xe1):    /* psraw {,x}mm/mem,{,x}mm */
+    case X86EMUL_OPC_VEX_66(0x0f, 0xe1): /* vpsraw xmm/m128,{x,y}mm,{x,y}mm */
+    CASE_SIMD_PACKED_INT(0x0f, 0xe2):    /* psrad {,x}mm/mem,{,x}mm */
+    case X86EMUL_OPC_VEX_66(0x0f, 0xe2): /* vpsrad xmm/m128,{x,y}mm,{x,y}mm */
     case X86EMUL_OPC_66(0x0f, 0xe3):     /* pavgw xmm/m128,xmm */
     case X86EMUL_OPC_VEX_66(0x0f, 0xe3): /* vpavgw {x,y}mm/mem,{x,y}mm,{x,y}mm */
     case X86EMUL_OPC_66(0x0f, 0xe4):     /* pmulhuw xmm/m128,xmm */
@@ -6499,6 +6510,12 @@ x86_emulate(
     case X86EMUL_OPC_VEX_66(0x0f, 0xee): /* vpmaxsw {x,y}mm/mem,{x,y}mm,{x,y}mm */
     CASE_SIMD_PACKED_INT(0x0f, 0xef):    /* pxor {,x}mm/mem,{,x}mm */
     case X86EMUL_OPC_VEX_66(0x0f, 0xef): /* vpxor {x,y}mm/mem,{x,y}mm,{x,y}mm */
+    CASE_SIMD_PACKED_INT(0x0f, 0xf1):    /* psllw {,x}mm/mem,{,x}mm */
+    case X86EMUL_OPC_VEX_66(0x0f, 0xf1): /* vpsllw xmm/m128,{x,y}mm,{x,y}mm */
+    CASE_SIMD_PACKED_INT(0x0f, 0xf2):    /* pslld {,x}mm/mem,{,x}mm */
+    case X86EMUL_OPC_VEX_66(0x0f, 0xf2): /* vpslld xmm/m128,{x,y}mm,{x,y}mm */
+    CASE_SIMD_PACKED_INT(0x0f, 0xf3):    /* psllq {,x}mm/mem,{,x}mm */
+    case X86EMUL_OPC_VEX_66(0x0f, 0xf3): /* vpsllq xmm/m128,{x,y}mm,{x,y}mm */
     case X86EMUL_OPC_66(0x0f, 0xf4):     /* pmuludq xmm/m128,xmm */
     case X86EMUL_OPC_VEX_66(0x0f, 0xf4): /* vpmuludq {x,y}mm/mem,{x,y}mm,{x,y}mm */
     case X86EMUL_OPC_66(0x0f, 0xf6):     /* psadbw xmm/m128,xmm */
@@ -7831,25 +7848,6 @@ x86_emulate(
         }
         break;
 
-    CASE_SIMD_PACKED_INT(0x0f, 0xd1):    /* psrlw {,x}mm/mem,{,x}mm */
-    case X86EMUL_OPC_VEX_66(0x0f, 0xd1): /* vpsrlw xmm/m128,{x,y}mm,{x,y}mm */
-    CASE_SIMD_PACKED_INT(0x0f, 0xd2):    /* psrld {,x}mm/mem,{,x}mm */
-    case X86EMUL_OPC_VEX_66(0x0f, 0xd2): /* vpsrld xmm/m128,{x,y}mm,{x,y}mm */
-    CASE_SIMD_PACKED_INT(0x0f, 0xd3):    /* psrlq {,x}mm/mem,{,x}mm */
-    case X86EMUL_OPC_VEX_66(0x0f, 0xd3): /* vpsrlq xmm/m128,{x,y}mm,{x,y}mm */
-    CASE_SIMD_PACKED_INT(0x0f, 0xe1):    /* psraw {,x}mm/mem,{,x}mm */
-    case X86EMUL_OPC_VEX_66(0x0f, 0xe1): /* vpsraw xmm/m128,{x,y}mm,{x,y}mm */
-    CASE_SIMD_PACKED_INT(0x0f, 0xe2):    /* psrad {,x}mm/mem,{,x}mm */
-    case X86EMUL_OPC_VEX_66(0x0f, 0xe2): /* vpsrad xmm/m128,{x,y}mm,{x,y}mm */
-    CASE_SIMD_PACKED_INT(0x0f, 0xf1):    /* psllw {,x}mm/mem,{,x}mm */
-    case X86EMUL_OPC_VEX_66(0x0f, 0xf1): /* vpsllw xmm/m128,{x,y}mm,{x,y}mm */
-    CASE_SIMD_PACKED_INT(0x0f, 0xf2):    /* pslld {,x}mm/mem,{,x}mm */
-    case X86EMUL_OPC_VEX_66(0x0f, 0xf2): /* vpslld xmm/m128,{x,y}mm,{x,y}mm */
-    CASE_SIMD_PACKED_INT(0x0f, 0xf3):    /* psllq {,x}mm/mem,{,x}mm */
-    case X86EMUL_OPC_VEX_66(0x0f, 0xf3): /* vpsllq xmm/m128,{x,y}mm,{x,y}mm */
-        op_bytes = vex.pfx ? 16 : 8;
-        goto simd_0f_int;
-
     case X86EMUL_OPC(0x0f, 0xd4):        /* paddq mm/m64,mm */
     case X86EMUL_OPC(0x0f, 0xf4):        /* pmuludq mm/m64,mm */
     case X86EMUL_OPC(0x0f, 0xfb):        /* psubq mm/m64,mm */




_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

^ permalink raw reply	[flat|nested] 465+ messages in thread

* [PATCH v3 20/34] x86emul: support AVX512{F, BW} shift/rotate insns
  2018-09-18 11:46 ` [PATCH v3 00/34] x86emul: fixes, improvements, and beginnings of AVX512 support Jan Beulich
                     ` (18 preceding siblings ...)
  2018-09-18 12:05   ` [PATCH v3 19/34] x86emul: use simd_128 also for legacy vector shift insns Jan Beulich
@ 2018-09-18 12:05   ` Jan Beulich
  2018-09-18 12:06   ` [PATCH v3 21/34] x86emul: support AVX512{F, BW, DQ} extract insns Jan Beulich
                     ` (13 subsequent siblings)
  33 siblings, 0 replies; 465+ messages in thread
From: Jan Beulich @ 2018-09-18 12:05 UTC (permalink / raw)
  To: xen-devel; +Cc: George Dunlap, Andrew Cooper, Wei Liu

Note that simd_packed_fp for the opcode space 0f38 major opcodes 14 and
15 is not really correct, but sufficient for the purposes here. Further
adjustments may later be needed for the down-converting, unsigned
saturating VPMOV* insns, first and foremost for the different Disp8
scaling those insns use.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v3: New.

--- a/tools/tests/x86_emulator/evex-disp8.c
+++ b/tools/tests/x86_emulator/evex-disp8.c
@@ -164,6 +164,24 @@ static const struct test avx512f_all[] =
     INSN(pmulld,       66, 0f38, 40,    vl,      d, vl),
     INSN(pmuludq,      66,   0f, f4,    vl,      q, vl),
     INSN(por,          66,   0f, eb,    vl,     dq, vl),
+    INSNX(prol,        66,   0f, 72, 1, vl,     dq, vl),
+    INSN(prolv,        66, 0f38, 15,    vl,     dq, vl),
+    INSNX(pror,        66,   0f, 72, 0, vl,     dq, vl),
+    INSN(prorv,        66, 0f38, 14,    vl,     dq, vl),
+    INSN(pslld,        66,   0f, f2,    el_4,    d, vl),
+    INSNX(pslld,       66,   0f, 72, 6, vl,      d, vl),
+    INSN(psllq,        66,   0f, f3,    el_2,    q, vl),
+    INSNX(psllq,       66,   0f, 73, 6, vl,      q, vl),
+    INSN(psllv,        66, 0f38, 47,    vl,     dq, vl),
+    INSNX(psra,        66,   0f, 72, 4, vl,     dq, vl),
+    INSN(psrad,        66,   0f, e2,    el_4,    d, vl),
+    INSN(psraq,        66,   0f, e2,    el_2,    q, vl),
+    INSN(psrav,        66, 0f38, 46,    vl,     dq, vl),
+    INSN(psrld,        66,   0f, d2,    el_4,    d, vl),
+    INSNX(psrld,       66,   0f, 72, 2, vl,      d, vl),
+    INSN(psrlq,        66,   0f, d3,    el_2,    q, vl),
+    INSNX(psrlq,       66,   0f, 73, 2, vl,      q, vl),
+    INSN(psrlv,        66, 0f38, 45,    vl,     dq, vl),
     INSN(psubd,        66,   0f, fa,    vl,      d, vl),
     INSN(psubq,        66,   0f, fb,    vl,      q, vl),
     INSN(pternlog,     66, 0f3a, 25,    vl,     dq, vl),
@@ -227,6 +245,17 @@ static const struct test avx512bw_all[]
     INSN(pmulhw,      66,   0f, e5,    vl,   w, vl),
     INSN(pmullw,      66,   0f, d5,    vl,   w, vl),
     INSN(psadbw,      66,   0f, f6,    vl,   b, vl),
+    INSNX(pslldq,     66,   0f, 73, 7, vl,   b, vl),
+    INSN(psllvw,      66, 0f38, 12,    vl,   w, vl),
+    INSN(psllw,       66,   0f, f1,    el_8, w, vl),
+    INSNX(psllw,      66,   0f, 71, 6, vl,   w, vl),
+    INSN(psravw,      66, 0f38, 11,    vl,   w, vl),
+    INSN(psraw,       66,   0f, e1,    el_8, w, vl),
+    INSNX(psraw,      66,   0f, 71, 4, vl,   w, vl),
+    INSNX(psrldq,     66,   0f, 73, 3, vl,   b, vl),
+    INSN(psrlvw,      66, 0f38, 10,    vl,   w, vl),
+    INSN(psrlw,       66,   0f, d1,    el_8, w, vl),
+    INSNX(psrlw,      66,   0f, 71, 2, vl,   w, vl),
     INSN(psubb,       66,   0f, f8,    vl,   b, vl),
     INSN(psubsb,      66,   0f, e8,    vl,   b, vl),
     INSN(psubsw,      66,   0f, e9,    vl,   w, vl),
--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -317,7 +317,7 @@ static const struct twobyte_table {
     [0x6e] = { DstImplicit|SrcMem|ModRM|Mov, simd_none, d8s_dq },
     [0x6f] = { DstImplicit|SrcMem|ModRM|Mov, simd_packed_int, d8s_vl },
     [0x70] = { SrcImmByte|ModRM|TwoOp, simd_other },
-    [0x71 ... 0x73] = { DstImplicit|SrcImmByte|ModRM },
+    [0x71 ... 0x73] = { DstImplicit|SrcImmByte|ModRM, simd_none, d8s_vl },
     [0x74 ... 0x76] = { DstImplicit|SrcMem|ModRM, simd_packed_int, d8s_vl },
     [0x77] = { DstImplicit|SrcNone },
     [0x78] = { ImplicitOps|ModRM },
@@ -364,19 +364,19 @@ static const struct twobyte_table {
     [0xc7] = { ImplicitOps|ModRM },
     [0xc8 ... 0xcf] = { ImplicitOps },
     [0xd0] = { DstImplicit|SrcMem|ModRM, simd_other },
-    [0xd1 ... 0xd3] = { DstImplicit|SrcMem|ModRM, simd_128 },
+    [0xd1 ... 0xd3] = { DstImplicit|SrcMem|ModRM, simd_128, 4 },
     [0xd4 ... 0xd5] = { DstImplicit|SrcMem|ModRM, simd_packed_int, d8s_vl },
     [0xd6] = { DstMem|SrcImplicit|ModRM|Mov, simd_other, 3 },
     [0xd7] = { DstReg|SrcImplicit|ModRM|Mov },
     [0xd8 ... 0xdf] = { DstImplicit|SrcMem|ModRM, simd_packed_int, d8s_vl },
     [0xe0] = { DstImplicit|SrcMem|ModRM, simd_packed_int, d8s_vl },
-    [0xe1 ... 0xe2] = { DstImplicit|SrcMem|ModRM, simd_128 },
+    [0xe1 ... 0xe2] = { DstImplicit|SrcMem|ModRM, simd_128, 4 },
     [0xe3 ... 0xe5] = { DstImplicit|SrcMem|ModRM, simd_packed_int, d8s_vl },
     [0xe6] = { DstImplicit|SrcMem|ModRM|Mov, simd_other },
     [0xe7] = { DstMem|SrcImplicit|ModRM|Mov, simd_packed_int, d8s_vl },
     [0xe8 ... 0xef] = { DstImplicit|SrcMem|ModRM, simd_packed_int, d8s_vl },
     [0xf0] = { DstImplicit|SrcMem|ModRM|Mov, simd_other },
-    [0xf1 ... 0xf3] = { DstImplicit|SrcMem|ModRM, simd_128 },
+    [0xf1 ... 0xf3] = { DstImplicit|SrcMem|ModRM, simd_128, 4 },
     [0xf4 ... 0xf6] = { DstImplicit|SrcMem|ModRM, simd_packed_int, d8s_vl },
     [0xf7] = { DstMem|SrcMem|ModRM|Mov, simd_packed_int },
     [0xf8 ... 0xfe] = { DstImplicit|SrcMem|ModRM, simd_packed_int, d8s_vl },
@@ -432,9 +432,9 @@ static const struct ext0f38_table {
 } ext0f38_table[256] = {
     [0x00 ... 0x0b] = { .simd_size = simd_packed_int },
     [0x0c ... 0x0f] = { .simd_size = simd_packed_fp },
-    [0x10] = { .simd_size = simd_packed_int },
+    [0x10 ... 0x12] = { .simd_size = simd_packed_int, .d8s = d8s_vl },
     [0x13] = { .simd_size = simd_other, .two_op = 1 },
-    [0x14 ... 0x16] = { .simd_size = simd_packed_fp },
+    [0x14 ... 0x16] = { .simd_size = simd_packed_fp, .d8s = d8s_vl },
     [0x17] = { .simd_size = simd_packed_int, .two_op = 1 },
     [0x18] = { .simd_size = simd_scalar_opc, .two_op = 1, .d8s = 2 },
     [0x19] = { .simd_size = simd_scalar_opc, .two_op = 1, .d8s = 3 },
@@ -451,7 +451,7 @@ static const struct ext0f38_table {
     [0x36 ... 0x3f] = { .simd_size = simd_packed_int, .d8s = d8s_vl },
     [0x40] = { .simd_size = simd_packed_int, .d8s = d8s_vl },
     [0x41] = { .simd_size = simd_packed_int, .two_op = 1 },
-    [0x45 ... 0x47] = { .simd_size = simd_packed_int },
+    [0x45 ... 0x47] = { .simd_size = simd_packed_int, .d8s = d8s_vl },
     [0x58 ... 0x59] = { .simd_size = simd_other, .two_op = 1 },
     [0x5a] = { .simd_size = simd_128, .two_op = 1 },
     [0x78 ... 0x79] = { .simd_size = simd_other, .two_op = 1 },
@@ -5961,10 +5961,15 @@ x86_emulate(
     case X86EMUL_OPC_EVEX_66(0x0f, 0xdf): /* vpandn{d,q} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
     case X86EMUL_OPC_EVEX_66(0x0f, 0xeb): /* vpor{d,q} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
     case X86EMUL_OPC_EVEX_66(0x0f, 0xef): /* vpxor{d,q} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0x14): /* vprorv{d,q} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0x15): /* vprolv{d,q} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
     case X86EMUL_OPC_EVEX_66(0x0f38, 0x39): /* vpmins{d,q} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
     case X86EMUL_OPC_EVEX_66(0x0f38, 0x3b): /* vpminu{d,q} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
     case X86EMUL_OPC_EVEX_66(0x0f38, 0x3d): /* vpmaxs{d,q} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
     case X86EMUL_OPC_EVEX_66(0x0f38, 0x3f): /* vpmaxu{d,q} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0x45): /* vpsrlv{d,q} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0x46): /* vpsrav{d,q} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0x47): /* vpsllv{d,q} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
     avx512f_no_sae:
         host_and_vcpu_must_have(avx512f);
         generate_exception_if(ea.type != OP_MEM && evex.br, EXC_UD);
@@ -6581,6 +6586,9 @@ x86_emulate(
         get_fpu(X86EMUL_FPU_mmx);
         goto simd_0f_common;
 
+    case X86EMUL_OPC_EVEX_66(0x0f, 0xd1): /* vpsrlw xmm/m128,[xyz]mm,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f, 0xe1): /* vpsraw xmm/m128,[xyz]mm,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f, 0xf1): /* vpsllw xmm/m128,[xyz]mm,[xyz]mm{k} */
     case X86EMUL_OPC_EVEX_66(0x0f, 0xf5): /* vpmaddwd [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
     case X86EMUL_OPC_EVEX_66(0x0f, 0xf6): /* vpsadbw [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
         fault_suppression = false;
@@ -6606,6 +6614,16 @@ x86_emulate(
         elem_bytes = 1 << (b & 1);
         goto avx512f_no_sae;
 
+    case X86EMUL_OPC_EVEX_66(0x0f, 0xd2): /* vpsrld xmm/m128,[xyz]mm,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f, 0xd3): /* vpsrlq xmm/m128,[xyz]mm,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f, 0xe2): /* vpsra{d,q} xmm/m128,[xyz]mm,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f, 0xf2): /* vpslld xmm/m128,[xyz]mm,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f, 0xf3): /* vpsllq xmm/m128,[xyz]mm,[xyz]mm{k} */
+        generate_exception_if(evex.br, EXC_UD);
+        fault_suppression = false;
+        if ( b == 0xe2 )
+            goto avx512f_no_sae;
+        /* fall through */
     case X86EMUL_OPC_EVEX_66(0x0f, 0xfa): /* vpsubd [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
     case X86EMUL_OPC_EVEX_66(0x0f, 0xfb): /* vpsubq [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
     case X86EMUL_OPC_EVEX_66(0x0f, 0xfe): /* vpaddd [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
@@ -6886,6 +6904,37 @@ x86_emulate(
         ASSERT(!state->simd_size);
         break;
 
+    case X86EMUL_OPC_EVEX_66(0x0f, 0x71): /* Grp12 */
+        switch ( modrm_reg & 7 )
+        {
+        case 2: /* vpsrlw $imm8,[xyz]mm/mem,[xyz]mm{k} */
+        case 4: /* vpsraw $imm8,[xyz]mm/mem,[xyz]mm{k} */
+        case 6: /* vpsllw $imm8,[xyz]mm/mem,[xyz]mm{k} */
+        avx512bw_shift_imm:
+            fault_suppression = false;
+            op_bytes = 16 << evex.lr;
+            state->simd_size = simd_packed_int;
+            goto avx512bw_imm;
+        }
+        goto unrecognized_insn;
+
+    case X86EMUL_OPC_EVEX_66(0x0f, 0x72): /* Grp13 */
+        switch ( modrm_reg & 7 )
+        {
+        case 2: /* vpsrld $imm8,[xyz]mm/mem,[xyz]mm{k} */
+        case 6: /* vpslld $imm8,[xyz]mm/mem,[xyz]mm{k} */
+            generate_exception_if(evex.w, EXC_UD);
+            /* fall through */
+        case 0: /* vpror{d,q} $imm8,[xyz]mm/mem,[xyz]mm{k} */
+        case 1: /* vprol{d,q} $imm8,[xyz]mm/mem,[xyz]mm{k} */
+        case 4: /* vpsra{d,q} $imm8,[xyz]mm/mem,[xyz]mm{k} */
+        avx512f_shift_imm:
+            op_bytes = 16 << evex.lr;
+            state->simd_size = simd_packed_int;
+            goto avx512f_imm_no_sae;
+        }
+        goto unrecognized_insn;
+
     case X86EMUL_OPC(0x0f, 0x73):        /* Grp14 */
         switch ( modrm_reg & 7 )
         {
@@ -6911,6 +6960,19 @@ x86_emulate(
         }
         goto unrecognized_insn;
 
+    case X86EMUL_OPC_EVEX_66(0x0f, 0x73): /* Grp14 */
+        switch ( modrm_reg & 7 )
+        {
+        case 2: /* vpsrlq $imm8,[xyz]mm/mem,[xyz]mm{k} */
+        case 6: /* vpsllq $imm8,[xyz]mm/mem,[xyz]mm{k} */
+            generate_exception_if(!evex.w, EXC_UD);
+            goto avx512f_shift_imm;
+        case 3: /* vpsrldq $imm8,[xyz]mm/mem,[xyz]mm */
+        case 7: /* vpslldq $imm8,[xyz]mm/mem,[xyz]mm */
+            goto avx512bw_shift_imm;
+        }
+        goto unrecognized_insn;
+
     case X86EMUL_OPC(0x0f, 0x77):        /* emms */
     case X86EMUL_OPC_VEX(0x0f, 0x77):    /* vzero{all,upper} */
         if ( vex.opcx != vex_none )
@@ -8082,6 +8144,14 @@ x86_emulate(
         dst.type = OP_NONE;
         break;
 
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0x10): /* vpsrlvw [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0x11): /* vpsravw [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0x12): /* vpsllvw [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+        host_and_vcpu_must_have(avx512bw);
+        generate_exception_if(evex.br, EXC_UD);
+        elem_bytes = 1 << evex.w;
+        goto avx512f_no_sae;
+
     case X86EMUL_OPC_EVEX_66(0x0f38, 0x18): /* vbroadcastss xmm/m32,[xyz]mm{k} */
         generate_exception_if(evex.w || evex.br, EXC_UD);
     avx512_broadcast:
@@ -8838,6 +8908,7 @@ x86_emulate(
         generate_exception_if(!evex.r || !evex.R || evex.z, EXC_UD);
         if ( !(b & 0x20) )
             goto avx512f_imm_no_sae;
+    avx512bw_imm:
         host_and_vcpu_must_have(avx512bw);
         generate_exception_if(evex.br, EXC_UD);
         elem_bytes = 1 << evex.w;




_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

^ permalink raw reply	[flat|nested] 465+ messages in thread

* [PATCH v3 21/34] x86emul: support AVX512{F, BW, DQ} extract insns
  2018-09-18 11:46 ` [PATCH v3 00/34] x86emul: fixes, improvements, and beginnings of AVX512 support Jan Beulich
                     ` (19 preceding siblings ...)
  2018-09-18 12:05   ` [PATCH v3 20/34] x86emul: support AVX512{F, BW} shift/rotate insns Jan Beulich
@ 2018-09-18 12:06   ` Jan Beulich
  2018-09-18 12:07   ` [PATCH v3 22/34] x86emul: support AVX512{F, BW, DQ} insert insns Jan Beulich
                     ` (12 subsequent siblings)
  33 siblings, 0 replies; 465+ messages in thread
From: Jan Beulich @ 2018-09-18 12:06 UTC (permalink / raw)
  To: xen-devel; +Cc: George Dunlap, Andrew Cooper, Wei Liu

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v3: New.

--- a/tools/tests/x86_emulator/evex-disp8.c
+++ b/tools/tests/x86_emulator/evex-disp8.c
@@ -198,6 +198,7 @@ static const struct test avx512f_all[] =
 };
 
 static const struct test avx512f_128[] = {
+    INSN(extractps, 66, 0f3a, 17, el,    d, el),
     INSN(mov,       66,   0f, 6e, el, dq64, el),
     INSN(mov,       66,   0f, 7e, el, dq64, el),
     INSN(movq,      f3,   0f, 7e, el,    q, el),
@@ -207,10 +208,14 @@ static const struct test avx512f_128[] =
 static const struct test avx512f_no128[] = {
     INSN(broadcastf32x4, 66, 0f38, 1a, el_4,  d, vl),
     INSN(broadcastsd,    66, 0f38, 19, el,    q, el),
+    INSN(extractf32x4,   66, 0f3a, 19, el_4,  d, vl),
+    INSN(extracti32x4,   66, 0f3a, 39, el_4,  d, vl),
 };
 
 static const struct test avx512f_512[] = {
     INSN(broadcastf64x4, 66, 0f38, 1b, el_4, q, vl),
+    INSN(extractf64x4,   66, 0f3a, 1b, el_4, q, vl),
+    INSN(extracti64x4,   66, 0f3a, 3b, el_4, q, vl),
 };
 
 static const struct test avx512bw_all[] = {
@@ -266,6 +271,12 @@ static const struct test avx512bw_all[]
     INSN(ptestnm,     f3, 0f38, 26,    vl,  bw, vl),
 };
 
+static const struct test avx512bw_128[] = {
+    INSN(pextrb, 66, 0f3a, 14, el, b, el),
+//       pextrw, 66,   0f, c5,     w
+    INSN(pextrw, 66, 0f3a, 15, el, w, el),
+};
+
 static const struct test avx512dq_all[] = {
     INSN_PFP(and,              0f, 54),
     INSN_PFP(andn,             0f, 55),
@@ -274,13 +285,21 @@ static const struct test avx512dq_all[]
     INSN_PFP(xor,              0f, 57),
 };
 
+static const struct test avx512dq_128[] = {
+    INSN(pextr, 66, 0f3a, 16, el, dq64, el),
+};
+
 static const struct test avx512dq_no128[] = {
     INSN(broadcastf32x2, 66, 0f38, 19, el_2, d, vl),
     INSN(broadcastf64x2, 66, 0f38, 1a, el_2, q, vl),
+    INSN(extractf64x2,   66, 0f3a, 19, el_2, q, vl),
+    INSN(extracti64x2,   66, 0f3a, 39, el_2, q, vl),
 };
 
 static const struct test avx512dq_512[] = {
     INSN(broadcastf32x8, 66, 0f38, 1b, el_8, d, vl),
+    INSN(extractf32x8,   66, 0f3a, 1b, el_8, d, vl),
+    INSN(extracti32x8,   66, 0f3a, 3b, el_8, d, vl),
 };
 
 static const unsigned char vl_all[] = { VL_512, VL_128, VL_256 };
@@ -598,7 +617,9 @@ void evex_disp8_test(void *instr, struct
     RUN(avx512f, no128);
     RUN(avx512f, 512);
     RUN(avx512bw, all);
+    RUN(avx512bw, 128);
     RUN(avx512dq, all);
+    RUN(avx512dq, 128);
     RUN(avx512dq, no128);
     RUN(avx512dq, 512);
 }
--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -510,9 +510,13 @@ static const struct ext0f3a_table {
     [0x0a ... 0x0b] = { .simd_size = simd_scalar_opc },
     [0x0c ... 0x0d] = { .simd_size = simd_packed_fp },
     [0x0e ... 0x0f] = { .simd_size = simd_packed_int },
-    [0x14 ... 0x17] = { .simd_size = simd_none, .to_mem = 1, .two_op = 1 },
+    [0x14] = { .simd_size = simd_none, .to_mem = 1, .two_op = 1, .d8s = 0 },
+    [0x15] = { .simd_size = simd_none, .to_mem = 1, .two_op = 1, .d8s = 1 },
+    [0x16] = { .simd_size = simd_none, .to_mem = 1, .two_op = 1, .d8s = d8s_dq },
+    [0x17] = { .simd_size = simd_none, .to_mem = 1, .two_op = 1, .d8s = 2 },
     [0x18] = { .simd_size = simd_128 },
-    [0x19] = { .simd_size = simd_128, .to_mem = 1, .two_op = 1 },
+    [0x19] = { .simd_size = simd_128, .to_mem = 1, .two_op = 1, .d8s = 4 },
+    [0x1b] = { .simd_size = simd_256, .to_mem = 1, .two_op = 1, .d8s = d8s_vl_by_2 },
     [0x1d] = { .simd_size = simd_other, .to_mem = 1, .two_op = 1 },
     [0x1e ... 0x1f] = { .simd_size = simd_packed_int, .d8s = d8s_vl },
     [0x20] = { .simd_size = simd_none },
@@ -521,7 +525,8 @@ static const struct ext0f3a_table {
     [0x25] = { .simd_size = simd_packed_int, .d8s = d8s_vl },
     [0x30 ... 0x33] = { .simd_size = simd_other, .two_op = 1 },
     [0x38] = { .simd_size = simd_128 },
-    [0x39] = { .simd_size = simd_128, .to_mem = 1, .two_op = 1 },
+    [0x39] = { .simd_size = simd_128, .to_mem = 1, .two_op = 1, .d8s = 4 },
+    [0x3b] = { .simd_size = simd_256, .to_mem = 1, .two_op = 1, .d8s = d8s_vl_by_2 },
     [0x3e ... 0x3f] = { .simd_size = simd_packed_int, .d8s = d8s_vl },
     [0x40 ... 0x41] = { .simd_size = simd_packed_fp },
     [0x42] = { .simd_size = simd_packed_int },
@@ -2655,6 +2660,8 @@ x86_decode_0f3a(
      ... X86EMUL_OPC_66(0, 0x17):     /* pextr*, extractps */
     case X86EMUL_OPC_VEX_66(0, 0x14)
      ... X86EMUL_OPC_VEX_66(0, 0x17): /* vpextr*, vextractps */
+    case X86EMUL_OPC_EVEX_66(0, 0x14)
+     ... X86EMUL_OPC_EVEX_66(0, 0x17): /* vpextr*, vextractps */
     case X86EMUL_OPC_VEX_F2(0, 0xf0): /* rorx */
         break;
 
@@ -8827,9 +8834,9 @@ x86_emulate(
         opc[0] = b;
         /* Convert memory/GPR operand to (%rAX). */
         rex_prefix &= ~REX_B;
-        vex.b = 1;
+        evex.b = vex.b = 1;
         if ( !mode_64bit() )
-            vex.w = 0;
+            evex.w = vex.w = 0;
         opc[1] = modrm & 0x38;
         opc[2] = imm1;
         opc[3] = 0xc3;
@@ -8839,7 +8846,10 @@ x86_emulate(
             --opc;
         }
 
-        copy_REX_VEX(opc, rex_prefix, vex);
+        if ( evex_encoded() )
+            copy_EVEX(opc, evex);
+        else
+            copy_REX_VEX(opc, rex_prefix, vex);
         invoke_stub("", "", "=m" (dst.val) : "a" (&dst.val));
         put_stub(stub);
 
@@ -8859,6 +8869,52 @@ x86_emulate(
         opc = init_prefixes(stub);
         goto pextr;
 
+    case X86EMUL_OPC_EVEX_66(0x0f, 0xc5):   /* vpextrw $imm8,xmm,reg */
+        generate_exception_if(ea.type != OP_REG, EXC_UD);
+        /* Convert to alternative encoding: We want to use a memory operand. */
+        evex.opcx = ext_0f3a;
+        b = 0x15;
+        modrm <<= 3;
+        evex.r = evex.b;
+        evex.R = evex.x;
+        /* fall through */
+    case X86EMUL_OPC_EVEX_66(0x0f3a, 0x14): /* vpextrb $imm8,xmm,r/m */
+    case X86EMUL_OPC_EVEX_66(0x0f3a, 0x15): /* vpextrw $imm8,xmm,r/m */
+    case X86EMUL_OPC_EVEX_66(0x0f3a, 0x16): /* vpextr{d,q} $imm8,xmm,r/m */
+    case X86EMUL_OPC_EVEX_66(0x0f3a, 0x17): /* vextractps $imm8,xmm,r/m */
+        generate_exception_if((evex.lr || evex.reg != 0xf || !evex.RX ||
+                               evex.opmsk || evex.br),
+                              EXC_UD);
+        if ( !(b & 2) )
+            host_and_vcpu_must_have(avx512bw);
+        else if ( !(b & 1) )
+            host_and_vcpu_must_have(avx512dq);
+        else
+            host_and_vcpu_must_have(avx512f);
+        get_fpu(X86EMUL_FPU_zmm);
+        opc = init_evex(stub);
+        goto pextr;
+
+    case X86EMUL_OPC_EVEX_66(0x0f3a, 0x19): /* vextractf32x4 $imm8,{y,z}mm,xmm/m128{k} */
+                                            /* vextractf64x2 $imm8,{y,z}mm,xmm/m128{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f3a, 0x39): /* vextracti32x4 $imm8,{y,z}mm,xmm/m128{k} */
+                                            /* vextracti64x2 $imm8,{y,z}mm,xmm/m128{k} */
+        if ( evex.w )
+            host_and_vcpu_must_have(avx512dq);
+        generate_exception_if(!evex.lr || evex.br, EXC_UD);
+        fault_suppression = false;
+        goto avx512f_imm_no_sae;
+
+    case X86EMUL_OPC_EVEX_66(0x0f3a, 0x1b): /* vextractf32x8 $imm8,zmm,ymm/m256{k} */
+                                            /* vextractf64x4 $imm8,zmm,ymm/m256{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f3a, 0x3b): /* vextracti32x8 $imm8,zmm,ymm/m256{k} */
+                                            /* vextracti64x4 $imm8,zmm,ymm/m256{k} */
+        if ( !evex.w )
+            host_and_vcpu_must_have(avx512dq);
+        generate_exception_if(evex.lr != 2 || evex.br, EXC_UD);
+        fault_suppression = false;
+        goto avx512f_imm_no_sae;
+
     case X86EMUL_OPC_VEX_66(0x0f3a, 0x1d): /* vcvtps2ph $imm8,{x,y}mm,xmm/mem */
     {
         uint32_t mxcsr;




_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

^ permalink raw reply	[flat|nested] 465+ messages in thread

* [PATCH v3 22/34] x86emul: support AVX512{F, BW, DQ} insert insns
  2018-09-18 11:46 ` [PATCH v3 00/34] x86emul: fixes, improvements, and beginnings of AVX512 support Jan Beulich
                     ` (20 preceding siblings ...)
  2018-09-18 12:06   ` [PATCH v3 21/34] x86emul: support AVX512{F, BW, DQ} extract insns Jan Beulich
@ 2018-09-18 12:07   ` Jan Beulich
  2018-09-18 12:07   ` [PATCH v3 23/34] x86emul: basic AVX512F testing Jan Beulich
                     ` (11 subsequent siblings)
  33 siblings, 0 replies; 465+ messages in thread
From: Jan Beulich @ 2018-09-18 12:07 UTC (permalink / raw)
  To: xen-devel; +Cc: George Dunlap, Andrew Cooper, Wei Liu

Also correct the comment of the AVX form of VINSERTPS.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v3: New.

--- a/tools/tests/x86_emulator/evex-disp8.c
+++ b/tools/tests/x86_emulator/evex-disp8.c
@@ -199,6 +199,7 @@ static const struct test avx512f_all[] =
 
 static const struct test avx512f_128[] = {
     INSN(extractps, 66, 0f3a, 17, el,    d, el),
+    INSN(insertps,  66, 0f3a, 21, el,    d, el),
     INSN(mov,       66,   0f, 6e, el, dq64, el),
     INSN(mov,       66,   0f, 7e, el, dq64, el),
     INSN(movq,      f3,   0f, 7e, el,    q, el),
@@ -210,12 +211,16 @@ static const struct test avx512f_no128[]
     INSN(broadcastsd,    66, 0f38, 19, el,    q, el),
     INSN(extractf32x4,   66, 0f3a, 19, el_4,  d, vl),
     INSN(extracti32x4,   66, 0f3a, 39, el_4,  d, vl),
+    INSN(insertf32x4,    66, 0f3a, 18, el_4,  d, vl),
+    INSN(inserti32x4,    66, 0f3a, 38, el_4,  d, vl),
 };
 
 static const struct test avx512f_512[] = {
     INSN(broadcastf64x4, 66, 0f38, 1b, el_4, q, vl),
     INSN(extractf64x4,   66, 0f3a, 1b, el_4, q, vl),
     INSN(extracti64x4,   66, 0f3a, 3b, el_4, q, vl),
+    INSN(insertf64x4,    66, 0f3a, 1a, el_4, q, vl),
+    INSN(inserti64x4,    66, 0f3a, 3a, el_4, q, vl),
 };
 
 static const struct test avx512bw_all[] = {
@@ -275,6 +280,8 @@ static const struct test avx512bw_128[]
     INSN(pextrb, 66, 0f3a, 14, el, b, el),
 //       pextrw, 66,   0f, c5,     w
     INSN(pextrw, 66, 0f3a, 15, el, w, el),
+    INSN(pinsrb, 66, 0f3a, 20, el, b, el),
+    INSN(pinsrw, 66,   0f, c4, el, w, el),
 };
 
 static const struct test avx512dq_all[] = {
@@ -287,6 +294,7 @@ static const struct test avx512dq_all[]
 
 static const struct test avx512dq_128[] = {
     INSN(pextr, 66, 0f3a, 16, el, dq64, el),
+    INSN(pinsr, 66, 0f3a, 22, el, dq64, el),
 };
 
 static const struct test avx512dq_no128[] = {
@@ -294,12 +302,16 @@ static const struct test avx512dq_no128[
     INSN(broadcastf64x2, 66, 0f38, 1a, el_2, q, vl),
     INSN(extractf64x2,   66, 0f3a, 19, el_2, q, vl),
     INSN(extracti64x2,   66, 0f3a, 39, el_2, q, vl),
+    INSN(insertf64x2,    66, 0f3a, 18, el_2, q, vl),
+    INSN(inserti64x2,    66, 0f3a, 38, el_2, q, vl),
 };
 
 static const struct test avx512dq_512[] = {
     INSN(broadcastf32x8, 66, 0f38, 1b, el_8, d, vl),
     INSN(extractf32x8,   66, 0f3a, 1b, el_8, d, vl),
     INSN(extracti32x8,   66, 0f3a, 3b, el_8, d, vl),
+    INSN(insertf32x8,    66, 0f3a, 1a, el_8, d, vl),
+    INSN(inserti32x8,    66, 0f3a, 3a, el_8, d, vl),
 };
 
 static const unsigned char vl_all[] = { VL_512, VL_128, VL_256 };
--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -358,7 +358,7 @@ static const struct twobyte_table {
     [0xc1] = { DstMem|SrcReg|ModRM },
     [0xc2] = { DstImplicit|SrcImmByte|ModRM, simd_any_fp, d8s_vl },
     [0xc3] = { DstMem|SrcReg|ModRM|Mov },
-    [0xc4] = { DstReg|SrcImmByte|ModRM, simd_packed_int },
+    [0xc4] = { DstReg|SrcImmByte|ModRM, simd_packed_int, 1 },
     [0xc5] = { DstReg|SrcImmByte|ModRM|Mov },
     [0xc6] = { DstImplicit|SrcImmByte|ModRM, simd_packed_fp, d8s_vl },
     [0xc7] = { ImplicitOps|ModRM },
@@ -514,17 +514,19 @@ static const struct ext0f3a_table {
     [0x15] = { .simd_size = simd_none, .to_mem = 1, .two_op = 1, .d8s = 1 },
     [0x16] = { .simd_size = simd_none, .to_mem = 1, .two_op = 1, .d8s = d8s_dq },
     [0x17] = { .simd_size = simd_none, .to_mem = 1, .two_op = 1, .d8s = 2 },
-    [0x18] = { .simd_size = simd_128 },
+    [0x18] = { .simd_size = simd_128, .d8s = 4 },
     [0x19] = { .simd_size = simd_128, .to_mem = 1, .two_op = 1, .d8s = 4 },
+    [0x1a] = { .simd_size = simd_256, .d8s = d8s_vl_by_2 },
     [0x1b] = { .simd_size = simd_256, .to_mem = 1, .two_op = 1, .d8s = d8s_vl_by_2 },
     [0x1d] = { .simd_size = simd_other, .to_mem = 1, .two_op = 1 },
     [0x1e ... 0x1f] = { .simd_size = simd_packed_int, .d8s = d8s_vl },
-    [0x20] = { .simd_size = simd_none },
-    [0x21] = { .simd_size = simd_other },
-    [0x22] = { .simd_size = simd_none },
+    [0x20] = { .simd_size = simd_none, .d8s = 0 },
+    [0x21] = { .simd_size = simd_other, .d8s = 2 },
+    [0x22] = { .simd_size = simd_none, .d8s = d8s_dq },
     [0x25] = { .simd_size = simd_packed_int, .d8s = d8s_vl },
     [0x30 ... 0x33] = { .simd_size = simd_other, .two_op = 1 },
-    [0x38] = { .simd_size = simd_128 },
+    [0x38] = { .simd_size = simd_128, .d8s = 4 },
+    [0x3a] = { .simd_size = simd_256, .d8s = d8s_vl_by_2 },
     [0x39] = { .simd_size = simd_128, .to_mem = 1, .two_op = 1, .d8s = 4 },
     [0x3b] = { .simd_size = simd_256, .to_mem = 1, .two_op = 1, .d8s = d8s_vl_by_2 },
     [0x3e ... 0x3f] = { .simd_size = simd_packed_int, .d8s = d8s_vl },
@@ -2565,6 +2567,7 @@ x86_decode_twobyte(
         ctxt->opcode |= MASK_INSR(vex.pfx, X86EMUL_OPC_PFX_MASK);
         /* fall through */
     case X86EMUL_OPC_VEX_66(0, 0xc4): /* vpinsrw */
+    case X86EMUL_OPC_EVEX_66(0, 0xc4): /* vpinsrw */
         state->desc = DstReg | SrcMem16;
         break;
 
@@ -2667,6 +2670,7 @@ x86_decode_0f3a(
 
     case X86EMUL_OPC_66(0, 0x20):     /* pinsrb */
     case X86EMUL_OPC_VEX_66(0, 0x20): /* vpinsrb */
+    case X86EMUL_OPC_EVEX_66(0, 0x20): /* vpinsrb */
         state->desc = DstImplicit | SrcMem;
         if ( modrm_mod != 3 )
             state->desc |= ByteOp;
@@ -2674,6 +2678,7 @@ x86_decode_0f3a(
 
     case X86EMUL_OPC_66(0, 0x22):     /* pinsr{d,q} */
     case X86EMUL_OPC_VEX_66(0, 0x22): /* vpinsr{d,q} */
+    case X86EMUL_OPC_EVEX_66(0, 0x22): /* vpinsr{d,q} */
         state->desc = DstImplicit | SrcMem;
         break;
 
@@ -7700,6 +7705,23 @@ x86_emulate(
         ea.type = OP_MEM;
         goto simd_0f_int_imm8;
 
+    case X86EMUL_OPC_EVEX_66(0x0f, 0xc4):   /* vpinsrw $imm8,r32/m16,xmm,xmm */
+    case X86EMUL_OPC_EVEX_66(0x0f3a, 0x20): /* vpinsrb $imm8,r32/m8,xmm,xmm */
+    case X86EMUL_OPC_EVEX_66(0x0f3a, 0x22): /* vpinsr{d,q} $imm8,r/m,xmm,xmm */
+        generate_exception_if(evex.lr || evex.opmsk || evex.br, EXC_UD);
+        if ( b & 2 )
+            host_and_vcpu_must_have(avx512dq);
+        else
+            host_and_vcpu_must_have(avx512bw);
+        if ( !mode_64bit() )
+            evex.w = 0;
+        memcpy(mmvalp, &src.val, op_bytes);
+        ea.type = OP_MEM;
+        op_bytes = src.bytes;
+        d = SrcMem16; /* Fake for the common SIMD code below. */
+        state->simd_size = simd_other;
+        goto avx512f_imm_no_sae;
+
     CASE_SIMD_PACKED_INT(0x0f, 0xc5):      /* pextrw $imm8,{,x}mm,reg */
     case X86EMUL_OPC_VEX_66(0x0f, 0xc5):   /* vpextrw $imm8,xmm,reg */
         generate_exception_if(vex.l, EXC_UD);
@@ -8895,8 +8917,12 @@ x86_emulate(
         opc = init_evex(stub);
         goto pextr;
 
+    case X86EMUL_OPC_EVEX_66(0x0f3a, 0x18): /* vinsertf32x4 $imm8,xmm/m128,{y,z}mm{k} */
+                                            /* vinsertf64x2 $imm8,xmm/m128,{y,z}mm{k} */
     case X86EMUL_OPC_EVEX_66(0x0f3a, 0x19): /* vextractf32x4 $imm8,{y,z}mm,xmm/m128{k} */
                                             /* vextractf64x2 $imm8,{y,z}mm,xmm/m128{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f3a, 0x38): /* vinserti32x4 $imm8,xmm/m128,{y,z}mm{k} */
+                                            /* vinserti64x2 $imm8,xmm/m128,{y,z}mm{k} */
     case X86EMUL_OPC_EVEX_66(0x0f3a, 0x39): /* vextracti32x4 $imm8,{y,z}mm,xmm/m128{k} */
                                             /* vextracti64x2 $imm8,{y,z}mm,xmm/m128{k} */
         if ( evex.w )
@@ -8905,8 +8931,12 @@ x86_emulate(
         fault_suppression = false;
         goto avx512f_imm_no_sae;
 
+    case X86EMUL_OPC_EVEX_66(0x0f3a, 0x1a): /* vinsertf32x8 $imm8,ymm/m256,zmm{k} */
+                                            /* vinsertf64x4 $imm8,ymm/m256,zmm{k} */
     case X86EMUL_OPC_EVEX_66(0x0f3a, 0x1b): /* vextractf32x8 $imm8,zmm,ymm/m256{k} */
                                             /* vextractf64x4 $imm8,zmm,ymm/m256{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f3a, 0x3a): /* vinserti32x8 $imm8,ymm/m256,zmm{k} */
+                                            /* vinserti64x4 $imm8,ymm/m256,zmm{k} */
     case X86EMUL_OPC_EVEX_66(0x0f3a, 0x3b): /* vextracti32x8 $imm8,zmm,ymm/m256{k} */
                                             /* vextracti64x4 $imm8,zmm,ymm/m256{k} */
         if ( !evex.w )
@@ -8999,13 +9029,19 @@ x86_emulate(
         op_bytes = 4;
         goto simd_0f3a_common;
 
-    case X86EMUL_OPC_VEX_66(0x0f3a, 0x21): /* vinsertps $imm8,xmm/m128,xmm,xmm */
+    case X86EMUL_OPC_VEX_66(0x0f3a, 0x21): /* vinsertps $imm8,xmm/m32,xmm,xmm */
         op_bytes = 4;
         /* fall through */
     case X86EMUL_OPC_VEX_66(0x0f3a, 0x41): /* vdppd $imm8,{x,y}mm/mem,{x,y}mm,{x,y}mm */
         generate_exception_if(vex.l, EXC_UD);
         goto simd_0f_imm8_avx;
 
+    case X86EMUL_OPC_EVEX_66(0x0f3a, 0x21): /* vinsertps $imm8,xmm/m32,xmm,xmm */
+        op_bytes = 4;
+        generate_exception_if(evex.lr || evex.w || evex.opmsk || evex.br,
+                              EXC_UD);
+        goto avx512f_imm_no_sae;
+
     case X86EMUL_OPC_VEX_66(0x0f3a, 0x30): /* kshiftr{b,w} $imm8,k,k */
     case X86EMUL_OPC_VEX_66(0x0f3a, 0x32): /* kshiftl{b,w} $imm8,k,k */
         if ( !vex.w )




_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

^ permalink raw reply	[flat|nested] 465+ messages in thread

* [PATCH v3 23/34] x86emul: basic AVX512F testing
  2018-09-18 11:46 ` [PATCH v3 00/34] x86emul: fixes, improvements, and beginnings of AVX512 support Jan Beulich
                     ` (21 preceding siblings ...)
  2018-09-18 12:07   ` [PATCH v3 22/34] x86emul: support AVX512{F, BW, DQ} insert insns Jan Beulich
@ 2018-09-18 12:07   ` Jan Beulich
  2018-09-18 12:08   ` [PATCH v3 24/34] x86emul: support AVX512{F, BW, DQ} integer broadcast insns Jan Beulich
                     ` (10 subsequent siblings)
  33 siblings, 0 replies; 465+ messages in thread
From: Jan Beulich @ 2018-09-18 12:07 UTC (permalink / raw)
  To: xen-devel; +Cc: George Dunlap, Andrew Cooper, Wei Liu

Test various of the insns which have been implemented already.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v3: New.

--- a/tools/tests/x86_emulator/Makefile
+++ b/tools/tests/x86_emulator/Makefile
@@ -11,7 +11,7 @@ all: $(TARGET)
 run: $(TARGET)
 	./$(TARGET)
 
-SIMD := 3dnow sse sse2 sse4 avx avx2 xop
+SIMD := 3dnow sse sse2 sse4 avx avx2 xop avx512f
 FMA := fma4 fma
 SG := avx2-sg
 TESTCASES := blowfish $(SIMD) $(FMA) $(SG)
@@ -52,6 +52,9 @@ avx2-sg-flts := 4 8
 xop-vecs := $(avx-vecs)
 xop-ints := 1 2 4 8
 xop-flts := $(avx-flts)
+avx512f-vecs := 64
+avx512f-ints := 4 8
+avx512f-flts := 4 8
 
 avx512f-opmask-vecs := 2
 avx512dq-opmask-vecs := 1
@@ -137,7 +140,7 @@ $(addsuffix .c,$(SG)):
 
 $(addsuffix .h,$(SIMD) $(FMA) $(SG)): simd.h
 
-xop.h: simd-fma.c
+xop.h avx512f.h: simd-fma.c
 
 $(TARGET): x86-emulate.o test_x86_emulator.o evex-disp8.o wrappers.o
 	$(HOSTCC) $(HOSTCFLAGS) -o $@ $^
--- a/tools/tests/x86_emulator/simd.c
+++ b/tools/tests/x86_emulator/simd.c
@@ -2,7 +2,41 @@
 
 ENTRY(simd_test);
 
-#if VEC_SIZE == 8 && defined(__SSE__)
+#if defined(__AVX512F__)
+# define ALL_TRUE (~0ULL >> (64 - ELEM_COUNT))
+# if VEC_SIZE == 4
+#  define eq(x, y) ({ \
+    float x_ = (x)[0]; \
+    float __attribute__((vector_size(16))) y_ = { (y)[0] }; \
+    unsigned short r_; \
+    asm ( "vcmpss $0, %1, %2, %0"  : "=k" (r_) : "m" (x_), "v" (y_) ); \
+    r_ == 1; \
+})
+# elif VEC_SIZE == 8
+#  define eq(x, y) ({ \
+    double x_ = (x)[0]; \
+    double __attribute__((vector_size(16))) y_ = { (y)[0] }; \
+    unsigned short r_; \
+    asm ( "vcmpsd $0, %1, %2, %0"  : "=k" (r_) : "m" (x_), "v" (y_) ); \
+    r_ == 1; \
+})
+# elif FLOAT_SIZE == 4
+/*
+ * gcc's (up to at least 8.2) __builtin_ia32_cmpps256_mask() has an anomaly in
+ * that its return type is QI rather than UQI, and hence the value would get
+ * sign-extended before comparing to ALL_TRUE. The same oddity does not matter
+ * for __builtin_ia32_cmppd256_mask(), as there only 4 bits are significant.
+ * Hence the extra " & ALL_TRUE".
+ */
+#  define eq(x, y) ((BR(cmpps, _mask, x, y, 0, -1) & ALL_TRUE) == ALL_TRUE)
+# elif FLOAT_SIZE == 8
+#  define eq(x, y) (BR(cmppd, _mask, x, y, 0, -1) == ALL_TRUE)
+# elif INT_SIZE == 4
+#  define eq(x, y) (B(pcmpeqd, _mask, x, y, -1) == ALL_TRUE)
+# elif INT_SIZE == 8
+#  define eq(x, y) (B(pcmpeqq, _mask, (vdi_t)(x), (vdi_t)(y), -1) == ALL_TRUE)
+# endif
+#elif VEC_SIZE == 8 && defined(__SSE__)
 # define to_bool(cmp) (__builtin_ia32_pmovmskb(cmp) == 0xff)
 #elif VEC_SIZE == 16
 # if defined(__AVX__) && defined(FLOAT_SIZE)
@@ -93,6 +127,50 @@ static inline bool _to_bool(byte_vec_t b
     touch(x); \
     __builtin_ia32_pfrcpit2(__builtin_ia32_pfrsqit1(__builtin_ia32_pfmul(t_, t_), x), t_); \
 })
+#elif defined(FLOAT_SIZE) && defined(__AVX512F__) && \
+      (VEC_SIZE == 64 || defined(__AVX512VL__))
+# if VEC_SIZE > FLOAT_SIZE
+#  if FLOAT_SIZE == 4
+#   define broadcast(x) ({ \
+    vec_t t_; \
+    asm ( "%{evex%} vbroadcastss %1, %0" \
+          : "=v" (t_) : "m" (*(float[1]){ x }) ); \
+    t_; \
+})
+#   define max(x, y) BR_(maxps, _mask, x, y, undef(), ~0)
+#   define min(x, y) BR_(minps, _mask, x, y, undef(), ~0)
+#   define mix(x, y) B(movaps, _mask, x, y, (0b0101010101010101 & ALL_TRUE))
+#   if VEC_SIZE == 16
+#    define interleave_hi(x, y) B(unpckhps, _mask, x, y, undef(), ~0)
+#    define interleave_lo(x, y) B(unpcklps, _mask, x, y, undef(), ~0)
+#    define swap(x) B(shufps, _mask, x, x, 0b00011011, undef(), ~0)
+#   endif
+#  elif FLOAT_SIZE == 8
+#   if VEC_SIZE >= 32
+#    define broadcast(x) ({ \
+    vec_t t_; \
+    asm ( "%{evex%} vbroadcastsd %1, %0" : "=v" (t_) \
+          : "m" (*(double[1]){ x }) ); \
+    t_; \
+})
+#   else
+#    define broadcast(x) ({ \
+    vec_t t_; \
+    asm ( "%{evex%} vpbroadcastq %1, %0" \
+          : "=v" (t_) : "m" (*(double[1]){ x }) ); \
+    t_; \
+})
+#   endif
+#   define max(x, y) BR_(maxpd, _mask, x, y, undef(), ~0)
+#   define min(x, y) BR_(minpd, _mask, x, y, undef(), ~0)
+#   define mix(x, y) B(movapd, _mask, x, y, 0b01010101)
+#   if VEC_SIZE == 16
+#    define interleave_hi(x, y) B(unpckhpd, _mask, x, y, undef(), ~0)
+#    define interleave_lo(x, y) B(unpcklpd, _mask, x, y, undef(), ~0)
+#    define swap(x) B(shufpd, _mask, x, x, 0b01, undef(), ~0)
+#   endif
+#  endif
+# endif
 #elif FLOAT_SIZE == 4 && defined(__SSE__)
 # if VEC_SIZE == 32 && defined(__AVX__)
 #  if defined(__AVX2__)
@@ -191,7 +269,30 @@ static inline bool _to_bool(byte_vec_t b
 #  define sqrt(x) scalar_1op(x, "sqrtsd %[in], %[out]")
 # endif
 #endif
-#if VEC_SIZE == 16 && defined(__SSE2__)
+#if (INT_SIZE == 4 || UINT_SIZE == 4 || INT_SIZE == 8 || UINT_SIZE == 8) && \
+     defined(__AVX512F__) && (VEC_SIZE == 64 || defined(__AVX512VL__))
+# if INT_SIZE == 4 || UINT_SIZE == 4
+#  define mix(x, y) ((vec_t)B(movdqa32_, _mask, (vsi_t)(x), (vsi_t)(y), \
+                              (0b0101010101010101 & ((1 << ELEM_COUNT) - 1))))
+# elif INT_SIZE == 8 || UINT_SIZE == 8
+#  define mix(x, y) ((vec_t)B(movdqa64_, _mask, (vdi_t)(x), (vdi_t)(y), 0b01010101))
+# endif
+# if INT_SIZE == 4
+#  define max(x, y) B(pmaxsd, _mask, x, y, undef(), ~0)
+#  define min(x, y) B(pminsd, _mask, x, y, undef(), ~0)
+#  define mul_full(x, y) ((vec_t)B(pmuldq, _mask, x, y, (vdi_t)undef(), ~0))
+# elif UINT_SIZE == 4
+#  define max(x, y) ((vec_t)B(pmaxud, _mask, (vsi_t)(x), (vsi_t)(y), (vsi_t)undef(), ~0))
+#  define min(x, y) ((vec_t)B(pminud, _mask, (vsi_t)(x), (vsi_t)(y), (vsi_t)undef(), ~0))
+#  define mul_full(x, y) ((vec_t)B(pmuludq, _mask, (vsi_t)(x), (vsi_t)(y), (vdi_t)undef(), ~0))
+# elif INT_SIZE == 8
+#  define max(x, y) ((vec_t)B(pmaxsq, _mask, (vdi_t)(x), (vdi_t)(y), (vdi_t)undef(), ~0))
+#  define min(x, y) ((vec_t)B(pminsq, _mask, (vdi_t)(x), (vdi_t)(y), (vdi_t)undef(), ~0))
+# elif UINT_SIZE == 8
+#  define max(x, y) ((vec_t)B(pmaxuq, _mask, (vdi_t)(x), (vdi_t)(y), (vdi_t)undef(), ~0))
+#  define min(x, y) ((vec_t)B(pminuq, _mask, (vdi_t)(x), (vdi_t)(y), (vdi_t)undef(), ~0))
+# endif
+#elif VEC_SIZE == 16 && defined(__SSE2__)
 # if INT_SIZE == 1 || UINT_SIZE == 1
 #  define interleave_hi(x, y) ((vec_t)__builtin_ia32_punpckhbw128((vqi_t)(x), (vqi_t)(y)))
 #  define interleave_lo(x, y) ((vec_t)__builtin_ia32_punpcklbw128((vqi_t)(x), (vqi_t)(y)))
@@ -587,6 +688,10 @@ static inline bool _to_bool(byte_vec_t b
 # endif
 #endif
 
+#if defined(__AVX512F__) && defined(FLOAT_SIZE)
+# include "simd-fma.c"
+#endif
+
 int simd_test(void)
 {
     unsigned int i, j;
@@ -1034,7 +1139,8 @@ int simd_test(void)
 # endif
 #endif
 
-#if defined(__XOP__) && VEC_SIZE == 16 && (INT_SIZE == 2 || INT_SIZE == 4)
+#if (defined(__XOP__) && VEC_SIZE == 16 && (INT_SIZE == 2 || INT_SIZE == 4)) || \
+    (defined(__AVX512F__) && defined(FLOAT_SIZE))
     return -fma_test();
 #endif
 
--- a/tools/tests/x86_emulator/simd.h
+++ b/tools/tests/x86_emulator/simd.h
@@ -70,9 +70,111 @@ typedef int __attribute__((vector_size(V
 typedef long long __attribute__((vector_size(VEC_SIZE))) vdi_t;
 #endif
 
+#if VEC_SIZE == 16
+# define B(n, s, a...)   __builtin_ia32_ ## n ## 128 ## s(a)
+# define B_(n, s, a...)  __builtin_ia32_ ## n ##        s(a)
+#elif VEC_SIZE == 32
+# define B(n, s, a...)   __builtin_ia32_ ## n ## 256 ## s(a)
+#elif VEC_SIZE == 64
+# define B(n, s, a...)   __builtin_ia32_ ## n ## 512 ## s(a)
+# define BR(n, s, a...)  __builtin_ia32_ ## n ## 512 ## s(a, 4)
+#endif
+#ifndef B_
+# define B_ B
+#endif
+#ifndef BR
+# define BR B
+# define BR_ B_
+#endif
+#ifndef BR_
+# define BR_ BR
+#endif
+
+#ifdef __AVX512F__
+
+/*
+ * The original plan was to effect use of EVEX encodings for scalar as well as
+ * 128- and 256-bit insn variants by restricting the compiler to use (on 64-bit
+ * only of course) XMM16-XMM31 only. All sorts of compiler errors result when
+ * doing this with gcc 8.2. Therefore resort to injecting {evex} prefixes,
+ * which has the benefit of also working for 32-bit. Granted, there is a lot of
+ * escaping to get right here.
+ */
+asm ( ".macro override insn    \n\t"
+      ".macro $\\insn o:vararg \n\t"
+      ".purgem \\insn          \n\t"
+      "{evex} \\insn \\(\\)o   \n\t"
+      ".macro \\insn o:vararg  \n\t"
+      "$\\insn \\(\\(\\))o     \n\t"
+      ".endm                   \n\t"
+      ".endm                   \n\t"
+      ".macro \\insn o:vararg  \n\t"
+      "$\\insn \\(\\)o         \n\t"
+      ".endm                   \n\t"
+      ".endm" );
+
+#define OVR(n) asm ( "override v" #n )
+#define OVR_SFP(n) OVR(n ## sd); OVR(n ## ss)
+
+#ifdef __AVX512VL__
+# ifdef __AVX512BW__
+#  define OVR_BW(n) OVR(p ## n ## b); OVR(p ## n ## w)
+# else
+#  define OVR_BW(n)
+# endif
+# define OVR_DQ(n) OVR(p ## n ## d); OVR(p ## n ## q)
+# define OVR_VFP(n) OVR(n ## pd); OVR(n ## ps)
+#else
+# define OVR_BW(n)
+# define OVR_DQ(n)
+# define OVR_VFP(n)
+#endif
+
+#define OVR_FMA(n, w) OVR_ ## w(n ## 132); OVR_ ## w(n ## 213); \
+                      OVR_ ## w(n ## 231)
+#define OVR_FP(n) OVR_VFP(n); OVR_SFP(n)
+#define OVR_INT(n) OVR_BW(n); OVR_DQ(n)
+
+OVR_SFP(broadcast);
+OVR_SFP(comi);
+OVR_FP(add);
+OVR_FP(div);
+OVR(extractps);
+OVR_FMA(fmadd, FP);
+OVR_FMA(fmsub, FP);
+OVR_FMA(fnmadd, FP);
+OVR_FMA(fnmsub, FP);
+OVR(insertps);
+OVR_FP(max);
+OVR_FP(min);
+OVR(movd);
+OVR(movq);
+OVR_SFP(mov);
+OVR_FP(mul);
+OVR_FP(sqrt);
+OVR_FP(sub);
+OVR_SFP(ucomi);
+
+#undef OVR_VFP
+#undef OVR_SFP
+#undef OVR_INT
+#undef OVR_FP
+#undef OVR_FMA
+#undef OVR_DQ
+#undef OVR_BW
+#undef OVR
+
+#endif
+
 /*
  * Suppress value propagation by the compiler, preventing unwanted
  * optimization. This at once makes the compiler use memory operands
  * more often, which for our purposes is the more interesting case.
  */
 #define touch(var) asm volatile ( "" : "+m" (var) )
+
+static inline vec_t undef(void)
+{
+    vec_t v = v;
+    return v;
+}
--- a/tools/tests/x86_emulator/simd-fma.c
+++ b/tools/tests/x86_emulator/simd-fma.c
@@ -1,10 +1,9 @@
+#if !defined(__XOP__) && !defined(__AVX512F__)
 #include "simd.h"
-
-#ifndef __XOP__
 ENTRY(fma_test);
 #endif
 
-#if VEC_SIZE < 16
+#if VEC_SIZE < 16 && !defined(to_bool)
 # define to_bool(cmp) (!~(cmp)[0])
 #elif VEC_SIZE == 16
 # if FLOAT_SIZE == 4
@@ -24,7 +23,13 @@ ENTRY(fma_test);
 # define eq(x, y) to_bool((x) == (y))
 #endif
 
-#if VEC_SIZE == 16
+#if defined(__AVX512F__) && VEC_SIZE > FLOAT_SIZE
+# if FLOAT_SIZE == 4
+#  define fmaddsub(x, y, z) BR(vfmaddsubps, _mask, x, y, z, ~0)
+# elif FLOAT_SIZE == 8
+#  define fmaddsub(x, y, z) BR(vfmaddsubpd, _mask, x, y, z, ~0)
+# endif
+#elif VEC_SIZE == 16
 # if FLOAT_SIZE == 4
 #  define addsub(x, y) __builtin_ia32_addsubps(x, y)
 #  if defined(__FMA4__) || defined(__FMA__)
@@ -50,6 +55,10 @@ ENTRY(fma_test);
 # endif
 #endif
 
+#if defined(fmaddsub) && !defined(addsub)
+# define addsub(x, y) fmaddsub(x, broadcast(1), y)
+#endif
+
 int fma_test(void)
 {
     unsigned int i;
--- a/tools/tests/x86_emulator/test_x86_emulator.c
+++ b/tools/tests/x86_emulator/test_x86_emulator.c
@@ -21,6 +21,7 @@ asm ( ".pushsection .test, \"ax\", @prog
 #include "avx512f-opmask.h"
 #include "avx512dq-opmask.h"
 #include "avx512bw-opmask.h"
+#include "avx512f.h"
 
 #define verbose false /* Switch to true for far more logging. */
 
@@ -248,6 +249,14 @@ static const struct {
     SIMD(OPMASK/b,    avx512dq_opmask,         1),
     SIMD(OPMASK/d,    avx512bw_opmask,         4),
     SIMD(OPMASK/q,    avx512bw_opmask,         8),
+    SIMD(AVX512F f32 scalar,  avx512f,        f4),
+    SIMD(AVX512F f32x16,      avx512f,      64f4),
+    SIMD(AVX512F f64 scalar,  avx512f,        f8),
+    SIMD(AVX512F f64x8,       avx512f,      64f8),
+    SIMD(AVX512F s32x16,      avx512f,      64i4),
+    SIMD(AVX512F u32x16,      avx512f,      64u4),
+    SIMD(AVX512F s64x8,       avx512f,      64i8),
+    SIMD(AVX512F u64x8,       avx512f,      64u8),
 #undef SIMD_
 #undef SIMD
 };




_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

^ permalink raw reply	[flat|nested] 465+ messages in thread

* [PATCH v3 24/34] x86emul: support AVX512{F, BW, DQ} integer broadcast insns
  2018-09-18 11:46 ` [PATCH v3 00/34] x86emul: fixes, improvements, and beginnings of AVX512 support Jan Beulich
                     ` (22 preceding siblings ...)
  2018-09-18 12:07   ` [PATCH v3 23/34] x86emul: basic AVX512F testing Jan Beulich
@ 2018-09-18 12:08   ` Jan Beulich
  2018-09-18 12:09   ` [PATCH v3 25/34] x86emul: basic AVX512VL testing Jan Beulich
                     ` (9 subsequent siblings)
  33 siblings, 0 replies; 465+ messages in thread
From: Jan Beulich @ 2018-09-18 12:08 UTC (permalink / raw)
  To: xen-devel; +Cc: George Dunlap, Andrew Cooper, Wei Liu

Note that the pbroadcastw table entry in evex-disp8.c is slightly
different from what one would expect, due to it requiring EVEX.W to be
zero.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v3: New.

--- a/tools/tests/x86_emulator/evex-disp8.c
+++ b/tools/tests/x86_emulator/evex-disp8.c
@@ -150,6 +150,9 @@ static const struct test avx512f_all[] =
     INSN(paddq,        66,   0f, d4,    vl,      q, vl),
     INSN(pand,         66,   0f, db,    vl,     dq, vl),
     INSN(pandn,        66,   0f, df,    vl,     dq, vl),
+//       pbroadcast,   66, 0f38, 7c,          dq64
+    INSN(pbroadcastd,  66, 0f38, 58,    el,      d, el),
+    INSN(pbroadcastq,  66, 0f38, 59,    el,      q, el),
     INSN(pcmp,         66, 0f3a, 1f,    vl,     dq, vl),
     INSN(pcmpeqd,      66,   0f, 76,    vl,      d, vl),
     INSN(pcmpeqq,      66, 0f38, 29,    vl,      q, vl),
@@ -208,6 +211,7 @@ static const struct test avx512f_128[] =
 
 static const struct test avx512f_no128[] = {
     INSN(broadcastf32x4, 66, 0f38, 1a, el_4,  d, vl),
+    INSN(broadcasti32x4, 66, 0f38, 5a, el_4,  d, vl),
     INSN(broadcastsd,    66, 0f38, 19, el,    q, el),
     INSN(extractf32x4,   66, 0f3a, 19, el_4,  d, vl),
     INSN(extracti32x4,   66, 0f3a, 39, el_4,  d, vl),
@@ -217,6 +221,7 @@ static const struct test avx512f_no128[]
 
 static const struct test avx512f_512[] = {
     INSN(broadcastf64x4, 66, 0f38, 1b, el_4, q, vl),
+    INSN(broadcasti64x4, 66, 0f38, 5b, el_4, q, vl),
     INSN(extractf64x4,   66, 0f3a, 1b, el_4, q, vl),
     INSN(extracti64x4,   66, 0f3a, 3b, el_4, q, vl),
     INSN(insertf64x4,    66, 0f3a, 1a, el_4, q, vl),
@@ -236,6 +241,10 @@ static const struct test avx512bw_all[]
     INSN(paddw,       66,   0f, fd,    vl,   w, vl),
     INSN(pavgb,       66,   0f, e0,    vl,   b, vl),
     INSN(pavgw,       66,   0f, e3,    vl,   w, vl),
+    INSN(pbroadcastb, 66, 0f38, 78,    el,   b, el),
+//       pbroadcastb, 66, 0f38, 7a,          b
+    INSN(pbroadcastw, 66, 0f38, 79,    el_2, b, vl),
+//       pbroadcastw, 66, 0f38, 7b,          b
     INSN(pcmp,        66, 0f3a, 3f,    vl,  bw, vl),
     INSN(pcmpeqb,     66,   0f, 74,    vl,   b, vl),
     INSN(pcmpeqw,     66,   0f, 75,    vl,   w, vl),
@@ -287,6 +296,7 @@ static const struct test avx512bw_128[]
 static const struct test avx512dq_all[] = {
     INSN_PFP(and,              0f, 54),
     INSN_PFP(andn,             0f, 55),
+    INSN(broadcasti32x2, 66, 0f38, 59, el_2,  d, vl),
     INSN_PFP(or,               0f, 56),
     INSN(pmullq,         66, 0f38, 40,   vl,  q, vl),
     INSN_PFP(xor,              0f, 57),
@@ -300,6 +310,7 @@ static const struct test avx512dq_128[]
 static const struct test avx512dq_no128[] = {
     INSN(broadcastf32x2, 66, 0f38, 19, el_2, d, vl),
     INSN(broadcastf64x2, 66, 0f38, 1a, el_2, q, vl),
+    INSN(broadcasti64x2, 66, 0f38, 5a, el_2, q, vl),
     INSN(extractf64x2,   66, 0f3a, 19, el_2, q, vl),
     INSN(extracti64x2,   66, 0f3a, 39, el_2, q, vl),
     INSN(insertf64x2,    66, 0f3a, 18, el_2, q, vl),
@@ -308,6 +319,7 @@ static const struct test avx512dq_no128[
 
 static const struct test avx512dq_512[] = {
     INSN(broadcastf32x8, 66, 0f38, 1b, el_8, d, vl),
+    INSN(broadcasti32x8, 66, 0f38, 5b, el_8, d, vl),
     INSN(extractf32x8,   66, 0f3a, 1b, el_8, d, vl),
     INSN(extracti32x8,   66, 0f3a, 3b, el_8, d, vl),
     INSN(insertf32x8,    66, 0f3a, 1a, el_8, d, vl),
--- a/tools/tests/x86_emulator/simd.c
+++ b/tools/tests/x86_emulator/simd.c
@@ -272,9 +272,33 @@ static inline bool _to_bool(byte_vec_t b
 #if (INT_SIZE == 4 || UINT_SIZE == 4 || INT_SIZE == 8 || UINT_SIZE == 8) && \
      defined(__AVX512F__) && (VEC_SIZE == 64 || defined(__AVX512VL__))
 # if INT_SIZE == 4 || UINT_SIZE == 4
+#  define broadcast(x) ({ \
+    vec_t t_; \
+    asm ( "%{evex%} vpbroadcastd %1, %0" \
+          : "=v" (t_) : "m" (*(int[1]){ x }) ); \
+    t_; \
+})
+#  define broadcast2(x) ({ \
+    vec_t t_; \
+    asm ( "vpbroadcastd %k1, %0" : "=v" (t_) : "r" (x) ); \
+    t_; \
+})
 #  define mix(x, y) ((vec_t)B(movdqa32_, _mask, (vsi_t)(x), (vsi_t)(y), \
                               (0b0101010101010101 & ((1 << ELEM_COUNT) - 1))))
 # elif INT_SIZE == 8 || UINT_SIZE == 8
+#  define broadcast(x) ({ \
+    vec_t t_; \
+    asm ( "%{evex%} vpbroadcastq %1, %0" \
+          : "=v" (t_) : "m" (*(long long[1]){ x }) ); \
+    t_; \
+})
+#  ifdef __x86_64__
+#   define broadcast2(x) ({ \
+    vec_t t_; \
+    asm ( "vpbroadcastq %1, %0" : "=v" (t_) : "r" ((x) + 0ULL) ); \
+    t_; \
+})
+#  endif
 #  define mix(x, y) ((vec_t)B(movdqa64_, _mask, (vdi_t)(x), (vdi_t)(y), 0b01010101))
 # endif
 # if INT_SIZE == 4
@@ -971,10 +995,14 @@ int simd_test(void)
     if ( !eq(swap2(src), inv) ) return __LINE__;
 #endif
 
-#if defined(broadcast)
+#ifdef broadcast
     if ( !eq(broadcast(ELEM_COUNT + 1), src + inv) ) return __LINE__;
 #endif
 
+#ifdef broadcast2
+    if ( !eq(broadcast2(ELEM_COUNT + 1), src + inv) ) return __LINE__;
+#endif
+
 #if defined(interleave_lo) && defined(interleave_hi)
     touch(src);
     x = interleave_lo(inv, src);
--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -452,9 +452,13 @@ static const struct ext0f38_table {
     [0x40] = { .simd_size = simd_packed_int, .d8s = d8s_vl },
     [0x41] = { .simd_size = simd_packed_int, .two_op = 1 },
     [0x45 ... 0x47] = { .simd_size = simd_packed_int, .d8s = d8s_vl },
-    [0x58 ... 0x59] = { .simd_size = simd_other, .two_op = 1 },
-    [0x5a] = { .simd_size = simd_128, .two_op = 1 },
-    [0x78 ... 0x79] = { .simd_size = simd_other, .two_op = 1 },
+    [0x58] = { .simd_size = simd_other, .two_op = 1, .d8s = 2 },
+    [0x59] = { .simd_size = simd_other, .two_op = 1, .d8s = 3 },
+    [0x5a] = { .simd_size = simd_128, .two_op = 1, .d8s = 4 },
+    [0x5b] = { .simd_size = simd_256, .two_op = 1, .d8s = d8s_vl_by_2 },
+    [0x78] = { .simd_size = simd_other, .two_op = 1 },
+    [0x79] = { .simd_size = simd_other, .two_op = 1, .d8s = 1 },
+    [0x7a ... 0x7c] = { .simd_size = simd_none, .two_op = 1 },
     [0x8c] = { .simd_size = simd_packed_int },
     [0x8e] = { .simd_size = simd_packed_int, .to_mem = 1 },
     [0x90 ... 0x93] = { .simd_size = simd_other, .vsib = 1 },
@@ -2615,6 +2619,11 @@ x86_decode_0f38(
         ctxt->opcode |= MASK_INSR(vex.pfx, X86EMUL_OPC_PFX_MASK);
         break;
 
+    case X86EMUL_OPC_EVEX_66(0, 0x7a): /* vpbroadcastb */
+    case X86EMUL_OPC_EVEX_66(0, 0x7b): /* vpbroadcastw */
+    case X86EMUL_OPC_EVEX_66(0, 0x7c): /* vpbroadcast{d,q} */
+        break;
+
     case 0xf0: /* movbe / crc32 */
         state->desc |= repne_prefix() ? ByteOp : Mov;
         if ( rep_prefix() )
@@ -8182,6 +8191,8 @@ x86_emulate(
         goto avx512f_no_sae;
 
     case X86EMUL_OPC_EVEX_66(0x0f38, 0x18): /* vbroadcastss xmm/m32,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0x58): /* vpbroadcastd xmm/m32,[xyz]mm{k} */
+        op_bytes = elem_bytes;
         generate_exception_if(evex.w || evex.br, EXC_UD);
     avx512_broadcast:
         /*
@@ -8200,17 +8211,27 @@ x86_emulate(
 
     case X86EMUL_OPC_EVEX_66(0x0f38, 0x1b): /* vbroadcastf32x8 m256,zmm{k} */
                                             /* vbroadcastf64x4 m256,zmm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0x5b): /* vbroadcasti32x8 m256,zmm{k} */
+                                            /* vbroadcasti64x4 m256,zmm{k} */
         generate_exception_if(ea.type != OP_MEM || evex.lr != 2, EXC_UD);
         /* fall through */
     case X86EMUL_OPC_EVEX_66(0x0f38, 0x19): /* vbroadcastsd xmm/m64,{y,z}mm{k} */
                                             /* vbroadcastf32x2 xmm/m64,{y,z}mm{k} */
-        generate_exception_if(!evex.lr || evex.br, EXC_UD);
+        generate_exception_if(!evex.lr, EXC_UD);
+        /* fall through */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0x59): /* vpbroadcastq xmm/m64,[xyz]mm{k} */
+                                            /* vbroadcasti32x2 xmm/m64,[xyz]mm{k} */
+        if ( b == 0x59 )
+            op_bytes = 8;
+        generate_exception_if(evex.br, EXC_UD);
         if ( !evex.w )
             host_and_vcpu_must_have(avx512dq);
         goto avx512_broadcast;
 
     case X86EMUL_OPC_EVEX_66(0x0f38, 0x1a): /* vbroadcastf32x4 m128,{y,z}mm{k} */
                                             /* vbroadcastf64x2 m128,{y,z}mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0x5a): /* vbroadcasti32x4 m128,{y,z}mm{k} */
+                                            /* vbroadcasti64x2 m128,{y,z}mm{k} */
         generate_exception_if(ea.type != OP_MEM || !evex.lr || evex.br,
                               EXC_UD);
         if ( evex.w )
@@ -8404,6 +8425,45 @@ x86_emulate(
         generate_exception_if(ea.type != OP_MEM || !vex.l || vex.w, EXC_UD);
         goto simd_0f_avx2;
 
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0x78): /* vpbroadcastb xmm/m8,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0x79): /* vpbroadcastw xmm/m16,[xyz]mm{k} */
+        host_and_vcpu_must_have(avx512bw);
+        generate_exception_if(evex.w || evex.br, EXC_UD);
+        op_bytes = elem_bytes = 1 << (b & 1);
+        /* See the comment at the avx512_broadcast label. */
+        op_mask |= !(b & 1 ? !(uint32_t)op_mask : !op_mask);
+        goto avx512f_no_sae;
+
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0x7a): /* vpbroadcastb r32,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0x7b): /* vpbroadcastw r32,[xyz]mm{k} */
+        host_and_vcpu_must_have(avx512bw);
+        generate_exception_if(evex.w, EXC_UD);
+        /* fall through */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0x7c): /* vpbroadcast{d,q} reg,[xyz]mm{k} */
+        generate_exception_if((ea.type != OP_REG || evex.br ||
+                               evex.reg != 0xf || !evex.RX),
+                              EXC_UD);
+        host_and_vcpu_must_have(avx512f);
+        avx512_vlen_check(false);
+        get_fpu(X86EMUL_FPU_zmm);
+
+        opc = init_evex(stub);
+        opc[0] = b;
+        /* Convert GPR source to %rAX. */
+        evex.b = 1;
+        if ( !mode_64bit() )
+            evex.w = 0;
+        opc[1] = modrm & 0xf8;
+        insn_bytes = EVEX_PFX_BYTES + 2;
+        opc[2] = 0xc3;
+
+        copy_EVEX(opc, evex);
+        invoke_stub("", "", "+m" (src.val) : "a" (src.val));
+
+        put_stub(stub);
+        ASSERT(!state->simd_size);
+        break;
+
     case X86EMUL_OPC_VEX_66(0x0f38, 0x8c): /* vpmaskmov{d,q} mem,{x,y}mm,{x,y}mm */
     case X86EMUL_OPC_VEX_66(0x0f38, 0x8e): /* vpmaskmov{d,q} {x,y}mm,{x,y}mm,mem */
         generate_exception_if(ea.type != OP_MEM, EXC_UD);




_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

^ permalink raw reply	[flat|nested] 465+ messages in thread

* [PATCH v3 25/34] x86emul: basic AVX512VL testing
  2018-09-18 11:46 ` [PATCH v3 00/34] x86emul: fixes, improvements, and beginnings of AVX512 support Jan Beulich
                     ` (23 preceding siblings ...)
  2018-09-18 12:08   ` [PATCH v3 24/34] x86emul: support AVX512{F, BW, DQ} integer broadcast insns Jan Beulich
@ 2018-09-18 12:09   ` Jan Beulich
  2018-09-18 12:09   ` [PATCH v3 26/34] x86emul: support AVX512{F, BW} zero- and sign-extending moves Jan Beulich
                     ` (8 subsequent siblings)
  33 siblings, 0 replies; 465+ messages in thread
From: Jan Beulich @ 2018-09-18 12:09 UTC (permalink / raw)
  To: xen-devel; +Cc: George Dunlap, Andrew Cooper, Wei Liu

Test the 128- and 256-bit variants of the insns which have been
implemented already.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v3: New.

--- a/tools/tests/x86_emulator/Makefile
+++ b/tools/tests/x86_emulator/Makefile
@@ -52,7 +52,7 @@ avx2-sg-flts := 4 8
 xop-vecs := $(avx-vecs)
 xop-ints := 1 2 4 8
 xop-flts := $(avx-flts)
-avx512f-vecs := 64
+avx512f-vecs := 64 16 32
 avx512f-ints := 4 8
 avx512f-flts := 4 8
 
--- a/tools/tests/x86_emulator/simd-fma.c
+++ b/tools/tests/x86_emulator/simd-fma.c
@@ -5,13 +5,13 @@ ENTRY(fma_test);
 
 #if VEC_SIZE < 16 && !defined(to_bool)
 # define to_bool(cmp) (!~(cmp)[0])
-#elif VEC_SIZE == 16
+#elif VEC_SIZE == 16 && !defined(__AVX512VL__)
 # if FLOAT_SIZE == 4
 #  define to_bool(cmp) __builtin_ia32_vtestcps(cmp, (vec_t){} == 0)
 # elif FLOAT_SIZE == 8
 #  define to_bool(cmp) __builtin_ia32_vtestcpd(cmp, (vec_t){} == 0)
 # endif
-#elif VEC_SIZE == 32
+#elif VEC_SIZE == 32 && !defined(__AVX512VL__)
 # if FLOAT_SIZE == 4
 #  define to_bool(cmp) __builtin_ia32_vtestcps256(cmp, (vec_t){} == 0)
 # elif FLOAT_SIZE == 8
--- a/tools/tests/x86_emulator/simd.c
+++ b/tools/tests/x86_emulator/simd.c
@@ -533,7 +533,7 @@ static inline bool _to_bool(byte_vec_t b
 #  define rotr(x, n) ((vec_t)__builtin_ia32_palignr128((vdi_t)(x), (vdi_t)(x), (n) * 64))
 # endif
 #endif
-#if VEC_SIZE == 16 && defined(__SSE4_1__)
+#if VEC_SIZE == 16 && defined(__SSE4_1__) && !defined(__AVX512VL__)
 # if INT_SIZE == 1
 #  define max(x, y) ((vec_t)__builtin_ia32_pmaxsb128((vqi_t)(x), (vqi_t)(y)))
 #  define min(x, y) ((vec_t)__builtin_ia32_pminsb128((vqi_t)(x), (vqi_t)(y)))
@@ -587,7 +587,7 @@ static inline bool _to_bool(byte_vec_t b
 #  define mix(x, y) __builtin_ia32_blendpd(x, y, 0b10)
 # endif
 #endif
-#if VEC_SIZE == 32 && defined(__AVX__)
+#if VEC_SIZE == 32 && defined(__AVX__) && !defined(__AVX512VL__)
 # if FLOAT_SIZE == 4
 #  define dot_product(x, y) ({ \
     vec_t t_ = __builtin_ia32_dpps256(x, y, 0b11110001); \
--- a/tools/tests/x86_emulator/simd.h
+++ b/tools/tests/x86_emulator/simd.h
@@ -92,6 +92,15 @@ typedef long long __attribute__((vector_
 
 #ifdef __AVX512F__
 
+#if VEC_SIZE < 64
+# pragma GCC target ( "avx512vl" )
+#endif
+
+#define REN(insn, old, new)                      \
+    asm ( ".macro v" #insn #old " o:vararg \n\t" \
+          "v" #insn #new " \\o             \n\t" \
+          ".endm" )
+
 /*
  * The original plan was to effect use of EVEX encodings for scalar as well as
  * 128- and 256-bit insn variants by restricting the compiler to use (on 64-bit
@@ -135,25 +144,88 @@ asm ( ".macro override insn    \n\t"
 #define OVR_FP(n) OVR_VFP(n); OVR_SFP(n)
 #define OVR_INT(n) OVR_BW(n); OVR_DQ(n)
 
+OVR_INT(broadcast);
 OVR_SFP(broadcast);
 OVR_SFP(comi);
 OVR_FP(add);
+OVR_INT(add);
 OVR_FP(div);
 OVR(extractps);
 OVR_FMA(fmadd, FP);
+OVR_FMA(fmaddsub, VFP);
 OVR_FMA(fmsub, FP);
+OVR_FMA(fmsubadd, VFP);
 OVR_FMA(fnmadd, FP);
 OVR_FMA(fnmsub, FP);
 OVR(insertps);
 OVR_FP(max);
+OVR_INT(maxs);
+OVR_INT(maxu);
 OVR_FP(min);
+OVR_INT(mins);
+OVR_INT(minu);
 OVR(movd);
 OVR(movq);
 OVR_SFP(mov);
+OVR_VFP(mova);
+OVR_VFP(movnt);
+OVR_VFP(movu);
 OVR_FP(mul);
+OVR_VFP(shuf);
+OVR_INT(sll);
+OVR_DQ(sllv);
 OVR_FP(sqrt);
+OVR_INT(sra);
+OVR_DQ(srav);
+OVR_INT(srl);
+OVR_DQ(srlv);
 OVR_FP(sub);
+OVR_INT(sub);
 OVR_SFP(ucomi);
+OVR_VFP(unpckh);
+OVR_VFP(unpckl);
+
+#ifdef __AVX512VL__
+# if ELEM_SIZE == 8 && defined(__AVX512DQ__)
+REN(extract, f128, f64x2);
+REN(extract, i128, i64x2);
+REN(insert, f128, f64x2);
+REN(insert, i128, i64x2);
+# else
+REN(extract, f128, f32x4);
+REN(extract, i128, i32x4);
+REN(insert, f128, f32x4);
+REN(insert, i128, i32x4);
+# endif
+# if ELEM_SIZE == 8
+REN(movdqa, , 64);
+REN(movdqu, , 64);
+REN(pand, , q);
+REN(pandn, , q);
+REN(por, , q);
+REN(pxor, , q);
+# else
+#  if ELEM_SIZE == 1 && defined(__AVX512BW__)
+REN(movdq, a, u8);
+REN(movdqu, , 8);
+#  elif ELEM_SIZE == 2 && defined(__AVX512BW__)
+REN(movdq, a, u16);
+REN(movdqu, , 16);
+#  else
+REN(movdqa, , 32);
+REN(movdqu, , 32);
+#  endif
+REN(pand, , d);
+REN(pandn, , d);
+REN(por, , d);
+REN(pxor, , d);
+# endif
+OVR(movntdq);
+OVR(movntdqa);
+OVR(pmulld);
+OVR(pmuldq);
+OVR(pmuludq);
+#endif
 
 #undef OVR_VFP
 #undef OVR_SFP
--- a/tools/tests/x86_emulator/test_x86_emulator.c
+++ b/tools/tests/x86_emulator/test_x86_emulator.c
@@ -88,6 +88,11 @@ static bool simd_check_avx512f(void)
 }
 #define simd_check_avx512f_opmask simd_check_avx512f
 
+static bool simd_check_avx512f_vl(void)
+{
+    return cpu_has_avx512f && cpu_has_avx512vl;
+}
+
 static bool simd_check_avx512dq(void)
 {
     return cpu_has_avx512dq;
@@ -142,11 +147,21 @@ static const struct {
       .check_cpu = simd_check_ ## feat,                             \
       .set_regs = simd_set_regs,                                    \
       .check_regs = simd_check_regs }
+#define AVX512VL_(bits, desc, feat, form)                          \
+    { .code = feat ## _x86_ ## bits ## _D ## _ ## form,            \
+      .size = sizeof(feat ## _x86_ ## bits ## _D ## _ ## form),    \
+      .bitness = bits, .name = "AVX512" #desc,                     \
+      .check_cpu = simd_check_ ## feat ## _vl,                     \
+      .set_regs = simd_set_regs,                                   \
+      .check_regs = simd_check_regs }
 #ifdef __x86_64__
 # define SIMD(desc, feat, form) SIMD_(64, desc, feat, form), \
                                 SIMD_(32, desc, feat, form)
+# define AVX512VL(desc, feat, form) AVX512VL_(64, desc, feat, form), \
+                                    AVX512VL_(32, desc, feat, form)
 #else
 # define SIMD(desc, feat, form) SIMD_(32, desc, feat, form)
+# define AVX512VL(desc, feat, form) AVX512VL_(32, desc, feat, form)
 #endif
     SIMD(3DNow! single,          _3dnow,     8f4),
     SIMD(SSE scalar single,      sse,         f4),
@@ -257,6 +272,20 @@ static const struct {
     SIMD(AVX512F u32x16,      avx512f,      64u4),
     SIMD(AVX512F s64x8,       avx512f,      64i8),
     SIMD(AVX512F u64x8,       avx512f,      64u8),
+    AVX512VL(VL f32x4,        avx512f,      16f4),
+    AVX512VL(VL f64x2,        avx512f,      16f8),
+    AVX512VL(VL f32x8,        avx512f,      32f4),
+    AVX512VL(VL f64x4,        avx512f,      32f8),
+    AVX512VL(VL s32x4,        avx512f,      16i4),
+    AVX512VL(VL u32x4,        avx512f,      16u4),
+    AVX512VL(VL s32x8,        avx512f,      32i4),
+    AVX512VL(VL u32x8,        avx512f,      32u4),
+    AVX512VL(VL s64x2,        avx512f,      16i8),
+    AVX512VL(VL u64x2,        avx512f,      16u8),
+    AVX512VL(VL s64x4,        avx512f,      32i8),
+    AVX512VL(VL u64x4,        avx512f,      32u8),
+#undef AVX512VL_
+#undef AVX512VL
 #undef SIMD_
 #undef SIMD
 };




_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

^ permalink raw reply	[flat|nested] 465+ messages in thread

* [PATCH v3 26/34] x86emul: support AVX512{F, BW} zero- and sign-extending moves
  2018-09-18 11:46 ` [PATCH v3 00/34] x86emul: fixes, improvements, and beginnings of AVX512 support Jan Beulich
                     ` (24 preceding siblings ...)
  2018-09-18 12:09   ` [PATCH v3 25/34] x86emul: basic AVX512VL testing Jan Beulich
@ 2018-09-18 12:09   ` Jan Beulich
  2018-09-18 12:09   ` [PATCH v3 27/34] x86emul: support AVX512{F, BW} down conversion moves Jan Beulich
                     ` (7 subsequent siblings)
  33 siblings, 0 replies; 465+ messages in thread
From: Jan Beulich @ 2018-09-18 12:09 UTC (permalink / raw)
  To: xen-devel; +Cc: George Dunlap, Andrew Cooper, Wei Liu

Note that the testing in simd.c doesn't really follow the ISA extension
pattern: to fit the scheme, extensions from byte- and word-granular
vectors can (currently) sensibly only be tested in the AVX512BW case, so
the respective abstraction macros will be added there rather than
here.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v3: New.

--- a/tools/tests/x86_emulator/evex-disp8.c
+++ b/tools/tests/x86_emulator/evex-disp8.c
@@ -163,6 +163,16 @@ static const struct test avx512f_all[] =
     INSN(pmaxu,        66, 0f38, 3f,    vl,     dq, vl),
     INSN(pmins,        66, 0f38, 39,    vl,     dq, vl),
     INSN(pminu,        66, 0f38, 3b,    vl,     dq, vl),
+    INSN(pmovsxbd,     66, 0f38, 21,    vl_4,    b, vl),
+    INSN(pmovsxbq,     66, 0f38, 22,    vl_8,    b, vl),
+    INSN(pmovsxwd,     66, 0f38, 23,    vl_2,    w, vl),
+    INSN(pmovsxwq,     66, 0f38, 24,    vl_4,    w, vl),
+    INSN(pmovsxdq,     66, 0f38, 25,    vl_2, d_nb, vl),
+    INSN(pmovzxbd,     66, 0f38, 31,    vl_4,    b, vl),
+    INSN(pmovzxbq,     66, 0f38, 32,    vl_8,    b, vl),
+    INSN(pmovzxwd,     66, 0f38, 33,    vl_2,    w, vl),
+    INSN(pmovzxwq,     66, 0f38, 34,    vl_4,    w, vl),
+    INSN(pmovzxdq,     66, 0f38, 35,    vl_2, d_nb, vl),
     INSN(pmuldq,       66, 0f38, 28,    vl,      q, vl),
     INSN(pmulld,       66, 0f38, 40,    vl,      d, vl),
     INSN(pmuludq,      66,   0f, f4,    vl,      q, vl),
@@ -260,6 +270,8 @@ static const struct test avx512bw_all[]
     INSN(pminsw,      66,   0f, ea,    vl,   w, vl),
     INSN(pminub,      66,   0f, da,    vl,   b, vl),
     INSN(pminuw,      66, 0f38, 3a,    vl,   w, vl),
+    INSN(pmovsxbw,    66, 0f38, 20,    vl_2, b, vl),
+    INSN(pmovzxbw,    66, 0f38, 30,    vl_2, b, vl),
     INSN(pmulhuw,     66,   0f, e4,    vl,   w, vl),
     INSN(pmulhw,      66,   0f, e5,    vl,   w, vl),
     INSN(pmullw,      66,   0f, d5,    vl,   w, vl),
--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -441,13 +441,23 @@ static const struct ext0f38_table {
     [0x1a] = { .simd_size = simd_128, .two_op = 1, .d8s = 4 },
     [0x1b] = { .simd_size = simd_256, .two_op = 1, .d8s = d8s_vl_by_2 },
     [0x1c ... 0x1e] = { .simd_size = simd_packed_int, .two_op = 1 },
-    [0x20 ... 0x25] = { .simd_size = simd_other, .two_op = 1 },
+    [0x20] = { .simd_size = simd_other, .two_op = 1, .d8s = d8s_vl_by_2 },
+    [0x21] = { .simd_size = simd_other, .two_op = 1, .d8s = d8s_vl_by_4 },
+    [0x22] = { .simd_size = simd_other, .two_op = 1, .d8s = d8s_vl_by_8 },
+    [0x23] = { .simd_size = simd_other, .two_op = 1, .d8s = d8s_vl_by_2 },
+    [0x24] = { .simd_size = simd_other, .two_op = 1, .d8s = d8s_vl_by_4 },
+    [0x25] = { .simd_size = simd_other, .two_op = 1, .d8s = d8s_vl_by_2 },
     [0x26 ... 0x29] = { .simd_size = simd_packed_int, .d8s = d8s_vl },
     [0x2a] = { .simd_size = simd_packed_int, .two_op = 1, .d8s = d8s_vl },
     [0x2b] = { .simd_size = simd_packed_int },
     [0x2c ... 0x2d] = { .simd_size = simd_packed_fp },
     [0x2e ... 0x2f] = { .simd_size = simd_packed_fp, .to_mem = 1 },
-    [0x30 ... 0x35] = { .simd_size = simd_other, .two_op = 1 },
+    [0x30] = { .simd_size = simd_other, .two_op = 1, .d8s = d8s_vl_by_2 },
+    [0x31] = { .simd_size = simd_other, .two_op = 1, .d8s = d8s_vl_by_4 },
+    [0x32] = { .simd_size = simd_other, .two_op = 1, .d8s = d8s_vl_by_8 },
+    [0x33] = { .simd_size = simd_other, .two_op = 1, .d8s = d8s_vl_by_2 },
+    [0x34] = { .simd_size = simd_other, .two_op = 1, .d8s = d8s_vl_by_4 },
+    [0x35] = { .simd_size = simd_other, .two_op = 1, .d8s = d8s_vl_by_2 },
     [0x36 ... 0x3f] = { .simd_size = simd_packed_int, .d8s = d8s_vl },
     [0x40] = { .simd_size = simd_packed_int, .d8s = d8s_vl },
     [0x41] = { .simd_size = simd_packed_int, .two_op = 1 },
@@ -8297,6 +8307,25 @@ x86_emulate(
         op_bytes = 16 >> (pmov_convert_delta[b & 7] - vex.l);
         goto simd_0f_int;
 
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0x20): /* vpmovsxbw {x,y}mm/mem,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0x30): /* vpmovzxbw {x,y}mm/mem,[xyz]mm{k} */
+        host_and_vcpu_must_have(avx512bw);
+        /* fall through */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0x21): /* vpmovsxbd xmm/mem,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0x22): /* vpmovsxbq xmm/mem,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0x23): /* vpmovsxwd {x,y}mm/mem,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0x24): /* vpmovsxwq xmm/mem,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0x25): /* vpmovsxdq {x,y}mm/mem,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0x31): /* vpmovzxbd xmm/mem,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0x32): /* vpmovzxbq xmm/mem,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0x33): /* vpmovzxwd {x,y}mm/mem,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0x34): /* vpmovzxwq xmm/mem,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0x35): /* vpmovzxdq {x,y}mm/mem,[xyz]mm{k} */
+        generate_exception_if(evex.w && (b & 7) == 5, EXC_UD);
+        op_bytes = 32 >> (pmov_convert_delta[b & 7] + 1 - evex.lr);
+        elem_bytes = (b & 7) < 3 ? 1 : (b & 7) != 5 ? 2 : 4;
+        goto avx512f_no_sae;
+
     case X86EMUL_OPC_66(0x0f38, 0x2a):     /* movntdqa m128,xmm */
     case X86EMUL_OPC_VEX_66(0x0f38, 0x2a): /* vmovntdqa mem,{x,y}mm */
         generate_exception_if(ea.type != OP_MEM, EXC_UD);
--- a/tools/tests/x86_emulator/simd.c
+++ b/tools/tests/x86_emulator/simd.c
@@ -305,10 +305,12 @@ static inline bool _to_bool(byte_vec_t b
 #  define max(x, y) B(pmaxsd, _mask, x, y, undef(), ~0)
 #  define min(x, y) B(pminsd, _mask, x, y, undef(), ~0)
 #  define mul_full(x, y) ((vec_t)B(pmuldq, _mask, x, y, (vdi_t)undef(), ~0))
+#  define widen1(x) ((vec_t)B(pmovsxdq, _mask, x, (vdi_t)undef(), ~0))
 # elif UINT_SIZE == 4
 #  define max(x, y) ((vec_t)B(pmaxud, _mask, (vsi_t)(x), (vsi_t)(y), (vsi_t)undef(), ~0))
 #  define min(x, y) ((vec_t)B(pminud, _mask, (vsi_t)(x), (vsi_t)(y), (vsi_t)undef(), ~0))
 #  define mul_full(x, y) ((vec_t)B(pmuludq, _mask, (vsi_t)(x), (vsi_t)(y), (vdi_t)undef(), ~0))
+#  define widen1(x) ((vec_t)B(pmovzxdq, _mask, (vsi_half_t)(x), (vdi_t)undef(), ~0))
 # elif INT_SIZE == 8
 #  define max(x, y) ((vec_t)B(pmaxsq, _mask, (vdi_t)(x), (vdi_t)(y), (vdi_t)undef(), ~0))
 #  define min(x, y) ((vec_t)B(pminsq, _mask, (vdi_t)(x), (vdi_t)(y), (vdi_t)undef(), ~0))
--- a/tools/tests/x86_emulator/simd.h
+++ b/tools/tests/x86_emulator/simd.h
@@ -171,6 +171,16 @@ OVR_VFP(mova);
 OVR_VFP(movnt);
 OVR_VFP(movu);
 OVR_FP(mul);
+OVR(pmovsxbd);
+OVR(pmovsxbq);
+OVR(pmovsxdq);
+OVR(pmovsxwd);
+OVR(pmovsxwq);
+OVR(pmovzxbd);
+OVR(pmovzxbq);
+OVR(pmovzxdq);
+OVR(pmovzxwd);
+OVR(pmovzxwq);
 OVR_VFP(shuf);
 OVR_INT(sll);
 OVR_DQ(sllv);




_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

^ permalink raw reply	[flat|nested] 465+ messages in thread

* [PATCH v3 27/34] x86emul: support AVX512{F, BW} down conversion moves
  2018-09-18 11:46 ` [PATCH v3 00/34] x86emul: fixes, improvements, and beginnings of AVX512 support Jan Beulich
                     ` (25 preceding siblings ...)
  2018-09-18 12:09   ` [PATCH v3 26/34] x86emul: support AVX512{F, BW} zero- and sign-extending moves Jan Beulich
@ 2018-09-18 12:09   ` Jan Beulich
  2018-09-18 12:10   ` [PATCH v3 28/34] x86emul: support AVX512{F, BW} integer unpack insns Jan Beulich
                     ` (6 subsequent siblings)
  33 siblings, 0 replies; 465+ messages in thread
From: Jan Beulich @ 2018-09-18 12:09 UTC (permalink / raw)
  To: xen-devel; +Cc: George Dunlap, Andrew Cooper, Wei Liu

Note that the vpmov{,s,us}{d,q}w table entries in evex-disp8.c are
slightly different from what one would expect (they use "b" where "w"
would be expected), because these insns require EVEX.W to be zero.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v3: New.

--- a/tools/tests/x86_emulator/evex-disp8.c
+++ b/tools/tests/x86_emulator/evex-disp8.c
@@ -163,11 +163,26 @@ static const struct test avx512f_all[] =
     INSN(pmaxu,        66, 0f38, 3f,    vl,     dq, vl),
     INSN(pmins,        66, 0f38, 39,    vl,     dq, vl),
     INSN(pminu,        66, 0f38, 3b,    vl,     dq, vl),
+    INSN(pmovdb,       f3, 0f38, 31,    vl_4,    b, vl),
+    INSN(pmovdw,       f3, 0f38, 33,    vl_2,    b, vl),
+    INSN(pmovqb,       f3, 0f38, 32,    vl_8,    b, vl),
+    INSN(pmovqd,       f3, 0f38, 35,    vl_2, d_nb, vl),
+    INSN(pmovqw,       f3, 0f38, 34,    vl_4,    b, vl),
+    INSN(pmovsdb,      f3, 0f38, 21,    vl_4,    b, vl),
+    INSN(pmovsdw,      f3, 0f38, 23,    vl_2,    b, vl),
+    INSN(pmovsqb,      f3, 0f38, 22,    vl_8,    b, vl),
+    INSN(pmovsqd,      f3, 0f38, 25,    vl_2, d_nb, vl),
+    INSN(pmovsqw,      f3, 0f38, 24,    vl_4,    b, vl),
     INSN(pmovsxbd,     66, 0f38, 21,    vl_4,    b, vl),
     INSN(pmovsxbq,     66, 0f38, 22,    vl_8,    b, vl),
     INSN(pmovsxwd,     66, 0f38, 23,    vl_2,    w, vl),
     INSN(pmovsxwq,     66, 0f38, 24,    vl_4,    w, vl),
     INSN(pmovsxdq,     66, 0f38, 25,    vl_2, d_nb, vl),
+    INSN(pmovusdb,     f3, 0f38, 11,    vl_4,    b, vl),
+    INSN(pmovusdw,     f3, 0f38, 13,    vl_2,    b, vl),
+    INSN(pmovusqb,     f3, 0f38, 12,    vl_8,    b, vl),
+    INSN(pmovusqd,     f3, 0f38, 15,    vl_2, d_nb, vl),
+    INSN(pmovusqw,     f3, 0f38, 14,    vl_4,    b, vl),
     INSN(pmovzxbd,     66, 0f38, 31,    vl_4,    b, vl),
     INSN(pmovzxbq,     66, 0f38, 32,    vl_8,    b, vl),
     INSN(pmovzxwd,     66, 0f38, 33,    vl_2,    w, vl),
@@ -270,7 +285,10 @@ static const struct test avx512bw_all[]
     INSN(pminsw,      66,   0f, ea,    vl,   w, vl),
     INSN(pminub,      66,   0f, da,    vl,   b, vl),
     INSN(pminuw,      66, 0f38, 3a,    vl,   w, vl),
+    INSN(pmovswb,     f3, 0f38, 20,    vl_2, b, vl),
     INSN(pmovsxbw,    66, 0f38, 20,    vl_2, b, vl),
+    INSN(pmovuswb,    f3, 0f38, 10,    vl_2, b, vl),
+    INSN(pmovwb,      f3, 0f38, 30,    vl_2, b, vl),
     INSN(pmovzxbw,    66, 0f38, 30,    vl_2, b, vl),
     INSN(pmulhuw,     66,   0f, e4,    vl,   w, vl),
     INSN(pmulhw,      66,   0f, e5,    vl,   w, vl),
--- a/tools/tests/x86_emulator/simd.c
+++ b/tools/tests/x86_emulator/simd.c
@@ -271,6 +271,17 @@ static inline bool _to_bool(byte_vec_t b
 #endif
 #if (INT_SIZE == 4 || UINT_SIZE == 4 || INT_SIZE == 8 || UINT_SIZE == 8) && \
      defined(__AVX512F__) && (VEC_SIZE == 64 || defined(__AVX512VL__))
+# if ELEM_COUNT == 8 /* vextracti{32,64}x4 */ || \
+     (ELEM_COUNT == 16 && ELEM_SIZE == 4 && defined(__AVX512DQ__)) /* vextracti32x8 */ || \
+     (ELEM_COUNT == 4 && ELEM_SIZE == 8 && defined(__AVX512DQ__)) /* vextracti64x2 */
+#  define low_half(x) ({ \
+    half_t t_; \
+    asm ( "vextracti%c[w]x%c[n] $0, %[s], %[d]" \
+          : [d] "=m" (t_) \
+          : [s] "v" (x), [w] "i" (ELEM_SIZE * 8), [n] "i" (ELEM_COUNT / 2) ); \
+    t_; \
+})
+# endif
 # if INT_SIZE == 4 || UINT_SIZE == 4
 #  define broadcast(x) ({ \
     vec_t t_; \
@@ -285,6 +296,7 @@ static inline bool _to_bool(byte_vec_t b
 })
 #  define mix(x, y) ((vec_t)B(movdqa32_, _mask, (vsi_t)(x), (vsi_t)(y), \
                               (0b0101010101010101 & ((1 << ELEM_COUNT) - 1))))
+#  define shrink1(x) ((half_t)B(pmovqd, _mask, (vdi_t)(x), (vsi_half_t){}, ~0))
 # elif INT_SIZE == 8 || UINT_SIZE == 8
 #  define broadcast(x) ({ \
     vec_t t_; \
@@ -714,6 +726,27 @@ static inline bool _to_bool(byte_vec_t b
 # endif
 #endif
 
+#if VEC_SIZE >= 16
+
+# if !defined(low_half) && defined(HALF_SIZE)
+static inline half_t low_half(vec_t x)
+{
+#  if HALF_SIZE < VEC_SIZE
+    half_t y;
+    unsigned int i;
+
+    for ( i = 0; i < ELEM_COUNT / 2; ++i )
+        y[i] = x[i];
+
+    return y;
+#  else
+    return x;
+#  endif
+}
+# endif
+
+#endif
+
 #if defined(__AVX512F__) && defined(FLOAT_SIZE)
 # include "simd-fma.c"
 #endif
@@ -1081,6 +1114,21 @@ int simd_test(void)
 
 #endif
 
+#if defined(widen1) && defined(shrink1)
+    {
+        half_t aux1 = low_half(src), aux2;
+
+        touch(aux1);
+        x = widen1(aux1);
+        touch(x);
+        aux2 = shrink1(x);
+        touch(aux2);
+        for ( i = 0; i < ELEM_COUNT / 2; ++i )
+            if ( aux2[i] != src[i] )
+                return __LINE__;
+    }
+#endif
+
 #ifdef dup_lo
     touch(src);
     x = dup_lo(src);
--- a/tools/tests/x86_emulator/simd.h
+++ b/tools/tests/x86_emulator/simd.h
@@ -70,6 +70,23 @@ typedef int __attribute__((vector_size(V
 typedef long long __attribute__((vector_size(VEC_SIZE))) vdi_t;
 #endif
 
+#if VEC_SIZE >= 16
+
+# if ELEM_COUNT >= 2
+#  if VEC_SIZE > 32
+#   define HALF_SIZE (VEC_SIZE / 2)
+#  else
+#   define HALF_SIZE 16
+#  endif
+typedef typeof((vec_t){}[0]) __attribute__((vector_size(HALF_SIZE))) half_t;
+typedef char __attribute__((vector_size(HALF_SIZE))) vqi_half_t;
+typedef short __attribute__((vector_size(HALF_SIZE))) vhi_half_t;
+typedef int __attribute__((vector_size(HALF_SIZE))) vsi_half_t;
+typedef long long __attribute__((vector_size(HALF_SIZE))) vdi_half_t;
+# endif
+
+#endif
+
 #if VEC_SIZE == 16
 # define B(n, s, a...)   __builtin_ia32_ ## n ## 128 ## s(a)
 # define B_(n, s, a...)  __builtin_ia32_ ## n ##        s(a)
--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -3046,7 +3046,22 @@ x86_decode(
                 d |= vSIB;
             state->simd_size = ext0f38_table[b].simd_size;
             if ( evex_encoded() )
-                disp8scale = decode_disp8scale(ext0f38_table[b].d8s, state);
+            {
+                /*
+                 * VPMOVUS* are identical to VPMOVS* Disp8-scaling-wise, but
+                 * their attributes don't match those of the vex_66 encoded
+                 * insns with the same base opcodes. Rather than adding new
+                 * columns to the table, handle this here for now.
+                 */
+                if ( evex.pfx != vex_f3 || (b & 0xf8) != 0x10 )
+                    disp8scale = decode_disp8scale(ext0f38_table[b].d8s, state);
+                else
+                {
+                    disp8scale = decode_disp8scale(ext0f38_table[b + 0x10].d8s,
+                                                   state);
+                    state->simd_size = simd_other;
+                }
+            }
             break;
 
         case ext_0f3a:
@@ -8307,10 +8322,14 @@ x86_emulate(
         op_bytes = 16 >> (pmov_convert_delta[b & 7] - vex.l);
         goto simd_0f_int;
 
+    case X86EMUL_OPC_EVEX_F3(0x0f38, 0x10): /* vpmovuswb [xyz]mm,{x,y}mm/mem{k} */
     case X86EMUL_OPC_EVEX_66(0x0f38, 0x20): /* vpmovsxbw {x,y}mm/mem,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_F3(0x0f38, 0x20): /* vpmovswb [xyz]mm,{x,y}mm/mem{k} */
     case X86EMUL_OPC_EVEX_66(0x0f38, 0x30): /* vpmovzxbw {x,y}mm/mem,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_F3(0x0f38, 0x30): /* vpmovwb [xyz]mm,{x,y}mm/mem{k} */
         host_and_vcpu_must_have(avx512bw);
-        /* fall through */
+        if ( evex.pfx != vex_f3 )
+        {
     case X86EMUL_OPC_EVEX_66(0x0f38, 0x21): /* vpmovsxbd xmm/mem,[xyz]mm{k} */
     case X86EMUL_OPC_EVEX_66(0x0f38, 0x22): /* vpmovsxbq xmm/mem,[xyz]mm{k} */
     case X86EMUL_OPC_EVEX_66(0x0f38, 0x23): /* vpmovsxwd {x,y}mm/mem,[xyz]mm{k} */
@@ -8321,7 +8340,28 @@ x86_emulate(
     case X86EMUL_OPC_EVEX_66(0x0f38, 0x33): /* vpmovzxwd {x,y}mm/mem,[xyz]mm{k} */
     case X86EMUL_OPC_EVEX_66(0x0f38, 0x34): /* vpmovzxwq xmm/mem,[xyz]mm{k} */
     case X86EMUL_OPC_EVEX_66(0x0f38, 0x35): /* vpmovzxdq {x,y}mm/mem,[xyz]mm{k} */
-        generate_exception_if(evex.w && (b & 7) == 5, EXC_UD);
+            generate_exception_if(evex.w && (b & 7) == 5, EXC_UD);
+        }
+        else
+        {
+    case X86EMUL_OPC_EVEX_F3(0x0f38, 0x11): /* vpmovusdb [xyz]mm,xmm/mem{k} */
+    case X86EMUL_OPC_EVEX_F3(0x0f38, 0x12): /* vpmovusqb [xyz]mm,xmm/mem{k} */
+    case X86EMUL_OPC_EVEX_F3(0x0f38, 0x13): /* vpmovusdw [xyz]mm,{x,y}mm/mem{k} */
+    case X86EMUL_OPC_EVEX_F3(0x0f38, 0x14): /* vpmovusqw [xyz]mm,xmm/mem{k} */
+    case X86EMUL_OPC_EVEX_F3(0x0f38, 0x15): /* vpmovusqd [xyz]mm,{x,y}mm/mem{k} */
+    case X86EMUL_OPC_EVEX_F3(0x0f38, 0x21): /* vpmovsdb [xyz]mm,xmm/mem{k} */
+    case X86EMUL_OPC_EVEX_F3(0x0f38, 0x22): /* vpmovsqb [xyz]mm,xmm/mem{k} */
+    case X86EMUL_OPC_EVEX_F3(0x0f38, 0x23): /* vpmovsdw [xyz]mm,{x,y}mm/mem{k} */
+    case X86EMUL_OPC_EVEX_F3(0x0f38, 0x24): /* vpmovsqw [xyz]mm,xmm/mem{k} */
+    case X86EMUL_OPC_EVEX_F3(0x0f38, 0x25): /* vpmovsqd [xyz]mm,{x,y}mm/mem{k} */
+    case X86EMUL_OPC_EVEX_F3(0x0f38, 0x31): /* vpmovdb [xyz]mm,xmm/mem{k} */
+    case X86EMUL_OPC_EVEX_F3(0x0f38, 0x32): /* vpmovqb [xyz]mm,xmm/mem{k} */
+    case X86EMUL_OPC_EVEX_F3(0x0f38, 0x33): /* vpmovdw [xyz]mm,{x,y}mm/mem{k} */
+    case X86EMUL_OPC_EVEX_F3(0x0f38, 0x34): /* vpmovqw [xyz]mm,xmm/mem{k} */
+    case X86EMUL_OPC_EVEX_F3(0x0f38, 0x35): /* vpmovqd [xyz]mm,{x,y}mm/mem{k} */
+            generate_exception_if(evex.w, EXC_UD);
+            d = DstMem | SrcReg | TwoOp;
+        }
         op_bytes = 32 >> (pmov_convert_delta[b & 7] + 1 - evex.lr);
         elem_bytes = (b & 7) < 3 ? 1 : (b & 7) != 5 ? 2 : 4;
         goto avx512f_no_sae;




_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

^ permalink raw reply	[flat|nested] 465+ messages in thread

* [PATCH v3 28/34] x86emul: support AVX512{F, BW} integer unpack insns
  2018-09-18 11:46 ` [PATCH v3 00/34] x86emul: fixes, improvements, and beginnings of AVX512 support Jan Beulich
                     ` (26 preceding siblings ...)
  2018-09-18 12:09   ` [PATCH v3 27/34] x86emul: support AVX512{F, BW} down conversion moves Jan Beulich
@ 2018-09-18 12:10   ` Jan Beulich
  2018-09-18 12:11   ` [PATCH v3 29/34] x86emul: support AVX512{F, BW, _VBMI} full permute insns Jan Beulich
                     ` (5 subsequent siblings)
  33 siblings, 0 replies; 465+ messages in thread
From: Jan Beulich @ 2018-09-18 12:10 UTC (permalink / raw)
  To: xen-devel; +Cc: George Dunlap, Andrew Cooper, Wei Liu

There's once again one extra twobyte_table[] entry (0x6b) which gets its
Disp8 shift value set right away without getting support implemented just
yet, again to avoid needlessly splitting groups of entries.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v3: New.

--- a/tools/tests/x86_emulator/evex-disp8.c
+++ b/tools/tests/x86_emulator/evex-disp8.c
@@ -215,6 +215,10 @@ static const struct test avx512f_all[] =
     INSN(pternlog,     66, 0f3a, 25,    vl,     dq, vl),
     INSN(ptestm,       66, 0f38, 27,    vl,     dq, vl),
     INSN(ptestnm,      f3, 0f38, 27,    vl,     dq, vl),
+    INSN(punpckhdq,    66,   0f, 6a,    vl,      d, vl),
+    INSN(punpckhqdq,   66,   0f, 6d,    vl,      q, vl),
+    INSN(punpckldq,    66,   0f, 62,    vl,      d, vl),
+    INSN(punpcklqdq,   66,   0f, 6c,    vl,      q, vl),
     INSN(pxor,         66,   0f, ef,    vl,     dq, vl),
     INSN_PFP(shuf,           0f, c6),
     INSN_FP(sqrt,            0f, 51),
@@ -313,6 +317,10 @@ static const struct test avx512bw_all[]
     INSN(psubw,       66,   0f, f9,    vl,   w, vl),
     INSN(ptestm,      66, 0f38, 26,    vl,  bw, vl),
     INSN(ptestnm,     f3, 0f38, 26,    vl,  bw, vl),
+    INSN(punpckhbw,   66,   0f, 68,    vl,   b, vl),
+    INSN(punpckhwd,   66,   0f, 69,    vl,   w, vl),
+    INSN(punpcklbw,   66,   0f, 60,    vl,   b, vl),
+    INSN(punpcklwd,   66,   0f, 61,    vl,   w, vl),
 };
 
 static const struct test avx512bw_128[] = {
--- a/tools/tests/x86_emulator/simd.c
+++ b/tools/tests/x86_emulator/simd.c
@@ -294,6 +294,10 @@ static inline bool _to_bool(byte_vec_t b
     asm ( "vpbroadcastd %k1, %0" : "=v" (t_) : "r" (x) ); \
     t_; \
 })
+#  if VEC_SIZE == 16
+#   define interleave_hi(x, y) ((vec_t)B(punpckhdq, _mask, (vsi_t)(x), (vsi_t)(y), (vsi_t)undef(), ~0))
+#   define interleave_lo(x, y) ((vec_t)B(punpckldq, _mask, (vsi_t)(x), (vsi_t)(y), (vsi_t)undef(), ~0))
+#  endif
 #  define mix(x, y) ((vec_t)B(movdqa32_, _mask, (vsi_t)(x), (vsi_t)(y), \
                               (0b0101010101010101 & ((1 << ELEM_COUNT) - 1))))
 #  define shrink1(x) ((half_t)B(pmovqd, _mask, (vdi_t)(x), (vsi_half_t){}, ~0))
@@ -311,6 +315,10 @@ static inline bool _to_bool(byte_vec_t b
     t_; \
 })
 #  endif
+#  if VEC_SIZE == 16
+#   define interleave_hi(x, y) ((vec_t)B(punpckhqdq, _mask, (vdi_t)(x), (vdi_t)(y), (vdi_t)undef(), ~0))
+#   define interleave_lo(x, y) ((vec_t)B(punpcklqdq, _mask, (vdi_t)(x), (vdi_t)(y), (vdi_t)undef(), ~0))
+#  endif
 #  define mix(x, y) ((vec_t)B(movdqa64_, _mask, (vdi_t)(x), (vdi_t)(y), 0b01010101))
 # endif
 # if INT_SIZE == 4
--- a/tools/tests/x86_emulator/simd.h
+++ b/tools/tests/x86_emulator/simd.h
@@ -198,6 +198,10 @@ OVR(pmovzxbq);
 OVR(pmovzxdq);
 OVR(pmovzxwd);
 OVR(pmovzxwq);
+OVR(punpckhdq);
+OVR(punpckhqdq);
+OVR(punpckldq);
+OVR(punpcklqdq);
 OVR_VFP(shuf);
 OVR_INT(sll);
 OVR_DQ(sllv);
--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -310,10 +310,10 @@ static const struct twobyte_table {
     [0x58 ... 0x59] = { DstImplicit|SrcMem|ModRM, simd_any_fp, d8s_vl },
     [0x5a ... 0x5b] = { DstImplicit|SrcMem|ModRM|Mov, simd_other },
     [0x5c ... 0x5f] = { DstImplicit|SrcMem|ModRM, simd_any_fp, d8s_vl },
-    [0x60 ... 0x62] = { DstImplicit|SrcMem|ModRM, simd_other },
+    [0x60 ... 0x62] = { DstImplicit|SrcMem|ModRM, simd_other, d8s_vl },
     [0x63 ... 0x67] = { DstImplicit|SrcMem|ModRM, simd_packed_int, d8s_vl },
-    [0x68 ... 0x6a] = { DstImplicit|SrcMem|ModRM, simd_other },
-    [0x6b ... 0x6d] = { DstImplicit|SrcMem|ModRM, simd_packed_int },
+    [0x68 ... 0x6a] = { DstImplicit|SrcMem|ModRM, simd_other, d8s_vl },
+    [0x6b ... 0x6d] = { DstImplicit|SrcMem|ModRM, simd_packed_int, d8s_vl },
     [0x6e] = { DstImplicit|SrcMem|ModRM|Mov, simd_none, d8s_dq },
     [0x6f] = { DstImplicit|SrcMem|ModRM|Mov, simd_packed_int, d8s_vl },
     [0x70] = { SrcImmByte|ModRM|TwoOp, simd_other },
@@ -6632,6 +6632,12 @@ x86_emulate(
         get_fpu(X86EMUL_FPU_mmx);
         goto simd_0f_common;
 
+    case X86EMUL_OPC_EVEX_66(0x0f, 0x60): /* vpunpcklbw [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f, 0x61): /* vpunpcklwd [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f, 0x68): /* vpunpckhbw [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f, 0x69): /* vpunpckhwd [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+        op_bytes = 16 << evex.lr;
+        /* fall through */
     case X86EMUL_OPC_EVEX_66(0x0f, 0xd1): /* vpsrlw xmm/m128,[xyz]mm,[xyz]mm{k} */
     case X86EMUL_OPC_EVEX_66(0x0f, 0xe1): /* vpsraw xmm/m128,[xyz]mm,[xyz]mm{k} */
     case X86EMUL_OPC_EVEX_66(0x0f, 0xf1): /* vpsllw xmm/m128,[xyz]mm,[xyz]mm{k} */
@@ -6660,6 +6666,13 @@ x86_emulate(
         elem_bytes = 1 << (b & 1);
         goto avx512f_no_sae;
 
+    case X86EMUL_OPC_EVEX_66(0x0f, 0x62): /* vpunpckldq [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f, 0x6a): /* vpunpckhdq [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+        generate_exception_if(evex.w, EXC_UD);
+        fault_suppression = false;
+        op_bytes = 16 << evex.lr;
+        goto avx512f_no_sae;
+
     case X86EMUL_OPC_EVEX_66(0x0f, 0xd2): /* vpsrld xmm/m128,[xyz]mm,[xyz]mm{k} */
     case X86EMUL_OPC_EVEX_66(0x0f, 0xd3): /* vpsrlq xmm/m128,[xyz]mm,[xyz]mm{k} */
     case X86EMUL_OPC_EVEX_66(0x0f, 0xe2): /* vpsra{d,q} xmm/m128,[xyz]mm,[xyz]mm{k} */
@@ -6702,6 +6715,10 @@ x86_emulate(
         avx512_vlen_check(false);
         goto simd_zmm;
 
+    case X86EMUL_OPC_EVEX_66(0x0f, 0x6c): /* vpunpcklqdq [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f, 0x6d): /* vpunpckhqdq [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+        fault_suppression = false;
+        /* fall through */
     case X86EMUL_OPC_EVEX_66(0x0f, 0xd4): /* vpaddq [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
     case X86EMUL_OPC_EVEX_66(0x0f, 0xf4): /* vpmuludq [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
     case X86EMUL_OPC_EVEX_66(0x0f38, 0x28): /* vpmuldq [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */




_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

^ permalink raw reply	[flat|nested] 465+ messages in thread

* [PATCH v3 29/34] x86emul: support AVX512{F, BW, _VBMI} full permute insns
  2018-09-18 11:46 ` [PATCH v3 00/34] x86emul: fixes, improvements, and beginnings of AVX512 support Jan Beulich
                     ` (27 preceding siblings ...)
  2018-09-18 12:10   ` [PATCH v3 28/34] x86emul: support AVX512{F, BW} integer unpack insns Jan Beulich
@ 2018-09-18 12:11   ` Jan Beulich
  2018-09-18 12:11   ` [PATCH v3 29/34] x86emul: support AVX512{F, BW} integer shuffle insns Jan Beulich
                     ` (4 subsequent siblings)
  33 siblings, 0 replies; 465+ messages in thread
From: Jan Beulich @ 2018-09-18 12:11 UTC (permalink / raw)
  To: xen-devel; +Cc: George Dunlap, Andrew Cooper, Wei Liu

Take the liberty to also correct the (public interface) name of the
AVX512_VBMI feature flag, on the assumption that no external consumer
has actually been using that flag so far. Furthermore make it have
AVX512BW instead of AVX512F as a prerequisite, since it requires full
64-bit mask registers (the upper 48 bits of which can't be accessed
other than through XSAVE/XRSTOR without AVX512BW support).

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v3: New.

--- a/tools/tests/x86_emulator/evex-disp8.c
+++ b/tools/tests/x86_emulator/evex-disp8.c
@@ -159,6 +159,10 @@ static const struct test avx512f_all[] =
     INSN(pcmpgtd,      66,   0f, 66,    vl,      d, vl),
     INSN(pcmpgtq,      66, 0f38, 37,    vl,      q, vl),
     INSN(pcmpu,        66, 0f3a, 1e,    vl,     dq, vl),
+    INSN(permi2,       66, 0f38, 76,    vl,     dq, vl),
+    INSN(permi2,       66, 0f38, 77,    vl,     sd, vl),
+    INSN(permt2,       66, 0f38, 7e,    vl,     dq, vl),
+    INSN(permt2,       66, 0f38, 7f,    vl,     sd, vl),
     INSN(pmaxs,        66, 0f38, 3d,    vl,     dq, vl),
     INSN(pmaxu,        66, 0f38, 3f,    vl,     dq, vl),
     INSN(pmins,        66, 0f38, 39,    vl,     dq, vl),
@@ -280,6 +284,8 @@ static const struct test avx512bw_all[]
     INSN(pcmpgtb,     66,   0f, 64,    vl,   b, vl),
     INSN(pcmpgtw,     66,   0f, 65,    vl,   w, vl),
     INSN(pcmpu,       66, 0f3a, 3e,    vl,  bw, vl),
+    INSN(permi2w,     66, 0f38, 75,    vl,   w, vl),
+    INSN(permt2w,     66, 0f38, 7d,    vl,   w, vl),
     INSN(pmaddwd,     66,   0f, f5,    vl,   w, vl),
     INSN(pmaxsb,      66, 0f38, 3c,    vl,   b, vl),
     INSN(pmaxsw,      66,   0f, ee,    vl,   w, vl),
@@ -364,6 +370,11 @@ static const struct test avx512dq_512[]
     INSN(inserti32x8,    66, 0f3a, 3a, el_8, d, vl),
 };
 
+static const struct test avx512_vbmi_all[] = {
+    INSN(permi2b,       66, 0f38, 75, vl, b, vl),
+    INSN(permt2b,       66, 0f38, 7d, vl, b, vl),
+};
+
 static const unsigned char vl_all[] = { VL_512, VL_128, VL_256 };
 static const unsigned char vl_128[] = { VL_128 };
 static const unsigned char vl_no128[] = { VL_512, VL_256 };
@@ -684,4 +695,5 @@ void evex_disp8_test(void *instr, struct
     RUN(avx512dq, 128);
     RUN(avx512dq, no128);
     RUN(avx512dq, 512);
+    RUN(avx512_vbmi, all);
 }
--- a/tools/tests/x86_emulator/simd.c
+++ b/tools/tests/x86_emulator/simd.c
@@ -144,6 +144,9 @@ static inline bool _to_bool(byte_vec_t b
 #    define interleave_hi(x, y) B(unpckhps, _mask, x, y, undef(), ~0)
 #    define interleave_lo(x, y) B(unpcklps, _mask, x, y, undef(), ~0)
 #    define swap(x) B(shufps, _mask, x, x, 0b00011011, undef(), ~0)
+#   else
+#    define interleave_hi(x, y) B(vpermi2varps, _mask, x, interleave_hi, y, ~0)
+#    define interleave_lo(x, y) B(vpermt2varps, _mask, interleave_lo, x, y, ~0)
 #   endif
 #  elif FLOAT_SIZE == 8
 #   if VEC_SIZE >= 32
@@ -168,6 +171,9 @@ static inline bool _to_bool(byte_vec_t b
 #    define interleave_hi(x, y) B(unpckhpd, _mask, x, y, undef(), ~0)
 #    define interleave_lo(x, y) B(unpcklpd, _mask, x, y, undef(), ~0)
 #    define swap(x) B(shufpd, _mask, x, x, 0b01, undef(), ~0)
+#   else
+#    define interleave_hi(x, y) B(vpermi2varpd, _mask, x, interleave_hi, y, ~0)
+#    define interleave_lo(x, y) B(vpermt2varpd, _mask, interleave_lo, x, y, ~0)
 #   endif
 #  endif
 # endif
@@ -297,6 +303,9 @@ static inline bool _to_bool(byte_vec_t b
 #  if VEC_SIZE == 16
 #   define interleave_hi(x, y) ((vec_t)B(punpckhdq, _mask, (vsi_t)(x), (vsi_t)(y), (vsi_t)undef(), ~0))
 #   define interleave_lo(x, y) ((vec_t)B(punpckldq, _mask, (vsi_t)(x), (vsi_t)(y), (vsi_t)undef(), ~0))
+#  else
+#   define interleave_hi(x, y) ((vec_t)B(vpermi2vard, _mask, (vsi_t)(x), interleave_hi, (vsi_t)(y), ~0))
+#   define interleave_lo(x, y) ((vec_t)B(vpermt2vard, _mask, interleave_lo, (vsi_t)(x), (vsi_t)(y), ~0))
 #  endif
 #  define mix(x, y) ((vec_t)B(movdqa32_, _mask, (vsi_t)(x), (vsi_t)(y), \
                               (0b0101010101010101 & ((1 << ELEM_COUNT) - 1))))
@@ -318,6 +327,9 @@ static inline bool _to_bool(byte_vec_t b
 #  if VEC_SIZE == 16
 #   define interleave_hi(x, y) ((vec_t)B(punpckhqdq, _mask, (vdi_t)(x), (vdi_t)(y), (vdi_t)undef(), ~0))
 #   define interleave_lo(x, y) ((vec_t)B(punpcklqdq, _mask, (vdi_t)(x), (vdi_t)(y), (vdi_t)undef(), ~0))
+#  else
+#   define interleave_hi(x, y) ((vec_t)B(vpermi2varq, _mask, (vdi_t)(x), interleave_hi, (vdi_t)(y), ~0))
+#   define interleave_lo(x, y) ((vec_t)B(vpermt2varq, _mask, interleave_lo, (vdi_t)(x), (vdi_t)(y), ~0))
 #  endif
 #  define mix(x, y) ((vec_t)B(movdqa64_, _mask, (vdi_t)(x), (vdi_t)(y), 0b01010101))
 # endif
@@ -763,6 +775,7 @@ int simd_test(void)
 {
     unsigned int i, j;
     vec_t x, y, z, src, inv, alt, sh;
+    vint_t interleave_lo, interleave_hi;
 
     for ( i = 0, j = ELEM_SIZE << 3; i < ELEM_COUNT; ++i )
     {
@@ -776,6 +789,9 @@ int simd_test(void)
         if ( !(i & (i + 1)) )
             --j;
         sh[i] = j;
+
+        interleave_lo[i] = ((i & 1) * ELEM_COUNT) | (i >> 1);
+        interleave_hi[i] = interleave_lo[i] + (ELEM_COUNT / 2);
     }
 
     touch(src);
@@ -1069,7 +1085,7 @@ int simd_test(void)
     x = src * alt;
     y = interleave_lo(x, alt < 0);
     touch(x);
-    z = widen1(x);
+    z = widen1(low_half(x));
     touch(x);
     if ( !eq(z, y) ) return __LINE__;
 
@@ -1101,7 +1117,7 @@ int simd_test(void)
 
 # ifdef widen1
     touch(src);
-    x = widen1(src);
+    x = widen1(low_half(src));
     touch(src);
     if ( !eq(x, y) ) return __LINE__;
 # endif
--- a/tools/tests/x86_emulator/simd.h
+++ b/tools/tests/x86_emulator/simd.h
@@ -70,6 +70,16 @@ typedef int __attribute__((vector_size(V
 typedef long long __attribute__((vector_size(VEC_SIZE))) vdi_t;
 #endif
 
+#if ELEM_SIZE == 1
+typedef vqi_t vint_t;
+#elif ELEM_SIZE == 2
+typedef vhi_t vint_t;
+#elif ELEM_SIZE == 4
+typedef vsi_t vint_t;
+#elif ELEM_SIZE == 8
+typedef vdi_t vint_t;
+#endif
+
 #if VEC_SIZE >= 16
 
 # if ELEM_COUNT >= 2
--- a/tools/tests/x86_emulator/x86-emulate.h
+++ b/tools/tests/x86_emulator/x86-emulate.h
@@ -279,6 +279,16 @@ static inline uint64_t xgetbv(uint32_t x
     (res.b & (1U << 31)) != 0; \
 })
 
+#define cpu_has_avx512_vbmi ({ \
+    struct cpuid_leaf res; \
+    emul_test_cpuid(1, 0, &res, NULL); \
+    if ( !(res.c & (1U << 27)) || ((xgetbv(0) & 0xe6) != 0xe6) ) \
+        res.c = 0; \
+    else \
+        emul_test_cpuid(7, 0, &res, NULL); \
+    (res.c & (1U << 1)) != 0; \
+})
+
 int emul_test_cpuid(
     uint32_t leaf,
     uint32_t subleaf,
--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -466,9 +466,13 @@ static const struct ext0f38_table {
     [0x59] = { .simd_size = simd_other, .two_op = 1, .d8s = 3 },
     [0x5a] = { .simd_size = simd_128, .two_op = 1, .d8s = 4 },
     [0x5b] = { .simd_size = simd_256, .two_op = 1, .d8s = d8s_vl_by_2 },
+    [0x75 ... 0x76] = { .simd_size = simd_packed_int, .d8s = d8s_vl },
+    [0x77] = { .simd_size = simd_packed_fp, .d8s = d8s_vl },
     [0x78] = { .simd_size = simd_other, .two_op = 1 },
     [0x79] = { .simd_size = simd_other, .two_op = 1, .d8s = 1 },
     [0x7a ... 0x7c] = { .simd_size = simd_none, .two_op = 1 },
+    [0x7d ... 0x7e] = { .simd_size = simd_packed_int, .d8s = d8s_vl },
+    [0x7f] = { .simd_size = simd_packed_fp, .d8s = d8s_vl },
     [0x8c] = { .simd_size = simd_packed_int },
     [0x8e] = { .simd_size = simd_packed_int, .to_mem = 1 },
     [0x90 ... 0x93] = { .simd_size = simd_other, .vsib = 1 },
@@ -1850,6 +1854,7 @@ static bool vcpu_has(
 #define vcpu_has_sha()         vcpu_has(         7, EBX, 29, ctxt, ops)
 #define vcpu_has_avx512bw()    vcpu_has(         7, EBX, 30, ctxt, ops)
 #define vcpu_has_avx512vl()    vcpu_has(         7, EBX, 31, ctxt, ops)
+#define vcpu_has_avx512_vbmi() vcpu_has(         7, ECX,  1, ctxt, ops)
 #define vcpu_has_rdpid()       vcpu_has(         7, ECX, 22, ctxt, ops)
 #define vcpu_has_clzero()      vcpu_has(0x80000008, EBX,  0, ctxt, ops)
 
@@ -6001,6 +6006,11 @@ x86_emulate(
     CASE_SIMD_PACKED_FP(_EVEX, 0x0f, 0x15): /* vunpckhp{s,d} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
         generate_exception_if(evex.w != (evex.pfx & VEX_PREFIX_DOUBLE_MASK),
                               EXC_UD);
+        /* fall through */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0x76): /* vpermi2{d,q} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0x77): /* vpermi2p{s,d} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0x7e): /* vpermt2{d,q} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0x7f): /* vpermt2p{s,d} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
         fault_suppression = false;
         /* fall through */
     case X86EMUL_OPC_EVEX_66(0x0f, 0xdb): /* vpand{d,q} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
@@ -8511,6 +8521,16 @@ x86_emulate(
         generate_exception_if(ea.type != OP_MEM || !vex.l || vex.w, EXC_UD);
         goto simd_0f_avx2;
 
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0x75): /* vpermi2{b,w} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0x7d): /* vpermt2{b,w} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+        if ( !evex.w )
+            host_and_vcpu_must_have(avx512_vbmi);
+        else
+            host_and_vcpu_must_have(avx512bw);
+        generate_exception_if(evex.br, EXC_UD);
+        fault_suppression = false;
+        goto avx512f_no_sae;
+
     case X86EMUL_OPC_EVEX_66(0x0f38, 0x78): /* vpbroadcastb xmm/m8,[xyz]mm{k} */
     case X86EMUL_OPC_EVEX_66(0x0f38, 0x79): /* vpbroadcastw xmm/m16,[xyz]mm{k} */
         host_and_vcpu_must_have(avx512bw);
--- a/xen/include/asm-x86/cpufeature.h
+++ b/xen/include/asm-x86/cpufeature.h
@@ -107,6 +107,9 @@
 #define cpu_has_avx512bw        boot_cpu_has(X86_FEATURE_AVX512BW)
 #define cpu_has_avx512vl        boot_cpu_has(X86_FEATURE_AVX512VL)
 
+/* CPUID level 0x00000007:0.ecx */
+#define cpu_has_avx512_vbmi     boot_cpu_has(X86_FEATURE_AVX512_VBMI)
+
 /* CPUID level 0x80000007.edx */
 #define cpu_has_itsc            boot_cpu_has(X86_FEATURE_ITSC)
 
--- a/xen/include/public/arch-x86/cpufeatureset.h
+++ b/xen/include/public/arch-x86/cpufeatureset.h
@@ -224,7 +224,7 @@ XEN_CPUFEATURE(AVX512VL,      5*32+31) /
 
 /* Intel-defined CPU features, CPUID level 0x00000007:0.ecx, word 6 */
 XEN_CPUFEATURE(PREFETCHWT1,   6*32+ 0) /*A  PREFETCHWT1 instruction */
-XEN_CPUFEATURE(AVX512VBMI,    6*32+ 1) /*A  AVX-512 Vector Byte Manipulation Instrs */
+XEN_CPUFEATURE(AVX512_VBMI,   6*32+ 1) /*A  AVX-512 Vector Byte Manipulation Instrs */
 XEN_CPUFEATURE(UMIP,          6*32+ 2) /*S  User Mode Instruction Prevention */
 XEN_CPUFEATURE(PKU,           6*32+ 3) /*H  Protection Keys for Userspace */
 XEN_CPUFEATURE(OSPKE,         6*32+ 4) /*!  OS Protection Keys Enable */
--- a/xen/tools/gen-cpuid.py
+++ b/xen/tools/gen-cpuid.py
@@ -254,12 +254,17 @@ def crunch_numbers(state):
         AVX2: [AVX512F],
 
         # AVX512F is taken to mean hardware support for 512bit registers
-        # (which in practice depends on the EVEX prefix to encode), and the
-        # instructions themselves. All further AVX512 features are built on
-        # top of AVX512F
+        # (which in practice depends on the EVEX prefix to encode) as well
+        # as mask registers, and the instructions themselves. All further
+        # AVX512 features are built on top of AVX512F
         AVX512F: [AVX512DQ, AVX512IFMA, AVX512PF, AVX512ER, AVX512CD,
-                  AVX512BW, AVX512VL, AVX512VBMI, AVX512_4VNNIW,
-                  AVX512_4FMAPS, AVX512_VPOPCNTDQ],
+                  AVX512BW, AVX512VL, AVX512_4VNNIW, AVX512_4FMAPS,
+                  AVX512_VPOPCNTDQ],
+
+        # AVX512 extensions acting solely on vectors of bytes/words are made
+        # dependents of AVX512BW (as to requiring wider than 16-bit mask
+        # registers), despite the SDM not formally making this connection.
+        AVX512BW: [AVX512_VBMI],
 
         # The features:
         #   * Single Thread Indirect Branch Predictors




_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

^ permalink raw reply	[flat|nested] 465+ messages in thread

* [PATCH v3 29/34] x86emul: support AVX512{F, BW} integer shuffle insns
  2018-09-18 11:46 ` [PATCH v3 00/34] x86emul: fixes, improvements, and beginnings of AVX512 support Jan Beulich
                     ` (28 preceding siblings ...)
  2018-09-18 12:11   ` [PATCH v3 29/34] x86emul: support AVX512{F, BW, _VBMI} full permute insns Jan Beulich
@ 2018-09-18 12:11   ` Jan Beulich
  2018-09-18 12:12   ` [PATCH v3 30/34] x86emul: support AVX512{BW, DQ} mask move insns Jan Beulich
                     ` (3 subsequent siblings)
  33 siblings, 0 replies; 465+ messages in thread
From: Jan Beulich @ 2018-09-18 12:11 UTC (permalink / raw)
  To: xen-devel; +Cc: George Dunlap, Andrew Cooper, Wei Liu

Also include shuff{32x4,64x2} as being very similar to shufi{32x4,64x2}.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v3: New.

--- a/tools/tests/x86_emulator/evex-disp8.c
+++ b/tools/tests/x86_emulator/evex-disp8.c
@@ -200,6 +200,7 @@ static const struct test avx512f_all[] =
     INSN(prolv,        66, 0f38, 15,    vl,     dq, vl),
     INSNX(pror,        66,   0f, 72, 0, vl,     dq, vl),
     INSN(prorv,        66, 0f38, 14,    vl,     dq, vl),
+    INSN(pshufd,       66,   0f, 70,    vl,      d, vl),
     INSN(pslld,        66,   0f, f2,    el_4,    d, vl),
     INSNX(pslld,       66,   0f, 72, 6, vl,      d, vl),
     INSN(psllq,        66,   0f, f3,    el_2,    q, vl),
@@ -250,6 +251,10 @@ static const struct test avx512f_no128[]
     INSN(extracti32x4,   66, 0f3a, 39, el_4,  d, vl),
     INSN(insertf32x4,    66, 0f3a, 18, el_4,  d, vl),
     INSN(inserti32x4,    66, 0f3a, 38, el_4,  d, vl),
+    INSN(shuff32x4,      66, 0f3a, 23, vl,    d, vl),
+    INSN(shuff64x2,      66, 0f3a, 23, vl,    q, vl),
+    INSN(shufi32x4,      66, 0f3a, 43, vl,    d, vl),
+    INSN(shufi64x2,      66, 0f3a, 43, vl,    q, vl),
 };
 
 static const struct test avx512f_512[] = {
@@ -304,6 +309,9 @@ static const struct test avx512bw_all[]
     INSN(pmulhw,      66,   0f, e5,    vl,   w, vl),
     INSN(pmullw,      66,   0f, d5,    vl,   w, vl),
     INSN(psadbw,      66,   0f, f6,    vl,   b, vl),
+    INSN(pshufb,      66, 0f38, 00,    vl,   b, vl),
+    INSN(pshufhw,     f3,   0f, 70,    vl,   w, vl),
+    INSN(pshuflw,     f2,   0f, 70,    vl,   w, vl),
     INSNX(pslldq,     66,   0f, 73, 7, vl,   b, vl),
     INSN(psllvw,      66, 0f38, 12,    vl,   w, vl),
     INSN(psllw,       66,   0f, f1,    el_8, w, vl),
--- a/tools/tests/x86_emulator/simd.c
+++ b/tools/tests/x86_emulator/simd.c
@@ -147,6 +147,10 @@ static inline bool _to_bool(byte_vec_t b
 #   else
 #    define interleave_hi(x, y) B(vpermi2varps, _mask, x, interleave_hi, y, ~0)
 #    define interleave_lo(x, y) B(vpermt2varps, _mask, interleave_lo, x, y, ~0)
+#    define swap(x) ({ \
+    vec_t t_ = B(shuf_f32x4_, _mask, x, x, VEC_SIZE == 32 ? 0b01 : 0b00011011, undef(), ~0); \
+    B(shufps, _mask, t_, t_, 0b00011011, undef(), ~0); \
+})
 #   endif
 #  elif FLOAT_SIZE == 8
 #   if VEC_SIZE >= 32
@@ -174,6 +178,10 @@ static inline bool _to_bool(byte_vec_t b
 #   else
 #    define interleave_hi(x, y) B(vpermi2varpd, _mask, x, interleave_hi, y, ~0)
 #    define interleave_lo(x, y) B(vpermt2varpd, _mask, interleave_lo, x, y, ~0)
+#    define swap(x) ({ \
+    vec_t t_ = B(shuf_f64x2_, _mask, x, x, VEC_SIZE == 32 ? 0b01 : 0b00011011, undef(), ~0); \
+    B(shufpd, _mask, t_, t_, 0b01010101, undef(), ~0); \
+})
 #   endif
 #  endif
 # endif
@@ -303,9 +311,14 @@ static inline bool _to_bool(byte_vec_t b
 #  if VEC_SIZE == 16
 #   define interleave_hi(x, y) ((vec_t)B(punpckhdq, _mask, (vsi_t)(x), (vsi_t)(y), (vsi_t)undef(), ~0))
 #   define interleave_lo(x, y) ((vec_t)B(punpckldq, _mask, (vsi_t)(x), (vsi_t)(y), (vsi_t)undef(), ~0))
+#   define swap(x) ((vec_t)B(pshufd, _mask, (vsi_t)(x), 0b00011011, (vsi_t)undef(), ~0))
 #  else
 #   define interleave_hi(x, y) ((vec_t)B(vpermi2vard, _mask, (vsi_t)(x), interleave_hi, (vsi_t)(y), ~0))
 #   define interleave_lo(x, y) ((vec_t)B(vpermt2vard, _mask, interleave_lo, (vsi_t)(x), (vsi_t)(y), ~0))
+#   define swap(x) ((vec_t)B(pshufd, _mask, \
+                             B(shuf_i32x4_, _mask, (vsi_t)(x), (vsi_t)(x), \
+                               VEC_SIZE == 32 ? 0b01 : 0b00011011, (vsi_t)undef(), ~0), \
+                             0b00011011, (vsi_t)undef(), ~0))
 #  endif
 #  define mix(x, y) ((vec_t)B(movdqa32_, _mask, (vsi_t)(x), (vsi_t)(y), \
                               (0b0101010101010101 & ((1 << ELEM_COUNT) - 1))))
@@ -327,9 +340,14 @@ static inline bool _to_bool(byte_vec_t b
 #  if VEC_SIZE == 16
 #   define interleave_hi(x, y) ((vec_t)B(punpckhqdq, _mask, (vdi_t)(x), (vdi_t)(y), (vdi_t)undef(), ~0))
 #   define interleave_lo(x, y) ((vec_t)B(punpcklqdq, _mask, (vdi_t)(x), (vdi_t)(y), (vdi_t)undef(), ~0))
+#   define swap(x) ((vec_t)B(pshufd, _mask, (vsi_t)(x), 0b01001110, (vsi_t)undef(), ~0))
 #  else
 #   define interleave_hi(x, y) ((vec_t)B(vpermi2varq, _mask, (vdi_t)(x), interleave_hi, (vdi_t)(y), ~0))
 #   define interleave_lo(x, y) ((vec_t)B(vpermt2varq, _mask, interleave_lo, (vdi_t)(x), (vdi_t)(y), ~0))
+#   define swap(x) ((vec_t)B(pshufd, _mask, \
+                             (vsi_t)B(shuf_i64x2_, _mask, (vdi_t)(x), (vdi_t)(x), \
+                                      VEC_SIZE == 32 ? 0b01 : 0b00011011, (vdi_t)undef(), ~0), \
+                             0b01001110, (vsi_t)undef(), ~0))
 #  endif
 #  define mix(x, y) ((vec_t)B(movdqa64_, _mask, (vdi_t)(x), (vdi_t)(y), 0b01010101))
 # endif
--- a/tools/tests/x86_emulator/simd.h
+++ b/tools/tests/x86_emulator/simd.h
@@ -119,6 +119,12 @@ typedef long long __attribute__((vector_
 
 #ifdef __AVX512F__
 
+/* Sadly there are a few exceptions to the general naming rules. */
+#define __builtin_ia32_shuf_f32x4_512_mask __builtin_ia32_shuf_f32x4_mask
+#define __builtin_ia32_shuf_f64x2_512_mask __builtin_ia32_shuf_f64x2_mask
+#define __builtin_ia32_shuf_i32x4_512_mask __builtin_ia32_shuf_i32x4_mask
+#define __builtin_ia32_shuf_i64x2_512_mask __builtin_ia32_shuf_i64x2_mask
+
 #if VEC_SIZE < 64
 # pragma GCC target ( "avx512vl" )
 #endif
@@ -208,6 +214,7 @@ OVR(pmovzxbq);
 OVR(pmovzxdq);
 OVR(pmovzxwd);
 OVR(pmovzxwq);
+OVR(pshufd);
 OVR(punpckhdq);
 OVR(punpckhqdq);
 OVR(punpckldq);
--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -316,7 +316,7 @@ static const struct twobyte_table {
     [0x6b ... 0x6d] = { DstImplicit|SrcMem|ModRM, simd_packed_int, d8s_vl },
     [0x6e] = { DstImplicit|SrcMem|ModRM|Mov, simd_none, d8s_dq },
     [0x6f] = { DstImplicit|SrcMem|ModRM|Mov, simd_packed_int, d8s_vl },
-    [0x70] = { SrcImmByte|ModRM|TwoOp, simd_other },
+    [0x70] = { SrcImmByte|ModRM|TwoOp, simd_other, d8s_vl },
     [0x71 ... 0x73] = { DstImplicit|SrcImmByte|ModRM, simd_none, d8s_vl },
     [0x74 ... 0x76] = { DstImplicit|SrcMem|ModRM, simd_packed_int, d8s_vl },
     [0x77] = { DstImplicit|SrcNone },
@@ -430,7 +430,8 @@ static const struct ext0f38_table {
     uint8_t vsib:1;
     disp8scale_t d8s:4;
 } ext0f38_table[256] = {
-    [0x00 ... 0x0b] = { .simd_size = simd_packed_int },
+    [0x00] = { .simd_size = simd_packed_int, .d8s = d8s_vl },
+    [0x01 ... 0x0b] = { .simd_size = simd_packed_int },
     [0x0c ... 0x0f] = { .simd_size = simd_packed_fp },
     [0x10 ... 0x12] = { .simd_size = simd_packed_int, .d8s = d8s_vl },
     [0x13] = { .simd_size = simd_other, .two_op = 1 },
@@ -541,6 +542,7 @@ static const struct ext0f3a_table {
     [0x20] = { .simd_size = simd_none, .d8s = 0 },
     [0x21] = { .simd_size = simd_other, .d8s = 2 },
     [0x22] = { .simd_size = simd_none, .d8s = d8s_dq },
+    [0x23] = { .simd_size = simd_packed_int, .d8s = d8s_vl },
     [0x25] = { .simd_size = simd_packed_int, .d8s = d8s_vl },
     [0x30 ... 0x33] = { .simd_size = simd_other, .two_op = 1 },
     [0x38] = { .simd_size = simd_128, .d8s = 4 },
@@ -550,6 +552,7 @@ static const struct ext0f3a_table {
     [0x3e ... 0x3f] = { .simd_size = simd_packed_int, .d8s = d8s_vl },
     [0x40 ... 0x41] = { .simd_size = simd_packed_fp },
     [0x42] = { .simd_size = simd_packed_int },
+    [0x43] = { .simd_size = simd_packed_int, .d8s = d8s_vl },
     [0x44] = { .simd_size = simd_packed_int },
     [0x46] = { .simd_size = simd_packed_int },
     [0x48 ... 0x49] = { .simd_size = simd_packed_fp, .four_op = 1 },
@@ -6653,6 +6656,7 @@ x86_emulate(
     case X86EMUL_OPC_EVEX_66(0x0f, 0xf1): /* vpsllw xmm/m128,[xyz]mm,[xyz]mm{k} */
     case X86EMUL_OPC_EVEX_66(0x0f, 0xf5): /* vpmaddwd [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
     case X86EMUL_OPC_EVEX_66(0x0f, 0xf6): /* vpsadbw [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0x00): /* vpshufb [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
         fault_suppression = false;
         /* fall through */
     case X86EMUL_OPC_EVEX_66(0x0f, 0xd5): /* vpmullw [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
@@ -6924,6 +6928,20 @@ x86_emulate(
         insn_bytes = PFX_BYTES + 3;
         break;
 
+    case X86EMUL_OPC_EVEX_66(0x0f, 0x70): /* vpshufd $imm8,[xyz]mm/mem,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_F3(0x0f, 0x70): /* vpshufhw $imm8,[xyz]mm/mem,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_F2(0x0f, 0x70): /* vpshuflw $imm8,[xyz]mm/mem,[xyz]mm{k} */
+        if ( evex.pfx == vex_66 )
+            generate_exception_if(evex.w, EXC_UD);
+        else
+        {
+            host_and_vcpu_must_have(avx512bw);
+            generate_exception_if(evex.br, EXC_UD);
+        }
+        d = (d & ~SrcMask) | SrcMem | TwoOp;
+        op_bytes = 16 << evex.lr;
+        goto avx512f_imm_no_sae;
+
     CASE_SIMD_PACKED_INT(0x0f, 0x71):    /* Grp12 */
     case X86EMUL_OPC_VEX_66(0x0f, 0x71):
     CASE_SIMD_PACKED_INT(0x0f, 0x72):    /* Grp13 */
@@ -9093,7 +9111,13 @@ x86_emulate(
                                             /* vextracti64x2 $imm8,{y,z}mm,xmm/m128{k} */
         if ( evex.w )
             host_and_vcpu_must_have(avx512dq);
-        generate_exception_if(!evex.lr || evex.br, EXC_UD);
+        generate_exception_if(evex.br, EXC_UD);
+        /* fall through */
+    case X86EMUL_OPC_EVEX_66(0x0f3a, 0x23): /* vshuff32x4 $imm8,[xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+                                            /* vshuff64x2 $imm8,[xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f3a, 0x43): /* vshufi32x4 $imm8,[xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+                                            /* vshufi64x2 $imm8,[xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+        generate_exception_if(!evex.lr, EXC_UD);
         fault_suppression = false;
         goto avx512f_imm_no_sae;
 




_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

^ permalink raw reply	[flat|nested] 465+ messages in thread

* [PATCH v3 30/34] x86emul: support AVX512{BW, DQ} mask move insns
  2018-09-18 11:46 ` [PATCH v3 00/34] x86emul: fixes, improvements, and beginnings of AVX512 support Jan Beulich
                     ` (29 preceding siblings ...)
  2018-09-18 12:11   ` [PATCH v3 29/34] x86emul: support AVX512{F, BW} integer shuffle insns Jan Beulich
@ 2018-09-18 12:12   ` Jan Beulich
  2018-09-18 12:14   ` [PATCH v3 32/34] x86emul: basic AVX512BW testing Jan Beulich
                     ` (2 subsequent siblings)
  33 siblings, 0 replies; 465+ messages in thread
From: Jan Beulich @ 2018-09-18 12:12 UTC (permalink / raw)
  To: xen-devel; +Cc: George Dunlap, Andrew Cooper, Wei Liu

Entries are added to the tables in evex-disp8.c (as comments) even though
these insns do not allow for memory operands, so that the tables end up
giving a complete picture of the supported EVEX-encoded insns.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v3: New.

--- a/tools/tests/x86_emulator/evex-disp8.c
+++ b/tools/tests/x86_emulator/evex-disp8.c
@@ -300,9 +300,12 @@ static const struct test avx512bw_all[]
     INSN(pminsw,      66,   0f, ea,    vl,   w, vl),
     INSN(pminub,      66,   0f, da,    vl,   b, vl),
     INSN(pminuw,      66, 0f38, 3a,    vl,   w, vl),
+//       pmovb2m,     f3, 0f38, 29,          b
+//       pmovm2,      f3, 0f38, 28,         bw
     INSN(pmovswb,     f3, 0f38, 20,    vl_2, b, vl),
     INSN(pmovsxbw,    66, 0f38, 20,    vl_2, b, vl),
     INSN(pmovuswb,    f3, 0f38, 10,    vl_2, b, vl),
+//       pmovw2m,     f3, 0f38, 29,          w
     INSN(pmovwb,      f3, 0f38, 30,    vl_2, b, vl),
     INSN(pmovzxbw,    66, 0f38, 30,    vl_2, b, vl),
     INSN(pmulhuw,     66,   0f, e4,    vl,   w, vl),
@@ -350,6 +353,9 @@ static const struct test avx512dq_all[]
     INSN_PFP(andn,             0f, 55),
     INSN(broadcasti32x2, 66, 0f38, 59, el_2,  d, vl),
     INSN_PFP(or,               0f, 56),
+//       pmovd2m,        f3, 0f38, 39,        d
+//       pmovm2,         f3, 0f38, 38,       dq
+//       pmovq2m,        f3, 0f38, 39,        q
     INSN(pmullq,         66, 0f38, 40,   vl,  q, vl),
     INSN_PFP(xor,              0f, 57),
 };
--- a/tools/tests/x86_emulator/opmask.S
+++ b/tools/tests/x86_emulator/opmask.S
@@ -12,17 +12,23 @@
 
 #if SIZE == 1
 # define _(x) x##b
+# define _v(x, t) _v_(x##q, t)
 #elif SIZE == 2
 # define _(x) x##w
+# define _v(x, t) _v_(x##d, t)
 # define WIDEN(x) x##bw
 #elif SIZE == 4
 # define _(x) x##d
+# define _v(x, t) _v_(x##w, t)
 # define WIDEN(x) x##wd
 #elif SIZE == 8
 # define _(x) x##q
+# define _v(x, t) _v_(x##b, t)
 # define WIDEN(x) x##dq
 #endif
 
+#define _v_(x, t) v##x##t
+
     .macro check res1:req, res2:req, line:req
     _(kmov)       %\res1, DATA(out)
 #if SIZE < 8 || !defined(__i386__)
@@ -131,6 +137,15 @@ _start:
 
 #endif
 
+#if SIZE > 2 ? defined(__AVX512BW__) : defined(__AVX512DQ__)
+
+    _(kmov)       DATA(in1), %k0
+    _v(pmovm2,)   %k0, %zmm7
+    _v(pmov,2m)   %zmm7, %k3
+    check         k0, k3, __LINE__
+
+#endif
+
     xor           %eax, %eax
     ret
 
--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -8411,6 +8411,21 @@ x86_emulate(
         elem_bytes = (b & 7) < 3 ? 1 : (b & 7) != 5 ? 2 : 4;
         goto avx512f_no_sae;
 
+    case X86EMUL_OPC_EVEX_F3(0x0f38, 0x29): /* vpmov{b,w}2m [xyz]mm,k */
+    case X86EMUL_OPC_EVEX_F3(0x0f38, 0x39): /* vpmov{d,q}2m [xyz]mm,k */
+        generate_exception_if(!evex.r || !evex.R, EXC_UD);
+        /* fall through */
+    case X86EMUL_OPC_EVEX_F3(0x0f38, 0x28): /* vpmovm2{b,w} k,[xyz]mm */
+    case X86EMUL_OPC_EVEX_F3(0x0f38, 0x38): /* vpmovm2{d,q} k,[xyz]mm */
+        if ( b & 0x10 )
+            host_and_vcpu_must_have(avx512dq);
+        else
+            host_and_vcpu_must_have(avx512bw);
+        generate_exception_if(evex.opmsk || ea.type != OP_REG, EXC_UD);
+        d |= TwoOp;
+        op_bytes = 16 << evex.lr;
+        goto avx512f_no_sae;
+
     case X86EMUL_OPC_66(0x0f38, 0x2a):     /* movntdqa m128,xmm */
     case X86EMUL_OPC_VEX_66(0x0f38, 0x2a): /* vmovntdqa mem,{x,y}mm */
         generate_exception_if(ea.type != OP_MEM, EXC_UD);





_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

^ permalink raw reply	[flat|nested] 465+ messages in thread

* [PATCH v3 32/34] x86emul: basic AVX512BW testing
  2018-09-18 11:46 ` [PATCH v3 00/34] x86emul: fixes, improvements, and beginnings of AVX512 support Jan Beulich
                     ` (30 preceding siblings ...)
  2018-09-18 12:12   ` [PATCH v3 30/34] x86emul: support AVX512{BW, DQ} mask move insns Jan Beulich
@ 2018-09-18 12:14   ` Jan Beulich
  2018-09-18 12:14   ` [PATCH v3 33/34] x86emul: basic AVX512DQ testing Jan Beulich
  2018-09-18 12:14   ` [PATCH v3 34/34] x86emul: also allow running the 32-bit harness on a 64-bit distro Jan Beulich
  33 siblings, 0 replies; 465+ messages in thread
From: Jan Beulich @ 2018-09-18 12:14 UTC (permalink / raw)
  To: xen-devel; +Cc: George Dunlap, Andrew Cooper, Wei Liu

Test a number of the insns which have already been implemented.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v3: New.

--- a/tools/tests/x86_emulator/Makefile
+++ b/tools/tests/x86_emulator/Makefile
@@ -11,7 +11,7 @@ all: $(TARGET)
 run: $(TARGET)
 	./$(TARGET)
 
-SIMD := 3dnow sse sse2 sse4 avx avx2 xop avx512f
+SIMD := 3dnow sse sse2 sse4 avx avx2 xop avx512f avx512bw
 FMA := fma4 fma
 SG := avx2-sg
 TESTCASES := blowfish $(SIMD) $(FMA) $(SG)
@@ -55,6 +55,9 @@ xop-flts := $(avx-flts)
 avx512f-vecs := 64 16 32
 avx512f-ints := 4 8
 avx512f-flts := 4 8
+avx512bw-vecs := $(avx512f-vecs)
+avx512bw-ints := 1 2
+avx512bw-flts :=
 
 avx512f-opmask-vecs := 2
 avx512dq-opmask-vecs := 1
--- a/tools/tests/x86_emulator/simd.c
+++ b/tools/tests/x86_emulator/simd.c
@@ -31,6 +31,10 @@ ENTRY(simd_test);
 #  define eq(x, y) ((BR(cmpps, _mask, x, y, 0, -1) & ALL_TRUE) == ALL_TRUE)
 # elif FLOAT_SIZE == 8
 #  define eq(x, y) (BR(cmppd, _mask, x, y, 0, -1) == ALL_TRUE)
+# elif INT_SIZE == 1
+#  define eq(x, y) (B(pcmpeqb, _mask, (vqi_t)(x), (vqi_t)(y), -1) == ALL_TRUE)
+# elif INT_SIZE == 2
+#  define eq(x, y) (B(pcmpeqw, _mask, x, y, -1) == ALL_TRUE)
 # elif INT_SIZE == 4
 #  define eq(x, y) (B(pcmpeqd, _mask, x, y, -1) == ALL_TRUE)
 # elif INT_SIZE == 8
@@ -368,6 +372,87 @@ static inline bool _to_bool(byte_vec_t b
 #  define max(x, y) ((vec_t)B(pmaxuq, _mask, (vdi_t)(x), (vdi_t)(y), (vdi_t)undef(), ~0))
 #  define min(x, y) ((vec_t)B(pminuq, _mask, (vdi_t)(x), (vdi_t)(y), (vdi_t)undef(), ~0))
 # endif
+#elif (INT_SIZE == 1 || UINT_SIZE == 1 || INT_SIZE == 2 || UINT_SIZE == 2) && \
+      defined(__AVX512BW__) && (VEC_SIZE == 64 || defined(__AVX512VL__))
+# if INT_SIZE == 1 || UINT_SIZE == 1
+#  define broadcast(x) ({ \
+    vec_t t_; \
+    asm ( "%{evex%} vpbroadcastb %1, %0" \
+          : "=v" (t_) : "m" (*(char[1]){ x }) ); \
+    t_; \
+})
+#  define broadcast2(x) ({ \
+    vec_t t_; \
+    asm ( "vpbroadcastb %k1, %0" : "=v" (t_) : "r" (x) ); \
+    t_; \
+})
+#  if VEC_SIZE == 16
+#   define interleave_hi(x, y) ((vec_t)B(punpckhbw, _mask, (vqi_t)(x), (vqi_t)(y), (vqi_t)undef(), ~0))
+#   define interleave_lo(x, y) ((vec_t)B(punpcklbw, _mask, (vqi_t)(x), (vqi_t)(y), (vqi_t)undef(), ~0))
+#   define swap(x) ((vec_t)B(pshufb, _mask, (vqi_t)(x), (vqi_t)(inv - 1), (vqi_t)undef(), ~0))
+#  elif defined(__AVX512VBMI__)
+#   define interleave_hi(x, y) ((vec_t)B(vpermi2varqi, _mask, (vqi_t)(x), interleave_hi, (vqi_t)(y), ~0))
+#   define interleave_lo(x, y) ((vec_t)B(vpermt2varqi, _mask, interleave_lo, (vqi_t)(x), (vqi_t)(y), ~0))
+#  endif
+#  define mix(x, y) ((vec_t)B(movdquqi, _mask, (vqi_t)(x), (vqi_t)(y), \
+                              (0b0101010101010101010101010101010101010101010101010101010101010101LL & ALL_TRUE)))
+#  define shrink1(x) ((half_t)B(pmovwb, _mask, (vhi_t)(x), (vqi_half_t){}, ~0))
+#  define shrink2(x) ((quarter_t)B(pmovdb, _mask, (vsi_t)(x), (vqi_quarter_t){}, ~0))
+#  define shrink3(x) ((eighth_t)B(pmovqb, _mask, (vdi_t)(x), (vqi_eighth_t){}, ~0))
+# elif INT_SIZE == 2 || UINT_SIZE == 2
+#  define broadcast(x) ({ \
+    vec_t t_; \
+    asm ( "%{evex%} vpbroadcastw %1, %0" \
+          : "=v" (t_) : "m" (*(short[1]){ x }) ); \
+    t_; \
+})
+#  define broadcast2(x) ({ \
+    vec_t t_; \
+    asm ( "vpbroadcastw %k1, %0" : "=v" (t_) : "r" (x) ); \
+    t_; \
+})
+#  if VEC_SIZE == 16
+#   define interleave_hi(x, y) ((vec_t)B(punpckhwd, _mask, (vhi_t)(x), (vhi_t)(y), (vhi_t)undef(), ~0))
+#   define interleave_lo(x, y) ((vec_t)B(punpcklwd, _mask, (vhi_t)(x), (vhi_t)(y), (vhi_t)undef(), ~0))
+#   define swap(x) ((vec_t)B(pshufd, _mask, \
+                             (vsi_t)B(pshufhw, _mask, \
+                                      B(pshuflw, _mask, (vhi_t)(x), 0b00011011, (vhi_t)undef(), ~0), \
+                                      0b00011011, (vhi_t)undef(), ~0), \
+                             0b01001110, (vsi_t)undef(), ~0))
+#  else
+#   define interleave_hi(x, y) ((vec_t)B(vpermi2varhi, _mask, (vhi_t)(x), interleave_hi, (vhi_t)(y), ~0))
+#   define interleave_lo(x, y) ((vec_t)B(vpermt2varhi, _mask, interleave_lo, (vhi_t)(x), (vhi_t)(y), ~0))
+#  endif
+#  define mix(x, y) ((vec_t)B(movdquhi, _mask, (vhi_t)(x), (vhi_t)(y), \
+                              (0b01010101010101010101010101010101 & ALL_TRUE)))
+#  define shrink1(x) ((half_t)B(pmovdw, _mask, (vsi_t)(x), (vhi_half_t){}, ~0))
+#  define shrink2(x) ((quarter_t)B(pmovqw, _mask, (vdi_t)(x), (vhi_quarter_t){}, ~0))
+# endif
+# if INT_SIZE == 1
+#  define max(x, y) ((vec_t)B(pmaxsb, _mask, (vqi_t)(x), (vqi_t)(y), (vqi_t)undef(), ~0))
+#  define min(x, y) ((vec_t)B(pminsb, _mask, (vqi_t)(x), (vqi_t)(y), (vqi_t)undef(), ~0))
+#  define widen1(x) ((vec_t)B(pmovsxbw, _mask, (vqi_half_t)(x), (vhi_t)undef(), ~0))
+#  define widen2(x) ((vec_t)B(pmovsxbd, _mask, (vqi_quarter_t)(x), (vsi_t)undef(), ~0))
+#  define widen3(x) ((vec_t)B(pmovsxbq, _mask, (vqi_eighth_t)(x), (vdi_t)undef(), ~0))
+# elif UINT_SIZE == 1
+#  define max(x, y) ((vec_t)B(pmaxub, _mask, (vqi_t)(x), (vqi_t)(y), (vqi_t)undef(), ~0))
+#  define min(x, y) ((vec_t)B(pminub, _mask, (vqi_t)(x), (vqi_t)(y), (vqi_t)undef(), ~0))
+#  define widen1(x) ((vec_t)B(pmovzxbw, _mask, (vqi_half_t)(x), (vhi_t)undef(), ~0))
+#  define widen2(x) ((vec_t)B(pmovzxbd, _mask, (vqi_quarter_t)(x), (vsi_t)undef(), ~0))
+#  define widen3(x) ((vec_t)B(pmovzxbq, _mask, (vqi_eighth_t)(x), (vdi_t)undef(), ~0))
+# elif INT_SIZE == 2
+#  define max(x, y) B(pmaxsw, _mask, x, y, undef(), ~0)
+#  define min(x, y) B(pminsw, _mask, x, y, undef(), ~0)
+#  define mul_hi(x, y) B(pmulhw, _mask, x, y, undef(), ~0)
+#  define widen1(x) ((vec_t)B(pmovsxwd, _mask, x, (vsi_t)undef(), ~0))
+#  define widen2(x) ((vec_t)B(pmovsxwq, _mask, x, (vdi_t)undef(), ~0))
+# elif UINT_SIZE == 2
+#  define max(x, y) ((vec_t)B(pmaxuw, _mask, (vhi_t)(x), (vhi_t)(y), (vhi_t)undef(), ~0))
+#  define min(x, y) ((vec_t)B(pminuw, _mask, (vhi_t)(x), (vhi_t)(y), (vhi_t)undef(), ~0))
+#  define mul_hi(x, y) ((vec_t)B(pmulhuw, _mask, (vhi_t)(x), (vhi_t)(y), (vhi_t)undef(), ~0))
+#  define widen1(x) ((vec_t)B(pmovzxwd, _mask, (vhi_half_t)(x), (vsi_t)undef(), ~0))
+#  define widen2(x) ((vec_t)B(pmovzxwq, _mask, (vhi_quarter_t)(x), (vdi_t)undef(), ~0))
+# endif
 #elif VEC_SIZE == 16 && defined(__SSE2__)
 # if INT_SIZE == 1 || UINT_SIZE == 1
 #  define interleave_hi(x, y) ((vec_t)__builtin_ia32_punpckhbw128((vqi_t)(x), (vqi_t)(y)))
@@ -559,7 +644,7 @@ static inline bool _to_bool(byte_vec_t b
 #  endif
 # endif
 #endif
-#if VEC_SIZE == 16 && defined(__SSSE3__)
+#if VEC_SIZE == 16 && defined(__SSSE3__) && !defined(__AVX512VL__)
 # if INT_SIZE == 1
 #  define abs(x) ((vec_t)__builtin_ia32_pabsb128((vqi_t)(x)))
 # elif INT_SIZE == 2
@@ -783,6 +868,40 @@ static inline half_t low_half(vec_t x)
 }
 # endif
 
+# if !defined(low_quarter) && defined(QUARTER_SIZE)
+static inline quarter_t low_quarter(vec_t x)
+{
+#  if QUARTER_SIZE < VEC_SIZE
+    quarter_t y;
+    unsigned int i;
+
+    for ( i = 0; i < ELEM_COUNT / 4; ++i )
+        y[i] = x[i];
+
+    return y;
+#  else
+    return x;
+#  endif
+}
+# endif
+
+# if !defined(low_eighth) && defined(EIGHTH_SIZE)
+static inline eighth_t low_eighth(vec_t x)
+{
+#  if EIGHTH_SIZE < VEC_SIZE
+    eighth_t y;
+    unsigned int i;
+
+    for ( i = 0; i < ELEM_COUNT / 4; ++i )
+        y[i] = x[i];
+
+    return y;
+#  else
+    return x;
+#  endif
+}
+# endif
+
 #endif
 
 #if defined(__AVX512F__) && defined(FLOAT_SIZE)
@@ -1111,7 +1230,7 @@ int simd_test(void)
     y = interleave_lo(alt < 0, alt < 0);
     y = interleave_lo(z, y);
     touch(x);
-    z = widen2(x);
+    z = widen2(low_quarter(x));
     touch(x);
     if ( !eq(z, y) ) return __LINE__;
 
@@ -1120,7 +1239,7 @@ int simd_test(void)
     y = interleave_lo(y, y);
     y = interleave_lo(z, y);
     touch(x);
-    z = widen3(x);
+    z = widen3(low_eighth(x));
     touch(x);
     if ( !eq(z, y) ) return __LINE__;
 #  endif
@@ -1142,14 +1261,14 @@ int simd_test(void)
 
 # ifdef widen2
     touch(src);
-    x = widen2(src);
+    x = widen2(low_quarter(src));
     touch(src);
     if ( !eq(x, z) ) return __LINE__;
 # endif
 
 # ifdef widen3
     touch(src);
-    x = widen3(src);
+    x = widen3(low_eighth(src));
     touch(src);
     if ( !eq(x, interleave_lo(z, (vec_t){})) ) return __LINE__;
 # endif
@@ -1169,6 +1288,36 @@ int simd_test(void)
             if ( aux2[i] != src[i] )
                 return __LINE__;
     }
+#endif
+
+#if defined(widen2) && defined(shrink2)
+    {
+        quarter_t aux1 = low_quarter(src), aux2;
+
+        touch(aux1);
+        x = widen2(aux1);
+        touch(x);
+        aux2 = shrink2(x);
+        touch(aux2);
+        for ( i = 0; i < ELEM_COUNT / 4; ++i )
+            if ( aux2[i] != src[i] )
+                return __LINE__;
+    }
+#endif
+
+#if defined(widen3) && defined(shrink3)
+    {
+        eighth_t aux1 = low_eighth(src), aux2;
+
+        touch(aux1);
+        x = widen3(aux1);
+        touch(x);
+        aux2 = shrink3(x);
+        touch(aux2);
+        for ( i = 0; i < ELEM_COUNT / 8; ++i )
+            if ( aux2[i] != src[i] )
+                return __LINE__;
+    }
 #endif
 
 #ifdef dup_lo
--- a/tools/tests/x86_emulator/simd.h
+++ b/tools/tests/x86_emulator/simd.h
@@ -95,6 +95,32 @@ typedef int __attribute__((vector_size(H
 typedef long long __attribute__((vector_size(HALF_SIZE))) vdi_half_t;
 # endif
 
+# if ELEM_COUNT >= 4
+#  if VEC_SIZE > 64
+#   define QUARTER_SIZE (VEC_SIZE / 4)
+#  else
+#   define QUARTER_SIZE 16
+#  endif
+typedef typeof((vec_t){}[0]) __attribute__((vector_size(QUARTER_SIZE))) quarter_t;
+typedef char __attribute__((vector_size(QUARTER_SIZE))) vqi_quarter_t;
+typedef short __attribute__((vector_size(QUARTER_SIZE))) vhi_quarter_t;
+typedef int __attribute__((vector_size(QUARTER_SIZE))) vsi_quarter_t;
+typedef long long __attribute__((vector_size(QUARTER_SIZE))) vdi_quarter_t;
+# endif
+
+# if ELEM_COUNT >= 8
+#  if VEC_SIZE > 128
+#   define EIGHTH_SIZE (VEC_SIZE / 8)
+#  else
+#   define EIGHTH_SIZE 16
+#  endif
+typedef typeof((vec_t){}[0]) __attribute__((vector_size(EIGHTH_SIZE))) eighth_t;
+typedef char __attribute__((vector_size(EIGHTH_SIZE))) vqi_eighth_t;
+typedef short __attribute__((vector_size(EIGHTH_SIZE))) vhi_eighth_t;
+typedef int __attribute__((vector_size(EIGHTH_SIZE))) vsi_eighth_t;
+typedef long long __attribute__((vector_size(EIGHTH_SIZE))) vdi_eighth_t;
+# endif
+
 #endif
 
 #if VEC_SIZE == 16
@@ -182,6 +208,9 @@ OVR_SFP(broadcast);
 OVR_SFP(comi);
 OVR_FP(add);
 OVR_INT(add);
+OVR_BW(adds);
+OVR_BW(addus);
+OVR_BW(avg);
 OVR_FP(div);
 OVR(extractps);
 OVR_FMA(fmadd, FP);
@@ -229,6 +258,8 @@ OVR_INT(srl);
 OVR_DQ(srlv);
 OVR_FP(sub);
 OVR_INT(sub);
+OVR_BW(subs);
+OVR_BW(subus);
 OVR_SFP(ucomi);
 OVR_VFP(unpckh);
 OVR_VFP(unpckl);
@@ -275,6 +306,29 @@ OVR(pmuldq);
 OVR(pmuludq);
 #endif
 
+#ifdef __AVX512BW__
+OVR(pextrb);
+OVR(pextrw);
+OVR(pinsrb);
+OVR(pinsrw);
+OVR(pmaddwd);
+OVR(pmovsxbw);
+OVR(pmovzxbw);
+OVR(pmulhuw);
+OVR(pmulhw);
+OVR(pmullw);
+OVR(psadbw);
+OVR(pshufb);
+OVR(pshufhw);
+OVR(pshuflw);
+OVR(punpckhbw);
+OVR(punpckhwd);
+OVR(punpcklbw);
+OVR(punpcklwd);
+OVR(slldq);
+OVR(srldq);
+#endif
+
 #undef OVR_VFP
 #undef OVR_SFP
 #undef OVR_INT
--- a/tools/tests/x86_emulator/test_x86_emulator.c
+++ b/tools/tests/x86_emulator/test_x86_emulator.c
@@ -22,6 +22,7 @@ asm ( ".pushsection .test, \"ax\", @prog
 #include "avx512dq-opmask.h"
 #include "avx512bw-opmask.h"
 #include "avx512f.h"
+#include "avx512bw.h"
 
 #define verbose false /* Switch to true for far more logging. */
 
@@ -105,6 +106,11 @@ static bool simd_check_avx512bw(void)
 }
 #define simd_check_avx512bw_opmask simd_check_avx512bw
 
+static bool simd_check_avx512bw_vl(void)
+{
+    return cpu_has_avx512bw && cpu_has_avx512vl;
+}
+
 static void simd_set_regs(struct cpu_user_regs *regs)
 {
     if ( cpu_has_mmx )
@@ -284,6 +290,18 @@ static const struct {
     AVX512VL(VL u64x2,        avx512f,      16u8),
     AVX512VL(VL s64x4,        avx512f,      32i8),
     AVX512VL(VL u64x4,        avx512f,      32u8),
+    SIMD(AVX512BW s8x64,     avx512bw,      64i1),
+    SIMD(AVX512BW u8x64,     avx512bw,      64u1),
+    SIMD(AVX512BW s16x32,    avx512bw,      64i2),
+    SIMD(AVX512BW u16x32,    avx512bw,      64u2),
+    AVX512VL(BW+VL s8x16,    avx512bw,      16i1),
+    AVX512VL(BW+VL u8x16,    avx512bw,      16u1),
+    AVX512VL(BW+VL s8x32,    avx512bw,      32i1),
+    AVX512VL(BW+VL u8x32,    avx512bw,      32u1),
+    AVX512VL(BW+VL s16x8,    avx512bw,      16i2),
+    AVX512VL(BW+VL u16x8,    avx512bw,      16u2),
+    AVX512VL(BW+VL s16x16,   avx512bw,      32i2),
+    AVX512VL(BW+VL u16x16,   avx512bw,      32u2),
 #undef AVX512VL_
 #undef AVX512VL
 #undef SIMD_




_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

^ permalink raw reply	[flat|nested] 465+ messages in thread

* [PATCH v3 33/34] x86emul: basic AVX512DQ testing
  2018-09-18 11:46 ` [PATCH v3 00/34] x86emul: fixes, improvements, and beginnings of AVX512 support Jan Beulich
                     ` (31 preceding siblings ...)
  2018-09-18 12:14   ` [PATCH v3 32/34] x86emul: basic AVX512BW testing Jan Beulich
@ 2018-09-18 12:14   ` Jan Beulich
  2018-09-18 12:14   ` [PATCH v3 34/34] x86emul: also allow running the 32-bit harness on a 64-bit distro Jan Beulich
  33 siblings, 0 replies; 465+ messages in thread
From: Jan Beulich @ 2018-09-18 12:14 UTC (permalink / raw)
  To: xen-devel; +Cc: George Dunlap, Andrew Cooper, Wei Liu

Test a number of the insns which have already been implemented.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v3: New.

--- a/tools/tests/x86_emulator/Makefile
+++ b/tools/tests/x86_emulator/Makefile
@@ -11,7 +11,7 @@ all: $(TARGET)
 run: $(TARGET)
 	./$(TARGET)
 
-SIMD := 3dnow sse sse2 sse4 avx avx2 xop avx512f avx512bw
+SIMD := 3dnow sse sse2 sse4 avx avx2 xop avx512f avx512bw avx512dq
 FMA := fma4 fma
 SG := avx2-sg
 TESTCASES := blowfish $(SIMD) $(FMA) $(SG)
@@ -58,9 +58,12 @@ avx512f-flts := 4 8
 avx512bw-vecs := $(avx512f-vecs)
 avx512bw-ints := 1 2
 avx512bw-flts :=
+avx512dq-vecs := $(avx512f-vecs)
+avx512dq-ints := $(avx512f-ints)
+avx512dq-flts := $(avx512f-flts)
 
 avx512f-opmask-vecs := 2
-avx512dq-opmask-vecs := 1
+avx512dq-opmask-vecs := 1 2
 avx512bw-opmask-vecs := 4 8
 
 # For AVX and later, have the compiler avoid XMM0 to widen coverage of
--- a/tools/tests/x86_emulator/simd.h
+++ b/tools/tests/x86_emulator/simd.h
@@ -121,6 +121,34 @@ typedef int __attribute__((vector_size(E
 typedef long long __attribute__((vector_size(EIGHTH_SIZE))) vdi_eighth_t;
 # endif
 
+# define DECL_PAIR(w) \
+typedef w ## _t pair_t; \
+typedef vsi_ ## w ## _t vsi_pair_t; \
+typedef vdi_ ## w ## _t vdi_pair_t
+# define DECL_QUARTET(w) \
+typedef w ## _t quartet_t; \
+typedef vsi_ ## w ## _t vsi_quartet_t; \
+typedef vdi_ ## w ## _t vdi_quartet_t
+# define DECL_OCTET(w) \
+typedef w ## _t octet_t; \
+typedef vsi_ ## w ## _t vsi_octet_t; \
+typedef vdi_ ## w ## _t vdi_octet_t
+
+# if ELEM_COUNT == 4
+DECL_PAIR(half);
+# elif ELEM_COUNT == 8
+DECL_PAIR(quarter);
+DECL_QUARTET(half);
+# elif ELEM_COUNT == 16
+DECL_PAIR(eighth);
+DECL_QUARTET(quarter);
+DECL_OCTET(half);
+# endif
+
+# undef DECL_OCTET
+# undef DECL_QUARTET
+# undef DECL_PAIR
+
 #endif
 
 #if VEC_SIZE == 16
@@ -146,6 +174,14 @@ typedef long long __attribute__((vector_
 #ifdef __AVX512F__
 
 /* Sadly there are a few exceptions to the general naming rules. */
+#define __builtin_ia32_broadcastf32x4_512_mask __builtin_ia32_broadcastf32x4_512
+#define __builtin_ia32_broadcasti32x4_512_mask __builtin_ia32_broadcasti32x4_512
+#define __builtin_ia32_insertf32x4_512_mask __builtin_ia32_insertf32x4_mask
+#define __builtin_ia32_insertf32x8_512_mask __builtin_ia32_insertf32x8_mask
+#define __builtin_ia32_insertf64x4_512_mask __builtin_ia32_insertf64x4_mask
+#define __builtin_ia32_inserti32x4_512_mask __builtin_ia32_inserti32x4_mask
+#define __builtin_ia32_inserti32x8_512_mask __builtin_ia32_inserti32x8_mask
+#define __builtin_ia32_inserti64x4_512_mask __builtin_ia32_inserti64x4_mask
 #define __builtin_ia32_shuf_f32x4_512_mask __builtin_ia32_shuf_f32x4_mask
 #define __builtin_ia32_shuf_f64x2_512_mask __builtin_ia32_shuf_f64x2_mask
 #define __builtin_ia32_shuf_i32x4_512_mask __builtin_ia32_shuf_i32x4_mask
@@ -329,6 +365,18 @@ OVR(slldq);
 OVR(srldq);
 #endif
 
+#ifdef __AVX512DQ__
+OVR_VFP(and);
+OVR_VFP(andn);
+OVR_VFP(or);
+OVR(pextrd);
+OVR(pextrq);
+OVR(pinsrd);
+OVR(pinsrq);
+OVR(pmullq);
+OVR_VFP(xor);
+#endif
+
 #undef OVR_VFP
 #undef OVR_SFP
 #undef OVR_INT
--- a/tools/tests/x86_emulator/test_x86_emulator.c
+++ b/tools/tests/x86_emulator/test_x86_emulator.c
@@ -23,6 +23,7 @@ asm ( ".pushsection .test, \"ax\", @prog
 #include "avx512bw-opmask.h"
 #include "avx512f.h"
 #include "avx512bw.h"
+#include "avx512dq.h"
 
 #define verbose false /* Switch to true for far more logging. */
 
@@ -100,6 +101,11 @@ static bool simd_check_avx512dq(void)
 }
 #define simd_check_avx512dq_opmask simd_check_avx512dq
 
+static bool simd_check_avx512dq_vl(void)
+{
+    return cpu_has_avx512dq && cpu_has_avx512vl;
+}
+
 static bool simd_check_avx512bw(void)
 {
     return cpu_has_avx512bw;
@@ -267,9 +273,10 @@ static const struct {
     SIMD(XOP i32x8,               xop,      32i4),
     SIMD(XOP i64x4,               xop,      32i8),
     SIMD(OPMASK/w,     avx512f_opmask,         2),
-    SIMD(OPMASK/b,    avx512dq_opmask,         1),
-    SIMD(OPMASK/d,    avx512bw_opmask,         4),
-    SIMD(OPMASK/q,    avx512bw_opmask,         8),
+    SIMD(OPMASK+DQ/b, avx512dq_opmask,         1),
+    SIMD(OPMASK+DQ/w, avx512dq_opmask,         2),
+    SIMD(OPMASK+BW/d, avx512bw_opmask,         4),
+    SIMD(OPMASK+BW/q, avx512bw_opmask,         8),
     SIMD(AVX512F f32 scalar,  avx512f,        f4),
     SIMD(AVX512F f32x16,      avx512f,      64f4),
     SIMD(AVX512F f64 scalar,  avx512f,        f8),
@@ -302,6 +309,24 @@ static const struct {
     AVX512VL(BW+VL u16x8,    avx512bw,      16u2),
     AVX512VL(BW+VL s16x16,   avx512bw,      32i2),
     AVX512VL(BW+VL u16x16,   avx512bw,      32u2),
+    SIMD(AVX512DQ f32x16,    avx512dq,      64f4),
+    SIMD(AVX512DQ f64x8,     avx512dq,      64f8),
+    SIMD(AVX512DQ s32x16,    avx512dq,      64i4),
+    SIMD(AVX512DQ u32x16,    avx512dq,      64u4),
+    SIMD(AVX512DQ s64x8,     avx512dq,      64i8),
+    SIMD(AVX512DQ u64x8,     avx512dq,      64u8),
+    AVX512VL(DQ+VL f32x4,    avx512dq,      16f4),
+    AVX512VL(DQ+VL f64x2,    avx512dq,      16f8),
+    AVX512VL(DQ+VL f32x8,    avx512dq,      32f4),
+    AVX512VL(DQ+VL f64x4,    avx512dq,      32f8),
+    AVX512VL(DQ+VL s32x4,    avx512dq,      16i4),
+    AVX512VL(DQ+VL u32x4,    avx512dq,      16u4),
+    AVX512VL(DQ+VL s32x8,    avx512dq,      32i4),
+    AVX512VL(DQ+VL u32x8,    avx512dq,      32u4),
+    AVX512VL(DQ+VL s64x2,    avx512dq,      16i8),
+    AVX512VL(DQ+VL u64x2,    avx512dq,      16u8),
+    AVX512VL(DQ+VL s64x4,    avx512dq,      32i8),
+    AVX512VL(DQ+VL u64x4,    avx512dq,      32u8),
 #undef AVX512VL_
 #undef AVX512VL
 #undef SIMD_
--- a/tools/tests/x86_emulator/simd.c
+++ b/tools/tests/x86_emulator/simd.c
@@ -134,6 +134,27 @@ static inline bool _to_bool(byte_vec_t b
 #elif defined(FLOAT_SIZE) && defined(__AVX512F__) && \
       (VEC_SIZE == 64 || defined(__AVX512VL__))
 # if VEC_SIZE > FLOAT_SIZE
+#  if ELEM_COUNT == 8 /* vextractf{32,64}x4 */ || \
+       (ELEM_COUNT == 16 && ELEM_SIZE == 4 && defined(__AVX512DQ__)) /* vextractf32x8 */ || \
+       (ELEM_COUNT == 4 && ELEM_SIZE == 8 && defined(__AVX512DQ__)) /* vextractf64x2 */
+#   define low_half(x) ({ \
+    half_t t_; \
+    asm ( "vextractf%c[w]x%c[n] $0, %[s], %[d]" \
+          : [d] "=m" (t_) \
+          : [s] "v" (x), [w] "i" (ELEM_SIZE * 8), [n] "i" (ELEM_COUNT / 2) ); \
+    t_; \
+})
+#  endif
+#  if (ELEM_COUNT == 16 && ELEM_SIZE == 4) /* vextractf32x4 */ || \
+       (ELEM_COUNT == 8 && ELEM_SIZE == 8 && defined(__AVX512DQ__)) /* vextractf64x2 */
+#   define low_quarter(x) ({ \
+    quarter_t t_; \
+    asm ( "vextractf%c[w]x%c[n] $0, %[s], %[d]" \
+          : [d] "=m" (t_) \
+          : [s] "v" (x), [w] "i" (ELEM_SIZE * 8), [n] "i" (ELEM_COUNT / 4) ); \
+    t_; \
+})
+#  endif
 #  if FLOAT_SIZE == 4
 #   define broadcast(x) ({ \
     vec_t t_; \
@@ -141,6 +162,17 @@ static inline bool _to_bool(byte_vec_t b
           : "=v" (t_) : "m" (*(float[1]){ x }) ); \
     t_; \
 })
+#   if VEC_SIZE >= 32 && defined(__AVX512DQ__)
+#    define broadcast_pair(x) ({ \
+    vec_t t_; \
+    asm ( "vbroadcastf32x2 %1, %0" : "=v" (t_) : "m" (x) ); \
+    t_; \
+})
+#   endif
+#   if VEC_SIZE == 64 && defined(__AVX512DQ__)
+#    define broadcast_octet(x) B(broadcastf32x8_, _mask, x, undef(), ~0)
+#    define insert_octet(x, y, p) B(insertf32x8_, _mask, x, y, p, undef(), ~0)
+#   endif
 #   define max(x, y) BR_(maxps, _mask, x, y, undef(), ~0)
 #   define min(x, y) BR_(minps, _mask, x, y, undef(), ~0)
 #   define mix(x, y) B(movaps, _mask, x, y, (0b0101010101010101 & ALL_TRUE))
@@ -149,6 +181,13 @@ static inline bool _to_bool(byte_vec_t b
 #    define interleave_lo(x, y) B(unpcklps, _mask, x, y, undef(), ~0)
 #    define swap(x) B(shufps, _mask, x, x, 0b00011011, undef(), ~0)
 #   else
+#    define broadcast_quartet(x) B(broadcastf32x4_, _mask, x, undef(), ~0)
+#    define insert_pair(x, y, p) \
+    B(insertf32x4_, _mask, x, \
+      /* Cast needed below to work around gcc 7.x quirk. */ \
+      (p) & 1 ? (typeof(y))__builtin_ia32_shufps(y, y, 0b01000100) : (y), \
+      (p) >> 1, x, 3 << ((p) * 2))
+#    define insert_quartet(x, y, p) B(insertf32x4_, _mask, x, y, p, undef(), ~0)
 #    define interleave_hi(x, y) B(vpermi2varps, _mask, x, interleave_hi, y, ~0)
 #    define interleave_lo(x, y) B(vpermt2varps, _mask, interleave_lo, x, y, ~0)
 #    define swap(x) ({ \
@@ -172,6 +211,14 @@ static inline bool _to_bool(byte_vec_t b
     t_; \
 })
 #   endif
+#   if VEC_SIZE >= 32 && defined(__AVX512DQ__)
+#    define broadcast_pair(x) B(broadcastf64x2_, _mask, x, undef(), ~0)
+#    define insert_pair(x, y, p) B(insertf64x2_, _mask, x, y, p, undef(), ~0)
+#   endif
+#   if VEC_SIZE == 64
+#    define broadcast_quartet(x) B(broadcastf64x4_, , x, undef(), ~0)
+#    define insert_quartet(x, y, p) B(insertf64x4_, _mask, x, y, p, undef(), ~0)
+#   endif
 #   define max(x, y) BR_(maxpd, _mask, x, y, undef(), ~0)
 #   define min(x, y) BR_(minpd, _mask, x, y, undef(), ~0)
 #   define mix(x, y) B(movapd, _mask, x, y, 0b01010101)
@@ -300,6 +347,16 @@ static inline bool _to_bool(byte_vec_t b
     t_; \
 })
 # endif
+# if (ELEM_COUNT == 16 && ELEM_SIZE == 4) /* vextracti32x4 */ || \
+       (ELEM_COUNT == 8 && ELEM_SIZE == 8 && defined(__AVX512DQ__)) /* vextracti64x2 */
+#  define low_quarter(x) ({ \
+    quarter_t t_; \
+    asm ( "vextracti%c[w]x%c[n] $0, %[s], %[d]" \
+          : [d] "=m" (t_) \
+          : [s] "v" (x), [w] "i" (ELEM_SIZE * 8), [n] "i" (ELEM_COUNT / 4) ); \
+    t_; \
+})
+# endif
 # if INT_SIZE == 4 || UINT_SIZE == 4
 #  define broadcast(x) ({ \
     vec_t t_; \
@@ -312,11 +369,30 @@ static inline bool _to_bool(byte_vec_t b
     asm ( "vpbroadcastd %k1, %0" : "=v" (t_) : "r" (x) ); \
     t_; \
 })
+#  ifdef __AVX512DQ__
+#   define broadcast_pair(x) ({ \
+    vec_t t_; \
+    asm ( "vbroadcasti32x2 %1, %0" : "=v" (t_) : "m" (x) ); \
+    t_; \
+})
+#  endif
+#  if VEC_SIZE == 64 && defined(__AVX512DQ__)
+#   define broadcast_octet(x) ((vec_t)B(broadcasti32x8_, _mask, (vsi_octet_t)(x), (vsi_t)undef(), ~0))
+#   define insert_octet(x, y, p) ((vec_t)B(inserti32x8_, _mask, (vsi_t)(x), (vsi_octet_t)(y), p, (vsi_t)undef(), ~0))
+#  endif
 #  if VEC_SIZE == 16
 #   define interleave_hi(x, y) ((vec_t)B(punpckhdq, _mask, (vsi_t)(x), (vsi_t)(y), (vsi_t)undef(), ~0))
 #   define interleave_lo(x, y) ((vec_t)B(punpckldq, _mask, (vsi_t)(x), (vsi_t)(y), (vsi_t)undef(), ~0))
 #   define swap(x) ((vec_t)B(pshufd, _mask, (vsi_t)(x), 0b00011011, (vsi_t)undef(), ~0))
 #  else
+#   define broadcast_quartet(x) ((vec_t)B(broadcasti32x4_, _mask, (vsi_quartet_t)(x), (vsi_t)undef(), ~0))
+#   define insert_pair(x, y, p) \
+    (vec_t)(B(inserti32x4_, _mask, (vsi_t)(x), \
+              /* First cast needed below to work around gcc 7.x quirk. */ \
+              (p) & 1 ? (vsi_pair_t)__builtin_ia32_pshufd((vsi_pair_t)(y), 0b01000100) \
+                      : (vsi_pair_t)(y), \
+              (p) >> 1, (vsi_t)(x), 3 << ((p) * 2)))
+#   define insert_quartet(x, y, p) ((vec_t)B(inserti32x4_, _mask, (vsi_t)(x), (vsi_quartet_t)(y), p, (vsi_t)undef(), ~0))
 #   define interleave_hi(x, y) ((vec_t)B(vpermi2vard, _mask, (vsi_t)(x), interleave_hi, (vsi_t)(y), ~0))
 #   define interleave_lo(x, y) ((vec_t)B(vpermt2vard, _mask, interleave_lo, (vsi_t)(x), (vsi_t)(y), ~0))
 #   define swap(x) ((vec_t)B(pshufd, _mask, \
@@ -341,6 +417,14 @@ static inline bool _to_bool(byte_vec_t b
     t_; \
 })
 #  endif
+#  if VEC_SIZE >= 32 && defined(__AVX512DQ__)
+#   define broadcast_pair(x) ((vec_t)B(broadcasti64x2_, _mask, (vdi_pair_t)(x), (vdi_t)undef(), ~0))
+#   define insert_pair(x, y, p) ((vec_t)B(inserti64x2_, _mask, (vdi_t)(x), (vdi_pair_t)(y), p, (vdi_t)undef(), ~0))
+#  endif
+#  if VEC_SIZE == 64
+#   define broadcast_quartet(x) ((vec_t)B(broadcasti64x4_, , (vdi_quartet_t)(x), (vdi_t)undef(), ~0))
+#   define insert_quartet(x, y, p) ((vec_t)B(inserti64x4_, _mask, (vdi_t)(x), (vdi_quartet_t)(y), p, (vdi_t)undef(), ~0))
+#  endif
 #  if VEC_SIZE == 16
 #   define interleave_hi(x, y) ((vec_t)B(punpckhqdq, _mask, (vdi_t)(x), (vdi_t)(y), (vdi_t)undef(), ~0))
 #   define interleave_lo(x, y) ((vec_t)B(punpcklqdq, _mask, (vdi_t)(x), (vdi_t)(y), (vdi_t)undef(), ~0))
@@ -892,7 +976,7 @@ static inline eighth_t low_eighth(vec_t
     eighth_t y;
     unsigned int i;
 
-    for ( i = 0; i < ELEM_COUNT / 4; ++i )
+    for ( i = 0; i < ELEM_COUNT / 8; ++i )
         y[i] = x[i];
 
     return y;
@@ -904,6 +988,50 @@ static inline eighth_t low_eighth(vec_t
 
 #endif
 
+#ifdef broadcast_pair
+# if ELEM_COUNT == 4
+#  define broadcast_half broadcast_pair
+# elif ELEM_COUNT == 8
+#  define broadcast_quarter broadcast_pair
+# elif ELEM_COUNT == 16
+#  define broadcast_eighth broadcast_pair
+# endif
+#endif
+
+#ifdef insert_pair
+# if ELEM_COUNT == 4
+#  define insert_half insert_pair
+# elif ELEM_COUNT == 8
+#  define insert_quarter insert_pair
+# elif ELEM_COUNT == 16
+#  define insert_eighth insert_pair
+# endif
+#endif
+
+#ifdef broadcast_quartet
+# if ELEM_COUNT == 8
+#  define broadcast_half broadcast_quartet
+# elif ELEM_COUNT == 16
+#  define broadcast_quarter broadcast_quartet
+# endif
+#endif
+
+#ifdef insert_quartet
+# if ELEM_COUNT == 8
+#  define insert_half insert_quartet
+# elif ELEM_COUNT == 16
+#  define insert_quarter insert_quartet
+# endif
+#endif
+
+#if defined(broadcast_octet) && ELEM_COUNT == 16
+# define broadcast_half broadcast_octet
+#endif
+
+#if defined(insert_octet) && ELEM_COUNT == 16
+# define insert_half insert_octet
+#endif
+
 #if defined(__AVX512F__) && defined(FLOAT_SIZE)
 # include "simd-fma.c"
 #endif
@@ -1199,6 +1327,60 @@ int simd_test(void)
     if ( !eq(broadcast2(ELEM_COUNT + 1), src + inv) ) return __LINE__;
 #endif
 
+#if defined(broadcast_half) && defined(insert_half)
+    {
+        half_t aux = low_half(src);
+
+        touch(aux);
+        x = broadcast_half(aux);
+        touch(aux);
+        y = insert_half(src, aux, 1);
+        if ( !eq(x, y) ) return __LINE__;
+    }
+#endif
+
+#if defined(broadcast_quarter) && defined(insert_quarter)
+    {
+        quarter_t aux = low_quarter(src);
+
+        touch(aux);
+        x = broadcast_quarter(aux);
+        touch(aux);
+        y = insert_quarter(src, aux, 1);
+        touch(aux);
+        y = insert_quarter(y, aux, 2);
+        touch(aux);
+        y = insert_quarter(y, aux, 3);
+        if ( !eq(x, y) ) return __LINE__;
+    }
+#endif
+
+#if defined(broadcast_eighth) && defined(insert_eighth) && \
+    /* At least gcc 7.3 "optimizes" away all insert_eighth() calls below. */ \
+    __GNUC__ >= 8
+    {
+        eighth_t aux = low_eighth(src);
+
+        touch(aux);
+        x = broadcast_eighth(aux);
+        touch(aux);
+        y = insert_eighth(src, aux, 1);
+        touch(aux);
+        y = insert_eighth(y, aux, 2);
+        touch(aux);
+        y = insert_eighth(y, aux, 3);
+        touch(aux);
+        y = insert_eighth(y, aux, 4);
+        touch(aux);
+        y = insert_eighth(y, aux, 5);
+        touch(aux);
+        y = insert_eighth(y, aux, 6);
+        touch(aux);
+        y = insert_eighth(y, aux, 7);
+        if ( !eq(x, y) ) return __LINE__;
+    }
+#endif
+
 #if defined(interleave_lo) && defined(interleave_hi)
     touch(src);
     x = interleave_lo(inv, src);




_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

^ permalink raw reply	[flat|nested] 465+ messages in thread

* [PATCH v3 34/34] x86emul: also allow running the 32-bit harness on a 64-bit distro
  2018-09-18 11:46 ` [PATCH v3 00/34] x86emul: fixes, improvements, and beginnings of AVX512 support Jan Beulich
                     ` (32 preceding siblings ...)
  2018-09-18 12:14   ` [PATCH v3 33/34] x86emul: basic AVX512DQ testing Jan Beulich
@ 2018-09-18 12:14   ` Jan Beulich
  33 siblings, 0 replies; 465+ messages in thread
From: Jan Beulich @ 2018-09-18 12:14 UTC (permalink / raw)
  To: xen-devel; +Cc: George Dunlap, Andrew Cooper, Wei Liu

In order to be able to verify that the 32-bit variant builds and runs,
introduce a respective target (and make the other necessary adjustments).

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v3: New.

--- a/.gitignore
+++ b/.gitignore
@@ -240,6 +240,7 @@ tools/security/xensec_tool
 tools/tests/depriv/depriv-fd-checker
 tools/tests/x86_emulator/*.bin
 tools/tests/x86_emulator/*.tmp
+tools/tests/x86_emulator/32/x86_emulate
 tools/tests/x86_emulator/3dnow*.[ch]
 tools/tests/x86_emulator/asm
 tools/tests/x86_emulator/avx*.[ch]
--- /dev/null
+++ b/tools/tests/x86_emulator/32/Makefile
@@ -0,0 +1,4 @@
+override XEN_COMPILE_ARCH := x86_32
+XEN_ROOT = $(CURDIR)/../../../..
+VPATH += ..
+include ../Makefile
--- a/tools/tests/x86_emulator/Makefile
+++ b/tools/tests/x86_emulator/Makefile
@@ -1,5 +1,7 @@
 
+ifeq ($(XEN_ROOT),)
 XEN_ROOT=$(CURDIR)/../../..
+endif
 include $(XEN_ROOT)/tools/Rules.mk
 
 TARGET := test_x86_emulator
@@ -18,6 +20,12 @@ TESTCASES := blowfish $(SIMD) $(FMA) $(S
 
 OPMASK := avx512f avx512dq avx512bw
 
+ifeq ($(origin XEN_COMPILE_ARCH),override)
+
+HOSTCFLAGS += -m32
+
+else
+
 blowfish-cflags := ""
 blowfish-cflags-x86_32 := "-mno-accumulate-outgoing-args -Dstatic="
 
@@ -148,6 +156,8 @@ $(addsuffix .h,$(SIMD) $(FMA) $(SG)): si
 
 xop.h avx512f.h: simd-fma.c
 
+endif # 32-bit override
+
 $(TARGET): x86-emulate.o test_x86_emulator.o evex-disp8.o wrappers.o
 	$(HOSTCC) $(HOSTCFLAGS) -o $@ $^
 
@@ -162,6 +172,15 @@ distclean: clean
 .PHONY: install uninstall
 install uninstall:
 
+.PHONY: run32 clean32
+ifeq ($(XEN_TARGET_ARCH),x86_64)
+run32 clean32: %32: $(addsuffix .h,$(TESTCASES)) $(addsuffix -opmask.h,$(OPMASK))
+	$(MAKE) -C 32 $*
+clean: clean32
+else
+run32 clean32: %32: %
+endif
+
 x86_emulate:
 	[ -L $@ ] || ln -sf $(XEN_ROOT)/xen/arch/x86/$@
 





_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

^ permalink raw reply	[flat|nested] 465+ messages in thread

* Re: [PATCH v3 02/34] x86/HVM: grow MMIO cache data size to 64 bytes
  2018-09-18 11:53   ` [PATCH v3 02/34] x86/HVM: grow MMIO cache data size to 64 bytes Jan Beulich
@ 2018-09-18 16:05     ` Paul Durrant
  2018-10-25 18:36     ` Andrew Cooper
  1 sibling, 0 replies; 465+ messages in thread
From: Paul Durrant @ 2018-09-18 16:05 UTC (permalink / raw)
  To: Jan Beulich, xen-devel; +Cc: Andrew Cooper, Wei Liu, George Dunlap

> -----Original Message-----
> From: Jan Beulich [mailto:JBeulich@suse.com]
> Sent: 18 September 2018 12:54
> To: xen-devel <xen-devel@lists.xenproject.org>
> Cc: Andrew Cooper <Andrew.Cooper3@citrix.com>; Paul Durrant
> <Paul.Durrant@citrix.com>; Wei Liu <wei.liu2@citrix.com>; George Dunlap
> <George.Dunlap@citrix.com>
> Subject: [PATCH v3 02/34] x86/HVM: grow MMIO cache data size to 64 bytes
> 
> This is needed before enabling any AVX512 insns in the emulator. Change
> the way alignment is enforced at the same time.
> 
> Add a check that the buffer won't actually overflow, and while at it
> also convert the check for accesses to not cross page boundaries.
> 
> Signed-off-by: Jan Beulich <jbeulich@suse.com>

Reviewed-by: Paul Durrant <paul.durrant@citrix.com>

> ---
> v3: New.
> 
> --- a/xen/arch/x86/hvm/emulate.c
> +++ b/xen/arch/x86/hvm/emulate.c
> @@ -866,7 +866,18 @@ static int hvmemul_phys_mmio_access(
>      int rc = X86EMUL_OKAY;
> 
>      /* Accesses must fall within a page. */
> -    BUG_ON((gpa & ~PAGE_MASK) + size > PAGE_SIZE);
> +    if ( (gpa & ~PAGE_MASK) + size > PAGE_SIZE )
> +    {
> +        ASSERT_UNREACHABLE();
> +        return X86EMUL_UNHANDLEABLE;
> +    }
> +
> +    /* Accesses must not overflow the cache's buffer. */
> +    if ( size > sizeof(cache->buffer) )
> +    {
> +        ASSERT_UNREACHABLE();
> +        return X86EMUL_UNHANDLEABLE;
> +    }
> 
>      /*
>       * hvmemul_do_io() cannot handle non-power-of-2 accesses or
> --- a/xen/include/asm-x86/hvm/vcpu.h
> +++ b/xen/include/asm-x86/hvm/vcpu.h
> @@ -42,15 +42,14 @@ struct hvm_vcpu_asid {
>  };
> 
>  /*
> - * We may read or write up to m256 as a number of device-model
> + * We may read or write up to m512 as a number of device-model
>   * transactions.
>   */
>  struct hvm_mmio_cache {
>      unsigned long gla;
>      unsigned int size;
>      uint8_t dir;
> -    uint8_t pad[3]; /* make buffer[] long-aligned */
> -    uint8_t buffer[32];
> +    uint8_t buffer[64] __aligned(sizeof(long));
>  };
> 
>  struct hvm_vcpu_io {
> 
> 
> 


_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

^ permalink raw reply	[flat|nested] 465+ messages in thread

* [PATCH v4 00/44] x86emul: fixes, improvements, and beginnings of AVX512 support
  2018-08-09  8:15 [PATCH 0/6] x86emul: fixes, improvements, and beginnings of AVX512 support Jan Beulich
                   ` (7 preceding siblings ...)
  2018-09-18 11:46 ` [PATCH v3 00/34] x86emul: fixes, improvements, and beginnings of AVX512 support Jan Beulich
@ 2018-09-25 13:14 ` Jan Beulich
  2018-09-25 13:25   ` [PATCH v4 01/44] x86emul: support AVX512 opmask insns Jan Beulich
                     ` (43 more replies)
  2018-11-19 10:00 ` [PATCH v5 00/47] x86emul: fair parts of AVX512 support Jan Beulich
                   ` (3 subsequent siblings)
  12 siblings, 44 replies; 465+ messages in thread
From: Jan Beulich @ 2018-09-25 13:14 UTC (permalink / raw)
  To: xen-devel; +Cc: George Dunlap, Andrew Cooper, Wei Liu

01: support AVX512 opmask insns
02: x86/HVM: grow MMIO cache data size to 64 bytes
03: correct EVEX decoding
04: generalize vector length handling for AVX512/EVEX
05: support basic AVX512 moves
06: test for correct EVEX Disp8 scaling
07: also allow running the 32-bit harness on a 64-bit distro
08: use AVX512 logic for emulating V{,P}MASKMOV*
09: support AVX512F legacy-equivalent arithmetic FP insns
10: support AVX512DQ logic FP insns
11: support AVX512F "normal" FP compare insns
12: support AVX512F misc legacy-equivalent FP insns
13: support AVX512F fused-multiply-add insns
14: support AVX512F legacy-equivalent logic insns
15: support AVX512{F,DQ} FP broadcast insns
16: support AVX512F v{,u}comis{d,s} insns
17: test: introduce eq()
18: support AVX512{F,BW} packed integer compare insns
19: support AVX512{F,BW} packed integer arithmetic insns
20: use simd_128 also for legacy vector shift insns
21: support AVX512{F,BW} shift/rotate insns
22: support AVX512{F,BW,DQ} extract insns
23: support AVX512{F,BW,DQ} insert insns
24: basic AVX512F testing
25: support AVX512{F,BW,DQ} integer broadcast insns
26: basic AVX512VL testing
27: support AVX512{F,BW} zero- and sign-extending moves
28: support AVX512{F,BW} down conversion moves
29: support AVX512{F,BW} integer unpack insns
30: support AVX512{F,BW,_VBMI} full permute insns
31: support AVX512{F,BW} integer shuffle insns
32: support AVX512{BW,DQ} mask move insns
33: basic AVX512BW testing
34: basic AVX512DQ testing
35: support AVX512F move high/low insns
36: support AVX512F move duplicate insns
37: support AVX512{F,BW,VBMI} permute insns
38: support AVX512BW pack insns
39: support AVX512F floating-point conversion insns
40: support AVX512F legacy-equivalent packed int/FP conversion insns
41: support AVX512F legacy-equivalent scalar int/FP conversion insns
42: support AVX512DQ packed quad-int/FP conversion insns
43: support AVX512{F,DQ} uint-to-FP conversion insns
44: support AVX512{F,DQ} FP-to-uint conversion insns

The main goal of this series is to support enough instructions that
basic AVX512F, AVX512BW, AVX512DQ, and AVX512VL tests can be run (this
set is relevant as a basis in particular because, taken together, these
extensions mostly [entirely?] cover the legacy-equivalent AVX512
insns). Later additions will then simply enable more of the
(conditional) tests in simd*.c (or by other means).

Besides the additions to the series, there are two main changes to
previously posted patches: While implementing support for convert
instructions I noticed an issue with insns being WIG outside of
64-bit mode, but EVEX.W being meaningful in 64-bit mode (movd,
pextrd, pinsrd, and the scalar convert insns). Furthermore, I now
hope that the test harness will also run on AVX512VL-incapable
hardware (albeit I'm not able to test this myself, so I can't rule out
that there are still issues left). For details, including further
smaller adjustments, please see the individual patches.

Jan



_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

^ permalink raw reply	[flat|nested] 465+ messages in thread

* [PATCH v4 01/44] x86emul: support AVX512 opmask insns
  2018-09-25 13:14 ` [PATCH v4 00/44] x86emul: fixes, improvements, and beginnings of AVX512 support Jan Beulich
@ 2018-09-25 13:25   ` Jan Beulich
  2018-09-26  6:06     ` Jan Beulich
  2018-09-25 13:26   ` [PATCH v4 02/44] x86/HVM: grow MMIO cache data size to 64 bytes Jan Beulich
                     ` (42 subsequent siblings)
  43 siblings, 1 reply; 465+ messages in thread
From: Jan Beulich @ 2018-09-25 13:25 UTC (permalink / raw)
  To: xen-devel; +Cc: George Dunlap, Andrew Cooper, Wei Liu

These are all VEX encoded, so the EVEX decoding logic remains unused
at this point.

The new testcase is deliberately coded in assembly, as a C one would
have become almost unreadable due to the overwhelming number of
__builtin_...() invocations that would need to be used. After all, the
compiler has no underlying type (yet) that could be operated on without
builtins, other than the vector types used for "normal" SIMD insns.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v3: Use distinct temporary file names in testcase.mk. Additions to clean
    target.

--- a/tools/tests/x86_emulator/Makefile
+++ b/tools/tests/x86_emulator/Makefile
@@ -16,6 +16,8 @@ FMA := fma4 fma
 SG := avx2-sg
 TESTCASES := blowfish $(SIMD) $(FMA) $(SG)
 
+OPMASK := avx512f avx512dq avx512bw
+
 blowfish-cflags := ""
 blowfish-cflags-x86_32 := "-mno-accumulate-outgoing-args -Dstatic="
 
@@ -51,6 +53,10 @@ xop-vecs := $(avx-vecs)
 xop-ints := 1 2 4 8
 xop-flts := $(avx-flts)
 
+avx512f-opmask-vecs := 2
+avx512dq-opmask-vecs := 1
+avx512bw-opmask-vecs := 4 8
+
 # For AVX and later, have the compiler avoid XMM0 to widen coverage of
 # the VEX.vvvv checks in the emulator.  For 3DNow!, however, force SSE
 # use for floating point operations, to avoid mixing MMX and FPU register
@@ -80,9 +86,13 @@ $(1)-cflags := \
 	   $(foreach flt,$($(1)-flts), \
 	     "-D_$(vec)x$(idx)f$(flt) -m$(1:-sg=) $(call non-sse,$(1)) -Os -DVEC_MAX=$(vec) -DIDX_SIZE=$(idx) -DFLOAT_SIZE=$(flt)")))
 endef
+define opmask-defs
+$(1)-opmask-cflags := $(foreach vec,$($(1)-opmask-vecs), "-D_$(vec) -m$(1) -Os -DSIZE=$(vec)")
+endef
 
 $(foreach flavor,$(SIMD) $(FMA),$(eval $(call simd-defs,$(flavor))))
 $(foreach flavor,$(SG),$(eval $(call simd-sg-defs,$(flavor))))
+$(foreach flavor,$(OPMASK),$(eval $(call opmask-defs,$(flavor))))
 
 $(addsuffix .h,$(TESTCASES)): %.h: %.c testcase.mk Makefile
 	rm -f $@.new $*.bin
@@ -100,6 +110,22 @@ $(addsuffix .h,$(TESTCASES)): %.h: %.c t
 	)
 	mv $@.new $@
 
+$(addsuffix -opmask.h,$(OPMASK)): %.h: opmask.S testcase.mk Makefile
+	rm -f $@.new $*.bin
+	$(foreach arch,$(filter-out $(XEN_COMPILE_ARCH),x86_32) $(XEN_COMPILE_ARCH), \
+	    for cflags in $($*-cflags) $($*-cflags-$(arch)); do \
+		$(MAKE) -f testcase.mk TESTCASE=$* XEN_TARGET_ARCH=$(arch) $*-cflags="$$cflags" all; \
+		prefix=$(shell echo $(subst -,_,$*) | sed -e 's,^\([0-9]\),_\1,'); \
+		flavor=$$(echo $${cflags} | sed -e 's, .*,,' -e 'y,-=,__,') ; \
+		(echo 'static const unsigned int __attribute__((section(".test, \"ax\", @progbits #")))' \
+		      "$${prefix}_$(arch)$${flavor}[] = {"; \
+		 od -v -t x $*.bin | sed -e 's/^[0-9]* /0x/' -e 's/ /, 0x/g' -e 's/$$/,/'; \
+		 echo "};") >>$@.new; \
+		rm -f $*.bin; \
+	    done; \
+	)
+	mv $@.new $@
+
 $(addsuffix .c,$(SIMD)):
 	ln -sf simd.c $@
 
@@ -118,7 +144,8 @@ $(TARGET): x86-emulate.o test_x86_emulat
 
 .PHONY: clean
 clean:
-	rm -rf $(TARGET) *.o *~ core $(addsuffix .h,$(TESTCASES)) *.bin x86_emulate
+	rm -rf $(TARGET) *.o *~ core *.bin x86_emulate
+	rm -rf $(TARGET) $(addsuffix .h,$(TESTCASES)) $(addsuffix -opmask.h,$(OPMASK))
 
 .PHONY: distclean
 distclean: clean
@@ -145,4 +172,4 @@ x86-emulate.o test_x86_emulator.o wrappe
 x86-emulate.o: x86_emulate/x86_emulate.c
 x86-emulate.o: HOSTCFLAGS += -D__XEN_TOOLS__
 
-test_x86_emulator.o: $(addsuffix .h,$(TESTCASES))
+test_x86_emulator.o: $(addsuffix .h,$(TESTCASES)) $(addsuffix -opmask.h,$(OPMASK))
--- /dev/null
+++ b/tools/tests/x86_emulator/opmask.S
@@ -0,0 +1,144 @@
+#ifdef __i386__
+# define R(x) e##x
+# define DATA(x) x
+#else
+# if SIZE == 8
+#  define R(x) r##x
+# else
+#  define R(x) e##x
+# endif
+# define DATA(x) x(%rip)
+#endif
+
+#if SIZE == 1
+# define _(x) x##b
+#elif SIZE == 2
+# define _(x) x##w
+# define WIDEN(x) x##bw
+#elif SIZE == 4
+# define _(x) x##d
+# define WIDEN(x) x##wd
+#elif SIZE == 8
+# define _(x) x##q
+# define WIDEN(x) x##dq
+#endif
+
+    .macro check res1:req, res2:req, line:req
+    _(kmov)       %\res1, DATA(out)
+#if SIZE < 8 || !defined(__i386__)
+    _(kmov)       %\res2, %R(dx)
+    cmp           DATA(out), %R(dx)
+#else
+    sub           $8, %esp
+    kmovq         %\res2, (%esp)
+    pop           %ecx
+    pop           %edx
+    cmp           DATA(out), %ecx
+    jne           0f
+    cmp           DATA(out+4), %edx
+0:
+#endif
+    je            1f
+    mov           $\line, %eax
+    ret
+1:
+    .endm
+
+    .text
+    .globl _start
+_start:
+    _(kmov)       DATA(in1), %k1
+#if SIZE < 8 || !defined(__i386__)
+    mov           DATA(in2), %R(ax)
+    _(kmov)       %R(ax), %k2
+#else
+    _(kmov)       DATA(in2), %k2
+#endif
+
+    _(kor)        %k1, %k2, %k3
+    _(kand)       %k1, %k2, %k4
+    _(kandn)      %k3, %k4, %k5
+    _(kxor)       %k1, %k2, %k6
+    check         k5, k6, __LINE__
+
+    _(knot)       %k6, %k3
+    _(kxnor)      %k1, %k2, %k4
+    check         k3, k4, __LINE__
+
+    _(kshiftl)    $1, %k1, %k3
+    _(kshiftl)    $2, %k3, %k4
+    _(kshiftl)    $3, %k1, %k5
+    check         k4, k5, __LINE__
+
+    _(kshiftr)    $1, %k1, %k3
+    _(kshiftr)    $2, %k3, %k4
+    _(kshiftr)    $3, %k1, %k5
+    check         k4, k5, __LINE__
+
+    _(kortest)    %k6, %k6
+    jnbe          1f
+    mov           $__LINE__, %eax
+    ret
+1:
+
+    _(kxor)       %k0, %k0, %k3
+    _(kortest)    %k3, %k3
+    jz            1f
+    mov           $__LINE__, %eax
+    ret
+1:
+
+    _(kxnor)      %k0, %k0, %k3
+    _(kortest)    %k3, %k3
+    jc            1f
+    mov           $__LINE__, %eax
+    ret
+1:
+
+#if SIZE > 1
+
+    _(kshiftr)    $SIZE*4, %k3, %k4
+    WIDEN(kunpck) %k4, %k4, %k5
+    check         k3, k5, __LINE__
+
+#endif
+
+#if SIZE != 2 || defined(__AVX512DQ__)
+
+    _(kadd)       %k1, %k1, %k3
+    _(kshiftl)    $1, %k1, %k4
+    check         k3, k4, __LINE__
+
+    _(ktest)      %k2, %k1
+    jnbe          1f
+    mov           $__LINE__, %eax
+    ret
+1:
+
+    _(kxor)       %k0, %k0, %k3
+    _(ktest)      %k0, %k3
+    jz            1f
+    mov           $__LINE__, %eax
+    ret
+1:
+
+    _(kxnor)      %k0, %k0, %k4
+    _(ktest)      %k0, %k4
+    jc            1f
+    mov           $__LINE__, %eax
+    ret
+1:
+
+#endif
+
+    xor           %eax, %eax
+    ret
+
+    .section .rodata, "a", @progbits
+    .balign 8
+in1: .byte 0b10110011, 0b10001111, 0b00001111, 0b10000011, 0b11110000, 0b00111111, 0b10000000, 0b11111111
+in2: .byte 0b11111111, 0b00000001, 0b11111100, 0b00001111, 0b11000001, 0b11110000, 0b11110001, 0b11001101
+
+    .data
+    .balign 8
+out: .quad 0
--- a/tools/tests/x86_emulator/test_x86_emulator.c
+++ b/tools/tests/x86_emulator/test_x86_emulator.c
@@ -18,6 +18,9 @@ asm ( ".pushsection .test, \"ax\", @prog
 #include "avx2.h"
 #include "avx2-sg.h"
 #include "xop.h"
+#include "avx512f-opmask.h"
+#include "avx512dq-opmask.h"
+#include "avx512bw-opmask.h"
 
 #define verbose false /* Switch to true for far more logging. */
 
@@ -78,6 +81,24 @@ static bool simd_check_xop(void)
     return cpu_has_xop;
 }
 
+static bool simd_check_avx512f(void)
+{
+    return cpu_has_avx512f;
+}
+#define simd_check_avx512f_opmask simd_check_avx512f
+
+static bool simd_check_avx512dq(void)
+{
+    return cpu_has_avx512dq;
+}
+#define simd_check_avx512dq_opmask simd_check_avx512dq
+
+static bool simd_check_avx512bw(void)
+{
+    return cpu_has_avx512bw;
+}
+#define simd_check_avx512bw_opmask simd_check_avx512bw
+
 static void simd_set_regs(struct cpu_user_regs *regs)
 {
     if ( cpu_has_mmx )
@@ -223,6 +244,10 @@ static const struct {
     SIMD(XOP i16x16,              xop,      32i2),
     SIMD(XOP i32x8,               xop,      32i4),
     SIMD(XOP i64x4,               xop,      32i8),
+    SIMD(OPMASK/w,     avx512f_opmask,         2),
+    SIMD(OPMASK/b,    avx512dq_opmask,         1),
+    SIMD(OPMASK/d,    avx512bw_opmask,         4),
+    SIMD(OPMASK/q,    avx512bw_opmask,         8),
 #undef SIMD_
 #undef SIMD
 };
@@ -3426,8 +3451,8 @@ int main(int argc, char **argv)
             rc = x86_emulate(&ctxt, &emulops);
             if ( rc != X86EMUL_OKAY )
             {
-                printf("failed at %%eip == %08lx (opcode %08x)\n",
-                       (unsigned long)regs.eip, ctxt.opcode);
+                printf("failed (%d) at %%eip == %08lx (opcode %08x)\n",
+                       rc, (unsigned long)regs.eip, ctxt.opcode);
                 return 1;
             }
         }
--- a/tools/tests/x86_emulator/testcase.mk
+++ b/tools/tests/x86_emulator/testcase.mk
@@ -14,3 +14,9 @@ all: $(TESTCASE).bin
 	$(LD) $(LDFLAGS_DIRECT) -N -Ttext 0x100000 -o $*.tmp $*.o
 	$(OBJCOPY) -O binary $*.tmp $@
 	rm -f $*.tmp
+
+%-opmask.bin: opmask.S
+	$(CC) $(filter-out -M% .%,$(CFLAGS)) -c $< -o $(basename $@).o
+	$(LD) $(LDFLAGS_DIRECT) -N -Ttext 0x100000 -o $(basename $@).tmp $(basename $@).o
+	$(OBJCOPY) -O binary $(basename $@).tmp $@
+	rm -f $(basename $@).tmp
--- a/tools/tests/x86_emulator/x86-emulate.c
+++ b/tools/tests/x86_emulator/x86-emulate.c
@@ -209,6 +209,9 @@ int emul_test_get_fpu(
     case X86EMUL_FPU_ymm:
         if ( cpu_has_avx )
             break;
+    case X86EMUL_FPU_opmask:
+        if ( cpu_has_avx512f )
+            break;
     default:
         return X86EMUL_UNHANDLEABLE;
     }
--- a/tools/tests/x86_emulator/x86-emulate.h
+++ b/tools/tests/x86_emulator/x86-emulate.h
@@ -236,6 +236,36 @@ static inline uint64_t xgetbv(uint32_t x
     (res.c & (1U << 21)) != 0; \
 })
 
+#define cpu_has_avx512f ({ \
+    struct cpuid_leaf res; \
+    emul_test_cpuid(1, 0, &res, NULL); \
+    if ( !(res.c & (1U << 27)) || ((xgetbv(0) & 0xe6) != 0xe6) ) \
+        res.b = 0; \
+    else \
+        emul_test_cpuid(7, 0, &res, NULL); \
+    (res.b & (1U << 16)) != 0; \
+})
+
+#define cpu_has_avx512dq ({ \
+    struct cpuid_leaf res; \
+    emul_test_cpuid(1, 0, &res, NULL); \
+    if ( !(res.c & (1U << 27)) || ((xgetbv(0) & 0xe6) != 0xe6) ) \
+        res.b = 0; \
+    else \
+        emul_test_cpuid(7, 0, &res, NULL); \
+    (res.b & (1U << 17)) != 0; \
+})
+
+#define cpu_has_avx512bw ({ \
+    struct cpuid_leaf res; \
+    emul_test_cpuid(1, 0, &res, NULL); \
+    if ( !(res.c & (1U << 27)) || ((xgetbv(0) & 0xe6) != 0xe6) ) \
+        res.b = 0; \
+    else \
+        emul_test_cpuid(7, 0, &res, NULL); \
+    (res.b & (1U << 30)) != 0; \
+})
+
 int emul_test_cpuid(
     uint32_t leaf,
     uint32_t subleaf,
--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -491,6 +491,7 @@ static const struct ext0f3a_table {
     [0x20] = { .simd_size = simd_none },
     [0x21] = { .simd_size = simd_other },
     [0x22] = { .simd_size = simd_none },
+    [0x30 ... 0x33] = { .simd_size = simd_other, .two_op = 1 },
     [0x38] = { .simd_size = simd_128 },
     [0x39] = { .simd_size = simd_128, .to_mem = 1, .two_op = 1 },
     [0x40 ... 0x41] = { .simd_size = simd_packed_fp },
@@ -1187,6 +1188,11 @@ static int _get_fpu(
             return X86EMUL_UNHANDLEABLE;
         break;
 
+    case X86EMUL_FPU_opmask:
+        if ( !(xcr0 & X86_XCR0_SSE) || !(xcr0 & X86_XCR0_OPMASK) )
+            return X86EMUL_UNHANDLEABLE;
+        break;
+
     default:
         break;
     }
@@ -1762,12 +1768,15 @@ static bool vcpu_has(
 #define vcpu_has_bmi2()        vcpu_has(         7, EBX,  8, ctxt, ops)
 #define vcpu_has_rtm()         vcpu_has(         7, EBX, 11, ctxt, ops)
 #define vcpu_has_mpx()         vcpu_has(         7, EBX, 14, ctxt, ops)
+#define vcpu_has_avx512f()     vcpu_has(         7, EBX, 16, ctxt, ops)
+#define vcpu_has_avx512dq()    vcpu_has(         7, EBX, 17, ctxt, ops)
 #define vcpu_has_rdseed()      vcpu_has(         7, EBX, 18, ctxt, ops)
 #define vcpu_has_adx()         vcpu_has(         7, EBX, 19, ctxt, ops)
 #define vcpu_has_smap()        vcpu_has(         7, EBX, 20, ctxt, ops)
 #define vcpu_has_clflushopt()  vcpu_has(         7, EBX, 23, ctxt, ops)
 #define vcpu_has_clwb()        vcpu_has(         7, EBX, 24, ctxt, ops)
 #define vcpu_has_sha()         vcpu_has(         7, EBX, 29, ctxt, ops)
+#define vcpu_has_avx512bw()    vcpu_has(         7, EBX, 30, ctxt, ops)
 #define vcpu_has_rdpid()       vcpu_has(         7, ECX, 22, ctxt, ops)
 #define vcpu_has_clzero()      vcpu_has(0x80000008, EBX,  0, ctxt, ops)
 
@@ -2396,6 +2405,18 @@ x86_decode_twobyte(
         }
         break;
 
+    case X86EMUL_OPC_VEX(0, 0x90):    /* kmov{w,q} */
+    case X86EMUL_OPC_VEX_66(0, 0x90): /* kmov{b,d} */
+        state->desc = DstReg | SrcMem | Mov;
+        state->simd_size = simd_other;
+        break;
+
+    case X86EMUL_OPC_VEX(0, 0x91):    /* kmov{w,q} */
+    case X86EMUL_OPC_VEX_66(0, 0x91): /* kmov{b,d} */
+        state->desc = DstMem | SrcReg | Mov;
+        state->simd_size = simd_other;
+        break;
+
     case 0xae:
         ctxt->opcode |= MASK_INSR(vex.pfx, X86EMUL_OPC_PFX_MASK);
         /* fall through */
@@ -6002,6 +6023,60 @@ x86_emulate(
             dst.val = src.val;
         break;
 
+    case X86EMUL_OPC_VEX(0x0f, 0x4a):    /* kadd{w,q} k,k,k */
+        if ( !vex.w )
+            host_and_vcpu_must_have(avx512dq);
+        /* fall through */
+    case X86EMUL_OPC_VEX(0x0f, 0x41):    /* kand{w,q} k,k,k */
+    case X86EMUL_OPC_VEX_66(0x0f, 0x41): /* kand{b,d} k,k,k */
+    case X86EMUL_OPC_VEX(0x0f, 0x42):    /* kandn{w,q} k,k,k */
+    case X86EMUL_OPC_VEX_66(0x0f, 0x42): /* kandn{b,d} k,k,k */
+    case X86EMUL_OPC_VEX(0x0f, 0x45):    /* kor{w,q} k,k,k */
+    case X86EMUL_OPC_VEX_66(0x0f, 0x45): /* kor{b,d} k,k,k */
+    case X86EMUL_OPC_VEX(0x0f, 0x46):    /* kxnor{w,q} k,k,k */
+    case X86EMUL_OPC_VEX_66(0x0f, 0x46): /* kxnor{b,d} k,k,k */
+    case X86EMUL_OPC_VEX(0x0f, 0x47):    /* kxor{w,q} k,k,k */
+    case X86EMUL_OPC_VEX_66(0x0f, 0x47): /* kxor{b,d} k,k,k */
+    case X86EMUL_OPC_VEX_66(0x0f, 0x4a): /* kadd{b,d} k,k,k */
+        generate_exception_if(!vex.l, EXC_UD);
+    opmask_basic:
+        if ( vex.w )
+            host_and_vcpu_must_have(avx512bw);
+        else if ( vex.pfx )
+            host_and_vcpu_must_have(avx512dq);
+    opmask_common:
+        host_and_vcpu_must_have(avx512f);
+        generate_exception_if(!vex.r || (mode_64bit() && !(vex.reg & 8)) ||
+                              ea.type != OP_REG, EXC_UD);
+
+        vex.reg |= 8;
+        d &= ~TwoOp;
+
+        get_fpu(X86EMUL_FPU_opmask);
+
+        opc = init_prefixes(stub);
+        opc[0] = b;
+        opc[1] = modrm;
+        insn_bytes = PFX_BYTES + 2;
+
+        state->simd_size = simd_other;
+        op_bytes = 1; /* Any non-zero value will do. */
+        break;
+
+    case X86EMUL_OPC_VEX(0x0f, 0x44):    /* knot{w,q} k,k */
+    case X86EMUL_OPC_VEX_66(0x0f, 0x44): /* knot{b,d} k,k */
+        generate_exception_if(vex.l || vex.reg != 0xf, EXC_UD);
+        goto opmask_basic;
+
+    case X86EMUL_OPC_VEX(0x0f, 0x4b):    /* kunpck{w,d}{d,q} k,k,k */
+        generate_exception_if(!vex.l, EXC_UD);
+        host_and_vcpu_must_have(avx512bw);
+        goto opmask_common;
+
+    case X86EMUL_OPC_VEX_66(0x0f, 0x4b): /* kunpckbw k,k,k */
+        generate_exception_if(!vex.l || vex.w, EXC_UD);
+        goto opmask_common;
+
     CASE_SIMD_PACKED_FP(, 0x0f, 0x50):     /* movmskp{s,d} xmm,reg */
     CASE_SIMD_PACKED_FP(_VEX, 0x0f, 0x50): /* vmovmskp{s,d} {x,y}mm,reg */
     CASE_SIMD_PACKED_INT(0x0f, 0xd7):      /* pmovmskb {,x}mm,reg */
@@ -6552,6 +6627,154 @@ x86_emulate(
         dst.val = test_cc(b, _regs.eflags);
         break;
 
+    case X86EMUL_OPC_VEX(0x0f, 0x91):    /* kmov{w,q} k,mem */
+    case X86EMUL_OPC_VEX_66(0x0f, 0x91): /* kmov{b,d} k,mem */
+        generate_exception_if(ea.type != OP_MEM, EXC_UD);
+        /* fall through */
+    case X86EMUL_OPC_VEX(0x0f, 0x90):    /* kmov{w,q} k/mem,k */
+    case X86EMUL_OPC_VEX_66(0x0f, 0x90): /* kmov{b,d} k/mem,k */
+        generate_exception_if(vex.l || !vex.r, EXC_UD);
+        host_and_vcpu_must_have(avx512f);
+        if ( vex.w )
+        {
+            host_and_vcpu_must_have(avx512bw);
+            op_bytes = 4 << !vex.pfx;
+        }
+        else if ( vex.pfx )
+        {
+            host_and_vcpu_must_have(avx512dq);
+            op_bytes = 1;
+        }
+        else
+            op_bytes = 2;
+
+        get_fpu(X86EMUL_FPU_opmask);
+
+        opc = init_prefixes(stub);
+        opc[0] = b;
+        opc[1] = modrm;
+        if ( ea.type == OP_MEM )
+        {
+            /* convert memory operand to (%rAX) */
+            vex.b = 1;
+            opc[1] &= 0x38;
+        }
+        insn_bytes = PFX_BYTES + 2;
+        break;
+
+    case X86EMUL_OPC_VEX(0x0f, 0x92):    /* kmovw r32,k */
+    case X86EMUL_OPC_VEX_66(0x0f, 0x92): /* kmovb r32,k */
+    case X86EMUL_OPC_VEX_F2(0x0f, 0x92): /* kmov{d,q} reg,k */
+        generate_exception_if(vex.l || !vex.r || vex.reg != 0xf ||
+                              ea.type != OP_REG, EXC_UD);
+
+        host_and_vcpu_must_have(avx512f);
+        if ( vex.pfx == vex_f2 )
+            host_and_vcpu_must_have(avx512bw);
+        else
+        {
+            generate_exception_if(vex.w, EXC_UD);
+            if ( vex.pfx )
+                host_and_vcpu_must_have(avx512dq);
+        }
+
+        get_fpu(X86EMUL_FPU_opmask);
+
+        opc = init_prefixes(stub);
+        opc[0] = b;
+        /* Convert GPR source to %rAX. */
+        vex.b = 1;
+        if ( !mode_64bit() )
+            vex.w = 0;
+        opc[1] = modrm & 0xf8;
+        opc[2] = 0xc3;
+
+        copy_VEX(opc, vex);
+        ea.reg = decode_gpr(&_regs, modrm_rm);
+        invoke_stub("", "", "=m" (dummy) : "a" (*ea.reg));
+
+        put_stub(stub);
+
+        ASSERT(!state->simd_size);
+        dst.type = OP_NONE;
+        break;
+
+    case X86EMUL_OPC_VEX(0x0f, 0x93):    /* kmovw k,r32 */
+    case X86EMUL_OPC_VEX_66(0x0f, 0x93): /* kmovb k,r32 */
+    case X86EMUL_OPC_VEX_F2(0x0f, 0x93): /* kmov{d,q} k,reg */
+        generate_exception_if(vex.l || vex.reg != 0xf || ea.type != OP_REG,
+                              EXC_UD);
+        dst = ea;
+        dst.reg = decode_gpr(&_regs, modrm_reg);
+
+        host_and_vcpu_must_have(avx512f);
+        if ( vex.pfx == vex_f2 )
+        {
+            host_and_vcpu_must_have(avx512bw);
+            dst.bytes = 4 << (mode_64bit() && vex.w);
+        }
+        else
+        {
+            generate_exception_if(vex.w, EXC_UD);
+            dst.bytes = 4;
+            if ( vex.pfx )
+                host_and_vcpu_must_have(avx512dq);
+        }
+
+        get_fpu(X86EMUL_FPU_opmask);
+
+        opc = init_prefixes(stub);
+        opc[0] = b;
+        /* Convert GPR destination to %rAX. */
+        vex.r = 1;
+        if ( !mode_64bit() )
+            vex.w = 0;
+        opc[1] = modrm & 0xc7;
+        opc[2] = 0xc3;
+
+        copy_VEX(opc, vex);
+        invoke_stub("", "", "=a" (dst.val) : [dummy] "i" (0));
+
+        put_stub(stub);
+
+        ASSERT(!state->simd_size);
+        break;
+
+    case X86EMUL_OPC_VEX(0x0f, 0x99):    /* ktest{w,q} k,k */
+        if ( !vex.w )
+            host_and_vcpu_must_have(avx512dq);
+        /* fall through */
+    case X86EMUL_OPC_VEX(0x0f, 0x98):    /* kortest{w,q} k,k */
+    case X86EMUL_OPC_VEX_66(0x0f, 0x98): /* kortest{b,d} k,k */
+    case X86EMUL_OPC_VEX_66(0x0f, 0x99): /* ktest{b,d} k,k */
+        generate_exception_if(vex.l || !vex.r || vex.reg != 0xf ||
+                              ea.type != OP_REG, EXC_UD);
+        host_and_vcpu_must_have(avx512f);
+        if ( vex.w )
+            host_and_vcpu_must_have(avx512bw);
+        else if ( vex.pfx )
+            host_and_vcpu_must_have(avx512dq);
+
+        get_fpu(X86EMUL_FPU_opmask);
+
+        opc = init_prefixes(stub);
+        opc[0] = b;
+        opc[1] = modrm;
+        opc[2] = 0xc3;
+
+        copy_VEX(opc, vex);
+        invoke_stub(_PRE_EFLAGS("[eflags]", "[mask]", "[tmp]"),
+                    _POST_EFLAGS("[eflags]", "[mask]", "[tmp]"),
+                    [eflags] "+g" (_regs.eflags),
+                    "=a" (dst.val), [tmp] "=&r" (dummy)
+                    : [mask] "i" (EFLAGS_MASK));
+
+        put_stub(stub);
+
+        ASSERT(!state->simd_size);
+        dst.type = OP_NONE;
+        break;
+
     case X86EMUL_OPC(0x0f, 0xa2): /* cpuid */
         msr_val = 0;
         fail_if(ops->cpuid == NULL);
@@ -8170,6 +8393,23 @@ x86_emulate(
         generate_exception_if(vex.l, EXC_UD);
         goto simd_0f_imm8_avx;
 
+    case X86EMUL_OPC_VEX_66(0x0f3a, 0x30): /* kshiftr{b,w} $imm8,k,k */
+    case X86EMUL_OPC_VEX_66(0x0f3a, 0x32): /* kshiftl{b,w} $imm8,k,k */
+        if ( !vex.w )
+            host_and_vcpu_must_have(avx512dq);
+    opmask_shift_imm:
+        generate_exception_if(vex.l || !vex.r || vex.reg != 0xf ||
+                              ea.type != OP_REG, EXC_UD);
+        host_and_vcpu_must_have(avx512f);
+        get_fpu(X86EMUL_FPU_opmask);
+        op_bytes = 1; /* Any non-zero value will do. */
+        goto simd_0f_imm8;
+
+    case X86EMUL_OPC_VEX_66(0x0f3a, 0x31): /* kshiftr{d,q} $imm8,k,k */
+    case X86EMUL_OPC_VEX_66(0x0f3a, 0x33): /* kshiftl{d,q} $imm8,k,k */
+        host_and_vcpu_must_have(avx512bw);
+        goto opmask_shift_imm;
+
     case X86EMUL_OPC_66(0x0f3a, 0x44):     /* pclmulqdq $imm8,xmm/m128,xmm */
     case X86EMUL_OPC_VEX_66(0x0f3a, 0x44): /* vpclmulqdq $imm8,xmm/m128,xmm,xmm */
         host_and_vcpu_must_have(pclmulqdq);
--- a/xen/arch/x86/x86_emulate/x86_emulate.h
+++ b/xen/arch/x86/x86_emulate/x86_emulate.h
@@ -170,6 +170,7 @@ enum x86_emulate_fpu_type {
     X86EMUL_FPU_mmx, /* MMX instruction set (%mm0-%mm7) */
     X86EMUL_FPU_xmm, /* SSE instruction set (%xmm0-%xmm7/15) */
     X86EMUL_FPU_ymm, /* AVX/XOP instruction set (%ymm0-%ymm7/15) */
+    X86EMUL_FPU_opmask, /* AVX512 opmask instruction set (%k0-%k7) */
     /* This sentinel will never be passed to ->get_fpu(). */
     X86EMUL_FPU_none
 };
--- a/xen/include/asm-x86/cpufeature.h
+++ b/xen/include/asm-x86/cpufeature.h
@@ -99,9 +99,12 @@
 #define cpu_has_rtm             boot_cpu_has(X86_FEATURE_RTM)
 #define cpu_has_fpu_sel         (!boot_cpu_has(X86_FEATURE_NO_FPU_SEL))
 #define cpu_has_mpx             boot_cpu_has(X86_FEATURE_MPX)
+#define cpu_has_avx512f         boot_cpu_has(X86_FEATURE_AVX512F)
+#define cpu_has_avx512dq        boot_cpu_has(X86_FEATURE_AVX512DQ)
 #define cpu_has_rdseed          boot_cpu_has(X86_FEATURE_RDSEED)
 #define cpu_has_smap            boot_cpu_has(X86_FEATURE_SMAP)
 #define cpu_has_sha             boot_cpu_has(X86_FEATURE_SHA)
+#define cpu_has_avx512bw        boot_cpu_has(X86_FEATURE_AVX512BW)
 
 /* CPUID level 0x80000007.edx */
 #define cpu_has_itsc            boot_cpu_has(X86_FEATURE_ITSC)




_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

^ permalink raw reply	[flat|nested] 465+ messages in thread

* [PATCH v4 02/44] x86/HVM: grow MMIO cache data size to 64 bytes
  2018-09-25 13:14 ` [PATCH v4 00/44] x86emul: fixes, improvements, and beginnings of AVX512 support Jan Beulich
  2018-09-25 13:25   ` [PATCH v4 01/44] x86emul: support AVX512 opmask insns Jan Beulich
@ 2018-09-25 13:26   ` Jan Beulich
  2018-09-25 13:27   ` [PATCH v4 03/44] x86emul: correct EVEX decoding Jan Beulich
                     ` (41 subsequent siblings)
  43 siblings, 0 replies; 465+ messages in thread
From: Jan Beulich @ 2018-09-25 13:26 UTC (permalink / raw)
  To: xen-devel; +Cc: George Dunlap, Andrew Cooper, Wei Liu

This is needed before enabling any AVX512 insns in the emulator. Change
the way alignment is enforced at the same time.

Add a check that the buffer won't actually overflow, and while at it
also convert the BUG_ON() checking that accesses don't cross page
boundaries into ASSERT_UNREACHABLE() plus an X86EMUL_UNHANDLEABLE
return.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Reviewed-by: Paul Durrant <paul.durrant@citrix.com>
---
v3: New.

--- a/xen/arch/x86/hvm/emulate.c
+++ b/xen/arch/x86/hvm/emulate.c
@@ -866,7 +866,18 @@ static int hvmemul_phys_mmio_access(
     int rc = X86EMUL_OKAY;
 
     /* Accesses must fall within a page. */
-    BUG_ON((gpa & ~PAGE_MASK) + size > PAGE_SIZE);
+    if ( (gpa & ~PAGE_MASK) + size > PAGE_SIZE )
+    {
+        ASSERT_UNREACHABLE();
+        return X86EMUL_UNHANDLEABLE;
+    }
+
+    /* Accesses must not overflow the cache's buffer. */
+    if ( size > sizeof(cache->buffer) )
+    {
+        ASSERT_UNREACHABLE();
+        return X86EMUL_UNHANDLEABLE;
+    }
 
     /*
      * hvmemul_do_io() cannot handle non-power-of-2 accesses or
--- a/xen/include/asm-x86/hvm/vcpu.h
+++ b/xen/include/asm-x86/hvm/vcpu.h
@@ -42,15 +42,14 @@ struct hvm_vcpu_asid {
 };
 
 /*
- * We may read or write up to m256 as a number of device-model
+ * We may read or write up to m512 as a number of device-model
  * transactions.
  */
 struct hvm_mmio_cache {
     unsigned long gla;
     unsigned int size;
     uint8_t dir;
-    uint8_t pad[3]; /* make buffer[] long-aligned */
-    uint8_t buffer[32];
+    uint8_t buffer[64] __aligned(sizeof(long));
 };
 
 struct hvm_vcpu_io {





_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

^ permalink raw reply	[flat|nested] 465+ messages in thread

* [PATCH v4 03/44] x86emul: correct EVEX decoding
  2018-09-25 13:14 ` [PATCH v4 00/44] x86emul: fixes, improvements, and beginnings of AVX512 support Jan Beulich
  2018-09-25 13:25   ` [PATCH v4 01/44] x86emul: support AVX512 opmask insns Jan Beulich
  2018-09-25 13:26   ` [PATCH v4 02/44] x86/HVM: grow MMIO cache data size to 64 bytes Jan Beulich
@ 2018-09-25 13:27   ` Jan Beulich
  2018-10-26 14:33     ` Andrew Cooper
  2018-09-25 13:28   ` [PATCH v4 04/44] x86emul: generalize vector length handling for AVX512/EVEX Jan Beulich
                     ` (40 subsequent siblings)
  43 siblings, 1 reply; 465+ messages in thread
From: Jan Beulich @ 2018-09-25 13:27 UTC (permalink / raw)
  To: xen-devel; +Cc: George Dunlap, Andrew Cooper, Wei Liu

Fix an inverted pair of checks, drop an incorrect instance of #UD
raising for non-64-bit mode, and add further generic checks.

Note: Contrary to what SDM Vol 2 rev 067 states, EVEX.V' is _not_
      ignored outside of 64-bit mode when the field does not encode a
      register. Just like EVEX.VVVV is required to be 0b1111 in that
      case, EVEX.V' is required to be 1 there.

Also rename the bcst field to br, as #UD generation for individual insns
will need to consider both of its possible meanings.

Signed-off-by: Jan Beulich <jbeulich@suse.com>

--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -650,7 +650,7 @@ union evex {
         uint8_t w:1;
         uint8_t opmsk:3;
         uint8_t RX:1;
-        uint8_t bcst:1;
+        uint8_t br:1;
         uint8_t lr:2;
         uint8_t z:1;
     };
@@ -2760,13 +2760,11 @@ x86_decode(
                         evex.raw[1] = vex.raw[1];
                         evex.raw[2] = insn_fetch_type(uint8_t);
 
-                        generate_exception_if(evex.mbs || !evex.mbz, EXC_UD);
+                        generate_exception_if(!evex.mbs || evex.mbz, EXC_UD);
+                        generate_exception_if(!evex.opmsk && evex.z, EXC_UD);
 
                         if ( !mode_64bit() )
-                        {
-                            generate_exception_if(!evex.RX, EXC_UD);
                             evex.R = 1;
-                        }
 
                         vex.opcx = evex.opcx;
                         break;
@@ -3404,6 +3402,7 @@ x86_emulate(
         d = (d & ~DstMask) | DstMem;
         /* Becomes a normal DstMem operation from here on. */
     case DstMem:
+        generate_exception_if(ea.type == OP_MEM && evex.z, EXC_UD);
         if ( state->simd_size )
         {
             generate_exception_if(lock_prefix, EXC_UD);





_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

^ permalink raw reply	[flat|nested] 465+ messages in thread

* [PATCH v4 04/44] x86emul: generalize vector length handling for AVX512/EVEX
  2018-09-25 13:14 ` [PATCH v4 00/44] x86emul: fixes, improvements, and beginnings of AVX512 support Jan Beulich
                     ` (2 preceding siblings ...)
  2018-09-25 13:27   ` [PATCH v4 03/44] x86emul: correct EVEX decoding Jan Beulich
@ 2018-09-25 13:28   ` Jan Beulich
  2018-10-26 16:10     ` Andrew Cooper
  2018-09-25 13:28   ` [PATCH v4 05/44] x86emul: support basic AVX512 moves Jan Beulich
                     ` (39 subsequent siblings)
  43 siblings, 1 reply; 465+ messages in thread
From: Jan Beulich @ 2018-09-25 13:28 UTC (permalink / raw)
  To: xen-devel; +Cc: George Dunlap, Andrew Cooper, Wei Liu

To allow for some code sharing where possible, copy VEX.L into EVEX.LR
even for VEX (or XOP) encoded insns. Make operand size determination
use this right away, at the same time adding consistency checks for the
EVEX scalar insn cases (the non-scalar ones aren't uniform enough for
the checking to be done in a central place like this).

Note that the broadcast case is not handled here, but will be taken care
of elsewhere (in just a single place rather than at least two).

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v3: Introduce evex_encoded() to replace open-coded evex.mbs checks.
v2: Don't raise #UD in simd_scalar_opc case when EVEX.W != low-opcode-bit.

--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -191,14 +191,14 @@ enum simd_opsize {
      * Ordinary packed integers:
      * - 64 bits without prefix 66 (MMX)
      * - 128 bits with prefix 66 (SSEn)
-     * - 128/256 bits depending on VEX.L (AVX)
+     * - 128/256/512 bits depending on VEX.L/EVEX.LR (AVX+)
      */
     simd_packed_int,
 
     /*
      * Ordinary packed/scalar floating point:
      * - 128 bits without prefix or with prefix 66 (SSEn)
-     * - 128/256 bits depending on VEX.L (AVX)
+     * - 128/256/512 bits depending on VEX.L/EVEX.LR (AVX+)
      * - 32 bits with prefix F3 (scalar single)
      * - 64 bits with prefix F2 (scalar doubgle)
      */
@@ -207,14 +207,14 @@ enum simd_opsize {
     /*
      * Packed floating point:
      * - 128 bits without prefix or with prefix 66 (SSEn)
-     * - 128/256 bits depending on VEX.L (AVX)
+     * - 128/256/512 bits depending on VEX.L/EVEX.LR (AVX+)
      */
     simd_packed_fp,
 
     /*
      * Single precision packed/scalar floating point:
      * - 128 bits without prefix (SSEn)
-     * - 128/256 bits depending on VEX.L, no prefix (AVX)
+     * - 128/256/512 bits depending on VEX.L/EVEX.LR (AVX+)
      * - 32 bits with prefix F3 (scalar)
      */
     simd_single_fp,
@@ -228,7 +228,7 @@ enum simd_opsize {
 
     /*
      * Scalar floating point:
-     * - 32/64 bits depending on VEX.W
+     * - 32/64 bits depending on VEX.W/EVEX.W
      */
     simd_scalar_vexw,
 
@@ -2249,6 +2249,7 @@ int x86emul_unhandleable_rw(
 #define lock_prefix (state->lock_prefix)
 #define vex (state->vex)
 #define evex (state->evex)
+#define evex_encoded() (evex.mbs)
 #define ea (state->ea)
 
 static int
@@ -2818,6 +2819,9 @@ x86_decode(
 
                 opcode |= b | MASK_INSR(vex.pfx, X86EMUL_OPC_PFX_MASK);
 
+                if ( !evex_encoded() )
+                    evex.lr = vex.l;
+
                 if ( !(d & ModRM) )
                     break;
 
@@ -3148,7 +3152,7 @@ x86_decode(
             }
             /* fall through */
         case vex_66:
-            op_bytes = 16 << vex.l;
+            op_bytes = 16 << evex.lr;
             break;
         default:
             op_bytes = 0;
@@ -3172,9 +3176,17 @@ x86_decode(
     case simd_any_fp:
         switch ( vex.pfx )
         {
-        default:     op_bytes = 16 << vex.l; break;
-        case vex_f3: op_bytes = 4;           break;
-        case vex_f2: op_bytes = 8;           break;
+        default:
+            op_bytes = 16 << evex.lr;
+            break;
+        case vex_f3:
+            generate_exception_if(evex_encoded() && evex.w, EXC_UD);
+            op_bytes = 4;
+            break;
+        case vex_f2:
+            generate_exception_if(evex_encoded() && !evex.w, EXC_UD);
+            op_bytes = 8;
+            break;
         }
         break;
 





_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

^ permalink raw reply	[flat|nested] 465+ messages in thread

* [PATCH v4 05/44] x86emul: support basic AVX512 moves
  2018-09-25 13:14 ` [PATCH v4 00/44] x86emul: fixes, improvements, and beginnings of AVX512 support Jan Beulich
                     ` (3 preceding siblings ...)
  2018-09-25 13:28   ` [PATCH v4 04/44] x86emul: generalize vector length handling for AVX512/EVEX Jan Beulich
@ 2018-09-25 13:28   ` Jan Beulich
  2018-11-13 17:12     ` Andrew Cooper
  2018-09-25 13:29   ` [PATCH v4 06/44] x86emul: test for correct EVEX Disp8 scaling Jan Beulich
                     ` (38 subsequent siblings)
  43 siblings, 1 reply; 465+ messages in thread
From: Jan Beulich @ 2018-09-25 13:28 UTC (permalink / raw)
  To: xen-devel; +Cc: George Dunlap, Andrew Cooper, Wei Liu

Note: SDM Vol 2 rev 067 is not really consistent about EVEX.L'L for LIG
      insns - the only place where this is made explicit is a table in
      the section titled "Vector Length Orthogonality": While such insns
      tolerate 0, 1, and 2, a value of 3 uniformly leads to #UD.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v4: Introduce d8s_dq64 to deal with 32-bit mode VMOVD with EVEX.W set.
    Adjust a comment.
v3: Restrict k-reg reading to insns with memory operand. Shrink scope of
    "disp8scale".
v2: Move "full" into more narrow scope.

--- a/tools/tests/x86_emulator/test_x86_emulator.c
+++ b/tools/tests/x86_emulator/test_x86_emulator.c
@@ -1985,6 +1985,53 @@ int main(int argc, char **argv)
     else
         printf("skipped\n");
 
+    printf("%-40s", "Testing {evex} vmovq %xmm1,32(%edx)...");
+    if ( stack_exec && cpu_has_avx512f )
+    {
+        decl_insn(evex_vmovq_to_mem);
+
+        asm volatile ( "pcmpgtb %%xmm1, %%xmm1\n"
+                       put_insn(evex_vmovq_to_mem, "%{evex%} vmovq %%xmm1, 32(%0)")
+                       :: "d" (NULL) );
+
+        memset(res, 0xdb, 64);
+        set_insn(evex_vmovq_to_mem);
+        regs.ecx = 0;
+        regs.edx = (unsigned long)res;
+        rc = x86_emulate(&ctxt, &emulops);
+        if ( rc != X86EMUL_OKAY || !check_eip(evex_vmovq_to_mem) ||
+             *((uint64_t *)res + 4) ||
+             memcmp(res, res + 10, 24) ||
+             memcmp(res, res + 6, 8) )
+            goto fail;
+        printf("okay\n");
+    }
+    else
+        printf("skipped\n");
+
+    printf("%-40s", "Testing {evex} vmovq 32(%edx),%xmm0...");
+    if ( stack_exec && cpu_has_avx512f )
+    {
+        decl_insn(evex_vmovq_from_mem);
+
+        asm volatile ( "pcmpeqb %%xmm0, %%xmm0\n"
+                       put_insn(evex_vmovq_from_mem, "%{evex%} vmovq 32(%0), %%xmm0")
+                       :: "d" (NULL) );
+
+        set_insn(evex_vmovq_from_mem);
+        rc = x86_emulate(&ctxt, &emulops);
+        if ( rc != X86EMUL_OKAY || !check_eip(evex_vmovq_from_mem) )
+            goto fail;
+        asm ( "vmovq %1, %%xmm1\n\t"
+              "vpcmpeqq %%zmm0, %%zmm1, %%k0\n"
+              "kmovw %%k0, %0" : "=r" (rc) : "m" (res[8]) );
+        if ( rc != 0xff )
+            goto fail;
+        printf("okay\n");
+    }
+    else
+        printf("skipped\n");
+
     printf("%-40s", "Testing movdqu %xmm2,(%ecx)...");
     if ( stack_exec && cpu_has_sse2 )
     {
@@ -2085,6 +2132,118 @@ int main(int argc, char **argv)
     else
         printf("skipped\n");
 
+    printf("%-40s", "Testing vmovdqu32 %zmm2,(%ecx){%k1}...");
+    if ( stack_exec && cpu_has_avx512f )
+    {
+        decl_insn(vmovdqu32_to_mem);
+
+        memset(res, 0x55, 128);
+
+        asm volatile ( "vpcmpeqd %%ymm2, %%ymm2, %%ymm2\n\t"
+                       "kmovw %1,%%k1\n"
+                       put_insn(vmovdqu32_to_mem,
+                                "vmovdqu32 %%zmm2, (%0)%{%%k1%}")
+                       :: "c" (NULL), "rm" (res[0]) );
+        set_insn(vmovdqu32_to_mem);
+
+        regs.ecx = (unsigned long)res;
+        rc = x86_emulate(&ctxt, &emulops);
+        if ( (rc != X86EMUL_OKAY) || memcmp(res + 16, res + 24, 32) ||
+             !check_eip(vmovdqu32_to_mem) )
+            goto fail;
+
+        res[16] = ~0; res[18] = ~0; res[20] = ~0; res[22] = ~0;
+        res[24] =  0; res[26] =  0; res[28] =  0; res[30] =  0;
+        if ( memcmp(res, res + 16, 64) )
+            goto fail;
+
+        printf("okay\n");
+    }
+    else
+        printf("skipped\n");
+
+    printf("%-40s", "Testing vmovdqu32 64(%edx),%zmm2{%k2}...");
+    if ( stack_exec && cpu_has_avx512f )
+    {
+        decl_insn(vmovdqu32_from_mem);
+
+        asm volatile ( "knotw %%k1, %%k2\n"
+                       put_insn(vmovdqu32_from_mem,
+                                "vmovdqu32 64(%0), %%zmm2%{%%k2%}")
+                       :: "d" (NULL) );
+
+        set_insn(vmovdqu32_from_mem);
+        regs.ecx = 0;
+        regs.edx = (unsigned long)res;
+        rc = x86_emulate(&ctxt, &emulops);
+        if ( rc != X86EMUL_OKAY || !check_eip(vmovdqu32_from_mem) )
+            goto fail;
+        asm ( "vpcmpeqd %1, %%zmm2, %%k0\n\t"
+              "kmovw %%k0, %0" : "=r" (rc) : "m" (res[0]) );
+        if ( rc != 0xffff )
+            goto fail;
+        printf("okay\n");
+    }
+    else
+        printf("skipped\n");
+
+    printf("%-40s", "Testing vmovdqu16 %zmm3,(%ecx){%k1}...");
+    if ( stack_exec && cpu_has_avx512bw )
+    {
+        decl_insn(vmovdqu16_to_mem);
+
+        memset(res, 0x55, 128);
+
+        asm volatile ( "vpcmpeqw %%ymm3, %%ymm3, %%ymm3\n\t"
+                       "kmovd %1,%%k1\n"
+                       put_insn(vmovdqu16_to_mem,
+                                "vmovdqu16 %%zmm3, (%0)%{%%k1%}")
+                       :: "c" (NULL), "rm" (res[0]) );
+        set_insn(vmovdqu16_to_mem);
+
+        regs.ecx = (unsigned long)res;
+        rc = x86_emulate(&ctxt, &emulops);
+        if ( (rc != X86EMUL_OKAY) || memcmp(res + 16, res + 24, 32) ||
+             !check_eip(vmovdqu16_to_mem) )
+            goto fail;
+
+        for ( i = 16; i < 24; ++i )
+            res[i] |= 0x0000ffff;
+        for ( ; i < 32; ++i )
+            res[i] &= 0xffff0000;
+        if ( memcmp(res, res + 16, 64) )
+            goto fail;
+
+        printf("okay\n");
+    }
+    else
+        printf("skipped\n");
+
+    printf("%-40s", "Testing vmovdqu16 64(%edx),%zmm3{%k2}...");
+    if ( stack_exec && cpu_has_avx512bw )
+    {
+        decl_insn(vmovdqu16_from_mem);
+
+        asm volatile ( "knotd %%k1, %%k2\n"
+                       put_insn(vmovdqu16_from_mem,
+                                "vmovdqu16 64(%0), %%zmm3%{%%k2%}")
+                       :: "d" (NULL) );
+
+        set_insn(vmovdqu16_from_mem);
+        regs.ecx = 0;
+        regs.edx = (unsigned long)res;
+        rc = x86_emulate(&ctxt, &emulops);
+        if ( rc != X86EMUL_OKAY || !check_eip(vmovdqu16_from_mem) )
+            goto fail;
+        asm ( "vpcmpeqw %1, %%zmm3, %%k0\n\t"
+              "kmovd %%k0, %0" : "=r" (rc) : "m" (res[0]) );
+        if ( rc != 0xffffffff )
+            goto fail;
+        printf("okay\n");
+    }
+    else
+        printf("skipped\n");
+
     printf("%-40s", "Testing movsd %xmm5,(%ecx)...");
     memset(res, 0x77, 64);
     memset(res + 10, 0x66, 8);
@@ -2186,6 +2345,71 @@ int main(int argc, char **argv)
     else
         printf("skipped\n");
 
+    printf("%-40s", "Testing vmovsd %xmm5,16(%ecx){%k3}...");
+    memset(res, 0x88, 128);
+    memset(res + 20, 0x77, 8);
+    if ( stack_exec && cpu_has_avx512f )
+    {
+        decl_insn(vmovsd_masked_to_mem);
+
+        asm volatile ( "vbroadcastsd %0, %%ymm5\n\t"
+                       "kxorw %%k3, %%k3, %%k3\n"
+                       put_insn(vmovsd_masked_to_mem,
+                                "vmovsd %%xmm5, 16(%1)%{%%k3%}")
+                       :: "m" (res[20]), "c" (NULL) );
+
+        set_insn(vmovsd_masked_to_mem);
+        regs.ecx = 0;
+        regs.edx = 0;
+        rc = x86_emulate(&ctxt, &emulops);
+        if ( (rc != X86EMUL_OKAY) || !check_eip(vmovsd_masked_to_mem) )
+            goto fail;
+
+        asm volatile ( "kmovw %0, %%k3\n" :: "m" (res[20]) );
+
+        set_insn(vmovsd_masked_to_mem);
+        regs.ecx = (unsigned long)res;
+        rc = x86_emulate(&ctxt, &emulops);
+        if ( (rc != X86EMUL_OKAY) || !check_eip(vmovsd_masked_to_mem) ||
+             memcmp(res, res + 16, 64) )
+            goto fail;
+
+        printf("okay\n");
+    }
+    else
+    {
+        printf("skipped\n");
+        memset(res + 4, 0x77, 8);
+    }
+
+    printf("%-40s", "Testing vmovaps (%edx),%zmm7{%k3}{z}...");
+    if ( stack_exec && cpu_has_avx512f )
+    {
+        decl_insn(vmovaps_masked_from_mem);
+
+        asm volatile ( "vpcmpeqd %%xmm7, %%xmm7, %%xmm7\n\t"
+                       "vbroadcastss %%xmm7, %%zmm7\n"
+                       put_insn(vmovaps_masked_from_mem,
+                                "vmovaps (%0), %%zmm7%{%%k3%}%{z%}")
+                       :: "d" (NULL) );
+
+        set_insn(vmovaps_masked_from_mem);
+        regs.edx = (unsigned long)res;
+        rc = x86_emulate(&ctxt, &emulops);
+        if ( rc != X86EMUL_OKAY || !check_eip(vmovaps_masked_from_mem) )
+            goto fail;
+        asm ( "vcmpeqps %1, %%zmm7, %%k0\n\t"
+              "vxorps %%xmm0, %%xmm0, %%xmm0\n\t"
+              "vcmpeqps %%zmm0, %%zmm7, %%k1\n\t"
+              "kxorw %%k1, %%k0, %%k0\n\t"
+              "kmovw %%k0, %0" : "=r" (rc) : "m" (res[16]) );
+        if ( rc != 0xffff )
+            goto fail;
+        printf("okay\n");
+    }
+    else
+        printf("skipped\n");
+
     printf("%-40s", "Testing movd %mm3,32(%ecx)...");
     if ( stack_exec && cpu_has_mmx )
     {
@@ -2341,6 +2565,55 @@ int main(int argc, char **argv)
     else
         printf("skipped\n");
 
+    printf("%-40s", "Testing {evex} vmovd %xmm3,32(%ecx)...");
+    if ( stack_exec && cpu_has_avx512f )
+    {
+        decl_insn(evex_vmovd_to_mem);
+
+        asm volatile ( "pcmpeqb %%xmm3, %%xmm3\n"
+                       put_insn(evex_vmovd_to_mem,
+                                "%{evex%} vmovd %%xmm3, 32(%0)")
+                       :: "c" (NULL) );
+
+        memset(res, 0xbd, 64);
+        set_insn(evex_vmovd_to_mem);
+        regs.ecx = (unsigned long)res;
+        regs.edx = 0;
+        rc = x86_emulate(&ctxt, &emulops);
+        if ( rc != X86EMUL_OKAY || !check_eip(evex_vmovd_to_mem) ||
+             res[8] + 1 ||
+             memcmp(res, res + 9, 28) ||
+             memcmp(res, res + 6, 8) )
+            goto fail;
+        printf("okay\n");
+    }
+    else
+        printf("skipped\n");
+
+    printf("%-40s", "Testing {evex} vmovd 32(%ecx),%xmm4...");
+    if ( stack_exec && cpu_has_avx512f )
+    {
+        decl_insn(evex_vmovd_from_mem);
+
+        asm volatile ( "pcmpeqb %%xmm4, %%xmm4\n"
+                       put_insn(evex_vmovd_from_mem,
+                                "%{evex%} vmovd 32(%0), %%xmm4")
+                       :: "c" (NULL) );
+
+        set_insn(evex_vmovd_from_mem);
+        rc = x86_emulate(&ctxt, &emulops);
+        if ( rc != X86EMUL_OKAY || !check_eip(evex_vmovd_from_mem) )
+            goto fail;
+        asm ( "vmovd %1, %%xmm0\n\t"
+              "vpcmpeqd %%zmm4, %%zmm0, %%k0\n\t"
+              "kmovw %%k0, %0" : "=r" (rc) : "m" (res[8]) );
+        if ( rc != 0xffff )
+            goto fail;
+        printf("okay\n");
+    }
+    else
+        printf("skipped\n");
+
     printf("%-40s", "Testing movd %mm3,%ebx...");
     if ( stack_exec && cpu_has_mmx )
     {
@@ -2507,6 +2780,57 @@ int main(int argc, char **argv)
     else
         printf("skipped\n");
 
+    printf("%-40s", "Testing {evex} vmovd %xmm2,%ebx...");
+    if ( stack_exec && cpu_has_avx512f )
+    {
+        decl_insn(evex_vmovd_to_reg);
+
+        /* See comment next to movd above. */
+        asm volatile ( "pcmpeqb %%xmm2, %%xmm2\n"
+                       put_insn(evex_vmovd_to_reg,
+                                "%{evex%} vmovd %%xmm2, %%ebx")
+                       :: );
+
+        set_insn(evex_vmovd_to_reg);
+#ifdef __x86_64__
+        regs.rbx = 0xbdbdbdbdbdbdbdbdUL;
+#else
+        regs.ebx = 0xbdbdbdbdUL;
+#endif
+        rc = x86_emulate(&ctxt, &emulops);
+        if ( (rc != X86EMUL_OKAY) || !check_eip(evex_vmovd_to_reg) ||
+             regs.ebx != 0xffffffff )
+            goto fail;
+        printf("okay\n");
+    }
+    else
+        printf("skipped\n");
+
+    printf("%-40s", "Testing {evex} vmovd %ebx,%xmm1...");
+    if ( stack_exec && cpu_has_avx512f )
+    {
+        decl_insn(evex_vmovd_from_reg);
+
+        /* See comment next to movd above. */
+        asm volatile ( "pcmpgtb %%xmm1, %%xmm1\n"
+                       put_insn(evex_vmovd_from_reg,
+                                "%{evex%} vmovd %%ebx, %%xmm1")
+                       :: );
+
+        set_insn(evex_vmovd_from_reg);
+        rc = x86_emulate(&ctxt, &emulops);
+        if ( (rc != X86EMUL_OKAY) || !check_eip(evex_vmovd_from_reg) )
+            goto fail;
+        asm ( "vmovd %1, %%xmm0\n\t"
+              "vpcmpeqd %%zmm1, %%zmm0, %%k0\n\t"
+              "kmovw %%k0, %0" : "=r" (rc) : "m" (res[8]) );
+        if ( rc != 0xffff )
+            goto fail;
+        printf("okay\n");
+    }
+    else
+        printf("skipped\n");
+
 #ifdef __x86_64__
     printf("%-40s", "Testing movq %mm3,32(%ecx)...");
     if ( stack_exec && cpu_has_mmx )
@@ -2584,6 +2908,36 @@ int main(int argc, char **argv)
     else
         printf("skipped\n");
 
+    printf("%-40s", "Testing {evex} vmovq %xmm11,32(%ecx)...");
+    if ( stack_exec && cpu_has_avx512f )
+    {
+        decl_insn(evex_vmovq_to_mem2);
+
+        asm volatile ( "pcmpeqb %%xmm11, %%xmm11\n"
+#if 0 /* This may not work, as the assembler might pick opcode D6. */
+                       put_insn(evex_vmovq_to_mem2,
+                                "{evex} vmovq %%xmm11, 32(%0)")
+#else
+                       put_insn(evex_vmovq_to_mem2,
+                                ".byte 0x62, 0xf1, 0xfd, 0x08, 0x7e, 0x49, 0x04")
+#endif
+                       :: "c" (NULL) );
+
+        memset(res, 0xbd, 64);
+        set_insn(evex_vmovq_to_mem2);
+        regs.ecx = (unsigned long)res;
+        regs.edx = 0;
+        rc = x86_emulate(&ctxt, &emulops);
+        if ( rc != X86EMUL_OKAY || !check_eip(evex_vmovq_to_mem2) ||
+             *((long *)res + 4) + 1 ||
+             memcmp(res, res + 10, 24) ||
+             memcmp(res, res + 6, 8) )
+            goto fail;
+        printf("okay\n");
+    }
+    else
+        printf("skipped\n");
+
     printf("%-40s", "Testing movq %mm3,%rbx...");
     if ( stack_exec && cpu_has_mmx )
     {
@@ -2643,6 +2997,28 @@ int main(int argc, char **argv)
     }
     else
         printf("skipped\n");
+
+    printf("%-40s", "Testing vmovq %xmm22,%rbx...");
+    if ( stack_exec && cpu_has_avx512f )
+    {
+        decl_insn(evex_vmovq_to_reg);
+
+        /* See comment next to movd above. */
+        asm volatile ( "pcmpeqq %%xmm2, %%xmm2\n\t"
+                       "vmovq %%xmm2, %%xmm22\n"
+                       put_insn(evex_vmovq_to_reg, "vmovq %%xmm22, %%rbx")
+                       :: );
+
+        set_insn(evex_vmovq_to_reg);
+        regs.rbx = 0xbdbdbdbdbdbdbdbdUL;
+        rc = x86_emulate(&ctxt, &emulops);
+        if ( rc != X86EMUL_OKAY || !check_eip(evex_vmovq_to_reg) ||
+             regs.rbx + 1 )
+            goto fail;
+        printf("okay\n");
+    }
+    else
+        printf("skipped\n");
 #endif
 
     printf("%-40s", "Testing maskmovq %mm4,%mm4...");
@@ -2812,6 +3188,32 @@ int main(int argc, char **argv)
             goto fail;
         printf("okay\n");
     }
+    else
+        printf("skipped\n");
+
+    printf("%-40s", "Testing vmovntdqa 64(%ecx),%zmm4...");
+    if ( stack_exec && cpu_has_avx512f )
+    {
+        decl_insn(evex_vmovntdqa);
+
+        asm volatile ( "vpxor %%xmm4, %%xmm4, %%xmm4\n"
+                       put_insn(evex_vmovntdqa, "vmovntdqa 64(%0), %%zmm4")
+                       :: "c" (NULL) );
+
+        set_insn(evex_vmovntdqa);
+        memset(res, 0x55, 192);
+        memset(res + 16, 0xff, 64);
+        regs.ecx = (unsigned long)res;
+        rc = x86_emulate(&ctxt, &emulops);
+        if ( rc != X86EMUL_OKAY || !check_eip(evex_vmovntdqa) )
+            goto fail;
+        asm ( "vpbroadcastd %1, %%zmm2\n\t"
+              "vpcmpeqd %%zmm4, %%zmm2, %%k0\n\t"
+              "kmovw %%k0, %0" : "=r" (rc) : "0" (~0) );
+        if ( rc != 0xffff )
+            goto fail;
+        printf("okay\n");
+    }
     else
         printf("skipped\n");
 
--- a/tools/tests/x86_emulator/x86-emulate.c
+++ b/tools/tests/x86_emulator/x86-emulate.c
@@ -210,6 +210,7 @@ int emul_test_get_fpu(
         if ( cpu_has_avx )
             break;
     case X86EMUL_FPU_opmask:
+    case X86EMUL_FPU_zmm:
         if ( cpu_has_avx512f )
             break;
     default:
--- a/tools/tests/x86_emulator/x86-emulate.h
+++ b/tools/tests/x86_emulator/x86-emulate.h
@@ -266,6 +266,16 @@ static inline uint64_t xgetbv(uint32_t x
     (res.b & (1U << 30)) != 0; \
 })
 
+#define cpu_has_avx512vl ({ \
+    struct cpuid_leaf res; \
+    emul_test_cpuid(1, 0, &res, NULL); \
+    if ( !(res.c & (1U << 27)) || ((xgetbv(0) & 0xe6) != 0xe6) ) \
+        res.b = 0; \
+    else \
+        emul_test_cpuid(7, 0, &res, NULL); \
+    (res.b & (1U << 31)) != 0; \
+})
+
 int emul_test_cpuid(
     uint32_t leaf,
     uint32_t subleaf,
--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -243,9 +243,27 @@ enum simd_opsize {
 };
 typedef uint8_t simd_opsize_t;
 
+enum disp8scale {
+    /* Values 0 ... 4 are explicit sizes. */
+    d8s_bw = 5,
+    d8s_dq,
+    /* EVEX.W ignored outside of 64-bit mode */
+    d8s_dq64,
+    /*
+     * All further values must strictly be last and in the order
+     * given so that arithmetic on the values works.
+     */
+    d8s_vl,
+    d8s_vl_by_2,
+    d8s_vl_by_4,
+    d8s_vl_by_8,
+};
+typedef uint8_t disp8scale_t;
+
 static const struct twobyte_table {
     opcode_desc_t desc;
-    simd_opsize_t size;
+    simd_opsize_t size:4;
+    disp8scale_t d8s:4;
 } twobyte_table[256] = {
     [0x00] = { ModRM },
     [0x01] = { ImplicitOps|ModRM },
@@ -260,8 +278,8 @@ static const struct twobyte_table {
     [0x0d] = { ImplicitOps|ModRM },
     [0x0e] = { ImplicitOps },
     [0x0f] = { ModRM|SrcImmByte },
-    [0x10] = { DstImplicit|SrcMem|ModRM|Mov, simd_any_fp },
-    [0x11] = { DstMem|SrcImplicit|ModRM|Mov, simd_any_fp },
+    [0x10] = { DstImplicit|SrcMem|ModRM|Mov, simd_any_fp, d8s_vl },
+    [0x11] = { DstMem|SrcImplicit|ModRM|Mov, simd_any_fp, d8s_vl },
     [0x12] = { DstImplicit|SrcMem|ModRM|Mov, simd_other },
     [0x13] = { DstMem|SrcImplicit|ModRM|Mov, simd_other },
     [0x14 ... 0x15] = { DstImplicit|SrcMem|ModRM, simd_packed_fp },
@@ -270,10 +288,10 @@ static const struct twobyte_table {
     [0x18 ... 0x1f] = { ImplicitOps|ModRM },
     [0x20 ... 0x21] = { DstMem|SrcImplicit|ModRM },
     [0x22 ... 0x23] = { DstImplicit|SrcMem|ModRM },
-    [0x28] = { DstImplicit|SrcMem|ModRM|Mov, simd_packed_fp },
-    [0x29] = { DstMem|SrcImplicit|ModRM|Mov, simd_packed_fp },
+    [0x28] = { DstImplicit|SrcMem|ModRM|Mov, simd_packed_fp, d8s_vl },
+    [0x29] = { DstMem|SrcImplicit|ModRM|Mov, simd_packed_fp, d8s_vl },
     [0x2a] = { DstImplicit|SrcMem|ModRM|Mov, simd_other },
-    [0x2b] = { DstMem|SrcImplicit|ModRM|Mov, simd_any_fp },
+    [0x2b] = { DstMem|SrcImplicit|ModRM|Mov, simd_any_fp, d8s_vl },
     [0x2c ... 0x2d] = { DstImplicit|SrcMem|ModRM|Mov, simd_other },
     [0x2e ... 0x2f] = { ImplicitOps|ModRM|TwoOp },
     [0x30 ... 0x35] = { ImplicitOps },
@@ -292,8 +310,8 @@ static const struct twobyte_table {
     [0x63 ... 0x67] = { DstImplicit|SrcMem|ModRM, simd_packed_int },
     [0x68 ... 0x6a] = { DstImplicit|SrcMem|ModRM, simd_other },
     [0x6b ... 0x6d] = { DstImplicit|SrcMem|ModRM, simd_packed_int },
-    [0x6e] = { DstImplicit|SrcMem|ModRM|Mov },
-    [0x6f] = { DstImplicit|SrcMem|ModRM|Mov, simd_packed_int },
+    [0x6e] = { DstImplicit|SrcMem|ModRM|Mov, simd_none, d8s_dq64 },
+    [0x6f] = { DstImplicit|SrcMem|ModRM|Mov, simd_packed_int, d8s_vl },
     [0x70] = { SrcImmByte|ModRM|TwoOp, simd_other },
     [0x71 ... 0x73] = { DstImplicit|SrcImmByte|ModRM },
     [0x74 ... 0x76] = { DstImplicit|SrcMem|ModRM, simd_packed_int },
@@ -301,8 +319,8 @@ static const struct twobyte_table {
     [0x78] = { ImplicitOps|ModRM },
     [0x79] = { DstReg|SrcMem|ModRM, simd_packed_int },
     [0x7c ... 0x7d] = { DstImplicit|SrcMem|ModRM, simd_other },
-    [0x7e] = { DstMem|SrcImplicit|ModRM|Mov },
-    [0x7f] = { DstMem|SrcImplicit|ModRM|Mov, simd_packed_int },
+    [0x7e] = { DstMem|SrcImplicit|ModRM|Mov, simd_none, d8s_dq64 },
+    [0x7f] = { DstMem|SrcImplicit|ModRM|Mov, simd_packed_int, d8s_vl },
     [0x80 ... 0x8f] = { DstImplicit|SrcImm },
     [0x90 ... 0x9f] = { ByteOp|DstMem|SrcNone|ModRM|Mov },
     [0xa0 ... 0xa1] = { ImplicitOps|Mov },
@@ -344,14 +362,14 @@ static const struct twobyte_table {
     [0xd0] = { DstImplicit|SrcMem|ModRM, simd_other },
     [0xd1 ... 0xd3] = { DstImplicit|SrcMem|ModRM, simd_other },
     [0xd4 ... 0xd5] = { DstImplicit|SrcMem|ModRM, simd_packed_int },
-    [0xd6] = { DstMem|SrcImplicit|ModRM|Mov, simd_other },
+    [0xd6] = { DstMem|SrcImplicit|ModRM|Mov, simd_other, 3 },
     [0xd7] = { DstReg|SrcImplicit|ModRM|Mov },
     [0xd8 ... 0xdf] = { DstImplicit|SrcMem|ModRM, simd_packed_int },
     [0xe0] = { DstImplicit|SrcMem|ModRM, simd_packed_int },
     [0xe1 ... 0xe2] = { DstImplicit|SrcMem|ModRM, simd_other },
     [0xe3 ... 0xe5] = { DstImplicit|SrcMem|ModRM, simd_packed_int },
     [0xe6] = { DstImplicit|SrcMem|ModRM|Mov, simd_other },
-    [0xe7] = { DstMem|SrcImplicit|ModRM|Mov, simd_packed_int },
+    [0xe7] = { DstMem|SrcImplicit|ModRM|Mov, simd_packed_int, d8s_vl },
     [0xe8 ... 0xef] = { DstImplicit|SrcMem|ModRM, simd_packed_int },
     [0xf0] = { DstImplicit|SrcMem|ModRM|Mov, simd_other },
     [0xf1 ... 0xf3] = { DstImplicit|SrcMem|ModRM, simd_other },
@@ -406,6 +424,7 @@ static const struct ext0f38_table {
     uint8_t to_mem:1;
     uint8_t two_op:1;
     uint8_t vsib:1;
+    disp8scale_t d8s:4;
 } ext0f38_table[256] = {
     [0x00 ... 0x0b] = { .simd_size = simd_packed_int },
     [0x0c ... 0x0f] = { .simd_size = simd_packed_fp },
@@ -418,7 +437,7 @@ static const struct ext0f38_table {
     [0x1c ... 0x1e] = { .simd_size = simd_packed_int, .two_op = 1 },
     [0x20 ... 0x25] = { .simd_size = simd_other, .two_op = 1 },
     [0x28 ... 0x29] = { .simd_size = simd_packed_int },
-    [0x2a] = { .simd_size = simd_packed_int, .two_op = 1 },
+    [0x2a] = { .simd_size = simd_packed_int, .two_op = 1, .d8s = d8s_vl },
     [0x2b] = { .simd_size = simd_packed_int },
     [0x2c ... 0x2d] = { .simd_size = simd_other },
     [0x2e ... 0x2f] = { .simd_size = simd_other, .to_mem = 1 },
@@ -656,6 +675,22 @@ union evex {
     };
 };
 
+#define EVEX_PFX_BYTES 4
+#define init_evex(stub) ({ \
+    uint8_t *buf_ = get_stub(stub); \
+    buf_[0] = 0x62; \
+    buf_ + EVEX_PFX_BYTES; \
+})
+
+#define copy_EVEX(ptr, evex) ({ \
+    if ( !mode_64bit() ) \
+        (evex).reg |= 8; \
+    (ptr)[1 - EVEX_PFX_BYTES] = (evex).raw[0]; \
+    (ptr)[2 - EVEX_PFX_BYTES] = (evex).raw[1]; \
+    (ptr)[3 - EVEX_PFX_BYTES] = (evex).raw[2]; \
+    container_of((ptr) + 1 - EVEX_PFX_BYTES, typeof(evex), raw[0]); \
+})
+
 #define rep_prefix()   (vex.pfx >= vex_f3)
 #define repe_prefix()  (vex.pfx == vex_f3)
 #define repne_prefix() (vex.pfx == vex_f2)
@@ -768,6 +803,7 @@ typedef union {
     uint64_t mmx;
     uint64_t __attribute__ ((aligned(16))) xmm[2];
     uint64_t __attribute__ ((aligned(32))) ymm[4];
+    uint64_t __attribute__ ((aligned(64))) zmm[8];
 } mmval_t;
 
 /*
@@ -1183,6 +1219,11 @@ static int _get_fpu(
 
     switch ( type )
     {
+    case X86EMUL_FPU_zmm:
+        if ( !(xcr0 & X86_XCR0_ZMM) || !(xcr0 & X86_XCR0_HI_ZMM) ||
+             !(xcr0 & X86_XCR0_OPMASK) )
+            return X86EMUL_UNHANDLEABLE;
+        /* fall through */
     case X86EMUL_FPU_ymm:
         if ( !(xcr0 & X86_XCR0_SSE) || !(xcr0 & X86_XCR0_YMM) )
             return X86EMUL_UNHANDLEABLE;
@@ -1777,6 +1818,7 @@ static bool vcpu_has(
 #define vcpu_has_clwb()        vcpu_has(         7, EBX, 24, ctxt, ops)
 #define vcpu_has_sha()         vcpu_has(         7, EBX, 29, ctxt, ops)
 #define vcpu_has_avx512bw()    vcpu_has(         7, EBX, 30, ctxt, ops)
+#define vcpu_has_avx512vl()    vcpu_has(         7, EBX, 31, ctxt, ops)
 #define vcpu_has_rdpid()       vcpu_has(         7, ECX, 22, ctxt, ops)
 #define vcpu_has_clzero()      vcpu_has(0x80000008, EBX,  0, ctxt, ops)
 
@@ -2150,6 +2192,65 @@ static unsigned long *decode_vex_gpr(
     return decode_gpr(regs, ~vex_reg & (mode_64bit() ? 0xf : 7));
 }
 
+static unsigned int decode_disp8scale(enum disp8scale scale,
+                                      const struct x86_emulate_state *state)
+{
+    switch ( scale )
+    {
+    case d8s_bw:
+        return state->evex.w;
+
+    default:
+        if ( scale < d8s_vl )
+            return scale;
+        if ( state->evex.br )
+        {
+    case d8s_dq:
+            return 2 + state->evex.w;
+        }
+        break;
+
+    case d8s_dq64:
+        return 2 + (state->op_bytes == 8);
+    }
+
+    switch ( state->simd_size )
+    {
+    case simd_any_fp:
+    case simd_single_fp:
+        if ( !(state->evex.pfx & VEX_PREFIX_SCALAR_MASK) )
+            break;
+        /* fall through */
+    case simd_scalar_opc:
+    case simd_scalar_vexw:
+        return 2 + state->evex.w;
+
+    case simd_128:
+        /* These should have an explicit size specified. */
+        ASSERT_UNREACHABLE();
+        return 4;
+
+    default:
+        break;
+    }
+
+    return 4 + state->evex.lr - (scale - d8s_vl);
+}
+
+#define avx512_vlen_check(lig) do { \
+    switch ( evex.lr ) \
+    { \
+    default: \
+        generate_exception(EXC_UD); \
+    case 2: \
+        break; \
+    case 0: case 1: \
+        if (!(lig)) \
+            host_and_vcpu_must_have(avx512vl); \
+        break; \
+    } \
+} while ( false )
+
 static bool is_aligned(enum x86_segment seg, unsigned long offs,
                        unsigned int size, struct x86_emulate_ctxt *ctxt,
                        const struct x86_emulate_ops *ops)
@@ -2399,6 +2500,7 @@ x86_decode_twobyte(
         if ( vex.pfx == vex_f3 ) /* movq xmm/m64,xmm */
         {
     case X86EMUL_OPC_VEX_F3(0, 0x7e): /* vmovq xmm/m64,xmm */
+    case X86EMUL_OPC_EVEX_F3(0, 0x7e): /* vmovq xmm/m64,xmm */
             state->desc = DstImplicit | SrcMem | TwoOp;
             state->simd_size = simd_other;
             /* Avoid the state->desc clobbering of TwoOp below. */
@@ -2469,7 +2571,7 @@ x86_decode_twobyte(
     }
 
     /*
-     * Scalar forms of most VEX-encoded TwoOp instructions have
+     * Scalar forms of most VEX-/EVEX-encoded TwoOp instructions have
      * three operands.  Those which do really have two operands
      * should have exited earlier.
      */
@@ -2834,6 +2936,8 @@ x86_decode(
 
     if ( d & ModRM )
     {
+        unsigned int disp8scale = 0;
+
         d &= ~ModRM;
 #undef ModRM /* Only its aliases are valid to use from here on. */
         modrm_reg = ((rex_prefix & 4) << 1) | ((modrm & 0x38) >> 3);
@@ -2876,6 +2980,9 @@ x86_decode(
             break;
 
         case ext_0f:
+            if ( evex_encoded() )
+                disp8scale = decode_disp8scale(twobyte_table[b].d8s, state);
+
             switch ( b )
             {
             case 0x20: /* mov cr,reg */
@@ -2889,6 +2996,11 @@ x86_decode(
                  */
                 modrm_mod = 3;
                 break;
+
+            case 0x7e: /* vmovq xmm/m64,xmm needs special casing */
+                if ( disp8scale == 2 && evex.pfx == vex_f3 )
+                    disp8scale = 3;
+                break;
             }
             break;
 
@@ -2900,6 +3012,8 @@ x86_decode(
             if ( ext0f38_table[b].vsib )
                 d |= vSIB;
             state->simd_size = ext0f38_table[b].simd_size;
+            if ( evex_encoded() )
+                disp8scale = decode_disp8scale(ext0f38_table[b].d8s, state);
             break;
 
         case ext_8f09:
@@ -2968,7 +3082,7 @@ x86_decode(
                     ea.mem.off = insn_fetch_type(int16_t);
                 break;
             case 1:
-                ea.mem.off += insn_fetch_type(int8_t);
+                ea.mem.off += insn_fetch_type(int8_t) << disp8scale;
                 break;
             case 2:
                 ea.mem.off += insn_fetch_type(int16_t);
@@ -3027,7 +3141,7 @@ x86_decode(
                 pc_rel = mode_64bit();
                 break;
             case 1:
-                ea.mem.off += insn_fetch_type(int8_t);
+                ea.mem.off += insn_fetch_type(int8_t) << disp8scale;
                 break;
             case 2:
                 ea.mem.off += insn_fetch_type(int32_t);
@@ -3228,10 +3342,11 @@ x86_emulate(
     struct x86_emulate_state state;
     int rc;
     uint8_t b, d, *opc = NULL;
-    unsigned int first_byte = 0, insn_bytes = 0;
+    unsigned int first_byte = 0, elem_bytes, insn_bytes = 0;
+    uint64_t op_mask = ~0ULL;
     bool singlestep = (_regs.eflags & X86_EFLAGS_TF) &&
 	    !is_branch_step(ctxt, ops);
-    bool sfence = false;
+    bool sfence = false, fault_suppression = false;
     struct operand src = { .reg = PTR_POISON };
     struct operand dst = { .reg = PTR_POISON };
     unsigned long cr4;
@@ -3272,6 +3387,7 @@ x86_emulate(
     b = ctxt->opcode;
     d = state.desc;
 #define state (&state)
+    elem_bytes = 4 << evex.w;
 
     generate_exception_if(state->not_64bit && mode_64bit(), EXC_UD);
 
@@ -3346,6 +3462,28 @@ x86_emulate(
         break;
     }
 
+    /* With a memory operand, fetch the mask register in use (if any). */
+    if ( ea.type == OP_MEM && evex.opmsk )
+    {
+        uint8_t *stb = get_stub(stub);
+
+        /* KMOV{W,Q} %k<n>, (%rax) */
+        stb[0] = 0xc4;
+        stb[1] = 0xe1;
+        stb[2] = cpu_has_avx512bw ? 0xf8 : 0x78;
+        stb[3] = 0x91;
+        stb[4] = evex.opmsk << 3;
+        insn_bytes = 5;
+        stb[5] = 0xc3;
+
+        invoke_stub("", "", "+m" (op_mask) : "a" (&op_mask));
+
+        insn_bytes = 0;
+        put_stub(stub);
+
+        fault_suppression = true;
+    }
+
     /* Decode (but don't fetch) the destination operand: register or memory. */
     switch ( d & DstMask )
     {
@@ -5708,6 +5846,41 @@ x86_emulate(
         insn_bytes = PFX_BYTES + 2;
         break;
 
+    CASE_SIMD_PACKED_FP(_EVEX, 0x0f, 0x2b): /* vmovntp{s,d} [xyz]mm,mem */
+        generate_exception_if(ea.type != OP_MEM || evex.opmsk, EXC_UD);
+        sfence = true;
+        fault_suppression = false;
+        /* fall through */
+    CASE_SIMD_PACKED_FP(_EVEX, 0x0f, 0x10): /* vmovup{s,d} [xyz]mm/mem,[xyz]mm{k} */
+    CASE_SIMD_SCALAR_FP(_EVEX, 0x0f, 0x10): /* vmovs{s,d} mem,xmm{k} */
+                                            /* vmovs{s,d} xmm,xmm,xmm{k} */
+    CASE_SIMD_PACKED_FP(_EVEX, 0x0f, 0x11): /* vmovup{s,d} [xyz]mm,[xyz]mm/mem{k} */
+    CASE_SIMD_SCALAR_FP(_EVEX, 0x0f, 0x11): /* vmovs{s,d} xmm,mem{k} */
+                                            /* vmovs{s,d} xmm,xmm,xmm{k} */
+    CASE_SIMD_PACKED_FP(_EVEX, 0x0f, 0x28): /* vmovap{s,d} [xyz]mm/mem,[xyz]mm{k} */
+    CASE_SIMD_PACKED_FP(_EVEX, 0x0f, 0x29): /* vmovap{s,d} [xyz]mm,[xyz]mm/mem{k} */
+        /* vmovs{s,d} to/from memory have only two operands. */
+        if ( (b & ~1) == 0x10 && ea.type == OP_MEM )
+            d |= TwoOp;
+        generate_exception_if(evex.br, EXC_UD);
+        generate_exception_if(evex.w != (evex.pfx & VEX_PREFIX_DOUBLE_MASK),
+                              EXC_UD);
+        host_and_vcpu_must_have(avx512f);
+        avx512_vlen_check(evex.pfx & VEX_PREFIX_SCALAR_MASK);
+    simd_zmm:
+        get_fpu(X86EMUL_FPU_zmm);
+        opc = init_evex(stub);
+        opc[0] = b;
+        opc[1] = modrm;
+        if ( ea.type == OP_MEM )
+        {
+            /* convert memory operand to (%rAX) */
+            evex.b = 1;
+            opc[1] &= 0x38;
+        }
+        insn_bytes = EVEX_PFX_BYTES + 2;
+        break;
+
     case X86EMUL_OPC_66(0x0f, 0x12):       /* movlpd m64,xmm */
     case X86EMUL_OPC_VEX_66(0x0f, 0x12):   /* vmovlpd m64,xmm,xmm */
     CASE_SIMD_PACKED_FP(, 0x0f, 0x13):     /* movlp{s,d} xmm,m64 */
@@ -6348,6 +6521,41 @@ x86_emulate(
         ASSERT(!state->simd_size);
         break;
 
+    case X86EMUL_OPC_EVEX_66(0x0f, 0x6e): /* vmov{d,q} r/m,xmm */
+    case X86EMUL_OPC_EVEX_66(0x0f, 0x7e): /* vmov{d,q} xmm,r/m */
+        generate_exception_if((evex.lr || evex.opmsk || evex.br ||
+                               evex.reg != 0xf || !evex.RX),
+                              EXC_UD);
+        host_and_vcpu_must_have(avx512f);
+        get_fpu(X86EMUL_FPU_zmm);
+
+        opc = init_evex(stub);
+        opc[0] = b;
+        /* Convert memory/GPR operand to (%rAX). */
+        evex.b = 1;
+        if ( !mode_64bit() )
+            evex.w = 0;
+        opc[1] = modrm & 0x38;
+        insn_bytes = EVEX_PFX_BYTES + 2;
+        opc[2] = 0xc3;
+
+        copy_EVEX(opc, evex);
+        invoke_stub("", "", "+m" (src.val) : "a" (&src.val));
+        dst.val = src.val;
+
+        put_stub(stub);
+        ASSERT(!state->simd_size);
+        break;
+
+    case X86EMUL_OPC_EVEX_F3(0x0f, 0x7e): /* vmovq xmm/m64,xmm */
+    case X86EMUL_OPC_EVEX_66(0x0f, 0xd6): /* vmovq xmm,xmm/m64 */
+        generate_exception_if(evex.lr || !evex.w || evex.opmsk || evex.br,
+                              EXC_UD);
+        host_and_vcpu_must_have(avx512f);
+        d |= TwoOp;
+        op_bytes = 8;
+        goto simd_zmm;
+
     case X86EMUL_OPC_66(0x0f, 0xe7):     /* movntdq xmm,m128 */
     case X86EMUL_OPC_VEX_66(0x0f, 0xe7): /* vmovntdq {x,y}mm,mem */
         generate_exception_if(ea.type != OP_MEM, EXC_UD);
@@ -6368,6 +6576,30 @@ x86_emulate(
             goto simd_0f_avx;
         goto simd_0f_sse2;
 
+    case X86EMUL_OPC_EVEX_66(0x0f, 0xe7): /* vmovntdq [xyz]mm,mem */
+        generate_exception_if(ea.type != OP_MEM || evex.opmsk || evex.w,
+                              EXC_UD);
+        sfence = true;
+        fault_suppression = false;
+        /* fall through */
+    case X86EMUL_OPC_EVEX_66(0x0f, 0x6f): /* vmovdqa{32,64} [xyz]mm/mem,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_F3(0x0f, 0x6f): /* vmovdqu{32,64} [xyz]mm/mem,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f, 0x7f): /* vmovdqa{32,64} [xyz]mm,[xyz]mm/mem{k} */
+    case X86EMUL_OPC_EVEX_F3(0x0f, 0x7f): /* vmovdqu{32,64} [xyz]mm,[xyz]mm/mem{k} */
+    vmovdqa:
+        generate_exception_if(evex.br, EXC_UD);
+        host_and_vcpu_must_have(avx512f);
+        avx512_vlen_check(false);
+        d |= TwoOp;
+        op_bytes = 16 << evex.lr;
+        goto simd_zmm;
+
+    case X86EMUL_OPC_EVEX_F2(0x0f, 0x6f): /* vmovdqu{8,16} [xyz]mm/mem,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_F2(0x0f, 0x7f): /* vmovdqu{8,16} [xyz]mm,[xyz]mm/mem{k} */
+        host_and_vcpu_must_have(avx512bw);
+        elem_bytes = 1 << evex.w;
+        goto vmovdqa;
+
     case X86EMUL_OPC_VEX_66(0x0f, 0xd6): /* vmovq xmm,xmm/m64 */
         generate_exception_if(vex.l, EXC_UD);
         d |= TwoOp;
@@ -7734,6 +7966,15 @@ x86_emulate(
         }
         goto movdqa;
 
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0x2a): /* vmovntdqa mem,[xyz]mm */
+        generate_exception_if(ea.type != OP_MEM || evex.opmsk || evex.w,
+                              EXC_UD);
+        /* Ignore the non-temporal hint for now, using vmovdqa32 instead. */
+        asm volatile ( "mfence" ::: "memory" );
+        b = 0x6f;
+        evex.opcx = vex_0f;
+        goto vmovdqa;
+
     case X86EMUL_OPC_VEX_66(0x0f38, 0x2c): /* vmaskmovps mem,{x,y}mm,{x,y}mm */
     case X86EMUL_OPC_VEX_66(0x0f38, 0x2d): /* vmaskmovpd mem,{x,y}mm,{x,y}mm */
     case X86EMUL_OPC_VEX_66(0x0f38, 0x2e): /* vmaskmovps {x,y}mm,{x,y}mm,mem */
@@ -8787,17 +9028,27 @@ x86_emulate(
     else if ( state->simd_size )
     {
         generate_exception_if(!op_bytes, EXC_UD);
-        generate_exception_if(vex.opcx && (d & TwoOp) && vex.reg != 0xf,
+        generate_exception_if((vex.opcx && (d & TwoOp) &&
+                               (vex.reg != 0xf || (evex_encoded() && !evex.RX))),
                               EXC_UD);
 
         if ( !opc )
             BUG();
-        opc[insn_bytes - PFX_BYTES] = 0xc3;
-        copy_REX_VEX(opc, rex_prefix, vex);
+        if ( evex_encoded() )
+        {
+            opc[insn_bytes - EVEX_PFX_BYTES] = 0xc3;
+            copy_EVEX(opc, evex);
+        }
+        else
+        {
+            opc[insn_bytes - PFX_BYTES] = 0xc3;
+            copy_REX_VEX(opc, rex_prefix, vex);
+        }
 
         if ( ea.type == OP_MEM )
         {
             uint32_t mxcsr = 0;
+            uint64_t full = 0;
 
             if ( op_bytes < 16 ||
                  (vex.opcx
@@ -8819,6 +9070,44 @@ x86_emulate(
                                   !is_aligned(ea.mem.seg, ea.mem.off, op_bytes,
                                               ctxt, ops),
                                   EXC_GP, 0);
+
+            if ( evex.br )
+            {
+                ASSERT((d & DstMask) != DstMem);
+                op_bytes = elem_bytes;
+            }
+            if ( evex.opmsk )
+            {
+                ASSERT(!(op_bytes % elem_bytes));
+                full = ~0ULL >> (64 - op_bytes / elem_bytes);
+                op_mask &= full;
+            }
+            if ( fault_suppression )
+            {
+                if ( !op_mask )
+                    goto simd_no_mem;
+                if ( !evex.br )
+                {
+                    first_byte = __builtin_ctzll(op_mask);
+                    op_mask >>= first_byte;
+                    full >>= first_byte;
+                    first_byte *= elem_bytes;
+                    op_bytes = (64 - __builtin_clzll(op_mask)) * elem_bytes;
+                }
+            }
+            /*
+             * Independent of fault suppression we may need to read (parts of)
+             * the memory operand for the purpose of merging without splitting
+             * the write below into multiple ones. Note that the EVEX.Z check
+             * here isn't strictly needed, due to there not currently being
+             * any instructions allowing zeroing-masking on memory writes (and
+             * we raise #UD during DstMem processing far above in this case),
+             * yet conceptually the read is then unnecessary.
+             */
+            if ( evex.opmsk && !evex.z && (d & DstMask) == DstMem &&
+                 op_mask != full )
+                d = (d & ~SrcMask) | SrcMem;
+
             switch ( d & SrcMask )
             {
             case SrcMem:
@@ -8865,7 +9154,10 @@ x86_emulate(
             }
         }
         else
+        {
+        simd_no_mem:
             dst.type = OP_NONE;
+        }
 
         /* {,v}maskmov{q,dqu}, as an exception, uses rDI. */
         if ( likely((ctxt->opcode & ~(X86EMUL_OPC_PFX_MASK |
--- a/xen/arch/x86/x86_emulate/x86_emulate.h
+++ b/xen/arch/x86/x86_emulate/x86_emulate.h
@@ -171,6 +171,7 @@ enum x86_emulate_fpu_type {
     X86EMUL_FPU_xmm, /* SSE instruction set (%xmm0-%xmm7/15) */
     X86EMUL_FPU_ymm, /* AVX/XOP instruction set (%ymm0-%ymm7/15) */
     X86EMUL_FPU_opmask, /* AVX512 opmask instruction set (%k0-%k7) */
+    X86EMUL_FPU_zmm, /* AVX512 instruction set (%zmm0-%zmm7/31) */
     /* This sentinel will never be passed to ->get_fpu(). */
     X86EMUL_FPU_none
 };
--- a/xen/include/asm-x86/cpufeature.h
+++ b/xen/include/asm-x86/cpufeature.h
@@ -105,6 +105,7 @@
 #define cpu_has_smap            boot_cpu_has(X86_FEATURE_SMAP)
 #define cpu_has_sha             boot_cpu_has(X86_FEATURE_SHA)
 #define cpu_has_avx512bw        boot_cpu_has(X86_FEATURE_AVX512BW)
+#define cpu_has_avx512vl        boot_cpu_has(X86_FEATURE_AVX512VL)
 
 /* CPUID level 0x80000007.edx */
 #define cpu_has_itsc            boot_cpu_has(X86_FEATURE_ITSC)




_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

^ permalink raw reply	[flat|nested] 465+ messages in thread

* [PATCH v4 06/44] x86emul: test for correct EVEX Disp8 scaling
  2018-09-25 13:14 ` [PATCH v4 00/44] x86emul: fixes, improvements, and beginnings of AVX512 support Jan Beulich
                     ` (4 preceding siblings ...)
  2018-09-25 13:28   ` [PATCH v4 05/44] x86emul: support basic AVX512 moves Jan Beulich
@ 2018-09-25 13:29   ` Jan Beulich
  2018-11-12 17:42     ` Andrew Cooper
  2018-09-25 13:29   ` [PATCH v4 07/44] x86emul: also allow running the 32-bit harness on a 64-bit distro Jan Beulich
                     ` (37 subsequent siblings)
  43 siblings, 1 reply; 465+ messages in thread
From: Jan Beulich @ 2018-09-25 13:29 UTC (permalink / raw)
  To: xen-devel; +Cc: George Dunlap, Andrew Cooper, Wei Liu

Besides the already existing tests (which are going to be extended once
respective ISA extension support is complete), let's also ensure for
every individual insn that their Disp8 scaling (and memory access width)
are correct.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v4: Introduce ESZ_d_WIG.
v3: New.

--- a/tools/tests/x86_emulator/Makefile
+++ b/tools/tests/x86_emulator/Makefile
@@ -139,7 +139,7 @@ $(addsuffix .h,$(SIMD) $(FMA) $(SG)): si
 
 xop.h: simd-fma.c
 
-$(TARGET): x86-emulate.o test_x86_emulator.o wrappers.o
+$(TARGET): x86-emulate.o test_x86_emulator.o evex-disp8.o wrappers.o
 	$(HOSTCC) $(HOSTCFLAGS) -o $@ $^
 
 .PHONY: clean
@@ -166,7 +166,7 @@ x86.h := $(addprefix $(XEN_ROOT)/tools/i
                      x86-vendors.h x86-defns.h msr-index.h)
 x86_emulate.h := x86-emulate.h x86_emulate/x86_emulate.h $(x86.h)
 
-x86-emulate.o test_x86_emulator.o wrappers.o: %.o: %.c $(x86_emulate.h)
+x86-emulate.o test_x86_emulator.o evex-disp8.o wrappers.o: %.o: %.c $(x86_emulate.h)
 	$(HOSTCC) $(HOSTCFLAGS) -c -g -o $@ $<
 
 x86-emulate.o: x86_emulate/x86_emulate.c
--- /dev/null
+++ b/tools/tests/x86_emulator/evex-disp8.c
@@ -0,0 +1,452 @@
+#include <stdarg.h>
+#include <stdio.h>
+
+#include "x86-emulate.h"
+
+struct test {
+    const char *mnemonic;
+    unsigned int opc:8;
+    unsigned int spc:2;
+    unsigned int pfx:2;
+    unsigned int vsz:3;
+    unsigned int esz:4;
+    unsigned int scale:1;
+    unsigned int ext:3;
+};
+
+enum spc {
+    SPC_invalid,
+    SPC_0f,
+    SPC_0f38,
+    SPC_0f3a,
+};
+
+enum pfx {
+    PFX_,
+    PFX_66,
+    PFX_f3,
+    PFX_f2
+};
+
+enum vl {
+    VL_128,
+    VL_256,
+    VL_512,
+};
+
+enum scale {
+    SC_vl,
+    SC_el,
+};
+
+enum vsz {
+    VSZ_vl,
+    VSZ_vl_2, /* VL / 2 */
+    VSZ_vl_4, /* VL / 4 */
+    VSZ_vl_8, /* VL / 8 */
+    /* "no broadcast" implied from here on. */
+    VSZ_el,
+    VSZ_el_2, /* EL * 2 */
+    VSZ_el_4, /* EL * 4 */
+    VSZ_el_8, /* EL * 8 */
+};
+
+enum esz {
+    ESZ_d,
+    ESZ_q,
+    ESZ_dq,
+    ESZ_sd,
+    ESZ_d_nb,
+    ESZ_q_nb,
+    /* "no broadcast" implied from here on. */
+#ifdef __i386__
+    ESZ_d_WIG,
+#endif
+    ESZ_b,
+    ESZ_w,
+    ESZ_bw,
+};
+
+#ifndef __i386__
+# define ESZ_dq64 ESZ_dq
+#else
+# define ESZ_dq64 ESZ_d_WIG
+#endif
+
+#define INSNX(m, p, sp, o, e, vs, es, sc) { \
+    .mnemonic = #m, .opc = 0x##o, .spc = SPC_##sp, .pfx = PFX_##p, \
+    .vsz = VSZ_##vs, .esz = ESZ_##es, .scale = SC_##sc, .ext = 0##e \
+}
+#define INSN(m, p, sp, o, vs, es, sc) INSNX(m, p, sp, o, 0, vs, es, sc)
+#define INSN_PFP(m, sp, o) \
+    INSN(m##pd, 66, sp, o, vl, q, vl), \
+    INSN(m##ps,   , sp, o, vl, d, vl)
+#define INSN_PFP_NB(m, sp, o) \
+    INSN(m##pd, 66, sp, o, vl, q_nb, vl), \
+    INSN(m##ps,   , sp, o, vl, d_nb, vl)
+#define INSN_SFP(m, sp, o) \
+    INSN(m##sd, f2, sp, o, el, q, el), \
+    INSN(m##ss, f3, sp, o, el, d, el)
+
+#define INSN_FP(m, sp, o) \
+    INSN_PFP(m, sp, o), \
+    INSN_SFP(m, sp, o)
+
+static const struct test avx512f_all[] = {
+    INSN_SFP(mov,            0f, 10),
+    INSN_SFP(mov,            0f, 11),
+    INSN_PFP_NB(mova,        0f, 28),
+    INSN_PFP_NB(mova,        0f, 29),
+    INSN(movdqa32,     66,   0f, 6f,    vl,   d_nb, vl),
+    INSN(movdqa32,     66,   0f, 7f,    vl,   d_nb, vl),
+    INSN(movdqa64,     66,   0f, 6f,    vl,   q_nb, vl),
+    INSN(movdqa64,     66,   0f, 7f,    vl,   q_nb, vl),
+    INSN(movdqu32,     f3,   0f, 6f,    vl,   d_nb, vl),
+    INSN(movdqu32,     f3,   0f, 7f,    vl,   d_nb, vl),
+    INSN(movdqu64,     f3,   0f, 6f,    vl,   q_nb, vl),
+    INSN(movdqu64,     f3,   0f, 7f,    vl,   q_nb, vl),
+    INSN(movntdq,      66,   0f, e7,    vl,   d_nb, vl),
+    INSN(movntdqa,     66, 0f38, 2a,    vl,   d_nb, vl),
+    INSN_PFP_NB(movnt,       0f, 2b),
+    INSN_PFP_NB(movu,        0f, 10),
+    INSN_PFP_NB(movu,        0f, 11),
+};
+
+static const struct test avx512f_128[] = {
+    INSN(mov,       66,   0f, 6e, el, dq64, el),
+    INSN(mov,       66,   0f, 7e, el, dq64, el),
+    INSN(movq,      f3,   0f, 7e, el,    q, el),
+    INSN(movq,      66,   0f, d6, el,    q, el),
+};
+
+static const struct test avx512bw_all[] = {
+    INSN(movdqu8,     f2,   0f, 6f,    vl,    b, vl),
+    INSN(movdqu8,     f2,   0f, 7f,    vl,    b, vl),
+    INSN(movdqu16,    f2,   0f, 6f,    vl,    w, vl),
+    INSN(movdqu16,    f2,   0f, 7f,    vl,    w, vl),
+};
+
+static const unsigned char vl_all[] = { VL_512, VL_128, VL_256 };
+static const unsigned char vl_128[] = { VL_128 };
+
+/*
+ * This table, indicating the presence of an immediate (byte) for an opcode
+ * space 0f major opcode, is indexed by high major opcode byte nibble, with
+ * each table element then bit-indexed by low major opcode byte nibble.
+ */
+static const uint16_t imm0f[16] = {
+    [0x7] = (1 << 0x0) /* vpshuf* */ |
+            (1 << 0x1) /* vps{ll,ra,rl}w */ |
+            (1 << 0x2) /* vps{l,r}ld, vp{rol,ror,sra}{d,q} */ |
+            (1 << 0x3) /* vps{l,r}l{,d}q */,
+    [0xc] = (1 << 0x2) /* vcmp{p,s}{d,s} */ |
+            (1 << 0x4) /* vpinsrw */ |
+            (1 << 0x5) /* vpextrw */ |
+            (1 << 0x6) /* vshufp{d,s} */,
+};
+
+static struct x86_emulate_ops emulops;
+
+static unsigned int accessed[3 * 64];
+
+static bool record_access(enum x86_segment seg, unsigned long offset,
+                          unsigned int bytes)
+{
+    while ( bytes-- )
+    {
+        if ( offset >= ARRAY_SIZE(accessed) )
+            return false;
+        ++accessed[offset++];
+    }
+
+    return true;
+}
+
+static int read(enum x86_segment seg, unsigned long offset, void *p_data,
+                unsigned int bytes, struct x86_emulate_ctxt *ctxt)
+{
+    if ( !record_access(seg, offset, bytes) )
+        return X86EMUL_UNHANDLEABLE;
+    memset(p_data, 0, bytes);
+    return X86EMUL_OKAY;
+}
+
+static int write(enum x86_segment seg, unsigned long offset, void *p_data,
+                 unsigned int bytes, struct x86_emulate_ctxt *ctxt)
+{
+    if ( !record_access(seg, offset, bytes) )
+        return X86EMUL_UNHANDLEABLE;
+    return X86EMUL_OKAY;
+}
+
+static void test_one(const struct test *test, enum vl vl,
+                     unsigned char *instr, struct x86_emulate_ctxt *ctxt)
+{
+    unsigned int vsz, esz, i;
+    int rc;
+    bool sg = strstr(test->mnemonic, "gather") ||
+              strstr(test->mnemonic, "scatter");
+    bool imm = test->spc == SPC_0f3a ||
+               (test->spc == SPC_0f &&
+                (imm0f[test->opc >> 4] & (1 << (test->opc & 0xf))));
+    union evex {
+        uint8_t raw[3];
+        struct {
+            uint8_t opcx:2;
+            uint8_t mbz:2;
+            uint8_t R:1;
+            uint8_t b:1;
+            uint8_t x:1;
+            uint8_t r:1;
+            uint8_t pfx:2;
+            uint8_t mbs:1;
+            uint8_t reg:4;
+            uint8_t w:1;
+            uint8_t opmsk:3;
+            uint8_t RX:1;
+            uint8_t bcst:1;
+            uint8_t lr:2;
+            uint8_t z:1;
+        };
+    } evex = {
+        .opcx = test->spc, .pfx = test->pfx, .lr = vl,
+        .R = 1, .b = 1, .x = 1, .r = 1, .mbs = 1,
+        .reg = 0xf, .RX = 1, .opmsk = sg,
+    };
+    /* Determine the element size in bytes and the EVEX.W setting to use. */
+    switch ( test->esz )
+    {
+    case ESZ_b:
+        esz = 1;
+        break;
+
+    case ESZ_w:
+        esz = 2;
+        evex.w = 1;
+        break;
+
+#ifdef __i386__
+    case ESZ_d_WIG:
+        evex.w = 1;
+        /* fall through */
+#endif
+    case ESZ_d: case ESZ_d_nb:
+        esz = 4;
+        break;
+
+    case ESZ_q: case ESZ_q_nb:
+        esz = 8;
+        evex.w = 1;
+        break;
+
+    default:
+        ASSERT_UNREACHABLE();
+    }
+    /* Determine the expected memory access width in bytes. */
+    switch ( test->vsz )
+    {
+    case VSZ_vl:
+        vsz = 16 << vl;
+        break;
+
+    case VSZ_vl_2:
+        vsz = 8 << vl;
+        break;
+
+    case VSZ_vl_4:
+        vsz = 4 << vl;
+        break;
+
+    case VSZ_vl_8:
+        vsz = 2 << vl;
+        break;
+
+    case VSZ_el:
+        vsz = esz;
+        break;
+
+    case VSZ_el_2:
+        vsz = esz * 2;
+        break;
+
+    case VSZ_el_4:
+        vsz = esz * 4;
+        break;
+
+    case VSZ_el_8:
+        vsz = esz * 8;
+        break;
+
+    default:
+        ASSERT_UNREACHABLE();
+    }
+
+    /*
+     * Note: SIB addressing is used here, such that S/G insns can be handled
+     * without extra conditionals.
+     */
+    instr[0] = 0x62;
+    instr[1] = evex.raw[0];
+    instr[2] = evex.raw[1];
+    instr[3] = evex.raw[2];
+    instr[4] = test->opc;
+    instr[5] = 0x44 | (test->ext << 3); /* ModR/M */
+    instr[6] = 0x22; /* SIB: base rDX, index none / xMM4 */
+    instr[7] = 1; /* Disp8 */
+    instr[8] = 0; /* immediate, if any */
+
+    asm volatile ( "kxnorw %k1, %k1, %k1" );
+    asm volatile ( "vxorps %xmm4, %xmm4, %xmm4" );
+
+    ctxt->regs->eip = (unsigned long)&instr[0];
+    ctxt->regs->edx = 0;
+    memset(accessed, 0, sizeof(accessed));
+
+    rc = x86_emulate(ctxt, &emulops);
+    if ( rc != X86EMUL_OKAY ||
+         (ctxt->regs->eip != (unsigned long)&instr[8 + imm]) )
+        goto fail;
+    /* Only the bytes at the scaled displacement may have been accessed. */
+    for ( i = 0; i < (test->scale == SC_vl ? vsz : esz); ++i )
+         if ( accessed[i] )
+             goto fail;
+    for ( ; i < (test->scale == SC_vl ? vsz : esz) + (sg ? esz : vsz); ++i )
+         if ( accessed[i] != (sg ? vsz / esz : 1) )
+             goto fail;
+    for ( ; i < ARRAY_SIZE(accessed); ++i )
+         if ( accessed[i] )
+             goto fail;
+
+    /* Also check the broadcast case, if available. */
+    if ( test->vsz >= VSZ_el || test->scale != SC_vl )
+        return;
+
+    switch ( test->esz )
+    {
+    case ESZ_d_nb: case ESZ_q_nb:
+    case ESZ_b: case ESZ_w: case ESZ_bw:
+        return;
+
+    case ESZ_d: case ESZ_q:
+        break;
+
+    default:
+        ASSERT_UNREACHABLE();
+    }
+
+    evex.bcst = 1;
+    instr[3] = evex.raw[2];
+
+    ctxt->regs->eip = (unsigned long)&instr[0];
+    memset(accessed, 0, sizeof(accessed));
+
+    rc = x86_emulate(ctxt, &emulops);
+    if ( rc != X86EMUL_OKAY ||
+         (ctxt->regs->eip != (unsigned long)&instr[8 + imm]) )
+        goto fail;
+    /* With broadcast, exactly one element at Disp8 * esz gets accessed. */
+    for ( i = 0; i < esz; ++i )
+         if ( accessed[i] )
+             goto fail;
+    for ( ; i < esz * 2; ++i )
+         if ( accessed[i] != 1 )
+             goto fail;
+    for ( ; i < ARRAY_SIZE(accessed); ++i )
+         if ( accessed[i] )
+             goto fail;
+
+    return;
+
+ fail:
+    printf("failed (v%s%s %u-bit)\n", test->mnemonic,
+           evex.bcst ? "/bcst" : "", 128 << vl);
+    exit(1);
+}
+
+static void test_pair(const struct test *tmpl, enum vl vl,
+                      enum esz esz1, const char *suffix1,
+                      enum esz esz2, const char *suffix2,
+                      unsigned char *instr, struct x86_emulate_ctxt *ctxt)
+{
+    struct test test = *tmpl;
+    char mnemonic[24];
+
+    test.esz = esz1;
+    snprintf(mnemonic, ARRAY_SIZE(mnemonic), "%s%s", tmpl->mnemonic, suffix1);
+    test.mnemonic = mnemonic;
+    test_one(&test, vl, instr, ctxt);
+
+    test.esz = esz2;
+    snprintf(mnemonic, ARRAY_SIZE(mnemonic), "%s%s", tmpl->mnemonic, suffix2);
+    test.mnemonic = mnemonic;
+    test_one(&test, vl, instr, ctxt);
+}
+
+static void test_group(const struct test tests[], unsigned int nr_test,
+                       const unsigned char vl[], unsigned int nr_vl,
+                       void *instr, struct x86_emulate_ctxt *ctxt)
+{
+    unsigned int i, j;
+
+    for ( i = 0; i < nr_test; ++i )
+    {
+        for ( j = 0; j < nr_vl; ++j )
+        {
+            if ( vl[0] == VL_512 && vl[j] != VL_512 && !cpu_has_avx512vl )
+                continue;
+
+            switch ( tests[i].esz )
+            {
+            default:
+                test_one(&tests[i], vl[j], instr, ctxt);
+                break;
+
+            case ESZ_bw:
+                test_pair(&tests[i], vl[j], ESZ_b, "b", ESZ_w, "w",
+                          instr, ctxt);
+                break;
+
+            case ESZ_dq:
+                test_pair(&tests[i], vl[j], ESZ_d, "d", ESZ_q, "q",
+                          instr, ctxt);
+                break;
+
+#ifdef __i386__
+            case ESZ_d_WIG:
+                test_pair(&tests[i], vl[j], ESZ_d, "/W0",
+                          ESZ_d_WIG, "/W1", instr, ctxt);
+                break;
+#endif
+
+            case ESZ_sd:
+                test_pair(&tests[i], vl[j],
+                          ESZ_d, tests[i].vsz < VSZ_el ? "ps" : "ss",
+                          ESZ_q, tests[i].vsz < VSZ_el ? "pd" : "sd",
+                          instr, ctxt);
+                break;
+            }
+        }
+    }
+}
+
+void evex_disp8_test(void *instr, struct x86_emulate_ctxt *ctxt,
+                     const struct x86_emulate_ops *ops)
+{
+    emulops = *ops;
+    emulops.read = read;
+    emulops.write = write;
+
+#define RUN(feat, vl) do { \
+    if ( cpu_has_##feat ) \
+    { \
+        printf("%-40s", "Testing " #feat "/" #vl " disp8 handling..."); \
+        test_group(feat ## _ ## vl, ARRAY_SIZE(feat ## _ ## vl), \
+                   vl_ ## vl, ARRAY_SIZE(vl_ ## vl), instr, ctxt); \
+        printf("okay\n"); \
+    } \
+} while ( false )
+
+    RUN(avx512f, all);
+    RUN(avx512f, 128);
+    RUN(avx512bw, all);
+}
--- a/tools/tests/x86_emulator/test_x86_emulator.c
+++ b/tools/tests/x86_emulator/test_x86_emulator.c
@@ -3795,6 +3795,9 @@ int main(int argc, char **argv)
     else
         printf("skipped\n");
 
+    if ( stack_exec )
+        evex_disp8_test(instr, &ctxt, &emulops);
+
     for ( j = 0; j < ARRAY_SIZE(blobs); j++ )
     {
         if ( blobs[j].check_cpu && !blobs[j].check_cpu() )
--- a/tools/tests/x86_emulator/x86-emulate.h
+++ b/tools/tests/x86_emulator/x86-emulate.h
@@ -71,6 +71,9 @@ WRAP(puts);
 
 #include "x86_emulate/x86_emulate.h"
 
+void evex_disp8_test(void *instr, struct x86_emulate_ctxt *ctxt,
+                     const struct x86_emulate_ops *ops);
+
 static inline uint64_t xgetbv(uint32_t xcr)
 {
     uint32_t lo, hi;




_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

^ permalink raw reply	[flat|nested] 465+ messages in thread

* [PATCH v4 07/44] x86emul: also allow running the 32-bit harness on a 64-bit distro
  2018-09-25 13:14 ` [PATCH v4 00/44] x86emul: fixes, improvements, and beginnings of AVX512 support Jan Beulich
                     ` (5 preceding siblings ...)
  2018-09-25 13:29   ` [PATCH v4 06/44] x86emul: test for correct EVEX Disp8 scaling Jan Beulich
@ 2018-09-25 13:29   ` Jan Beulich
  2018-11-12 17:50     ` Andrew Cooper
  2018-09-25 13:30   ` [PATCH v4 08/44] x86emul: use AVX512 logic for emulating V{, P}MASKMOV* Jan Beulich
                     ` (36 subsequent siblings)
  43 siblings, 1 reply; 465+ messages in thread
From: Jan Beulich @ 2018-09-25 13:29 UTC (permalink / raw)
  To: xen-devel; +Cc: George Dunlap, Andrew Cooper, Wei Liu

In order to be able to verify that the 32-bit variant builds and runs,
introduce a respective target (and the necessary other adjustments).

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v4: Moved ahead in series.
v3: New.

--- a/.gitignore
+++ b/.gitignore
@@ -240,6 +240,7 @@ tools/security/xensec_tool
 tools/tests/depriv/depriv-fd-checker
 tools/tests/x86_emulator/*.bin
 tools/tests/x86_emulator/*.tmp
+tools/tests/x86_emulator/32/x86_emulate
 tools/tests/x86_emulator/3dnow*.[ch]
 tools/tests/x86_emulator/asm
 tools/tests/x86_emulator/avx*.[ch]
--- /dev/null
+++ b/tools/tests/x86_emulator/32/Makefile
@@ -0,0 +1,4 @@
+override XEN_COMPILE_ARCH := x86_32
+XEN_ROOT = $(CURDIR)/../../../..
+VPATH += ..
+include ../Makefile
--- a/tools/tests/x86_emulator/Makefile
+++ b/tools/tests/x86_emulator/Makefile
@@ -1,5 +1,7 @@
 
+ifeq ($(XEN_ROOT),)
 XEN_ROOT=$(CURDIR)/../../..
+endif
 include $(XEN_ROOT)/tools/Rules.mk
 
 TARGET := test_x86_emulator
@@ -18,6 +20,12 @@ TESTCASES := blowfish $(SIMD) $(FMA) $(S
 
 OPMASK := avx512f avx512dq avx512bw
 
+ifeq ($(origin XEN_COMPILE_ARCH),override)
+
+HOSTCFLAGS += -m32
+
+else
+
 blowfish-cflags := ""
 blowfish-cflags-x86_32 := "-mno-accumulate-outgoing-args -Dstatic="
 
@@ -139,6 +147,8 @@ $(addsuffix .h,$(SIMD) $(FMA) $(SG)): si
 
 xop.h: simd-fma.c
 
+endif # 32-bit override
+
 $(TARGET): x86-emulate.o test_x86_emulator.o evex-disp8.o wrappers.o
 	$(HOSTCC) $(HOSTCFLAGS) -o $@ $^
 
@@ -153,6 +163,15 @@ distclean: clean
 .PHONY: install uninstall
 install uninstall:
 
+.PHONY: run32 clean32
+ifeq ($(XEN_TARGET_ARCH),x86_64)
+run32 clean32: %32: $(addsuffix .h,$(TESTCASES)) $(addsuffix -opmask.h,$(OPMASK))
+	$(MAKE) -C 32 $*
+clean: clean32
+else
+run32 clean32: %32: %
+endif
+
 x86_emulate:
 	[ -L $@ ] || ln -sf $(XEN_ROOT)/xen/arch/x86/$@
 





_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

^ permalink raw reply	[flat|nested] 465+ messages in thread

* [PATCH v4 08/44] x86emul: use AVX512 logic for emulating V{, P}MASKMOV*
  2018-09-25 13:14 ` [PATCH v4 00/44] x86emul: fixes, improvements, and beginnings of AVX512 support Jan Beulich
                     ` (6 preceding siblings ...)
  2018-09-25 13:29   ` [PATCH v4 07/44] x86emul: also allow running the 32-bit harness on a 64-bit distro Jan Beulich
@ 2018-09-25 13:30   ` Jan Beulich
  2018-11-13 18:12     ` Andrew Cooper
  2018-09-25 13:31   ` [PATCH v4 09/44] x86emul: support AVX512F legacy-equivalent arithmetic FP insns Jan Beulich
                     ` (35 subsequent siblings)
  43 siblings, 1 reply; 465+ messages in thread
From: Jan Beulich @ 2018-09-25 13:30 UTC (permalink / raw)
  To: xen-devel; +Cc: George Dunlap, Andrew Cooper, Wei Liu

The more generic AVX512 implementation allows quite a bit of insn-
specific code to be dropped/shared.

Signed-off-by: Jan Beulich <jbeulich@suse.com>

--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -439,8 +439,8 @@ static const struct ext0f38_table {
     [0x28 ... 0x29] = { .simd_size = simd_packed_int },
     [0x2a] = { .simd_size = simd_packed_int, .two_op = 1, .d8s = d8s_vl },
     [0x2b] = { .simd_size = simd_packed_int },
-    [0x2c ... 0x2d] = { .simd_size = simd_other },
-    [0x2e ... 0x2f] = { .simd_size = simd_other, .to_mem = 1 },
+    [0x2c ... 0x2d] = { .simd_size = simd_packed_fp },
+    [0x2e ... 0x2f] = { .simd_size = simd_packed_fp, .to_mem = 1 },
     [0x30 ... 0x35] = { .simd_size = simd_other, .two_op = 1 },
     [0x36 ... 0x3f] = { .simd_size = simd_packed_int },
     [0x40] = { .simd_size = simd_packed_int },
@@ -449,8 +449,8 @@ static const struct ext0f38_table {
     [0x58 ... 0x59] = { .simd_size = simd_other, .two_op = 1 },
     [0x5a] = { .simd_size = simd_128, .two_op = 1 },
     [0x78 ... 0x79] = { .simd_size = simd_other, .two_op = 1 },
-    [0x8c] = { .simd_size = simd_other },
-    [0x8e] = { .simd_size = simd_other, .to_mem = 1 },
+    [0x8c] = { .simd_size = simd_packed_int },
+    [0x8e] = { .simd_size = simd_packed_int, .to_mem = 1 },
     [0x90 ... 0x93] = { .simd_size = simd_other, .vsib = 1 },
     [0x96 ... 0x98] = { .simd_size = simd_packed_fp },
     [0x99] = { .simd_size = simd_scalar_vexw },
@@ -7984,6 +7984,8 @@ x86_emulate(
 
         generate_exception_if(ea.type != OP_MEM || vex.w, EXC_UD);
         host_and_vcpu_must_have(avx);
+        elem_bytes = 4 << (b & 1);
+    vmaskmov:
         get_fpu(X86EMUL_FPU_ymm);
 
         /*
@@ -7998,7 +8000,7 @@ x86_emulate(
         opc = init_prefixes(stub);
         pvex = copy_VEX(opc, vex);
         pvex->opcx = vex_0f;
-        if ( !(b & 1) )
+        if ( elem_bytes == 4 )
             pvex->pfx = vex_none;
         opc[0] = 0x50; /* vmovmskp{s,d} */
         /* Use %rax as GPR destination and VEX.vvvv as source. */
@@ -8011,21 +8013,9 @@ x86_emulate(
         invoke_stub("", "", "=a" (ea.val) : [dummy] "i" (0));
         put_stub(stub);
 
-        if ( !ea.val )
-            goto complete_insn;
-
-        op_bytes = 4 << (b & 1);
-        first_byte = __builtin_ctz(ea.val);
-        ea.val >>= first_byte;
-        first_byte *= op_bytes;
-        op_bytes *= 32 - __builtin_clz(ea.val);
-
-        /*
-         * Even for the memory write variant a memory read is needed, unless
-         * all set mask bits are contiguous.
-         */
-        if ( ea.val & (ea.val + 1) )
-            d = (d & ~SrcMask) | SrcMem;
+        evex.opmsk = 1; /* fake */
+        op_mask = ea.val;
+        fault_suppression = true;
 
         opc = init_prefixes(stub);
         opc[0] = b;
@@ -8076,63 +8066,10 @@ x86_emulate(
 
     case X86EMUL_OPC_VEX_66(0x0f38, 0x8c): /* vpmaskmov{d,q} mem,{x,y}mm,{x,y}mm */
     case X86EMUL_OPC_VEX_66(0x0f38, 0x8e): /* vpmaskmov{d,q} {x,y}mm,{x,y}mm,mem */
-    {
-        typeof(vex) *pvex;
-        unsigned int mask = vex.w ? 0x80808080U : 0x88888888U;
-
         generate_exception_if(ea.type != OP_MEM, EXC_UD);
         host_and_vcpu_must_have(avx2);
-        get_fpu(X86EMUL_FPU_ymm);
-
-        /*
-         * While we can't reasonably provide fully correct behavior here
-         * (in particular, for writes, avoiding the memory read in anticipation
-         * of all elements in the range eventually being written), we can (and
-         * should) still limit the memory access to the smallest possible range
-         * (suppressing it altogether if all mask bits are clear), to provide
-         * correct faulting behavior. Read the mask bits via vmovmskp{s,d}
-         * for that purpose.
-         */
-        opc = init_prefixes(stub);
-        pvex = copy_VEX(opc, vex);
-        pvex->opcx = vex_0f;
-        opc[0] = 0xd7; /* vpmovmskb */
-        /* Use %rax as GPR destination and VEX.vvvv as source. */
-        pvex->r = 1;
-        pvex->b = !mode_64bit() || (vex.reg >> 3);
-        opc[1] = 0xc0 | (~vex.reg & 7);
-        pvex->reg = 0xf;
-        opc[2] = 0xc3;
-
-        invoke_stub("", "", "=a" (ea.val) : [dummy] "i" (0));
-        put_stub(stub);
-
-        /* Convert byte granular result to dword/qword granularity. */
-        ea.val &= mask;
-        if ( !ea.val )
-            goto complete_insn;
-
-        first_byte = __builtin_ctz(ea.val) & ~((4 << vex.w) - 1);
-        ea.val >>= first_byte;
-        op_bytes = 32 - __builtin_clz(ea.val);
-
-        /*
-         * Even for the memory write variant a memory read is needed, unless
-         * all set mask bits are contiguous.
-         */
-        if ( ea.val & (ea.val + ~mask + 1) )
-            d = (d & ~SrcMask) | SrcMem;
-
-        opc = init_prefixes(stub);
-        opc[0] = b;
-        /* Convert memory operand to (%rAX). */
-        rex_prefix &= ~REX_B;
-        vex.b = 1;
-        opc[1] = modrm & 0x38;
-        insn_bytes = PFX_BYTES + 2;
-
-        break;
-    }
+        elem_bytes = 4 << vex.w;
+        goto vmaskmov;
 
     case X86EMUL_OPC_VEX_66(0x0f38, 0x90): /* vpgatherd{d,q} {x,y}mm,mem,{x,y}mm */
     case X86EMUL_OPC_VEX_66(0x0f38, 0x91): /* vpgatherq{d,q} {x,y}mm,mem,{x,y}mm */




_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

^ permalink raw reply	[flat|nested] 465+ messages in thread

* [PATCH v4 09/44] x86emul: support AVX512F legacy-equivalent arithmetic FP insns
  2018-09-25 13:14 ` [PATCH v4 00/44] x86emul: fixes, improvements, and beginnings of AVX512 support Jan Beulich
                     ` (7 preceding siblings ...)
  2018-09-25 13:30   ` [PATCH v4 08/44] x86emul: use AVX512 logic for emulating V{, P}MASKMOV* Jan Beulich
@ 2018-09-25 13:31   ` Jan Beulich
  2018-11-13 18:21     ` Andrew Cooper
  2018-09-25 13:32   ` [PATCH v4 10/44] x86emul: support AVX512DQ logic " Jan Beulich
                     ` (34 subsequent siblings)
  43 siblings, 1 reply; 465+ messages in thread
From: Jan Beulich @ 2018-09-25 13:31 UTC (permalink / raw)
  To: xen-devel; +Cc: George Dunlap, Andrew Cooper, Wei Liu

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v3: New.

--- a/tools/tests/x86_emulator/evex-disp8.c
+++ b/tools/tests/x86_emulator/evex-disp8.c
@@ -93,6 +93,10 @@ enum esz {
     INSN_SFP(m, sp, o)
 
 static const struct test avx512f_all[] = {
+    INSN_FP(add,             0f, 58),
+    INSN_FP(div,             0f, 5e),
+    INSN_FP(max,             0f, 5f),
+    INSN_FP(min,             0f, 5d),
     INSN_SFP(mov,            0f, 10),
     INSN_SFP(mov,            0f, 11),
     INSN_PFP_NB(mova,        0f, 28),
@@ -110,6 +114,9 @@ static const struct test avx512f_all[] =
     INSN_PFP_NB(movnt,       0f, 2b),
     INSN_PFP_NB(movu,        0f, 10),
     INSN_PFP_NB(movu,        0f, 11),
+    INSN_FP(mul,             0f, 59),
+    INSN_FP(sqrt,            0f, 51),
+    INSN_FP(sub,             0f, 5c),
 };
 
 static const struct test avx512f_128[] = {
--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -300,12 +300,12 @@ static const struct twobyte_table {
     [0x3a] = { DstReg|SrcImmByte|ModRM },
     [0x40 ... 0x4f] = { DstReg|SrcMem|ModRM|Mov },
     [0x50] = { DstReg|SrcImplicit|ModRM|Mov },
-    [0x51] = { DstImplicit|SrcMem|ModRM|TwoOp, simd_any_fp },
+    [0x51] = { DstImplicit|SrcMem|ModRM|TwoOp, simd_any_fp, d8s_vl },
     [0x52 ... 0x53] = { DstImplicit|SrcMem|ModRM|TwoOp, simd_single_fp },
     [0x54 ... 0x57] = { DstImplicit|SrcMem|ModRM, simd_packed_fp },
-    [0x58 ... 0x59] = { DstImplicit|SrcMem|ModRM, simd_any_fp },
+    [0x58 ... 0x59] = { DstImplicit|SrcMem|ModRM, simd_any_fp, d8s_vl },
     [0x5a ... 0x5b] = { DstImplicit|SrcMem|ModRM|Mov, simd_other },
-    [0x5c ... 0x5f] = { DstImplicit|SrcMem|ModRM, simd_any_fp },
+    [0x5c ... 0x5f] = { DstImplicit|SrcMem|ModRM, simd_any_fp, d8s_vl },
     [0x60 ... 0x62] = { DstImplicit|SrcMem|ModRM, simd_other },
     [0x63 ... 0x67] = { DstImplicit|SrcMem|ModRM, simd_packed_int },
     [0x68 ... 0x6a] = { DstImplicit|SrcMem|ModRM, simd_other },
@@ -5863,10 +5863,22 @@ x86_emulate(
         if ( (b & ~1) == 0x10 && ea.type == OP_MEM )
             d |= TwoOp;
         generate_exception_if(evex.br, EXC_UD);
-        generate_exception_if(evex.w != (evex.pfx & VEX_PREFIX_DOUBLE_MASK),
+        /* fall through */
+    CASE_SIMD_ALL_FP(_EVEX, 0x0f, 0x51):    /* vsqrtp{s,d} [xyz]mm/mem,[xyz]mm{k} */
+                                            /* vsqrts{s,d} xmm/m32,xmm,xmm{k} */
+    CASE_SIMD_ALL_FP(_EVEX, 0x0f, 0x58):    /* vadd{p,s}{s,d} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+    CASE_SIMD_ALL_FP(_EVEX, 0x0f, 0x59):    /* vmul{p,s}{s,d} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+    CASE_SIMD_ALL_FP(_EVEX, 0x0f, 0x5c):    /* vsub{p,s}{s,d} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+    CASE_SIMD_ALL_FP(_EVEX, 0x0f, 0x5d):    /* vmin{p,s}{s,d} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+    CASE_SIMD_ALL_FP(_EVEX, 0x0f, 0x5e):    /* vdiv{p,s}{s,d} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+    CASE_SIMD_ALL_FP(_EVEX, 0x0f, 0x5f):    /* vmax{p,s}{s,d} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+        generate_exception_if((evex.w != (evex.pfx & VEX_PREFIX_DOUBLE_MASK) ||
+                               (ea.type == OP_MEM && evex.br &&
+                                (evex.pfx & VEX_PREFIX_SCALAR_MASK))),
                               EXC_UD);
         host_and_vcpu_must_have(avx512f);
-        avx512_vlen_check(evex.pfx & VEX_PREFIX_SCALAR_MASK);
+        if ( ea.type == OP_MEM || !evex.br )
+            avx512_vlen_check(evex.pfx & VEX_PREFIX_SCALAR_MASK);
     simd_zmm:
         get_fpu(X86EMUL_FPU_zmm);
         opc = init_evex(stub);





_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

^ permalink raw reply	[flat|nested] 465+ messages in thread

* [PATCH v4 10/44] x86emul: support AVX512DQ logic FP insns
  2018-09-25 13:14 ` [PATCH v4 00/44] x86emul: fixes, improvements, and beginnings of AVX512 support Jan Beulich
                     ` (8 preceding siblings ...)
  2018-09-25 13:31   ` [PATCH v4 09/44] x86emul: support AVX512F legacy-equivalent arithmetic FP insns Jan Beulich
@ 2018-09-25 13:32   ` " Jan Beulich
  2018-11-13 18:56     ` Andrew Cooper
  2018-09-25 13:32   ` [PATCH v4 11/44] x86emul: support AVX512F "normal" FP compare insns Jan Beulich
                     ` (33 subsequent siblings)
  43 siblings, 1 reply; 465+ messages in thread
From: Jan Beulich @ 2018-09-25 13:32 UTC (permalink / raw)
  To: xen-devel; +Cc: George Dunlap, Andrew Cooper, Wei Liu

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v3: New.

--- a/tools/tests/x86_emulator/evex-disp8.c
+++ b/tools/tests/x86_emulator/evex-disp8.c
@@ -133,6 +133,13 @@ static const struct test avx512bw_all[]
     INSN(movdqu16,    f2,   0f, 7f,    vl,    w, vl),
 };
 
+static const struct test avx512dq_all[] = {
+    INSN_PFP(and,              0f, 54),
+    INSN_PFP(andn,             0f, 55),
+    INSN_PFP(or,               0f, 56),
+    INSN_PFP(xor,              0f, 57),
+};
+
 static const unsigned char vl_all[] = { VL_512, VL_128, VL_256 };
 static const unsigned char vl_128[] = { VL_128 };
 
@@ -456,4 +463,5 @@ void evex_disp8_test(void *instr, struct
     RUN(avx512f, all);
     RUN(avx512f, 128);
     RUN(avx512bw, all);
+    RUN(avx512dq, all);
 }
--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -302,7 +302,7 @@ static const struct twobyte_table {
     [0x50] = { DstReg|SrcImplicit|ModRM|Mov },
     [0x51] = { DstImplicit|SrcMem|ModRM|TwoOp, simd_any_fp, d8s_vl },
     [0x52 ... 0x53] = { DstImplicit|SrcMem|ModRM|TwoOp, simd_single_fp },
-    [0x54 ... 0x57] = { DstImplicit|SrcMem|ModRM, simd_packed_fp },
+    [0x54 ... 0x57] = { DstImplicit|SrcMem|ModRM, simd_packed_fp, d8s_vl },
     [0x58 ... 0x59] = { DstImplicit|SrcMem|ModRM, simd_any_fp, d8s_vl },
     [0x5a ... 0x5b] = { DstImplicit|SrcMem|ModRM|Mov, simd_other },
     [0x5c ... 0x5f] = { DstImplicit|SrcMem|ModRM, simd_any_fp, d8s_vl },
@@ -6329,6 +6329,17 @@ x86_emulate(
         dst.bytes = 4;
         break;
 
+    CASE_SIMD_PACKED_FP(_EVEX, 0x0f, 0x54): /* vandp{s,d} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+    CASE_SIMD_PACKED_FP(_EVEX, 0x0f, 0x55): /* vandnp{s,d} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+    CASE_SIMD_PACKED_FP(_EVEX, 0x0f, 0x56): /* vorp{s,d} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+    CASE_SIMD_PACKED_FP(_EVEX, 0x0f, 0x57): /* vxorp{s,d} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+        generate_exception_if((evex.w != (evex.pfx & VEX_PREFIX_DOUBLE_MASK) ||
+                               (ea.type != OP_MEM && evex.br)),
+                              EXC_UD);
+        host_and_vcpu_must_have(avx512dq);
+        avx512_vlen_check(false);
+        goto simd_zmm;
+
     CASE_SIMD_ALL_FP(, 0x0f, 0x5a):        /* cvt{p,s}{s,d}2{p,s}{s,d} xmm/mem,xmm */
     CASE_SIMD_ALL_FP(_VEX, 0x0f, 0x5a):    /* vcvtp{s,d}2p{s,d} xmm/mem,xmm */
                                            /* vcvts{s,d}2s{s,d} xmm/mem,xmm,xmm */





_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

^ permalink raw reply	[flat|nested] 465+ messages in thread

* [PATCH v4 11/44] x86emul: support AVX512F "normal" FP compare insns
  2018-09-25 13:14 ` [PATCH v4 00/44] x86emul: fixes, improvements, and beginnings of AVX512 support Jan Beulich
                     ` (9 preceding siblings ...)
  2018-09-25 13:32   ` [PATCH v4 10/44] x86emul: support AVX512DQ logic " Jan Beulich
@ 2018-09-25 13:32   ` Jan Beulich
  2018-11-13 19:04     ` Andrew Cooper
  2018-09-25 13:33   ` [PATCH v4 12/44] x86emul: support AVX512F misc legacy-equivalent FP insns Jan Beulich
                     ` (32 subsequent siblings)
  43 siblings, 1 reply; 465+ messages in thread
From: Jan Beulich @ 2018-09-25 13:32 UTC (permalink / raw)
  To: xen-devel; +Cc: George Dunlap, Andrew Cooper, Wei Liu

Also correct the AVX counterpart's comment.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v3: New.

--- a/tools/tests/x86_emulator/evex-disp8.c
+++ b/tools/tests/x86_emulator/evex-disp8.c
@@ -94,6 +94,7 @@ enum esz {
 
 static const struct test avx512f_all[] = {
     INSN_FP(add,             0f, 58),
+    INSN_FP(cmp,             0f, c2),
     INSN_FP(div,             0f, 5e),
     INSN_FP(max,             0f, 5f),
     INSN_FP(min,             0f, 5d),
--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -352,7 +352,7 @@ static const struct twobyte_table {
     [0xbf] = { DstReg|SrcMem16|ModRM|Mov },
     [0xc0] = { ByteOp|DstMem|SrcReg|ModRM },
     [0xc1] = { DstMem|SrcReg|ModRM },
-    [0xc2] = { DstImplicit|SrcImmByte|ModRM, simd_any_fp },
+    [0xc2] = { DstImplicit|SrcImmByte|ModRM, simd_any_fp, d8s_vl },
     [0xc3] = { DstMem|SrcReg|ModRM|Mov },
     [0xc4] = { DstReg|SrcImmByte|ModRM, simd_packed_int },
     [0xc5] = { DstReg|SrcImmByte|ModRM|Mov },
@@ -7438,7 +7438,7 @@ x86_emulate(
         goto add;
 
     CASE_SIMD_ALL_FP(, 0x0f, 0xc2):        /* cmp{p,s}{s,d} $imm8,xmm/mem,xmm */
-    CASE_SIMD_ALL_FP(_VEX, 0x0f, 0xc2):    /* vcmp{p,s}{s,d} $imm8,{x,y}mm/mem,{x,y}mm */
+    CASE_SIMD_ALL_FP(_VEX, 0x0f, 0xc2):    /* vcmp{p,s}{s,d} $imm8,{x,y}mm/mem,{x,y}mm,{x,y}mm */
     CASE_SIMD_PACKED_FP(, 0x0f, 0xc6):     /* shufp{s,d} $imm8,xmm/mem,xmm */
     CASE_SIMD_PACKED_FP(_VEX, 0x0f, 0xc6): /* vshufp{s,d} $imm8,{x,y}mm/mem,{x,y}mm */
         d = (d & ~SrcMask) | SrcMem;
@@ -7452,6 +7452,30 @@ x86_emulate(
         }
         goto simd_0f_imm8_avx;
 
+    CASE_SIMD_ALL_FP(_EVEX, 0x0f, 0xc2): /* vcmp{p,s}{s,d} $imm8,[xyz]mm/mem,[xyz]mm,k{k} */
+        generate_exception_if((evex.w != (evex.pfx & VEX_PREFIX_DOUBLE_MASK) ||
+                               (ea.type == OP_MEM && evex.br &&
+                                (evex.pfx & VEX_PREFIX_SCALAR_MASK)) ||
+                               !evex.r || !evex.R || evex.z),
+                              EXC_UD);
+        host_and_vcpu_must_have(avx512f);
+        if ( ea.type == OP_MEM || !evex.br )
+            avx512_vlen_check(evex.pfx & VEX_PREFIX_SCALAR_MASK);
+        d = (d & ~SrcMask) | SrcMem;
+        get_fpu(X86EMUL_FPU_zmm);
+        opc = init_evex(stub);
+        opc[0] = b;
+        opc[1] = modrm;
+        if ( ea.type == OP_MEM )
+        {
+            /* convert memory operand to (%rAX) */
+            evex.b = 1;
+            opc[1] &= 0x38;
+        }
+        opc[2] = imm1;
+        insn_bytes = EVEX_PFX_BYTES + 3;
+        break;
+
     case X86EMUL_OPC(0x0f, 0xc3): /* movnti */
         /* Ignore the non-temporal hint for now. */
         vcpu_must_have(sse2);





_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

^ permalink raw reply	[flat|nested] 465+ messages in thread

* [PATCH v4 12/44] x86emul: support AVX512F misc legacy-equivalent FP insns
  2018-09-25 13:14 ` [PATCH v4 00/44] x86emul: fixes, improvements, and beginnings of AVX512 support Jan Beulich
                     ` (10 preceding siblings ...)
  2018-09-25 13:32   ` [PATCH v4 11/44] x86emul: support AVX512F "normal" FP compare insns Jan Beulich
@ 2018-09-25 13:33   ` Jan Beulich
  2018-11-13 19:17     ` Andrew Cooper
  2018-09-25 13:33   ` [PATCH v4 13/44] x86emul: support AVX512F fused-multiply-add insns Jan Beulich
                     ` (31 subsequent siblings)
  43 siblings, 1 reply; 465+ messages in thread
From: Jan Beulich @ 2018-09-25 13:33 UTC (permalink / raw)
  To: xen-devel; +Cc: George Dunlap, Andrew Cooper, Wei Liu

Also correct an AVX counterpart's comment.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v3: New.

--- a/tools/tests/x86_emulator/evex-disp8.c
+++ b/tools/tests/x86_emulator/evex-disp8.c
@@ -116,8 +116,11 @@ static const struct test avx512f_all[] =
     INSN_PFP_NB(movu,        0f, 10),
     INSN_PFP_NB(movu,        0f, 11),
     INSN_FP(mul,             0f, 59),
+    INSN_PFP(shuf,           0f, c6),
     INSN_FP(sqrt,            0f, 51),
     INSN_FP(sub,             0f, 5c),
+    INSN_PFP(unpckh,         0f, 15),
+    INSN_PFP(unpckl,         0f, 14),
 };
 
 static const struct test avx512f_128[] = {
--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -282,7 +282,7 @@ static const struct twobyte_table {
     [0x11] = { DstMem|SrcImplicit|ModRM|Mov, simd_any_fp, d8s_vl },
     [0x12] = { DstImplicit|SrcMem|ModRM|Mov, simd_other },
     [0x13] = { DstMem|SrcImplicit|ModRM|Mov, simd_other },
-    [0x14 ... 0x15] = { DstImplicit|SrcMem|ModRM, simd_packed_fp },
+    [0x14 ... 0x15] = { DstImplicit|SrcMem|ModRM, simd_packed_fp, d8s_vl },
     [0x16] = { DstImplicit|SrcMem|ModRM|Mov, simd_other },
     [0x17] = { DstMem|SrcImplicit|ModRM|Mov, simd_other },
     [0x18 ... 0x1f] = { ImplicitOps|ModRM },
@@ -356,7 +356,7 @@ static const struct twobyte_table {
     [0xc3] = { DstMem|SrcReg|ModRM|Mov },
     [0xc4] = { DstReg|SrcImmByte|ModRM, simd_packed_int },
     [0xc5] = { DstReg|SrcImmByte|ModRM|Mov },
-    [0xc6] = { DstImplicit|SrcImmByte|ModRM, simd_packed_fp },
+    [0xc6] = { DstImplicit|SrcImmByte|ModRM, simd_packed_fp, d8s_vl },
     [0xc7] = { ImplicitOps|ModRM },
     [0xc8 ... 0xcf] = { ImplicitOps },
     [0xd0] = { DstImplicit|SrcMem|ModRM, simd_other },
@@ -5932,6 +5932,17 @@ x86_emulate(
         host_and_vcpu_must_have(sse3);
         goto simd_0f_xmm;
 
+    CASE_SIMD_PACKED_FP(_EVEX, 0x0f, 0x14): /* vunpcklp{s,d} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+    CASE_SIMD_PACKED_FP(_EVEX, 0x0f, 0x15): /* vunpckhp{s,d} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+        generate_exception_if(evex.w != (evex.pfx & VEX_PREFIX_DOUBLE_MASK),
+                              EXC_UD);
+        fault_suppression = false;
+    avx512f_no_sae:
+        host_and_vcpu_must_have(avx512f);
+        generate_exception_if(ea.type != OP_MEM && evex.br, EXC_UD);
+        avx512_vlen_check(false);
+        goto simd_zmm;
+
     case X86EMUL_OPC(0x0f, 0x20): /* mov cr,reg */
     case X86EMUL_OPC(0x0f, 0x21): /* mov dr,reg */
     case X86EMUL_OPC(0x0f, 0x22): /* mov reg,cr */
@@ -6611,11 +6622,9 @@ x86_emulate(
     case X86EMUL_OPC_EVEX_F3(0x0f, 0x7f): /* vmovdqu{32,64} [xyz]mm,[xyz]mm/mem{k} */
     vmovdqa:
         generate_exception_if(evex.br, EXC_UD);
-        host_and_vcpu_must_have(avx512f);
-        avx512_vlen_check(false);
         d |= TwoOp;
         op_bytes = 16 << evex.lr;
-        goto simd_zmm;
+        goto avx512f_no_sae;
 
     case X86EMUL_OPC_EVEX_F2(0x0f, 0x6f): /* vmovdqu{8,16} [xyz]mm/mem,[xyz]mm{k} */
     case X86EMUL_OPC_EVEX_F2(0x0f, 0x7f): /* vmovdqu{8,16} [xyz]mm,[xyz]mm/mem{k} */
@@ -7440,7 +7449,7 @@ x86_emulate(
     CASE_SIMD_ALL_FP(, 0x0f, 0xc2):        /* cmp{p,s}{s,d} $imm8,xmm/mem,xmm */
     CASE_SIMD_ALL_FP(_VEX, 0x0f, 0xc2):    /* vcmp{p,s}{s,d} $imm8,{x,y}mm/mem,{x,y}mm,{x,y}mm */
     CASE_SIMD_PACKED_FP(, 0x0f, 0xc6):     /* shufp{s,d} $imm8,xmm/mem,xmm */
-    CASE_SIMD_PACKED_FP(_VEX, 0x0f, 0xc6): /* vshufp{s,d} $imm8,{x,y}mm/mem,{x,y}mm */
+    CASE_SIMD_PACKED_FP(_VEX, 0x0f, 0xc6): /* vshufp{s,d} $imm8,{x,y}mm/mem,{x,y}mm,{x,y}mm */
         d = (d & ~SrcMask) | SrcMem;
         if ( vex.opcx == vex_none )
         {
@@ -7461,7 +7470,9 @@ x86_emulate(
         host_and_vcpu_must_have(avx512f);
         if ( ea.type == OP_MEM || !evex.br )
             avx512_vlen_check(evex.pfx & VEX_PREFIX_SCALAR_MASK);
-        d = (d & ~SrcMask) | SrcMem;
+    simd_imm8_zmm:
+        if ( (d & SrcMask) == SrcImmByte )
+            d = (d & ~SrcMask) | SrcMem;
         get_fpu(X86EMUL_FPU_zmm);
         opc = init_evex(stub);
         opc[0] = b;
@@ -7505,6 +7516,15 @@ x86_emulate(
         insn_bytes = PFX_BYTES + 3;
         goto simd_0f_to_gpr;
 
+    CASE_SIMD_PACKED_FP(_EVEX, 0x0f, 0xc6): /* vshufp{s,d} $imm8,[xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+        fault_suppression = false;
+        generate_exception_if(evex.w != (evex.pfx & VEX_PREFIX_DOUBLE_MASK),
+                              EXC_UD);
+        host_and_vcpu_must_have(avx512f);
+        generate_exception_if(ea.type != OP_MEM && evex.br, EXC_UD);
+        avx512_vlen_check(false);
+        goto simd_imm8_zmm;
+
     case X86EMUL_OPC(0x0f, 0xc7): /* Grp9 */
     {
         union {




_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

^ permalink raw reply	[flat|nested] 465+ messages in thread

* [PATCH v4 13/44] x86emul: support AVX512F fused-multiply-add insns
  2018-09-25 13:14 ` [PATCH v4 00/44] x86emul: fixes, improvements, and beginnings of AVX512 support Jan Beulich
                     ` (11 preceding siblings ...)
  2018-09-25 13:33   ` [PATCH v4 12/44] x86emul: support AVX512F misc legacy-equivalent FP insns Jan Beulich
@ 2018-09-25 13:33   ` Jan Beulich
  2018-11-13 19:28     ` Andrew Cooper
  2018-09-25 13:34   ` [PATCH v4 14/44] x86emul: support AVX512F legacy-equivalent logic insns Jan Beulich
                     ` (30 subsequent siblings)
  43 siblings, 1 reply; 465+ messages in thread
From: Jan Beulich @ 2018-09-25 13:33 UTC (permalink / raw)
  To: xen-devel; +Cc: George Dunlap, Andrew Cooper, Wei Liu

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v3: New.

--- a/tools/tests/x86_emulator/evex-disp8.c
+++ b/tools/tests/x86_emulator/evex-disp8.c
@@ -96,6 +96,36 @@ static const struct test avx512f_all[] =
     INSN_FP(add,             0f, 58),
     INSN_FP(cmp,             0f, c2),
     INSN_FP(div,             0f, 5e),
+    INSN(fmadd132,     66, 0f38, 98,    vl,     sd, vl),
+    INSN(fmadd132,     66, 0f38, 99,    el,     sd, el),
+    INSN(fmadd213,     66, 0f38, a8,    vl,     sd, vl),
+    INSN(fmadd213,     66, 0f38, a9,    el,     sd, el),
+    INSN(fmadd231,     66, 0f38, b8,    vl,     sd, vl),
+    INSN(fmadd231,     66, 0f38, b9,    el,     sd, el),
+    INSN(fmaddsub132,  66, 0f38, 96,    vl,     sd, vl),
+    INSN(fmaddsub213,  66, 0f38, a6,    vl,     sd, vl),
+    INSN(fmaddsub231,  66, 0f38, b6,    vl,     sd, vl),
+    INSN(fmsub132,     66, 0f38, 9a,    vl,     sd, vl),
+    INSN(fmsub132,     66, 0f38, 9b,    el,     sd, el),
+    INSN(fmsub213,     66, 0f38, aa,    vl,     sd, vl),
+    INSN(fmsub213,     66, 0f38, ab,    el,     sd, el),
+    INSN(fmsub231,     66, 0f38, ba,    vl,     sd, vl),
+    INSN(fmsub231,     66, 0f38, bb,    el,     sd, el),
+    INSN(fmsubadd132,  66, 0f38, 97,    vl,     sd, vl),
+    INSN(fmsubadd213,  66, 0f38, a7,    vl,     sd, vl),
+    INSN(fmsubadd231,  66, 0f38, b7,    vl,     sd, vl),
+    INSN(fnmadd132,    66, 0f38, 9c,    vl,     sd, vl),
+    INSN(fnmadd132,    66, 0f38, 9d,    el,     sd, el),
+    INSN(fnmadd213,    66, 0f38, ac,    vl,     sd, vl),
+    INSN(fnmadd213,    66, 0f38, ad,    el,     sd, el),
+    INSN(fnmadd231,    66, 0f38, bc,    vl,     sd, vl),
+    INSN(fnmadd231,    66, 0f38, bd,    el,     sd, el),
+    INSN(fnmsub132,    66, 0f38, 9e,    vl,     sd, vl),
+    INSN(fnmsub132,    66, 0f38, 9f,    el,     sd, el),
+    INSN(fnmsub213,    66, 0f38, ae,    vl,     sd, vl),
+    INSN(fnmsub213,    66, 0f38, af,    el,     sd, el),
+    INSN(fnmsub231,    66, 0f38, be,    vl,     sd, vl),
+    INSN(fnmsub231,    66, 0f38, bf,    el,     sd, el),
     INSN_FP(max,             0f, 5f),
     INSN_FP(min,             0f, 5d),
     INSN_SFP(mov,            0f, 10),
--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -452,30 +452,30 @@ static const struct ext0f38_table {
     [0x8c] = { .simd_size = simd_packed_int },
     [0x8e] = { .simd_size = simd_packed_int, .to_mem = 1 },
     [0x90 ... 0x93] = { .simd_size = simd_other, .vsib = 1 },
-    [0x96 ... 0x98] = { .simd_size = simd_packed_fp },
-    [0x99] = { .simd_size = simd_scalar_vexw },
-    [0x9a] = { .simd_size = simd_packed_fp },
-    [0x9b] = { .simd_size = simd_scalar_vexw },
-    [0x9c] = { .simd_size = simd_packed_fp },
-    [0x9d] = { .simd_size = simd_scalar_vexw },
-    [0x9e] = { .simd_size = simd_packed_fp },
-    [0x9f] = { .simd_size = simd_scalar_vexw },
-    [0xa6 ... 0xa8] = { .simd_size = simd_packed_fp },
-    [0xa9] = { .simd_size = simd_scalar_vexw },
-    [0xaa] = { .simd_size = simd_packed_fp },
-    [0xab] = { .simd_size = simd_scalar_vexw },
-    [0xac] = { .simd_size = simd_packed_fp },
-    [0xad] = { .simd_size = simd_scalar_vexw },
-    [0xae] = { .simd_size = simd_packed_fp },
-    [0xaf] = { .simd_size = simd_scalar_vexw },
-    [0xb6 ... 0xb8] = { .simd_size = simd_packed_fp },
-    [0xb9] = { .simd_size = simd_scalar_vexw },
-    [0xba] = { .simd_size = simd_packed_fp },
-    [0xbb] = { .simd_size = simd_scalar_vexw },
-    [0xbc] = { .simd_size = simd_packed_fp },
-    [0xbd] = { .simd_size = simd_scalar_vexw },
-    [0xbe] = { .simd_size = simd_packed_fp },
-    [0xbf] = { .simd_size = simd_scalar_vexw },
+    [0x96 ... 0x98] = { .simd_size = simd_packed_fp, .d8s = d8s_vl },
+    [0x99] = { .simd_size = simd_scalar_vexw, .d8s = d8s_dq },
+    [0x9a] = { .simd_size = simd_packed_fp, .d8s = d8s_vl },
+    [0x9b] = { .simd_size = simd_scalar_vexw, .d8s = d8s_dq },
+    [0x9c] = { .simd_size = simd_packed_fp, .d8s = d8s_vl },
+    [0x9d] = { .simd_size = simd_scalar_vexw, .d8s = d8s_dq },
+    [0x9e] = { .simd_size = simd_packed_fp, .d8s = d8s_vl },
+    [0x9f] = { .simd_size = simd_scalar_vexw, .d8s = d8s_dq },
+    [0xa6 ... 0xa8] = { .simd_size = simd_packed_fp, .d8s = d8s_vl },
+    [0xa9] = { .simd_size = simd_scalar_vexw, .d8s = d8s_dq },
+    [0xaa] = { .simd_size = simd_packed_fp, .d8s = d8s_vl },
+    [0xab] = { .simd_size = simd_scalar_vexw, .d8s = d8s_dq },
+    [0xac] = { .simd_size = simd_packed_fp, .d8s = d8s_vl },
+    [0xad] = { .simd_size = simd_scalar_vexw, .d8s = d8s_dq },
+    [0xae] = { .simd_size = simd_packed_fp, .d8s = d8s_vl },
+    [0xaf] = { .simd_size = simd_scalar_vexw, .d8s = d8s_dq },
+    [0xb6 ... 0xb8] = { .simd_size = simd_packed_fp, .d8s = d8s_vl },
+    [0xb9] = { .simd_size = simd_scalar_vexw, .d8s = d8s_dq },
+    [0xba] = { .simd_size = simd_packed_fp, .d8s = d8s_vl },
+    [0xbb] = { .simd_size = simd_scalar_vexw, .d8s = d8s_dq },
+    [0xbc] = { .simd_size = simd_packed_fp, .d8s = d8s_vl },
+    [0xbd] = { .simd_size = simd_scalar_vexw, .d8s = d8s_dq },
+    [0xbe] = { .simd_size = simd_packed_fp, .d8s = d8s_vl },
+    [0xbf] = { .simd_size = simd_scalar_vexw, .d8s = d8s_dq },
     [0xc8 ... 0xcd] = { .simd_size = simd_other },
     [0xdb] = { .simd_size = simd_packed_int, .two_op = 1 },
     [0xdc ... 0xdf] = { .simd_size = simd_packed_int },
@@ -8287,6 +8287,49 @@ x86_emulate(
         host_and_vcpu_must_have(fma);
         goto simd_0f_ymm;
 
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0x96): /* vfmaddsub132p{s,d} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0x97): /* vfmsubadd132p{s,d} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0x98): /* vfmadd132p{s,d} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0x9a): /* vfmsub132p{s,d} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0x9c): /* vfnmadd132p{s,d} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0x9e): /* vfnmsub132p{s,d} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0xa6): /* vfmaddsub213p{s,d} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0xa7): /* vfmsubadd213p{s,d} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0xa8): /* vfmadd213p{s,d} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0xaa): /* vfmsub213p{s,d} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0xac): /* vfnmadd213p{s,d} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0xae): /* vfnmsub213p{s,d} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0xb6): /* vfmaddsub231p{s,d} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0xb7): /* vfmsubadd231p{s,d} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0xb8): /* vfmadd231p{s,d} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0xba): /* vfmsub231p{s,d} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0xbc): /* vfnmadd231p{s,d} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0xbe): /* vfnmsub231p{s,d} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+        host_and_vcpu_must_have(avx512f);
+        if ( ea.type == OP_MEM || !evex.br )
+            avx512_vlen_check(false);
+        goto simd_zmm;
+
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0x99): /* vfmadd132s{s,d} xmm/mem,xmm,xmm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0x9b): /* vfmsub132s{s,d} xmm/mem,xmm,xmm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0x9d): /* vfnmadd132s{s,d} xmm/mem,xmm,xmm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0x9f): /* vfnmsub132s{s,d} xmm/mem,xmm,xmm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0xa9): /* vfmadd213s{s,d} xmm/mem,xmm,xmm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0xab): /* vfmsub213s{s,d} xmm/mem,xmm,xmm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0xad): /* vfnmadd213s{s,d} xmm/mem,xmm,xmm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0xaf): /* vfnmsub213s{s,d} xmm/mem,xmm,xmm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0xb9): /* vfmadd231s{s,d} xmm/mem,xmm,xmm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0xbb): /* vfmsub231s{s,d} xmm/mem,xmm,xmm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0xbd): /* vfnmadd231s{s,d} xmm/mem,xmm,xmm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0xbf): /* vfnmsub231s{s,d} xmm/mem,xmm,xmm{k} */
+        host_and_vcpu_must_have(avx512f);
+        if ( ea.type == OP_MEM )
+        {
+            generate_exception_if(evex.br, EXC_UD);
+            avx512_vlen_check(true);
+        }
+        goto simd_zmm;
+
     case X86EMUL_OPC(0x0f38, 0xc8):     /* sha1nexte xmm/m128,xmm */
     case X86EMUL_OPC(0x0f38, 0xc9):     /* sha1msg1 xmm/m128,xmm */
     case X86EMUL_OPC(0x0f38, 0xca):     /* sha1msg2 xmm/m128,xmm */




_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

^ permalink raw reply	[flat|nested] 465+ messages in thread

* [PATCH v4 14/44] x86emul: support AVX512F legacy-equivalent logic insns
  2018-09-25 13:14 ` [PATCH v4 00/44] x86emul: fixes, improvements, and beginnings of AVX512 support Jan Beulich
                     ` (12 preceding siblings ...)
  2018-09-25 13:33   ` [PATCH v4 13/44] x86emul: support AVX512F fused-multiply-add insns Jan Beulich
@ 2018-09-25 13:34   ` Jan Beulich
  2018-11-13 19:30     ` Andrew Cooper
  2018-09-25 13:35   ` [PATCH v4 15/44] x86emul: support AVX512{F, DQ} FP broadcast insns Jan Beulich
                     ` (29 subsequent siblings)
  43 siblings, 1 reply; 465+ messages in thread
From: Jan Beulich @ 2018-09-25 13:34 UTC (permalink / raw)
  To: xen-devel; +Cc: George Dunlap, Andrew Cooper, Wei Liu

Also include vpternlog{d,q}, as it is used extensively by the compiler, in
order to facilitate enabling tests in the harness as soon as possible. The
twobyte_table[] entries for a few more insns also get their .d8s field set
right away, in order to not split and later re-combine the groups.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v3: New.

--- a/tools/tests/x86_emulator/evex-disp8.c
+++ b/tools/tests/x86_emulator/evex-disp8.c
@@ -146,6 +146,11 @@ static const struct test avx512f_all[] =
     INSN_PFP_NB(movu,        0f, 10),
     INSN_PFP_NB(movu,        0f, 11),
     INSN_FP(mul,             0f, 59),
+    INSN(pand,         66,   0f, db,    vl,     dq, vl),
+    INSN(pandn,        66,   0f, df,    vl,     dq, vl),
+    INSN(por,          66,   0f, eb,    vl,     dq, vl),
+    INSN(pternlog,     66, 0f3a, 25,    vl,     dq, vl),
+    INSN(pxor,         66,   0f, ef,    vl,     dq, vl),
     INSN_PFP(shuf,           0f, c6),
     INSN_FP(sqrt,            0f, 51),
     INSN_FP(sub,             0f, 5c),
--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -364,13 +364,13 @@ static const struct twobyte_table {
     [0xd4 ... 0xd5] = { DstImplicit|SrcMem|ModRM, simd_packed_int },
     [0xd6] = { DstMem|SrcImplicit|ModRM|Mov, simd_other, 3 },
     [0xd7] = { DstReg|SrcImplicit|ModRM|Mov },
-    [0xd8 ... 0xdf] = { DstImplicit|SrcMem|ModRM, simd_packed_int },
+    [0xd8 ... 0xdf] = { DstImplicit|SrcMem|ModRM, simd_packed_int, d8s_vl },
     [0xe0] = { DstImplicit|SrcMem|ModRM, simd_packed_int },
     [0xe1 ... 0xe2] = { DstImplicit|SrcMem|ModRM, simd_other },
     [0xe3 ... 0xe5] = { DstImplicit|SrcMem|ModRM, simd_packed_int },
     [0xe6] = { DstImplicit|SrcMem|ModRM|Mov, simd_other },
     [0xe7] = { DstMem|SrcImplicit|ModRM|Mov, simd_packed_int, d8s_vl },
-    [0xe8 ... 0xef] = { DstImplicit|SrcMem|ModRM, simd_packed_int },
+    [0xe8 ... 0xef] = { DstImplicit|SrcMem|ModRM, simd_packed_int, d8s_vl },
     [0xf0] = { DstImplicit|SrcMem|ModRM|Mov, simd_other },
     [0xf1 ... 0xf3] = { DstImplicit|SrcMem|ModRM, simd_other },
     [0xf4 ... 0xf6] = { DstImplicit|SrcMem|ModRM, simd_packed_int },
@@ -493,6 +493,7 @@ static const struct ext0f3a_table {
     uint8_t to_mem:1;
     uint8_t two_op:1;
     uint8_t four_op:1;
+    disp8scale_t d8s:4;
 } ext0f3a_table[256] = {
     [0x00] = { .simd_size = simd_packed_int, .two_op = 1 },
     [0x01] = { .simd_size = simd_packed_fp, .two_op = 1 },
@@ -510,6 +511,7 @@ static const struct ext0f3a_table {
     [0x20] = { .simd_size = simd_none },
     [0x21] = { .simd_size = simd_other },
     [0x22] = { .simd_size = simd_none },
+    [0x25] = { .simd_size = simd_packed_int, .d8s = d8s_vl },
     [0x30 ... 0x33] = { .simd_size = simd_other, .two_op = 1 },
     [0x38] = { .simd_size = simd_128 },
     [0x39] = { .simd_size = simd_128, .to_mem = 1, .two_op = 1 },
@@ -3016,20 +3018,33 @@ x86_decode(
                 disp8scale = decode_disp8scale(ext0f38_table[b].d8s, state);
             break;
 
+        case ext_0f3a:
+            /*
+             * Cannot update d here yet, as the immediate operand still
+             * needs fetching.
+             */
+            state->simd_size = ext0f3a_table[b].simd_size;
+            if ( evex_encoded() )
+                disp8scale = decode_disp8scale(ext0f3a_table[b].d8s, state);
+            break;
+
         case ext_8f09:
             if ( ext8f09_table[b].two_op )
                 d |= TwoOp;
             state->simd_size = ext8f09_table[b].simd_size;
             break;
 
-        case ext_0f3a:
         case ext_8f08:
+        case ext_8f0a:
             /*
              * Cannot update d here yet, as the immediate operand still
              * needs fetching.
              */
-        default:
             break;
+
+        default:
+            ASSERT_UNREACHABLE();
+            return X86EMUL_UNIMPLEMENTED;
         }
 
         if ( modrm_mod == 3 )
@@ -3208,7 +3223,6 @@ x86_decode(
         else if ( ext0f3a_table[b].four_op && !mode_64bit() && vex.opcx )
             imm1 &= 0x7f;
         state->desc = d;
-        state->simd_size = ext0f3a_table[b].simd_size;
         rc = x86_decode_0f3a(state, ctxt, ops);
         break;
 
@@ -5937,6 +5951,11 @@ x86_emulate(
         generate_exception_if(evex.w != (evex.pfx & VEX_PREFIX_DOUBLE_MASK),
                               EXC_UD);
         fault_suppression = false;
+        /* fall through */
+    case X86EMUL_OPC_EVEX_66(0x0f, 0xdb): /* vpand{d,q} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f, 0xdf): /* vpandn{d,q} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f, 0xeb): /* vpor{d,q} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f, 0xef): /* vpxor{d,q} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
     avx512f_no_sae:
         host_and_vcpu_must_have(avx512f);
         generate_exception_if(ea.type != OP_MEM && evex.br, EXC_UD);
@@ -7520,6 +7539,8 @@ x86_emulate(
         fault_suppression = false;
         generate_exception_if(evex.w != (evex.pfx & VEX_PREFIX_DOUBLE_MASK),
                               EXC_UD);
+        /* fall through */
+    case X86EMUL_OPC_EVEX_66(0x0f3a, 0x25): /* vpternlog{d,q} $imm8,[xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
         host_and_vcpu_must_have(avx512f);
         generate_exception_if(ea.type != OP_MEM && evex.br, EXC_UD);
         avx512_vlen_check(false);




_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

^ permalink raw reply	[flat|nested] 465+ messages in thread

* [PATCH v4 15/44] x86emul: support AVX512{F, DQ} FP broadcast insns
  2018-09-25 13:14 ` [PATCH v4 00/44] x86emul: fixes, improvements, and beginnings of AVX512 support Jan Beulich
                     ` (13 preceding siblings ...)
  2018-09-25 13:34   ` [PATCH v4 14/44] x86emul: support AVX512F legacy-equivalent logic insns Jan Beulich
@ 2018-09-25 13:35   ` Jan Beulich
  2018-11-13 19:37     ` Andrew Cooper
  2018-09-25 13:35   ` [PATCH v4 16/44] x86emul: support AVX512F v{, u}comis{d, s} insns Jan Beulich
                     ` (28 subsequent siblings)
  43 siblings, 1 reply; 465+ messages in thread
From: Jan Beulich @ 2018-09-25 13:35 UTC (permalink / raw)
  To: xen-devel; +Cc: George Dunlap, Andrew Cooper, Wei Liu

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v3: New.

--- a/tools/tests/x86_emulator/evex-disp8.c
+++ b/tools/tests/x86_emulator/evex-disp8.c
@@ -94,6 +94,7 @@ enum esz {
 
 static const struct test avx512f_all[] = {
     INSN_FP(add,             0f, 58),
+    INSN(broadcastss,  66, 0f38, 18,    el,      d, el),
     INSN_FP(cmp,             0f, c2),
     INSN_FP(div,             0f, 5e),
     INSN(fmadd132,     66, 0f38, 98,    vl,     sd, vl),
@@ -165,6 +166,15 @@ static const struct test avx512f_128[] =
     INSN(movq,      66,   0f, d6, el,    q, el),
 };
 
+static const struct test avx512f_no128[] = {
+    INSN(broadcastf32x4, 66, 0f38, 1a, el_4,  d, vl),
+    INSN(broadcastsd,    66, 0f38, 19, el,    q, el),
+};
+
+static const struct test avx512f_512[] = {
+    INSN(broadcastf64x4, 66, 0f38, 1b, el_4, q, vl),
+};
+
 static const struct test avx512bw_all[] = {
     INSN(movdqu8,     f2,   0f, 6f,    vl,    b, vl),
     INSN(movdqu8,     f2,   0f, 7f,    vl,    b, vl),
@@ -179,8 +189,19 @@ static const struct test avx512dq_all[]
     INSN_PFP(xor,              0f, 57),
 };
 
+static const struct test avx512dq_no128[] = {
+    INSN(broadcastf32x2, 66, 0f38, 19, el_2, d, vl),
+    INSN(broadcastf64x2, 66, 0f38, 1a, el_2, q, vl),
+};
+
+static const struct test avx512dq_512[] = {
+    INSN(broadcastf32x8, 66, 0f38, 1b, el_8, d, vl),
+};
+
 static const unsigned char vl_all[] = { VL_512, VL_128, VL_256 };
 static const unsigned char vl_128[] = { VL_128 };
+static const unsigned char vl_no128[] = { VL_512, VL_256 };
+static const unsigned char vl_512[] = { VL_512 };
 
 /*
  * This table, indicating the presence of an immediate (byte) for an opcode
@@ -501,6 +522,10 @@ void evex_disp8_test(void *instr, struct
 
     RUN(avx512f, all);
     RUN(avx512f, 128);
+    RUN(avx512f, no128);
+    RUN(avx512f, 512);
     RUN(avx512bw, all);
     RUN(avx512dq, all);
+    RUN(avx512dq, no128);
+    RUN(avx512dq, 512);
 }
--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -234,10 +234,16 @@ enum simd_opsize {
 
     /*
      * 128 bits of integer or floating point data, with no further
-     * formatting information.
+     * formatting information, or with it encoded by EVEX.W.
      */
     simd_128,
 
+    /*
+     * 256 bits of integer or floating point data, with formatting
+     * encoded by EVEX.W.
+     */
+    simd_256,
+
     /* Operand size encoded in non-standard way. */
     simd_other
 };
@@ -432,8 +438,10 @@ static const struct ext0f38_table {
     [0x13] = { .simd_size = simd_other, .two_op = 1 },
     [0x14 ... 0x16] = { .simd_size = simd_packed_fp },
     [0x17] = { .simd_size = simd_packed_int, .two_op = 1 },
-    [0x18 ... 0x19] = { .simd_size = simd_scalar_opc, .two_op = 1 },
-    [0x1a] = { .simd_size = simd_128, .two_op = 1 },
+    [0x18] = { .simd_size = simd_scalar_opc, .two_op = 1, .d8s = 2 },
+    [0x19] = { .simd_size = simd_scalar_opc, .two_op = 1, .d8s = 3 },
+    [0x1a] = { .simd_size = simd_128, .two_op = 1, .d8s = 4 },
+    [0x1b] = { .simd_size = simd_256, .two_op = 1, .d8s = d8s_vl_by_2 },
     [0x1c ... 0x1e] = { .simd_size = simd_packed_int, .two_op = 1 },
     [0x20 ... 0x25] = { .simd_size = simd_other, .two_op = 1 },
     [0x28 ... 0x29] = { .simd_size = simd_packed_int },
@@ -3330,6 +3338,10 @@ x86_decode(
         op_bytes = 16;
         break;
 
+    case simd_256:
+        op_bytes = 32;
+        break;
+
     default:
         op_bytes = 0;
         break;
@@ -7979,6 +7991,42 @@ x86_emulate(
         dst.type = OP_NONE;
         break;
 
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0x18): /* vbroadcastss xmm/m32,[xyz]mm{k} */
+        generate_exception_if(evex.w || evex.br, EXC_UD);
+    avx512_broadcast:
+        /*
+         * For the respective code below the main switch() to work we need to
+         * fold op_mask here: A source element gets read whenever any of its
+         * respective destination elements' mask bits is set.
+         */
+        if ( fault_suppression )
+        {
+            n = 1 << ((b & 3) - evex.w);
+            ASSERT(op_bytes == n * elem_bytes);
+            for ( i = n; i < (16 << evex.lr) / elem_bytes; i += n )
+                op_mask |= (op_mask >> i) & ((1 << n) - 1);
+        }
+        goto avx512f_no_sae;
+
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0x1b): /* vbroadcastf32x8 m256,zmm{k} */
+                                            /* vbroadcastf64x4 m256,zmm{k} */
+        generate_exception_if(ea.type != OP_MEM || evex.lr != 2, EXC_UD);
+        /* fall through */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0x19): /* vbroadcastsd xmm/m64,{y,z}mm{k} */
+                                            /* vbroadcastf32x2 xmm/m64,{y,z}mm{k} */
+        generate_exception_if(!evex.lr || evex.br, EXC_UD);
+        if ( !evex.w )
+            host_and_vcpu_must_have(avx512dq);
+        goto avx512_broadcast;
+
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0x1a): /* vbroadcastf32x4 m128,{y,z}mm{k} */
+                                            /* vbroadcastf64x2 m128,{y,z}mm{k} */
+        generate_exception_if(ea.type != OP_MEM || !evex.lr || evex.br,
+                              EXC_UD);
+        if ( evex.w )
+            host_and_vcpu_must_have(avx512dq);
+        goto avx512_broadcast;
+
     case X86EMUL_OPC_66(0x0f38, 0x20): /* pmovsxbw xmm/m64,xmm */
     case X86EMUL_OPC_66(0x0f38, 0x21): /* pmovsxbd xmm/m32,xmm */
     case X86EMUL_OPC_66(0x0f38, 0x22): /* pmovsxbq xmm/m16,xmm */




_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

^ permalink raw reply	[flat|nested] 465+ messages in thread

* [PATCH v4 16/44] x86emul: support AVX512F v{, u}comis{d, s} insns
  2018-09-25 13:14 ` [PATCH v4 00/44] x86emul: fixes, improvements, and beginnings of AVX512 support Jan Beulich
                     ` (14 preceding siblings ...)
  2018-09-25 13:35   ` [PATCH v4 15/44] x86emul: support AVX512{F, DQ} FP broadcast insns Jan Beulich
@ 2018-09-25 13:35   ` Jan Beulich
  2018-11-13 19:39     ` Andrew Cooper
  2018-09-25 13:36   ` [PATCH v4 17/44] x86emul/test: introduce eq() Jan Beulich
                     ` (27 subsequent siblings)
  43 siblings, 1 reply; 465+ messages in thread
From: Jan Beulich @ 2018-09-25 13:35 UTC (permalink / raw)
  To: xen-devel; +Cc: George Dunlap, Andrew Cooper, Wei Liu

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v4: Add missing avx512_vlen_check().
v3: New.

--- a/tools/tests/x86_emulator/evex-disp8.c
+++ b/tools/tests/x86_emulator/evex-disp8.c
@@ -96,6 +96,8 @@ static const struct test avx512f_all[] =
     INSN_FP(add,             0f, 58),
     INSN(broadcastss,  66, 0f38, 18,    el,      d, el),
     INSN_FP(cmp,             0f, c2),
+    INSN(comisd,       66,   0f, 2f,    el,      q, el),
+    INSN(comiss,         ,   0f, 2f,    el,      d, el),
     INSN_FP(div,             0f, 5e),
     INSN(fmadd132,     66, 0f38, 98,    vl,     sd, vl),
     INSN(fmadd132,     66, 0f38, 99,    el,     sd, el),
@@ -155,6 +157,8 @@ static const struct test avx512f_all[] =
     INSN_PFP(shuf,           0f, c6),
     INSN_FP(sqrt,            0f, 51),
     INSN_FP(sub,             0f, 5c),
+    INSN(ucomisd,      66,   0f, 2e,    el,      q, el),
+    INSN(ucomiss,        ,   0f, 2e,    el,      d, el),
     INSN_PFP(unpckh,         0f, 15),
     INSN_PFP(unpckl,         0f, 14),
 };
--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -299,7 +299,7 @@ static const struct twobyte_table {
     [0x2a] = { DstImplicit|SrcMem|ModRM|Mov, simd_other },
     [0x2b] = { DstMem|SrcImplicit|ModRM|Mov, simd_any_fp, d8s_vl },
     [0x2c ... 0x2d] = { DstImplicit|SrcMem|ModRM|Mov, simd_other },
-    [0x2e ... 0x2f] = { ImplicitOps|ModRM|TwoOp },
+    [0x2e ... 0x2f] = { ImplicitOps|ModRM|TwoOp, simd_none, d8s_dq },
     [0x30 ... 0x35] = { ImplicitOps },
     [0x37] = { ImplicitOps },
     [0x38] = { DstReg|SrcMem|ModRM },
@@ -6111,24 +6111,34 @@ x86_emulate(
         }
 
         opc = init_prefixes(stub);
+        op_bytes = 4 << vex.pfx;
+    vcomi:
         opc[0] = b;
         opc[1] = modrm;
         if ( ea.type == OP_MEM )
         {
-            rc = ops->read(ea.mem.seg, ea.mem.off, mmvalp, vex.pfx ? 8 : 4,
-                           ctxt);
+            rc = ops->read(ea.mem.seg, ea.mem.off, mmvalp, op_bytes, ctxt);
             if ( rc != X86EMUL_OKAY )
                 goto done;
 
             /* Convert memory operand to (%rAX). */
             rex_prefix &= ~REX_B;
             vex.b = 1;
+            evex.b = 1;
             opc[1] &= 0x38;
         }
-        insn_bytes = PFX_BYTES + 2;
+        if ( evex_encoded() )
+        {
+            insn_bytes = EVEX_PFX_BYTES + 2;
+            copy_EVEX(opc, evex);
+        }
+        else
+        {
+            insn_bytes = PFX_BYTES + 2;
+            copy_REX_VEX(opc, rex_prefix, vex);
+        }
         opc[2] = 0xc3;
 
-        copy_REX_VEX(opc, rex_prefix, vex);
         invoke_stub(_PRE_EFLAGS("[eflags]", "[mask]", "[tmp]"),
                     _POST_EFLAGS("[eflags]", "[mask]", "[tmp]"),
                     [eflags] "+g" (_regs.eflags),
@@ -6139,6 +6149,20 @@ x86_emulate(
         ASSERT(!state->simd_size);
         break;
 
+    CASE_SIMD_PACKED_FP(_EVEX, 0x0f, 0x2e): /* vucomis{s,d} xmm/mem,xmm */
+    CASE_SIMD_PACKED_FP(_EVEX, 0x0f, 0x2f): /* vcomis{s,d} xmm/mem,xmm */
+        generate_exception_if((evex.reg != 0xf || !evex.RX || evex.opmsk ||
+                               (ea.type != OP_REG && evex.br) ||
+                               evex.w != evex.pfx),
+                              EXC_UD);
+        host_and_vcpu_must_have(avx512f);
+        avx512_vlen_check(true);
+        get_fpu(X86EMUL_FPU_zmm);
+
+        opc = init_evex(stub);
+        op_bytes = 4 << evex.w;
+        goto vcomi;
+
     case X86EMUL_OPC(0x0f, 0x30): /* wrmsr */
         generate_exception_if(!mode_ring0(), EXC_GP, 0);
         fail_if(ops->write_msr == NULL);





_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

^ permalink raw reply	[flat|nested] 465+ messages in thread

* [PATCH v4 17/44] x86emul/test: introduce eq()
  2018-09-25 13:14 ` [PATCH v4 00/44] x86emul: fixes, improvements, and beginnings of AVX512 support Jan Beulich
                     ` (15 preceding siblings ...)
  2018-09-25 13:35   ` [PATCH v4 16/44] x86emul: support AVX512F v{, u}comis{d, s} insns Jan Beulich
@ 2018-09-25 13:36   ` Jan Beulich
  2018-10-26 11:31     ` Andrew Cooper
  2018-09-25 13:37   ` [PATCH v4 18/44] x86emul: support AVX512{F, BW} packed integer compare insns Jan Beulich
                     ` (26 subsequent siblings)
  43 siblings, 1 reply; 465+ messages in thread
From: Jan Beulich @ 2018-09-25 13:36 UTC (permalink / raw)
  To: xen-devel; +Cc: George Dunlap, Andrew Cooper, Wei Liu

In preparation for sensible to-boolean conversion on AVX512, wrap
another abstraction function around the present to_bool(<x> == <y>), to
get rid of the open-coded == (which will get in the way of using
built-in functions instead). For the future AVX512 use, scalar operands
can no longer be used: Use (vec_t){} when the operand is zero,
and broadcast (if available) otherwise (assume pre-AVX512 when broadcast
is not available, in which case a plain scalar is still fine).

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v3: New.

--- a/tools/tests/x86_emulator/simd.c
+++ b/tools/tests/x86_emulator/simd.c
@@ -46,6 +46,10 @@ static inline bool _to_bool(byte_vec_t b
 # define to_bool(cmp) _to_bool((byte_vec_t)(cmp))
 #endif
 
+#ifndef eq
+# define eq(x, y) to_bool((x) == (y))
+#endif
+
 #if VEC_SIZE == FLOAT_SIZE
 # define to_int(x) ((vec_t){ (int)(x)[0] })
 #elif VEC_SIZE == 8 && FLOAT_SIZE == 4 && defined(__3dNOW__)
@@ -605,18 +609,18 @@ int simd_test(void)
     touch(src);
     x = src;
     touch(x);
-    if ( !to_bool(x == src) ) return __LINE__;
+    if ( !eq(x, src) ) return __LINE__;
 
     touch(src);
     y = x + src;
     touch(src);
     touch(y);
-    if ( !to_bool(y == 2 * src) ) return __LINE__;
+    if ( !eq(y, 2 * src) ) return __LINE__;
 
     touch(src);
     z = y -= src;
     touch(z);
-    if ( !to_bool(x == z) ) return __LINE__;
+    if ( !eq(x, z) ) return __LINE__;
 
 #if defined(UINT_SIZE)
 
@@ -628,7 +632,7 @@ int simd_test(void)
     z ^= inv;
     touch(inv);
     touch(x);
-    if ( !to_bool((x & ~y) == z) ) return __LINE__;
+    if ( !eq(x & ~y, z) ) return __LINE__;
 
 #elif ELEM_SIZE > 1 || VEC_SIZE <= 8
 
@@ -639,7 +643,7 @@ int simd_test(void)
     z = src + inv;
     touch(inv);
     z *= (src - inv);
-    if ( !to_bool(x - y == z) ) return __LINE__;
+    if ( !eq(x - y, z) ) return __LINE__;
 
 #endif
 
@@ -648,10 +652,10 @@ int simd_test(void)
     x = src * alt;
     touch(alt);
     y = src / alt;
-    if ( !to_bool(x == y) ) return __LINE__;
+    if ( !eq(x, y) ) return __LINE__;
     touch(alt);
     touch(src);
-    if ( !to_bool(x * -alt == -src) ) return __LINE__;
+    if ( !eq(x * -alt, -src) ) return __LINE__;
 
 # if defined(recip) && defined(to_int)
 
@@ -659,16 +663,16 @@ int simd_test(void)
     x = recip(src);
     touch(src);
     touch(x);
-    if ( !to_bool(to_int(recip(x)) == src) ) return __LINE__;
+    if ( !eq(to_int(recip(x)), src) ) return __LINE__;
 
 #  ifdef rsqrt
     x = src * src;
     touch(x);
     y = rsqrt(x);
     touch(y);
-    if ( !to_bool(to_int(recip(y)) == src) ) return __LINE__;
+    if ( !eq(to_int(recip(y)), src) ) return __LINE__;
     touch(src);
-    if ( !to_bool(to_int(y) == to_int(recip(src))) ) return __LINE__;
+    if ( !eq(to_int(y), to_int(recip(src))) ) return __LINE__;
 #  endif
 
 # endif
@@ -676,7 +680,7 @@ int simd_test(void)
 # ifdef sqrt
     x = src * src;
     touch(x);
-    if ( !to_bool(sqrt(x) == src) ) return __LINE__;
+    if ( !eq(sqrt(x), src) ) return __LINE__;
 # endif
 
 # ifdef trunc
@@ -684,20 +688,20 @@ int simd_test(void)
     y = (vec_t){ 1 };
     touch(x);
     z = trunc(x);
-    if ( !to_bool(y == z) ) return __LINE__;
+    if ( !eq(y, z) ) return __LINE__;
 # endif
 
 # ifdef frac
     touch(src);
     x = frac(src);
     touch(src);
-    if ( !to_bool(x == 0) ) return __LINE__;
+    if ( !eq(x, (vec_t){}) ) return __LINE__;
 
     x = 1 / (src + 1);
     touch(x);
     y = frac(x);
     touch(x);
-    if ( !to_bool(x == y) ) return __LINE__;
+    if ( !eq(x, y) ) return __LINE__;
 # endif
 
 # if defined(trunc) && defined(frac)
@@ -707,7 +711,7 @@ int simd_test(void)
     touch(x);
     z = frac(x);
     touch(x);
-    if ( !to_bool(x == y + z) ) return __LINE__;
+    if ( !eq(x, y + z) ) return __LINE__;
 # endif
 
 #else
@@ -720,16 +724,16 @@ int simd_test(void)
     y[ELEM_COUNT - 1] = y[0] = j = ELEM_COUNT;
     for ( i = 1; i < ELEM_COUNT / 2; ++i )
         y[ELEM_COUNT - i - 1] = y[i] = y[i - 1] + (j -= 2);
-    if ( !to_bool(x == y) ) return __LINE__;
+    if ( !eq(x, y) ) return __LINE__;
 
 #  ifdef mul_hi
     touch(alt);
     x = mul_hi(src, alt);
     touch(alt);
 #   ifdef INT_SIZE
-    if ( !to_bool(x == (alt < 0)) ) return __LINE__;
+    if ( !eq(x, alt < 0) ) return __LINE__;
 #   else
-    if ( !to_bool(x == (src & alt) + alt) ) return __LINE__;
+    if ( !eq(x, (src & alt) + alt) ) return __LINE__;
 #   endif
 #  endif
 
@@ -745,7 +749,7 @@ int simd_test(void)
         z[i] = res;
         z[i + 1] = res >> (ELEM_SIZE << 3);
     }
-    if ( !to_bool(y == z) ) return __LINE__;
+    if ( !eq(y, z) ) return __LINE__;
 #  endif
 
     z = src;
@@ -757,12 +761,12 @@ int simd_test(void)
     touch(z);
     y = z << 2;
     touch(z);
-    if ( !to_bool(x == y + y) ) return __LINE__;
+    if ( !eq(x, y + y) ) return __LINE__;
 
     touch(x);
     z = x >> 2;
     touch(x);
-    if ( !to_bool(y == z + z) ) return __LINE__;
+    if ( !eq(y, z + z) ) return __LINE__;
 
     z = src;
 #  ifdef INT_SIZE
@@ -781,11 +785,11 @@ int simd_test(void)
     touch(j);
     y = z << j;
     touch(j);
-    if ( !to_bool(x == y + y) ) return __LINE__;
+    if ( !eq(x, y + y) ) return __LINE__;
 
     z = x >> j;
     touch(j);
-    if ( !to_bool(y == z + z) ) return __LINE__;
+    if ( !eq(y, z + z) ) return __LINE__;
 
 # endif
 
@@ -809,12 +813,12 @@ int simd_test(void)
     --sh;
     touch(sh);
     y = z << sh;
-    if ( !to_bool(x == y + y) ) return __LINE__;
+    if ( !eq(x, y + y) ) return __LINE__;
 
 #  if (defined(__AVX2__) && ELEM_SIZE >= 4) || defined(__XOP__)
     touch(sh);
     x = y >> sh;
-    if ( !to_bool(x == z) ) return __LINE__;
+    if ( !eq(x, z) ) return __LINE__;
 #  endif
 
 # endif
@@ -828,7 +832,7 @@ int simd_test(void)
     touch(inv);
     y = max(src, inv);
     touch(inv);
-    if ( !to_bool(x + y == src + inv) ) return __LINE__;
+    if ( !eq(x + y, src + inv) ) return __LINE__;
 # else
     x = src * alt;
     y = inv * alt;
@@ -837,33 +841,33 @@ int simd_test(void)
     touch(y);
     y = min(x, y);
     touch(y);
-    if ( !to_bool((y + z) * alt == src + inv) ) return __LINE__;
+    if ( !eq((y + z) * alt, src + inv) ) return __LINE__;
 # endif
 #endif
 
 #ifdef abs
     x = src * alt;
     touch(x);
-    if ( !to_bool(abs(x) == src) ) return __LINE__;
+    if ( !eq(abs(x), src) ) return __LINE__;
 #endif
 
 #ifdef copysignz
     touch(alt);
-    if ( !to_bool(copysignz((vec_t){} + 1, alt) == alt) ) return __LINE__;
+    if ( !eq(copysignz((vec_t){} + 1, alt), alt) ) return __LINE__;
 #endif
 
 #ifdef swap
     touch(src);
-    if ( !to_bool(swap(src) == inv) ) return __LINE__;
+    if ( !eq(swap(src), inv) ) return __LINE__;
 #endif
 
 #ifdef swap2
     touch(src);
-    if ( !to_bool(swap2(src) == inv) ) return __LINE__;
+    if ( !eq(swap2(src), inv) ) return __LINE__;
 #endif
 
 #if defined(broadcast)
-    if ( !to_bool(broadcast(ELEM_COUNT + 1) == src + inv) ) return __LINE__;
+    if ( !eq(broadcast(ELEM_COUNT + 1), src + inv) ) return __LINE__;
 #endif
 
 #if defined(interleave_lo) && defined(interleave_hi)
@@ -877,7 +881,11 @@ int simd_test(void)
 # else
     z = (x - y) * alt;
 # endif
-    if ( !to_bool(z == ELEM_COUNT / 2) ) return __LINE__;
+# ifdef broadcast
+    if ( !eq(z, broadcast(ELEM_COUNT / 2)) ) return __LINE__;
+# else
+    if ( !eq(z, ELEM_COUNT / 2) ) return __LINE__;
+# endif
 #endif
 
 #if defined(INT_SIZE) && defined(widen1) && defined(interleave_lo)
@@ -887,7 +895,7 @@ int simd_test(void)
     touch(x);
     z = widen1(x);
     touch(x);
-    if ( !to_bool(z == y) ) return __LINE__;
+    if ( !eq(z, y) ) return __LINE__;
 
 # ifdef widen2
     y = interleave_lo(alt < 0, alt < 0);
@@ -895,7 +903,7 @@ int simd_test(void)
     touch(x);
     z = widen2(x);
     touch(x);
-    if ( !to_bool(z == y) ) return __LINE__;
+    if ( !eq(z, y) ) return __LINE__;
 
 #  ifdef widen3
     y = interleave_lo(alt < 0, alt < 0);
@@ -904,7 +912,7 @@ int simd_test(void)
     touch(x);
     z = widen3(x);
     touch(x);
-    if ( !to_bool(z == y) ) return __LINE__;
+    if ( !eq(z, y) ) return __LINE__;
 #  endif
 # endif
 
@@ -919,21 +927,21 @@ int simd_test(void)
     touch(src);
     x = widen1(src);
     touch(src);
-    if ( !to_bool(x == y) ) return __LINE__;
+    if ( !eq(x, y) ) return __LINE__;
 # endif
 
 # ifdef widen2
     touch(src);
     x = widen2(src);
     touch(src);
-    if ( !to_bool(x == z) ) return __LINE__;
+    if ( !eq(x, z) ) return __LINE__;
 # endif
 
 # ifdef widen3
     touch(src);
     x = widen3(src);
     touch(src);
-    if ( !to_bool(x == interleave_lo(z, (vec_t){})) ) return __LINE__;
+    if ( !eq(x, interleave_lo(z, (vec_t){})) ) return __LINE__;
 # endif
 
 #endif
@@ -942,14 +950,14 @@ int simd_test(void)
     touch(src);
     x = dup_lo(src);
     touch(src);
-    if ( !to_bool(x - src == (alt - 1) / 2) ) return __LINE__;
+    if ( !eq(x - src, (alt - 1) / 2) ) return __LINE__;
 #endif
 
 #ifdef dup_hi
     touch(src);
     x = dup_hi(src);
     touch(src);
-    if ( !to_bool(x - src == (alt + 1) / 2) ) return __LINE__;
+    if ( !eq(x - src, (alt + 1) / 2) ) return __LINE__;
 #endif
 
     for ( i = 0; i < ELEM_COUNT; ++i )
@@ -961,7 +969,7 @@ int simd_test(void)
 # else
     select(&z, src, inv, alt > 0);
 # endif
-    if ( !to_bool(z == y) ) return __LINE__;
+    if ( !eq(z, y) ) return __LINE__;
 #endif
 
 #ifdef select2
@@ -970,14 +978,14 @@ int simd_test(void)
 # else
     select2(&z, src, inv, alt > 0);
 # endif
-    if ( !to_bool(z == y) ) return __LINE__;
+    if ( !eq(z, y) ) return __LINE__;
 #endif
 
 #ifdef mix
     touch(src);
     touch(inv);
     x = mix(src, inv);
-    if ( !to_bool(x == y) ) return __LINE__;
+    if ( !eq(x, y) ) return __LINE__;
 
 # ifdef addsub
     touch(src);
@@ -986,22 +994,22 @@ int simd_test(void)
     touch(src);
     touch(inv);
     y = mix(src - inv, src + inv);
-    if ( !to_bool(x == y) ) return __LINE__;
+    if ( !eq(x, y) ) return __LINE__;
 # endif
 #endif
 
 #ifdef rotr
     x = rotr(src, 1);
     y = (src & (ELEM_COUNT - 1)) + 1;
-    if ( !to_bool(x == y) ) return __LINE__;
+    if ( !eq(x, y) ) return __LINE__;
 #endif
 
 #ifdef dot_product
     touch(src);
     touch(inv);
     x = dot_product(src, inv);
-    if ( !to_bool(x == (vec_t){ (ELEM_COUNT * (ELEM_COUNT + 1) *
-                                 (ELEM_COUNT + 2)) / 6 }) ) return __LINE__;
+    if ( !eq(x, (vec_t){ (ELEM_COUNT * (ELEM_COUNT + 1) *
+                          (ELEM_COUNT + 2)) / 6 }) ) return __LINE__;
 #endif
 
 #ifdef hadd
@@ -1022,7 +1030,7 @@ int simd_test(void)
     x = hsub(src, inv);
     for ( i = ELEM_COUNT; i >>= 1; )
         x = hadd(x, (vec_t){});
-    if ( !to_bool(x == 0) ) return __LINE__;
+    if ( !eq(x, (vec_t){}) ) return __LINE__;
 # endif
 #endif
 
--- a/tools/tests/x86_emulator/simd-fma.c
+++ b/tools/tests/x86_emulator/simd-fma.c
@@ -20,6 +20,10 @@ ENTRY(fma_test);
 # endif
 #endif
 
+#ifndef eq
+# define eq(x, y) to_bool((x) == (y))
+#endif
+
 #if VEC_SIZE == 16
 # if FLOAT_SIZE == 4
 #  define addsub(x, y) __builtin_ia32_addsubps(x, y)
@@ -62,38 +66,38 @@ int fma_test(void)
     y = (src - one) * inv;
     touch(src);
     z = inv * src + inv;
-    if ( !to_bool(x == z) ) return __LINE__;
+    if ( !eq(x, z) ) return __LINE__;
 
     touch(src);
     z = -inv * src - inv;
-    if ( !to_bool(-x == z) ) return __LINE__;
+    if ( !eq(-x, z) ) return __LINE__;
 
     touch(src);
     z = inv * src - inv;
-    if ( !to_bool(y == z) ) return __LINE__;
+    if ( !eq(y, z) ) return __LINE__;
 
     touch(src);
     z = -inv * src + inv;
-    if ( !to_bool(-y == z) ) return __LINE__;
+    if ( !eq(-y, z) ) return __LINE__;
     touch(src);
 
     x = src + inv;
     y = src - inv;
     touch(inv);
     z = src * one + inv;
-    if ( !to_bool(x == z) ) return __LINE__;
+    if ( !eq(x, z) ) return __LINE__;
 
     touch(inv);
     z = -src * one - inv;
-    if ( !to_bool(-x == z) ) return __LINE__;
+    if ( !eq(-x, z) ) return __LINE__;
 
     touch(inv);
     z = src * one - inv;
-    if ( !to_bool(y == z) ) return __LINE__;
+    if ( !eq(y, z) ) return __LINE__;
 
     touch(inv);
     z = -src * one + inv;
-    if ( !to_bool(-y == z) ) return __LINE__;
+    if ( !eq(-y, z) ) return __LINE__;
     touch(inv);
 
 #if defined(addsub) && defined(fmaddsub)
@@ -101,21 +105,21 @@ int fma_test(void)
     y = addsub(src * inv, -one);
     touch(one);
     z = fmaddsub(src, inv, one);
-    if ( !to_bool(x == z) ) return __LINE__;
+    if ( !eq(x, z) ) return __LINE__;
 
     touch(one);
     z = fmaddsub(src, inv, -one);
-    if ( !to_bool(y == z) ) return __LINE__;
+    if ( !eq(y, z) ) return __LINE__;
     touch(one);
 
     x = addsub(src * inv, one);
     touch(inv);
     z = fmaddsub(src, inv, one);
-    if ( !to_bool(x == z) ) return __LINE__;
+    if ( !eq(x, z) ) return __LINE__;
 
     touch(inv);
     z = fmaddsub(src, inv, -one);
-    if ( !to_bool(y == z) ) return __LINE__;
+    if ( !eq(y, z) ) return __LINE__;
     touch(inv);
 #endif
 




_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

^ permalink raw reply	[flat|nested] 465+ messages in thread

* [PATCH v4 18/44] x86emul: support AVX512{F, BW} packed integer compare insns
  2018-09-25 13:14 ` [PATCH v4 00/44] x86emul: fixes, improvements, and beginnings of AVX512 support Jan Beulich
                     ` (16 preceding siblings ...)
  2018-09-25 13:36   ` [PATCH v4 17/44] x86emul/test: introduce eq() Jan Beulich
@ 2018-09-25 13:37   ` Jan Beulich
  2018-09-25 13:37   ` [PATCH v4 19/44] x86emul: support AVX512{F, BW} packed integer arithmetic insns Jan Beulich
                     ` (25 subsequent siblings)
  43 siblings, 0 replies; 465+ messages in thread
From: Jan Beulich @ 2018-09-25 13:37 UTC (permalink / raw)
  To: xen-devel; +Cc: George Dunlap, Andrew Cooper, Wei Liu

Include VPTEST{,N}M{B,D,Q,W}, as these are once again possibly used by the
compiler for comparisons against all-zero vectors.

Also table entries for a few more insns get their .d8s field set right
away, again in order to not split and later re-combine the groups.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v3: New.

--- a/tools/tests/x86_emulator/evex-disp8.c
+++ b/tools/tests/x86_emulator/evex-disp8.c
@@ -151,8 +151,16 @@ static const struct test avx512f_all[] =
     INSN_FP(mul,             0f, 59),
     INSN(pand,         66,   0f, db,    vl,     dq, vl),
     INSN(pandn,        66,   0f, df,    vl,     dq, vl),
+    INSN(pcmp,         66, 0f3a, 1f,    vl,     dq, vl),
+    INSN(pcmpeqd,      66,   0f, 76,    vl,      d, vl),
+    INSN(pcmpeqq,      66, 0f38, 29,    vl,      q, vl),
+    INSN(pcmpgtd,      66,   0f, 66,    vl,      d, vl),
+    INSN(pcmpgtq,      66, 0f38, 37,    vl,      q, vl),
+    INSN(pcmpu,        66, 0f3a, 1e,    vl,     dq, vl),
     INSN(por,          66,   0f, eb,    vl,     dq, vl),
     INSN(pternlog,     66, 0f3a, 25,    vl,     dq, vl),
+    INSN(ptestm,       66, 0f38, 27,    vl,     dq, vl),
+    INSN(ptestnm,      f3, 0f38, 27,    vl,     dq, vl),
     INSN(pxor,         66,   0f, ef,    vl,     dq, vl),
     INSN_PFP(shuf,           0f, c6),
     INSN_FP(sqrt,            0f, 51),
@@ -184,6 +192,14 @@ static const struct test avx512bw_all[]
     INSN(movdqu8,     f2,   0f, 7f,    vl,    b, vl),
     INSN(movdqu16,    f2,   0f, 6f,    vl,    w, vl),
     INSN(movdqu16,    f2,   0f, 7f,    vl,    w, vl),
+    INSN(pcmp,        66, 0f3a, 3f,    vl,   bw, vl),
+    INSN(pcmpeqb,     66,   0f, 74,    vl,    b, vl),
+    INSN(pcmpeqw,     66,   0f, 75,    vl,    w, vl),
+    INSN(pcmpgtb,     66,   0f, 64,    vl,    b, vl),
+    INSN(pcmpgtw,     66,   0f, 65,    vl,    w, vl),
+    INSN(pcmpu,       66, 0f3a, 3e,    vl,   bw, vl),
+    INSN(ptestm,      66, 0f38, 26,    vl,   bw, vl),
+    INSN(ptestnm,     f3, 0f38, 26,    vl,   bw, vl),
 };
 
 static const struct test avx512dq_all[] = {
--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -313,14 +313,14 @@ static const struct twobyte_table {
     [0x5a ... 0x5b] = { DstImplicit|SrcMem|ModRM|Mov, simd_other },
     [0x5c ... 0x5f] = { DstImplicit|SrcMem|ModRM, simd_any_fp, d8s_vl },
     [0x60 ... 0x62] = { DstImplicit|SrcMem|ModRM, simd_other },
-    [0x63 ... 0x67] = { DstImplicit|SrcMem|ModRM, simd_packed_int },
+    [0x63 ... 0x67] = { DstImplicit|SrcMem|ModRM, simd_packed_int, d8s_vl },
     [0x68 ... 0x6a] = { DstImplicit|SrcMem|ModRM, simd_other },
     [0x6b ... 0x6d] = { DstImplicit|SrcMem|ModRM, simd_packed_int },
     [0x6e] = { DstImplicit|SrcMem|ModRM|Mov, simd_none, d8s_dq64 },
     [0x6f] = { DstImplicit|SrcMem|ModRM|Mov, simd_packed_int, d8s_vl },
     [0x70] = { SrcImmByte|ModRM|TwoOp, simd_other },
     [0x71 ... 0x73] = { DstImplicit|SrcImmByte|ModRM },
-    [0x74 ... 0x76] = { DstImplicit|SrcMem|ModRM, simd_packed_int },
+    [0x74 ... 0x76] = { DstImplicit|SrcMem|ModRM, simd_packed_int, d8s_vl },
     [0x77] = { DstImplicit|SrcNone },
     [0x78] = { ImplicitOps|ModRM },
     [0x79] = { DstReg|SrcMem|ModRM, simd_packed_int },
@@ -444,13 +444,13 @@ static const struct ext0f38_table {
     [0x1b] = { .simd_size = simd_256, .two_op = 1, .d8s = d8s_vl_by_2 },
     [0x1c ... 0x1e] = { .simd_size = simd_packed_int, .two_op = 1 },
     [0x20 ... 0x25] = { .simd_size = simd_other, .two_op = 1 },
-    [0x28 ... 0x29] = { .simd_size = simd_packed_int },
+    [0x26 ... 0x29] = { .simd_size = simd_packed_int, .d8s = d8s_vl },
     [0x2a] = { .simd_size = simd_packed_int, .two_op = 1, .d8s = d8s_vl },
     [0x2b] = { .simd_size = simd_packed_int },
     [0x2c ... 0x2d] = { .simd_size = simd_packed_fp },
     [0x2e ... 0x2f] = { .simd_size = simd_packed_fp, .to_mem = 1 },
     [0x30 ... 0x35] = { .simd_size = simd_other, .two_op = 1 },
-    [0x36 ... 0x3f] = { .simd_size = simd_packed_int },
+    [0x36 ... 0x3f] = { .simd_size = simd_packed_int, .d8s = d8s_vl },
     [0x40] = { .simd_size = simd_packed_int },
     [0x41] = { .simd_size = simd_packed_int, .two_op = 1 },
     [0x45 ... 0x47] = { .simd_size = simd_packed_int },
@@ -516,6 +516,7 @@ static const struct ext0f3a_table {
     [0x18] = { .simd_size = simd_128 },
     [0x19] = { .simd_size = simd_128, .to_mem = 1, .two_op = 1 },
     [0x1d] = { .simd_size = simd_other, .to_mem = 1, .two_op = 1 },
+    [0x1e ... 0x1f] = { .simd_size = simd_packed_int, .d8s = d8s_vl },
     [0x20] = { .simd_size = simd_none },
     [0x21] = { .simd_size = simd_other },
     [0x22] = { .simd_size = simd_none },
@@ -523,6 +524,7 @@ static const struct ext0f3a_table {
     [0x30 ... 0x33] = { .simd_size = simd_other, .two_op = 1 },
     [0x38] = { .simd_size = simd_128 },
     [0x39] = { .simd_size = simd_128, .to_mem = 1, .two_op = 1 },
+    [0x3e ... 0x3f] = { .simd_size = simd_packed_int, .d8s = d8s_vl },
     [0x40 ... 0x41] = { .simd_size = simd_packed_fp },
     [0x42] = { .simd_size = simd_packed_int },
     [0x44] = { .simd_size = simd_packed_int },
@@ -6569,6 +6571,32 @@ x86_emulate(
         get_fpu(X86EMUL_FPU_mmx);
         goto simd_0f_common;
 
+    case X86EMUL_OPC_EVEX_F3(0x0f38, 0x26): /* vptestnm{b,w} [xyz]mm/mem,[xyz]mm,k{k} */
+    case X86EMUL_OPC_EVEX_F3(0x0f38, 0x27): /* vptestnm{d,q} [xyz]mm/mem,[xyz]mm,k{k} */
+        op_bytes = 16 << evex.lr;
+        /* fall through */
+    case X86EMUL_OPC_EVEX_66(0x0f,   0x64): /* vpcmpgtb [xyz]mm/mem,[xyz]mm,k{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f,   0x65): /* vpcmpgtw [xyz]mm/mem,[xyz]mm,k{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f,   0x66): /* vpcmpgtd [xyz]mm/mem,[xyz]mm,k{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f,   0x74): /* vpcmpeqb [xyz]mm/mem,[xyz]mm,k{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f,   0x75): /* vpcmpeqw [xyz]mm/mem,[xyz]mm,k{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f,   0x76): /* vpcmpeqd [xyz]mm/mem,[xyz]mm,k{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0x26): /* vptestm{b,w} [xyz]mm/mem,[xyz]mm,k{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0x27): /* vptestm{d,q} [xyz]mm/mem,[xyz]mm,k{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0x29): /* vpcmpeqq [xyz]mm/mem,[xyz]mm,k{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0x37): /* vpcmpgtq [xyz]mm/mem,[xyz]mm,k{k} */
+        generate_exception_if(!evex.r || !evex.R || evex.z, EXC_UD);
+        if ( b & (ext == ext_0f38 ? 1 : 2) )
+        {
+            generate_exception_if(b != 0x27 && evex.w != (b & 1), EXC_UD);
+            goto avx512f_no_sae;
+        }
+        host_and_vcpu_must_have(avx512bw);
+        generate_exception_if(evex.br, EXC_UD);
+        elem_bytes = 1 << (ext == ext_0f ? b & 1 : evex.w);
+        avx512_vlen_check(false);
+        goto simd_zmm;
+
     CASE_SIMD_PACKED_INT(0x0f, 0x6e):    /* mov{d,q} r/m,{,x}mm */
     case X86EMUL_OPC_VEX_66(0x0f, 0x6e): /* vmov{d,q} r/m,xmm */
     CASE_SIMD_PACKED_INT(0x0f, 0x7e):    /* mov{d,q} {,x}mm,r/m */
@@ -7577,6 +7605,7 @@ x86_emulate(
                               EXC_UD);
         /* fall through */
     case X86EMUL_OPC_EVEX_66(0x0f3a, 0x25): /* vpternlog{d,q} $imm8,[xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+    avx512f_imm_no_sae:
         host_and_vcpu_must_have(avx512f);
         generate_exception_if(ea.type != OP_MEM && evex.br, EXC_UD);
         avx512_vlen_check(false);
@@ -8750,6 +8779,19 @@ x86_emulate(
         break;
     }
 
+    case X86EMUL_OPC_EVEX_66(0x0f3a, 0x1e): /* vpcmpu{d,q} $imm8,[xyz]mm/mem,[xyz]mm,k{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f3a, 0x1f): /* vpcmp{d,q} $imm8,[xyz]mm/mem,[xyz]mm,k{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f3a, 0x3e): /* vpcmpu{b,w} $imm8,[xyz]mm/mem,[xyz]mm,k{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f3a, 0x3f): /* vpcmp{b,w} $imm8,[xyz]mm/mem,[xyz]mm,k{k} */
+        generate_exception_if(!evex.r || !evex.R || evex.z, EXC_UD);
+        if ( !(b & 0x20) )
+            goto avx512f_imm_no_sae;
+        host_and_vcpu_must_have(avx512bw);
+        generate_exception_if(evex.br, EXC_UD);
+        elem_bytes = 1 << evex.w;
+        avx512_vlen_check(false);
+        goto simd_imm8_zmm;
+
     case X86EMUL_OPC_66(0x0f3a, 0x20): /* pinsrb $imm8,r32/m8,xmm */
     case X86EMUL_OPC_66(0x0f3a, 0x22): /* pinsr{d,q} $imm8,r/m,xmm */
         host_and_vcpu_must_have(sse4_1);




_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

^ permalink raw reply	[flat|nested] 465+ messages in thread

* [PATCH v4 19/44] x86emul: support AVX512{F, BW} packed integer arithmetic insns
  2018-09-25 13:14 ` [PATCH v4 00/44] x86emul: fixes, improvements, and beginnings of AVX512 support Jan Beulich
                     ` (17 preceding siblings ...)
  2018-09-25 13:37   ` [PATCH v4 18/44] x86emul: support AVX512{F, BW} packed integer compare insns Jan Beulich
@ 2018-09-25 13:37   ` Jan Beulich
  2018-09-25 13:38   ` [PATCH v4 20/44] x86emul: use simd_128 also for legacy vector shift insns Jan Beulich
                     ` (24 subsequent siblings)
  43 siblings, 0 replies; 465+ messages in thread
From: Jan Beulich @ 2018-09-25 13:37 UTC (permalink / raw)
  To: xen-devel; +Cc: George Dunlap, Andrew Cooper, Wei Liu

Note: vpadd* / vpsub* et al are put at seemingly the wrong slot of the
big switch(). This is in anticipation of adding e.g. vpunpck* to those
groups (see the legacy/VEX encoded case labels nearby to support this).

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v4: Move a case block further down.
v3: New.

--- a/tools/tests/x86_emulator/evex-disp8.c
+++ b/tools/tests/x86_emulator/evex-disp8.c
@@ -149,6 +149,8 @@ static const struct test avx512f_all[] =
     INSN_PFP_NB(movu,        0f, 10),
     INSN_PFP_NB(movu,        0f, 11),
     INSN_FP(mul,             0f, 59),
+    INSN(paddd,        66,   0f, fe,    vl,      d, vl),
+    INSN(paddq,        66,   0f, d4,    vl,      q, vl),
     INSN(pand,         66,   0f, db,    vl,     dq, vl),
     INSN(pandn,        66,   0f, df,    vl,     dq, vl),
     INSN(pcmp,         66, 0f3a, 1f,    vl,     dq, vl),
@@ -157,7 +159,16 @@ static const struct test avx512f_all[] =
     INSN(pcmpgtd,      66,   0f, 66,    vl,      d, vl),
     INSN(pcmpgtq,      66, 0f38, 37,    vl,      q, vl),
     INSN(pcmpu,        66, 0f3a, 1e,    vl,     dq, vl),
+    INSN(pmaxs,        66, 0f38, 3d,    vl,     dq, vl),
+    INSN(pmaxu,        66, 0f38, 3f,    vl,     dq, vl),
+    INSN(pmins,        66, 0f38, 39,    vl,     dq, vl),
+    INSN(pminu,        66, 0f38, 3b,    vl,     dq, vl),
+    INSN(pmuldq,       66, 0f38, 28,    vl,      q, vl),
+    INSN(pmulld,       66, 0f38, 40,    vl,      d, vl),
+    INSN(pmuludq,      66,   0f, f4,    vl,      q, vl),
     INSN(por,          66,   0f, eb,    vl,     dq, vl),
+    INSN(psubd,        66,   0f, fa,    vl,      d, vl),
+    INSN(psubq,        66,   0f, fb,    vl,      q, vl),
     INSN(pternlog,     66, 0f3a, 25,    vl,     dq, vl),
     INSN(ptestm,       66, 0f38, 27,    vl,     dq, vl),
     INSN(ptestnm,      f3, 0f38, 27,    vl,     dq, vl),
@@ -192,12 +203,39 @@ static const struct test avx512bw_all[]
     INSN(movdqu8,     f2,   0f, 7f,    vl,    b, vl),
     INSN(movdqu16,    f2,   0f, 6f,    vl,    w, vl),
     INSN(movdqu16,    f2,   0f, 7f,    vl,    w, vl),
+    INSN(paddb,       66,   0f, fc,    vl,    b, vl),
+    INSN(paddsb,      66,   0f, ec,    vl,    b, vl),
+    INSN(paddsw,      66,   0f, ed,    vl,    w, vl),
+    INSN(paddusb,     66,   0f, dc,    vl,    b, vl),
+    INSN(paddusw,     66,   0f, dd,    vl,    w, vl),
+    INSN(paddw,       66,   0f, fd,    vl,    w, vl),
+    INSN(pavgb,       66,   0f, e0,    vl,    b, vl),
+    INSN(pavgw,       66,   0f, e3,    vl,    w, vl),
     INSN(pcmp,        66, 0f3a, 3f,    vl,   bw, vl),
     INSN(pcmpeqb,     66,   0f, 74,    vl,    b, vl),
     INSN(pcmpeqw,     66,   0f, 75,    vl,    w, vl),
     INSN(pcmpgtb,     66,   0f, 64,    vl,    b, vl),
     INSN(pcmpgtw,     66,   0f, 65,    vl,    w, vl),
     INSN(pcmpu,       66, 0f3a, 3e,    vl,   bw, vl),
+    INSN(pmaddwd,     66,   0f, f5,    vl,    w, vl),
+    INSN(pmaxsb,      66, 0f38, 3c,    vl,    b, vl),
+    INSN(pmaxsw,      66,   0f, ee,    vl,    w, vl),
+    INSN(pmaxub,      66,   0f, de,    vl,    b, vl),
+    INSN(pmaxuw,      66, 0f38, 3e,    vl,    w, vl),
+    INSN(pminsb,      66, 0f38, 38,    vl,    b, vl),
+    INSN(pminsw,      66,   0f, ea,    vl,    w, vl),
+    INSN(pminub,      66,   0f, da,    vl,    b, vl),
+    INSN(pminuw,      66, 0f38, 3a,    vl,    w, vl),
+    INSN(pmulhuw,     66,   0f, e4,    vl,    w, vl),
+    INSN(pmulhw,      66,   0f, e5,    vl,    w, vl),
+    INSN(pmullw,      66,   0f, d5,    vl,    w, vl),
+    INSN(psadbw,      66,   0f, f6,    vl,    b, vl),
+    INSN(psubb,       66,   0f, f8,    vl,    b, vl),
+    INSN(psubsb,      66,   0f, e8,    vl,    b, vl),
+    INSN(psubsw,      66,   0f, e9,    vl,    w, vl),
+    INSN(psubusb,     66,   0f, d8,    vl,    b, vl),
+    INSN(psubusw,     66,   0f, d9,    vl,    w, vl),
+    INSN(psubw,       66,   0f, f9,    vl,    w, vl),
     INSN(ptestm,      66, 0f38, 26,    vl,   bw, vl),
     INSN(ptestnm,     f3, 0f38, 26,    vl,   bw, vl),
 };
@@ -206,6 +244,7 @@ static const struct test avx512dq_all[]
     INSN_PFP(and,              0f, 54),
     INSN_PFP(andn,             0f, 55),
     INSN_PFP(or,               0f, 56),
+    INSN(pmullq,         66, 0f38, 40,   vl,  q, vl),
     INSN_PFP(xor,              0f, 57),
 };
 
--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -367,21 +367,21 @@ static const struct twobyte_table {
     [0xc8 ... 0xcf] = { ImplicitOps },
     [0xd0] = { DstImplicit|SrcMem|ModRM, simd_other },
     [0xd1 ... 0xd3] = { DstImplicit|SrcMem|ModRM, simd_other },
-    [0xd4 ... 0xd5] = { DstImplicit|SrcMem|ModRM, simd_packed_int },
+    [0xd4 ... 0xd5] = { DstImplicit|SrcMem|ModRM, simd_packed_int, d8s_vl },
     [0xd6] = { DstMem|SrcImplicit|ModRM|Mov, simd_other, 3 },
     [0xd7] = { DstReg|SrcImplicit|ModRM|Mov },
     [0xd8 ... 0xdf] = { DstImplicit|SrcMem|ModRM, simd_packed_int, d8s_vl },
-    [0xe0] = { DstImplicit|SrcMem|ModRM, simd_packed_int },
+    [0xe0] = { DstImplicit|SrcMem|ModRM, simd_packed_int, d8s_vl },
     [0xe1 ... 0xe2] = { DstImplicit|SrcMem|ModRM, simd_other },
-    [0xe3 ... 0xe5] = { DstImplicit|SrcMem|ModRM, simd_packed_int },
+    [0xe3 ... 0xe5] = { DstImplicit|SrcMem|ModRM, simd_packed_int, d8s_vl },
     [0xe6] = { DstImplicit|SrcMem|ModRM|Mov, simd_other },
     [0xe7] = { DstMem|SrcImplicit|ModRM|Mov, simd_packed_int, d8s_vl },
     [0xe8 ... 0xef] = { DstImplicit|SrcMem|ModRM, simd_packed_int, d8s_vl },
     [0xf0] = { DstImplicit|SrcMem|ModRM|Mov, simd_other },
     [0xf1 ... 0xf3] = { DstImplicit|SrcMem|ModRM, simd_other },
-    [0xf4 ... 0xf6] = { DstImplicit|SrcMem|ModRM, simd_packed_int },
+    [0xf4 ... 0xf6] = { DstImplicit|SrcMem|ModRM, simd_packed_int, d8s_vl },
     [0xf7] = { DstMem|SrcMem|ModRM|Mov, simd_packed_int },
-    [0xf8 ... 0xfe] = { DstImplicit|SrcMem|ModRM, simd_packed_int },
+    [0xf8 ... 0xfe] = { DstImplicit|SrcMem|ModRM, simd_packed_int, d8s_vl },
     [0xff] = { ModRM }
 };
 
@@ -451,7 +451,7 @@ static const struct ext0f38_table {
     [0x2e ... 0x2f] = { .simd_size = simd_packed_fp, .to_mem = 1 },
     [0x30 ... 0x35] = { .simd_size = simd_other, .two_op = 1 },
     [0x36 ... 0x3f] = { .simd_size = simd_packed_int, .d8s = d8s_vl },
-    [0x40] = { .simd_size = simd_packed_int },
+    [0x40] = { .simd_size = simd_packed_int, .d8s = d8s_vl },
     [0x41] = { .simd_size = simd_packed_int, .two_op = 1 },
     [0x45 ... 0x47] = { .simd_size = simd_packed_int },
     [0x58 ... 0x59] = { .simd_size = simd_other, .two_op = 1 },
@@ -5970,6 +5970,10 @@ x86_emulate(
     case X86EMUL_OPC_EVEX_66(0x0f, 0xdf): /* vpandn{d,q} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
     case X86EMUL_OPC_EVEX_66(0x0f, 0xeb): /* vpor{d,q} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
     case X86EMUL_OPC_EVEX_66(0x0f, 0xef): /* vpxor{d,q} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0x39): /* vpmins{d,q} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0x3b): /* vpminu{d,q} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0x3d): /* vpmaxs{d,q} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0x3f): /* vpmaxu{d,q} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
     avx512f_no_sae:
         host_and_vcpu_must_have(avx512f);
         generate_exception_if(ea.type != OP_MEM && evex.br, EXC_UD);
@@ -6571,6 +6575,31 @@ x86_emulate(
         get_fpu(X86EMUL_FPU_mmx);
         goto simd_0f_common;
 
+    case X86EMUL_OPC_EVEX_66(0x0f, 0xf5): /* vpmaddwd [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f, 0xf6): /* vpsadbw [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+        fault_suppression = false;
+        /* fall through */
+    case X86EMUL_OPC_EVEX_66(0x0f, 0xd5): /* vpmullw [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f, 0xd8): /* vpsubusb [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f, 0xd9): /* vpsubusw [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f, 0xdc): /* vpaddusb [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f, 0xdd): /* vpaddusw [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f, 0xe0): /* vpavgb [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f, 0xe3): /* vpavgw [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f, 0xe5): /* vpmulhw [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f, 0xe8): /* vpsubsb [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f, 0xe9): /* vpsubsw [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f, 0xec): /* vpaddsb [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f, 0xed): /* vpaddsw [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f, 0xf8): /* vpsubb [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f, 0xf9): /* vpsubw [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f, 0xfc): /* vpaddb [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f, 0xfd): /* vpaddw [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+        host_and_vcpu_must_have(avx512bw);
+        generate_exception_if(evex.br, EXC_UD);
+        elem_bytes = 1 << (b & 1);
+        goto avx512f_no_sae;
+
     case X86EMUL_OPC_EVEX_F3(0x0f38, 0x26): /* vptestnm{b,w} [xyz]mm/mem,[xyz]mm,k{k} */
     case X86EMUL_OPC_EVEX_F3(0x0f38, 0x27): /* vptestnm{d,q} [xyz]mm/mem,[xyz]mm,k{k} */
         op_bytes = 16 << evex.lr;
@@ -6597,6 +6626,12 @@ x86_emulate(
         avx512_vlen_check(false);
         goto simd_zmm;
 
+    case X86EMUL_OPC_EVEX_66(0x0f, 0xd4): /* vpaddq [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f, 0xf4): /* vpmuludq [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0x28): /* vpmuldq [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+        generate_exception_if(!evex.w, EXC_UD);
+        goto avx512f_no_sae;
+
     CASE_SIMD_PACKED_INT(0x0f, 0x6e):    /* mov{d,q} r/m,{,x}mm */
     case X86EMUL_OPC_VEX_66(0x0f, 0x6e): /* vmov{d,q} r/m,xmm */
     CASE_SIMD_PACKED_INT(0x0f, 0x7e):    /* mov{d,q} {,x}mm,r/m */
@@ -7820,6 +7855,12 @@ x86_emulate(
         op_bytes = vex.pfx ? 16 : 8;
         goto simd_0f_int;
 
+    case X86EMUL_OPC_EVEX_66(0x0f, 0xfa): /* vpsubd [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f, 0xfb): /* vpsubq [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f, 0xfe): /* vpaddd [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+        generate_exception_if(evex.w != (b & 1), EXC_UD);
+        goto avx512f_no_sae;
+
     case X86EMUL_OPC(0x0f, 0xd4):        /* paddq mm/m64,mm */
     case X86EMUL_OPC(0x0f, 0xf4):        /* pmuludq mm/m64,mm */
     case X86EMUL_OPC(0x0f, 0xfb):        /* psubq mm/m64,mm */
@@ -7848,6 +7889,16 @@ x86_emulate(
         vcpu_must_have(mmxext);
         goto simd_0f_mmx;
 
+    case X86EMUL_OPC_EVEX_66(0x0f, 0xda): /* vpminub [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f, 0xde): /* vpmaxub [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f, 0xe4): /* vpmulhuw [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f, 0xea): /* vpminsw [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f, 0xee): /* vpmaxsw [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+        host_and_vcpu_must_have(avx512bw);
+        generate_exception_if(evex.br, EXC_UD);
+        elem_bytes = b & 0x10 ? 1 : 2;
+        goto avx512f_no_sae;
+
     case X86EMUL_OPC_66(0x0f, 0xe6):       /* cvttpd2dq xmm/mem,xmm */
     case X86EMUL_OPC_VEX_66(0x0f, 0xe6):   /* vcvttpd2dq {x,y}mm/mem,xmm */
     case X86EMUL_OPC_F3(0x0f, 0xe6):       /* cvtdq2pd xmm/mem,xmm */
@@ -8221,6 +8272,20 @@ x86_emulate(
         host_and_vcpu_must_have(sse4_2);
         goto simd_0f38_common;
 
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0x38): /* vpminsb [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0x3a): /* vpminuw [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0x3c): /* vpmaxsb [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0x3e): /* vpmaxuw [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+        host_and_vcpu_must_have(avx512bw);
+        generate_exception_if(evex.br, EXC_UD);
+        elem_bytes = b & 2 ?: 1;
+        goto avx512f_no_sae;
+
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0x40): /* vpmull{d,q} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+        if ( evex.w )
+            host_and_vcpu_must_have(avx512dq);
+        goto avx512f_no_sae;
+
     case X86EMUL_OPC_66(0x0f38, 0xdb):     /* aesimc xmm/m128,xmm */
     case X86EMUL_OPC_VEX_66(0x0f38, 0xdb): /* vaesimc xmm/m128,xmm */
     case X86EMUL_OPC_66(0x0f38, 0xdc):     /* aesenc xmm/m128,xmm,xmm */




_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

^ permalink raw reply	[flat|nested] 465+ messages in thread

* [PATCH v4 20/44] x86emul: use simd_128 also for legacy vector shift insns
  2018-09-25 13:14 ` [PATCH v4 00/44] x86emul: fixes, improvements, and beginnings of AVX512 support Jan Beulich
                     ` (18 preceding siblings ...)
  2018-09-25 13:37   ` [PATCH v4 19/44] x86emul: support AVX512{F, BW} packed integer arithmetic insns Jan Beulich
@ 2018-09-25 13:38   ` Jan Beulich
  2018-09-25 13:39   ` [PATCH v4 21/44] x86emul: support AVX512{F, BW} shift/rotate insns Jan Beulich
                     ` (23 subsequent siblings)
  43 siblings, 0 replies; 465+ messages in thread
From: Jan Beulich @ 2018-09-25 13:38 UTC (permalink / raw)
  To: xen-devel; +Cc: George Dunlap, Andrew Cooper, Wei Liu

This eliminates a separate case block here, and allows getting away with
fewer new ones when adding AVX512 vector shifts.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v3: New.

--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -366,19 +366,19 @@ static const struct twobyte_table {
     [0xc7] = { ImplicitOps|ModRM },
     [0xc8 ... 0xcf] = { ImplicitOps },
     [0xd0] = { DstImplicit|SrcMem|ModRM, simd_other },
-    [0xd1 ... 0xd3] = { DstImplicit|SrcMem|ModRM, simd_other },
+    [0xd1 ... 0xd3] = { DstImplicit|SrcMem|ModRM, simd_128 },
     [0xd4 ... 0xd5] = { DstImplicit|SrcMem|ModRM, simd_packed_int, d8s_vl },
     [0xd6] = { DstMem|SrcImplicit|ModRM|Mov, simd_other, 3 },
     [0xd7] = { DstReg|SrcImplicit|ModRM|Mov },
     [0xd8 ... 0xdf] = { DstImplicit|SrcMem|ModRM, simd_packed_int, d8s_vl },
     [0xe0] = { DstImplicit|SrcMem|ModRM, simd_packed_int, d8s_vl },
-    [0xe1 ... 0xe2] = { DstImplicit|SrcMem|ModRM, simd_other },
+    [0xe1 ... 0xe2] = { DstImplicit|SrcMem|ModRM, simd_128 },
     [0xe3 ... 0xe5] = { DstImplicit|SrcMem|ModRM, simd_packed_int, d8s_vl },
     [0xe6] = { DstImplicit|SrcMem|ModRM|Mov, simd_other },
     [0xe7] = { DstMem|SrcImplicit|ModRM|Mov, simd_packed_int, d8s_vl },
     [0xe8 ... 0xef] = { DstImplicit|SrcMem|ModRM, simd_packed_int, d8s_vl },
     [0xf0] = { DstImplicit|SrcMem|ModRM|Mov, simd_other },
-    [0xf1 ... 0xf3] = { DstImplicit|SrcMem|ModRM, simd_other },
+    [0xf1 ... 0xf3] = { DstImplicit|SrcMem|ModRM, simd_128 },
     [0xf4 ... 0xf6] = { DstImplicit|SrcMem|ModRM, simd_packed_int, d8s_vl },
     [0xf7] = { DstMem|SrcMem|ModRM|Mov, simd_packed_int },
     [0xf8 ... 0xfe] = { DstImplicit|SrcMem|ModRM, simd_packed_int, d8s_vl },
@@ -3337,7 +3337,8 @@ x86_decode(
         break;
 
     case simd_128:
-        op_bytes = 16;
+        /* The special case here are MMX shift insns. */
+        op_bytes = vex.opcx || vex.pfx ? 16 : 8;
         break;
 
     case simd_256:
@@ -6466,6 +6467,12 @@ x86_emulate(
     case X86EMUL_OPC_VEX_66(0x0f, 0x75): /* vpcmpeqw {x,y}mm/mem,{x,y}mm,{x,y}mm */
     CASE_SIMD_PACKED_INT(0x0f, 0x76):    /* pcmpeqd {,x}mm/mem,{,x}mm */
     case X86EMUL_OPC_VEX_66(0x0f, 0x76): /* vpcmpeqd {x,y}mm/mem,{x,y}mm,{x,y}mm */
+    CASE_SIMD_PACKED_INT(0x0f, 0xd1):    /* psrlw {,x}mm/mem,{,x}mm */
+    case X86EMUL_OPC_VEX_66(0x0f, 0xd1): /* vpsrlw xmm/m128,{x,y}mm,{x,y}mm */
+    CASE_SIMD_PACKED_INT(0x0f, 0xd2):    /* psrld {,x}mm/mem,{,x}mm */
+    case X86EMUL_OPC_VEX_66(0x0f, 0xd2): /* vpsrld xmm/m128,{x,y}mm,{x,y}mm */
+    CASE_SIMD_PACKED_INT(0x0f, 0xd3):    /* psrlq {,x}mm/mem,{,x}mm */
+    case X86EMUL_OPC_VEX_66(0x0f, 0xd3): /* vpsrlq xmm/m128,{x,y}mm,{x,y}mm */
     case X86EMUL_OPC_66(0x0f, 0xd4):     /* paddq xmm/m128,xmm */
     case X86EMUL_OPC_VEX_66(0x0f, 0xd4): /* vpaddq {x,y}mm/mem,{x,y}mm,{x,y}mm */
     CASE_SIMD_PACKED_INT(0x0f, 0xd5):    /* pmullw {,x}mm/mem,{,x}mm */
@@ -6488,6 +6495,10 @@ x86_emulate(
     case X86EMUL_OPC_VEX_66(0x0f, 0xdf): /* vpandn {x,y}mm/mem,{x,y}mm,{x,y}mm */
     case X86EMUL_OPC_66(0x0f, 0xe0):     /* pavgb xmm/m128,xmm */
     case X86EMUL_OPC_VEX_66(0x0f, 0xe0): /* vpavgb {x,y}mm/mem,{x,y}mm,{x,y}mm */
+    CASE_SIMD_PACKED_INT(0x0f, 0xe1):    /* psraw {,x}mm/mem,{,x}mm */
+    case X86EMUL_OPC_VEX_66(0x0f, 0xe1): /* vpsraw xmm/m128,{x,y}mm,{x,y}mm */
+    CASE_SIMD_PACKED_INT(0x0f, 0xe2):    /* psrad {,x}mm/mem,{,x}mm */
+    case X86EMUL_OPC_VEX_66(0x0f, 0xe2): /* vpsrad xmm/m128,{x,y}mm,{x,y}mm */
     case X86EMUL_OPC_66(0x0f, 0xe3):     /* pavgw xmm/m128,xmm */
     case X86EMUL_OPC_VEX_66(0x0f, 0xe3): /* vpavgw {x,y}mm/mem,{x,y}mm,{x,y}mm */
     case X86EMUL_OPC_66(0x0f, 0xe4):     /* pmulhuw xmm/m128,xmm */
@@ -6510,6 +6521,12 @@ x86_emulate(
     case X86EMUL_OPC_VEX_66(0x0f, 0xee): /* vpmaxsw {x,y}mm/mem,{x,y}mm,{x,y}mm */
     CASE_SIMD_PACKED_INT(0x0f, 0xef):    /* pxor {,x}mm/mem,{,x}mm */
     case X86EMUL_OPC_VEX_66(0x0f, 0xef): /* vpxor {x,y}mm/mem,{x,y}mm,{x,y}mm */
+    CASE_SIMD_PACKED_INT(0x0f, 0xf1):    /* psllw {,x}mm/mem,{,x}mm */
+    case X86EMUL_OPC_VEX_66(0x0f, 0xf1): /* vpsllw xmm/m128,{x,y}mm,{x,y}mm */
+    CASE_SIMD_PACKED_INT(0x0f, 0xf2):    /* pslld {,x}mm/mem,{,x}mm */
+    case X86EMUL_OPC_VEX_66(0x0f, 0xf2): /* vpslld xmm/m128,{x,y}mm,{x,y}mm */
+    CASE_SIMD_PACKED_INT(0x0f, 0xf3):    /* psllq {,x}mm/mem,{,x}mm */
+    case X86EMUL_OPC_VEX_66(0x0f, 0xf3): /* vpsllq xmm/m128,{x,y}mm,{x,y}mm */
     case X86EMUL_OPC_66(0x0f, 0xf4):     /* pmuludq xmm/m128,xmm */
     case X86EMUL_OPC_VEX_66(0x0f, 0xf4): /* vpmuludq {x,y}mm/mem,{x,y}mm,{x,y}mm */
     case X86EMUL_OPC_66(0x0f, 0xf6):     /* psadbw xmm/m128,xmm */
@@ -7836,25 +7853,6 @@ x86_emulate(
         }
         break;
 
-    CASE_SIMD_PACKED_INT(0x0f, 0xd1):    /* psrlw {,x}mm/mem,{,x}mm */
-    case X86EMUL_OPC_VEX_66(0x0f, 0xd1): /* vpsrlw xmm/m128,{x,y}mm,{x,y}mm */
-    CASE_SIMD_PACKED_INT(0x0f, 0xd2):    /* psrld {,x}mm/mem,{,x}mm */
-    case X86EMUL_OPC_VEX_66(0x0f, 0xd2): /* vpsrld xmm/m128,{x,y}mm,{x,y}mm */
-    CASE_SIMD_PACKED_INT(0x0f, 0xd3):    /* psrlq {,x}mm/mem,{,x}mm */
-    case X86EMUL_OPC_VEX_66(0x0f, 0xd3): /* vpsrlq xmm/m128,{x,y}mm,{x,y}mm */
-    CASE_SIMD_PACKED_INT(0x0f, 0xe1):    /* psraw {,x}mm/mem,{,x}mm */
-    case X86EMUL_OPC_VEX_66(0x0f, 0xe1): /* vpsraw xmm/m128,{x,y}mm,{x,y}mm */
-    CASE_SIMD_PACKED_INT(0x0f, 0xe2):    /* psrad {,x}mm/mem,{,x}mm */
-    case X86EMUL_OPC_VEX_66(0x0f, 0xe2): /* vpsrad xmm/m128,{x,y}mm,{x,y}mm */
-    CASE_SIMD_PACKED_INT(0x0f, 0xf1):    /* psllw {,x}mm/mem,{,x}mm */
-    case X86EMUL_OPC_VEX_66(0x0f, 0xf1): /* vpsllw xmm/m128,{x,y}mm,{x,y}mm */
-    CASE_SIMD_PACKED_INT(0x0f, 0xf2):    /* pslld {,x}mm/mem,{,x}mm */
-    case X86EMUL_OPC_VEX_66(0x0f, 0xf2): /* vpslld xmm/m128,{x,y}mm,{x,y}mm */
-    CASE_SIMD_PACKED_INT(0x0f, 0xf3):    /* psllq {,x}mm/mem,{,x}mm */
-    case X86EMUL_OPC_VEX_66(0x0f, 0xf3): /* vpsllq xmm/m128,{x,y}mm,{x,y}mm */
-        op_bytes = vex.pfx ? 16 : 8;
-        goto simd_0f_int;
-
     case X86EMUL_OPC_EVEX_66(0x0f, 0xfa): /* vpsubd [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
     case X86EMUL_OPC_EVEX_66(0x0f, 0xfb): /* vpsubq [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
     case X86EMUL_OPC_EVEX_66(0x0f, 0xfe): /* vpaddd [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */




_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

^ permalink raw reply	[flat|nested] 465+ messages in thread

* [PATCH v4 21/44] x86emul: support AVX512{F, BW} shift/rotate insns
  2018-09-25 13:14 ` [PATCH v4 00/44] x86emul: fixes, improvements, and beginnings of AVX512 support Jan Beulich
                     ` (19 preceding siblings ...)
  2018-09-25 13:38   ` [PATCH v4 20/44] x86emul: use simd_128 also for legacy vector shift insns Jan Beulich
@ 2018-09-25 13:39   ` Jan Beulich
  2018-09-25 13:40   ` [PATCH v4 22/44] x86emul: support AVX512{F, BW, DQ} extract insns Jan Beulich
                     ` (22 subsequent siblings)
  43 siblings, 0 replies; 465+ messages in thread
From: Jan Beulich @ 2018-09-25 13:39 UTC (permalink / raw)
  To: xen-devel; +Cc: George Dunlap, Andrew Cooper, Wei Liu

Note that simd_packed_fp for opcode space 0f38 major opcodes 14 and 15
is not really correct, but it is sufficient for the purposes here.
Further adjustments may be needed later for the down-converting unsigned
saturating VPMOV* insns, first and foremost because of the different
Disp8 scaling those insns use.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v3: New.

--- a/tools/tests/x86_emulator/evex-disp8.c
+++ b/tools/tests/x86_emulator/evex-disp8.c
@@ -167,6 +167,24 @@ static const struct test avx512f_all[] =
     INSN(pmulld,       66, 0f38, 40,    vl,      d, vl),
     INSN(pmuludq,      66,   0f, f4,    vl,      q, vl),
     INSN(por,          66,   0f, eb,    vl,     dq, vl),
+    INSNX(prol,        66,   0f, 72, 1, vl,     dq, vl),
+    INSN(prolv,        66, 0f38, 15,    vl,     dq, vl),
+    INSNX(pror,        66,   0f, 72, 0, vl,     dq, vl),
+    INSN(prorv,        66, 0f38, 14,    vl,     dq, vl),
+    INSN(pslld,        66,   0f, f2,    el_4,    d, vl),
+    INSNX(pslld,       66,   0f, 72, 6, vl,      d, vl),
+    INSN(psllq,        66,   0f, f3,    el_2,    q, vl),
+    INSNX(psllq,       66,   0f, 73, 6, vl,      q, vl),
+    INSN(psllv,        66, 0f38, 47,    vl,     dq, vl),
+    INSNX(psra,        66,   0f, 72, 4, vl,     dq, vl),
+    INSN(psrad,        66,   0f, e2,    el_4,    d, vl),
+    INSN(psraq,        66,   0f, e2,    el_2,    q, vl),
+    INSN(psrav,        66, 0f38, 46,    vl,     dq, vl),
+    INSN(psrld,        66,   0f, d2,    el_4,    d, vl),
+    INSNX(psrld,       66,   0f, 72, 2, vl,      d, vl),
+    INSN(psrlq,        66,   0f, d3,    el_2,    q, vl),
+    INSNX(psrlq,       66,   0f, 73, 2, vl,      q, vl),
+    INSN(psrlv,        66, 0f38, 45,    vl,     dq, vl),
     INSN(psubd,        66,   0f, fa,    vl,      d, vl),
     INSN(psubq,        66,   0f, fb,    vl,      q, vl),
     INSN(pternlog,     66, 0f3a, 25,    vl,     dq, vl),
@@ -230,6 +248,17 @@ static const struct test avx512bw_all[]
     INSN(pmulhw,      66,   0f, e5,    vl,    w, vl),
     INSN(pmullw,      66,   0f, d5,    vl,    w, vl),
     INSN(psadbw,      66,   0f, f6,    vl,    b, vl),
+    INSNX(pslldq,     66,   0f, 73, 7, vl,    b, vl),
+    INSN(psllvw,      66, 0f38, 12,    vl,    w, vl),
+    INSN(psllw,       66,   0f, f1,    el_8,  w, vl),
+    INSNX(psllw,      66,   0f, 71, 6, vl,    w, vl),
+    INSN(psravw,      66, 0f38, 11,    vl,    w, vl),
+    INSN(psraw,       66,   0f, e1,    el_8,  w, vl),
+    INSNX(psraw,      66,   0f, 71, 4, vl,    w, vl),
+    INSNX(psrldq,     66,   0f, 73, 3, vl,    b, vl),
+    INSN(psrlvw,      66, 0f38, 10,    vl,    w, vl),
+    INSN(psrlw,       66,   0f, d1,    el_8,  w, vl),
+    INSNX(psrlw,      66,   0f, 71, 2, vl,    w, vl),
     INSN(psubb,       66,   0f, f8,    vl,    b, vl),
     INSN(psubsb,      66,   0f, e8,    vl,    b, vl),
     INSN(psubsw,      66,   0f, e9,    vl,    w, vl),
--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -319,7 +319,7 @@ static const struct twobyte_table {
     [0x6e] = { DstImplicit|SrcMem|ModRM|Mov, simd_none, d8s_dq64 },
     [0x6f] = { DstImplicit|SrcMem|ModRM|Mov, simd_packed_int, d8s_vl },
     [0x70] = { SrcImmByte|ModRM|TwoOp, simd_other },
-    [0x71 ... 0x73] = { DstImplicit|SrcImmByte|ModRM },
+    [0x71 ... 0x73] = { DstImplicit|SrcImmByte|ModRM, simd_none, d8s_vl },
     [0x74 ... 0x76] = { DstImplicit|SrcMem|ModRM, simd_packed_int, d8s_vl },
     [0x77] = { DstImplicit|SrcNone },
     [0x78] = { ImplicitOps|ModRM },
@@ -366,19 +366,19 @@ static const struct twobyte_table {
     [0xc7] = { ImplicitOps|ModRM },
     [0xc8 ... 0xcf] = { ImplicitOps },
     [0xd0] = { DstImplicit|SrcMem|ModRM, simd_other },
-    [0xd1 ... 0xd3] = { DstImplicit|SrcMem|ModRM, simd_128 },
+    [0xd1 ... 0xd3] = { DstImplicit|SrcMem|ModRM, simd_128, 4 },
     [0xd4 ... 0xd5] = { DstImplicit|SrcMem|ModRM, simd_packed_int, d8s_vl },
     [0xd6] = { DstMem|SrcImplicit|ModRM|Mov, simd_other, 3 },
     [0xd7] = { DstReg|SrcImplicit|ModRM|Mov },
     [0xd8 ... 0xdf] = { DstImplicit|SrcMem|ModRM, simd_packed_int, d8s_vl },
     [0xe0] = { DstImplicit|SrcMem|ModRM, simd_packed_int, d8s_vl },
-    [0xe1 ... 0xe2] = { DstImplicit|SrcMem|ModRM, simd_128 },
+    [0xe1 ... 0xe2] = { DstImplicit|SrcMem|ModRM, simd_128, 4 },
     [0xe3 ... 0xe5] = { DstImplicit|SrcMem|ModRM, simd_packed_int, d8s_vl },
     [0xe6] = { DstImplicit|SrcMem|ModRM|Mov, simd_other },
     [0xe7] = { DstMem|SrcImplicit|ModRM|Mov, simd_packed_int, d8s_vl },
     [0xe8 ... 0xef] = { DstImplicit|SrcMem|ModRM, simd_packed_int, d8s_vl },
     [0xf0] = { DstImplicit|SrcMem|ModRM|Mov, simd_other },
-    [0xf1 ... 0xf3] = { DstImplicit|SrcMem|ModRM, simd_128 },
+    [0xf1 ... 0xf3] = { DstImplicit|SrcMem|ModRM, simd_128, 4 },
     [0xf4 ... 0xf6] = { DstImplicit|SrcMem|ModRM, simd_packed_int, d8s_vl },
     [0xf7] = { DstMem|SrcMem|ModRM|Mov, simd_packed_int },
     [0xf8 ... 0xfe] = { DstImplicit|SrcMem|ModRM, simd_packed_int, d8s_vl },
@@ -434,9 +434,9 @@ static const struct ext0f38_table {
 } ext0f38_table[256] = {
     [0x00 ... 0x0b] = { .simd_size = simd_packed_int },
     [0x0c ... 0x0f] = { .simd_size = simd_packed_fp },
-    [0x10] = { .simd_size = simd_packed_int },
+    [0x10 ... 0x12] = { .simd_size = simd_packed_int, .d8s = d8s_vl },
     [0x13] = { .simd_size = simd_other, .two_op = 1 },
-    [0x14 ... 0x16] = { .simd_size = simd_packed_fp },
+    [0x14 ... 0x16] = { .simd_size = simd_packed_fp, .d8s = d8s_vl },
     [0x17] = { .simd_size = simd_packed_int, .two_op = 1 },
     [0x18] = { .simd_size = simd_scalar_opc, .two_op = 1, .d8s = 2 },
     [0x19] = { .simd_size = simd_scalar_opc, .two_op = 1, .d8s = 3 },
@@ -453,7 +453,7 @@ static const struct ext0f38_table {
     [0x36 ... 0x3f] = { .simd_size = simd_packed_int, .d8s = d8s_vl },
     [0x40] = { .simd_size = simd_packed_int, .d8s = d8s_vl },
     [0x41] = { .simd_size = simd_packed_int, .two_op = 1 },
-    [0x45 ... 0x47] = { .simd_size = simd_packed_int },
+    [0x45 ... 0x47] = { .simd_size = simd_packed_int, .d8s = d8s_vl },
     [0x58 ... 0x59] = { .simd_size = simd_other, .two_op = 1 },
     [0x5a] = { .simd_size = simd_128, .two_op = 1 },
     [0x78 ... 0x79] = { .simd_size = simd_other, .two_op = 1 },
@@ -5971,10 +5971,15 @@ x86_emulate(
     case X86EMUL_OPC_EVEX_66(0x0f, 0xdf): /* vpandn{d,q} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
     case X86EMUL_OPC_EVEX_66(0x0f, 0xeb): /* vpor{d,q} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
     case X86EMUL_OPC_EVEX_66(0x0f, 0xef): /* vpxor{d,q} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0x14): /* vprorv{d,q} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0x15): /* vprolv{d,q} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
     case X86EMUL_OPC_EVEX_66(0x0f38, 0x39): /* vpmins{d,q} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
     case X86EMUL_OPC_EVEX_66(0x0f38, 0x3b): /* vpminu{d,q} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
     case X86EMUL_OPC_EVEX_66(0x0f38, 0x3d): /* vpmaxs{d,q} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
     case X86EMUL_OPC_EVEX_66(0x0f38, 0x3f): /* vpmaxu{d,q} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0x45): /* vpsrlv{d,q} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0x46): /* vpsrav{d,q} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0x47): /* vpsllv{d,q} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
     avx512f_no_sae:
         host_and_vcpu_must_have(avx512f);
         generate_exception_if(ea.type != OP_MEM && evex.br, EXC_UD);
@@ -6592,6 +6597,9 @@ x86_emulate(
         get_fpu(X86EMUL_FPU_mmx);
         goto simd_0f_common;
 
+    case X86EMUL_OPC_EVEX_66(0x0f, 0xd1): /* vpsrlw xmm/m128,[xyz]mm,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f, 0xe1): /* vpsraw xmm/m128,[xyz]mm,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f, 0xf1): /* vpsllw xmm/m128,[xyz]mm,[xyz]mm{k} */
     case X86EMUL_OPC_EVEX_66(0x0f, 0xf5): /* vpmaddwd [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
     case X86EMUL_OPC_EVEX_66(0x0f, 0xf6): /* vpsadbw [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
         fault_suppression = false;
@@ -6891,6 +6899,37 @@ x86_emulate(
         ASSERT(!state->simd_size);
         break;
 
+    case X86EMUL_OPC_EVEX_66(0x0f, 0x71): /* Grp12 */
+        switch ( modrm_reg & 7 )
+        {
+        case 2: /* vpsrlw $imm8,[xyz]mm/mem,[xyz]mm{k} */
+        case 4: /* vpsraw $imm8,[xyz]mm/mem,[xyz]mm{k} */
+        case 6: /* vpsllw $imm8,[xyz]mm/mem,[xyz]mm{k} */
+        avx512bw_shift_imm:
+            fault_suppression = false;
+            op_bytes = 16 << evex.lr;
+            state->simd_size = simd_packed_int;
+            goto avx512bw_imm;
+        }
+        goto unrecognized_insn;
+
+    case X86EMUL_OPC_EVEX_66(0x0f, 0x72): /* Grp13 */
+        switch ( modrm_reg & 7 )
+        {
+        case 2: /* vpsrld $imm8,[xyz]mm/mem,[xyz]mm{k} */
+        case 6: /* vpslld $imm8,[xyz]mm/mem,[xyz]mm{k} */
+            generate_exception_if(evex.w, EXC_UD);
+            /* fall through */
+        case 0: /* vpror{d,q} $imm8,[xyz]mm/mem,[xyz]mm{k} */
+        case 1: /* vprol{d,q} $imm8,[xyz]mm/mem,[xyz]mm{k} */
+        case 4: /* vpsra{d,q} $imm8,[xyz]mm/mem,[xyz]mm{k} */
+        avx512f_shift_imm:
+            op_bytes = 16 << evex.lr;
+            state->simd_size = simd_packed_int;
+            goto avx512f_imm_no_sae;
+        }
+        goto unrecognized_insn;
+
     case X86EMUL_OPC(0x0f, 0x73):        /* Grp14 */
         switch ( modrm_reg & 7 )
         {
@@ -6916,6 +6955,19 @@ x86_emulate(
         }
         goto unrecognized_insn;
 
+    case X86EMUL_OPC_EVEX_66(0x0f, 0x73): /* Grp14 */
+        switch ( modrm_reg & 7 )
+        {
+        case 2: /* vpsrlq $imm8,[xyz]mm/mem,[xyz]mm{k} */
+        case 6: /* vpsllq $imm8,[xyz]mm/mem,[xyz]mm{k} */
+            generate_exception_if(!evex.w, EXC_UD);
+            goto avx512f_shift_imm;
+        case 3: /* vpsrldq $imm8,[xyz]mm/mem,[xyz]mm */
+        case 7: /* vpslldq $imm8,[xyz]mm/mem,[xyz]mm */
+            goto avx512bw_shift_imm;
+        }
+        goto unrecognized_insn;
+
     case X86EMUL_OPC(0x0f, 0x77):        /* emms */
     case X86EMUL_OPC_VEX(0x0f, 0x77):    /* vzero{all,upper} */
         if ( vex.opcx != vex_none )
@@ -7853,6 +7905,16 @@ x86_emulate(
         }
         break;
 
+    case X86EMUL_OPC_EVEX_66(0x0f, 0xd2): /* vpsrld xmm/m128,[xyz]mm,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f, 0xd3): /* vpsrlq xmm/m128,[xyz]mm,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f, 0xe2): /* vpsra{d,q} xmm/m128,[xyz]mm,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f, 0xf2): /* vpslld xmm/m128,[xyz]mm,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f, 0xf3): /* vpsllq xmm/m128,[xyz]mm,[xyz]mm{k} */
+        generate_exception_if(evex.br, EXC_UD);
+        fault_suppression = false;
+        if ( b == 0xe2 )
+            goto avx512f_no_sae;
+        /* fall through */
     case X86EMUL_OPC_EVEX_66(0x0f, 0xfa): /* vpsubd [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
     case X86EMUL_OPC_EVEX_66(0x0f, 0xfb): /* vpsubq [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
     case X86EMUL_OPC_EVEX_66(0x0f, 0xfe): /* vpaddd [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
@@ -8093,6 +8155,14 @@ x86_emulate(
         dst.type = OP_NONE;
         break;
 
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0x10): /* vpsrlvw [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0x11): /* vpsravw [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0x12): /* vpsllvw [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+        host_and_vcpu_must_have(avx512bw);
+        generate_exception_if(evex.br, EXC_UD);
+        elem_bytes = 1 << evex.w;
+        goto avx512f_no_sae;
+
     case X86EMUL_OPC_EVEX_66(0x0f38, 0x18): /* vbroadcastss xmm/m32,[xyz]mm{k} */
         generate_exception_if(evex.w || evex.br, EXC_UD);
     avx512_broadcast:
@@ -8849,6 +8919,7 @@ x86_emulate(
         generate_exception_if(!evex.r || !evex.R || evex.z, EXC_UD);
         if ( !(b & 0x20) )
             goto avx512f_imm_no_sae;
+    avx512bw_imm:
         host_and_vcpu_must_have(avx512bw);
         generate_exception_if(evex.br, EXC_UD);
         elem_bytes = 1 << evex.w;




_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

^ permalink raw reply	[flat|nested] 465+ messages in thread

* [PATCH v4 22/44] x86emul: support AVX512{F, BW, DQ} extract insns
  2018-09-25 13:14 ` [PATCH v4 00/44] x86emul: fixes, improvements, and beginnings of AVX512 support Jan Beulich
                     ` (20 preceding siblings ...)
  2018-09-25 13:39   ` [PATCH v4 21/44] x86emul: support AVX512{F, BW} shift/rotate insns Jan Beulich
@ 2018-09-25 13:40   ` Jan Beulich
  2018-09-25 13:40   ` [PATCH v4 23/44] x86emul: support AVX512{F, BW, DQ} insert insns Jan Beulich
                     ` (21 subsequent siblings)
  43 siblings, 0 replies; 465+ messages in thread
From: Jan Beulich @ 2018-09-25 13:40 UTC (permalink / raw)
  To: xen-devel; +Cc: George Dunlap, Andrew Cooper, Wei Liu

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v4: Make use of d8s_dq64.
v3: New.

--- a/tools/tests/x86_emulator/evex-disp8.c
+++ b/tools/tests/x86_emulator/evex-disp8.c
@@ -201,6 +201,7 @@ static const struct test avx512f_all[] =
 };
 
 static const struct test avx512f_128[] = {
+    INSN(extractps, 66, 0f3a, 17, el,    d, el),
     INSN(mov,       66,   0f, 6e, el, dq64, el),
     INSN(mov,       66,   0f, 7e, el, dq64, el),
     INSN(movq,      f3,   0f, 7e, el,    q, el),
@@ -210,10 +211,14 @@ static const struct test avx512f_128[] =
 static const struct test avx512f_no128[] = {
     INSN(broadcastf32x4, 66, 0f38, 1a, el_4,  d, vl),
     INSN(broadcastsd,    66, 0f38, 19, el,    q, el),
+    INSN(extractf32x4,   66, 0f3a, 19, el_4,  d, vl),
+    INSN(extracti32x4,   66, 0f3a, 39, el_4,  d, vl),
 };
 
 static const struct test avx512f_512[] = {
     INSN(broadcastf64x4, 66, 0f38, 1b, el_4, q, vl),
+    INSN(extractf64x4,   66, 0f3a, 1b, el_4, q, vl),
+    INSN(extracti64x4,   66, 0f3a, 3b, el_4, q, vl),
 };
 
 static const struct test avx512bw_all[] = {
@@ -269,6 +274,12 @@ static const struct test avx512bw_all[]
     INSN(ptestnm,     f3, 0f38, 26,    vl,   bw, vl),
 };
 
+static const struct test avx512bw_128[] = {
+    INSN(pextrb, 66, 0f3a, 14, el, b, el),
+//       pextrw, 66,   0f, c5,     w
+    INSN(pextrw, 66, 0f3a, 15, el, w, el),
+};
+
 static const struct test avx512dq_all[] = {
     INSN_PFP(and,              0f, 54),
     INSN_PFP(andn,             0f, 55),
@@ -277,13 +288,21 @@ static const struct test avx512dq_all[]
     INSN_PFP(xor,              0f, 57),
 };
 
+static const struct test avx512dq_128[] = {
+    INSN(pextr, 66, 0f3a, 16, el, dq64, el),
+};
+
 static const struct test avx512dq_no128[] = {
     INSN(broadcastf32x2, 66, 0f38, 19, el_2, d, vl),
     INSN(broadcastf64x2, 66, 0f38, 1a, el_2, q, vl),
+    INSN(extractf64x2,   66, 0f3a, 19, el_2, q, vl),
+    INSN(extracti64x2,   66, 0f3a, 39, el_2, q, vl),
 };
 
 static const struct test avx512dq_512[] = {
     INSN(broadcastf32x8, 66, 0f38, 1b, el_8, d, vl),
+    INSN(extractf32x8,   66, 0f3a, 1b, el_8, d, vl),
+    INSN(extracti32x8,   66, 0f3a, 3b, el_8, d, vl),
 };
 
 static const unsigned char vl_all[] = { VL_512, VL_128, VL_256 };
@@ -613,7 +632,9 @@ void evex_disp8_test(void *instr, struct
     RUN(avx512f, no128);
     RUN(avx512f, 512);
     RUN(avx512bw, all);
+    RUN(avx512bw, 128);
     RUN(avx512dq, all);
+    RUN(avx512dq, 128);
     RUN(avx512dq, no128);
     RUN(avx512dq, 512);
 }
--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -512,9 +512,13 @@ static const struct ext0f3a_table {
     [0x0a ... 0x0b] = { .simd_size = simd_scalar_opc },
     [0x0c ... 0x0d] = { .simd_size = simd_packed_fp },
     [0x0e ... 0x0f] = { .simd_size = simd_packed_int },
-    [0x14 ... 0x17] = { .simd_size = simd_none, .to_mem = 1, .two_op = 1 },
+    [0x14] = { .simd_size = simd_none, .to_mem = 1, .two_op = 1, .d8s = 0 },
+    [0x15] = { .simd_size = simd_none, .to_mem = 1, .two_op = 1, .d8s = 1 },
+    [0x16] = { .simd_size = simd_none, .to_mem = 1, .two_op = 1, .d8s = d8s_dq64 },
+    [0x17] = { .simd_size = simd_none, .to_mem = 1, .two_op = 1, .d8s = 2 },
     [0x18] = { .simd_size = simd_128 },
-    [0x19] = { .simd_size = simd_128, .to_mem = 1, .two_op = 1 },
+    [0x19] = { .simd_size = simd_128, .to_mem = 1, .two_op = 1, .d8s = 4 },
+    [0x1b] = { .simd_size = simd_256, .to_mem = 1, .two_op = 1, .d8s = d8s_vl_by_2 },
     [0x1d] = { .simd_size = simd_other, .to_mem = 1, .two_op = 1 },
     [0x1e ... 0x1f] = { .simd_size = simd_packed_int, .d8s = d8s_vl },
     [0x20] = { .simd_size = simd_none },
@@ -523,7 +527,8 @@ static const struct ext0f3a_table {
     [0x25] = { .simd_size = simd_packed_int, .d8s = d8s_vl },
     [0x30 ... 0x33] = { .simd_size = simd_other, .two_op = 1 },
     [0x38] = { .simd_size = simd_128 },
-    [0x39] = { .simd_size = simd_128, .to_mem = 1, .two_op = 1 },
+    [0x39] = { .simd_size = simd_128, .to_mem = 1, .two_op = 1, .d8s = 4 },
+    [0x3b] = { .simd_size = simd_256, .to_mem = 1, .two_op = 1, .d8s = d8s_vl_by_2 },
     [0x3e ... 0x3f] = { .simd_size = simd_packed_int, .d8s = d8s_vl },
     [0x40 ... 0x41] = { .simd_size = simd_packed_fp },
     [0x42] = { .simd_size = simd_packed_int },
@@ -2660,6 +2665,8 @@ x86_decode_0f3a(
      ... X86EMUL_OPC_66(0, 0x17):     /* pextr*, extractps */
     case X86EMUL_OPC_VEX_66(0, 0x14)
      ... X86EMUL_OPC_VEX_66(0, 0x17): /* vpextr*, vextractps */
+    case X86EMUL_OPC_EVEX_66(0, 0x14)
+     ... X86EMUL_OPC_EVEX_66(0, 0x17): /* vpextr*, vextractps */
     case X86EMUL_OPC_VEX_F2(0, 0xf0): /* rorx */
         break;
 
@@ -8838,9 +8845,9 @@ x86_emulate(
         opc[0] = b;
         /* Convert memory/GPR operand to (%rAX). */
         rex_prefix &= ~REX_B;
-        vex.b = 1;
+        evex.b = vex.b = 1;
         if ( !mode_64bit() )
-            vex.w = 0;
+            evex.w = vex.w = 0;
         opc[1] = modrm & 0x38;
         opc[2] = imm1;
         opc[3] = 0xc3;
@@ -8850,7 +8857,10 @@ x86_emulate(
             --opc;
         }
 
-        copy_REX_VEX(opc, rex_prefix, vex);
+        if ( evex_encoded() )
+            copy_EVEX(opc, evex);
+        else
+            copy_REX_VEX(opc, rex_prefix, vex);
         invoke_stub("", "", "=m" (dst.val) : "a" (&dst.val));
         put_stub(stub);
 
@@ -8870,6 +8880,52 @@ x86_emulate(
         opc = init_prefixes(stub);
         goto pextr;
 
+    case X86EMUL_OPC_EVEX_66(0x0f, 0xc5):   /* vpextrw $imm8,xmm,reg */
+        generate_exception_if(ea.type != OP_REG, EXC_UD);
+        /* Convert to alternative encoding: We want to use a memory operand. */
+        evex.opcx = ext_0f3a;
+        b = 0x15;
+        modrm <<= 3;
+        evex.r = evex.b;
+        evex.R = evex.x;
+        /* fall through */
+    case X86EMUL_OPC_EVEX_66(0x0f3a, 0x14): /* vpextrb $imm8,xmm,r/m */
+    case X86EMUL_OPC_EVEX_66(0x0f3a, 0x15): /* vpextrw $imm8,xmm,r/m */
+    case X86EMUL_OPC_EVEX_66(0x0f3a, 0x16): /* vpextr{d,q} $imm8,xmm,r/m */
+    case X86EMUL_OPC_EVEX_66(0x0f3a, 0x17): /* vextractps $imm8,xmm,r/m */
+        generate_exception_if((evex.lr || evex.reg != 0xf || !evex.RX ||
+                               evex.opmsk || evex.br),
+                              EXC_UD);
+        if ( !(b & 2) )
+            host_and_vcpu_must_have(avx512bw);
+        else if ( !(b & 1) )
+            host_and_vcpu_must_have(avx512dq);
+        else
+            host_and_vcpu_must_have(avx512f);
+        get_fpu(X86EMUL_FPU_zmm);
+        opc = init_evex(stub);
+        goto pextr;
+
+    case X86EMUL_OPC_EVEX_66(0x0f3a, 0x19): /* vextractf32x4 $imm8,{y,z}mm,xmm/m128{k} */
+                                            /* vextractf64x2 $imm8,{y,z}mm,xmm/m128{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f3a, 0x39): /* vextracti32x4 $imm8,{y,z}mm,xmm/m128{k} */
+                                            /* vextracti64x2 $imm8,{y,z}mm,xmm/m128{k} */
+        if ( evex.w )
+            host_and_vcpu_must_have(avx512dq);
+        generate_exception_if(!evex.lr || evex.br, EXC_UD);
+        fault_suppression = false;
+        goto avx512f_imm_no_sae;
+
+    case X86EMUL_OPC_EVEX_66(0x0f3a, 0x1b): /* vextractf32x8 $imm8,zmm,ymm/m256{k} */
+                                            /* vextractf64x4 $imm8,zmm,ymm/m256{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f3a, 0x3b): /* vextracti32x8 $imm8,zmm,ymm/m256{k} */
+                                            /* vextracti64x4 $imm8,zmm,ymm/m256{k} */
+        if ( !evex.w )
+            host_and_vcpu_must_have(avx512dq);
+        generate_exception_if(evex.lr != 2 || evex.br, EXC_UD);
+        fault_suppression = false;
+        goto avx512f_imm_no_sae;
+
     case X86EMUL_OPC_VEX_66(0x0f3a, 0x1d): /* vcvtps2ph $imm8,{x,y}mm,xmm/mem */
     {
         uint32_t mxcsr;




_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

^ permalink raw reply	[flat|nested] 465+ messages in thread

* [PATCH v4 23/44] x86emul: support AVX512{F, BW, DQ} insert insns
  2018-09-25 13:14 ` [PATCH v4 00/44] x86emul: fixes, improvements, and beginnings of AVX512 support Jan Beulich
                     ` (21 preceding siblings ...)
  2018-09-25 13:40   ` [PATCH v4 22/44] x86emul: support AVX512{F, BW, DQ} extract insns Jan Beulich
@ 2018-09-25 13:40   ` Jan Beulich
  2018-09-25 13:41   ` [PATCH v4 24/44] x86emul: basic AVX512F testing Jan Beulich
                     ` (20 subsequent siblings)
  43 siblings, 0 replies; 465+ messages in thread
From: Jan Beulich @ 2018-09-25 13:40 UTC (permalink / raw)
  To: xen-devel; +Cc: George Dunlap, Andrew Cooper, Wei Liu

Also correct the comment of the AVX form of VINSERTPS.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v4: Make use of d8s_dq64.
v3: New.

--- a/tools/tests/x86_emulator/evex-disp8.c
+++ b/tools/tests/x86_emulator/evex-disp8.c
@@ -202,6 +202,7 @@ static const struct test avx512f_all[] =
 
 static const struct test avx512f_128[] = {
     INSN(extractps, 66, 0f3a, 17, el,    d, el),
+    INSN(insertps,  66, 0f3a, 21, el,    d, el),
     INSN(mov,       66,   0f, 6e, el, dq64, el),
     INSN(mov,       66,   0f, 7e, el, dq64, el),
     INSN(movq,      f3,   0f, 7e, el,    q, el),
@@ -213,12 +214,16 @@ static const struct test avx512f_no128[]
     INSN(broadcastsd,    66, 0f38, 19, el,    q, el),
     INSN(extractf32x4,   66, 0f3a, 19, el_4,  d, vl),
     INSN(extracti32x4,   66, 0f3a, 39, el_4,  d, vl),
+    INSN(insertf32x4,    66, 0f3a, 18, el_4,  d, vl),
+    INSN(inserti32x4,    66, 0f3a, 38, el_4,  d, vl),
 };
 
 static const struct test avx512f_512[] = {
     INSN(broadcastf64x4, 66, 0f38, 1b, el_4, q, vl),
     INSN(extractf64x4,   66, 0f3a, 1b, el_4, q, vl),
     INSN(extracti64x4,   66, 0f3a, 3b, el_4, q, vl),
+    INSN(insertf64x4,    66, 0f3a, 1a, el_4, q, vl),
+    INSN(inserti64x4,    66, 0f3a, 3a, el_4, q, vl),
 };
 
 static const struct test avx512bw_all[] = {
@@ -278,6 +283,8 @@ static const struct test avx512bw_128[]
     INSN(pextrb, 66, 0f3a, 14, el, b, el),
 //       pextrw, 66,   0f, c5,     w
     INSN(pextrw, 66, 0f3a, 15, el, w, el),
+    INSN(pinsrb, 66, 0f3a, 20, el, b, el),
+    INSN(pinsrw, 66,   0f, c4, el, w, el),
 };
 
 static const struct test avx512dq_all[] = {
@@ -290,6 +297,7 @@ static const struct test avx512dq_all[]
 
 static const struct test avx512dq_128[] = {
     INSN(pextr, 66, 0f3a, 16, el, dq64, el),
+    INSN(pinsr, 66, 0f3a, 22, el, dq64, el),
 };
 
 static const struct test avx512dq_no128[] = {
@@ -297,12 +305,16 @@ static const struct test avx512dq_no128[
     INSN(broadcastf64x2, 66, 0f38, 1a, el_2, q, vl),
     INSN(extractf64x2,   66, 0f3a, 19, el_2, q, vl),
     INSN(extracti64x2,   66, 0f3a, 39, el_2, q, vl),
+    INSN(insertf64x2,    66, 0f3a, 18, el_2, q, vl),
+    INSN(inserti64x2,    66, 0f3a, 38, el_2, q, vl),
 };
 
 static const struct test avx512dq_512[] = {
     INSN(broadcastf32x8, 66, 0f38, 1b, el_8, d, vl),
     INSN(extractf32x8,   66, 0f3a, 1b, el_8, d, vl),
     INSN(extracti32x8,   66, 0f3a, 3b, el_8, d, vl),
+    INSN(insertf32x8,    66, 0f3a, 1a, el_8, d, vl),
+    INSN(inserti32x8,    66, 0f3a, 3a, el_8, d, vl),
 };
 
 static const unsigned char vl_all[] = { VL_512, VL_128, VL_256 };
--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -360,7 +360,7 @@ static const struct twobyte_table {
     [0xc1] = { DstMem|SrcReg|ModRM },
     [0xc2] = { DstImplicit|SrcImmByte|ModRM, simd_any_fp, d8s_vl },
     [0xc3] = { DstMem|SrcReg|ModRM|Mov },
-    [0xc4] = { DstReg|SrcImmByte|ModRM, simd_packed_int },
+    [0xc4] = { DstReg|SrcImmByte|ModRM, simd_packed_int, 1 },
     [0xc5] = { DstReg|SrcImmByte|ModRM|Mov },
     [0xc6] = { DstImplicit|SrcImmByte|ModRM, simd_packed_fp, d8s_vl },
     [0xc7] = { ImplicitOps|ModRM },
@@ -516,17 +516,19 @@ static const struct ext0f3a_table {
     [0x15] = { .simd_size = simd_none, .to_mem = 1, .two_op = 1, .d8s = 1 },
     [0x16] = { .simd_size = simd_none, .to_mem = 1, .two_op = 1, .d8s = d8s_dq64 },
     [0x17] = { .simd_size = simd_none, .to_mem = 1, .two_op = 1, .d8s = 2 },
-    [0x18] = { .simd_size = simd_128 },
+    [0x18] = { .simd_size = simd_128, .d8s = 4 },
     [0x19] = { .simd_size = simd_128, .to_mem = 1, .two_op = 1, .d8s = 4 },
+    [0x1a] = { .simd_size = simd_256, .d8s = d8s_vl_by_2 },
     [0x1b] = { .simd_size = simd_256, .to_mem = 1, .two_op = 1, .d8s = d8s_vl_by_2 },
     [0x1d] = { .simd_size = simd_other, .to_mem = 1, .two_op = 1 },
     [0x1e ... 0x1f] = { .simd_size = simd_packed_int, .d8s = d8s_vl },
-    [0x20] = { .simd_size = simd_none },
-    [0x21] = { .simd_size = simd_other },
-    [0x22] = { .simd_size = simd_none },
+    [0x20] = { .simd_size = simd_none, .d8s = 0 },
+    [0x21] = { .simd_size = simd_other, .d8s = 2 },
+    [0x22] = { .simd_size = simd_none, .d8s = d8s_dq64 },
     [0x25] = { .simd_size = simd_packed_int, .d8s = d8s_vl },
     [0x30 ... 0x33] = { .simd_size = simd_other, .two_op = 1 },
-    [0x38] = { .simd_size = simd_128 },
+    [0x38] = { .simd_size = simd_128, .d8s = 4 },
+    [0x3a] = { .simd_size = simd_256, .d8s = d8s_vl_by_2 },
     [0x39] = { .simd_size = simd_128, .to_mem = 1, .two_op = 1, .d8s = 4 },
     [0x3b] = { .simd_size = simd_256, .to_mem = 1, .two_op = 1, .d8s = d8s_vl_by_2 },
     [0x3e ... 0x3f] = { .simd_size = simd_packed_int, .d8s = d8s_vl },
@@ -2570,6 +2572,7 @@ x86_decode_twobyte(
         ctxt->opcode |= MASK_INSR(vex.pfx, X86EMUL_OPC_PFX_MASK);
         /* fall through */
     case X86EMUL_OPC_VEX_66(0, 0xc4): /* vpinsrw */
+    case X86EMUL_OPC_EVEX_66(0, 0xc4): /* vpinsrw */
         state->desc = DstReg | SrcMem16;
         break;
 
@@ -2672,6 +2675,7 @@ x86_decode_0f3a(
 
     case X86EMUL_OPC_66(0, 0x20):     /* pinsrb */
     case X86EMUL_OPC_VEX_66(0, 0x20): /* vpinsrb */
+    case X86EMUL_OPC_EVEX_66(0, 0x20): /* vpinsrb */
         state->desc = DstImplicit | SrcMem;
         if ( modrm_mod != 3 )
             state->desc |= ByteOp;
@@ -2679,6 +2683,7 @@ x86_decode_0f3a(
 
     case X86EMUL_OPC_66(0, 0x22):     /* pinsr{d,q} */
     case X86EMUL_OPC_VEX_66(0, 0x22): /* vpinsr{d,q} */
+    case X86EMUL_OPC_EVEX_66(0, 0x22): /* vpinsr{d,q} */
         state->desc = DstImplicit | SrcMem;
         break;
 
@@ -7695,6 +7700,23 @@ x86_emulate(
         ea.type = OP_MEM;
         goto simd_0f_int_imm8;
 
+    case X86EMUL_OPC_EVEX_66(0x0f, 0xc4):   /* vpinsrw $imm8,r32/m16,xmm,xmm */
+    case X86EMUL_OPC_EVEX_66(0x0f3a, 0x20): /* vpinsrb $imm8,r32/m8,xmm,xmm */
+    case X86EMUL_OPC_EVEX_66(0x0f3a, 0x22): /* vpinsr{d,q} $imm8,r/m,xmm,xmm */
+        generate_exception_if(evex.lr || evex.opmsk || evex.br, EXC_UD);
+        if ( b & 2 )
+            host_and_vcpu_must_have(avx512dq);
+        else
+            host_and_vcpu_must_have(avx512bw);
+        if ( !mode_64bit() )
+            evex.w = 0;
+        memcpy(mmvalp, &src.val, op_bytes);
+        ea.type = OP_MEM;
+        op_bytes = src.bytes;
+        d = SrcMem16; /* Fake for the common SIMD code below. */
+        state->simd_size = simd_other;
+        goto avx512f_imm_no_sae;
+
     CASE_SIMD_PACKED_INT(0x0f, 0xc5):      /* pextrw $imm8,{,x}mm,reg */
     case X86EMUL_OPC_VEX_66(0x0f, 0xc5):   /* vpextrw $imm8,xmm,reg */
         generate_exception_if(vex.l, EXC_UD);
@@ -8906,8 +8928,12 @@ x86_emulate(
         opc = init_evex(stub);
         goto pextr;
 
+    case X86EMUL_OPC_EVEX_66(0x0f3a, 0x18): /* vinsertf32x4 $imm8,xmm/m128,{y,z}mm{k} */
+                                            /* vinsertf64x2 $imm8,xmm/m128,{y,z}mm{k} */
     case X86EMUL_OPC_EVEX_66(0x0f3a, 0x19): /* vextractf32x4 $imm8,{y,z}mm,xmm/m128{k} */
                                             /* vextractf64x2 $imm8,{y,z}mm,xmm/m128{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f3a, 0x38): /* vinserti32x4 $imm8,xmm/m128,{y,z}mm{k} */
+                                            /* vinserti64x2 $imm8,xmm/m128,{y,z}mm{k} */
     case X86EMUL_OPC_EVEX_66(0x0f3a, 0x39): /* vextracti32x4 $imm8,{y,z}mm,xmm/m128{k} */
                                             /* vextracti64x2 $imm8,{y,z}mm,xmm/m128{k} */
         if ( evex.w )
@@ -8916,8 +8942,12 @@ x86_emulate(
         fault_suppression = false;
         goto avx512f_imm_no_sae;
 
+    case X86EMUL_OPC_EVEX_66(0x0f3a, 0x1a): /* vinsertf32x8 $imm8,ymm/m256,zmm{k} */
+                                            /* vinsertf64x4 $imm8,ymm/m256,zmm{k} */
     case X86EMUL_OPC_EVEX_66(0x0f3a, 0x1b): /* vextractf32x8 $imm8,zmm,ymm/m256{k} */
                                             /* vextractf64x4 $imm8,zmm,ymm/m256{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f3a, 0x3a): /* vinserti32x8 $imm8,ymm/m256,zmm{k} */
+                                            /* vinserti64x4 $imm8,ymm/m256,zmm{k} */
     case X86EMUL_OPC_EVEX_66(0x0f3a, 0x3b): /* vextracti32x8 $imm8,zmm,ymm/m256{k} */
                                             /* vextracti64x4 $imm8,zmm,ymm/m256{k} */
         if ( !evex.w )
@@ -9010,13 +9040,19 @@ x86_emulate(
         op_bytes = 4;
         goto simd_0f3a_common;
 
-    case X86EMUL_OPC_VEX_66(0x0f3a, 0x21): /* vinsertps $imm8,xmm/m128,xmm,xmm */
+    case X86EMUL_OPC_VEX_66(0x0f3a, 0x21): /* vinsertps $imm8,xmm/m32,xmm,xmm */
         op_bytes = 4;
         /* fall through */
     case X86EMUL_OPC_VEX_66(0x0f3a, 0x41): /* vdppd $imm8,{x,y}mm/mem,{x,y}mm,{x,y}mm */
         generate_exception_if(vex.l, EXC_UD);
         goto simd_0f_imm8_avx;
 
+    case X86EMUL_OPC_EVEX_66(0x0f3a, 0x21): /* vinsertps $imm8,xmm/m32,xmm,xmm */
+        op_bytes = 4;
+        generate_exception_if(evex.lr || evex.w || evex.opmsk || evex.br,
+                              EXC_UD);
+        goto avx512f_imm_no_sae;
+
     case X86EMUL_OPC_VEX_66(0x0f3a, 0x30): /* kshiftr{b,w} $imm8,k,k */
     case X86EMUL_OPC_VEX_66(0x0f3a, 0x32): /* kshiftl{b,w} $imm8,k,k */
         if ( !vex.w )




_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

^ permalink raw reply	[flat|nested] 465+ messages in thread

* [PATCH v4 24/44] x86emul: basic AVX512F testing
  2018-09-25 13:14 ` [PATCH v4 00/44] x86emul: fixes, improvements, and beginnings of AVX512 support Jan Beulich
                     ` (22 preceding siblings ...)
  2018-09-25 13:40   ` [PATCH v4 23/44] x86emul: support AVX512{F, BW, DQ} insert insns Jan Beulich
@ 2018-09-25 13:41   ` Jan Beulich
  2018-09-25 13:41   ` [PATCH v4 25/44] x86emul: support AVX512{F, BW, DQ} integer broadcast insns Jan Beulich
                     ` (19 subsequent siblings)
  43 siblings, 0 replies; 465+ messages in thread
From: Jan Beulich @ 2018-09-25 13:41 UTC (permalink / raw)
  To: xen-devel; +Cc: George Dunlap, Andrew Cooper, Wei Liu

Test various of the insns which have been implemented already.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v4: Make eq() also work for 4- and 8-byte integer element sizes.
v3: New.

--- a/tools/tests/x86_emulator/Makefile
+++ b/tools/tests/x86_emulator/Makefile
@@ -13,7 +13,7 @@ all: $(TARGET)
 run: $(TARGET)
 	./$(TARGET)
 
-SIMD := 3dnow sse sse2 sse4 avx avx2 xop
+SIMD := 3dnow sse sse2 sse4 avx avx2 xop avx512f
 FMA := fma4 fma
 SG := avx2-sg
 TESTCASES := blowfish $(SIMD) $(FMA) $(SG)
@@ -60,6 +60,9 @@ avx2-sg-flts := 4 8
 xop-vecs := $(avx-vecs)
 xop-ints := 1 2 4 8
 xop-flts := $(avx-flts)
+avx512f-vecs := 64
+avx512f-ints := 4 8
+avx512f-flts := 4 8
 
 avx512f-opmask-vecs := 2
 avx512dq-opmask-vecs := 1
@@ -145,7 +148,7 @@ $(addsuffix .c,$(SG)):
 
 $(addsuffix .h,$(SIMD) $(FMA) $(SG)): simd.h
 
-xop.h: simd-fma.c
+xop.h avx512f.h: simd-fma.c
 
 endif # 32-bit override
 
--- a/tools/tests/x86_emulator/simd.c
+++ b/tools/tests/x86_emulator/simd.c
@@ -2,7 +2,41 @@
 
 ENTRY(simd_test);
 
-#if VEC_SIZE == 8 && defined(__SSE__)
+#if defined(__AVX512F__)
+# define ALL_TRUE (~0ULL >> (64 - ELEM_COUNT))
+# if VEC_SIZE == 4
+#  define eq(x, y) ({ \
+    float x_ = (x)[0]; \
+    float __attribute__((vector_size(16))) y_ = { (y)[0] }; \
+    unsigned short r_; \
+    asm ( "vcmpss $0, %1, %2, %0"  : "=k" (r_) : "m" (x_), "v" (y_) ); \
+    r_ == 1; \
+})
+# elif VEC_SIZE == 8
+#  define eq(x, y) ({ \
+    double x_ = (x)[0]; \
+    double __attribute__((vector_size(16))) y_ = { (y)[0] }; \
+    unsigned short r_; \
+    asm ( "vcmpsd $0, %1, %2, %0"  : "=k" (r_) : "m" (x_), "v" (y_) ); \
+    r_ == 1; \
+})
+# elif FLOAT_SIZE == 4
+/*
+ * gcc's (up to at least 8.2) __builtin_ia32_cmpps256_mask() has an anomaly in
+ * that its return type is QI rather than UQI, and hence the value would get
+ * sign-extended before comparing to ALL_TRUE. The same oddity does not matter
+ * for __builtin_ia32_cmppd256_mask(), as there only 4 bits are significant.
+ * Hence the extra " & ALL_TRUE".
+ */
+#  define eq(x, y) ((BR(cmpps, _mask, x, y, 0, -1) & ALL_TRUE) == ALL_TRUE)
+# elif FLOAT_SIZE == 8
+#  define eq(x, y) (BR(cmppd, _mask, x, y, 0, -1) == ALL_TRUE)
+# elif INT_SIZE == 4 || UINT_SIZE == 4
+#  define eq(x, y) (B(pcmpeqd, _mask, (vsi_t)(x), (vsi_t)(y), -1) == ALL_TRUE)
+# elif INT_SIZE == 8 || UINT_SIZE == 8
+#  define eq(x, y) (B(pcmpeqq, _mask, (vdi_t)(x), (vdi_t)(y), -1) == ALL_TRUE)
+# endif
+#elif VEC_SIZE == 8 && defined(__SSE__)
 # define to_bool(cmp) (__builtin_ia32_pmovmskb(cmp) == 0xff)
 #elif VEC_SIZE == 16
 # if defined(__AVX__) && defined(FLOAT_SIZE)
@@ -93,6 +127,50 @@ static inline bool _to_bool(byte_vec_t b
     touch(x); \
     __builtin_ia32_pfrcpit2(__builtin_ia32_pfrsqit1(__builtin_ia32_pfmul(t_, t_), x), t_); \
 })
+#elif defined(FLOAT_SIZE) && defined(__AVX512F__) && \
+      (VEC_SIZE == 64 || defined(__AVX512VL__))
+# if VEC_SIZE > FLOAT_SIZE
+#  if FLOAT_SIZE == 4
+#   define broadcast(x) ({ \
+    vec_t t_; \
+    asm ( "%{evex%} vbroadcastss %1, %0" \
+          : "=v" (t_) : "m" (*(float[1]){ x }) ); \
+    t_; \
+})
+#   define max(x, y) BR_(maxps, _mask, x, y, undef(), ~0)
+#   define min(x, y) BR_(minps, _mask, x, y, undef(), ~0)
+#   define mix(x, y) B(movaps, _mask, x, y, (0b0101010101010101 & ALL_TRUE))
+#   if VEC_SIZE == 16
+#    define interleave_hi(x, y) B(unpckhps, _mask, x, y, undef(), ~0)
+#    define interleave_lo(x, y) B(unpcklps, _mask, x, y, undef(), ~0)
+#    define swap(x) B(shufps, _mask, x, x, 0b00011011, undef(), ~0)
+#   endif
+#  elif FLOAT_SIZE == 8
+#   if VEC_SIZE >= 32
+#    define broadcast(x) ({ \
+    vec_t t_; \
+    asm ( "%{evex%} vbroadcastsd %1, %0" : "=v" (t_) \
+          : "m" (*(double[1]){ x }) ); \
+    t_; \
+})
+#   else
+#    define broadcast(x) ({ \
+    vec_t t_; \
+    asm ( "%{evex%} vpbroadcastq %1, %0" \
+          : "=v" (t_) : "m" (*(double[1]){ x }) ); \
+    t_; \
+})
+#   endif
+#   define max(x, y) BR_(maxpd, _mask, x, y, undef(), ~0)
+#   define min(x, y) BR_(minpd, _mask, x, y, undef(), ~0)
+#   define mix(x, y) B(movapd, _mask, x, y, 0b01010101)
+#   if VEC_SIZE == 16
+#    define interleave_hi(x, y) B(unpckhpd, _mask, x, y, undef(), ~0)
+#    define interleave_lo(x, y) B(unpcklpd, _mask, x, y, undef(), ~0)
+#    define swap(x) B(shufpd, _mask, x, x, 0b01, undef(), ~0)
+#   endif
+#  endif
+# endif
 #elif FLOAT_SIZE == 4 && defined(__SSE__)
 # if VEC_SIZE == 32 && defined(__AVX__)
 #  if defined(__AVX2__)
@@ -191,7 +269,30 @@ static inline bool _to_bool(byte_vec_t b
 #  define sqrt(x) scalar_1op(x, "sqrtsd %[in], %[out]")
 # endif
 #endif
-#if VEC_SIZE == 16 && defined(__SSE2__)
+#if (INT_SIZE == 4 || UINT_SIZE == 4 || INT_SIZE == 8 || UINT_SIZE == 8) && \
+     defined(__AVX512F__) && (VEC_SIZE == 64 || defined(__AVX512VL__))
+# if INT_SIZE == 4 || UINT_SIZE == 4
+#  define mix(x, y) ((vec_t)B(movdqa32_, _mask, (vsi_t)(x), (vsi_t)(y), \
+                              (0b0101010101010101 & ((1 << ELEM_COUNT) - 1))))
+# elif INT_SIZE == 8 || UINT_SIZE == 8
+#  define mix(x, y) ((vec_t)B(movdqa64_, _mask, (vdi_t)(x), (vdi_t)(y), 0b01010101))
+# endif
+# if INT_SIZE == 4
+#  define max(x, y) B(pmaxsd, _mask, x, y, undef(), ~0)
+#  define min(x, y) B(pminsd, _mask, x, y, undef(), ~0)
+#  define mul_full(x, y) ((vec_t)B(pmuldq, _mask, x, y, (vdi_t)undef(), ~0))
+# elif UINT_SIZE == 4
+#  define max(x, y) ((vec_t)B(pmaxud, _mask, (vsi_t)(x), (vsi_t)(y), (vsi_t)undef(), ~0))
+#  define min(x, y) ((vec_t)B(pminud, _mask, (vsi_t)(x), (vsi_t)(y), (vsi_t)undef(), ~0))
+#  define mul_full(x, y) ((vec_t)B(pmuludq, _mask, (vsi_t)(x), (vsi_t)(y), (vdi_t)undef(), ~0))
+# elif INT_SIZE == 8
+#  define max(x, y) ((vec_t)B(pmaxsq, _mask, (vdi_t)(x), (vdi_t)(y), (vdi_t)undef(), ~0))
+#  define min(x, y) ((vec_t)B(pminsq, _mask, (vdi_t)(x), (vdi_t)(y), (vdi_t)undef(), ~0))
+# elif UINT_SIZE == 8
+#  define max(x, y) ((vec_t)B(pmaxuq, _mask, (vdi_t)(x), (vdi_t)(y), (vdi_t)undef(), ~0))
+#  define min(x, y) ((vec_t)B(pminuq, _mask, (vdi_t)(x), (vdi_t)(y), (vdi_t)undef(), ~0))
+# endif
+#elif VEC_SIZE == 16 && defined(__SSE2__)
 # if INT_SIZE == 1 || UINT_SIZE == 1
 #  define interleave_hi(x, y) ((vec_t)__builtin_ia32_punpckhbw128((vqi_t)(x), (vqi_t)(y)))
 #  define interleave_lo(x, y) ((vec_t)__builtin_ia32_punpcklbw128((vqi_t)(x), (vqi_t)(y)))
@@ -587,6 +688,10 @@ static inline bool _to_bool(byte_vec_t b
 # endif
 #endif
 
+#if defined(__AVX512F__) && defined(FLOAT_SIZE)
+# include "simd-fma.c"
+#endif
+
 int simd_test(void)
 {
     unsigned int i, j;
@@ -1034,7 +1139,8 @@ int simd_test(void)
 # endif
 #endif
 
-#if defined(__XOP__) && VEC_SIZE == 16 && (INT_SIZE == 2 || INT_SIZE == 4)
+#if (defined(__XOP__) && VEC_SIZE == 16 && (INT_SIZE == 2 || INT_SIZE == 4)) || \
+    (defined(__AVX512F__) && defined(FLOAT_SIZE))
     return -fma_test();
 #endif
 
--- a/tools/tests/x86_emulator/simd.h
+++ b/tools/tests/x86_emulator/simd.h
@@ -70,9 +70,111 @@ typedef int __attribute__((vector_size(V
 typedef long long __attribute__((vector_size(VEC_SIZE))) vdi_t;
 #endif
 
+#if VEC_SIZE == 16
+# define B(n, s, a...)   __builtin_ia32_ ## n ## 128 ## s(a)
+# define B_(n, s, a...)  __builtin_ia32_ ## n ##        s(a)
+#elif VEC_SIZE == 32
+# define B(n, s, a...)   __builtin_ia32_ ## n ## 256 ## s(a)
+#elif VEC_SIZE == 64
+# define B(n, s, a...)   __builtin_ia32_ ## n ## 512 ## s(a)
+# define BR(n, s, a...)  __builtin_ia32_ ## n ## 512 ## s(a, 4)
+#endif
+#ifndef B_
+# define B_ B
+#endif
+#ifndef BR
+# define BR B
+# define BR_ B_
+#endif
+#ifndef BR_
+# define BR_ BR
+#endif
+
+#ifdef __AVX512F__
+
+/*
+ * The original plan was to effect use of EVEX encodings for scalar as well as
+ * 128- and 256-bit insn variants by restricting the compiler to use (on 64-bit
+ * only of course) XMM16-XMM31 only. All sorts of compiler errors result when
+ * doing this with gcc 8.2. Therefore resort to injecting {evex} prefixes,
+ * which has the benefit of also working for 32-bit. Granted, there is a lot of
+ * escaping to get right here.
+ */
+asm ( ".macro override insn    \n\t"
+      ".macro $\\insn o:vararg \n\t"
+      ".purgem \\insn          \n\t"
+      "{evex} \\insn \\(\\)o   \n\t"
+      ".macro \\insn o:vararg  \n\t"
+      "$\\insn \\(\\(\\))o     \n\t"
+      ".endm                   \n\t"
+      ".endm                   \n\t"
+      ".macro \\insn o:vararg  \n\t"
+      "$\\insn \\(\\)o         \n\t"
+      ".endm                   \n\t"
+      ".endm" );
+
+#define OVR(n) asm ( "override v" #n )
+#define OVR_SFP(n) OVR(n ## sd); OVR(n ## ss)
+
+#ifdef __AVX512VL__
+# ifdef __AVX512BW__
+#  define OVR_BW(n) OVR(p ## n ## b); OVR(p ## n ## w)
+# else
+#  define OVR_BW(n)
+# endif
+# define OVR_DQ(n) OVR(p ## n ## d); OVR(p ## n ## q)
+# define OVR_VFP(n) OVR(n ## pd); OVR(n ## ps)
+#else
+# define OVR_BW(n)
+# define OVR_DQ(n)
+# define OVR_VFP(n)
+#endif
+
+#define OVR_FMA(n, w) OVR_ ## w(n ## 132); OVR_ ## w(n ## 213); \
+                      OVR_ ## w(n ## 231)
+#define OVR_FP(n) OVR_VFP(n); OVR_SFP(n)
+#define OVR_INT(n) OVR_BW(n); OVR_DQ(n)
+
+OVR_SFP(broadcast);
+OVR_SFP(comi);
+OVR_FP(add);
+OVR_FP(div);
+OVR(extractps);
+OVR_FMA(fmadd, FP);
+OVR_FMA(fmsub, FP);
+OVR_FMA(fnmadd, FP);
+OVR_FMA(fnmsub, FP);
+OVR(insertps);
+OVR_FP(max);
+OVR_FP(min);
+OVR(movd);
+OVR(movq);
+OVR_SFP(mov);
+OVR_FP(mul);
+OVR_FP(sqrt);
+OVR_FP(sub);
+OVR_SFP(ucomi);
+
+#undef OVR_VFP
+#undef OVR_SFP
+#undef OVR_INT
+#undef OVR_FP
+#undef OVR_FMA
+#undef OVR_DQ
+#undef OVR_BW
+#undef OVR
+
+#endif
+
 /*
  * Suppress value propagation by the compiler, preventing unwanted
  * optimization. This at once makes the compiler use memory operands
  * more often, which for our purposes is the more interesting case.
  */
 #define touch(var) asm volatile ( "" : "+m" (var) )
+
+static inline vec_t undef(void)
+{
+    vec_t v = v;
+    return v;
+}
--- a/tools/tests/x86_emulator/simd-fma.c
+++ b/tools/tests/x86_emulator/simd-fma.c
@@ -1,10 +1,9 @@
+#if !defined(__XOP__) && !defined(__AVX512F__)
 #include "simd.h"
-
-#ifndef __XOP__
 ENTRY(fma_test);
 #endif
 
-#if VEC_SIZE < 16
+#if VEC_SIZE < 16 && !defined(to_bool)
 # define to_bool(cmp) (!~(cmp)[0])
 #elif VEC_SIZE == 16
 # if FLOAT_SIZE == 4
@@ -24,7 +23,13 @@ ENTRY(fma_test);
 # define eq(x, y) to_bool((x) == (y))
 #endif
 
-#if VEC_SIZE == 16
+#if defined(__AVX512F__) && VEC_SIZE > FLOAT_SIZE
+# if FLOAT_SIZE == 4
+#  define fmaddsub(x, y, z) BR(vfmaddsubps, _mask, x, y, z, ~0)
+# elif FLOAT_SIZE == 8
+#  define fmaddsub(x, y, z) BR(vfmaddsubpd, _mask, x, y, z, ~0)
+# endif
+#elif VEC_SIZE == 16
 # if FLOAT_SIZE == 4
 #  define addsub(x, y) __builtin_ia32_addsubps(x, y)
 #  if defined(__FMA4__) || defined(__FMA__)
@@ -50,6 +55,10 @@ ENTRY(fma_test);
 # endif
 #endif
 
+#if defined(fmaddsub) && !defined(addsub)
+# define addsub(x, y) fmaddsub(x, broadcast(1), y)
+#endif
+
 int fma_test(void)
 {
     unsigned int i;
--- a/tools/tests/x86_emulator/test_x86_emulator.c
+++ b/tools/tests/x86_emulator/test_x86_emulator.c
@@ -21,6 +21,7 @@ asm ( ".pushsection .test, \"ax\", @prog
 #include "avx512f-opmask.h"
 #include "avx512dq-opmask.h"
 #include "avx512bw-opmask.h"
+#include "avx512f.h"
 
 #define verbose false /* Switch to true for far more logging. */
 
@@ -248,6 +249,14 @@ static const struct {
     SIMD(OPMASK/b,    avx512dq_opmask,         1),
     SIMD(OPMASK/d,    avx512bw_opmask,         4),
     SIMD(OPMASK/q,    avx512bw_opmask,         8),
+    SIMD(AVX512F f32 scalar,  avx512f,        f4),
+    SIMD(AVX512F f32x16,      avx512f,      64f4),
+    SIMD(AVX512F f64 scalar,  avx512f,        f8),
+    SIMD(AVX512F f64x8,       avx512f,      64f8),
+    SIMD(AVX512F s32x16,      avx512f,      64i4),
+    SIMD(AVX512F u32x16,      avx512f,      64u4),
+    SIMD(AVX512F s64x8,       avx512f,      64i8),
+    SIMD(AVX512F u64x8,       avx512f,      64u8),
 #undef SIMD_
 #undef SIMD
 };




_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

^ permalink raw reply	[flat|nested] 465+ messages in thread

* [PATCH v4 25/44] x86emul: support AVX512{F, BW, DQ} integer broadcast insns
  2018-09-25 13:14 ` [PATCH v4 00/44] x86emul: fixes, improvements, and beginnings of AVX512 support Jan Beulich
                     ` (23 preceding siblings ...)
  2018-09-25 13:41   ` [PATCH v4 24/44] x86emul: basic AVX512F testing Jan Beulich
@ 2018-09-25 13:41   ` Jan Beulich
  2018-09-25 13:42   ` [PATCH v4 26/44] x86emul: basic AVX512VL testing Jan Beulich
                     ` (18 subsequent siblings)
  43 siblings, 0 replies; 465+ messages in thread
From: Jan Beulich @ 2018-09-25 13:41 UTC (permalink / raw)
  To: xen-devel, Jan Beulich; +Cc: George Dunlap, Andrew Cooper, Wei Liu

Note that the pbroadcastw table entry in evex-disp8.c is slightly
different from what one would expect, due to it requiring EVEX.W to be
zero.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v3: New.

--- a/tools/tests/x86_emulator/evex-disp8.c
+++ b/tools/tests/x86_emulator/evex-disp8.c
@@ -153,6 +153,9 @@ static const struct test avx512f_all[] =
     INSN(paddq,        66,   0f, d4,    vl,      q, vl),
     INSN(pand,         66,   0f, db,    vl,     dq, vl),
     INSN(pandn,        66,   0f, df,    vl,     dq, vl),
+//       pbroadcast,   66, 0f38, 7c,          dq64
+    INSN(pbroadcastd,  66, 0f38, 58,    el,      d, el),
+    INSN(pbroadcastq,  66, 0f38, 59,    el,      q, el),
     INSN(pcmp,         66, 0f3a, 1f,    vl,     dq, vl),
     INSN(pcmpeqd,      66,   0f, 76,    vl,      d, vl),
     INSN(pcmpeqq,      66, 0f38, 29,    vl,      q, vl),
@@ -211,6 +214,7 @@ static const struct test avx512f_128[] =
 
 static const struct test avx512f_no128[] = {
     INSN(broadcastf32x4, 66, 0f38, 1a, el_4,  d, vl),
+    INSN(broadcasti32x4, 66, 0f38, 5a, el_4,  d, vl),
     INSN(broadcastsd,    66, 0f38, 19, el,    q, el),
     INSN(extractf32x4,   66, 0f3a, 19, el_4,  d, vl),
     INSN(extracti32x4,   66, 0f3a, 39, el_4,  d, vl),
@@ -220,6 +224,7 @@ static const struct test avx512f_no128[]
 
 static const struct test avx512f_512[] = {
     INSN(broadcastf64x4, 66, 0f38, 1b, el_4, q, vl),
+    INSN(broadcasti64x4, 66, 0f38, 5b, el_4, q, vl),
     INSN(extractf64x4,   66, 0f3a, 1b, el_4, q, vl),
     INSN(extracti64x4,   66, 0f3a, 3b, el_4, q, vl),
     INSN(insertf64x4,    66, 0f3a, 1a, el_4, q, vl),
@@ -239,6 +244,10 @@ static const struct test avx512bw_all[]
     INSN(paddw,       66,   0f, fd,    vl,    w, vl),
     INSN(pavgb,       66,   0f, e0,    vl,    b, vl),
     INSN(pavgw,       66,   0f, e3,    vl,    w, vl),
+    INSN(pbroadcastb, 66, 0f38, 78,    el,    b, el),
+//       pbroadcastb, 66, 0f38, 7a,           b
+    INSN(pbroadcastw, 66, 0f38, 79,    el_2,  b, vl),
+//       pbroadcastw, 66, 0f38, 7b,           b
     INSN(pcmp,        66, 0f3a, 3f,    vl,   bw, vl),
     INSN(pcmpeqb,     66,   0f, 74,    vl,    b, vl),
     INSN(pcmpeqw,     66,   0f, 75,    vl,    w, vl),
@@ -290,6 +299,7 @@ static const struct test avx512bw_128[]
 static const struct test avx512dq_all[] = {
     INSN_PFP(and,              0f, 54),
     INSN_PFP(andn,             0f, 55),
+    INSN(broadcasti32x2, 66, 0f38, 59, el_2,  d, vl),
     INSN_PFP(or,               0f, 56),
     INSN(pmullq,         66, 0f38, 40,   vl,  q, vl),
     INSN_PFP(xor,              0f, 57),
@@ -303,6 +313,7 @@ static const struct test avx512dq_128[]
 static const struct test avx512dq_no128[] = {
     INSN(broadcastf32x2, 66, 0f38, 19, el_2, d, vl),
     INSN(broadcastf64x2, 66, 0f38, 1a, el_2, q, vl),
+    INSN(broadcasti64x2, 66, 0f38, 5a, el_2, q, vl),
     INSN(extractf64x2,   66, 0f3a, 19, el_2, q, vl),
     INSN(extracti64x2,   66, 0f3a, 39, el_2, q, vl),
     INSN(insertf64x2,    66, 0f3a, 18, el_2, q, vl),
@@ -311,6 +322,7 @@ static const struct test avx512dq_no128[
 
 static const struct test avx512dq_512[] = {
     INSN(broadcastf32x8, 66, 0f38, 1b, el_8, d, vl),
+    INSN(broadcasti32x8, 66, 0f38, 5b, el_8, d, vl),
     INSN(extractf32x8,   66, 0f3a, 1b, el_8, d, vl),
     INSN(extracti32x8,   66, 0f3a, 3b, el_8, d, vl),
     INSN(insertf32x8,    66, 0f3a, 1a, el_8, d, vl),
--- a/tools/tests/x86_emulator/simd.c
+++ b/tools/tests/x86_emulator/simd.c
@@ -272,9 +272,33 @@ static inline bool _to_bool(byte_vec_t b
 #if (INT_SIZE == 4 || UINT_SIZE == 4 || INT_SIZE == 8 || UINT_SIZE == 8) && \
      defined(__AVX512F__) && (VEC_SIZE == 64 || defined(__AVX512VL__))
 # if INT_SIZE == 4 || UINT_SIZE == 4
+#  define broadcast(x) ({ \
+    vec_t t_; \
+    asm ( "%{evex%} vpbroadcastd %1, %0" \
+          : "=v" (t_) : "m" (*(int[1]){ x }) ); \
+    t_; \
+})
+#  define broadcast2(x) ({ \
+    vec_t t_; \
+    asm ( "vpbroadcastd %k1, %0" : "=v" (t_) : "r" (x) ); \
+    t_; \
+})
 #  define mix(x, y) ((vec_t)B(movdqa32_, _mask, (vsi_t)(x), (vsi_t)(y), \
                               (0b0101010101010101 & ((1 << ELEM_COUNT) - 1))))
 # elif INT_SIZE == 8 || UINT_SIZE == 8
+#  define broadcast(x) ({ \
+    vec_t t_; \
+    asm ( "%{evex%} vpbroadcastq %1, %0" \
+          : "=v" (t_) : "m" (*(long long[1]){ x }) ); \
+    t_; \
+})
+#  ifdef __x86_64__
+#   define broadcast2(x) ({ \
+    vec_t t_; \
+    asm ( "vpbroadcastq %1, %0" : "=v" (t_) : "r" ((x) + 0ULL) ); \
+    t_; \
+})
+#  endif
 #  define mix(x, y) ((vec_t)B(movdqa64_, _mask, (vdi_t)(x), (vdi_t)(y), 0b01010101))
 # endif
 # if INT_SIZE == 4
@@ -971,10 +995,14 @@ int simd_test(void)
     if ( !eq(swap2(src), inv) ) return __LINE__;
 #endif
 
-#if defined(broadcast)
+#ifdef broadcast
     if ( !eq(broadcast(ELEM_COUNT + 1), src + inv) ) return __LINE__;
 #endif
 
+#ifdef broadcast2
+    if ( !eq(broadcast2(ELEM_COUNT + 1), src + inv) ) return __LINE__;
+#endif
+
 #if defined(interleave_lo) && defined(interleave_hi)
     touch(src);
     x = interleave_lo(inv, src);
--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -454,9 +454,13 @@ static const struct ext0f38_table {
     [0x40] = { .simd_size = simd_packed_int, .d8s = d8s_vl },
     [0x41] = { .simd_size = simd_packed_int, .two_op = 1 },
     [0x45 ... 0x47] = { .simd_size = simd_packed_int, .d8s = d8s_vl },
-    [0x58 ... 0x59] = { .simd_size = simd_other, .two_op = 1 },
-    [0x5a] = { .simd_size = simd_128, .two_op = 1 },
-    [0x78 ... 0x79] = { .simd_size = simd_other, .two_op = 1 },
+    [0x58] = { .simd_size = simd_other, .two_op = 1, .d8s = 2 },
+    [0x59] = { .simd_size = simd_other, .two_op = 1, .d8s = 3 },
+    [0x5a] = { .simd_size = simd_128, .two_op = 1, .d8s = 4 },
+    [0x5b] = { .simd_size = simd_256, .two_op = 1, .d8s = d8s_vl_by_2 },
+    [0x78] = { .simd_size = simd_other, .two_op = 1 },
+    [0x79] = { .simd_size = simd_other, .two_op = 1, .d8s = 1 },
+    [0x7a ... 0x7c] = { .simd_size = simd_none, .two_op = 1 },
     [0x8c] = { .simd_size = simd_packed_int },
     [0x8e] = { .simd_size = simd_packed_int, .to_mem = 1 },
     [0x90 ... 0x93] = { .simd_size = simd_other, .vsib = 1 },
@@ -2620,6 +2624,11 @@ x86_decode_0f38(
         ctxt->opcode |= MASK_INSR(vex.pfx, X86EMUL_OPC_PFX_MASK);
         break;
 
+    case X86EMUL_OPC_EVEX_66(0, 0x7a): /* vpbroadcastb */
+    case X86EMUL_OPC_EVEX_66(0, 0x7b): /* vpbroadcastw */
+    case X86EMUL_OPC_EVEX_66(0, 0x7c): /* vpbroadcast{d,q} */
+        break;
+
     case 0xf0: /* movbe / crc32 */
         state->desc |= repne_prefix() ? ByteOp : Mov;
         if ( rep_prefix() )
@@ -8193,6 +8202,8 @@ x86_emulate(
         goto avx512f_no_sae;
 
     case X86EMUL_OPC_EVEX_66(0x0f38, 0x18): /* vbroadcastss xmm/m32,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0x58): /* vpbroadcastd xmm/m32,[xyz]mm{k} */
+        op_bytes = elem_bytes;
         generate_exception_if(evex.w || evex.br, EXC_UD);
     avx512_broadcast:
         /*
@@ -8211,17 +8222,27 @@ x86_emulate(
 
     case X86EMUL_OPC_EVEX_66(0x0f38, 0x1b): /* vbroadcastf32x8 m256,zmm{k} */
                                             /* vbroadcastf64x4 m256,zmm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0x5b): /* vbroadcasti32x8 m256,zmm{k} */
+                                            /* vbroadcasti64x4 m256,zmm{k} */
         generate_exception_if(ea.type != OP_MEM || evex.lr != 2, EXC_UD);
         /* fall through */
     case X86EMUL_OPC_EVEX_66(0x0f38, 0x19): /* vbroadcastsd xmm/m64,{y,z}mm{k} */
                                             /* vbroadcastf32x2 xmm/m64,{y,z}mm{k} */
-        generate_exception_if(!evex.lr || evex.br, EXC_UD);
+        generate_exception_if(!evex.lr, EXC_UD);
+        /* fall through */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0x59): /* vpbroadcastq xmm/m64,[xyz]mm{k} */
+                                            /* vbroadcasti32x2 xmm/m64,[xyz]mm{k} */
+        if ( b == 0x59 )
+            op_bytes = 8;
+        generate_exception_if(evex.br, EXC_UD);
         if ( !evex.w )
             host_and_vcpu_must_have(avx512dq);
         goto avx512_broadcast;
 
     case X86EMUL_OPC_EVEX_66(0x0f38, 0x1a): /* vbroadcastf32x4 m128,{y,z}mm{k} */
                                             /* vbroadcastf64x2 m128,{y,z}mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0x5a): /* vbroadcasti32x4 m128,{y,z}mm{k} */
+                                            /* vbroadcasti64x2 m128,{y,z}mm{k} */
         generate_exception_if(ea.type != OP_MEM || !evex.lr || evex.br,
                               EXC_UD);
         if ( evex.w )
@@ -8415,6 +8436,45 @@ x86_emulate(
         generate_exception_if(ea.type != OP_MEM || !vex.l || vex.w, EXC_UD);
         goto simd_0f_avx2;
 
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0x78): /* vpbroadcastb xmm/m8,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0x79): /* vpbroadcastw xmm/m16,[xyz]mm{k} */
+        host_and_vcpu_must_have(avx512bw);
+        generate_exception_if(evex.w || evex.br, EXC_UD);
+        op_bytes = elem_bytes = 1 << (b & 1);
+        /* See the comment at the avx512_broadcast label. */
+        op_mask |= !(b & 1 ? !(uint32_t)op_mask : !op_mask);
+        goto avx512f_no_sae;
+
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0x7a): /* vpbroadcastb r32,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0x7b): /* vpbroadcastw r32,[xyz]mm{k} */
+        host_and_vcpu_must_have(avx512bw);
+        generate_exception_if(evex.w, EXC_UD);
+        /* fall through */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0x7c): /* vpbroadcast{d,q} reg,[xyz]mm{k} */
+        generate_exception_if((ea.type != OP_REG || evex.br ||
+                               evex.reg != 0xf || !evex.RX),
+                              EXC_UD);
+        host_and_vcpu_must_have(avx512f);
+        avx512_vlen_check(false);
+        get_fpu(X86EMUL_FPU_zmm);
+
+        opc = init_evex(stub);
+        opc[0] = b;
+        /* Convert GPR source to %rAX. */
+        evex.b = 1;
+        if ( !mode_64bit() )
+            evex.w = 0;
+        opc[1] = modrm & 0xf8;
+        insn_bytes = EVEX_PFX_BYTES + 2;
+        opc[2] = 0xc3;
+
+        copy_EVEX(opc, evex);
+        invoke_stub("", "", "+m" (src.val) : "a" (src.val));
+
+        put_stub(stub);
+        ASSERT(!state->simd_size);
+        break;
+
     case X86EMUL_OPC_VEX_66(0x0f38, 0x8c): /* vpmaskmov{d,q} mem,{x,y}mm,{x,y}mm */
     case X86EMUL_OPC_VEX_66(0x0f38, 0x8e): /* vpmaskmov{d,q} {x,y}mm,{x,y}mm,mem */
         generate_exception_if(ea.type != OP_MEM, EXC_UD);




_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

^ permalink raw reply	[flat|nested] 465+ messages in thread

* [PATCH v4 26/44] x86emul: basic AVX512VL testing
  2018-09-25 13:14 ` [PATCH v4 00/44] x86emul: fixes, improvements, and beginnings of AVX512 support Jan Beulich
                     ` (24 preceding siblings ...)
  2018-09-25 13:41   ` [PATCH v4 25/44] x86emul: support AVX512{F, BW, DQ} integer broadcast insns Jan Beulich
@ 2018-09-25 13:42   ` Jan Beulich
  2018-09-25 13:43   ` [PATCH v4 27/44] x86emul: support AVX512{F, BW} zero- and sign-extending moves Jan Beulich
                     ` (17 subsequent siblings)
  43 siblings, 0 replies; 465+ messages in thread
From: Jan Beulich @ 2018-09-25 13:42 UTC (permalink / raw)
  To: xen-devel; +Cc: George Dunlap, Andrew Cooper, Wei Liu

Test the 128- and 256-bit variants of the insns which have been
implemented already.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v4: Move OVR() additions into __AVX512VL__ conditional.
v3: New.

--- a/tools/tests/x86_emulator/Makefile
+++ b/tools/tests/x86_emulator/Makefile
@@ -60,7 +60,7 @@ avx2-sg-flts := 4 8
 xop-vecs := $(avx-vecs)
 xop-ints := 1 2 4 8
 xop-flts := $(avx-flts)
-avx512f-vecs := 64
+avx512f-vecs := 64 16 32
 avx512f-ints := 4 8
 avx512f-flts := 4 8
 
--- a/tools/tests/x86_emulator/simd-fma.c
+++ b/tools/tests/x86_emulator/simd-fma.c
@@ -5,13 +5,13 @@ ENTRY(fma_test);
 
 #if VEC_SIZE < 16 && !defined(to_bool)
 # define to_bool(cmp) (!~(cmp)[0])
-#elif VEC_SIZE == 16
+#elif VEC_SIZE == 16 && !defined(__AVX512VL__)
 # if FLOAT_SIZE == 4
 #  define to_bool(cmp) __builtin_ia32_vtestcps(cmp, (vec_t){} == 0)
 # elif FLOAT_SIZE == 8
 #  define to_bool(cmp) __builtin_ia32_vtestcpd(cmp, (vec_t){} == 0)
 # endif
-#elif VEC_SIZE == 32
+#elif VEC_SIZE == 32 && !defined(__AVX512VL__)
 # if FLOAT_SIZE == 4
 #  define to_bool(cmp) __builtin_ia32_vtestcps256(cmp, (vec_t){} == 0)
 # elif FLOAT_SIZE == 8
--- a/tools/tests/x86_emulator/simd.c
+++ b/tools/tests/x86_emulator/simd.c
@@ -533,7 +533,7 @@ static inline bool _to_bool(byte_vec_t b
 #  define rotr(x, n) ((vec_t)__builtin_ia32_palignr128((vdi_t)(x), (vdi_t)(x), (n) * 64))
 # endif
 #endif
-#if VEC_SIZE == 16 && defined(__SSE4_1__)
+#if VEC_SIZE == 16 && defined(__SSE4_1__) && !defined(__AVX512VL__)
 # if INT_SIZE == 1
 #  define max(x, y) ((vec_t)__builtin_ia32_pmaxsb128((vqi_t)(x), (vqi_t)(y)))
 #  define min(x, y) ((vec_t)__builtin_ia32_pminsb128((vqi_t)(x), (vqi_t)(y)))
@@ -587,7 +587,7 @@ static inline bool _to_bool(byte_vec_t b
 #  define mix(x, y) __builtin_ia32_blendpd(x, y, 0b10)
 # endif
 #endif
-#if VEC_SIZE == 32 && defined(__AVX__)
+#if VEC_SIZE == 32 && defined(__AVX__) && !defined(__AVX512VL__)
 # if FLOAT_SIZE == 4
 #  define dot_product(x, y) ({ \
     vec_t t_ = __builtin_ia32_dpps256(x, y, 0b11110001); \
--- a/tools/tests/x86_emulator/simd.h
+++ b/tools/tests/x86_emulator/simd.h
@@ -92,6 +92,15 @@ typedef long long __attribute__((vector_
 
 #ifdef __AVX512F__
 
+#if VEC_SIZE < 64
+# pragma GCC target ( "avx512vl" )
+#endif
+
+#define REN(insn, old, new)                      \
+    asm ( ".macro v" #insn #old " o:vararg \n\t" \
+          "v" #insn #new " \\o             \n\t" \
+          ".endm" )
+
 /*
  * The original plan was to effect use of EVEX encodings for scalar as well as
  * 128- and 256-bit insn variants by restricting the compiler to use (on 64-bit
@@ -135,25 +144,88 @@ asm ( ".macro override insn    \n\t"
 #define OVR_FP(n) OVR_VFP(n); OVR_SFP(n)
 #define OVR_INT(n) OVR_BW(n); OVR_DQ(n)
 
+OVR_INT(broadcast);
 OVR_SFP(broadcast);
 OVR_SFP(comi);
 OVR_FP(add);
+OVR_INT(add);
 OVR_FP(div);
 OVR(extractps);
 OVR_FMA(fmadd, FP);
+OVR_FMA(fmaddsub, VFP);
 OVR_FMA(fmsub, FP);
+OVR_FMA(fmsubadd, VFP);
 OVR_FMA(fnmadd, FP);
 OVR_FMA(fnmsub, FP);
 OVR(insertps);
 OVR_FP(max);
+OVR_INT(maxs);
+OVR_INT(maxu);
 OVR_FP(min);
+OVR_INT(mins);
+OVR_INT(minu);
 OVR(movd);
 OVR(movq);
 OVR_SFP(mov);
+OVR_VFP(mova);
+OVR_VFP(movnt);
+OVR_VFP(movu);
 OVR_FP(mul);
+OVR_VFP(shuf);
+OVR_INT(sll);
+OVR_DQ(sllv);
 OVR_FP(sqrt);
+OVR_INT(sra);
+OVR_DQ(srav);
+OVR_INT(srl);
+OVR_DQ(srlv);
 OVR_FP(sub);
+OVR_INT(sub);
 OVR_SFP(ucomi);
+OVR_VFP(unpckh);
+OVR_VFP(unpckl);
+
+#ifdef __AVX512VL__
+# if ELEM_SIZE == 8 && defined(__AVX512DQ__)
+REN(extract, f128, f64x2);
+REN(extract, i128, i64x2);
+REN(insert, f128, f64x2);
+REN(insert, i128, i64x2);
+# else
+REN(extract, f128, f32x4);
+REN(extract, i128, i32x4);
+REN(insert, f128, f32x4);
+REN(insert, i128, i32x4);
+# endif
+# if ELEM_SIZE == 8
+REN(movdqa, , 64);
+REN(movdqu, , 64);
+REN(pand, , q);
+REN(pandn, , q);
+REN(por, , q);
+REN(pxor, , q);
+# else
+#  if ELEM_SIZE == 1 && defined(__AVX512BW__)
+REN(movdq, a, u8);
+REN(movdqu, , 8);
+#  elif ELEM_SIZE == 2 && defined(__AVX512BW__)
+REN(movdq, a, u16);
+REN(movdqu, , 16);
+#  else
+REN(movdqa, , 32);
+REN(movdqu, , 32);
+#  endif
+REN(pand, , d);
+REN(pandn, , d);
+REN(por, , d);
+REN(pxor, , d);
+# endif
+OVR(movntdq);
+OVR(movntdqa);
+OVR(pmulld);
+OVR(pmuldq);
+OVR(pmuludq);
+#endif
 
 #undef OVR_VFP
 #undef OVR_SFP
--- a/tools/tests/x86_emulator/test_x86_emulator.c
+++ b/tools/tests/x86_emulator/test_x86_emulator.c
@@ -88,6 +88,11 @@ static bool simd_check_avx512f(void)
 }
 #define simd_check_avx512f_opmask simd_check_avx512f
 
+static bool simd_check_avx512f_vl(void)
+{
+    return cpu_has_avx512f && cpu_has_avx512vl;
+}
+
 static bool simd_check_avx512dq(void)
 {
     return cpu_has_avx512dq;
@@ -142,11 +147,21 @@ static const struct {
       .check_cpu = simd_check_ ## feat,                             \
       .set_regs = simd_set_regs,                                    \
       .check_regs = simd_check_regs }
+#define AVX512VL_(bits, desc, feat, form)                          \
+    { .code = feat ## _x86_ ## bits ## _D ## _ ## form,            \
+      .size = sizeof(feat ## _x86_ ## bits ## _D ## _ ## form),    \
+      .bitness = bits, .name = "AVX512" #desc,                     \
+      .check_cpu = simd_check_ ## feat ## _vl,                     \
+      .set_regs = simd_set_regs,                                   \
+      .check_regs = simd_check_regs }
 #ifdef __x86_64__
 # define SIMD(desc, feat, form) SIMD_(64, desc, feat, form), \
                                 SIMD_(32, desc, feat, form)
+# define AVX512VL(desc, feat, form) AVX512VL_(64, desc, feat, form), \
+                                    AVX512VL_(32, desc, feat, form)
 #else
 # define SIMD(desc, feat, form) SIMD_(32, desc, feat, form)
+# define AVX512VL(desc, feat, form) AVX512VL_(32, desc, feat, form)
 #endif
     SIMD(3DNow! single,          _3dnow,     8f4),
     SIMD(SSE scalar single,      sse,         f4),
@@ -257,6 +272,20 @@ static const struct {
     SIMD(AVX512F u32x16,      avx512f,      64u4),
     SIMD(AVX512F s64x8,       avx512f,      64i8),
     SIMD(AVX512F u64x8,       avx512f,      64u8),
+    AVX512VL(VL f32x4,        avx512f,      16f4),
+    AVX512VL(VL f64x2,        avx512f,      16f8),
+    AVX512VL(VL f32x8,        avx512f,      32f4),
+    AVX512VL(VL f64x4,        avx512f,      32f8),
+    AVX512VL(VL s32x4,        avx512f,      16i4),
+    AVX512VL(VL u32x4,        avx512f,      16u4),
+    AVX512VL(VL s32x8,        avx512f,      32i4),
+    AVX512VL(VL u32x8,        avx512f,      32u4),
+    AVX512VL(VL s64x2,        avx512f,      16i8),
+    AVX512VL(VL u64x2,        avx512f,      16u8),
+    AVX512VL(VL s64x4,        avx512f,      32i8),
+    AVX512VL(VL u64x4,        avx512f,      32u8),
+#undef AVX512VL_
+#undef AVX512VL
 #undef SIMD_
 #undef SIMD
 };




_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

^ permalink raw reply	[flat|nested] 465+ messages in thread

* [PATCH v4 27/44] x86emul: support AVX512{F, BW} zero- and sign-extending moves
  2018-09-25 13:14 ` [PATCH v4 00/44] x86emul: fixes, improvements, and beginnings of AVX512 support Jan Beulich
                     ` (25 preceding siblings ...)
  2018-09-25 13:42   ` [PATCH v4 26/44] x86emul: basic AVX512VL testing Jan Beulich
@ 2018-09-25 13:43   ` Jan Beulich
  2018-09-25 13:43   ` [PATCH v4 28/44] x86emul: support AVX512{F, BW} down conversion moves Jan Beulich
                     ` (16 subsequent siblings)
  43 siblings, 0 replies; 465+ messages in thread
From: Jan Beulich @ 2018-09-25 13:43 UTC (permalink / raw)
  To: xen-devel; +Cc: George Dunlap, Andrew Cooper, Wei Liu

Note that the testing in simd.c doesn't really follow the ISA extension
pattern - to fit the scheme, extensions from byte- and word-granular
vectors can (currently) sensibly only happen in the AVX512BW case (and
hence the respective abstraction macros will be added there rather than
here).

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v3: New.

--- a/tools/tests/x86_emulator/evex-disp8.c
+++ b/tools/tests/x86_emulator/evex-disp8.c
@@ -166,6 +166,16 @@ static const struct test avx512f_all[] =
     INSN(pmaxu,        66, 0f38, 3f,    vl,     dq, vl),
     INSN(pmins,        66, 0f38, 39,    vl,     dq, vl),
     INSN(pminu,        66, 0f38, 3b,    vl,     dq, vl),
+    INSN(pmovsxbd,     66, 0f38, 21,    vl_4,    b, vl),
+    INSN(pmovsxbq,     66, 0f38, 22,    vl_8,    b, vl),
+    INSN(pmovsxwd,     66, 0f38, 23,    vl_2,    w, vl),
+    INSN(pmovsxwq,     66, 0f38, 24,    vl_4,    w, vl),
+    INSN(pmovsxdq,     66, 0f38, 25,    vl_2, d_nb, vl),
+    INSN(pmovzxbd,     66, 0f38, 31,    vl_4,    b, vl),
+    INSN(pmovzxbq,     66, 0f38, 32,    vl_8,    b, vl),
+    INSN(pmovzxwd,     66, 0f38, 33,    vl_2,    w, vl),
+    INSN(pmovzxwq,     66, 0f38, 34,    vl_4,    w, vl),
+    INSN(pmovzxdq,     66, 0f38, 35,    vl_2, d_nb, vl),
     INSN(pmuldq,       66, 0f38, 28,    vl,      q, vl),
     INSN(pmulld,       66, 0f38, 40,    vl,      d, vl),
     INSN(pmuludq,      66,   0f, f4,    vl,      q, vl),
@@ -263,6 +273,8 @@ static const struct test avx512bw_all[]
     INSN(pminsw,      66,   0f, ea,    vl,    w, vl),
     INSN(pminub,      66,   0f, da,    vl,    b, vl),
     INSN(pminuw,      66, 0f38, 3a,    vl,    w, vl),
+    INSN(pmovsxbw,    66, 0f38, 20,    vl_2,  b, vl),
+    INSN(pmovzxbw,    66, 0f38, 30,    vl_2,  b, vl),
     INSN(pmulhuw,     66,   0f, e4,    vl,    w, vl),
     INSN(pmulhw,      66,   0f, e5,    vl,    w, vl),
     INSN(pmullw,      66,   0f, d5,    vl,    w, vl),
--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -443,13 +443,23 @@ static const struct ext0f38_table {
     [0x1a] = { .simd_size = simd_128, .two_op = 1, .d8s = 4 },
     [0x1b] = { .simd_size = simd_256, .two_op = 1, .d8s = d8s_vl_by_2 },
     [0x1c ... 0x1e] = { .simd_size = simd_packed_int, .two_op = 1 },
-    [0x20 ... 0x25] = { .simd_size = simd_other, .two_op = 1 },
+    [0x20] = { .simd_size = simd_other, .two_op = 1, .d8s = d8s_vl_by_2 },
+    [0x21] = { .simd_size = simd_other, .two_op = 1, .d8s = d8s_vl_by_4 },
+    [0x22] = { .simd_size = simd_other, .two_op = 1, .d8s = d8s_vl_by_8 },
+    [0x23] = { .simd_size = simd_other, .two_op = 1, .d8s = d8s_vl_by_2 },
+    [0x24] = { .simd_size = simd_other, .two_op = 1, .d8s = d8s_vl_by_4 },
+    [0x25] = { .simd_size = simd_other, .two_op = 1, .d8s = d8s_vl_by_2 },
     [0x26 ... 0x29] = { .simd_size = simd_packed_int, .d8s = d8s_vl },
     [0x2a] = { .simd_size = simd_packed_int, .two_op = 1, .d8s = d8s_vl },
     [0x2b] = { .simd_size = simd_packed_int },
     [0x2c ... 0x2d] = { .simd_size = simd_packed_fp },
     [0x2e ... 0x2f] = { .simd_size = simd_packed_fp, .to_mem = 1 },
-    [0x30 ... 0x35] = { .simd_size = simd_other, .two_op = 1 },
+    [0x30] = { .simd_size = simd_other, .two_op = 1, .d8s = d8s_vl_by_2 },
+    [0x31] = { .simd_size = simd_other, .two_op = 1, .d8s = d8s_vl_by_4 },
+    [0x32] = { .simd_size = simd_other, .two_op = 1, .d8s = d8s_vl_by_8 },
+    [0x33] = { .simd_size = simd_other, .two_op = 1, .d8s = d8s_vl_by_2 },
+    [0x34] = { .simd_size = simd_other, .two_op = 1, .d8s = d8s_vl_by_4 },
+    [0x35] = { .simd_size = simd_other, .two_op = 1, .d8s = d8s_vl_by_2 },
     [0x36 ... 0x3f] = { .simd_size = simd_packed_int, .d8s = d8s_vl },
     [0x40] = { .simd_size = simd_packed_int, .d8s = d8s_vl },
     [0x41] = { .simd_size = simd_packed_int, .two_op = 1 },
@@ -8308,6 +8318,25 @@ x86_emulate(
         op_bytes = 16 >> (pmov_convert_delta[b & 7] - vex.l);
         goto simd_0f_int;
 
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0x20): /* vpmovsxbw {x,y}mm/mem,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0x30): /* vpmovzxbw {x,y}mm/mem,[xyz]mm{k} */
+        host_and_vcpu_must_have(avx512bw);
+        /* fall through */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0x21): /* vpmovsxbd xmm/mem,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0x22): /* vpmovsxbq xmm/mem,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0x23): /* vpmovsxwd {x,y}mm/mem,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0x24): /* vpmovsxwq xmm/mem,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0x25): /* vpmovsxdq {x,y}mm/mem,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0x31): /* vpmovzxbd xmm/mem,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0x32): /* vpmovzxbq xmm/mem,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0x33): /* vpmovzxwd {x,y}mm/mem,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0x34): /* vpmovzxwq xmm/mem,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0x35): /* vpmovzxdq {x,y}mm/mem,[xyz]mm{k} */
+        generate_exception_if(evex.w && (b & 7) == 5, EXC_UD);
+        op_bytes = 32 >> (pmov_convert_delta[b & 7] + 1 - evex.lr);
+        elem_bytes = (b & 7) < 3 ? 1 : (b & 7) != 5 ? 2 : 4;
+        goto avx512f_no_sae;
+
     case X86EMUL_OPC_66(0x0f38, 0x2a):     /* movntdqa m128,xmm */
     case X86EMUL_OPC_VEX_66(0x0f38, 0x2a): /* vmovntdqa mem,{x,y}mm */
         generate_exception_if(ea.type != OP_MEM, EXC_UD);
--- a/tools/tests/x86_emulator/simd.c
+++ b/tools/tests/x86_emulator/simd.c
@@ -305,10 +305,12 @@ static inline bool _to_bool(byte_vec_t b
 #  define max(x, y) B(pmaxsd, _mask, x, y, undef(), ~0)
 #  define min(x, y) B(pminsd, _mask, x, y, undef(), ~0)
 #  define mul_full(x, y) ((vec_t)B(pmuldq, _mask, x, y, (vdi_t)undef(), ~0))
+#  define widen1(x) ((vec_t)B(pmovsxdq, _mask, x, (vdi_t)undef(), ~0))
 # elif UINT_SIZE == 4
 #  define max(x, y) ((vec_t)B(pmaxud, _mask, (vsi_t)(x), (vsi_t)(y), (vsi_t)undef(), ~0))
 #  define min(x, y) ((vec_t)B(pminud, _mask, (vsi_t)(x), (vsi_t)(y), (vsi_t)undef(), ~0))
 #  define mul_full(x, y) ((vec_t)B(pmuludq, _mask, (vsi_t)(x), (vsi_t)(y), (vdi_t)undef(), ~0))
+#  define widen1(x) ((vec_t)B(pmovzxdq, _mask, (vsi_half_t)(x), (vdi_t)undef(), ~0))
 # elif INT_SIZE == 8
 #  define max(x, y) ((vec_t)B(pmaxsq, _mask, (vdi_t)(x), (vdi_t)(y), (vdi_t)undef(), ~0))
 #  define min(x, y) ((vec_t)B(pminsq, _mask, (vdi_t)(x), (vdi_t)(y), (vdi_t)undef(), ~0))
--- a/tools/tests/x86_emulator/simd.h
+++ b/tools/tests/x86_emulator/simd.h
@@ -222,6 +222,16 @@ REN(pxor, , d);
 # endif
 OVR(movntdq);
 OVR(movntdqa);
+OVR(pmovsxbd);
+OVR(pmovsxbq);
+OVR(pmovsxdq);
+OVR(pmovsxwd);
+OVR(pmovsxwq);
+OVR(pmovzxbd);
+OVR(pmovzxbq);
+OVR(pmovzxdq);
+OVR(pmovzxwd);
+OVR(pmovzxwq);
 OVR(pmulld);
 OVR(pmuldq);
 OVR(pmuludq);




_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

^ permalink raw reply	[flat|nested] 465+ messages in thread

* [PATCH v4 28/44] x86emul: support AVX512{F, BW} down conversion moves
  2018-09-25 13:14 ` [PATCH v4 00/44] x86emul: fixes, improvements, and beginnings of AVX512 support Jan Beulich
                     ` (26 preceding siblings ...)
  2018-09-25 13:43   ` [PATCH v4 27/44] x86emul: support AVX512{F, BW} zero- and sign-extending moves Jan Beulich
@ 2018-09-25 13:43   ` Jan Beulich
  2018-09-25 13:44   ` [PATCH v4 29/44] x86emul: support AVX512{F, BW} integer unpack insns Jan Beulich
                     ` (15 subsequent siblings)
  43 siblings, 0 replies; 465+ messages in thread
From: Jan Beulich @ 2018-09-25 13:43 UTC (permalink / raw)
  To: xen-devel; +Cc: George Dunlap, Andrew Cooper, Wei Liu

Note that the vpmov{,s,us}{d,q}w table entries in evex-disp8.c are
slightly different from what one would expect, due to them requiring
EVEX.W to be zero.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v4: Also #UD when evex.z is set with a memory operand.
v3: New.

--- a/tools/tests/x86_emulator/evex-disp8.c
+++ b/tools/tests/x86_emulator/evex-disp8.c
@@ -166,11 +166,26 @@ static const struct test avx512f_all[] =
     INSN(pmaxu,        66, 0f38, 3f,    vl,     dq, vl),
     INSN(pmins,        66, 0f38, 39,    vl,     dq, vl),
     INSN(pminu,        66, 0f38, 3b,    vl,     dq, vl),
+    INSN(pmovdb,       f3, 0f38, 31,    vl_4,    b, vl),
+    INSN(pmovdw,       f3, 0f38, 33,    vl_2,    b, vl),
+    INSN(pmovqb,       f3, 0f38, 32,    vl_8,    b, vl),
+    INSN(pmovqd,       f3, 0f38, 35,    vl_2, d_nb, vl),
+    INSN(pmovqw,       f3, 0f38, 34,    vl_4,    b, vl),
+    INSN(pmovsdb,      f3, 0f38, 21,    vl_4,    b, vl),
+    INSN(pmovsdw,      f3, 0f38, 23,    vl_2,    b, vl),
+    INSN(pmovsqb,      f3, 0f38, 22,    vl_8,    b, vl),
+    INSN(pmovsqd,      f3, 0f38, 25,    vl_2, d_nb, vl),
+    INSN(pmovsqw,      f3, 0f38, 24,    vl_4,    b, vl),
     INSN(pmovsxbd,     66, 0f38, 21,    vl_4,    b, vl),
     INSN(pmovsxbq,     66, 0f38, 22,    vl_8,    b, vl),
     INSN(pmovsxwd,     66, 0f38, 23,    vl_2,    w, vl),
     INSN(pmovsxwq,     66, 0f38, 24,    vl_4,    w, vl),
     INSN(pmovsxdq,     66, 0f38, 25,    vl_2, d_nb, vl),
+    INSN(pmovusdb,     f3, 0f38, 11,    vl_4,    b, vl),
+    INSN(pmovusdw,     f3, 0f38, 13,    vl_2,    b, vl),
+    INSN(pmovusqb,     f3, 0f38, 12,    vl_8,    b, vl),
+    INSN(pmovusqd,     f3, 0f38, 15,    vl_2, d_nb, vl),
+    INSN(pmovusqw,     f3, 0f38, 14,    vl_4,    b, vl),
     INSN(pmovzxbd,     66, 0f38, 31,    vl_4,    b, vl),
     INSN(pmovzxbq,     66, 0f38, 32,    vl_8,    b, vl),
     INSN(pmovzxwd,     66, 0f38, 33,    vl_2,    w, vl),
@@ -273,7 +288,10 @@ static const struct test avx512bw_all[]
     INSN(pminsw,      66,   0f, ea,    vl,    w, vl),
     INSN(pminub,      66,   0f, da,    vl,    b, vl),
     INSN(pminuw,      66, 0f38, 3a,    vl,    w, vl),
+    INSN(pmovswb,     f3, 0f38, 20,    vl_2,  b, vl),
     INSN(pmovsxbw,    66, 0f38, 20,    vl_2,  b, vl),
+    INSN(pmovuswb,    f3, 0f38, 10,    vl_2,  b, vl),
+    INSN(pmovwb,      f3, 0f38, 30,    vl_2,  b, vl),
     INSN(pmovzxbw,    66, 0f38, 30,    vl_2,  b, vl),
     INSN(pmulhuw,     66,   0f, e4,    vl,    w, vl),
     INSN(pmulhw,      66,   0f, e5,    vl,    w, vl),
--- a/tools/tests/x86_emulator/simd.c
+++ b/tools/tests/x86_emulator/simd.c
@@ -271,6 +271,17 @@ static inline bool _to_bool(byte_vec_t b
 #endif
 #if (INT_SIZE == 4 || UINT_SIZE == 4 || INT_SIZE == 8 || UINT_SIZE == 8) && \
      defined(__AVX512F__) && (VEC_SIZE == 64 || defined(__AVX512VL__))
+# if ELEM_COUNT == 8 /* vextracti{32,64}x4 */ || \
+     (ELEM_COUNT == 16 && ELEM_SIZE == 4 && defined(__AVX512DQ__)) /* vextracti32x8 */ || \
+     (ELEM_COUNT == 4 && ELEM_SIZE == 8 && defined(__AVX512DQ__)) /* vextracti64x2 */
+#  define low_half(x) ({ \
+    half_t t_; \
+    asm ( "vextracti%c[w]x%c[n] $0, %[s], %[d]" \
+          : [d] "=m" (t_) \
+          : [s] "v" (x), [w] "i" (ELEM_SIZE * 8), [n] "i" (ELEM_COUNT / 2) ); \
+    t_; \
+})
+# endif
 # if INT_SIZE == 4 || UINT_SIZE == 4
 #  define broadcast(x) ({ \
     vec_t t_; \
@@ -285,6 +296,7 @@ static inline bool _to_bool(byte_vec_t b
 })
 #  define mix(x, y) ((vec_t)B(movdqa32_, _mask, (vsi_t)(x), (vsi_t)(y), \
                               (0b0101010101010101 & ((1 << ELEM_COUNT) - 1))))
+#  define shrink1(x) ((half_t)B(pmovqd, _mask, (vdi_t)(x), (vsi_half_t){}, ~0))
 # elif INT_SIZE == 8 || UINT_SIZE == 8
 #  define broadcast(x) ({ \
     vec_t t_; \
@@ -714,6 +726,27 @@ static inline bool _to_bool(byte_vec_t b
 # endif
 #endif
 
+#if VEC_SIZE >= 16
+
+# if !defined(low_half) && defined(HALF_SIZE)
+static inline half_t low_half(vec_t x)
+{
+#  if HALF_SIZE < VEC_SIZE
+    half_t y;
+    unsigned int i;
+
+    for ( i = 0; i < ELEM_COUNT / 2; ++i )
+        y[i] = x[i];
+
+    return y;
+#  else
+    return x;
+#  endif
+}
+# endif
+
+#endif
+
 #if defined(__AVX512F__) && defined(FLOAT_SIZE)
 # include "simd-fma.c"
 #endif
@@ -1081,6 +1114,21 @@ int simd_test(void)
 
 #endif
 
+#if defined(widen1) && defined(shrink1)
+    {
+        half_t aux1 = low_half(src), aux2;
+
+        touch(aux1);
+        x = widen1(aux1);
+        touch(x);
+        aux2 = shrink1(x);
+        touch(aux2);
+        for ( i = 0; i < ELEM_COUNT / 2; ++i )
+            if ( aux2[i] != src[i] )
+                return __LINE__;
+    }
+#endif
+
 #ifdef dup_lo
     touch(src);
     x = dup_lo(src);
--- a/tools/tests/x86_emulator/simd.h
+++ b/tools/tests/x86_emulator/simd.h
@@ -70,6 +70,23 @@ typedef int __attribute__((vector_size(V
 typedef long long __attribute__((vector_size(VEC_SIZE))) vdi_t;
 #endif
 
+#if VEC_SIZE >= 16
+
+# if ELEM_COUNT >= 2
+#  if VEC_SIZE > 32
+#   define HALF_SIZE (VEC_SIZE / 2)
+#  else
+#   define HALF_SIZE 16
+#  endif
+typedef typeof((vec_t){}[0]) __attribute__((vector_size(HALF_SIZE))) half_t;
+typedef char __attribute__((vector_size(HALF_SIZE))) vqi_half_t;
+typedef short __attribute__((vector_size(HALF_SIZE))) vhi_half_t;
+typedef int __attribute__((vector_size(HALF_SIZE))) vsi_half_t;
+typedef long long __attribute__((vector_size(HALF_SIZE))) vdi_half_t;
+# endif
+
+#endif
+
 #if VEC_SIZE == 16
 # define B(n, s, a...)   __builtin_ia32_ ## n ## 128 ## s(a)
 # define B_(n, s, a...)  __builtin_ia32_ ## n ##        s(a)
--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -3056,7 +3056,22 @@ x86_decode(
                 d |= vSIB;
             state->simd_size = ext0f38_table[b].simd_size;
             if ( evex_encoded() )
-                disp8scale = decode_disp8scale(ext0f38_table[b].d8s, state);
+            {
+                /*
+                 * VPMOVUS* are identical to VPMOVS* Disp8-scaling-wise, but
+                 * their attributes don't match those of the vex_66 encoded
+                 * insns with the same base opcodes. Rather than adding new
+                 * columns to the table, handle this here for now.
+                 */
+                if ( evex.pfx != vex_f3 || (b & 0xf8) != 0x10 )
+                    disp8scale = decode_disp8scale(ext0f38_table[b].d8s, state);
+                else
+                {
+                    disp8scale = decode_disp8scale(ext0f38_table[b + 0x10].d8s,
+                                                   state);
+                    state->simd_size = simd_other;
+                }
+            }
             break;
 
         case ext_0f3a:
@@ -8318,10 +8333,14 @@ x86_emulate(
         op_bytes = 16 >> (pmov_convert_delta[b & 7] - vex.l);
         goto simd_0f_int;
 
+    case X86EMUL_OPC_EVEX_F3(0x0f38, 0x10): /* vpmovuswb [xyz]mm,{x,y}mm/mem{k} */
     case X86EMUL_OPC_EVEX_66(0x0f38, 0x20): /* vpmovsxbw {x,y}mm/mem,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_F3(0x0f38, 0x20): /* vpmovswb [xyz]mm,{x,y}mm/mem{k} */
     case X86EMUL_OPC_EVEX_66(0x0f38, 0x30): /* vpmovzxbw {x,y}mm/mem,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_F3(0x0f38, 0x30): /* vpmovwb [xyz]mm,{x,y}mm/mem{k} */
         host_and_vcpu_must_have(avx512bw);
-        /* fall through */
+        if ( evex.pfx != vex_f3 )
+        {
     case X86EMUL_OPC_EVEX_66(0x0f38, 0x21): /* vpmovsxbd xmm/mem,[xyz]mm{k} */
     case X86EMUL_OPC_EVEX_66(0x0f38, 0x22): /* vpmovsxbq xmm/mem,[xyz]mm{k} */
     case X86EMUL_OPC_EVEX_66(0x0f38, 0x23): /* vpmovsxwd {x,y}mm/mem,[xyz]mm{k} */
@@ -8332,7 +8351,28 @@ x86_emulate(
     case X86EMUL_OPC_EVEX_66(0x0f38, 0x33): /* vpmovzxwd {x,y}mm/mem,[xyz]mm{k} */
     case X86EMUL_OPC_EVEX_66(0x0f38, 0x34): /* vpmovzxwq xmm/mem,[xyz]mm{k} */
     case X86EMUL_OPC_EVEX_66(0x0f38, 0x35): /* vpmovzxdq {x,y}mm/mem,[xyz]mm{k} */
-        generate_exception_if(evex.w && (b & 7) == 5, EXC_UD);
+            generate_exception_if(evex.w && (b & 7) == 5, EXC_UD);
+        }
+        else
+        {
+    case X86EMUL_OPC_EVEX_F3(0x0f38, 0x11): /* vpmovusdb [xyz]mm,xmm/mem{k} */
+    case X86EMUL_OPC_EVEX_F3(0x0f38, 0x12): /* vpmovusqb [xyz]mm,xmm/mem{k} */
+    case X86EMUL_OPC_EVEX_F3(0x0f38, 0x13): /* vpmovusdw [xyz]mm,{x,y}mm/mem{k} */
+    case X86EMUL_OPC_EVEX_F3(0x0f38, 0x14): /* vpmovusqw [xyz]mm,xmm/mem{k} */
+    case X86EMUL_OPC_EVEX_F3(0x0f38, 0x15): /* vpmovusqd [xyz]mm,{x,y}mm/mem{k} */
+    case X86EMUL_OPC_EVEX_F3(0x0f38, 0x21): /* vpmovsdb [xyz]mm,xmm/mem{k} */
+    case X86EMUL_OPC_EVEX_F3(0x0f38, 0x22): /* vpmovsqb [xyz]mm,xmm/mem{k} */
+    case X86EMUL_OPC_EVEX_F3(0x0f38, 0x23): /* vpmovsdw [xyz]mm,{x,y}mm/mem{k} */
+    case X86EMUL_OPC_EVEX_F3(0x0f38, 0x24): /* vpmovsqw [xyz]mm,xmm/mem{k} */
+    case X86EMUL_OPC_EVEX_F3(0x0f38, 0x25): /* vpmovsqd [xyz]mm,{x,y}mm/mem{k} */
+    case X86EMUL_OPC_EVEX_F3(0x0f38, 0x31): /* vpmovdb [xyz]mm,xmm/mem{k} */
+    case X86EMUL_OPC_EVEX_F3(0x0f38, 0x32): /* vpmovqb [xyz]mm,xmm/mem{k} */
+    case X86EMUL_OPC_EVEX_F3(0x0f38, 0x33): /* vpmovdw [xyz]mm,{x,y}mm/mem{k} */
+    case X86EMUL_OPC_EVEX_F3(0x0f38, 0x34): /* vpmovqw [xyz]mm,xmm/mem{k} */
+    case X86EMUL_OPC_EVEX_F3(0x0f38, 0x35): /* vpmovqd [xyz]mm,{x,y}mm/mem{k} */
+            generate_exception_if(evex.w || (ea.type == OP_MEM && evex.z), EXC_UD);
+            d = DstMem | SrcReg | TwoOp;
+        }
         op_bytes = 32 >> (pmov_convert_delta[b & 7] + 1 - evex.lr);
         elem_bytes = (b & 7) < 3 ? 1 : (b & 7) != 5 ? 2 : 4;
         goto avx512f_no_sae;




_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

^ permalink raw reply	[flat|nested] 465+ messages in thread

* [PATCH v4 29/44] x86emul: support AVX512{F, BW} integer unpack insns
  2018-09-25 13:14 ` [PATCH v4 00/44] x86emul: fixes, improvements, and beginnings of AVX512 support Jan Beulich
                     ` (27 preceding siblings ...)
  2018-09-25 13:43   ` [PATCH v4 28/44] x86emul: support AVX512{F, BW} down conversion moves Jan Beulich
@ 2018-09-25 13:44   ` Jan Beulich
  2018-09-25 13:44   ` [PATCH v4 30/44] x86emul: support AVX512{F, BW, _VBMI} full permute insns Jan Beulich
                     ` (14 subsequent siblings)
  43 siblings, 0 replies; 465+ messages in thread
From: Jan Beulich @ 2018-09-25 13:44 UTC (permalink / raw)
  To: xen-devel; +Cc: George Dunlap, Andrew Cooper, Wei Liu

There's once again one extra twobyte_table[] entry which gets its Disp8
shift value set right away without getting support implemented just yet,
again to avoid needlessly splitting groups of entries.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v4: Move OVR() additions into __AVX512VL__ conditional.
v3: New.

--- a/tools/tests/x86_emulator/evex-disp8.c
+++ b/tools/tests/x86_emulator/evex-disp8.c
@@ -218,6 +218,10 @@ static const struct test avx512f_all[] =
     INSN(pternlog,     66, 0f3a, 25,    vl,     dq, vl),
     INSN(ptestm,       66, 0f38, 27,    vl,     dq, vl),
     INSN(ptestnm,      f3, 0f38, 27,    vl,     dq, vl),
+    INSN(punpckhdq,    66,   0f, 6a,    vl,      d, vl),
+    INSN(punpckhqdq,   66,   0f, 6d,    vl,      q, vl),
+    INSN(punpckldq,    66,   0f, 62,    vl,      d, vl),
+    INSN(punpcklqdq,   66,   0f, 6c,    vl,      q, vl),
     INSN(pxor,         66,   0f, ef,    vl,     dq, vl),
     INSN_PFP(shuf,           0f, c6),
     INSN_FP(sqrt,            0f, 51),
@@ -316,6 +320,10 @@ static const struct test avx512bw_all[]
     INSN(psubw,       66,   0f, f9,    vl,    w, vl),
     INSN(ptestm,      66, 0f38, 26,    vl,   bw, vl),
     INSN(ptestnm,     f3, 0f38, 26,    vl,   bw, vl),
+    INSN(punpckhbw,   66,   0f, 68,    vl,    b, vl),
+    INSN(punpckhwd,   66,   0f, 69,    vl,    w, vl),
+    INSN(punpcklbw,   66,   0f, 60,    vl,    b, vl),
+    INSN(punpcklwd,   66,   0f, 61,    vl,    w, vl),
 };
 
 static const struct test avx512bw_128[] = {
--- a/tools/tests/x86_emulator/simd.c
+++ b/tools/tests/x86_emulator/simd.c
@@ -294,6 +294,10 @@ static inline bool _to_bool(byte_vec_t b
     asm ( "vpbroadcastd %k1, %0" : "=v" (t_) : "r" (x) ); \
     t_; \
 })
+#  if VEC_SIZE == 16
+#   define interleave_hi(x, y) ((vec_t)B(punpckhdq, _mask, (vsi_t)(x), (vsi_t)(y), (vsi_t)undef(), ~0))
+#   define interleave_lo(x, y) ((vec_t)B(punpckldq, _mask, (vsi_t)(x), (vsi_t)(y), (vsi_t)undef(), ~0))
+#  endif
 #  define mix(x, y) ((vec_t)B(movdqa32_, _mask, (vsi_t)(x), (vsi_t)(y), \
                               (0b0101010101010101 & ((1 << ELEM_COUNT) - 1))))
 #  define shrink1(x) ((half_t)B(pmovqd, _mask, (vdi_t)(x), (vsi_half_t){}, ~0))
@@ -311,6 +315,10 @@ static inline bool _to_bool(byte_vec_t b
     t_; \
 })
 #  endif
+#  if VEC_SIZE == 16
+#   define interleave_hi(x, y) ((vec_t)B(punpckhqdq, _mask, (vdi_t)(x), (vdi_t)(y), (vdi_t)undef(), ~0))
+#   define interleave_lo(x, y) ((vec_t)B(punpcklqdq, _mask, (vdi_t)(x), (vdi_t)(y), (vdi_t)undef(), ~0))
+#  endif
 #  define mix(x, y) ((vec_t)B(movdqa64_, _mask, (vdi_t)(x), (vdi_t)(y), 0b01010101))
 # endif
 # if INT_SIZE == 4
--- a/tools/tests/x86_emulator/simd.h
+++ b/tools/tests/x86_emulator/simd.h
@@ -252,6 +252,10 @@ OVR(pmovzxwq);
 OVR(pmulld);
 OVR(pmuldq);
 OVR(pmuludq);
+OVR(punpckhdq);
+OVR(punpckhqdq);
+OVR(punpckldq);
+OVR(punpcklqdq);
 #endif
 
 #undef OVR_VFP
--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -312,10 +312,10 @@ static const struct twobyte_table {
     [0x58 ... 0x59] = { DstImplicit|SrcMem|ModRM, simd_any_fp, d8s_vl },
     [0x5a ... 0x5b] = { DstImplicit|SrcMem|ModRM|Mov, simd_other },
     [0x5c ... 0x5f] = { DstImplicit|SrcMem|ModRM, simd_any_fp, d8s_vl },
-    [0x60 ... 0x62] = { DstImplicit|SrcMem|ModRM, simd_other },
+    [0x60 ... 0x62] = { DstImplicit|SrcMem|ModRM, simd_other, d8s_vl },
     [0x63 ... 0x67] = { DstImplicit|SrcMem|ModRM, simd_packed_int, d8s_vl },
-    [0x68 ... 0x6a] = { DstImplicit|SrcMem|ModRM, simd_other },
-    [0x6b ... 0x6d] = { DstImplicit|SrcMem|ModRM, simd_packed_int },
+    [0x68 ... 0x6a] = { DstImplicit|SrcMem|ModRM, simd_other, d8s_vl },
+    [0x6b ... 0x6d] = { DstImplicit|SrcMem|ModRM, simd_packed_int, d8s_vl },
     [0x6e] = { DstImplicit|SrcMem|ModRM|Mov, simd_none, d8s_dq64 },
     [0x6f] = { DstImplicit|SrcMem|ModRM|Mov, simd_packed_int, d8s_vl },
     [0x70] = { SrcImmByte|ModRM|TwoOp, simd_other },
@@ -6643,6 +6643,12 @@ x86_emulate(
         get_fpu(X86EMUL_FPU_mmx);
         goto simd_0f_common;
 
+    case X86EMUL_OPC_EVEX_66(0x0f, 0x60): /* vpunpcklbw [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f, 0x61): /* vpunpcklwd [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f, 0x68): /* vpunpckhbw [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f, 0x69): /* vpunpckhwd [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+        op_bytes = 16 << evex.lr;
+        /* fall through */
     case X86EMUL_OPC_EVEX_66(0x0f, 0xd1): /* vpsrlw xmm/m128,[xyz]mm,[xyz]mm{k} */
     case X86EMUL_OPC_EVEX_66(0x0f, 0xe1): /* vpsraw xmm/m128,[xyz]mm,[xyz]mm{k} */
     case X86EMUL_OPC_EVEX_66(0x0f, 0xf1): /* vpsllw xmm/m128,[xyz]mm,[xyz]mm{k} */
@@ -6671,6 +6677,13 @@ x86_emulate(
         elem_bytes = 1 << (b & 1);
         goto avx512f_no_sae;
 
+    case X86EMUL_OPC_EVEX_66(0x0f, 0x62): /* vpunpckldq [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f, 0x6a): /* vpunpckhdq [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+        generate_exception_if(evex.w, EXC_UD);
+        fault_suppression = false;
+        op_bytes = 16 << evex.lr;
+        goto avx512f_no_sae;
+
     case X86EMUL_OPC_EVEX_F3(0x0f38, 0x26): /* vptestnm{b,w} [xyz]mm/mem,[xyz]mm,k{k} */
     case X86EMUL_OPC_EVEX_F3(0x0f38, 0x27): /* vptestnm{d,q} [xyz]mm/mem,[xyz]mm,k{k} */
         op_bytes = 16 << evex.lr;
@@ -6697,6 +6710,10 @@ x86_emulate(
         avx512_vlen_check(false);
         goto simd_zmm;
 
+    case X86EMUL_OPC_EVEX_66(0x0f, 0x6c): /* vpunpcklqdq [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f, 0x6d): /* vpunpckhqdq [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+        fault_suppression = false;
+        /* fall through */
     case X86EMUL_OPC_EVEX_66(0x0f, 0xd4): /* vpaddq [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
     case X86EMUL_OPC_EVEX_66(0x0f, 0xf4): /* vpmuludq [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
     case X86EMUL_OPC_EVEX_66(0x0f38, 0x28): /* vpmuldq [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */




_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

^ permalink raw reply	[flat|nested] 465+ messages in thread

* [PATCH v4 30/44] x86emul: support AVX512{F, BW, _VBMI} full permute insns
  2018-09-25 13:14 ` [PATCH v4 00/44] x86emul: fixes, improvements, and beginnings of AVX512 support Jan Beulich
                     ` (28 preceding siblings ...)
  2018-09-25 13:44   ` [PATCH v4 29/44] x86emul: support AVX512{F, BW} integer unpack insns Jan Beulich
@ 2018-09-25 13:44   ` Jan Beulich
  2018-09-25 13:46   ` [PATCH v4 31/44] x86emul: support AVX512{F, BW} integer shuffle insns Jan Beulich
                     ` (13 subsequent siblings)
  43 siblings, 0 replies; 465+ messages in thread
From: Jan Beulich @ 2018-09-25 13:44 UTC (permalink / raw)
  To: xen-devel; +Cc: George Dunlap, Andrew Cooper, Wei Liu

Take the liberty to also correct the (public interface) name of the
AVX512_VBMI feature flag, on the assumption that no external consumer
has actually been using that flag so far. Furthermore make it have
AVX512BW instead of AVX512F as a prerequisite, since it requires full
64-bit mask registers (the upper 48 bits of which can't be accessed
other than through XSAVE/XRSTOR without AVX512BW support).

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v3: New.

--- a/tools/tests/x86_emulator/evex-disp8.c
+++ b/tools/tests/x86_emulator/evex-disp8.c
@@ -162,6 +162,10 @@ static const struct test avx512f_all[] =
     INSN(pcmpgtd,      66,   0f, 66,    vl,      d, vl),
     INSN(pcmpgtq,      66, 0f38, 37,    vl,      q, vl),
     INSN(pcmpu,        66, 0f3a, 1e,    vl,     dq, vl),
+    INSN(permi2,       66, 0f38, 76,    vl,     dq, vl),
+    INSN(permi2,       66, 0f38, 77,    vl,     sd, vl),
+    INSN(permt2,       66, 0f38, 7e,    vl,     dq, vl),
+    INSN(permt2,       66, 0f38, 7f,    vl,     sd, vl),
     INSN(pmaxs,        66, 0f38, 3d,    vl,     dq, vl),
     INSN(pmaxu,        66, 0f38, 3f,    vl,     dq, vl),
     INSN(pmins,        66, 0f38, 39,    vl,     dq, vl),
@@ -283,6 +287,8 @@ static const struct test avx512bw_all[]
     INSN(pcmpgtb,     66,   0f, 64,    vl,    b, vl),
     INSN(pcmpgtw,     66,   0f, 65,    vl,    w, vl),
     INSN(pcmpu,       66, 0f3a, 3e,    vl,   bw, vl),
+    INSN(permi2w,     66, 0f38, 75,    vl,    w, vl),
+    INSN(permt2w,     66, 0f38, 7d,    vl,    w, vl),
     INSN(pmaddwd,     66,   0f, f5,    vl,    w, vl),
     INSN(pmaxsb,      66, 0f38, 3c,    vl,    b, vl),
     INSN(pmaxsw,      66,   0f, ee,    vl,    w, vl),
@@ -367,6 +373,11 @@ static const struct test avx512dq_512[]
     INSN(inserti32x8,    66, 0f3a, 3a, el_8, d, vl),
 };
 
+static const struct test avx512_vbmi_all[] = {
+    INSN(permi2b,       66, 0f38, 75, vl, b, vl),
+    INSN(permt2b,       66, 0f38, 7d, vl, b, vl),
+};
+
 static const unsigned char vl_all[] = { VL_512, VL_128, VL_256 };
 static const unsigned char vl_128[] = { VL_128 };
 static const unsigned char vl_no128[] = { VL_512, VL_256 };
@@ -699,4 +710,5 @@ void evex_disp8_test(void *instr, struct
     RUN(avx512dq, 128);
     RUN(avx512dq, no128);
     RUN(avx512dq, 512);
+    RUN(avx512_vbmi, all);
 }
--- a/tools/tests/x86_emulator/simd.c
+++ b/tools/tests/x86_emulator/simd.c
@@ -144,6 +144,9 @@ static inline bool _to_bool(byte_vec_t b
 #    define interleave_hi(x, y) B(unpckhps, _mask, x, y, undef(), ~0)
 #    define interleave_lo(x, y) B(unpcklps, _mask, x, y, undef(), ~0)
 #    define swap(x) B(shufps, _mask, x, x, 0b00011011, undef(), ~0)
+#   else
+#    define interleave_hi(x, y) B(vpermi2varps, _mask, x, interleave_hi, y, ~0)
+#    define interleave_lo(x, y) B(vpermt2varps, _mask, interleave_lo, x, y, ~0)
 #   endif
 #  elif FLOAT_SIZE == 8
 #   if VEC_SIZE >= 32
@@ -168,6 +171,9 @@ static inline bool _to_bool(byte_vec_t b
 #    define interleave_hi(x, y) B(unpckhpd, _mask, x, y, undef(), ~0)
 #    define interleave_lo(x, y) B(unpcklpd, _mask, x, y, undef(), ~0)
 #    define swap(x) B(shufpd, _mask, x, x, 0b01, undef(), ~0)
+#   else
+#    define interleave_hi(x, y) B(vpermi2varpd, _mask, x, interleave_hi, y, ~0)
+#    define interleave_lo(x, y) B(vpermt2varpd, _mask, interleave_lo, x, y, ~0)
 #   endif
 #  endif
 # endif
@@ -297,6 +303,9 @@ static inline bool _to_bool(byte_vec_t b
 #  if VEC_SIZE == 16
 #   define interleave_hi(x, y) ((vec_t)B(punpckhdq, _mask, (vsi_t)(x), (vsi_t)(y), (vsi_t)undef(), ~0))
 #   define interleave_lo(x, y) ((vec_t)B(punpckldq, _mask, (vsi_t)(x), (vsi_t)(y), (vsi_t)undef(), ~0))
+#  else
+#   define interleave_hi(x, y) ((vec_t)B(vpermi2vard, _mask, (vsi_t)(x), interleave_hi, (vsi_t)(y), ~0))
+#   define interleave_lo(x, y) ((vec_t)B(vpermt2vard, _mask, interleave_lo, (vsi_t)(x), (vsi_t)(y), ~0))
 #  endif
 #  define mix(x, y) ((vec_t)B(movdqa32_, _mask, (vsi_t)(x), (vsi_t)(y), \
                               (0b0101010101010101 & ((1 << ELEM_COUNT) - 1))))
@@ -318,6 +327,9 @@ static inline bool _to_bool(byte_vec_t b
 #  if VEC_SIZE == 16
 #   define interleave_hi(x, y) ((vec_t)B(punpckhqdq, _mask, (vdi_t)(x), (vdi_t)(y), (vdi_t)undef(), ~0))
 #   define interleave_lo(x, y) ((vec_t)B(punpcklqdq, _mask, (vdi_t)(x), (vdi_t)(y), (vdi_t)undef(), ~0))
+#  else
+#   define interleave_hi(x, y) ((vec_t)B(vpermi2varq, _mask, (vdi_t)(x), interleave_hi, (vdi_t)(y), ~0))
+#   define interleave_lo(x, y) ((vec_t)B(vpermt2varq, _mask, interleave_lo, (vdi_t)(x), (vdi_t)(y), ~0))
 #  endif
 #  define mix(x, y) ((vec_t)B(movdqa64_, _mask, (vdi_t)(x), (vdi_t)(y), 0b01010101))
 # endif
@@ -763,6 +775,7 @@ int simd_test(void)
 {
     unsigned int i, j;
     vec_t x, y, z, src, inv, alt, sh;
+    vint_t interleave_lo, interleave_hi;
 
     for ( i = 0, j = ELEM_SIZE << 3; i < ELEM_COUNT; ++i )
     {
@@ -776,6 +789,9 @@ int simd_test(void)
         if ( !(i & (i + 1)) )
             --j;
         sh[i] = j;
+
+        interleave_lo[i] = ((i & 1) * ELEM_COUNT) | (i >> 1);
+        interleave_hi[i] = interleave_lo[i] + (ELEM_COUNT / 2);
     }
 
     touch(src);
@@ -1069,7 +1085,7 @@ int simd_test(void)
     x = src * alt;
     y = interleave_lo(x, alt < 0);
     touch(x);
-    z = widen1(x);
+    z = widen1(low_half(x));
     touch(x);
     if ( !eq(z, y) ) return __LINE__;
 
@@ -1101,7 +1117,7 @@ int simd_test(void)
 
 # ifdef widen1
     touch(src);
-    x = widen1(src);
+    x = widen1(low_half(src));
     touch(src);
     if ( !eq(x, y) ) return __LINE__;
 # endif
--- a/tools/tests/x86_emulator/simd.h
+++ b/tools/tests/x86_emulator/simd.h
@@ -70,6 +70,16 @@ typedef int __attribute__((vector_size(V
 typedef long long __attribute__((vector_size(VEC_SIZE))) vdi_t;
 #endif
 
+#if ELEM_SIZE == 1
+typedef vqi_t vint_t;
+#elif ELEM_SIZE == 2
+typedef vhi_t vint_t;
+#elif ELEM_SIZE == 4
+typedef vsi_t vint_t;
+#elif ELEM_SIZE == 8
+typedef vdi_t vint_t;
+#endif
+
 #if VEC_SIZE >= 16
 
 # if ELEM_COUNT >= 2
--- a/tools/tests/x86_emulator/x86-emulate.h
+++ b/tools/tests/x86_emulator/x86-emulate.h
@@ -279,6 +279,16 @@ static inline uint64_t xgetbv(uint32_t x
     (res.b & (1U << 31)) != 0; \
 })
 
+#define cpu_has_avx512_vbmi ({ \
+    struct cpuid_leaf res; \
+    emul_test_cpuid(1, 0, &res, NULL); \
+    if ( !(res.c & (1U << 27)) || ((xgetbv(0) & 0xe6) != 0xe6) ) \
+        res.c = 0; \
+    else \
+        emul_test_cpuid(7, 0, &res, NULL); \
+    (res.c & (1U << 1)) != 0; \
+})
+
 int emul_test_cpuid(
     uint32_t leaf,
     uint32_t subleaf,
--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -468,9 +468,13 @@ static const struct ext0f38_table {
     [0x59] = { .simd_size = simd_other, .two_op = 1, .d8s = 3 },
     [0x5a] = { .simd_size = simd_128, .two_op = 1, .d8s = 4 },
     [0x5b] = { .simd_size = simd_256, .two_op = 1, .d8s = d8s_vl_by_2 },
+    [0x75 ... 0x76] = { .simd_size = simd_packed_int, .d8s = d8s_vl },
+    [0x77] = { .simd_size = simd_packed_fp, .d8s = d8s_vl },
     [0x78] = { .simd_size = simd_other, .two_op = 1 },
     [0x79] = { .simd_size = simd_other, .two_op = 1, .d8s = 1 },
     [0x7a ... 0x7c] = { .simd_size = simd_none, .two_op = 1 },
+    [0x7d ... 0x7e] = { .simd_size = simd_packed_int, .d8s = d8s_vl },
+    [0x7f] = { .simd_size = simd_packed_fp, .d8s = d8s_vl },
     [0x8c] = { .simd_size = simd_packed_int },
     [0x8e] = { .simd_size = simd_packed_int, .to_mem = 1 },
     [0x90 ... 0x93] = { .simd_size = simd_other, .vsib = 1 },
@@ -1852,6 +1856,7 @@ static bool vcpu_has(
 #define vcpu_has_sha()         vcpu_has(         7, EBX, 29, ctxt, ops)
 #define vcpu_has_avx512bw()    vcpu_has(         7, EBX, 30, ctxt, ops)
 #define vcpu_has_avx512vl()    vcpu_has(         7, EBX, 31, ctxt, ops)
+#define vcpu_has_avx512_vbmi() vcpu_has(         7, ECX,  1, ctxt, ops)
 #define vcpu_has_rdpid()       vcpu_has(         7, ECX, 22, ctxt, ops)
 #define vcpu_has_clzero()      vcpu_has(0x80000008, EBX,  0, ctxt, ops)
 
@@ -6011,6 +6016,11 @@ x86_emulate(
     CASE_SIMD_PACKED_FP(_EVEX, 0x0f, 0x15): /* vunpckhp{s,d} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
         generate_exception_if(evex.w != (evex.pfx & VEX_PREFIX_DOUBLE_MASK),
                               EXC_UD);
+        /* fall through */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0x76): /* vpermi2{d,q} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0x77): /* vpermi2p{s,d} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0x7e): /* vpermt2{d,q} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0x7f): /* vpermt2p{s,d} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
         fault_suppression = false;
         /* fall through */
     case X86EMUL_OPC_EVEX_66(0x0f, 0xdb): /* vpand{d,q} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
@@ -8522,6 +8532,16 @@ x86_emulate(
         generate_exception_if(ea.type != OP_MEM || !vex.l || vex.w, EXC_UD);
         goto simd_0f_avx2;
 
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0x75): /* vpermi2{b,w} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0x7d): /* vpermt2{b,w} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+        if ( !evex.w )
+            host_and_vcpu_must_have(avx512_vbmi);
+        else
+            host_and_vcpu_must_have(avx512bw);
+        generate_exception_if(evex.br, EXC_UD);
+        fault_suppression = false;
+        goto avx512f_no_sae;
+
     case X86EMUL_OPC_EVEX_66(0x0f38, 0x78): /* vpbroadcastb xmm/m8,[xyz]mm{k} */
     case X86EMUL_OPC_EVEX_66(0x0f38, 0x79): /* vpbroadcastw xmm/m16,[xyz]mm{k} */
         host_and_vcpu_must_have(avx512bw);
--- a/xen/include/asm-x86/cpufeature.h
+++ b/xen/include/asm-x86/cpufeature.h
@@ -107,6 +107,9 @@
 #define cpu_has_avx512bw        boot_cpu_has(X86_FEATURE_AVX512BW)
 #define cpu_has_avx512vl        boot_cpu_has(X86_FEATURE_AVX512VL)
 
+/* CPUID level 0x00000007:0.ecx */
+#define cpu_has_avx512_vbmi     boot_cpu_has(X86_FEATURE_AVX512_VBMI)
+
 /* CPUID level 0x80000007.edx */
 #define cpu_has_itsc            boot_cpu_has(X86_FEATURE_ITSC)
 
--- a/xen/include/public/arch-x86/cpufeatureset.h
+++ b/xen/include/public/arch-x86/cpufeatureset.h
@@ -224,7 +224,7 @@ XEN_CPUFEATURE(AVX512VL,      5*32+31) /
 
 /* Intel-defined CPU features, CPUID level 0x00000007:0.ecx, word 6 */
 XEN_CPUFEATURE(PREFETCHWT1,   6*32+ 0) /*A  PREFETCHWT1 instruction */
-XEN_CPUFEATURE(AVX512VBMI,    6*32+ 1) /*A  AVX-512 Vector Byte Manipulation Instrs */
+XEN_CPUFEATURE(AVX512_VBMI,   6*32+ 1) /*A  AVX-512 Vector Byte Manipulation Instrs */
 XEN_CPUFEATURE(UMIP,          6*32+ 2) /*S  User Mode Instruction Prevention */
 XEN_CPUFEATURE(PKU,           6*32+ 3) /*H  Protection Keys for Userspace */
 XEN_CPUFEATURE(OSPKE,         6*32+ 4) /*!  OS Protection Keys Enable */
--- a/xen/tools/gen-cpuid.py
+++ b/xen/tools/gen-cpuid.py
@@ -254,12 +254,17 @@ def crunch_numbers(state):
         AVX2: [AVX512F],
 
         # AVX512F is taken to mean hardware support for 512bit registers
-        # (which in practice depends on the EVEX prefix to encode), and the
-        # instructions themselves. All further AVX512 features are built on
-        # top of AVX512F
+        # (which in practice depends on the EVEX prefix to encode) as well
+        # as mask registers, and the instructions themselves. All further
+        # AVX512 features are built on top of AVX512F
         AVX512F: [AVX512DQ, AVX512IFMA, AVX512PF, AVX512ER, AVX512CD,
-                  AVX512BW, AVX512VL, AVX512VBMI, AVX512_4VNNIW,
-                  AVX512_4FMAPS, AVX512_VPOPCNTDQ],
+                  AVX512BW, AVX512VL, AVX512_4VNNIW, AVX512_4FMAPS,
+                  AVX512_VPOPCNTDQ],
+
+        # AVX512 extensions acting solely on vectors of bytes/words are made
+        # dependents of AVX512BW (as to requiring wider than 16-bit mask
+        # registers), despite the SDM not formally making this connection.
+        AVX512BW: [AVX512_VBMI],
 
         # The features:
         #   * Single Thread Indirect Branch Predictors




_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

^ permalink raw reply	[flat|nested] 465+ messages in thread

* [PATCH v4 31/44] x86emul: support AVX512{F, BW} integer shuffle insns
  2018-09-25 13:14 ` [PATCH v4 00/44] x86emul: fixes, improvements, and beginnings of AVX512 support Jan Beulich
                     ` (29 preceding siblings ...)
  2018-09-25 13:44   ` [PATCH v4 30/44] x86emul: support AVX512{F, BW, _VBMI} full permute insns Jan Beulich
@ 2018-09-25 13:46   ` Jan Beulich
  2018-09-25 13:46   ` [PATCH v4 32/44] x86emul: support AVX512{BW, DQ} mask move insns Jan Beulich
                     ` (12 subsequent siblings)
  43 siblings, 0 replies; 465+ messages in thread
From: Jan Beulich @ 2018-09-25 13:46 UTC (permalink / raw)
  To: xen-devel; +Cc: George Dunlap, Andrew Cooper, Wei Liu

Also include shuff{32x4,64x2} as being very similar to shufi{32x4,64x2}.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v4: Move OVR() addition into __AVX512VL__ conditional. Correct comments.
v3: New.

--- a/tools/tests/x86_emulator/evex-disp8.c
+++ b/tools/tests/x86_emulator/evex-disp8.c
@@ -203,6 +203,7 @@ static const struct test avx512f_all[] =
     INSN(prolv,        66, 0f38, 15,    vl,     dq, vl),
     INSNX(pror,        66,   0f, 72, 0, vl,     dq, vl),
     INSN(prorv,        66, 0f38, 14,    vl,     dq, vl),
+    INSN(pshufd,       66,   0f, 70,    vl,      d, vl),
     INSN(pslld,        66,   0f, f2,    el_4,    d, vl),
     INSNX(pslld,       66,   0f, 72, 6, vl,      d, vl),
     INSN(psllq,        66,   0f, f3,    el_2,    q, vl),
@@ -253,6 +254,10 @@ static const struct test avx512f_no128[]
     INSN(extracti32x4,   66, 0f3a, 39, el_4,  d, vl),
     INSN(insertf32x4,    66, 0f3a, 18, el_4,  d, vl),
     INSN(inserti32x4,    66, 0f3a, 38, el_4,  d, vl),
+    INSN(shuff32x4,      66, 0f3a, 23, vl,    d, vl),
+    INSN(shuff64x2,      66, 0f3a, 23, vl,    q, vl),
+    INSN(shufi32x4,      66, 0f3a, 43, vl,    d, vl),
+    INSN(shufi64x2,      66, 0f3a, 43, vl,    q, vl),
 };
 
 static const struct test avx512f_512[] = {
@@ -307,6 +312,9 @@ static const struct test avx512bw_all[]
     INSN(pmulhw,      66,   0f, e5,    vl,    w, vl),
     INSN(pmullw,      66,   0f, d5,    vl,    w, vl),
     INSN(psadbw,      66,   0f, f6,    vl,    b, vl),
+    INSN(pshufb,      66, 0f38, 00,    vl,    b, vl),
+    INSN(pshufhw,     f3,   0f, 70,    vl,    w, vl),
+    INSN(pshuflw,     f2,   0f, 70,    vl,    w, vl),
     INSNX(pslldq,     66,   0f, 73, 7, vl,    b, vl),
     INSN(psllvw,      66, 0f38, 12,    vl,    w, vl),
     INSN(psllw,       66,   0f, f1,    el_8,  w, vl),
--- a/tools/tests/x86_emulator/simd.c
+++ b/tools/tests/x86_emulator/simd.c
@@ -147,6 +147,10 @@ static inline bool _to_bool(byte_vec_t b
 #   else
 #    define interleave_hi(x, y) B(vpermi2varps, _mask, x, interleave_hi, y, ~0)
 #    define interleave_lo(x, y) B(vpermt2varps, _mask, interleave_lo, x, y, ~0)
+#    define swap(x) ({ \
+    vec_t t_ = B(shuf_f32x4_, _mask, x, x, VEC_SIZE == 32 ? 0b01 : 0b00011011, undef(), ~0); \
+    B(shufps, _mask, t_, t_, 0b00011011, undef(), ~0); \
+})
 #   endif
 #  elif FLOAT_SIZE == 8
 #   if VEC_SIZE >= 32
@@ -174,6 +178,10 @@ static inline bool _to_bool(byte_vec_t b
 #   else
 #    define interleave_hi(x, y) B(vpermi2varpd, _mask, x, interleave_hi, y, ~0)
 #    define interleave_lo(x, y) B(vpermt2varpd, _mask, interleave_lo, x, y, ~0)
+#    define swap(x) ({ \
+    vec_t t_ = B(shuf_f64x2_, _mask, x, x, VEC_SIZE == 32 ? 0b01 : 0b00011011, undef(), ~0); \
+    B(shufpd, _mask, t_, t_, 0b01010101, undef(), ~0); \
+})
 #   endif
 #  endif
 # endif
@@ -303,9 +311,14 @@ static inline bool _to_bool(byte_vec_t b
 #  if VEC_SIZE == 16
 #   define interleave_hi(x, y) ((vec_t)B(punpckhdq, _mask, (vsi_t)(x), (vsi_t)(y), (vsi_t)undef(), ~0))
 #   define interleave_lo(x, y) ((vec_t)B(punpckldq, _mask, (vsi_t)(x), (vsi_t)(y), (vsi_t)undef(), ~0))
+#   define swap(x) ((vec_t)B(pshufd, _mask, (vsi_t)(x), 0b00011011, (vsi_t)undef(), ~0))
 #  else
 #   define interleave_hi(x, y) ((vec_t)B(vpermi2vard, _mask, (vsi_t)(x), interleave_hi, (vsi_t)(y), ~0))
 #   define interleave_lo(x, y) ((vec_t)B(vpermt2vard, _mask, interleave_lo, (vsi_t)(x), (vsi_t)(y), ~0))
+#   define swap(x) ((vec_t)B(pshufd, _mask, \
+                             B(shuf_i32x4_, _mask, (vsi_t)(x), (vsi_t)(x), \
+                               VEC_SIZE == 32 ? 0b01 : 0b00011011, (vsi_t)undef(), ~0), \
+                             0b00011011, (vsi_t)undef(), ~0))
 #  endif
 #  define mix(x, y) ((vec_t)B(movdqa32_, _mask, (vsi_t)(x), (vsi_t)(y), \
                               (0b0101010101010101 & ((1 << ELEM_COUNT) - 1))))
@@ -327,9 +340,14 @@ static inline bool _to_bool(byte_vec_t b
 #  if VEC_SIZE == 16
 #   define interleave_hi(x, y) ((vec_t)B(punpckhqdq, _mask, (vdi_t)(x), (vdi_t)(y), (vdi_t)undef(), ~0))
 #   define interleave_lo(x, y) ((vec_t)B(punpcklqdq, _mask, (vdi_t)(x), (vdi_t)(y), (vdi_t)undef(), ~0))
+#   define swap(x) ((vec_t)B(pshufd, _mask, (vsi_t)(x), 0b01001110, (vsi_t)undef(), ~0))
 #  else
 #   define interleave_hi(x, y) ((vec_t)B(vpermi2varq, _mask, (vdi_t)(x), interleave_hi, (vdi_t)(y), ~0))
 #   define interleave_lo(x, y) ((vec_t)B(vpermt2varq, _mask, interleave_lo, (vdi_t)(x), (vdi_t)(y), ~0))
+#   define swap(x) ((vec_t)B(pshufd, _mask, \
+                             (vsi_t)B(shuf_i64x2_, _mask, (vdi_t)(x), (vdi_t)(x), \
+                                      VEC_SIZE == 32 ? 0b01 : 0b00011011, (vdi_t)undef(), ~0), \
+                             0b01001110, (vsi_t)undef(), ~0))
 #  endif
 #  define mix(x, y) ((vec_t)B(movdqa64_, _mask, (vdi_t)(x), (vdi_t)(y), 0b01010101))
 # endif
--- a/tools/tests/x86_emulator/simd.h
+++ b/tools/tests/x86_emulator/simd.h
@@ -119,6 +119,12 @@ typedef long long __attribute__((vector_
 
 #ifdef __AVX512F__
 
+/* Sadly there are a few exceptions to the general naming rules. */
+#define __builtin_ia32_shuf_f32x4_512_mask __builtin_ia32_shuf_f32x4_mask
+#define __builtin_ia32_shuf_f64x2_512_mask __builtin_ia32_shuf_f64x2_mask
+#define __builtin_ia32_shuf_i32x4_512_mask __builtin_ia32_shuf_i32x4_mask
+#define __builtin_ia32_shuf_i64x2_512_mask __builtin_ia32_shuf_i64x2_mask
+
 #if VEC_SIZE < 64
 # pragma GCC target ( "avx512vl" )
 #endif
@@ -262,6 +268,7 @@ OVR(pmovzxwq);
 OVR(pmulld);
 OVR(pmuldq);
 OVR(pmuludq);
+OVR(pshufd);
 OVR(punpckhdq);
 OVR(punpckhqdq);
 OVR(punpckldq);
--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -318,7 +318,7 @@ static const struct twobyte_table {
     [0x6b ... 0x6d] = { DstImplicit|SrcMem|ModRM, simd_packed_int, d8s_vl },
     [0x6e] = { DstImplicit|SrcMem|ModRM|Mov, simd_none, d8s_dq64 },
     [0x6f] = { DstImplicit|SrcMem|ModRM|Mov, simd_packed_int, d8s_vl },
-    [0x70] = { SrcImmByte|ModRM|TwoOp, simd_other },
+    [0x70] = { SrcImmByte|ModRM|TwoOp, simd_other, d8s_vl },
     [0x71 ... 0x73] = { DstImplicit|SrcImmByte|ModRM, simd_none, d8s_vl },
     [0x74 ... 0x76] = { DstImplicit|SrcMem|ModRM, simd_packed_int, d8s_vl },
     [0x77] = { DstImplicit|SrcNone },
@@ -432,7 +432,8 @@ static const struct ext0f38_table {
     uint8_t vsib:1;
     disp8scale_t d8s:4;
 } ext0f38_table[256] = {
-    [0x00 ... 0x0b] = { .simd_size = simd_packed_int },
+    [0x00] = { .simd_size = simd_packed_int, .d8s = d8s_vl },
+    [0x01 ... 0x0b] = { .simd_size = simd_packed_int },
     [0x0c ... 0x0f] = { .simd_size = simd_packed_fp },
     [0x10 ... 0x12] = { .simd_size = simd_packed_int, .d8s = d8s_vl },
     [0x13] = { .simd_size = simd_other, .two_op = 1 },
@@ -543,6 +544,7 @@ static const struct ext0f3a_table {
     [0x20] = { .simd_size = simd_none, .d8s = 0 },
     [0x21] = { .simd_size = simd_other, .d8s = 2 },
     [0x22] = { .simd_size = simd_none, .d8s = d8s_dq64 },
+    [0x23] = { .simd_size = simd_packed_int, .d8s = d8s_vl },
     [0x25] = { .simd_size = simd_packed_int, .d8s = d8s_vl },
     [0x30 ... 0x33] = { .simd_size = simd_other, .two_op = 1 },
     [0x38] = { .simd_size = simd_128, .d8s = 4 },
@@ -552,6 +554,7 @@ static const struct ext0f3a_table {
     [0x3e ... 0x3f] = { .simd_size = simd_packed_int, .d8s = d8s_vl },
     [0x40 ... 0x41] = { .simd_size = simd_packed_fp },
     [0x42] = { .simd_size = simd_packed_int },
+    [0x43] = { .simd_size = simd_packed_int, .d8s = d8s_vl },
     [0x44] = { .simd_size = simd_packed_int },
     [0x46] = { .simd_size = simd_packed_int },
     [0x48 ... 0x49] = { .simd_size = simd_packed_fp, .four_op = 1 },
@@ -6664,6 +6667,7 @@ x86_emulate(
     case X86EMUL_OPC_EVEX_66(0x0f, 0xf1): /* vpsllw xmm/m128,[xyz]mm,[xyz]mm{k} */
     case X86EMUL_OPC_EVEX_66(0x0f, 0xf5): /* vpmaddwd [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
     case X86EMUL_OPC_EVEX_66(0x0f, 0xf6): /* vpsadbw [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0x00): /* vpshufb [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
         fault_suppression = false;
         /* fall through */
     case X86EMUL_OPC_EVEX_66(0x0f, 0xd5): /* vpmullw [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
@@ -6919,6 +6923,20 @@ x86_emulate(
         insn_bytes = PFX_BYTES + 3;
         break;
 
+    case X86EMUL_OPC_EVEX_66(0x0f, 0x70): /* vpshufd $imm8,[xyz]mm/mem,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_F3(0x0f, 0x70): /* vpshufhw $imm8,[xyz]mm/mem,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_F2(0x0f, 0x70): /* vpshuflw $imm8,[xyz]mm/mem,[xyz]mm{k} */
+        if ( evex.pfx == vex_66 )
+            generate_exception_if(evex.w, EXC_UD);
+        else
+        {
+            host_and_vcpu_must_have(avx512bw);
+            generate_exception_if(evex.br, EXC_UD);
+        }
+        d = (d & ~SrcMask) | SrcMem | TwoOp;
+        op_bytes = 16 << evex.lr;
+        goto avx512f_imm_no_sae;
+
     CASE_SIMD_PACKED_INT(0x0f, 0x71):    /* Grp12 */
     case X86EMUL_OPC_VEX_66(0x0f, 0x71):
     CASE_SIMD_PACKED_INT(0x0f, 0x72):    /* Grp13 */
@@ -9104,7 +9122,13 @@ x86_emulate(
                                             /* vextracti64x2 $imm8,{y,z}mm,xmm/m128{k} */
         if ( evex.w )
             host_and_vcpu_must_have(avx512dq);
-        generate_exception_if(!evex.lr || evex.br, EXC_UD);
+        generate_exception_if(evex.br, EXC_UD);
+        /* fall through */
+    case X86EMUL_OPC_EVEX_66(0x0f3a, 0x23): /* vshuff32x4 $imm8,{y,z}mm/mem,{y,z}mm,{y,z}mm{k} */
+                                            /* vshuff64x2 $imm8,{y,z}mm/mem,{y,z}mm,{y,z}mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f3a, 0x43): /* vshufi32x4 $imm8,{y,z}mm/mem,{y,z}mm,{y,z}mm{k} */
+                                            /* vshufi64x2 $imm8,{y,z}mm/mem,{y,z}mm,{y,z}mm{k} */
+        generate_exception_if(!evex.lr, EXC_UD);
         fault_suppression = false;
         goto avx512f_imm_no_sae;
 




_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

^ permalink raw reply	[flat|nested] 465+ messages in thread

* [PATCH v4 32/44] x86emul: support AVX512{BW, DQ} mask move insns
  2018-09-25 13:14 ` [PATCH v4 00/44] x86emul: fixes, improvements, and beginnings of AVX512 support Jan Beulich
                     ` (30 preceding siblings ...)
  2018-09-25 13:46   ` [PATCH v4 31/44] x86emul: support AVX512{F, BW} integer shuffle insns Jan Beulich
@ 2018-09-25 13:46   ` Jan Beulich
  2018-09-25 13:47   ` [PATCH v4 33/44] x86emul: basic AVX512BW testing Jan Beulich
                     ` (11 subsequent siblings)
  43 siblings, 0 replies; 465+ messages in thread
From: Jan Beulich @ 2018-09-25 13:46 UTC (permalink / raw)
  To: xen-devel; +Cc: George Dunlap, Andrew Cooper, Wei Liu

Entries are added to the tables in evex-disp8.c even though these insns
do not allow for memory operands, so that the tables end up giving a
complete picture of the supported EVEX-encoded insns.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v3: New.

--- a/tools/tests/x86_emulator/evex-disp8.c
+++ b/tools/tests/x86_emulator/evex-disp8.c
@@ -303,9 +303,12 @@ static const struct test avx512bw_all[]
     INSN(pminsw,      66,   0f, ea,    vl,    w, vl),
     INSN(pminub,      66,   0f, da,    vl,    b, vl),
     INSN(pminuw,      66, 0f38, 3a,    vl,    w, vl),
+//       pmovb2m,     f3, 0f38, 29,           b
+//       pmovm2,      f3, 0f38, 28,          bw
     INSN(pmovswb,     f3, 0f38, 20,    vl_2,  b, vl),
     INSN(pmovsxbw,    66, 0f38, 20,    vl_2,  b, vl),
     INSN(pmovuswb,    f3, 0f38, 10,    vl_2,  b, vl),
+//       pmovw2m,     f3, 0f38, 29,           w
     INSN(pmovwb,      f3, 0f38, 30,    vl_2,  b, vl),
     INSN(pmovzxbw,    66, 0f38, 30,    vl_2,  b, vl),
     INSN(pmulhuw,     66,   0f, e4,    vl,    w, vl),
@@ -353,6 +356,9 @@ static const struct test avx512dq_all[]
     INSN_PFP(andn,             0f, 55),
     INSN(broadcasti32x2, 66, 0f38, 59, el_2,  d, vl),
     INSN_PFP(or,               0f, 56),
+//       pmovd2m,        f3, 0f38, 39,        d
+//       pmovm2,         f3, 0f38, 38,       dq
+//       pmovq2m,        f3, 0f38, 39,        q
     INSN(pmullq,         66, 0f38, 40,   vl,  q, vl),
     INSN_PFP(xor,              0f, 57),
 };
--- a/tools/tests/x86_emulator/opmask.S
+++ b/tools/tests/x86_emulator/opmask.S
@@ -12,17 +12,23 @@
 
 #if SIZE == 1
 # define _(x) x##b
+# define _v(x, t) _v_(x##q, t)
 #elif SIZE == 2
 # define _(x) x##w
+# define _v(x, t) _v_(x##d, t)
 # define WIDEN(x) x##bw
 #elif SIZE == 4
 # define _(x) x##d
+# define _v(x, t) _v_(x##w, t)
 # define WIDEN(x) x##wd
 #elif SIZE == 8
 # define _(x) x##q
+# define _v(x, t) _v_(x##b, t)
 # define WIDEN(x) x##dq
 #endif
 
+#define _v_(x, t) v##x##t
+
     .macro check res1:req, res2:req, line:req
     _(kmov)       %\res1, DATA(out)
 #if SIZE < 8 || !defined(__i386__)
@@ -131,6 +137,15 @@ _start:
 
 #endif
 
+#if SIZE > 2 ? defined(__AVX512BW__) : defined(__AVX512DQ__)
+
+    _(kmov)       DATA(in1), %k0
+    _v(pmovm2,)   %k0, %zmm7
+    _v(pmov,2m)   %zmm7, %k3
+    check         k0, k3, __LINE__
+
+#endif
+
     xor           %eax, %eax
     ret
 
--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -8422,6 +8422,21 @@ x86_emulate(
         elem_bytes = (b & 7) < 3 ? 1 : (b & 7) != 5 ? 2 : 4;
         goto avx512f_no_sae;
 
+    case X86EMUL_OPC_EVEX_F3(0x0f38, 0x29): /* vpmov{b,w}2m [xyz]mm,k */
+    case X86EMUL_OPC_EVEX_F3(0x0f38, 0x39): /* vpmov{d,q}2m [xyz]mm,k */
+        generate_exception_if(!evex.r || !evex.R, EXC_UD);
+        /* fall through */
+    case X86EMUL_OPC_EVEX_F3(0x0f38, 0x28): /* vpmovm2{b,w} k,[xyz]mm */
+    case X86EMUL_OPC_EVEX_F3(0x0f38, 0x38): /* vpmovm2{d,q} k,[xyz]mm */
+        if ( b & 0x10 )
+            host_and_vcpu_must_have(avx512dq);
+        else
+            host_and_vcpu_must_have(avx512bw);
+        generate_exception_if(evex.opmsk || ea.type != OP_REG, EXC_UD);
+        d |= TwoOp;
+        op_bytes = 16 << evex.lr;
+        goto avx512f_no_sae;
+
     case X86EMUL_OPC_66(0x0f38, 0x2a):     /* movntdqa m128,xmm */
     case X86EMUL_OPC_VEX_66(0x0f38, 0x2a): /* vmovntdqa mem,{x,y}mm */
         generate_exception_if(ea.type != OP_MEM, EXC_UD);





_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

^ permalink raw reply	[flat|nested] 465+ messages in thread

* [PATCH v4 33/44] x86emul: basic AVX512BW testing
  2018-09-25 13:14 ` [PATCH v4 00/44] x86emul: fixes, improvements, and beginnings of AVX512 support Jan Beulich
                     ` (31 preceding siblings ...)
  2018-09-25 13:46   ` [PATCH v4 32/44] x86emul: support AVX512{BW, DQ} mask move insns Jan Beulich
@ 2018-09-25 13:47   ` Jan Beulich
  2018-09-25 13:48   ` [PATCH v4 34/44] x86emul: basic AVX512DQ testing Jan Beulich
                     ` (10 subsequent siblings)
  43 siblings, 0 replies; 465+ messages in thread
From: Jan Beulich @ 2018-09-25 13:47 UTC (permalink / raw)
  To: xen-devel; +Cc: George Dunlap, Andrew Cooper, Wei Liu

Test a number of the insns which have already been implemented.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v4: Add __AVX512VL__ conditional around majority of OVR() additions.
    Correct eq() for 1- and 2-byte cases.
v3: New.

--- a/tools/tests/x86_emulator/Makefile
+++ b/tools/tests/x86_emulator/Makefile
@@ -13,7 +13,7 @@ all: $(TARGET)
 run: $(TARGET)
 	./$(TARGET)
 
-SIMD := 3dnow sse sse2 sse4 avx avx2 xop avx512f
+SIMD := 3dnow sse sse2 sse4 avx avx2 xop avx512f avx512bw
 FMA := fma4 fma
 SG := avx2-sg
 TESTCASES := blowfish $(SIMD) $(FMA) $(SG)
@@ -63,6 +63,9 @@ xop-flts := $(avx-flts)
 avx512f-vecs := 64 16 32
 avx512f-ints := 4 8
 avx512f-flts := 4 8
+avx512bw-vecs := $(avx512f-vecs)
+avx512bw-ints := 1 2
+avx512bw-flts :=
 
 avx512f-opmask-vecs := 2
 avx512dq-opmask-vecs := 1
--- a/tools/tests/x86_emulator/simd.c
+++ b/tools/tests/x86_emulator/simd.c
@@ -31,6 +31,10 @@ ENTRY(simd_test);
 #  define eq(x, y) ((BR(cmpps, _mask, x, y, 0, -1) & ALL_TRUE) == ALL_TRUE)
 # elif FLOAT_SIZE == 8
 #  define eq(x, y) (BR(cmppd, _mask, x, y, 0, -1) == ALL_TRUE)
+# elif (INT_SIZE == 1 || UINT_SIZE == 1) && defined(__AVX512BW__)
+#  define eq(x, y) (B(pcmpeqb, _mask, (vqi_t)(x), (vqi_t)(y), -1) == ALL_TRUE)
+# elif (INT_SIZE == 2 || UINT_SIZE == 2) && defined(__AVX512BW__)
+#  define eq(x, y) (B(pcmpeqw, _mask, (vhi_t)(x), (vhi_t)(y), -1) == ALL_TRUE)
 # elif INT_SIZE == 4 || UINT_SIZE == 4
 #  define eq(x, y) (B(pcmpeqd, _mask, (vsi_t)(x), (vsi_t)(y), -1) == ALL_TRUE)
 # elif INT_SIZE == 8 || UINT_SIZE == 8
@@ -368,6 +372,87 @@ static inline bool _to_bool(byte_vec_t b
 #  define max(x, y) ((vec_t)B(pmaxuq, _mask, (vdi_t)(x), (vdi_t)(y), (vdi_t)undef(), ~0))
 #  define min(x, y) ((vec_t)B(pminuq, _mask, (vdi_t)(x), (vdi_t)(y), (vdi_t)undef(), ~0))
 # endif
+#elif (INT_SIZE == 1 || UINT_SIZE == 1 || INT_SIZE == 2 || UINT_SIZE == 2) && \
+      defined(__AVX512BW__) && (VEC_SIZE == 64 || defined(__AVX512VL__))
+# if INT_SIZE == 1 || UINT_SIZE == 1
+#  define broadcast(x) ({ \
+    vec_t t_; \
+    asm ( "%{evex%} vpbroadcastb %1, %0" \
+          : "=v" (t_) : "m" (*(char[1]){ x }) ); \
+    t_; \
+})
+#  define broadcast2(x) ({ \
+    vec_t t_; \
+    asm ( "vpbroadcastb %k1, %0" : "=v" (t_) : "r" (x) ); \
+    t_; \
+})
+#  if VEC_SIZE == 16
+#   define interleave_hi(x, y) ((vec_t)B(punpckhbw, _mask, (vqi_t)(x), (vqi_t)(y), (vqi_t)undef(), ~0))
+#   define interleave_lo(x, y) ((vec_t)B(punpcklbw, _mask, (vqi_t)(x), (vqi_t)(y), (vqi_t)undef(), ~0))
+#   define swap(x) ((vec_t)B(pshufb, _mask, (vqi_t)(x), (vqi_t)(inv - 1), (vqi_t)undef(), ~0))
+#  elif defined(__AVX512VBMI__)
+#   define interleave_hi(x, y) ((vec_t)B(vpermi2varqi, _mask, (vqi_t)(x), interleave_hi, (vqi_t)(y), ~0))
+#   define interleave_lo(x, y) ((vec_t)B(vpermt2varqi, _mask, interleave_lo, (vqi_t)(x), (vqi_t)(y), ~0))
+#  endif
+#  define mix(x, y) ((vec_t)B(movdquqi, _mask, (vqi_t)(x), (vqi_t)(y), \
+                              (0b0101010101010101010101010101010101010101010101010101010101010101LL & ALL_TRUE)))
+#  define shrink1(x) ((half_t)B(pmovwb, _mask, (vhi_t)(x), (vqi_half_t){}, ~0))
+#  define shrink2(x) ((quarter_t)B(pmovdb, _mask, (vsi_t)(x), (vqi_quarter_t){}, ~0))
+#  define shrink3(x) ((eighth_t)B(pmovqb, _mask, (vdi_t)(x), (vqi_eighth_t){}, ~0))
+# elif INT_SIZE == 2 || UINT_SIZE == 2
+#  define broadcast(x) ({ \
+    vec_t t_; \
+    asm ( "%{evex%} vpbroadcastw %1, %0" \
+          : "=v" (t_) : "m" (*(short[1]){ x }) ); \
+    t_; \
+})
+#  define broadcast2(x) ({ \
+    vec_t t_; \
+    asm ( "vpbroadcastw %k1, %0" : "=v" (t_) : "r" (x) ); \
+    t_; \
+})
+#  if VEC_SIZE == 16
+#   define interleave_hi(x, y) ((vec_t)B(punpckhwd, _mask, (vhi_t)(x), (vhi_t)(y), (vhi_t)undef(), ~0))
+#   define interleave_lo(x, y) ((vec_t)B(punpcklwd, _mask, (vhi_t)(x), (vhi_t)(y), (vhi_t)undef(), ~0))
+#   define swap(x) ((vec_t)B(pshufd, _mask, \
+                             (vsi_t)B(pshufhw, _mask, \
+                                      B(pshuflw, _mask, (vhi_t)(x), 0b00011011, (vhi_t)undef(), ~0), \
+                                      0b00011011, (vhi_t)undef(), ~0), \
+                             0b01001110, (vsi_t)undef(), ~0))
+#  else
+#   define interleave_hi(x, y) ((vec_t)B(vpermi2varhi, _mask, (vhi_t)(x), interleave_hi, (vhi_t)(y), ~0))
+#   define interleave_lo(x, y) ((vec_t)B(vpermt2varhi, _mask, interleave_lo, (vhi_t)(x), (vhi_t)(y), ~0))
+#  endif
+#  define mix(x, y) ((vec_t)B(movdquhi, _mask, (vhi_t)(x), (vhi_t)(y), \
+                              (0b01010101010101010101010101010101 & ALL_TRUE)))
+#  define shrink1(x) ((half_t)B(pmovdw, _mask, (vsi_t)(x), (vhi_half_t){}, ~0))
+#  define shrink2(x) ((quarter_t)B(pmovqw, _mask, (vdi_t)(x), (vhi_quarter_t){}, ~0))
+# endif
+# if INT_SIZE == 1
+#  define max(x, y) ((vec_t)B(pmaxsb, _mask, (vqi_t)(x), (vqi_t)(y), (vqi_t)undef(), ~0))
+#  define min(x, y) ((vec_t)B(pminsb, _mask, (vqi_t)(x), (vqi_t)(y), (vqi_t)undef(), ~0))
+#  define widen1(x) ((vec_t)B(pmovsxbw, _mask, (vqi_half_t)(x), (vhi_t)undef(), ~0))
+#  define widen2(x) ((vec_t)B(pmovsxbd, _mask, (vqi_quarter_t)(x), (vsi_t)undef(), ~0))
+#  define widen3(x) ((vec_t)B(pmovsxbq, _mask, (vqi_eighth_t)(x), (vdi_t)undef(), ~0))
+# elif UINT_SIZE == 1
+#  define max(x, y) ((vec_t)B(pmaxub, _mask, (vqi_t)(x), (vqi_t)(y), (vqi_t)undef(), ~0))
+#  define min(x, y) ((vec_t)B(pminub, _mask, (vqi_t)(x), (vqi_t)(y), (vqi_t)undef(), ~0))
+#  define widen1(x) ((vec_t)B(pmovzxbw, _mask, (vqi_half_t)(x), (vhi_t)undef(), ~0))
+#  define widen2(x) ((vec_t)B(pmovzxbd, _mask, (vqi_quarter_t)(x), (vsi_t)undef(), ~0))
+#  define widen3(x) ((vec_t)B(pmovzxbq, _mask, (vqi_eighth_t)(x), (vdi_t)undef(), ~0))
+# elif INT_SIZE == 2
+#  define max(x, y) B(pmaxsw, _mask, x, y, undef(), ~0)
+#  define min(x, y) B(pminsw, _mask, x, y, undef(), ~0)
+#  define mul_hi(x, y) B(pmulhw, _mask, x, y, undef(), ~0)
+#  define widen1(x) ((vec_t)B(pmovsxwd, _mask, x, (vsi_t)undef(), ~0))
+#  define widen2(x) ((vec_t)B(pmovsxwq, _mask, x, (vdi_t)undef(), ~0))
+# elif UINT_SIZE == 2
+#  define max(x, y) ((vec_t)B(pmaxuw, _mask, (vhi_t)(x), (vhi_t)(y), (vhi_t)undef(), ~0))
+#  define min(x, y) ((vec_t)B(pminuw, _mask, (vhi_t)(x), (vhi_t)(y), (vhi_t)undef(), ~0))
+#  define mul_hi(x, y) ((vec_t)B(pmulhuw, _mask, (vhi_t)(x), (vhi_t)(y), (vhi_t)undef(), ~0))
+#  define widen1(x) ((vec_t)B(pmovzxwd, _mask, (vhi_half_t)(x), (vsi_t)undef(), ~0))
+#  define widen2(x) ((vec_t)B(pmovzxwq, _mask, (vhi_quarter_t)(x), (vdi_t)undef(), ~0))
+# endif
 #elif VEC_SIZE == 16 && defined(__SSE2__)
 # if INT_SIZE == 1 || UINT_SIZE == 1
 #  define interleave_hi(x, y) ((vec_t)__builtin_ia32_punpckhbw128((vqi_t)(x), (vqi_t)(y)))
@@ -559,7 +644,7 @@ static inline bool _to_bool(byte_vec_t b
 #  endif
 # endif
 #endif
-#if VEC_SIZE == 16 && defined(__SSSE3__)
+#if VEC_SIZE == 16 && defined(__SSSE3__) && !defined(__AVX512VL__)
 # if INT_SIZE == 1
 #  define abs(x) ((vec_t)__builtin_ia32_pabsb128((vqi_t)(x)))
 # elif INT_SIZE == 2
@@ -783,6 +868,40 @@ static inline half_t low_half(vec_t x)
 }
 # endif
 
+# if !defined(low_quarter) && defined(QUARTER_SIZE)
+static inline quarter_t low_quarter(vec_t x)
+{
+#  if QUARTER_SIZE < VEC_SIZE
+    quarter_t y;
+    unsigned int i;
+
+    for ( i = 0; i < ELEM_COUNT / 4; ++i )
+        y[i] = x[i];
+
+    return y;
+#  else
+    return x;
+#  endif
+}
+# endif
+
+# if !defined(low_eighth) && defined(EIGHTH_SIZE)
+static inline eighth_t low_eighth(vec_t x)
+{
+#  if EIGHTH_SIZE < VEC_SIZE
+    eighth_t y;
+    unsigned int i;
+
+    for ( i = 0; i < ELEM_COUNT / 4; ++i )
+        y[i] = x[i];
+
+    return y;
+#  else
+    return x;
+#  endif
+}
+# endif
+
 #endif
 
 #if defined(__AVX512F__) && defined(FLOAT_SIZE)
@@ -1111,7 +1230,7 @@ int simd_test(void)
     y = interleave_lo(alt < 0, alt < 0);
     y = interleave_lo(z, y);
     touch(x);
-    z = widen2(x);
+    z = widen2(low_quarter(x));
     touch(x);
     if ( !eq(z, y) ) return __LINE__;
 
@@ -1120,7 +1239,7 @@ int simd_test(void)
     y = interleave_lo(y, y);
     y = interleave_lo(z, y);
     touch(x);
-    z = widen3(x);
+    z = widen3(low_eighth(x));
     touch(x);
     if ( !eq(z, y) ) return __LINE__;
 #  endif
@@ -1142,14 +1261,14 @@ int simd_test(void)
 
 # ifdef widen2
     touch(src);
-    x = widen2(src);
+    x = widen2(low_quarter(src));
     touch(src);
     if ( !eq(x, z) ) return __LINE__;
 # endif
 
 # ifdef widen3
     touch(src);
-    x = widen3(src);
+    x = widen3(low_eighth(src));
     touch(src);
     if ( !eq(x, interleave_lo(z, (vec_t){})) ) return __LINE__;
 # endif
@@ -1169,6 +1288,36 @@ int simd_test(void)
             if ( aux2[i] != src[i] )
                 return __LINE__;
     }
+#endif
+
+#if defined(widen2) && defined(shrink2)
+    {
+        quarter_t aux1 = low_quarter(src), aux2;
+
+        touch(aux1);
+        x = widen2(aux1);
+        touch(x);
+        aux2 = shrink2(x);
+        touch(aux2);
+        for ( i = 0; i < ELEM_COUNT / 4; ++i )
+            if ( aux2[i] != src[i] )
+                return __LINE__;
+    }
+#endif
+
+#if defined(widen3) && defined(shrink3)
+    {
+        eighth_t aux1 = low_eighth(src), aux2;
+
+        touch(aux1);
+        x = widen3(aux1);
+        touch(x);
+        aux2 = shrink3(x);
+        touch(aux2);
+        for ( i = 0; i < ELEM_COUNT / 8; ++i )
+            if ( aux2[i] != src[i] )
+                return __LINE__;
+    }
 #endif
 
 #ifdef dup_lo
--- a/tools/tests/x86_emulator/simd.h
+++ b/tools/tests/x86_emulator/simd.h
@@ -95,6 +95,32 @@ typedef int __attribute__((vector_size(H
 typedef long long __attribute__((vector_size(HALF_SIZE))) vdi_half_t;
 # endif
 
+# if ELEM_COUNT >= 4
+#  if VEC_SIZE > 64
+#   define QUARTER_SIZE (VEC_SIZE / 4)
+#  else
+#   define QUARTER_SIZE 16
+#  endif
+typedef typeof((vec_t){}[0]) __attribute__((vector_size(QUARTER_SIZE))) quarter_t;
+typedef char __attribute__((vector_size(QUARTER_SIZE))) vqi_quarter_t;
+typedef short __attribute__((vector_size(QUARTER_SIZE))) vhi_quarter_t;
+typedef int __attribute__((vector_size(QUARTER_SIZE))) vsi_quarter_t;
+typedef long long __attribute__((vector_size(QUARTER_SIZE))) vdi_quarter_t;
+# endif
+
+# if ELEM_COUNT >= 8
+#  if VEC_SIZE > 128
+#   define EIGHTH_SIZE (VEC_SIZE / 8)
+#  else
+#   define EIGHTH_SIZE 16
+#  endif
+typedef typeof((vec_t){}[0]) __attribute__((vector_size(EIGHTH_SIZE))) eighth_t;
+typedef char __attribute__((vector_size(EIGHTH_SIZE))) vqi_eighth_t;
+typedef short __attribute__((vector_size(EIGHTH_SIZE))) vhi_eighth_t;
+typedef int __attribute__((vector_size(EIGHTH_SIZE))) vsi_eighth_t;
+typedef long long __attribute__((vector_size(EIGHTH_SIZE))) vdi_eighth_t;
+# endif
+
 #endif
 
 #if VEC_SIZE == 16
@@ -182,6 +208,9 @@ OVR_SFP(broadcast);
 OVR_SFP(comi);
 OVR_FP(add);
 OVR_INT(add);
+OVR_BW(adds);
+OVR_BW(addus);
+OVR_BW(avg);
 OVR_FP(div);
 OVR(extractps);
 OVR_FMA(fmadd, FP);
@@ -214,6 +243,8 @@ OVR_INT(srl);
 OVR_DQ(srlv);
 OVR_FP(sub);
 OVR_INT(sub);
+OVR_BW(subs);
+OVR_BW(subus);
 OVR_SFP(ucomi);
 OVR_VFP(unpckh);
 OVR_VFP(unpckl);
@@ -275,6 +306,31 @@ OVR(punpckldq);
 OVR(punpcklqdq);
 #endif
 
+#ifdef __AVX512BW__
+OVR(pextrb);
+OVR(pextrw);
+OVR(pinsrb);
+OVR(pinsrw);
+# ifdef __AVX512VL__
+OVR(pmaddwd);
+OVR(pmovsxbw);
+OVR(pmovzxbw);
+OVR(pmulhuw);
+OVR(pmulhw);
+OVR(pmullw);
+OVR(psadbw);
+OVR(pshufb);
+OVR(pshufhw);
+OVR(pshuflw);
+OVR(punpckhbw);
+OVR(punpckhwd);
+OVR(punpcklbw);
+OVR(punpcklwd);
+OVR(slldq);
+OVR(srldq);
+# endif
+#endif
+
 #undef OVR_VFP
 #undef OVR_SFP
 #undef OVR_INT
--- a/tools/tests/x86_emulator/test_x86_emulator.c
+++ b/tools/tests/x86_emulator/test_x86_emulator.c
@@ -22,6 +22,7 @@ asm ( ".pushsection .test, \"ax\", @prog
 #include "avx512dq-opmask.h"
 #include "avx512bw-opmask.h"
 #include "avx512f.h"
+#include "avx512bw.h"
 
 #define verbose false /* Switch to true for far more logging. */
 
@@ -105,6 +106,11 @@ static bool simd_check_avx512bw(void)
 }
 #define simd_check_avx512bw_opmask simd_check_avx512bw
 
+static bool simd_check_avx512bw_vl(void)
+{
+    return cpu_has_avx512bw && cpu_has_avx512vl;
+}
+
 static void simd_set_regs(struct cpu_user_regs *regs)
 {
     if ( cpu_has_mmx )
@@ -284,6 +290,18 @@ static const struct {
     AVX512VL(VL u64x2,        avx512f,      16u8),
     AVX512VL(VL s64x4,        avx512f,      32i8),
     AVX512VL(VL u64x4,        avx512f,      32u8),
+    SIMD(AVX512BW s8x64,     avx512bw,      64i1),
+    SIMD(AVX512BW u8x64,     avx512bw,      64u1),
+    SIMD(AVX512BW s16x32,    avx512bw,      64i2),
+    SIMD(AVX512BW u16x32,    avx512bw,      64u2),
+    AVX512VL(BW+VL s8x16,    avx512bw,      16i1),
+    AVX512VL(BW+VL u8x16,    avx512bw,      16u1),
+    AVX512VL(BW+VL s8x32,    avx512bw,      32i1),
+    AVX512VL(BW+VL u8x32,    avx512bw,      32u1),
+    AVX512VL(BW+VL s16x8,    avx512bw,      16i2),
+    AVX512VL(BW+VL u16x8,    avx512bw,      16u2),
+    AVX512VL(BW+VL s16x16,   avx512bw,      32i2),
+    AVX512VL(BW+VL u16x16,   avx512bw,      32u2),
 #undef AVX512VL_
 #undef AVX512VL
 #undef SIMD_




_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

^ permalink raw reply	[flat|nested] 465+ messages in thread

* [PATCH v4 34/44] x86emul: basic AVX512DQ testing
  2018-09-25 13:14 ` [PATCH v4 00/44] x86emul: fixes, improvements, and beginnings of AVX512 support Jan Beulich
                     ` (32 preceding siblings ...)
  2018-09-25 13:47   ` [PATCH v4 33/44] x86emul: basic AVX512BW testing Jan Beulich
@ 2018-09-25 13:48   ` Jan Beulich
  2018-09-25 13:48   ` [PATCH v4 35/44] x86emul: support AVX512F move high/low insns Jan Beulich
                     ` (9 subsequent siblings)
  43 siblings, 0 replies; 465+ messages in thread
From: Jan Beulich @ 2018-09-25 13:48 UTC (permalink / raw)
  To: xen-devel; +Cc: George Dunlap, Andrew Cooper, Wei Liu

Test a number of the insns which have already been implemented.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v4: Wrap OVR(pmullq) in __AVX512VL__ conditional.
v3: New.

--- a/tools/tests/x86_emulator/Makefile
+++ b/tools/tests/x86_emulator/Makefile
@@ -13,7 +13,7 @@ all: $(TARGET)
 run: $(TARGET)
 	./$(TARGET)
 
-SIMD := 3dnow sse sse2 sse4 avx avx2 xop avx512f avx512bw
+SIMD := 3dnow sse sse2 sse4 avx avx2 xop avx512f avx512bw avx512dq
 FMA := fma4 fma
 SG := avx2-sg
 TESTCASES := blowfish $(SIMD) $(FMA) $(SG)
@@ -66,9 +66,12 @@ avx512f-flts := 4 8
 avx512bw-vecs := $(avx512f-vecs)
 avx512bw-ints := 1 2
 avx512bw-flts :=
+avx512dq-vecs := $(avx512f-vecs)
+avx512dq-ints := $(avx512f-ints)
+avx512dq-flts := $(avx512f-flts)
 
 avx512f-opmask-vecs := 2
-avx512dq-opmask-vecs := 1
+avx512dq-opmask-vecs := 1 2
 avx512bw-opmask-vecs := 4 8
 
 # For AVX and later, have the compiler avoid XMM0 to widen coverage of
--- a/tools/tests/x86_emulator/simd.h
+++ b/tools/tests/x86_emulator/simd.h
@@ -121,6 +121,34 @@ typedef int __attribute__((vector_size(E
 typedef long long __attribute__((vector_size(EIGHTH_SIZE))) vdi_eighth_t;
 # endif
 
+# define DECL_PAIR(w) \
+typedef w ## _t pair_t; \
+typedef vsi_ ## w ## _t vsi_pair_t; \
+typedef vdi_ ## w ## _t vdi_pair_t
+# define DECL_QUARTET(w) \
+typedef w ## _t quartet_t; \
+typedef vsi_ ## w ## _t vsi_quartet_t; \
+typedef vdi_ ## w ## _t vdi_quartet_t
+# define DECL_OCTET(w) \
+typedef w ## _t octet_t; \
+typedef vsi_ ## w ## _t vsi_octet_t; \
+typedef vdi_ ## w ## _t vdi_octet_t
+
+# if ELEM_COUNT == 4
+DECL_PAIR(half);
+# elif ELEM_COUNT == 8
+DECL_PAIR(quarter);
+DECL_QUARTET(half);
+# elif ELEM_COUNT == 16
+DECL_PAIR(eighth);
+DECL_QUARTET(quarter);
+DECL_OCTET(half);
+# endif
+
+# undef DECL_OCTET
+# undef DECL_QUARTET
+# undef DECL_PAIR
+
 #endif
 
 #if VEC_SIZE == 16
@@ -146,6 +174,14 @@ typedef long long __attribute__((vector_
 #ifdef __AVX512F__
 
 /* Sadly there are a few exceptions to the general naming rules. */
+#define __builtin_ia32_broadcastf32x4_512_mask __builtin_ia32_broadcastf32x4_512
+#define __builtin_ia32_broadcasti32x4_512_mask __builtin_ia32_broadcasti32x4_512
+#define __builtin_ia32_insertf32x4_512_mask __builtin_ia32_insertf32x4_mask
+#define __builtin_ia32_insertf32x8_512_mask __builtin_ia32_insertf32x8_mask
+#define __builtin_ia32_insertf64x4_512_mask __builtin_ia32_insertf64x4_mask
+#define __builtin_ia32_inserti32x4_512_mask __builtin_ia32_inserti32x4_mask
+#define __builtin_ia32_inserti32x8_512_mask __builtin_ia32_inserti32x8_mask
+#define __builtin_ia32_inserti64x4_512_mask __builtin_ia32_inserti64x4_mask
 #define __builtin_ia32_shuf_f32x4_512_mask __builtin_ia32_shuf_f32x4_mask
 #define __builtin_ia32_shuf_f64x2_512_mask __builtin_ia32_shuf_f64x2_mask
 #define __builtin_ia32_shuf_i32x4_512_mask __builtin_ia32_shuf_i32x4_mask
@@ -331,6 +367,20 @@ OVR(srldq);
 # endif
 #endif
 
+#ifdef __AVX512DQ__
+OVR_VFP(and);
+OVR_VFP(andn);
+OVR_VFP(or);
+OVR(pextrd);
+OVR(pextrq);
+OVR(pinsrd);
+OVR(pinsrq);
+# ifdef __AVX512VL__
+OVR(pmullq);
+# endif
+OVR_VFP(xor);
+#endif
+
 #undef OVR_VFP
 #undef OVR_SFP
 #undef OVR_INT
--- a/tools/tests/x86_emulator/simd.c
+++ b/tools/tests/x86_emulator/simd.c
@@ -134,6 +134,27 @@ static inline bool _to_bool(byte_vec_t b
 #elif defined(FLOAT_SIZE) && defined(__AVX512F__) && \
       (VEC_SIZE == 64 || defined(__AVX512VL__))
 # if VEC_SIZE > FLOAT_SIZE
+#  if ELEM_COUNT == 8 /* vextractf{32,64}x4 */ || \
+       (ELEM_COUNT == 16 && ELEM_SIZE == 4 && defined(__AVX512DQ__)) /* vextractf32x8 */ || \
+       (ELEM_COUNT == 4 && ELEM_SIZE == 8 && defined(__AVX512DQ__)) /* vextractf64x2 */
+#   define low_half(x) ({ \
+    half_t t_; \
+    asm ( "vextractf%c[w]x%c[n] $0, %[s], %[d]" \
+          : [d] "=m" (t_) \
+          : [s] "v" (x), [w] "i" (ELEM_SIZE * 8), [n] "i" (ELEM_COUNT / 2) ); \
+    t_; \
+})
+#  endif
+#  if (ELEM_COUNT == 16 && ELEM_SIZE == 4) /* vextractf32x4 */ || \
+       (ELEM_COUNT == 8 && ELEM_SIZE == 8 && defined(__AVX512DQ__)) /* vextractf64x2 */
+#   define low_quarter(x) ({ \
+    quarter_t t_; \
+    asm ( "vextractf%c[w]x%c[n] $0, %[s], %[d]" \
+          : [d] "=m" (t_) \
+          : [s] "v" (x), [w] "i" (ELEM_SIZE * 8), [n] "i" (ELEM_COUNT / 4) ); \
+    t_; \
+})
+#  endif
 #  if FLOAT_SIZE == 4
 #   define broadcast(x) ({ \
     vec_t t_; \
@@ -141,6 +162,17 @@ static inline bool _to_bool(byte_vec_t b
           : "=v" (t_) : "m" (*(float[1]){ x }) ); \
     t_; \
 })
+#   if VEC_SIZE >= 32 && defined(__AVX512DQ__)
+#    define broadcast_pair(x) ({ \
+    vec_t t_; \
+    asm ( "vbroadcastf32x2 %1, %0" : "=v" (t_) : "m" (x) ); \
+    t_; \
+})
+#   endif
+#   if VEC_SIZE == 64 && defined(__AVX512DQ__)
+#    define broadcast_octet(x) B(broadcastf32x8_, _mask, x, undef(), ~0)
+#    define insert_octet(x, y, p) B(insertf32x8_, _mask, x, y, p, undef(), ~0)
+#   endif
 #   define max(x, y) BR_(maxps, _mask, x, y, undef(), ~0)
 #   define min(x, y) BR_(minps, _mask, x, y, undef(), ~0)
 #   define mix(x, y) B(movaps, _mask, x, y, (0b0101010101010101 & ALL_TRUE))
@@ -149,6 +181,13 @@ static inline bool _to_bool(byte_vec_t b
 #    define interleave_lo(x, y) B(unpcklps, _mask, x, y, undef(), ~0)
 #    define swap(x) B(shufps, _mask, x, x, 0b00011011, undef(), ~0)
 #   else
+#    define broadcast_quartet(x) B(broadcastf32x4_, _mask, x, undef(), ~0)
+#    define insert_pair(x, y, p) \
+    B(insertf32x4_, _mask, x, \
+      /* Cast needed below to work around gcc 7.x quirk. */ \
+      (p) & 1 ? (typeof(y))__builtin_ia32_shufps(y, y, 0b01000100) : (y), \
+      (p) >> 1, x, 3 << ((p) * 2))
+#    define insert_quartet(x, y, p) B(insertf32x4_, _mask, x, y, p, undef(), ~0)
 #    define interleave_hi(x, y) B(vpermi2varps, _mask, x, interleave_hi, y, ~0)
 #    define interleave_lo(x, y) B(vpermt2varps, _mask, interleave_lo, x, y, ~0)
 #    define swap(x) ({ \
@@ -172,6 +211,14 @@ static inline bool _to_bool(byte_vec_t b
     t_; \
 })
 #   endif
+#   if VEC_SIZE >= 32 && defined(__AVX512DQ__)
+#    define broadcast_pair(x) B(broadcastf64x2_, _mask, x, undef(), ~0)
+#    define insert_pair(x, y, p) B(insertf64x2_, _mask, x, y, p, undef(), ~0)
+#   endif
+#   if VEC_SIZE == 64
+#    define broadcast_quartet(x) B(broadcastf64x4_, , x, undef(), ~0)
+#    define insert_quartet(x, y, p) B(insertf64x4_, _mask, x, y, p, undef(), ~0)
+#   endif
 #   define max(x, y) BR_(maxpd, _mask, x, y, undef(), ~0)
 #   define min(x, y) BR_(minpd, _mask, x, y, undef(), ~0)
 #   define mix(x, y) B(movapd, _mask, x, y, 0b01010101)
@@ -300,6 +347,16 @@ static inline bool _to_bool(byte_vec_t b
     t_; \
 })
 # endif
+# if (ELEM_COUNT == 16 && ELEM_SIZE == 4) /* vextracti32x4 */ || \
+       (ELEM_COUNT == 8 && ELEM_SIZE == 8 && defined(__AVX512DQ__)) /* vextracti64x2 */
+#  define low_quarter(x) ({ \
+    quarter_t t_; \
+    asm ( "vextracti%c[w]x%c[n] $0, %[s], %[d]" \
+          : [d] "=m" (t_) \
+          : [s] "v" (x), [w] "i" (ELEM_SIZE * 8), [n] "i" (ELEM_COUNT / 4) ); \
+    t_; \
+})
+# endif
 # if INT_SIZE == 4 || UINT_SIZE == 4
 #  define broadcast(x) ({ \
     vec_t t_; \
@@ -312,11 +369,30 @@ static inline bool _to_bool(byte_vec_t b
     asm ( "vpbroadcastd %k1, %0" : "=v" (t_) : "r" (x) ); \
     t_; \
 })
+#  ifdef __AVX512DQ__
+#   define broadcast_pair(x) ({ \
+    vec_t t_; \
+    asm ( "vbroadcasti32x2 %1, %0" : "=v" (t_) : "m" (x) ); \
+    t_; \
+})
+#  endif
+#  if VEC_SIZE == 64 && defined(__AVX512DQ__)
+#   define broadcast_octet(x) ((vec_t)B(broadcasti32x8_, _mask, (vsi_octet_t)(x), (vsi_t)undef(), ~0))
+#   define insert_octet(x, y, p) ((vec_t)B(inserti32x8_, _mask, (vsi_t)(x), (vsi_octet_t)(y), p, (vsi_t)undef(), ~0))
+#  endif
 #  if VEC_SIZE == 16
 #   define interleave_hi(x, y) ((vec_t)B(punpckhdq, _mask, (vsi_t)(x), (vsi_t)(y), (vsi_t)undef(), ~0))
 #   define interleave_lo(x, y) ((vec_t)B(punpckldq, _mask, (vsi_t)(x), (vsi_t)(y), (vsi_t)undef(), ~0))
 #   define swap(x) ((vec_t)B(pshufd, _mask, (vsi_t)(x), 0b00011011, (vsi_t)undef(), ~0))
 #  else
+#   define broadcast_quartet(x) ((vec_t)B(broadcasti32x4_, _mask, (vsi_quartet_t)(x), (vsi_t)undef(), ~0))
+#   define insert_pair(x, y, p) \
+    (vec_t)(B(inserti32x4_, _mask, (vsi_t)(x), \
+              /* First cast needed below to work around gcc 7.x quirk. */ \
+              (p) & 1 ? (vsi_pair_t)__builtin_ia32_pshufd((vsi_pair_t)(y), 0b01000100) \
+                      : (vsi_pair_t)(y), \
+              (p) >> 1, (vsi_t)(x), 3 << ((p) * 2)))
+#   define insert_quartet(x, y, p) ((vec_t)B(inserti32x4_, _mask, (vsi_t)(x), (vsi_quartet_t)(y), p, (vsi_t)undef(), ~0))
 #   define interleave_hi(x, y) ((vec_t)B(vpermi2vard, _mask, (vsi_t)(x), interleave_hi, (vsi_t)(y), ~0))
 #   define interleave_lo(x, y) ((vec_t)B(vpermt2vard, _mask, interleave_lo, (vsi_t)(x), (vsi_t)(y), ~0))
 #   define swap(x) ((vec_t)B(pshufd, _mask, \
@@ -341,6 +417,14 @@ static inline bool _to_bool(byte_vec_t b
     t_; \
 })
 #  endif
+#  if VEC_SIZE >= 32 && defined(__AVX512DQ__)
+#   define broadcast_pair(x) ((vec_t)B(broadcasti64x2_, _mask, (vdi_pair_t)(x), (vdi_t)undef(), ~0))
+#   define insert_pair(x, y, p) ((vec_t)B(inserti64x2_, _mask, (vdi_t)(x), (vdi_pair_t)(y), p, (vdi_t)undef(), ~0))
+#  endif
+#  if VEC_SIZE == 64
+#   define broadcast_quartet(x) ((vec_t)B(broadcasti64x4_, , (vdi_quartet_t)(x), (vdi_t)undef(), ~0))
+#   define insert_quartet(x, y, p) ((vec_t)B(inserti64x4_, _mask, (vdi_t)(x), (vdi_quartet_t)(y), p, (vdi_t)undef(), ~0))
+#  endif
 #  if VEC_SIZE == 16
 #   define interleave_hi(x, y) ((vec_t)B(punpckhqdq, _mask, (vdi_t)(x), (vdi_t)(y), (vdi_t)undef(), ~0))
 #   define interleave_lo(x, y) ((vec_t)B(punpcklqdq, _mask, (vdi_t)(x), (vdi_t)(y), (vdi_t)undef(), ~0))
@@ -892,7 +976,7 @@ static inline eighth_t low_eighth(vec_t
     eighth_t y;
     unsigned int i;
 
-    for ( i = 0; i < ELEM_COUNT / 4; ++i )
+    for ( i = 0; i < ELEM_COUNT / 8; ++i )
         y[i] = x[i];
 
     return y;
@@ -904,6 +988,50 @@ static inline eighth_t low_eighth(vec_t
 
 #endif
 
+#ifdef broadcast_pair
+# if ELEM_COUNT == 4
+#  define broadcast_half broadcast_pair
+# elif ELEM_COUNT == 8
+#  define broadcast_quarter broadcast_pair
+# elif ELEM_COUNT == 16
+#  define broadcast_eighth broadcast_pair
+# endif
+#endif
+
+#ifdef insert_pair
+# if ELEM_COUNT == 4
+#  define insert_half insert_pair
+# elif ELEM_COUNT == 8
+#  define insert_quarter insert_pair
+# elif ELEM_COUNT == 16
+#  define insert_eighth insert_pair
+# endif
+#endif
+
+#ifdef broadcast_quartet
+# if ELEM_COUNT == 8
+#  define broadcast_half broadcast_quartet
+# elif ELEM_COUNT == 16
+#  define broadcast_quarter broadcast_quartet
+# endif
+#endif
+
+#ifdef insert_quartet
+# if ELEM_COUNT == 8
+#  define insert_half insert_quartet
+# elif ELEM_COUNT == 16
+#  define insert_quarter insert_quartet
+# endif
+#endif
+
+#if defined(broadcast_octet) && ELEM_COUNT == 16
+# define broadcast_half broadcast_octet
+#endif
+
+#if defined(insert_octet) && ELEM_COUNT == 16
+# define insert_half insert_octet
+#endif
+
 #if defined(__AVX512F__) && defined(FLOAT_SIZE)
 # include "simd-fma.c"
 #endif
@@ -1199,6 +1327,60 @@ int simd_test(void)
     if ( !eq(broadcast2(ELEM_COUNT + 1), src + inv) ) return __LINE__;
 #endif
 
+#if defined(broadcast_half) && defined(insert_half)
+    {
+        half_t aux = low_half(src);
+
+        touch(aux);
+        x = broadcast_half(aux);
+        touch(aux);
+        y = insert_half(src, aux, 1);
+        if ( !eq(x, y) ) return __LINE__;
+    }
+#endif
+
+#if defined(broadcast_quarter) && defined(insert_quarter)
+    {
+        quarter_t aux = low_quarter(src);
+
+        touch(aux);
+        x = broadcast_quarter(aux);
+        touch(aux);
+        y = insert_quarter(src, aux, 1);
+        touch(aux);
+        y = insert_quarter(y, aux, 2);
+        touch(aux);
+        y = insert_quarter(y, aux, 3);
+        if ( !eq(x, y) ) return __LINE__;
+    }
+#endif
+
+#if defined(broadcast_eighth) && defined(insert_eighth) && \
+    /* At least gcc 7.3 "optimizes" away all insert_eighth() calls below. */ \
+    __GNUC__ >= 8
+    {
+        eighth_t aux = low_eighth(src);
+
+        touch(aux);
+        x = broadcast_eighth(aux);
+        touch(aux);
+        y = insert_eighth(src, aux, 1);
+        touch(aux);
+        y = insert_eighth(y, aux, 2);
+        touch(aux);
+        y = insert_eighth(y, aux, 3);
+        touch(aux);
+        y = insert_eighth(y, aux, 4);
+        touch(aux);
+        y = insert_eighth(y, aux, 5);
+        touch(aux);
+        y = insert_eighth(y, aux, 6);
+        touch(aux);
+        y = insert_eighth(y, aux, 7);
+        if ( !eq(x, y) ) return __LINE__;
+    }
+#endif
+
 #if defined(interleave_lo) && defined(interleave_hi)
     touch(src);
     x = interleave_lo(inv, src);
--- a/tools/tests/x86_emulator/test_x86_emulator.c
+++ b/tools/tests/x86_emulator/test_x86_emulator.c
@@ -23,6 +23,7 @@ asm ( ".pushsection .test, \"ax\", @prog
 #include "avx512bw-opmask.h"
 #include "avx512f.h"
 #include "avx512bw.h"
+#include "avx512dq.h"
 
 #define verbose false /* Switch to true for far more logging. */
 
@@ -100,6 +101,11 @@ static bool simd_check_avx512dq(void)
 }
 #define simd_check_avx512dq_opmask simd_check_avx512dq
 
+static bool simd_check_avx512dq_vl(void)
+{
+    return cpu_has_avx512dq && cpu_has_avx512vl;
+}
+
 static bool simd_check_avx512bw(void)
 {
     return cpu_has_avx512bw;
@@ -267,9 +273,10 @@ static const struct {
     SIMD(XOP i32x8,               xop,      32i4),
     SIMD(XOP i64x4,               xop,      32i8),
     SIMD(OPMASK/w,     avx512f_opmask,         2),
-    SIMD(OPMASK/b,    avx512dq_opmask,         1),
-    SIMD(OPMASK/d,    avx512bw_opmask,         4),
-    SIMD(OPMASK/q,    avx512bw_opmask,         8),
+    SIMD(OPMASK+DQ/b, avx512dq_opmask,         1),
+    SIMD(OPMASK+DQ/w, avx512dq_opmask,         2),
+    SIMD(OPMASK+BW/d, avx512bw_opmask,         4),
+    SIMD(OPMASK+BW/q, avx512bw_opmask,         8),
     SIMD(AVX512F f32 scalar,  avx512f,        f4),
     SIMD(AVX512F f32x16,      avx512f,      64f4),
     SIMD(AVX512F f64 scalar,  avx512f,        f8),
@@ -302,6 +309,24 @@ static const struct {
     AVX512VL(BW+VL u16x8,    avx512bw,      16u2),
     AVX512VL(BW+VL s16x16,   avx512bw,      32i2),
     AVX512VL(BW+VL u16x16,   avx512bw,      32u2),
+    SIMD(AVX512DQ f32x16,    avx512dq,      64f4),
+    SIMD(AVX512DQ f64x8,     avx512dq,      64f8),
+    SIMD(AVX512DQ s32x16,    avx512dq,      64i4),
+    SIMD(AVX512DQ u32x16,    avx512dq,      64u4),
+    SIMD(AVX512DQ s64x8,     avx512dq,      64i8),
+    SIMD(AVX512DQ u64x8,     avx512dq,      64u8),
+    AVX512VL(DQ+VL f32x4,    avx512dq,      16f4),
+    AVX512VL(DQ+VL f64x2,    avx512dq,      16f8),
+    AVX512VL(DQ+VL f32x8,    avx512dq,      32f4),
+    AVX512VL(DQ+VL f64x4,    avx512dq,      32f8),
+    AVX512VL(DQ+VL s32x4,    avx512dq,      16i4),
+    AVX512VL(DQ+VL u32x4,    avx512dq,      16u4),
+    AVX512VL(DQ+VL s32x8,    avx512dq,      32i4),
+    AVX512VL(DQ+VL u32x8,    avx512dq,      32u4),
+    AVX512VL(DQ+VL s64x2,    avx512dq,      16i8),
+    AVX512VL(DQ+VL u64x2,    avx512dq,      16u8),
+    AVX512VL(DQ+VL s64x4,    avx512dq,      32i8),
+    AVX512VL(DQ+VL u64x4,    avx512dq,      32u8),
 #undef AVX512VL_
 #undef AVX512VL
 #undef SIMD_




_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

^ permalink raw reply	[flat|nested] 465+ messages in thread

* [PATCH v4 35/44] x86emul: support AVX512F move high/low insns
  2018-09-25 13:14 ` [PATCH v4 00/44] x86emul: fixes, improvements, and beginnings of AVX512 support Jan Beulich
                     ` (33 preceding siblings ...)
  2018-09-25 13:48   ` [PATCH v4 34/44] x86emul: basic AVX512DQ testing Jan Beulich
@ 2018-09-25 13:48   ` Jan Beulich
  2018-09-25 13:49   ` [PATCH v4 36/44] x86emul: support AVX512F move duplicate insns Jan Beulich
                     ` (8 subsequent siblings)
  43 siblings, 0 replies; 465+ messages in thread
From: Jan Beulich @ 2018-09-25 13:48 UTC (permalink / raw)
  To: xen-devel; +Cc: George Dunlap, Andrew Cooper, Wei Liu

No explicit test harness additions other than the overrides, as the
compiler already makes use of the insns.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v4: New.

--- a/tools/tests/x86_emulator/evex-disp8.c
+++ b/tools/tests/x86_emulator/evex-disp8.c
@@ -242,6 +242,16 @@ static const struct test avx512f_128[] =
     INSN(insertps,  66, 0f3a, 21, el,    d, el),
     INSN(mov,       66,   0f, 6e, el, dq64, el),
     INSN(mov,       66,   0f, 7e, el, dq64, el),
+//       movhlps,     ,   0f, 12,        d
+    INSN(movhpd,    66,   0f, 16, el,    q, vl),
+    INSN(movhpd,    66,   0f, 17, el,    q, vl),
+    INSN(movhps,      ,   0f, 16, el_2,  d, vl),
+    INSN(movhps,      ,   0f, 17, el_2,  d, vl),
+//       movlhps,     ,   0f, 16,        d
+    INSN(movlpd,    66,   0f, 12, el,    q, vl),
+    INSN(movlpd,    66,   0f, 13, el,    q, vl),
+    INSN(movlps,      ,   0f, 12, el_2,  d, vl),
+    INSN(movlps,      ,   0f, 13, el_2,  d, vl),
     INSN(movq,      f3,   0f, 7e, el,    q, el),
     INSN(movq,      66,   0f, d6, el,    q, el),
 };
--- a/tools/tests/x86_emulator/simd.h
+++ b/tools/tests/x86_emulator/simd.h
@@ -266,6 +266,12 @@ OVR(movd);
 OVR(movq);
 OVR_SFP(mov);
 OVR_VFP(mova);
+OVR(movhlps);
+OVR(movhpd);
+OVR(movhps);
+OVR(movlhps);
+OVR(movlpd);
+OVR(movlps);
 OVR_VFP(movnt);
 OVR_VFP(movu);
 OVR_FP(mul);
--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -286,11 +286,11 @@ static const struct twobyte_table {
     [0x0f] = { ModRM|SrcImmByte },
     [0x10] = { DstImplicit|SrcMem|ModRM|Mov, simd_any_fp, d8s_vl },
     [0x11] = { DstMem|SrcImplicit|ModRM|Mov, simd_any_fp, d8s_vl },
-    [0x12] = { DstImplicit|SrcMem|ModRM|Mov, simd_other },
-    [0x13] = { DstMem|SrcImplicit|ModRM|Mov, simd_other },
+    [0x12] = { DstImplicit|SrcMem|ModRM|Mov, simd_other, 3 },
+    [0x13] = { DstMem|SrcImplicit|ModRM|Mov, simd_other, 3 },
     [0x14 ... 0x15] = { DstImplicit|SrcMem|ModRM, simd_packed_fp, d8s_vl },
-    [0x16] = { DstImplicit|SrcMem|ModRM|Mov, simd_other },
-    [0x17] = { DstMem|SrcImplicit|ModRM|Mov, simd_other },
+    [0x16] = { DstImplicit|SrcMem|ModRM|Mov, simd_other, 3 },
+    [0x17] = { DstMem|SrcImplicit|ModRM|Mov, simd_other, 3 },
     [0x18 ... 0x1f] = { ImplicitOps|ModRM },
     [0x20 ... 0x21] = { DstMem|SrcImplicit|ModRM },
     [0x22 ... 0x23] = { DstImplicit|SrcMem|ModRM },
@@ -6000,6 +6000,26 @@ x86_emulate(
         op_bytes = 8;
         goto simd_0f_fp;
 
+    case X86EMUL_OPC_EVEX_66(0x0f, 0x12):   /* vmovlpd m64,xmm,xmm */
+    CASE_SIMD_PACKED_FP(_EVEX, 0x0f, 0x13): /* vmovlp{s,d} xmm,m64 */
+    case X86EMUL_OPC_EVEX_66(0x0f, 0x16):   /* vmovhpd m64,xmm,xmm */
+    CASE_SIMD_PACKED_FP(_EVEX, 0x0f, 0x17): /* vmovhp{s,d} xmm,m64 */
+        generate_exception_if(ea.type != OP_MEM, EXC_UD);
+        /* fall through */
+    case X86EMUL_OPC_EVEX(0x0f, 0x12):      /* vmovlps m64,xmm,xmm */
+                                            /* vmovhlps xmm,xmm,xmm */
+    case X86EMUL_OPC_EVEX(0x0f, 0x16):      /* vmovhps m64,xmm,xmm */
+                                            /* vmovlhps xmm,xmm,xmm */
+        generate_exception_if((evex.lr || evex.opmsk || evex.br ||
+                               evex.w != (evex.pfx & VEX_PREFIX_DOUBLE_MASK)),
+                              EXC_UD);
+        host_and_vcpu_must_have(avx512f);
+        if ( (d & DstMask) != DstMem )
+            d &= ~TwoOp;
+        op_bytes = 8;
+        fault_suppression = false;
+        goto simd_zmm;
+
     case X86EMUL_OPC_F3(0x0f, 0x12):       /* movsldup xmm/m128,xmm */
     case X86EMUL_OPC_VEX_F3(0x0f, 0x12):   /* vmovsldup {x,y}mm/mem,{x,y}mm */
     case X86EMUL_OPC_F2(0x0f, 0x12):       /* movddup xmm/m64,xmm */





_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

^ permalink raw reply	[flat|nested] 465+ messages in thread

* [PATCH v4 36/44] x86emul: support AVX512F move duplicate insns
  2018-09-25 13:14 ` [PATCH v4 00/44] x86emul: fixes, improvements, and beginnings of AVX512 support Jan Beulich
                     ` (34 preceding siblings ...)
  2018-09-25 13:48   ` [PATCH v4 35/44] x86emul: support AVX512F move high/low insns Jan Beulich
@ 2018-09-25 13:49   ` Jan Beulich
  2018-09-25 13:49   ` [PATCH v4 37/44] x86emul: support AVX512{F, BW, VBMI} permute insns Jan Beulich
                     ` (7 subsequent siblings)
  43 siblings, 0 replies; 465+ messages in thread
From: Jan Beulich @ 2018-09-25 13:49 UTC (permalink / raw)
  To: xen-devel; +Cc: George Dunlap, Andrew Cooper, Wei Liu

Judging from insn prefixes, these are scalar insns, but their (memory)
operands are vector ones (with the exception of 128-bit VMOVDDUP). To
accommodate this, some adjustments to the disp8scale calculation code
are needed.

No explicit test harness additions other than the overrides, as the
compiler already makes use of the insns.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v4: New.

--- a/tools/tests/x86_emulator/evex-disp8.c
+++ b/tools/tests/x86_emulator/evex-disp8.c
@@ -146,6 +146,8 @@ static const struct test avx512f_all[] =
     INSN(movntdq,      66,   0f, e7,    vl,   d_nb, vl),
     INSN(movntdqa,     66, 0f38, 2a,    vl,   d_nb, vl),
     INSN_PFP_NB(movnt,       0f, 2b),
+    INSN(movshdup,     f3,   0f, 16,    vl,   d_nb, vl),
+    INSN(movsldup,     f3,   0f, 12,    vl,   d_nb, vl),
     INSN_PFP_NB(movu,        0f, 10),
     INSN_PFP_NB(movu,        0f, 11),
     INSN_FP(mul,             0f, 59),
@@ -242,6 +244,7 @@ static const struct test avx512f_128[] =
     INSN(insertps,  66, 0f3a, 21, el,    d, el),
     INSN(mov,       66,   0f, 6e, el, dq64, el),
     INSN(mov,       66,   0f, 7e, el, dq64, el),
+    INSN(movddup,   f2,   0f, 12, el,    q, el),
 //       movhlps,     ,   0f, 12,        d
     INSN(movhpd,    66,   0f, 16, el,    q, vl),
     INSN(movhpd,    66,   0f, 17, el,    q, vl),
@@ -264,6 +267,7 @@ static const struct test avx512f_no128[]
     INSN(extracti32x4,   66, 0f3a, 39, el_4,  d, vl),
     INSN(insertf32x4,    66, 0f3a, 18, el_4,  d, vl),
     INSN(inserti32x4,    66, 0f3a, 38, el_4,  d, vl),
+    INSN(movddup,        f2,   0f, 12, vl, q_nb, vl),
     INSN(shuff32x4,      66, 0f3a, 23, vl,    d, vl),
     INSN(shuff64x2,      66, 0f3a, 23, vl,    q, vl),
     INSN(shufi32x4,      66, 0f3a, 43, vl,    d, vl),
--- a/tools/tests/x86_emulator/simd.h
+++ b/tools/tests/x86_emulator/simd.h
@@ -326,8 +326,11 @@ REN(pandn, , d);
 REN(por, , d);
 REN(pxor, , d);
 # endif
+OVR(movddup);
 OVR(movntdq);
 OVR(movntdqa);
+OVR(movshdup);
+OVR(movsldup);
 OVR(pmovsxbd);
 OVR(pmovsxbq);
 OVR(pmovsxdq);
--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -3036,6 +3036,15 @@ x86_decode(
 
             switch ( b )
             {
+            case 0x12: /* vmovsldup / vmovddup */
+                if ( evex.pfx == vex_f2 )
+                    disp8scale = evex.lr ? 4 + evex.lr : 3;
+                /* fall through */
+            case 0x16: /* vmovshdup */
+                if ( evex.pfx == vex_f3 )
+                    disp8scale = 4 + evex.lr;
+                break;
+
             case 0x20: /* mov cr,reg */
             case 0x21: /* mov dr,reg */
             case 0x22: /* mov reg,cr */
@@ -6035,6 +6044,20 @@ x86_emulate(
         host_and_vcpu_must_have(sse3);
         goto simd_0f_xmm;
 
+    case X86EMUL_OPC_EVEX_F3(0x0f, 0x12):   /* vmovsldup [xyz]mm/mem,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_F2(0x0f, 0x12):   /* vmovddup [xyz]mm/mem,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_F3(0x0f, 0x16):   /* vmovshdup [xyz]mm/mem,[xyz]mm{k} */
+        generate_exception_if((evex.br ||
+                               evex.w != (evex.pfx & VEX_PREFIX_DOUBLE_MASK)),
+                              EXC_UD);
+        host_and_vcpu_must_have(avx512f);
+        avx512_vlen_check(false);
+        d |= TwoOp;
+        op_bytes = !(evex.pfx & VEX_PREFIX_DOUBLE_MASK) || evex.lr
+                   ? 16 << evex.lr : 8;
+        fault_suppression = false;
+        goto simd_zmm;
+
     CASE_SIMD_PACKED_FP(_EVEX, 0x0f, 0x14): /* vunpcklp{s,d} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
     CASE_SIMD_PACKED_FP(_EVEX, 0x0f, 0x15): /* vunpckhp{s,d} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
         generate_exception_if(evex.w != (evex.pfx & VEX_PREFIX_DOUBLE_MASK),





_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

^ permalink raw reply	[flat|nested] 465+ messages in thread

* [PATCH v4 37/44] x86emul: support AVX512{F, BW, VBMI} permute insns
  2018-09-25 13:14 ` [PATCH v4 00/44] x86emul: fixes, improvements, and beginnings of AVX512 support Jan Beulich
                     ` (35 preceding siblings ...)
  2018-09-25 13:49   ` [PATCH v4 36/44] x86emul: support AVX512F move duplicate insns Jan Beulich
@ 2018-09-25 13:49   ` Jan Beulich
  2018-09-25 13:50   ` [PATCH v4 38/44] x86emul: support AVX512BW pack insns Jan Beulich
                     ` (6 subsequent siblings)
  43 siblings, 0 replies; 465+ messages in thread
From: Jan Beulich @ 2018-09-25 13:49 UTC (permalink / raw)
  To: xen-devel; +Cc: George Dunlap, Andrew Cooper, Wei Liu

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v4: New.

--- a/tools/tests/x86_emulator/evex-disp8.c
+++ b/tools/tests/x86_emulator/evex-disp8.c
@@ -166,6 +166,10 @@ static const struct test avx512f_all[] =
     INSN(pcmpu,        66, 0f3a, 1e,    vl,     dq, vl),
     INSN(permi2,       66, 0f38, 76,    vl,     dq, vl),
     INSN(permi2,       66, 0f38, 77,    vl,     sd, vl),
+    INSN(permilpd,     66, 0f38, 0d,    vl,      q, vl),
+    INSN(permilpd,     66, 0f3a, 05,    vl,      q, vl),
+    INSN(permilps,     66, 0f38, 0c,    vl,      d, vl),
+    INSN(permilps,     66, 0f3a, 04,    vl,      d, vl),
     INSN(permt2,       66, 0f38, 7e,    vl,     dq, vl),
     INSN(permt2,       66, 0f38, 7f,    vl,     sd, vl),
     INSN(pmaxs,        66, 0f38, 3d,    vl,     dq, vl),
@@ -268,6 +272,10 @@ static const struct test avx512f_no128[]
     INSN(insertf32x4,    66, 0f3a, 18, el_4,  d, vl),
     INSN(inserti32x4,    66, 0f3a, 38, el_4,  d, vl),
     INSN(movddup,        f2,   0f, 12, vl, q_nb, vl),
+    INSN(perm,           66, 0f38, 36, vl,   dq, vl),
+    INSN(perm,           66, 0f38, 16, vl,   sd, vl),
+    INSN(permpd,         66, 0f3a, 01, vl,    q, vl),
+    INSN(permq,          66, 0f3a, 00, vl,    q, vl),
     INSN(shuff32x4,      66, 0f3a, 23, vl,    d, vl),
     INSN(shuff64x2,      66, 0f3a, 23, vl,    q, vl),
     INSN(shufi32x4,      66, 0f3a, 43, vl,    d, vl),
@@ -306,6 +314,7 @@ static const struct test avx512bw_all[]
     INSN(pcmpgtb,     66,   0f, 64,    vl,    b, vl),
     INSN(pcmpgtw,     66,   0f, 65,    vl,    w, vl),
     INSN(pcmpu,       66, 0f3a, 3e,    vl,   bw, vl),
+    INSN(permw,       66, 0f38, 8d,    vl,    w, vl),
     INSN(permi2w,     66, 0f38, 75,    vl,    w, vl),
     INSN(permt2w,     66, 0f38, 7d,    vl,    w, vl),
     INSN(pmaddwd,     66,   0f, f5,    vl,    w, vl),
@@ -402,6 +411,7 @@ static const struct test avx512dq_512[]
 };
 
 static const struct test avx512_vbmi_all[] = {
+    INSN(permb,         66, 0f38, 8d, vl, b, vl),
     INSN(permi2b,       66, 0f38, 75, vl, b, vl),
     INSN(permt2b,       66, 0f38, 7d, vl, b, vl),
 };
--- a/tools/tests/x86_emulator/simd.c
+++ b/tools/tests/x86_emulator/simd.c
@@ -180,6 +180,7 @@ static inline bool _to_bool(byte_vec_t b
 #    define interleave_hi(x, y) B(unpckhps, _mask, x, y, undef(), ~0)
 #    define interleave_lo(x, y) B(unpcklps, _mask, x, y, undef(), ~0)
 #    define swap(x) B(shufps, _mask, x, x, 0b00011011, undef(), ~0)
+#    define swap2(x) B_(vpermilps, _mask, x, 0b00011011, undef(), ~0)
 #   else
 #    define broadcast_quartet(x) B(broadcastf32x4_, _mask, x, undef(), ~0)
 #    define insert_pair(x, y, p) \
@@ -194,6 +195,10 @@ static inline bool _to_bool(byte_vec_t b
     vec_t t_ = B(shuf_f32x4_, _mask, x, x, VEC_SIZE == 32 ? 0b01 : 0b00011011, undef(), ~0); \
     B(shufps, _mask, t_, t_, 0b00011011, undef(), ~0); \
 })
+#    define swap2(x) B(vpermilps, _mask, \
+                       B(shuf_f32x4_, _mask, x, x, \
+                         VEC_SIZE == 32 ? 0b01 : 0b00011011, undef(), ~0), \
+                       0b00011011, undef(), ~0)
 #   endif
 #  elif FLOAT_SIZE == 8
 #   if VEC_SIZE >= 32
@@ -226,6 +231,7 @@ static inline bool _to_bool(byte_vec_t b
 #    define interleave_hi(x, y) B(unpckhpd, _mask, x, y, undef(), ~0)
 #    define interleave_lo(x, y) B(unpcklpd, _mask, x, y, undef(), ~0)
 #    define swap(x) B(shufpd, _mask, x, x, 0b01, undef(), ~0)
+#    define swap2(x) B_(vpermilpd, _mask, x, 0b01, undef(), ~0)
 #   else
 #    define interleave_hi(x, y) B(vpermi2varpd, _mask, x, interleave_hi, y, ~0)
 #    define interleave_lo(x, y) B(vpermt2varpd, _mask, interleave_lo, x, y, ~0)
@@ -233,6 +239,10 @@ static inline bool _to_bool(byte_vec_t b
     vec_t t_ = B(shuf_f64x2_, _mask, x, x, VEC_SIZE == 32 ? 0b01 : 0b00011011, undef(), ~0); \
     B(shufpd, _mask, t_, t_, 0b01010101, undef(), ~0); \
 })
+#    define swap2(x) B(vpermilpd, _mask, \
+                       B(shuf_f64x2_, _mask, x, x, \
+                         VEC_SIZE == 32 ? 0b01 : 0b00011011, undef(), ~0), \
+                       0b01010101, undef(), ~0)
 #   endif
 #  endif
 # endif
@@ -399,6 +409,7 @@ static inline bool _to_bool(byte_vec_t b
                              B(shuf_i32x4_, _mask, (vsi_t)(x), (vsi_t)(x), \
                                VEC_SIZE == 32 ? 0b01 : 0b00011011, (vsi_t)undef(), ~0), \
                              0b00011011, (vsi_t)undef(), ~0))
+#   define swap2(x) ((vec_t)B_(permvarsi, _mask, (vsi_t)(x), (vsi_t)(inv - 1), (vsi_t)undef(), ~0))
 #  endif
 #  define mix(x, y) ((vec_t)B(movdqa32_, _mask, (vsi_t)(x), (vsi_t)(y), \
                               (0b0101010101010101 & ((1 << ELEM_COUNT) - 1))))
@@ -436,8 +447,17 @@ static inline bool _to_bool(byte_vec_t b
                              (vsi_t)B(shuf_i64x2_, _mask, (vdi_t)(x), (vdi_t)(x), \
                                       VEC_SIZE == 32 ? 0b01 : 0b00011011, (vdi_t)undef(), ~0), \
                              0b01001110, (vsi_t)undef(), ~0))
+#   define swap2(x) ((vec_t)B(permvardi, _mask, (vdi_t)(x), (vdi_t)(inv - 1), (vdi_t)undef(), ~0))
 #  endif
 #  define mix(x, y) ((vec_t)B(movdqa64_, _mask, (vdi_t)(x), (vdi_t)(y), 0b01010101))
+#  if VEC_SIZE == 32
+#   define swap3(x) ((vec_t)B_(permdi, _mask, (vdi_t)(x), 0b00011011, (vdi_t)undef(), ~0))
+#  elif VEC_SIZE == 64
+#   define swap3(x) ({ \
+    vdi_t t_ = B_(permdi, _mask, (vdi_t)(x), 0b00011011, (vdi_t)undef(), ~0); \
+    B(shuf_i64x2_, _mask, t_, t_, 0b01001110, (vdi_t)undef(), ~0); \
+})
+#  endif
 # endif
 # if INT_SIZE == 4
 #  define max(x, y) B(pmaxsd, _mask, x, y, undef(), ~0)
@@ -483,6 +503,9 @@ static inline bool _to_bool(byte_vec_t b
 #  define shrink1(x) ((half_t)B(pmovwb, _mask, (vhi_t)(x), (vqi_half_t){}, ~0))
 #  define shrink2(x) ((quarter_t)B(pmovdb, _mask, (vsi_t)(x), (vqi_quarter_t){}, ~0))
 #  define shrink3(x) ((eighth_t)B(pmovqb, _mask, (vdi_t)(x), (vqi_eighth_t){}, ~0))
+#  ifdef __AVX512VBMI__
+#   define swap2(x) ((vec_t)B(permvarqi, _mask, (vqi_t)(x), (vqi_t)(inv - 1), (vqi_t)undef(), ~0))
+#  endif
 # elif INT_SIZE == 2 || UINT_SIZE == 2
 #  define broadcast(x) ({ \
     vec_t t_; \
@@ -511,6 +534,7 @@ static inline bool _to_bool(byte_vec_t b
                               (0b01010101010101010101010101010101 & ALL_TRUE)))
 #  define shrink1(x) ((half_t)B(pmovdw, _mask, (vsi_t)(x), (vhi_half_t){}, ~0))
 #  define shrink2(x) ((quarter_t)B(pmovqw, _mask, (vdi_t)(x), (vhi_quarter_t){}, ~0))
+#  define swap2(x) ((vec_t)B(permvarhi, _mask, (vhi_t)(x), (vhi_t)(inv - 1), (vhi_t)undef(), ~0))
 # endif
 # if INT_SIZE == 1
 #  define max(x, y) ((vec_t)B(pmaxsb, _mask, (vqi_t)(x), (vqi_t)(y), (vqi_t)undef(), ~0))
@@ -1319,6 +1343,12 @@ int simd_test(void)
     if ( !eq(swap2(src), inv) ) return __LINE__;
 #endif
 
+#ifdef swap3
+    touch(src);
+    if ( !eq(swap3(src), inv) ) return __LINE__;
+    touch(src);
+#endif
+
 #ifdef broadcast
     if ( !eq(broadcast(ELEM_COUNT + 1), src + inv) ) return __LINE__;
 #endif
--- a/tools/tests/x86_emulator/simd.h
+++ b/tools/tests/x86_emulator/simd.h
@@ -275,6 +275,8 @@ OVR(movlps);
 OVR_VFP(movnt);
 OVR_VFP(movu);
 OVR_FP(mul);
+OVR_VFP(perm);
+OVR_VFP(permil);
 OVR_VFP(shuf);
 OVR_INT(sll);
 OVR_DQ(sllv);
@@ -331,6 +333,8 @@ OVR(movntdq);
 OVR(movntdqa);
 OVR(movshdup);
 OVR(movsldup);
+OVR(permd);
+OVR(permq);
 OVR(pmovsxbd);
 OVR(pmovsxbq);
 OVR(pmovsxdq);
--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -434,7 +434,8 @@ static const struct ext0f38_table {
 } ext0f38_table[256] = {
     [0x00] = { .simd_size = simd_packed_int, .d8s = d8s_vl },
     [0x01 ... 0x0b] = { .simd_size = simd_packed_int },
-    [0x0c ... 0x0f] = { .simd_size = simd_packed_fp },
+    [0x0c ... 0x0d] = { .simd_size = simd_packed_fp, .d8s = d8s_vl },
+    [0x0e ... 0x0f] = { .simd_size = simd_packed_fp },
     [0x10 ... 0x12] = { .simd_size = simd_packed_int, .d8s = d8s_vl },
     [0x13] = { .simd_size = simd_other, .two_op = 1 },
     [0x14 ... 0x16] = { .simd_size = simd_packed_fp, .d8s = d8s_vl },
@@ -477,6 +478,7 @@ static const struct ext0f38_table {
     [0x7d ... 0x7e] = { .simd_size = simd_packed_int, .d8s = d8s_vl },
     [0x7f] = { .simd_size = simd_packed_fp, .d8s = d8s_vl },
     [0x8c] = { .simd_size = simd_packed_int },
+    [0x8d] = { .simd_size = simd_packed_int, .d8s = d8s_vl },
     [0x8e] = { .simd_size = simd_packed_int, .to_mem = 1 },
     [0x90 ... 0x93] = { .simd_size = simd_other, .vsib = 1 },
     [0x96 ... 0x98] = { .simd_size = simd_packed_fp, .d8s = d8s_vl },
@@ -522,10 +524,10 @@ static const struct ext0f3a_table {
     uint8_t four_op:1;
     disp8scale_t d8s:4;
 } ext0f3a_table[256] = {
-    [0x00] = { .simd_size = simd_packed_int, .two_op = 1 },
-    [0x01] = { .simd_size = simd_packed_fp, .two_op = 1 },
+    [0x00] = { .simd_size = simd_packed_int, .two_op = 1, .d8s = d8s_vl },
+    [0x01] = { .simd_size = simd_packed_fp, .two_op = 1, .d8s = d8s_vl },
     [0x02] = { .simd_size = simd_packed_int },
-    [0x04 ... 0x05] = { .simd_size = simd_packed_fp, .two_op = 1 },
+    [0x04 ... 0x05] = { .simd_size = simd_packed_fp, .two_op = 1, .d8s = d8s_vl },
     [0x06] = { .simd_size = simd_packed_fp },
     [0x08 ... 0x09] = { .simd_size = simd_packed_fp, .two_op = 1 },
     [0x0a ... 0x0b] = { .simd_size = simd_scalar_opc },
@@ -8062,6 +8064,9 @@ x86_emulate(
     case X86EMUL_OPC_EVEX_66(0x0f, 0xf2): /* vpslld xmm/m128,[xyz]mm,[xyz]mm{k} */
     case X86EMUL_OPC_EVEX_66(0x0f, 0xf3): /* vpsllq xmm/m128,[xyz]mm,[xyz]mm{k} */
         generate_exception_if(evex.br, EXC_UD);
+        /* fall through */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0x0c): /* vpermilps [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0x0d): /* vpermilpd [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
         fault_suppression = false;
         if ( b == 0xe2 )
             goto avx512f_no_sae;
@@ -8406,6 +8411,12 @@ x86_emulate(
         generate_exception_if(!vex.l || vex.w, EXC_UD);
         goto simd_0f_avx2;
 
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0x16): /* vpermp{s,d} {y,z}mm/mem,{y,z}mm,{y,z}mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0x36): /* vperm{d,q} {y,z}mm/mem,{y,z}mm,{y,z}mm{k} */
+        generate_exception_if(!evex.lr, EXC_UD);
+        fault_suppression = false;
+        goto avx512f_no_sae;
+
     case X86EMUL_OPC_VEX_66(0x0f38, 0x20): /* vpmovsxbw xmm/mem,{x,y}mm */
     case X86EMUL_OPC_VEX_66(0x0f38, 0x21): /* vpmovsxbd xmm/mem,{x,y}mm */
     case X86EMUL_OPC_VEX_66(0x0f38, 0x22): /* vpmovsxbq xmm/mem,{x,y}mm */
@@ -8610,6 +8621,7 @@ x86_emulate(
 
     case X86EMUL_OPC_EVEX_66(0x0f38, 0x75): /* vpermi2{b,w} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
     case X86EMUL_OPC_EVEX_66(0x0f38, 0x7d): /* vpermt2{b,w} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0x8d): /* vperm{b,w} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
         if ( !evex.w )
             host_and_vcpu_must_have(avx512_vbmi);
         else
@@ -9036,6 +9048,12 @@ x86_emulate(
         generate_exception_if(!vex.l || !vex.w, EXC_UD);
         goto simd_0f_imm8_avx2;
 
+    case X86EMUL_OPC_EVEX_66(0x0f3a, 0x00): /* vpermq $imm8,{y,z}mm/mem,{y,z}mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f3a, 0x01): /* vpermpd $imm8,{y,z}mm/mem,{y,z}mm{k} */
+        generate_exception_if(!evex.lr || !evex.w, EXC_UD);
+        fault_suppression = false;
+        goto avx512f_imm_no_sae;
+
     case X86EMUL_OPC_VEX_66(0x0f3a, 0x38): /* vinserti128 $imm8,xmm/m128,ymm,ymm */
     case X86EMUL_OPC_VEX_66(0x0f3a, 0x39): /* vextracti128 $imm8,ymm,xmm/m128 */
     case X86EMUL_OPC_VEX_66(0x0f3a, 0x46): /* vperm2i128 $imm8,ymm/m256,ymm,ymm */
@@ -9055,6 +9073,12 @@ x86_emulate(
         generate_exception_if(vex.w, EXC_UD);
         goto simd_0f_imm8_avx;
 
+    case X86EMUL_OPC_EVEX_66(0x0f3a, 0x04): /* vpermilps $imm8,[xyz]mm/mem,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f3a, 0x05): /* vpermilpd $imm8,[xyz]mm/mem,[xyz]mm{k} */
+        generate_exception_if(evex.w != (b & 1), EXC_UD);
+        fault_suppression = false;
+        goto avx512f_imm_no_sae;
+
     case X86EMUL_OPC_66(0x0f3a, 0x08): /* roundps $imm8,xmm/m128,xmm */
     case X86EMUL_OPC_66(0x0f3a, 0x09): /* roundpd $imm8,xmm/m128,xmm */
     case X86EMUL_OPC_66(0x0f3a, 0x0a): /* roundss $imm8,xmm/m128,xmm */




_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

^ permalink raw reply	[flat|nested] 465+ messages in thread

* [PATCH v4 38/44] x86emul: support AVX512BW pack insns
  2018-09-25 13:14 ` [PATCH v4 00/44] x86emul: fixes, improvements, and beginnings of AVX512 support Jan Beulich
                     ` (36 preceding siblings ...)
  2018-09-25 13:49   ` [PATCH v4 37/44] x86emul: support AVX512{F, BW, VBMI} permute insns Jan Beulich
@ 2018-09-25 13:50   ` Jan Beulich
  2018-09-25 13:51   ` [PATCH v4 39/44] x86emul: support AVX512F floating-point conversion insns Jan Beulich
                     ` (5 subsequent siblings)
  43 siblings, 0 replies; 465+ messages in thread
From: Jan Beulich @ 2018-09-25 13:50 UTC (permalink / raw)
  To: xen-devel; +Cc: George Dunlap, Andrew Cooper, Wei Liu

No further test harness additions - what is there is good enough for
these rather "regular" insns.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v4: New.

--- a/tools/tests/x86_emulator/evex-disp8.c
+++ b/tools/tests/x86_emulator/evex-disp8.c
@@ -296,6 +296,10 @@ static const struct test avx512bw_all[]
     INSN(movdqu8,     f2,   0f, 7f,    vl,    b, vl),
     INSN(movdqu16,    f2,   0f, 6f,    vl,    w, vl),
     INSN(movdqu16,    f2,   0f, 7f,    vl,    w, vl),
+    INSN(packssdw,    66,   0f, 6b,    vl, d_nb, vl),
+    INSN(packsswb,    66,   0f, 63,    vl,    w, vl),
+    INSN(packusdw,    66, 0f38, 2b,    vl, d_nb, vl),
+    INSN(packuswb,    66,   0f, 67,    vl,    w, vl),
     INSN(paddb,       66,   0f, fc,    vl,    b, vl),
     INSN(paddsb,      66,   0f, ec,    vl,    b, vl),
     INSN(paddsw,      66,   0f, ed,    vl,    w, vl),
--- a/tools/tests/x86_emulator/simd.h
+++ b/tools/tests/x86_emulator/simd.h
@@ -361,6 +361,10 @@ OVR(pextrw);
 OVR(pinsrb);
 OVR(pinsrw);
 # ifdef __AVX512VL__
+OVR(packssdw);
+OVR(packsswb);
+OVR(packusdw);
+OVR(packuswb);
 OVR(pmaddwd);
 OVR(pmovsxbw);
 OVR(pmovzxbw);
--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -453,7 +453,7 @@ static const struct ext0f38_table {
     [0x25] = { .simd_size = simd_other, .two_op = 1, .d8s = d8s_vl_by_2 },
     [0x26 ... 0x29] = { .simd_size = simd_packed_int, .d8s = d8s_vl },
     [0x2a] = { .simd_size = simd_packed_int, .two_op = 1, .d8s = d8s_vl },
-    [0x2b] = { .simd_size = simd_packed_int },
+    [0x2b] = { .simd_size = simd_packed_int, .d8s = d8s_vl },
     [0x2c ... 0x2d] = { .simd_size = simd_packed_fp },
     [0x2e ... 0x2f] = { .simd_size = simd_packed_fp, .to_mem = 1 },
     [0x30] = { .simd_size = simd_other, .two_op = 1, .d8s = d8s_vl_by_2 },
@@ -6707,6 +6707,8 @@ x86_emulate(
     case X86EMUL_OPC_EVEX_66(0x0f, 0x69): /* vpunpckhwd [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
         op_bytes = 16 << evex.lr;
         /* fall through */
+    case X86EMUL_OPC_EVEX_66(0x0f, 0x63): /* vpacksswb [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f, 0x67): /* vpackuswb [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
     case X86EMUL_OPC_EVEX_66(0x0f, 0xd1): /* vpsrlw xmm/m128,[xyz]mm,[xyz]mm{k} */
     case X86EMUL_OPC_EVEX_66(0x0f, 0xe1): /* vpsraw xmm/m128,[xyz]mm,[xyz]mm{k} */
     case X86EMUL_OPC_EVEX_66(0x0f, 0xf1): /* vpsllw xmm/m128,[xyz]mm,[xyz]mm{k} */
@@ -6769,6 +6771,12 @@ x86_emulate(
         avx512_vlen_check(false);
         goto simd_zmm;
 
+    case X86EMUL_OPC_EVEX_66(0x0f, 0x6b): /* vpackssdw [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0x2b): /* vpackusdw [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+        generate_exception_if(evex.w || evex.br, EXC_UD);
+        fault_suppression = false;
+        goto avx512f_no_sae;
+
     case X86EMUL_OPC_EVEX_66(0x0f, 0x6c): /* vpunpcklqdq [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
     case X86EMUL_OPC_EVEX_66(0x0f, 0x6d): /* vpunpckhqdq [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
         fault_suppression = false;





_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

^ permalink raw reply	[flat|nested] 465+ messages in thread

* [PATCH v4 39/44] x86emul: support AVX512F floating-point conversion insns
  2018-09-25 13:14 ` [PATCH v4 00/44] x86emul: fixes, improvements, and beginnings of AVX512 support Jan Beulich
                     ` (37 preceding siblings ...)
  2018-09-25 13:50   ` [PATCH v4 38/44] x86emul: support AVX512BW pack insns Jan Beulich
@ 2018-09-25 13:51   ` Jan Beulich
  2018-09-25 13:52   ` [PATCH v4 40/44] x86emul: support AVX512F legacy-equivalent packed int/FP " Jan Beulich
                     ` (4 subsequent siblings)
  43 siblings, 0 replies; 465+ messages in thread
From: Jan Beulich @ 2018-09-25 13:51 UTC (permalink / raw)
  To: xen-devel; +Cc: George Dunlap, Andrew Cooper, Wei Liu

VCVTPS2PD, sharing its main opcode with others, needs a "manual"
override of disp8scale.

The simd_size change for twobyte_table[0x5a] is benign to pre-existing
code, but allows decode_disp8scale() to work as is here.

Also correct the comment on an AVX counterpart.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v4: New.

--- a/tools/tests/x86_emulator/evex-disp8.c
+++ b/tools/tests/x86_emulator/evex-disp8.c
@@ -98,6 +98,12 @@ static const struct test avx512f_all[] =
     INSN_FP(cmp,             0f, c2),
     INSN(comisd,       66,   0f, 2f,    el,      q, el),
     INSN(comiss,         ,   0f, 2f,    el,      d, el),
+    INSN(cvtpd2ps,     66,   0f, 5a,    vl,      q, vl),
+    INSN(cvtph2ps,     66, 0f38, 13,    vl_2, d_nb, vl),
+    INSN(cvtps2pd,       ,   0f, 5a,    vl_2,    d, vl),
+    INSN(cvtps2ph,     66, 0f3a, 1d,    vl_2, d_nb, vl),
+    INSN(cvtsd2ss,     f2,   0f, 5a,    el,      q, el),
+    INSN(cvtss2sd,     f3,   0f, 5a,    el,      d, el),
     INSN_FP(div,             0f, 5e),
     INSN(fmadd132,     66, 0f38, 98,    vl,     sd, vl),
     INSN(fmadd132,     66, 0f38, 99,    el,     sd, el),
--- a/tools/tests/x86_emulator/simd.c
+++ b/tools/tests/x86_emulator/simd.c
@@ -176,6 +176,8 @@ static inline bool _to_bool(byte_vec_t b
 #   define max(x, y) BR_(maxps, _mask, x, y, undef(), ~0)
 #   define min(x, y) BR_(minps, _mask, x, y, undef(), ~0)
 #   define mix(x, y) B(movaps, _mask, x, y, (0b0101010101010101 & ALL_TRUE))
+#   define shrink1(x) BR_(cvtpd2ps, _mask, (vdf_t)(x), (vsf_half_t){}, ~0)
+#   define widen1(x) ((vec_t)BR(cvtps2pd, _mask, x, (vdf_t)undef(), ~0))
 #   if VEC_SIZE == 16
 #    define interleave_hi(x, y) B(unpckhps, _mask, x, y, undef(), ~0)
 #    define interleave_lo(x, y) B(unpcklps, _mask, x, y, undef(), ~0)
--- a/tools/tests/x86_emulator/simd.h
+++ b/tools/tests/x86_emulator/simd.h
@@ -68,6 +68,7 @@ typedef short __attribute__((vector_size
 typedef int __attribute__((vector_size(VEC_SIZE))) vsi_t;
 #if VEC_SIZE >= 8
 typedef long long __attribute__((vector_size(VEC_SIZE))) vdi_t;
+typedef double __attribute__((vector_size(VEC_SIZE))) vdf_t;
 #endif
 
 #if ELEM_SIZE == 1
@@ -93,6 +94,7 @@ typedef char __attribute__((vector_size(
 typedef short __attribute__((vector_size(HALF_SIZE))) vhi_half_t;
 typedef int __attribute__((vector_size(HALF_SIZE))) vsi_half_t;
 typedef long long __attribute__((vector_size(HALF_SIZE))) vdi_half_t;
+typedef float __attribute__((vector_size(HALF_SIZE))) vsf_half_t;
 # endif
 
 # if ELEM_COUNT >= 4
@@ -328,6 +330,13 @@ REN(pandn, , d);
 REN(por, , d);
 REN(pxor, , d);
 # endif
+OVR(cvtpd2psx);
+OVR(cvtpd2psy);
+OVR(cvtph2ps);
+OVR(cvtps2pd);
+OVR(cvtps2ph);
+OVR(cvtsd2ss);
+OVR(cvtss2sd);
 OVR(movddup);
 OVR(movntdq);
 OVR(movntdqa);
--- a/tools/tests/x86_emulator/test_x86_emulator.c
+++ b/tools/tests/x86_emulator/test_x86_emulator.c
@@ -3842,6 +3842,49 @@ int main(int argc, char **argv)
     else
         printf("skipped\n");
 
+    printf("%-40s", "Testing vcvtph2ps 32(%ecx),%zmm7{%k4}...");
+    if ( stack_exec && cpu_has_avx512f )
+    {
+        decl_insn(evex_vcvtph2ps);
+        decl_insn(evex_vcvtps2ph);
+
+        asm volatile ( "vpternlogd $0x81, %%zmm7, %%zmm7, %%zmm7\n\t"
+                       "kmovw %1,%%k4\n"
+                       put_insn(evex_vcvtph2ps, "vcvtph2ps 32(%0), %%zmm7%{%%k4%}")
+                       :: "c" (NULL), "r" (0x3333) );
+
+        set_insn(evex_vcvtph2ps);
+        memset(res, 0xff, 128);
+        res[8] = 0x40003c00; /* (1.0, 2.0) */
+        res[10] = 0x44004200; /* (3.0, 4.0) */
+        res[12] = 0x3400b800; /* (-.5, .25) */
+        res[14] = 0xbc000000; /* (0.0, -1.) */
+        regs.ecx = (unsigned long)res;
+        rc = x86_emulate(&ctxt, &emulops);
+        asm volatile ( "vmovups %%zmm7, %0" : "=m" (res[16]) );
+        if ( rc != X86EMUL_OKAY || !check_eip(evex_vcvtph2ps) )
+            goto fail;
+        printf("okay\n");
+
+        printf("%-40s", "Testing vcvtps2ph $0,%zmm3,64(%edx){%k4}...");
+        asm volatile ( "vmovups %0, %%zmm3\n"
+                       put_insn(evex_vcvtps2ph, "vcvtps2ph $0, %%zmm3, 128(%1)%{%%k4%}")
+                       :: "m" (res[16]), "d" (NULL) );
+
+        set_insn(evex_vcvtps2ph);
+        regs.edx = (unsigned long)res;
+        memset(res + 32, 0xcc, 32);
+        rc = x86_emulate(&ctxt, &emulops);
+        if ( rc != X86EMUL_OKAY || !check_eip(evex_vcvtps2ph) )
+            goto fail;
+        res[15] = res[13] = res[11] = res[9] = 0xcccccccc;
+        if ( memcmp(res + 8, res + 32, 32) )
+            goto fail;
+        printf("okay\n");
+    }
+    else
+        printf("skipped\n");
+
 #undef decl_insn
 #undef put_insn
 #undef set_insn
--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -310,7 +310,8 @@ static const struct twobyte_table {
     [0x52 ... 0x53] = { DstImplicit|SrcMem|ModRM|TwoOp, simd_single_fp },
     [0x54 ... 0x57] = { DstImplicit|SrcMem|ModRM, simd_packed_fp, d8s_vl },
     [0x58 ... 0x59] = { DstImplicit|SrcMem|ModRM, simd_any_fp, d8s_vl },
-    [0x5a ... 0x5b] = { DstImplicit|SrcMem|ModRM|Mov, simd_other },
+    [0x5a] = { DstImplicit|SrcMem|ModRM|Mov, simd_any_fp, d8s_vl },
+    [0x5b] = { DstImplicit|SrcMem|ModRM|Mov, simd_other },
     [0x5c ... 0x5f] = { DstImplicit|SrcMem|ModRM, simd_any_fp, d8s_vl },
     [0x60 ... 0x62] = { DstImplicit|SrcMem|ModRM, simd_other, d8s_vl },
     [0x63 ... 0x67] = { DstImplicit|SrcMem|ModRM, simd_packed_int, d8s_vl },
@@ -437,7 +438,7 @@ static const struct ext0f38_table {
     [0x0c ... 0x0d] = { .simd_size = simd_packed_fp, .d8s = d8s_vl },
     [0x0e ... 0x0f] = { .simd_size = simd_packed_fp },
     [0x10 ... 0x12] = { .simd_size = simd_packed_int, .d8s = d8s_vl },
-    [0x13] = { .simd_size = simd_other, .two_op = 1 },
+    [0x13] = { .simd_size = simd_other, .two_op = 1, .d8s = d8s_vl_by_2 },
     [0x14 ... 0x16] = { .simd_size = simd_packed_fp, .d8s = d8s_vl },
     [0x17] = { .simd_size = simd_packed_int, .two_op = 1 },
     [0x18] = { .simd_size = simd_scalar_opc, .two_op = 1, .d8s = 2 },
@@ -541,7 +542,7 @@ static const struct ext0f3a_table {
     [0x19] = { .simd_size = simd_128, .to_mem = 1, .two_op = 1, .d8s = 4 },
     [0x1a] = { .simd_size = simd_256, .d8s = d8s_vl_by_2 },
     [0x1b] = { .simd_size = simd_256, .to_mem = 1, .two_op = 1, .d8s = d8s_vl_by_2 },
-    [0x1d] = { .simd_size = simd_other, .to_mem = 1, .two_op = 1 },
+    [0x1d] = { .simd_size = simd_other, .to_mem = 1, .two_op = 1, .d8s = d8s_vl_by_2 },
     [0x1e ... 0x1f] = { .simd_size = simd_packed_int, .d8s = d8s_vl },
     [0x20] = { .simd_size = simd_none, .d8s = 0 },
     [0x21] = { .simd_size = simd_other, .d8s = 2 },
@@ -3059,6 +3060,11 @@ x86_decode(
                 modrm_mod = 3;
                 break;
 
+            case 0x5a: /* vcvtps2pd needs special casing */
+                if ( disp8scale && !evex.pfx && !evex.br )
+                    --disp8scale;
+                break;
+
             case 0x7e: /* vmovq xmm/m64,xmm needs special casing */
                 if ( disp8scale == 2 && evex.pfx == vex_f3 )
                     disp8scale = 3;
@@ -5966,6 +5972,7 @@ x86_emulate(
     CASE_SIMD_ALL_FP(_EVEX, 0x0f, 0x5d):    /* vmin{p,s}{s,d} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
     CASE_SIMD_ALL_FP(_EVEX, 0x0f, 0x5e):    /* vdiv{p,s}{s,d} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
     CASE_SIMD_ALL_FP(_EVEX, 0x0f, 0x5f):    /* vmax{p,s}{s,d} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+    avx512f_all_fp:
         generate_exception_if((evex.w != (evex.pfx & VEX_PREFIX_DOUBLE_MASK) ||
                                (ea.type == OP_MEM && evex.br &&
                                 (evex.pfx & VEX_PREFIX_SCALAR_MASK))),
@@ -6523,7 +6530,7 @@ x86_emulate(
         goto simd_zmm;
 
     CASE_SIMD_ALL_FP(, 0x0f, 0x5a):        /* cvt{p,s}{s,d}2{p,s}{s,d} xmm/mem,xmm */
-    CASE_SIMD_ALL_FP(_VEX, 0x0f, 0x5a):    /* vcvtp{s,d}2p{s,d} xmm/mem,xmm */
+    CASE_SIMD_ALL_FP(_VEX, 0x0f, 0x5a):    /* vcvtp{s,d}2p{s,d} {x,y}mm/mem,{x,y}mm */
                                            /* vcvts{s,d}2s{s,d} xmm/mem,xmm,xmm */
         op_bytes = 4 << (((vex.pfx & VEX_PREFIX_SCALAR_MASK) ? 0 : 1 + vex.l) +
                          !!(vex.pfx & VEX_PREFIX_DOUBLE_MASK));
@@ -6532,6 +6539,12 @@ x86_emulate(
             goto simd_0f_sse2;
         goto simd_0f_avx;
 
+    CASE_SIMD_ALL_FP(_EVEX, 0x0f, 0x5a):   /* vcvtp{s,d}2p{s,d} [xyz]mm/mem,[xyz]mm{k} */
+                                           /* vcvts{s,d}2s{s,d} xmm/mem,xmm,xmm{k} */
+        op_bytes = 4 << (((evex.pfx & VEX_PREFIX_SCALAR_MASK) ? 0 : 1 + evex.lr) +
+                         evex.w);
+        goto avx512f_all_fp;
+
     CASE_SIMD_PACKED_FP(, 0x0f, 0x5b):     /* cvt{ps,dq}2{dq,ps} xmm/mem,xmm */
     CASE_SIMD_PACKED_FP(_VEX, 0x0f, 0x5b): /* vcvt{ps,dq}2{dq,ps} {x,y}mm/mem,{x,y}mm */
     case X86EMUL_OPC_F3(0x0f, 0x5b):       /* cvttps2dq xmm/mem,xmm */
@@ -8414,6 +8427,15 @@ x86_emulate(
         op_bytes = 8 << vex.l;
         goto simd_0f_ymm;
 
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0x13): /* vcvtph2ps {x,y}mm/mem,[xyz]mm{k} */
+        generate_exception_if(evex.w || (ea.type == OP_MEM && evex.br), EXC_UD);
+        host_and_vcpu_must_have(avx512f);
+        if ( !evex.br )
+            avx512_vlen_check(false);
+        op_bytes = 8 << evex.lr;
+        elem_bytes = 2;
+        goto simd_zmm;
+
     case X86EMUL_OPC_VEX_66(0x0f38, 0x16): /* vpermps ymm/m256,ymm,ymm */
     case X86EMUL_OPC_VEX_66(0x0f38, 0x36): /* vpermd ymm/m256,ymm,ymm */
         generate_exception_if(!vex.l || vex.w, EXC_UD);
@@ -9237,27 +9259,79 @@ x86_emulate(
         goto avx512f_imm_no_sae;
 
     case X86EMUL_OPC_VEX_66(0x0f3a, 0x1d): /* vcvtps2ph $imm8,{x,y}mm,xmm/mem */
+    case X86EMUL_OPC_EVEX_66(0x0f3a, 0x1d): /* vcvtps2ph $imm8,[xyz]mm,{x,y}mm/mem{k} */
     {
         uint32_t mxcsr;
 
-        generate_exception_if(vex.w || vex.reg != 0xf, EXC_UD);
-        host_and_vcpu_must_have(f16c);
         fail_if(!ops->write);
+        if ( evex_encoded() )
+        {
+            generate_exception_if((evex.w || evex.reg != 0xf || !evex.RX ||
+                                   (ea.type == OP_MEM && (evex.z || evex.br))),
+                                  EXC_UD);
+            host_and_vcpu_must_have(avx512f);
+            avx512_vlen_check(false);
+            opc = init_evex(stub);
+        }
+        else
+        {
+            generate_exception_if(vex.w || vex.reg != 0xf, EXC_UD);
+            host_and_vcpu_must_have(f16c);
+            opc = init_prefixes(stub);
+        }
+
+        op_bytes = 8 << evex.lr;
 
-        opc = init_prefixes(stub);
         opc[0] = b;
         opc[1] = modrm;
         if ( ea.type == OP_MEM )
         {
             /* Convert memory operand to (%rAX). */
             vex.b = 1;
+            evex.b = 1;
             opc[1] &= 0x38;
         }
         opc[2] = imm1;
-        insn_bytes = PFX_BYTES + 3;
+        if ( evex_encoded() )
+        {
+            unsigned int full = 0;
+
+            insn_bytes = EVEX_PFX_BYTES + 3;
+            copy_EVEX(opc, evex);
+
+            if ( ea.type == OP_MEM && evex.opmsk )
+            {
+                full = 0xffff >> (16 - op_bytes / 2);
+                op_mask &= full;
+                if ( !op_mask )
+                    goto complete_insn;
+
+                first_byte = __builtin_ctz(op_mask);
+                op_mask >>= first_byte;
+                full >>= first_byte;
+                first_byte <<= 1;
+                op_bytes = (32 - __builtin_clz(op_mask)) << 1;
+
+                /*
+                 * We may need to read (parts of) the memory operand for the
+                 * purpose of merging in order to avoid splitting the write
+                 * below into multiple ones.
+                 */
+                if ( op_mask != full &&
+                     (rc = ops->read(ea.mem.seg,
+                                     truncate_ea(ea.mem.off + first_byte),
+                                     (void *)mmvalp + first_byte, op_bytes,
+                                     ctxt)) != X86EMUL_OKAY )
+                    goto done;
+            }
+        }
+        else
+        {
+            insn_bytes = PFX_BYTES + 3;
+            copy_VEX(opc, vex);
+        }
         opc[3] = 0xc3;
 
-        copy_VEX(opc, vex);
         /* Latch MXCSR - we may need to restore it below. */
         invoke_stub("stmxcsr %[mxcsr]", "",
                     "=m" (*mmvalp), [mxcsr] "=m" (mxcsr) : "a" (mmvalp));
@@ -9266,7 +9340,8 @@ x86_emulate(
 
         if ( ea.type == OP_MEM )
         {
-            rc = ops->write(ea.mem.seg, ea.mem.off, mmvalp, 8 << vex.l, ctxt);
+            rc = ops->write(ea.mem.seg, truncate_ea(ea.mem.off + first_byte),
+                            (void *)mmvalp + first_byte, op_bytes, ctxt);
             if ( rc != X86EMUL_OKAY )
             {
                 asm volatile ( "ldmxcsr %0" :: "m" (mxcsr) );




_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

^ permalink raw reply	[flat|nested] 465+ messages in thread

* [PATCH v4 40/44] x86emul: support AVX512F legacy-equivalent packed int/FP conversion insns
  2018-09-25 13:14 ` [PATCH v4 00/44] x86emul: fixes, improvements, and beginnings of AVX512 support Jan Beulich
                     ` (38 preceding siblings ...)
  2018-09-25 13:51   ` [PATCH v4 39/44] x86emul: support AVX512F floating-point conversion insns Jan Beulich
@ 2018-09-25 13:52   ` " Jan Beulich
  2018-09-25 13:53   ` [PATCH v4 41/44] x86emul: support AVX512F legacy-equivalent scalar " Jan Beulich
                     ` (3 subsequent siblings)
  43 siblings, 0 replies; 465+ messages in thread
From: Jan Beulich @ 2018-09-25 13:52 UTC (permalink / raw)
  To: xen-devel; +Cc: George Dunlap, Andrew Cooper, Wei Liu

... including the two AVX512DQ forms which share these encodings, just
with EVEX.W set there.

VCVTDQ2PD, sharing its main opcode with others, needs a "manual"
override of disp8scale.

The simd_size changes for the twobyte_table[] entries are benign to
pre-existing code, but allow decode_disp8scale() to work as is here.

The placement of the 0xe6 case block, which looks wrong at this point, is
once again in anticipation of further additions of case labels.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v4: New.

--- a/tools/tests/x86_emulator/evex-disp8.c
+++ b/tools/tests/x86_emulator/evex-disp8.c
@@ -98,8 +98,12 @@ static const struct test avx512f_all[] =
     INSN_FP(cmp,             0f, c2),
     INSN(comisd,       66,   0f, 2f,    el,      q, el),
     INSN(comiss,         ,   0f, 2f,    el,      d, el),
+    INSN(cvtdq2pd,     f3,   0f, e6,    vl_2,    d, vl),
+    INSN(cvtdq2ps,       ,   0f, 5b,    vl,      d, vl),
+    INSN(cvtpd2dq,     f2,   0f, e6,    vl,      q, vl),
     INSN(cvtpd2ps,     66,   0f, 5a,    vl,      q, vl),
     INSN(cvtph2ps,     66, 0f38, 13,    vl_2, d_nb, vl),
+    INSN(cvtps2dq,     66,   0f, 5b,    vl,      d, vl),
     INSN(cvtps2pd,       ,   0f, 5a,    vl_2,    d, vl),
     INSN(cvtps2ph,     66, 0f3a, 1d,    vl_2, d_nb, vl),
     INSN(cvtsd2ss,     f2,   0f, 5a,    el,      q, el),
@@ -388,6 +392,8 @@ static const struct test avx512dq_all[]
     INSN_PFP(and,              0f, 54),
     INSN_PFP(andn,             0f, 55),
     INSN(broadcasti32x2, 66, 0f38, 59, el_2,  d, vl),
+    INSN(cvtqq2pd,       f3,   0f, e6,   vl,  q, vl),
+    INSN(cvtqq2ps,         ,   0f, 5b,   vl,  q, vl),
     INSN_PFP(or,               0f, 56),
 //       pmovd2m,        f3, 0f38, 39,        d
 //       pmovm2,         f3, 0f38, 38,       dq
--- a/tools/tests/x86_emulator/simd.c
+++ b/tools/tests/x86_emulator/simd.c
@@ -92,6 +92,13 @@ static inline bool _to_bool(byte_vec_t b
 # define to_int(x) ((vec_t){ (int)(x)[0] })
 #elif VEC_SIZE == 8 && FLOAT_SIZE == 4 && defined(__3dNOW__)
 # define to_int(x) __builtin_ia32_pi2fd(__builtin_ia32_pf2id(x))
+#elif defined(FLOAT_SIZE) && VEC_SIZE > FLOAT_SIZE && defined(__AVX512F__) && \
+      (VEC_SIZE == 64 || defined(__AVX512VL__))
+# if FLOAT_SIZE == 4
+#  def