* [Xen-devel] [PATCH v10 00/13] x86emul: remaining AVX512 support
@ 2019-07-17 6:27 Jan Beulich
2019-07-17 6:33 ` [Xen-devel] [PATCH v10 01/13] x86emul: support of AVX512* population count insns Jan Beulich
` (12 more replies)
0 siblings, 13 replies; 15+ messages in thread
From: Jan Beulich @ 2019-07-17 6:27 UTC (permalink / raw)
To: xen-devel; +Cc: Andrew Cooper, Wei Liu, Roger Pau Monne
01: support of AVX512* population count insns
02: support of AVX512_IFMA insns
03: support remaining AVX512_VBMI2 insns
04: support AVX512_4FMAPS insns
05: support AVX512_4VNNIW insns
06: support AVX512_VNNI insns
07: support VPCLMULQDQ insns
08: support VAES insns
09: support GFNI insns
10: restore ordering within main switch statement
11: add an AES/VAES test case to the harness
12: add a SHA test case to the harness
13: add a PCLMUL/VPCLMUL test case to the harness
Jan
_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel
^ permalink raw reply [flat|nested] 15+ messages in thread
* [Xen-devel] [PATCH v10 01/13] x86emul: support of AVX512* population count insns
2019-07-17 6:27 [Xen-devel] [PATCH v10 00/13] x86emul: remaining AVX512 support Jan Beulich
@ 2019-07-17 6:33 ` Jan Beulich
2019-07-17 11:32 ` Andrew Cooper
2019-07-17 6:34 ` [Xen-devel] [PATCH v10 02/13] x86emul: support of AVX512_IFMA insns Jan Beulich
` (11 subsequent siblings)
12 siblings, 1 reply; 15+ messages in thread
From: Jan Beulich @ 2019-07-17 6:33 UTC (permalink / raw)
To: xen-devel; +Cc: Andrew Cooper, Wei Liu, Roger Pau Monne
Plus the only other AVX512_BITALG one.
As in a few cases before, since the insns here and in particular their
memory access patterns follow the usual scheme, I didn't think it was
necessary to add a contrived test specifically for them, beyond the
Disp8 scaling one.
Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v10: Sort AVX512BW deps by number instead of alphabetically.
v9: Re-base.
v7: Re-base.
v6: New.
--- a/tools/tests/x86_emulator/evex-disp8.c
+++ b/tools/tests/x86_emulator/evex-disp8.c
@@ -538,6 +538,11 @@ static const struct test avx512pf_512[]
INSNX(scatterpf1q, 66, 0f38, c7, 6, vl, sd, el),
};
+static const struct test avx512_bitalg_all[] = {
+ INSN(popcnt, 66, 0f38, 54, vl, bw, vl),
+ INSN(pshufbitqmb, 66, 0f38, 8f, vl, b, vl),
+};
+
static const struct test avx512_vbmi_all[] = {
INSN(permb, 66, 0f38, 8d, vl, b, vl),
INSN(permi2b, 66, 0f38, 75, vl, b, vl),
@@ -550,6 +555,10 @@ static const struct test avx512_vbmi2_al
INSN(pexpand, 66, 0f38, 62, vl, bw, el),
};
+static const struct test avx512_vpopcntdq_all[] = {
+ INSN(popcnt, 66, 0f38, 55, vl, dq, vl)
+};
+
static const unsigned char vl_all[] = { VL_512, VL_128, VL_256 };
static const unsigned char vl_128[] = { VL_128 };
static const unsigned char vl_no128[] = { VL_512, VL_256 };
@@ -919,6 +928,8 @@ void evex_disp8_test(void *instr, struct
RUN(avx512er, 512);
#define cpu_has_avx512pf cpu_has_avx512f
RUN(avx512pf, 512);
+ RUN(avx512_bitalg, all);
RUN(avx512_vbmi, all);
RUN(avx512_vbmi2, all);
+ RUN(avx512_vpopcntdq, all);
}
--- a/tools/tests/x86_emulator/x86-emulate.h
+++ b/tools/tests/x86_emulator/x86-emulate.h
@@ -143,6 +143,8 @@ static inline bool xcr0_mask(uint64_t ma
#define cpu_has_avx512vl (cp.feat.avx512vl && xcr0_mask(0xe6))
#define cpu_has_avx512_vbmi (cp.feat.avx512_vbmi && xcr0_mask(0xe6))
#define cpu_has_avx512_vbmi2 (cp.feat.avx512_vbmi2 && xcr0_mask(0xe6))
+#define cpu_has_avx512_bitalg (cp.feat.avx512_bitalg && xcr0_mask(0xe6))
+#define cpu_has_avx512_vpopcntdq (cp.feat.avx512_vpopcntdq && xcr0_mask(0xe6))
#define cpu_has_xgetbv1 (cpu_has_xsave && cp.xstate.xgetbv1)
--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -479,6 +479,7 @@ static const struct ext0f38_table {
[0x4d] = { .simd_size = simd_scalar_vexw, .d8s = d8s_dq },
[0x4e] = { .simd_size = simd_packed_fp, .two_op = 1, .d8s = d8s_vl },
[0x4f] = { .simd_size = simd_scalar_vexw, .d8s = d8s_dq },
+ [0x54 ... 0x55] = { .simd_size = simd_packed_int, .two_op = 1, .d8s = d8s_vl },
[0x58] = { .simd_size = simd_other, .two_op = 1, .d8s = 2 },
[0x59] = { .simd_size = simd_other, .two_op = 1, .d8s = 3 },
[0x5a] = { .simd_size = simd_128, .two_op = 1, .d8s = 4 },
@@ -501,6 +502,7 @@ static const struct ext0f38_table {
[0x8c] = { .simd_size = simd_packed_int },
[0x8d] = { .simd_size = simd_packed_int, .d8s = d8s_vl },
[0x8e] = { .simd_size = simd_packed_int, .to_mem = 1 },
+ [0x8f] = { .simd_size = simd_packed_int, .d8s = d8s_vl },
[0x90 ... 0x93] = { .simd_size = simd_other, .vsib = 1, .d8s = d8s_dq },
[0x96 ... 0x98] = { .simd_size = simd_packed_fp, .d8s = d8s_vl },
[0x99] = { .simd_size = simd_scalar_vexw, .d8s = d8s_dq },
@@ -1883,6 +1885,8 @@ in_protmode(
#define vcpu_has_avx512vl() (ctxt->cpuid->feat.avx512vl)
#define vcpu_has_avx512_vbmi() (ctxt->cpuid->feat.avx512_vbmi)
#define vcpu_has_avx512_vbmi2() (ctxt->cpuid->feat.avx512_vbmi2)
+#define vcpu_has_avx512_bitalg() (ctxt->cpuid->feat.avx512_bitalg)
+#define vcpu_has_avx512_vpopcntdq() (ctxt->cpuid->feat.avx512_vpopcntdq)
#define vcpu_has_rdpid() (ctxt->cpuid->feat.rdpid)
#define vcpu_must_have(feat) \
@@ -8899,6 +8903,19 @@ x86_emulate(
generate_exception_if(vex.l, EXC_UD);
goto simd_0f_avx;
+ case X86EMUL_OPC_EVEX_66(0x0f38, 0x8f): /* vpshufbitqmb [xyz]mm/mem,[xyz]mm,k{k} */
+ generate_exception_if(evex.w || !evex.r || !evex.R || evex.z, EXC_UD);
+ /* fall through */
+ case X86EMUL_OPC_EVEX_66(0x0f38, 0x54): /* vpopcnt{b,w} [xyz]mm/mem,[xyz]mm{k} */
+ host_and_vcpu_must_have(avx512_bitalg);
+ generate_exception_if(evex.brs, EXC_UD);
+ elem_bytes = 1 << evex.w;
+ goto avx512f_no_sae;
+
+ case X86EMUL_OPC_EVEX_66(0x0f38, 0x55): /* vpopcnt{d,q} [xyz]mm/mem,[xyz]mm{k} */
+ host_and_vcpu_must_have(avx512_vpopcntdq);
+ goto avx512f_no_sae;
+
case X86EMUL_OPC_VEX_66(0x0f38, 0x58): /* vpbroadcastd xmm/m32,{x,y}mm */
case X86EMUL_OPC_VEX_66(0x0f38, 0x59): /* vpbroadcastq xmm/m64,{x,y}mm */
case X86EMUL_OPC_VEX_66(0x0f38, 0x78): /* vpbroadcastb xmm/m8,{x,y}mm */
--- a/xen/include/asm-x86/cpufeature.h
+++ b/xen/include/asm-x86/cpufeature.h
@@ -110,6 +110,8 @@
/* CPUID level 0x00000007:0.ecx */
#define cpu_has_avx512_vbmi boot_cpu_has(X86_FEATURE_AVX512_VBMI)
#define cpu_has_avx512_vbmi2 boot_cpu_has(X86_FEATURE_AVX512_VBMI2)
+#define cpu_has_avx512_bitalg boot_cpu_has(X86_FEATURE_AVX512_BITALG)
+#define cpu_has_avx512_vpopcntdq boot_cpu_has(X86_FEATURE_AVX512_VPOPCNTDQ)
#define cpu_has_rdpid boot_cpu_has(X86_FEATURE_RDPID)
/* CPUID level 0x80000007.edx */
--- a/xen/include/public/arch-x86/cpufeatureset.h
+++ b/xen/include/public/arch-x86/cpufeatureset.h
@@ -229,6 +229,7 @@ XEN_CPUFEATURE(UMIP, 6*32+ 2) /
XEN_CPUFEATURE(PKU, 6*32+ 3) /*H Protection Keys for Userspace */
XEN_CPUFEATURE(OSPKE, 6*32+ 4) /*! OS Protection Keys Enable */
XEN_CPUFEATURE(AVX512_VBMI2, 6*32+ 6) /*A Additional AVX-512 Vector Byte Manipulation Instrs */
+XEN_CPUFEATURE(AVX512_BITALG, 6*32+12) /*A Support for VPOPCNT[B,W] and VPSHUFBITQMB */
XEN_CPUFEATURE(AVX512_VPOPCNTDQ, 6*32+14) /*A POPCNT for vectors of DW/QW */
XEN_CPUFEATURE(RDPID, 6*32+22) /*A RDPID instruction */
--- a/xen/tools/gen-cpuid.py
+++ b/xen/tools/gen-cpuid.py
@@ -268,7 +268,7 @@ def crunch_numbers(state):
# AVX512 extensions acting on vectors of bytes/words are made
# dependents of AVX512BW (as to requiring wider than 16-bit mask
# registers), despite the SDM not formally making this connection.
- AVX512BW: [AVX512_BF16, AVX512_VBMI, AVX512_VBMI2],
+ AVX512BW: [AVX512_VBMI, AVX512_VBMI2, AVX512_BITALG, AVX512_BF16],
# The features:
# * Single Thread Indirect Branch Predictors
_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel
^ permalink raw reply [flat|nested] 15+ messages in thread
* [Xen-devel] [PATCH v10 02/13] x86emul: support of AVX512_IFMA insns
2019-07-17 6:27 [Xen-devel] [PATCH v10 00/13] x86emul: remaining AVX512 support Jan Beulich
2019-07-17 6:33 ` [Xen-devel] [PATCH v10 01/13] x86emul: support of AVX512* population count insns Jan Beulich
@ 2019-07-17 6:34 ` Jan Beulich
2019-07-17 6:34 ` [Xen-devel] [PATCH v10 03/13] x86emul: support remaining AVX512_VBMI2 insns Jan Beulich
` (10 subsequent siblings)
12 siblings, 0 replies; 15+ messages in thread
From: Jan Beulich @ 2019-07-17 6:34 UTC (permalink / raw)
To: xen-devel; +Cc: Andrew Cooper, Wei Liu, Roger Pau Monne
Once again take the liberty and also correct the (public interface) name
of the AVX512_IFMA feature flag to match the SDM, on the assumption that
no external consumer has actually been using that flag so far.
As in a few cases before, since the insns here and in particular their
memory access patterns follow the usual scheme, I didn't think it was
necessary to add a contrived test specifically for them, beyond the
Disp8 scaling one.
Signed-off-by: Jan Beulich <jbeulich@suse.com>
Acked-by: Andrew Cooper <andrew.cooper3@citrix.com>
---
v9: Re-base.
v7: Reject EVEX.W=0.
v6: New.
--- a/tools/tests/x86_emulator/evex-disp8.c
+++ b/tools/tests/x86_emulator/evex-disp8.c
@@ -543,6 +543,11 @@ static const struct test avx512_bitalg_a
INSN(pshufbitqmb, 66, 0f38, 8f, vl, b, vl),
};
+static const struct test avx512_ifma_all[] = {
+ INSN(pmadd52huq, 66, 0f38, b5, vl, q, vl),
+ INSN(pmadd52luq, 66, 0f38, b4, vl, q, vl),
+};
+
static const struct test avx512_vbmi_all[] = {
INSN(permb, 66, 0f38, 8d, vl, b, vl),
INSN(permi2b, 66, 0f38, 75, vl, b, vl),
@@ -929,6 +934,7 @@ void evex_disp8_test(void *instr, struct
#define cpu_has_avx512pf cpu_has_avx512f
RUN(avx512pf, 512);
RUN(avx512_bitalg, all);
+ RUN(avx512_ifma, all);
RUN(avx512_vbmi, all);
RUN(avx512_vbmi2, all);
RUN(avx512_vpopcntdq, all);
--- a/tools/tests/x86_emulator/x86-emulate.h
+++ b/tools/tests/x86_emulator/x86-emulate.h
@@ -137,6 +137,7 @@ static inline bool xcr0_mask(uint64_t ma
#define cpu_has_bmi2 cp.feat.bmi2
#define cpu_has_avx512f (cp.feat.avx512f && xcr0_mask(0xe6))
#define cpu_has_avx512dq (cp.feat.avx512dq && xcr0_mask(0xe6))
+#define cpu_has_avx512_ifma (cp.feat.avx512_ifma && xcr0_mask(0xe6))
#define cpu_has_avx512er (cp.feat.avx512er && xcr0_mask(0xe6))
#define cpu_has_avx512cd (cp.feat.avx512cd && xcr0_mask(0xe6))
#define cpu_has_avx512bw (cp.feat.avx512bw && xcr0_mask(0xe6))
--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -521,6 +521,7 @@ static const struct ext0f38_table {
[0xad] = { .simd_size = simd_scalar_vexw, .d8s = d8s_dq },
[0xae] = { .simd_size = simd_packed_fp, .d8s = d8s_vl },
[0xaf] = { .simd_size = simd_scalar_vexw, .d8s = d8s_dq },
+ [0xb4 ... 0xb5] = { .simd_size = simd_packed_int, .d8s = d8s_vl },
[0xb6 ... 0xb8] = { .simd_size = simd_packed_fp, .d8s = d8s_vl },
[0xb9] = { .simd_size = simd_scalar_vexw, .d8s = d8s_dq },
[0xba] = { .simd_size = simd_packed_fp, .d8s = d8s_vl },
@@ -1875,6 +1876,7 @@ in_protmode(
#define vcpu_has_rdseed() (ctxt->cpuid->feat.rdseed)
#define vcpu_has_adx() (ctxt->cpuid->feat.adx)
#define vcpu_has_smap() (ctxt->cpuid->feat.smap)
+#define vcpu_has_avx512_ifma() (ctxt->cpuid->feat.avx512_ifma)
#define vcpu_has_clflushopt() (ctxt->cpuid->feat.clflushopt)
#define vcpu_has_clwb() (ctxt->cpuid->feat.clwb)
#define vcpu_has_avx512pf() (ctxt->cpuid->feat.avx512pf)
@@ -9455,6 +9457,12 @@ x86_emulate(
break;
}
+ case X86EMUL_OPC_EVEX_66(0x0f38, 0xb4): /* vpmadd52luq [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+ case X86EMUL_OPC_EVEX_66(0x0f38, 0xb5): /* vpmadd52huq [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+ host_and_vcpu_must_have(avx512_ifma);
+ generate_exception_if(!evex.w, EXC_UD);
+ goto avx512f_no_sae;
+
case X86EMUL_OPC_EVEX_66(0x0f38, 0xc6):
case X86EMUL_OPC_EVEX_66(0x0f38, 0xc7):
{
--- a/xen/include/asm-x86/cpufeature.h
+++ b/xen/include/asm-x86/cpufeature.h
@@ -101,6 +101,7 @@
#define cpu_has_avx512dq boot_cpu_has(X86_FEATURE_AVX512DQ)
#define cpu_has_rdseed boot_cpu_has(X86_FEATURE_RDSEED)
#define cpu_has_smap boot_cpu_has(X86_FEATURE_SMAP)
+#define cpu_has_avx512_ifma boot_cpu_has(X86_FEATURE_AVX512_IFMA)
#define cpu_has_avx512er boot_cpu_has(X86_FEATURE_AVX512ER)
#define cpu_has_avx512cd boot_cpu_has(X86_FEATURE_AVX512CD)
#define cpu_has_sha boot_cpu_has(X86_FEATURE_SHA)
--- a/xen/include/public/arch-x86/cpufeatureset.h
+++ b/xen/include/public/arch-x86/cpufeatureset.h
@@ -212,7 +212,7 @@ XEN_CPUFEATURE(AVX512DQ, 5*32+17) /
XEN_CPUFEATURE(RDSEED, 5*32+18) /*A RDSEED instruction */
XEN_CPUFEATURE(ADX, 5*32+19) /*A ADCX, ADOX instructions */
XEN_CPUFEATURE(SMAP, 5*32+20) /*S Supervisor Mode Access Prevention */
-XEN_CPUFEATURE(AVX512IFMA, 5*32+21) /*A AVX-512 Integer Fused Multiply Add */
+XEN_CPUFEATURE(AVX512_IFMA, 5*32+21) /*A AVX-512 Integer Fused Multiply Add */
XEN_CPUFEATURE(CLFLUSHOPT, 5*32+23) /*A CLFLUSHOPT instruction */
XEN_CPUFEATURE(CLWB, 5*32+24) /*A CLWB instruction */
XEN_CPUFEATURE(AVX512PF, 5*32+26) /*A AVX-512 Prefetch Instructions */
--- a/xen/tools/gen-cpuid.py
+++ b/xen/tools/gen-cpuid.py
@@ -261,7 +261,7 @@ def crunch_numbers(state):
# (which in practice depends on the EVEX prefix to encode) as well
# as mask registers, and the instructions themselves. All further
# AVX512 features are built on top of AVX512F
- AVX512F: [AVX512DQ, AVX512IFMA, AVX512PF, AVX512ER, AVX512CD,
+ AVX512F: [AVX512DQ, AVX512_IFMA, AVX512PF, AVX512ER, AVX512CD,
AVX512BW, AVX512VL, AVX512_4VNNIW, AVX512_4FMAPS,
AVX512_VPOPCNTDQ],
_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel
^ permalink raw reply [flat|nested] 15+ messages in thread
* [Xen-devel] [PATCH v10 03/13] x86emul: support remaining AVX512_VBMI2 insns
2019-07-17 6:27 [Xen-devel] [PATCH v10 00/13] x86emul: remaining AVX512 support Jan Beulich
2019-07-17 6:33 ` [Xen-devel] [PATCH v10 01/13] x86emul: support of AVX512* population count insns Jan Beulich
2019-07-17 6:34 ` [Xen-devel] [PATCH v10 02/13] x86emul: support of AVX512_IFMA insns Jan Beulich
@ 2019-07-17 6:34 ` Jan Beulich
2019-07-17 6:34 ` [Xen-devel] [PATCH v10 04/13] x86emul: support AVX512_4FMAPS insns Jan Beulich
` (9 subsequent siblings)
12 siblings, 0 replies; 15+ messages in thread
From: Jan Beulich @ 2019-07-17 6:34 UTC (permalink / raw)
To: xen-devel; +Cc: Andrew Cooper, Wei Liu, Roger Pau Monne
As in a few cases before, since the insns here and in particular their
memory access patterns follow the usual scheme, I didn't think it was
necessary to add a contrived test specifically for them, beyond the
Disp8 scaling one.
Signed-off-by: Jan Beulich <jbeulich@suse.com>
Acked-by: Andrew Cooper <andrew.cooper3@citrix.com>
---
v7: Re-base over change earlier in the series.
v6: New.
--- a/tools/tests/x86_emulator/evex-disp8.c
+++ b/tools/tests/x86_emulator/evex-disp8.c
@@ -558,6 +558,14 @@ static const struct test avx512_vbmi_all
static const struct test avx512_vbmi2_all[] = {
INSN(pcompress, 66, 0f38, 63, vl, bw, el),
INSN(pexpand, 66, 0f38, 62, vl, bw, el),
+ INSN(pshld, 66, 0f3a, 71, vl, dq, vl),
+ INSN(pshldv, 66, 0f38, 71, vl, dq, vl),
+ INSN(pshldvw, 66, 0f38, 70, vl, w, vl),
+ INSN(pshldw, 66, 0f3a, 70, vl, w, vl),
+ INSN(pshrd, 66, 0f3a, 73, vl, dq, vl),
+ INSN(pshrdv, 66, 0f38, 73, vl, dq, vl),
+ INSN(pshrdvw, 66, 0f38, 72, vl, w, vl),
+ INSN(pshrdw, 66, 0f3a, 72, vl, w, vl),
};
static const struct test avx512_vpopcntdq_all[] = {
--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -487,6 +487,7 @@ static const struct ext0f38_table {
[0x62] = { .simd_size = simd_packed_int, .two_op = 1, .d8s = d8s_bw },
[0x63] = { .simd_size = simd_packed_int, .to_mem = 1, .two_op = 1, .d8s = d8s_bw },
[0x64 ... 0x66] = { .simd_size = simd_packed_int, .d8s = d8s_vl },
+ [0x70 ... 0x73] = { .simd_size = simd_packed_int, .d8s = d8s_vl },
[0x75 ... 0x76] = { .simd_size = simd_packed_int, .d8s = d8s_vl },
[0x77] = { .simd_size = simd_packed_fp, .d8s = d8s_vl },
[0x78] = { .simd_size = simd_other, .two_op = 1 },
@@ -611,6 +612,7 @@ static const struct ext0f3a_table {
[0x6a ... 0x6b] = { .simd_size = simd_scalar_opc, .four_op = 1 },
[0x6c ... 0x6d] = { .simd_size = simd_packed_fp, .four_op = 1 },
[0x6e ... 0x6f] = { .simd_size = simd_scalar_opc, .four_op = 1 },
+ [0x70 ... 0x73] = { .simd_size = simd_packed_int, .d8s = d8s_vl },
[0x78 ... 0x79] = { .simd_size = simd_packed_fp, .four_op = 1 },
[0x7a ... 0x7b] = { .simd_size = simd_scalar_opc, .four_op = 1 },
[0x7c ... 0x7d] = { .simd_size = simd_packed_fp, .four_op = 1 },
@@ -8969,6 +8971,16 @@ x86_emulate(
}
goto simd_zmm;
+ case X86EMUL_OPC_EVEX_66(0x0f38, 0x70): /* vpshldvw [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+ case X86EMUL_OPC_EVEX_66(0x0f38, 0x72): /* vpshrdvw [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+ generate_exception_if(!evex.w, EXC_UD);
+ elem_bytes = 2;
+ /* fall through */
+ case X86EMUL_OPC_EVEX_66(0x0f38, 0x71): /* vpshldv{d,q} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+ case X86EMUL_OPC_EVEX_66(0x0f38, 0x73): /* vpshrdv{d,q} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+ host_and_vcpu_must_have(avx512_vbmi2);
+ goto avx512f_no_sae;
+
case X86EMUL_OPC_EVEX_66(0x0f38, 0x75): /* vpermi2{b,w} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
case X86EMUL_OPC_EVEX_66(0x0f38, 0x7d): /* vpermt2{b,w} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
case X86EMUL_OPC_EVEX_66(0x0f38, 0x8d): /* vperm{b,w} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
@@ -10281,6 +10293,16 @@ x86_emulate(
avx512_vlen_check(true);
goto simd_imm8_zmm;
+ case X86EMUL_OPC_EVEX_66(0x0f3a, 0x70): /* vpshldw $imm8,[xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+ case X86EMUL_OPC_EVEX_66(0x0f3a, 0x72): /* vpshrdw $imm8,[xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+ generate_exception_if(!evex.w, EXC_UD);
+ elem_bytes = 2;
+ /* fall through */
+ case X86EMUL_OPC_EVEX_66(0x0f3a, 0x71): /* vpshld{d,q} $imm8,[xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+ case X86EMUL_OPC_EVEX_66(0x0f3a, 0x73): /* vpshrd{d,q} $imm8,[xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+ host_and_vcpu_must_have(avx512_vbmi2);
+ goto avx512f_imm8_no_sae;
+
case X86EMUL_OPC(0x0f3a, 0xcc): /* sha1rnds4 $imm8,xmm/m128,xmm */
host_and_vcpu_must_have(sha);
op_bytes = 16;
_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel
^ permalink raw reply [flat|nested] 15+ messages in thread
* [Xen-devel] [PATCH v10 04/13] x86emul: support AVX512_4FMAPS insns
2019-07-17 6:27 [Xen-devel] [PATCH v10 00/13] x86emul: remaining AVX512 support Jan Beulich
` (2 preceding siblings ...)
2019-07-17 6:34 ` [Xen-devel] [PATCH v10 03/13] x86emul: support remaining AVX512_VBMI2 insns Jan Beulich
@ 2019-07-17 6:34 ` Jan Beulich
2019-07-17 6:35 ` [Xen-devel] [PATCH v10 05/13] x86emul: support AVX512_4VNNIW insns Jan Beulich
` (8 subsequent siblings)
12 siblings, 0 replies; 15+ messages in thread
From: Jan Beulich @ 2019-07-17 6:34 UTC (permalink / raw)
To: xen-devel; +Cc: Andrew Cooper, Wei Liu, Roger Pau Monne
A decoder adjustment is needed here because of the current sharing of
table entries between different (implied) opcode prefixes: The same
major opcodes are used for vfmsub{132,213}{p,s}{s,d}, which have a
different memory operand size and different Disp8 scaling.
Signed-off-by: Jan Beulich <jbeulich@suse.com>
Acked-by: Andrew Cooper <andrew.cooper3@citrix.com>
---
v9: Re-base. Explain need for decoder special case.
v8: Correct vcpu_has_*() insertion point.
v7: Re-base.
v6: New.
--- a/tools/tests/x86_emulator/evex-disp8.c
+++ b/tools/tests/x86_emulator/evex-disp8.c
@@ -538,6 +538,13 @@ static const struct test avx512pf_512[]
INSNX(scatterpf1q, 66, 0f38, c7, 6, vl, sd, el),
};
+static const struct test avx512_4fmaps_512[] = {
+ INSN(4fmaddps, f2, 0f38, 9a, el_4, d, vl),
+ INSN(4fmaddss, f2, 0f38, 9b, el_4, d, vl),
+ INSN(4fnmaddps, f2, 0f38, aa, el_4, d, vl),
+ INSN(4fnmaddss, f2, 0f38, ab, el_4, d, vl),
+};
+
static const struct test avx512_bitalg_all[] = {
INSN(popcnt, 66, 0f38, 54, vl, bw, vl),
INSN(pshufbitqmb, 66, 0f38, 8f, vl, b, vl),
@@ -941,6 +948,7 @@ void evex_disp8_test(void *instr, struct
RUN(avx512er, 512);
#define cpu_has_avx512pf cpu_has_avx512f
RUN(avx512pf, 512);
+ RUN(avx512_4fmaps, 512);
RUN(avx512_bitalg, all);
RUN(avx512_ifma, all);
RUN(avx512_vbmi, all);
--- a/tools/tests/x86_emulator/test_x86_emulator.c
+++ b/tools/tests/x86_emulator/test_x86_emulator.c
@@ -4274,6 +4274,81 @@ int main(int argc, char **argv)
}
#endif
+ printf("%-40s", "Testing v4fmaddps 32(%ecx),%zmm4,%zmm4{%k5}...");
+ if ( stack_exec && cpu_has_avx512_4fmaps )
+ {
+ decl_insn(v4fmaddps);
+ static const struct {
+ float f[16];
+ } in = {{
+ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
+ }}, out = {{
+ 1 + 1 * 9 + 2 * 10 + 3 * 11 + 4 * 12,
+ 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 16 + 16 * 9 + 17 * 10 + 18 * 11 + 19 * 12
+ }};
+
+ asm volatile ( "vmovups %1, %%zmm4\n\t"
+ "vbroadcastss %%xmm4, %%zmm7\n\t"
+ "vaddps %%zmm4, %%zmm7, %%zmm5\n\t"
+ "vaddps %%zmm5, %%zmm7, %%zmm6\n\t"
+ "vaddps %%zmm6, %%zmm7, %%zmm7\n\t"
+ "kmovw %2, %%k5\n"
+ put_insn(v4fmaddps,
+ "v4fmaddps 32(%0), %%zmm4, %%zmm4%{%%k5%}")
+ :: "c" (NULL), "m" (in), "rmk" (0x8001) );
+
+ set_insn(v4fmaddps);
+ regs.ecx = (unsigned long)∈
+ rc = x86_emulate(&ctxt, &emulops);
+ if ( rc != X86EMUL_OKAY || !check_eip(v4fmaddps) )
+ goto fail;
+
+ asm ( "vcmpeqps %1, %%zmm4, %%k0\n\t"
+ "kmovw %%k0, %0" : "=g" (rc) : "m" (out) );
+ if ( rc != 0xffff )
+ goto fail;
+ printf("okay\n");
+ }
+ else
+ printf("skipped\n");
+
+ printf("%-40s", "Testing v4fnmaddss 16(%edx),%zmm4,%zmm4{%k3}...");
+ if ( stack_exec && cpu_has_avx512_4fmaps )
+ {
+ decl_insn(v4fnmaddss);
+ static const struct {
+ float f[16];
+ } in = {{
+ 1, 2, 3, 4, 5, 6, 7, 8
+ }}, out = {{
+ 1 - 1 * 5 - 2 * 6 - 3 * 7 - 4 * 8, 2, 3, 4
+ }};
+
+ asm volatile ( "vmovups %1, %%xmm4\n\t"
+ "vaddss %%xmm4, %%xmm4, %%xmm5\n\t"
+ "vaddss %%xmm5, %%xmm4, %%xmm6\n\t"
+ "vaddss %%xmm6, %%xmm4, %%xmm7\n\t"
+ "kmovw %2, %%k3\n"
+ put_insn(v4fnmaddss,
+ "v4fnmaddss 16(%0), %%xmm4, %%xmm4%{%%k3%}")
+ :: "d" (NULL), "m" (in), "rmk" (1) );
+
+ set_insn(v4fnmaddss);
+ regs.edx = (unsigned long)∈
+ rc = x86_emulate(&ctxt, &emulops);
+ if ( rc != X86EMUL_OKAY || !check_eip(v4fnmaddss) )
+ goto fail;
+
+ asm ( "vcmpeqps %1, %%zmm4, %%k0\n\t"
+ "kmovw %%k0, %0" : "=g" (rc) : "m" (out) );
+ if ( rc != 0xffff )
+ goto fail;
+ printf("okay\n");
+ }
+ else
+ printf("skipped\n");
+
#undef decl_insn
#undef put_insn
#undef set_insn
--- a/tools/tests/x86_emulator/x86-emulate.h
+++ b/tools/tests/x86_emulator/x86-emulate.h
@@ -146,6 +146,7 @@ static inline bool xcr0_mask(uint64_t ma
#define cpu_has_avx512_vbmi2 (cp.feat.avx512_vbmi2 && xcr0_mask(0xe6))
#define cpu_has_avx512_bitalg (cp.feat.avx512_bitalg && xcr0_mask(0xe6))
#define cpu_has_avx512_vpopcntdq (cp.feat.avx512_vpopcntdq && xcr0_mask(0xe6))
+#define cpu_has_avx512_4fmaps (cp.feat.avx512_4fmaps && xcr0_mask(0xe6))
#define cpu_has_xgetbv1 (cpu_has_xsave && cp.xstate.xgetbv1)
--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -1892,6 +1892,7 @@ in_protmode(
#define vcpu_has_avx512_bitalg() (ctxt->cpuid->feat.avx512_bitalg)
#define vcpu_has_avx512_vpopcntdq() (ctxt->cpuid->feat.avx512_vpopcntdq)
#define vcpu_has_rdpid() (ctxt->cpuid->feat.rdpid)
+#define vcpu_has_avx512_4fmaps() (ctxt->cpuid->feat.avx512_4fmaps)
#define vcpu_must_have(feat) \
generate_exception_if(!vcpu_has_##feat(), EXC_UD)
@@ -3173,6 +3174,18 @@ x86_decode(
state);
state->simd_size = simd_other;
}
+
+ switch ( b )
+ {
+ /* v4f{,n}madd{p,s}s need special casing */
+ case 0x9a: case 0x9b: case 0xaa: case 0xab:
+ if ( evex.pfx == vex_f2 )
+ {
+ disp8scale = 4;
+ state->simd_size = simd_128;
+ }
+ break;
+ }
}
break;
@@ -9370,6 +9383,24 @@ x86_emulate(
avx512_vlen_check(true);
goto simd_zmm;
+ case X86EMUL_OPC_EVEX_F2(0x0f38, 0x9a): /* v4fmaddps m128,zmm+3,zmm{k} */
+ case X86EMUL_OPC_EVEX_F2(0x0f38, 0xaa): /* v4fnmaddps m128,zmm+3,zmm{k} */
+ host_and_vcpu_must_have(avx512_4fmaps);
+ generate_exception_if((ea.type != OP_MEM || evex.w || evex.brs ||
+ evex.lr != 2),
+ EXC_UD);
+ op_mask = op_mask & 0xffff ? 0xf : 0;
+ goto simd_zmm;
+
+ case X86EMUL_OPC_EVEX_F2(0x0f38, 0x9b): /* v4fmaddss m128,xmm+3,xmm{k} */
+ case X86EMUL_OPC_EVEX_F2(0x0f38, 0xab): /* v4fnmaddss m128,xmm+3,xmm{k} */
+ host_and_vcpu_must_have(avx512_4fmaps);
+ generate_exception_if((ea.type != OP_MEM || evex.w || evex.brs ||
+ evex.lr == 3),
+ EXC_UD);
+ op_mask = op_mask & 1 ? 0xf : 0;
+ goto simd_zmm;
+
case X86EMUL_OPC_EVEX_66(0x0f38, 0xa0): /* vpscatterd{d,q} [xyz]mm,mem{k} */
case X86EMUL_OPC_EVEX_66(0x0f38, 0xa1): /* vpscatterq{d,q} [xyz]mm,mem{k} */
case X86EMUL_OPC_EVEX_66(0x0f38, 0xa2): /* vscatterdp{s,d} [xyz]mm,mem{k} */
--- a/xen/include/asm-x86/cpufeature.h
+++ b/xen/include/asm-x86/cpufeature.h
@@ -119,6 +119,7 @@
#define cpu_has_itsc boot_cpu_has(X86_FEATURE_ITSC)
/* CPUID level 0x00000007:0.edx */
+#define cpu_has_avx512_4fmaps boot_cpu_has(X86_FEATURE_AVX512_4FMAPS)
#define cpu_has_tsx_force_abort boot_cpu_has(X86_FEATURE_TSX_FORCE_ABORT)
/* Synthesized. */
_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel
^ permalink raw reply [flat|nested] 15+ messages in thread
* [Xen-devel] [PATCH v10 05/13] x86emul: support AVX512_4VNNIW insns
2019-07-17 6:27 [Xen-devel] [PATCH v10 00/13] x86emul: remaining AVX512 support Jan Beulich
` (3 preceding siblings ...)
2019-07-17 6:34 ` [Xen-devel] [PATCH v10 04/13] x86emul: support AVX512_4FMAPS insns Jan Beulich
@ 2019-07-17 6:35 ` Jan Beulich
2019-07-17 6:35 ` [Xen-devel] [PATCH v10 06/13] x86emul: support AVX512_VNNI insns Jan Beulich
` (7 subsequent siblings)
12 siblings, 0 replies; 15+ messages in thread
From: Jan Beulich @ 2019-07-17 6:35 UTC (permalink / raw)
To: xen-devel; +Cc: Andrew Cooper, Wei Liu, Roger Pau Monne
As in a few cases before, since the insns here and in particular their
memory access patterns follow the AVX512_4FMAPS scheme, I didn't think
it was necessary to add contrived tests specifically for them, beyond
the Disp8 scaling ones.
Signed-off-by: Jan Beulich <jbeulich@suse.com>
Acked-by: Andrew Cooper <andrew.cooper3@citrix.com>
---
v9: Re-base.
v8: Correct vcpu_has_*() insertion point.
v7: Re-base.
v6: New.
--- a/tools/tests/x86_emulator/evex-disp8.c
+++ b/tools/tests/x86_emulator/evex-disp8.c
@@ -545,6 +545,11 @@ static const struct test avx512_4fmaps_5
INSN(4fnmaddss, f2, 0f38, ab, el_4, d, vl),
};
+static const struct test avx512_4vnniw_512[] = {
+ INSN(p4dpwssd, f2, 0f38, 52, el_4, d, vl),
+ INSN(p4dpwssds, f2, 0f38, 53, el_4, d, vl),
+};
+
static const struct test avx512_bitalg_all[] = {
INSN(popcnt, 66, 0f38, 54, vl, bw, vl),
INSN(pshufbitqmb, 66, 0f38, 8f, vl, b, vl),
@@ -949,6 +954,7 @@ void evex_disp8_test(void *instr, struct
#define cpu_has_avx512pf cpu_has_avx512f
RUN(avx512pf, 512);
RUN(avx512_4fmaps, 512);
+ RUN(avx512_4vnniw, 512);
RUN(avx512_bitalg, all);
RUN(avx512_ifma, all);
RUN(avx512_vbmi, all);
--- a/tools/tests/x86_emulator/x86-emulate.h
+++ b/tools/tests/x86_emulator/x86-emulate.h
@@ -146,6 +146,7 @@ static inline bool xcr0_mask(uint64_t ma
#define cpu_has_avx512_vbmi2 (cp.feat.avx512_vbmi2 && xcr0_mask(0xe6))
#define cpu_has_avx512_bitalg (cp.feat.avx512_bitalg && xcr0_mask(0xe6))
#define cpu_has_avx512_vpopcntdq (cp.feat.avx512_vpopcntdq && xcr0_mask(0xe6))
+#define cpu_has_avx512_4vnniw (cp.feat.avx512_4vnniw && xcr0_mask(0xe6))
#define cpu_has_avx512_4fmaps (cp.feat.avx512_4fmaps && xcr0_mask(0xe6))
#define cpu_has_xgetbv1 (cpu_has_xsave && cp.xstate.xgetbv1)
--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -479,6 +479,7 @@ static const struct ext0f38_table {
[0x4d] = { .simd_size = simd_scalar_vexw, .d8s = d8s_dq },
[0x4e] = { .simd_size = simd_packed_fp, .two_op = 1, .d8s = d8s_vl },
[0x4f] = { .simd_size = simd_scalar_vexw, .d8s = d8s_dq },
+ [0x52 ... 0x53] = { .simd_size = simd_128, .d8s = 4 },
[0x54 ... 0x55] = { .simd_size = simd_packed_int, .two_op = 1, .d8s = d8s_vl },
[0x58] = { .simd_size = simd_other, .two_op = 1, .d8s = 2 },
[0x59] = { .simd_size = simd_other, .two_op = 1, .d8s = 3 },
@@ -1892,6 +1893,7 @@ in_protmode(
#define vcpu_has_avx512_bitalg() (ctxt->cpuid->feat.avx512_bitalg)
#define vcpu_has_avx512_vpopcntdq() (ctxt->cpuid->feat.avx512_vpopcntdq)
#define vcpu_has_rdpid() (ctxt->cpuid->feat.rdpid)
+#define vcpu_has_avx512_4vnniw() (ctxt->cpuid->feat.avx512_4vnniw)
#define vcpu_has_avx512_4fmaps() (ctxt->cpuid->feat.avx512_4fmaps)
#define vcpu_must_have(feat) \
@@ -8920,6 +8922,15 @@ x86_emulate(
generate_exception_if(vex.l, EXC_UD);
goto simd_0f_avx;
+ case X86EMUL_OPC_EVEX_F2(0x0f38, 0x52): /* vp4dpwssd m128,zmm+3,zmm{k} */
+ case X86EMUL_OPC_EVEX_F2(0x0f38, 0x53): /* vp4dpwssds m128,zmm+3,zmm{k} */
+ host_and_vcpu_must_have(avx512_4vnniw);
+ generate_exception_if((ea.type != OP_MEM || evex.w || evex.brs ||
+ evex.lr != 2),
+ EXC_UD);
+ op_mask = op_mask & 0xffff ? 0xf : 0;
+ goto simd_zmm;
+
case X86EMUL_OPC_EVEX_66(0x0f38, 0x8f): /* vpshufbitqmb [xyz]mm/mem,[xyz]mm,k{k} */
generate_exception_if(evex.w || !evex.r || !evex.R || evex.z, EXC_UD);
/* fall through */
--- a/xen/include/asm-x86/cpufeature.h
+++ b/xen/include/asm-x86/cpufeature.h
@@ -119,6 +119,7 @@
#define cpu_has_itsc boot_cpu_has(X86_FEATURE_ITSC)
/* CPUID level 0x00000007:0.edx */
+#define cpu_has_avx512_4vnniw boot_cpu_has(X86_FEATURE_AVX512_4VNNIW)
#define cpu_has_avx512_4fmaps boot_cpu_has(X86_FEATURE_AVX512_4FMAPS)
#define cpu_has_tsx_force_abort boot_cpu_has(X86_FEATURE_TSX_FORCE_ABORT)
_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel
^ permalink raw reply [flat|nested] 15+ messages in thread
* [Xen-devel] [PATCH v10 06/13] x86emul: support AVX512_VNNI insns
2019-07-17 6:27 [Xen-devel] [PATCH v10 00/13] x86emul: remaining AVX512 support Jan Beulich
` (4 preceding siblings ...)
2019-07-17 6:35 ` [Xen-devel] [PATCH v10 05/13] x86emul: support AVX512_4VNNIW insns Jan Beulich
@ 2019-07-17 6:35 ` Jan Beulich
2019-07-17 6:35 ` [Xen-devel] [PATCH v10 07/13] x86emul: support VPCLMULQDQ insns Jan Beulich
` (6 subsequent siblings)
12 siblings, 0 replies; 15+ messages in thread
From: Jan Beulich @ 2019-07-17 6:35 UTC (permalink / raw)
To: xen-devel; +Cc: Andrew Cooper, Wei Liu, Roger Pau Monne
Along the lines of the 4FMAPS case, convert the 4VNNIW-based table
entries to a decoder adjustment. Because of the current sharing of table
entries between different (implied) opcode prefixes and with the same
major opcodes being used for vp4dpwssd{,s}, which have a different
memory operand size and different Disp8 scaling, the pre-existing table
entries get converted to a decoder override. The table entries will now
represent the insns here, in line with other table entries preferably
representing the prefix-66 insns.
As in a few cases before, since the insns here and in particular their
memory access patterns follow the usual scheme, I didn't think it was
necessary to add a contrived test specifically for them, beyond the
Disp8 scaling one.
Signed-off-by: Jan Beulich <jbeulich@suse.com>
Acked-by: Andrew Cooper <andrew.cooper3@citrix.com>
---
v9: Re-base. Explain need for decoder special case.
v8: Re-base.
v7: New.
--- a/tools/tests/x86_emulator/evex-disp8.c
+++ b/tools/tests/x86_emulator/evex-disp8.c
@@ -580,6 +580,13 @@ static const struct test avx512_vbmi2_al
INSN(pshrdw, 66, 0f3a, 72, vl, w, vl),
};
+static const struct test avx512_vnni_all[] = {
+ INSN(pdpbusd, 66, 0f38, 50, vl, d, vl),
+ INSN(pdpbusds, 66, 0f38, 51, vl, d, vl),
+ INSN(pdpwssd, 66, 0f38, 52, vl, d, vl),
+ INSN(pdpwssds, 66, 0f38, 53, vl, d, vl),
+};
+
static const struct test avx512_vpopcntdq_all[] = {
INSN(popcnt, 66, 0f38, 55, vl, dq, vl)
};
@@ -959,5 +966,6 @@ void evex_disp8_test(void *instr, struct
RUN(avx512_ifma, all);
RUN(avx512_vbmi, all);
RUN(avx512_vbmi2, all);
+ RUN(avx512_vnni, all);
RUN(avx512_vpopcntdq, all);
}
--- a/tools/tests/x86_emulator/x86-emulate.h
+++ b/tools/tests/x86_emulator/x86-emulate.h
@@ -144,6 +144,7 @@ static inline bool xcr0_mask(uint64_t ma
#define cpu_has_avx512vl (cp.feat.avx512vl && xcr0_mask(0xe6))
#define cpu_has_avx512_vbmi (cp.feat.avx512_vbmi && xcr0_mask(0xe6))
#define cpu_has_avx512_vbmi2 (cp.feat.avx512_vbmi2 && xcr0_mask(0xe6))
+#define cpu_has_avx512_vnni (cp.feat.avx512_vnni && xcr0_mask(0xe6))
#define cpu_has_avx512_bitalg (cp.feat.avx512_bitalg && xcr0_mask(0xe6))
#define cpu_has_avx512_vpopcntdq (cp.feat.avx512_vpopcntdq && xcr0_mask(0xe6))
#define cpu_has_avx512_4vnniw (cp.feat.avx512_4vnniw && xcr0_mask(0xe6))
--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -479,7 +479,7 @@ static const struct ext0f38_table {
[0x4d] = { .simd_size = simd_scalar_vexw, .d8s = d8s_dq },
[0x4e] = { .simd_size = simd_packed_fp, .two_op = 1, .d8s = d8s_vl },
[0x4f] = { .simd_size = simd_scalar_vexw, .d8s = d8s_dq },
- [0x52 ... 0x53] = { .simd_size = simd_128, .d8s = 4 },
+ [0x50 ... 0x53] = { .simd_size = simd_packed_int, .d8s = d8s_vl },
[0x54 ... 0x55] = { .simd_size = simd_packed_int, .two_op = 1, .d8s = d8s_vl },
[0x58] = { .simd_size = simd_other, .two_op = 1, .d8s = 2 },
[0x59] = { .simd_size = simd_other, .two_op = 1, .d8s = 3 },
@@ -1890,6 +1890,7 @@ in_protmode(
#define vcpu_has_avx512vl() (ctxt->cpuid->feat.avx512vl)
#define vcpu_has_avx512_vbmi() (ctxt->cpuid->feat.avx512_vbmi)
#define vcpu_has_avx512_vbmi2() (ctxt->cpuid->feat.avx512_vbmi2)
+#define vcpu_has_avx512_vnni() (ctxt->cpuid->feat.avx512_vnni)
#define vcpu_has_avx512_bitalg() (ctxt->cpuid->feat.avx512_bitalg)
#define vcpu_has_avx512_vpopcntdq() (ctxt->cpuid->feat.avx512_vpopcntdq)
#define vcpu_has_rdpid() (ctxt->cpuid->feat.rdpid)
@@ -3179,6 +3180,8 @@ x86_decode(
switch ( b )
{
+ /* vp4dpwssd{,s} need special casing */
+ case 0x52: case 0x53:
/* v4f{,n}madd{p,s}s need special casing */
case 0x9a: case 0x9b: case 0xaa: case 0xab:
if ( evex.pfx == vex_f2 )
@@ -9394,6 +9397,14 @@ x86_emulate(
avx512_vlen_check(true);
goto simd_zmm;
+ case X86EMUL_OPC_EVEX_66(0x0f38, 0x50): /* vpdpbusd [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+ case X86EMUL_OPC_EVEX_66(0x0f38, 0x51): /* vpdpbusds [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+ case X86EMUL_OPC_EVEX_66(0x0f38, 0x52): /* vpdpwssd [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+ case X86EMUL_OPC_EVEX_66(0x0f38, 0x53): /* vpdpwssds [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+ host_and_vcpu_must_have(avx512_vnni);
+ generate_exception_if(evex.w, EXC_UD);
+ goto avx512f_no_sae;
+
case X86EMUL_OPC_EVEX_F2(0x0f38, 0x9a): /* v4fmaddps m128,zmm+3,zmm{k} */
case X86EMUL_OPC_EVEX_F2(0x0f38, 0xaa): /* v4fnmaddps m128,zmm+3,zmm{k} */
host_and_vcpu_must_have(avx512_4fmaps);
--- a/xen/include/asm-x86/cpufeature.h
+++ b/xen/include/asm-x86/cpufeature.h
@@ -111,6 +111,7 @@
/* CPUID level 0x00000007:0.ecx */
#define cpu_has_avx512_vbmi boot_cpu_has(X86_FEATURE_AVX512_VBMI)
#define cpu_has_avx512_vbmi2 boot_cpu_has(X86_FEATURE_AVX512_VBMI2)
+#define cpu_has_avx512_vnni boot_cpu_has(X86_FEATURE_AVX512_VNNI)
#define cpu_has_avx512_bitalg boot_cpu_has(X86_FEATURE_AVX512_BITALG)
#define cpu_has_avx512_vpopcntdq boot_cpu_has(X86_FEATURE_AVX512_VPOPCNTDQ)
#define cpu_has_rdpid boot_cpu_has(X86_FEATURE_RDPID)
--- a/xen/include/public/arch-x86/cpufeatureset.h
+++ b/xen/include/public/arch-x86/cpufeatureset.h
@@ -229,6 +229,7 @@ XEN_CPUFEATURE(UMIP, 6*32+ 2) /
XEN_CPUFEATURE(PKU, 6*32+ 3) /*H Protection Keys for Userspace */
XEN_CPUFEATURE(OSPKE, 6*32+ 4) /*! OS Protection Keys Enable */
XEN_CPUFEATURE(AVX512_VBMI2, 6*32+ 6) /*A Additional AVX-512 Vector Byte Manipulation Instrs */
+XEN_CPUFEATURE(AVX512_VNNI, 6*32+11) /*A Vector Neural Network Instrs */
XEN_CPUFEATURE(AVX512_BITALG, 6*32+12) /*A Support for VPOPCNT[B,W] and VPSHUFBITQMB */
XEN_CPUFEATURE(AVX512_VPOPCNTDQ, 6*32+14) /*A POPCNT for vectors of DW/QW */
XEN_CPUFEATURE(RDPID, 6*32+22) /*A RDPID instruction */
--- a/xen/tools/gen-cpuid.py
+++ b/xen/tools/gen-cpuid.py
@@ -263,7 +263,7 @@ def crunch_numbers(state):
# AVX512 features are built on top of AVX512F
AVX512F: [AVX512DQ, AVX512_IFMA, AVX512PF, AVX512ER, AVX512CD,
AVX512BW, AVX512VL, AVX512_4VNNIW, AVX512_4FMAPS,
- AVX512_VPOPCNTDQ],
+ AVX512_VNNI, AVX512_VPOPCNTDQ],
# AVX512 extensions acting on vectors of bytes/words are made
# dependents of AVX512BW (as to requiring wider than 16-bit mask
_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel
^ permalink raw reply [flat|nested] 15+ messages in thread
* [Xen-devel] [PATCH v10 07/13] x86emul: support VPCLMULQDQ insns
2019-07-17 6:27 [Xen-devel] [PATCH v10 00/13] x86emul: remaining AVX512 support Jan Beulich
` (5 preceding siblings ...)
2019-07-17 6:35 ` [Xen-devel] [PATCH v10 06/13] x86emul: support AVX512_VNNI insns Jan Beulich
@ 2019-07-17 6:35 ` Jan Beulich
2019-07-17 6:36 ` [Xen-devel] [PATCH v10 08/13] x86emul: support VAES insns Jan Beulich
` (5 subsequent siblings)
12 siblings, 0 replies; 15+ messages in thread
From: Jan Beulich @ 2019-07-17 6:35 UTC (permalink / raw)
To: xen-devel; +Cc: Andrew Cooper, Wei Liu, Roger Pau Monne
As to the feature dependency adjustment, while strictly speaking AVX is
a sufficient prereq (to have YMM registers), 256-bit vectors of integers
have got fully introduced with AVX2 only. Sadly gcc can't be used as a
reference here: They don't provide any AVX512-independent built-in at
all.
Along the lines of PCLMULQDQ, since the insns here and in particular
their memory access patterns follow the usual scheme, I didn't think it
was necessary to add a contrived test specifically for them, beyond the
Disp8 scaling one.
Signed-off-by: Jan Beulich <jbeulich@suse.com>
Acked-by: Andrew Cooper <andrew.cooper3@citrix.com>
---
v10: Re-base.
v9: Re-base. Make VPCLMULQDQ also depend on PCLMULQDQ.
v8: No need to set fault_suppression to false.
v7: New.
--- a/tools/tests/x86_emulator/evex-disp8.c
+++ b/tools/tests/x86_emulator/evex-disp8.c
@@ -591,6 +591,10 @@ static const struct test avx512_vpopcntd
INSN(popcnt, 66, 0f38, 55, vl, dq, vl)
};
+static const struct test vpclmulqdq_all[] = {
+ INSN(pclmulqdq, 66, 0f3a, 44, vl, q_nb, vl)
+};
+
static const unsigned char vl_all[] = { VL_512, VL_128, VL_256 };
static const unsigned char vl_128[] = { VL_128 };
static const unsigned char vl_no128[] = { VL_512, VL_256 };
@@ -968,4 +972,9 @@ void evex_disp8_test(void *instr, struct
RUN(avx512_vbmi2, all);
RUN(avx512_vnni, all);
RUN(avx512_vpopcntdq, all);
+
+ if ( cpu_has_avx512f )
+ {
+ RUN(vpclmulqdq, all);
+ }
}
--- a/tools/tests/x86_emulator/x86-emulate.h
+++ b/tools/tests/x86_emulator/x86-emulate.h
@@ -144,6 +144,7 @@ static inline bool xcr0_mask(uint64_t ma
#define cpu_has_avx512vl (cp.feat.avx512vl && xcr0_mask(0xe6))
#define cpu_has_avx512_vbmi (cp.feat.avx512_vbmi && xcr0_mask(0xe6))
#define cpu_has_avx512_vbmi2 (cp.feat.avx512_vbmi2 && xcr0_mask(0xe6))
+#define cpu_has_vpclmulqdq (cp.feat.vpclmulqdq && xcr0_mask(6))
#define cpu_has_avx512_vnni (cp.feat.avx512_vnni && xcr0_mask(0xe6))
#define cpu_has_avx512_bitalg (cp.feat.avx512_bitalg && xcr0_mask(0xe6))
#define cpu_has_avx512_vpopcntdq (cp.feat.avx512_vpopcntdq && xcr0_mask(0xe6))
--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -594,7 +594,7 @@ static const struct ext0f3a_table {
[0x3e ... 0x3f] = { .simd_size = simd_packed_int, .d8s = d8s_vl },
[0x40 ... 0x41] = { .simd_size = simd_packed_fp },
[0x42 ... 0x43] = { .simd_size = simd_packed_int, .d8s = d8s_vl },
- [0x44] = { .simd_size = simd_packed_int },
+ [0x44] = { .simd_size = simd_packed_int, .d8s = d8s_vl },
[0x46] = { .simd_size = simd_packed_int },
[0x48 ... 0x49] = { .simd_size = simd_packed_fp, .four_op = 1 },
[0x4a ... 0x4b] = { .simd_size = simd_packed_fp, .four_op = 1 },
@@ -1890,6 +1890,7 @@ in_protmode(
#define vcpu_has_avx512vl() (ctxt->cpuid->feat.avx512vl)
#define vcpu_has_avx512_vbmi() (ctxt->cpuid->feat.avx512_vbmi)
#define vcpu_has_avx512_vbmi2() (ctxt->cpuid->feat.avx512_vbmi2)
+#define vcpu_has_vpclmulqdq() (ctxt->cpuid->feat.vpclmulqdq)
#define vcpu_has_avx512_vnni() (ctxt->cpuid->feat.avx512_vnni)
#define vcpu_has_avx512_bitalg() (ctxt->cpuid->feat.avx512_bitalg)
#define vcpu_has_avx512_vpopcntdq() (ctxt->cpuid->feat.avx512_vpopcntdq)
@@ -10207,13 +10208,19 @@ x86_emulate(
goto opmask_shift_imm;
case X86EMUL_OPC_66(0x0f3a, 0x44): /* pclmulqdq $imm8,xmm/m128,xmm */
- case X86EMUL_OPC_VEX_66(0x0f3a, 0x44): /* vpclmulqdq $imm8,xmm/m128,xmm,xmm */
+ case X86EMUL_OPC_VEX_66(0x0f3a, 0x44): /* vpclmulqdq $imm8,{x,y}mm/mem,{x,y}mm,{x,y}mm */
host_and_vcpu_must_have(pclmulqdq);
if ( vex.opcx == vex_none )
goto simd_0f3a_common;
- generate_exception_if(vex.l, EXC_UD);
+ if ( vex.l )
+ host_and_vcpu_must_have(vpclmulqdq);
goto simd_0f_imm8_avx;
+ case X86EMUL_OPC_EVEX_66(0x0f3a, 0x44): /* vpclmulqdq $imm8,[xyz]mm/mem,[xyz]mm,[xyz]mm */
+ host_and_vcpu_must_have(vpclmulqdq);
+ generate_exception_if(evex.brs || evex.opmsk, EXC_UD);
+ goto avx512f_imm8_no_sae;
+
case X86EMUL_OPC_VEX_66(0x0f3a, 0x4a): /* vblendvps {x,y}mm,{x,y}mm/mem,{x,y}mm,{x,y}mm */
case X86EMUL_OPC_VEX_66(0x0f3a, 0x4b): /* vblendvpd {x,y}mm,{x,y}mm/mem,{x,y}mm,{x,y}mm */
generate_exception_if(vex.w, EXC_UD);
--- a/xen/include/asm-x86/cpufeature.h
+++ b/xen/include/asm-x86/cpufeature.h
@@ -111,6 +111,7 @@
/* CPUID level 0x00000007:0.ecx */
#define cpu_has_avx512_vbmi boot_cpu_has(X86_FEATURE_AVX512_VBMI)
#define cpu_has_avx512_vbmi2 boot_cpu_has(X86_FEATURE_AVX512_VBMI2)
+#define cpu_has_vpclmulqdq boot_cpu_has(X86_FEATURE_VPCLMULQDQ)
#define cpu_has_avx512_vnni boot_cpu_has(X86_FEATURE_AVX512_VNNI)
#define cpu_has_avx512_bitalg boot_cpu_has(X86_FEATURE_AVX512_BITALG)
#define cpu_has_avx512_vpopcntdq boot_cpu_has(X86_FEATURE_AVX512_VPOPCNTDQ)
--- a/xen/include/public/arch-x86/cpufeatureset.h
+++ b/xen/include/public/arch-x86/cpufeatureset.h
@@ -121,7 +121,7 @@ XEN_CPUFEATURE(PBE, 0*32+31) /
/* Intel-defined CPU features, CPUID level 0x00000001.ecx, word 1 */
XEN_CPUFEATURE(SSE3, 1*32+ 0) /*A Streaming SIMD Extensions-3 */
-XEN_CPUFEATURE(PCLMULQDQ, 1*32+ 1) /*A Carry-less mulitplication */
+XEN_CPUFEATURE(PCLMULQDQ, 1*32+ 1) /*A Carry-less multiplication */
XEN_CPUFEATURE(DTES64, 1*32+ 2) /* 64-bit Debug Store */
XEN_CPUFEATURE(MONITOR, 1*32+ 3) /* Monitor/Mwait support */
XEN_CPUFEATURE(DSCPL, 1*32+ 4) /* CPL Qualified Debug Store */
@@ -229,6 +229,7 @@ XEN_CPUFEATURE(UMIP, 6*32+ 2) /
XEN_CPUFEATURE(PKU, 6*32+ 3) /*H Protection Keys for Userspace */
XEN_CPUFEATURE(OSPKE, 6*32+ 4) /*! OS Protection Keys Enable */
XEN_CPUFEATURE(AVX512_VBMI2, 6*32+ 6) /*A Additional AVX-512 Vector Byte Manipulation Instrs */
+XEN_CPUFEATURE(VPCLMULQDQ, 6*32+10) /*A Vector Carry-less Multiplication Instrs */
XEN_CPUFEATURE(AVX512_VNNI, 6*32+11) /*A Vector Neural Network Instrs */
XEN_CPUFEATURE(AVX512_BITALG, 6*32+12) /*A Support for VPOPCNT[B,W] and VPSHUFBITQMB */
XEN_CPUFEATURE(AVX512_VPOPCNTDQ, 6*32+14) /*A POPCNT for vectors of DW/QW */
--- a/xen/tools/gen-cpuid.py
+++ b/xen/tools/gen-cpuid.py
@@ -254,8 +254,9 @@ def crunch_numbers(state):
# This is just the dependency between AVX512 and AVX2 of XSTATE
# feature flags. If want to use AVX512, AVX2 must be supported and
- # enabled.
- AVX2: [AVX512F],
+ # enabled. Certain later extensions, acting on 256-bit vectors of
+ # integers, better depend on AVX2 than AVX.
+ AVX2: [AVX512F, VPCLMULQDQ],
# AVX512F is taken to mean hardware support for 512bit registers
# (which in practice depends on the EVEX prefix to encode) as well
@@ -270,6 +271,10 @@ def crunch_numbers(state):
# registers), despite the SDM not formally making this connection.
AVX512BW: [AVX512_VBMI, AVX512_VBMI2, AVX512_BITALG, AVX512_BF16],
+ # Extensions with VEX/EVEX encodings keyed to a separate feature
+ # flag are made dependents of their respective legacy feature.
+ PCLMULQDQ: [VPCLMULQDQ],
+
# The features:
# * Single Thread Indirect Branch Predictors
# * Speculative Store Bypass Disable
_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel
^ permalink raw reply [flat|nested] 15+ messages in thread
* [Xen-devel] [PATCH v10 08/13] x86emul: support VAES insns
2019-07-17 6:27 [Xen-devel] [PATCH v10 00/13] x86emul: remaining AVX512 support Jan Beulich
` (6 preceding siblings ...)
2019-07-17 6:35 ` [Xen-devel] [PATCH v10 07/13] x86emul: support VPCLMULQDQ insns Jan Beulich
@ 2019-07-17 6:36 ` Jan Beulich
2019-07-17 6:36 ` [Xen-devel] [PATCH v10 09/13] x86emul: support GFNI insns Jan Beulich
` (4 subsequent siblings)
12 siblings, 0 replies; 15+ messages in thread
From: Jan Beulich @ 2019-07-17 6:36 UTC (permalink / raw)
To: xen-devel; +Cc: Andrew Cooper, Wei Liu, Roger Pau Monne
As to the feature dependency adjustment, just like for VPCLMULQDQ while
strictly speaking AVX is a sufficient prereq (to have YMM registers),
256-bit vectors of integers have got fully introduced with AVX2 only.
A new test case (also covering AESNI) will be added to the harness by a
subsequent patch.
Signed-off-by: Jan Beulich <jbeulich@suse.com>
Acked-by: Andrew Cooper <andrew.cooper3@citrix.com>
---
v9: Re-base. Make VAES also depend on AESNI.
v8: No need to set fault_suppression to false.
v7: New.
--- a/tools/tests/x86_emulator/evex-disp8.c
+++ b/tools/tests/x86_emulator/evex-disp8.c
@@ -591,6 +591,18 @@ static const struct test avx512_vpopcntd
INSN(popcnt, 66, 0f38, 55, vl, dq, vl)
};
+/*
+ * The uses of b in this table are simply (one of) the shortest form(s) of
+ * saying "no broadcast" without introducing a 128-bit granularity enumerator.
+ * Due to all of the insns being WIG, w, d_nb, and q_nb would all also fit.
+ */
+static const struct test vaes_all[] = {
+ INSN(aesdec, 66, 0f38, de, vl, b, vl),
+ INSN(aesdeclast, 66, 0f38, df, vl, b, vl),
+ INSN(aesenc, 66, 0f38, dc, vl, b, vl),
+ INSN(aesenclast, 66, 0f38, dd, vl, b, vl),
+};
+
static const struct test vpclmulqdq_all[] = {
INSN(pclmulqdq, 66, 0f3a, 44, vl, q_nb, vl)
};
@@ -975,6 +987,7 @@ void evex_disp8_test(void *instr, struct
if ( cpu_has_avx512f )
{
+ RUN(vaes, all);
RUN(vpclmulqdq, all);
}
}
--- a/tools/tests/x86_emulator/x86-emulate.h
+++ b/tools/tests/x86_emulator/x86-emulate.h
@@ -144,6 +144,7 @@ static inline bool xcr0_mask(uint64_t ma
#define cpu_has_avx512vl (cp.feat.avx512vl && xcr0_mask(0xe6))
#define cpu_has_avx512_vbmi (cp.feat.avx512_vbmi && xcr0_mask(0xe6))
#define cpu_has_avx512_vbmi2 (cp.feat.avx512_vbmi2 && xcr0_mask(0xe6))
+#define cpu_has_vaes (cp.feat.vaes && xcr0_mask(6))
#define cpu_has_vpclmulqdq (cp.feat.vpclmulqdq && xcr0_mask(6))
#define cpu_has_avx512_vnni (cp.feat.avx512_vnni && xcr0_mask(0xe6))
#define cpu_has_avx512_bitalg (cp.feat.avx512_bitalg && xcr0_mask(0xe6))
--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -541,7 +541,7 @@ static const struct ext0f38_table {
[0xcc] = { .simd_size = simd_packed_fp, .two_op = 1, .d8s = d8s_vl },
[0xcd] = { .simd_size = simd_scalar_vexw, .d8s = d8s_dq },
[0xdb] = { .simd_size = simd_packed_int, .two_op = 1 },
- [0xdc ... 0xdf] = { .simd_size = simd_packed_int },
+ [0xdc ... 0xdf] = { .simd_size = simd_packed_int, .d8s = d8s_vl },
[0xf0] = { .two_op = 1 },
[0xf1] = { .to_mem = 1, .two_op = 1 },
[0xf2 ... 0xf3] = {},
@@ -1890,6 +1890,7 @@ in_protmode(
#define vcpu_has_avx512vl() (ctxt->cpuid->feat.avx512vl)
#define vcpu_has_avx512_vbmi() (ctxt->cpuid->feat.avx512_vbmi)
#define vcpu_has_avx512_vbmi2() (ctxt->cpuid->feat.avx512_vbmi2)
+#define vcpu_has_vaes() (ctxt->cpuid->feat.vaes)
#define vcpu_has_vpclmulqdq() (ctxt->cpuid->feat.vpclmulqdq)
#define vcpu_has_avx512_vnni() (ctxt->cpuid->feat.avx512_vnni)
#define vcpu_has_avx512_bitalg() (ctxt->cpuid->feat.avx512_bitalg)
@@ -8911,13 +8912,9 @@ x86_emulate(
case X86EMUL_OPC_66(0x0f38, 0xdb): /* aesimc xmm/m128,xmm */
case X86EMUL_OPC_VEX_66(0x0f38, 0xdb): /* vaesimc xmm/m128,xmm */
case X86EMUL_OPC_66(0x0f38, 0xdc): /* aesenc xmm/m128,xmm,xmm */
- case X86EMUL_OPC_VEX_66(0x0f38, 0xdc): /* vaesenc xmm/m128,xmm,xmm */
case X86EMUL_OPC_66(0x0f38, 0xdd): /* aesenclast xmm/m128,xmm,xmm */
- case X86EMUL_OPC_VEX_66(0x0f38, 0xdd): /* vaesenclast xmm/m128,xmm,xmm */
case X86EMUL_OPC_66(0x0f38, 0xde): /* aesdec xmm/m128,xmm,xmm */
- case X86EMUL_OPC_VEX_66(0x0f38, 0xde): /* vaesdec xmm/m128,xmm,xmm */
case X86EMUL_OPC_66(0x0f38, 0xdf): /* aesdeclast xmm/m128,xmm,xmm */
- case X86EMUL_OPC_VEX_66(0x0f38, 0xdf): /* vaesdeclast xmm/m128,xmm,xmm */
host_and_vcpu_must_have(aesni);
if ( vex.opcx == vex_none )
goto simd_0f38_common;
@@ -9643,6 +9640,24 @@ x86_emulate(
host_and_vcpu_must_have(avx512er);
goto simd_zmm_scalar_sae;
+ case X86EMUL_OPC_VEX_66(0x0f38, 0xdc): /* vaesenc {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ case X86EMUL_OPC_VEX_66(0x0f38, 0xdd): /* vaesenclast {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ case X86EMUL_OPC_VEX_66(0x0f38, 0xde): /* vaesdec {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ case X86EMUL_OPC_VEX_66(0x0f38, 0xdf): /* vaesdeclast {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ if ( !vex.l )
+ host_and_vcpu_must_have(aesni);
+ else
+ host_and_vcpu_must_have(vaes);
+ goto simd_0f_avx;
+
+ case X86EMUL_OPC_EVEX_66(0x0f38, 0xdc): /* vaesenc [xyz]mm/mem,[xyz]mm,[xyz]mm */
+ case X86EMUL_OPC_EVEX_66(0x0f38, 0xdd): /* vaesenclast [xyz]mm/mem,[xyz]mm,[xyz]mm */
+ case X86EMUL_OPC_EVEX_66(0x0f38, 0xde): /* vaesdec [xyz]mm/mem,[xyz]mm,[xyz]mm */
+ case X86EMUL_OPC_EVEX_66(0x0f38, 0xdf): /* vaesdeclast [xyz]mm/mem,[xyz]mm,[xyz]mm */
+ host_and_vcpu_must_have(vaes);
+ generate_exception_if(evex.brs || evex.opmsk, EXC_UD);
+ goto avx512f_no_sae;
+
case X86EMUL_OPC(0x0f38, 0xf0): /* movbe m,r */
case X86EMUL_OPC(0x0f38, 0xf1): /* movbe r,m */
vcpu_must_have(movbe);
--- a/xen/include/asm-x86/cpufeature.h
+++ b/xen/include/asm-x86/cpufeature.h
@@ -111,6 +111,7 @@
/* CPUID level 0x00000007:0.ecx */
#define cpu_has_avx512_vbmi boot_cpu_has(X86_FEATURE_AVX512_VBMI)
#define cpu_has_avx512_vbmi2 boot_cpu_has(X86_FEATURE_AVX512_VBMI2)
+#define cpu_has_vaes boot_cpu_has(X86_FEATURE_VAES)
#define cpu_has_vpclmulqdq boot_cpu_has(X86_FEATURE_VPCLMULQDQ)
#define cpu_has_avx512_vnni boot_cpu_has(X86_FEATURE_AVX512_VNNI)
#define cpu_has_avx512_bitalg boot_cpu_has(X86_FEATURE_AVX512_BITALG)
--- a/xen/include/public/arch-x86/cpufeatureset.h
+++ b/xen/include/public/arch-x86/cpufeatureset.h
@@ -229,6 +229,7 @@ XEN_CPUFEATURE(UMIP, 6*32+ 2) /
XEN_CPUFEATURE(PKU, 6*32+ 3) /*H Protection Keys for Userspace */
XEN_CPUFEATURE(OSPKE, 6*32+ 4) /*! OS Protection Keys Enable */
XEN_CPUFEATURE(AVX512_VBMI2, 6*32+ 6) /*A Additional AVX-512 Vector Byte Manipulation Instrs */
+XEN_CPUFEATURE(VAES, 6*32+ 9) /*A Vector AES Instrs */
XEN_CPUFEATURE(VPCLMULQDQ, 6*32+10) /*A Vector Carry-less Multiplication Instrs */
XEN_CPUFEATURE(AVX512_VNNI, 6*32+11) /*A Vector Neural Network Instrs */
XEN_CPUFEATURE(AVX512_BITALG, 6*32+12) /*A Support for VPOPCNT[B,W] and VPSHUFBITQMB */
--- a/xen/tools/gen-cpuid.py
+++ b/xen/tools/gen-cpuid.py
@@ -256,7 +256,7 @@ def crunch_numbers(state):
# feature flags. If want to use AVX512, AVX2 must be supported and
# enabled. Certain later extensions, acting on 256-bit vectors of
# integers, better depend on AVX2 than AVX.
- AVX2: [AVX512F, VPCLMULQDQ],
+ AVX2: [AVX512F, VAES, VPCLMULQDQ],
# AVX512F is taken to mean hardware support for 512bit registers
# (which in practice depends on the EVEX prefix to encode) as well
@@ -274,6 +274,7 @@ def crunch_numbers(state):
# Extensions with VEX/EVEX encodings keyed to a separate feature
# flag are made dependents of their respective legacy feature.
PCLMULQDQ: [VPCLMULQDQ],
+ AESNI: [VAES],
# The features:
# * Single Thread Indirect Branch Predictors
_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel
^ permalink raw reply [flat|nested] 15+ messages in thread
* [Xen-devel] [PATCH v10 09/13] x86emul: support GFNI insns
2019-07-17 6:27 [Xen-devel] [PATCH v10 00/13] x86emul: remaining AVX512 support Jan Beulich
` (7 preceding siblings ...)
2019-07-17 6:36 ` [Xen-devel] [PATCH v10 08/13] x86emul: support VAES insns Jan Beulich
@ 2019-07-17 6:36 ` Jan Beulich
2019-07-17 6:36 ` [Xen-devel] [PATCH v10 10/13] x86emul: restore ordering within main switch statement Jan Beulich
` (3 subsequent siblings)
12 siblings, 0 replies; 15+ messages in thread
From: Jan Beulich @ 2019-07-17 6:36 UTC (permalink / raw)
To: xen-devel; +Cc: Andrew Cooper, Wei Liu, Roger Pau Monne
As to the feature dependency adjustment, while strictly speaking SSE is
a sufficient prereq (to have XMM registers), vectors of bytes and qwords
have got introduced only with SSE2. gcc, for example, uses a similar
connection in its respective intrinsics header.
Signed-off-by: Jan Beulich <jbeulich@suse.com>
Acked-by: Andrew Cooper <andrew.cooper3@citrix.com>
---
v9: Re-base. Drop stale part of description.
v8: Add {evex}-producing vgf2p8mulb alias to simd.h. Add missing simd.h
dependency. Re-base.
v7: New.
--- a/tools/tests/x86_emulator/Makefile
+++ b/tools/tests/x86_emulator/Makefile
@@ -19,7 +19,8 @@ CFLAGS += $(CFLAGS_xeninclude)
SIMD := 3dnow sse sse2 sse4 avx avx2 xop avx512f avx512bw avx512dq avx512er avx512vbmi
FMA := fma4 fma
SG := avx2-sg avx512f-sg avx512vl-sg
-TESTCASES := blowfish $(SIMD) $(FMA) $(SG)
+GF := sse2-gf avx2-gf avx512bw-gf
+TESTCASES := blowfish $(SIMD) $(FMA) $(SG) $(GF)
OPMASK := avx512f avx512dq avx512bw
@@ -142,12 +143,17 @@ $(1)-cflags := \
$(foreach flt,$($(1)-flts), \
"-D_$(vec)x$(idx)f$(flt) -m$(1:-sg=) $(call non-sse,$(1)) -Os -DVEC_MAX=$(vec) -DIDX_SIZE=$(idx) -DFLOAT_SIZE=$(flt)")))
endef
+define simd-gf-defs
+$(1)-cflags := $(foreach vec,$($(1:-gf=)-vecs), \
+ "-D_$(vec) -mgfni -m$(1:-gf=) $(call non-sse,$(1)) -Os -DVEC_SIZE=$(vec)")
+endef
define opmask-defs
$(1)-opmask-cflags := $(foreach vec,$($(1)-opmask-vecs), "-D_$(vec) -m$(1) -Os -DSIZE=$(vec)")
endef
$(foreach flavor,$(SIMD) $(FMA),$(eval $(call simd-defs,$(flavor))))
$(foreach flavor,$(SG),$(eval $(call simd-sg-defs,$(flavor))))
+$(foreach flavor,$(GF),$(eval $(call simd-gf-defs,$(flavor))))
$(foreach flavor,$(OPMASK),$(eval $(call opmask-defs,$(flavor))))
first-string = $(shell for s in $(1); do echo "$$s"; break; done)
@@ -197,7 +203,10 @@ $(addsuffix .c,$(FMA)):
$(addsuffix .c,$(SG)):
ln -sf simd-sg.c $@
-$(addsuffix .h,$(SIMD) $(FMA) $(SG)): simd.h
+$(addsuffix .c,$(GF)):
+ ln -sf simd-gf.c $@
+
+$(addsuffix .h,$(SIMD) $(FMA) $(SG) $(GF)): simd.h
xop.h avx512f.h: simd-fma.c
--- a/tools/tests/x86_emulator/evex-disp8.c
+++ b/tools/tests/x86_emulator/evex-disp8.c
@@ -591,6 +591,12 @@ static const struct test avx512_vpopcntd
INSN(popcnt, 66, 0f38, 55, vl, dq, vl)
};
+static const struct test gfni_all[] = {
+ INSN(gf2p8affineinvqb, 66, 0f3a, cf, vl, q, vl),
+ INSN(gf2p8affineqb, 66, 0f3a, ce, vl, q, vl),
+ INSN(gf2p8mulb, 66, 0f38, cf, vl, b, vl),
+};
+
/*
* The uses of b in this table are simply (one of) the shortest form(s) of
* saying "no broadcast" without introducing a 128-bit granularity enumerator.
@@ -987,6 +993,7 @@ void evex_disp8_test(void *instr, struct
if ( cpu_has_avx512f )
{
+ RUN(gfni, all);
RUN(vaes, all);
RUN(vpclmulqdq, all);
}
--- a/tools/tests/x86_emulator/simd.h
+++ b/tools/tests/x86_emulator/simd.h
@@ -371,6 +371,7 @@ OVR(cvttsd2siq);
OVR(cvttss2si);
OVR(cvttss2sil);
OVR(cvttss2siq);
+OVR(gf2p8mulb);
OVR(movddup);
OVR(movntdq);
OVR(movntdqa);
--- /dev/null
+++ b/tools/tests/x86_emulator/simd-gf.c
@@ -0,0 +1,80 @@
+#define UINT_SIZE 1
+
+#include "simd.h"
+ENTRY(gf_test);
+
+#if VEC_SIZE == 16
+# define GF(op, s, a...) __builtin_ia32_vgf2p8 ## op ## _v16qi ## s(a)
+#elif VEC_SIZE == 32
+# define GF(op, s, a...) __builtin_ia32_vgf2p8 ## op ## _v32qi ## s(a)
+#elif VEC_SIZE == 64
+# define GF(op, s, a...) __builtin_ia32_vgf2p8 ## op ## _v64qi ## s(a)
+#endif
+
+#ifdef __AVX512BW__
+# define ALL_TRUE (~0ULL >> (64 - ELEM_COUNT))
+# define eq(x, y) (B(pcmpeqb, _mask, (vqi_t)(x), (vqi_t)(y), -1) == ALL_TRUE)
+# define mul(x, y) GF(mulb, _mask, (vqi_t)(x), (vqi_t)(y), (vqi_t)undef(), ~0)
+# define transform(m, dir, x, c) ({ \
+ vec_t t_; \
+ asm ( "vgf2p8affine" #dir "qb %[imm], %[matrix]%{1to%c[n]%}, %[src], %[dst]" \
+ : [dst] "=v" (t_) \
+ : [matrix] "m" (m), [src] "v" (x), [imm] "i" (c), [n] "i" (VEC_SIZE / 8) ); \
+ t_; \
+})
+#else
+# if defined(__AVX2__)
+# define bcstq(x) ({ \
+ vdi_t t_; \
+ asm ( "vpbroadcastq %1, %0" : "=x" (t_) : "m" (x) ); \
+ t_; \
+})
+# define to_bool(cmp) B(ptestc, , cmp, (vdi_t){} == 0)
+# else
+# define bcstq(x) ((vdi_t){x, x})
+# define to_bool(cmp) (__builtin_ia32_pmovmskb128(cmp) == 0xffff)
+# endif
+# define eq(x, y) to_bool((x) == (y))
+# define mul(x, y) GF(mulb, , (vqi_t)(x), (vqi_t)(y))
+# define transform(m, dir, x, c) ({ \
+ vdi_t m_ = bcstq(m); \
+ touch(m_); \
+ ((vec_t)GF(affine ## dir ## qb, , (vqi_t)(x), (vqi_t)m_, c)); \
+})
+#endif
+
+const unsigned __attribute__((mode(DI))) ident = 0x0102040810204080ULL;
+
+int gf_test(void)
+{
+ unsigned int i;
+ vec_t src, one;
+
+ for ( i = 0; i < ELEM_COUNT; ++i )
+ {
+ src[i] = i;
+ one[i] = 1;
+ }
+
+ /* Special case for first iteration. */
+ one[0] = 0;
+
+ do {
+ vec_t inv = transform(ident, inv, src, 0);
+
+ touch(src);
+ touch(inv);
+ if ( !eq(mul(src, inv), one) ) return __LINE__;
+
+ touch(src);
+ touch(inv);
+ if ( !eq(mul(inv, src), one) ) return __LINE__;
+
+ one[0] = 1;
+
+ src += ELEM_COUNT;
+ i += ELEM_COUNT;
+ } while ( i < 256 );
+
+ return 0;
+}
--- a/tools/tests/x86_emulator/test_x86_emulator.c
+++ b/tools/tests/x86_emulator/test_x86_emulator.c
@@ -11,12 +11,14 @@ asm ( ".pushsection .test, \"ax\", @prog
#include "3dnow.h"
#include "sse.h"
#include "sse2.h"
+#include "sse2-gf.h"
#include "sse4.h"
#include "avx.h"
#include "fma4.h"
#include "fma.h"
#include "avx2.h"
#include "avx2-sg.h"
+#include "avx2-gf.h"
#include "xop.h"
#include "avx512f-opmask.h"
#include "avx512dq-opmask.h"
@@ -25,6 +27,7 @@ asm ( ".pushsection .test, \"ax\", @prog
#include "avx512f-sg.h"
#include "avx512vl-sg.h"
#include "avx512bw.h"
+#include "avx512bw-gf.h"
#include "avx512dq.h"
#include "avx512er.h"
#include "avx512vbmi.h"
@@ -138,6 +141,26 @@ static bool simd_check_avx512vbmi_vl(voi
return cpu_has_avx512_vbmi && cpu_has_avx512vl;
}
+static bool simd_check_sse2_gf(void)
+{
+ return cpu_has_gfni && cpu_has_sse2;
+}
+
+static bool simd_check_avx2_gf(void)
+{
+ return cpu_has_gfni && cpu_has_avx2;
+}
+
+static bool simd_check_avx512bw_gf(void)
+{
+ return cpu_has_gfni && cpu_has_avx512bw;
+}
+
+static bool simd_check_avx512bw_gf_vl(void)
+{
+ return cpu_has_gfni && cpu_has_avx512vl;
+}
+
static void simd_set_regs(struct cpu_user_regs *regs)
{
if ( cpu_has_mmx )
@@ -395,6 +418,12 @@ static const struct {
AVX512VL(_VBMI+VL u16x8, avx512vbmi, 16u2),
AVX512VL(_VBMI+VL s16x16, avx512vbmi, 32i2),
AVX512VL(_VBMI+VL u16x16, avx512vbmi, 32u2),
+ SIMD(GFNI (legacy), sse2_gf, 16),
+ SIMD(GFNI (VEX/x16), avx2_gf, 16),
+ SIMD(GFNI (VEX/x32), avx2_gf, 32),
+ SIMD(GFNI (EVEX/x64), avx512bw_gf, 64),
+ AVX512VL(VL+GFNI (x16), avx512bw_gf, 16),
+ AVX512VL(VL+GFNI (x32), avx512bw_gf, 32),
#undef AVX512VL_
#undef AVX512VL
#undef SIMD_
--- a/tools/tests/x86_emulator/x86-emulate.h
+++ b/tools/tests/x86_emulator/x86-emulate.h
@@ -144,6 +144,7 @@ static inline bool xcr0_mask(uint64_t ma
#define cpu_has_avx512vl (cp.feat.avx512vl && xcr0_mask(0xe6))
#define cpu_has_avx512_vbmi (cp.feat.avx512_vbmi && xcr0_mask(0xe6))
#define cpu_has_avx512_vbmi2 (cp.feat.avx512_vbmi2 && xcr0_mask(0xe6))
+#define cpu_has_gfni cp.feat.gfni
#define cpu_has_vaes (cp.feat.vaes && xcr0_mask(6))
#define cpu_has_vpclmulqdq (cp.feat.vpclmulqdq && xcr0_mask(6))
#define cpu_has_avx512_vnni (cp.feat.avx512_vnni && xcr0_mask(0xe6))
--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -540,6 +540,7 @@ static const struct ext0f38_table {
[0xcb] = { .simd_size = simd_scalar_vexw, .d8s = d8s_dq },
[0xcc] = { .simd_size = simd_packed_fp, .two_op = 1, .d8s = d8s_vl },
[0xcd] = { .simd_size = simd_scalar_vexw, .d8s = d8s_dq },
+ [0xcf] = { .simd_size = simd_packed_int, .d8s = d8s_vl },
[0xdb] = { .simd_size = simd_packed_int, .two_op = 1 },
[0xdc ... 0xdf] = { .simd_size = simd_packed_int, .d8s = d8s_vl },
[0xf0] = { .two_op = 1 },
@@ -619,6 +620,7 @@ static const struct ext0f3a_table {
[0x7c ... 0x7d] = { .simd_size = simd_packed_fp, .four_op = 1 },
[0x7e ... 0x7f] = { .simd_size = simd_scalar_opc, .four_op = 1 },
[0xcc] = { .simd_size = simd_other },
+ [0xce ... 0xcf] = { .simd_size = simd_packed_int, .d8s = d8s_vl },
[0xdf] = { .simd_size = simd_packed_int, .two_op = 1 },
[0xf0] = {},
};
@@ -1890,6 +1892,7 @@ in_protmode(
#define vcpu_has_avx512vl() (ctxt->cpuid->feat.avx512vl)
#define vcpu_has_avx512_vbmi() (ctxt->cpuid->feat.avx512_vbmi)
#define vcpu_has_avx512_vbmi2() (ctxt->cpuid->feat.avx512_vbmi2)
+#define vcpu_has_gfni() (ctxt->cpuid->feat.gfni)
#define vcpu_has_vaes() (ctxt->cpuid->feat.vaes)
#define vcpu_has_vpclmulqdq() (ctxt->cpuid->feat.vpclmulqdq)
#define vcpu_has_avx512_vnni() (ctxt->cpuid->feat.avx512_vnni)
@@ -9640,6 +9643,21 @@ x86_emulate(
host_and_vcpu_must_have(avx512er);
goto simd_zmm_scalar_sae;
+ case X86EMUL_OPC_66(0x0f38, 0xcf): /* gf2p8mulb xmm/m128,xmm */
+ host_and_vcpu_must_have(gfni);
+ goto simd_0f38_common;
+
+ case X86EMUL_OPC_VEX_66(0x0f38, 0xcf): /* vgf2p8mulb {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ host_and_vcpu_must_have(gfni);
+ generate_exception_if(vex.w, EXC_UD);
+ goto simd_0f_avx;
+
+ case X86EMUL_OPC_EVEX_66(0x0f38, 0xcf): /* vgf2p8mulb [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+ host_and_vcpu_must_have(gfni);
+ generate_exception_if(evex.w || evex.brs, EXC_UD);
+ elem_bytes = 1;
+ goto avx512f_no_sae;
+
case X86EMUL_OPC_VEX_66(0x0f38, 0xdc): /* vaesenc {x,y}mm/mem,{x,y}mm,{x,y}mm */
case X86EMUL_OPC_VEX_66(0x0f38, 0xdd): /* vaesenclast {x,y}mm/mem,{x,y}mm,{x,y}mm */
case X86EMUL_OPC_VEX_66(0x0f38, 0xde): /* vaesdec {x,y}mm/mem,{x,y}mm,{x,y}mm */
@@ -10383,6 +10401,24 @@ x86_emulate(
op_bytes = 16;
goto simd_0f3a_common;
+ case X86EMUL_OPC_66(0x0f3a, 0xce): /* gf2p8affineqb $imm8,xmm/m128,xmm */
+ case X86EMUL_OPC_66(0x0f3a, 0xcf): /* gf2p8affineinvqb $imm8,xmm/m128,xmm */
+ host_and_vcpu_must_have(gfni);
+ goto simd_0f3a_common;
+
+ case X86EMUL_OPC_VEX_66(0x0f3a, 0xce): /* vgf2p8affineqb $imm8,{x,y}mm/mem,{x,y}mm,{x,y}mm */
+ case X86EMUL_OPC_VEX_66(0x0f3a, 0xcf): /* vgf2p8affineinvqb $imm8,{x,y}mm/mem,{x,y}mm,{x,y}mm */
+ host_and_vcpu_must_have(gfni);
+ generate_exception_if(!vex.w, EXC_UD);
+ goto simd_0f_imm8_avx;
+
+ case X86EMUL_OPC_EVEX_66(0x0f3a, 0xce): /* vgf2p8affineqb $imm8,[xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+ case X86EMUL_OPC_EVEX_66(0x0f3a, 0xcf): /* vgf2p8affineinvqb $imm8,[xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+ host_and_vcpu_must_have(gfni);
+ generate_exception_if(!evex.w, EXC_UD);
+ fault_suppression = false;
+ goto avx512f_imm8_no_sae;
+
case X86EMUL_OPC_66(0x0f3a, 0xdf): /* aeskeygenassist $imm8,xmm/m128,xmm */
case X86EMUL_OPC_VEX_66(0x0f3a, 0xdf): /* vaeskeygenassist $imm8,xmm/m128,xmm */
host_and_vcpu_must_have(aesni);
--- a/xen/include/asm-x86/cpufeature.h
+++ b/xen/include/asm-x86/cpufeature.h
@@ -111,6 +111,7 @@
/* CPUID level 0x00000007:0.ecx */
#define cpu_has_avx512_vbmi boot_cpu_has(X86_FEATURE_AVX512_VBMI)
#define cpu_has_avx512_vbmi2 boot_cpu_has(X86_FEATURE_AVX512_VBMI2)
+#define cpu_has_gfni boot_cpu_has(X86_FEATURE_GFNI)
#define cpu_has_vaes boot_cpu_has(X86_FEATURE_VAES)
#define cpu_has_vpclmulqdq boot_cpu_has(X86_FEATURE_VPCLMULQDQ)
#define cpu_has_avx512_vnni boot_cpu_has(X86_FEATURE_AVX512_VNNI)
--- a/xen/include/public/arch-x86/cpufeatureset.h
+++ b/xen/include/public/arch-x86/cpufeatureset.h
@@ -229,6 +229,7 @@ XEN_CPUFEATURE(UMIP, 6*32+ 2) /
XEN_CPUFEATURE(PKU, 6*32+ 3) /*H Protection Keys for Userspace */
XEN_CPUFEATURE(OSPKE, 6*32+ 4) /*! OS Protection Keys Enable */
XEN_CPUFEATURE(AVX512_VBMI2, 6*32+ 6) /*A Additional AVX-512 Vector Byte Manipulation Instrs */
+XEN_CPUFEATURE(GFNI, 6*32+ 8) /*A Galois Field Instrs */
XEN_CPUFEATURE(VAES, 6*32+ 9) /*A Vector AES Instrs */
XEN_CPUFEATURE(VPCLMULQDQ, 6*32+10) /*A Vector Carry-less Multiplication Instrs */
XEN_CPUFEATURE(AVX512_VNNI, 6*32+11) /*A Vector Neural Network Instrs */
--- a/xen/tools/gen-cpuid.py
+++ b/xen/tools/gen-cpuid.py
@@ -201,7 +201,7 @@ def crunch_numbers(state):
# SSE2 was re-specified as core instructions for 64bit. Also ISA
# extensions dealing with vectors of integers are added here rather
# than to SSE.
- SSE2: [SSE3, LM, AESNI, PCLMULQDQ, SHA],
+ SSE2: [SSE3, LM, AESNI, PCLMULQDQ, SHA, GFNI],
# Other SSEn each depend on their predecessor versions.
SSE3: [SSSE3],
_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel
^ permalink raw reply [flat|nested] 15+ messages in thread
* [Xen-devel] [PATCH v10 10/13] x86emul: restore ordering within main switch statement
2019-07-17 6:27 [Xen-devel] [PATCH v10 00/13] x86emul: remaining AVX512 support Jan Beulich
` (8 preceding siblings ...)
2019-07-17 6:36 ` [Xen-devel] [PATCH v10 09/13] x86emul: support GFNI insns Jan Beulich
@ 2019-07-17 6:36 ` Jan Beulich
2019-07-17 6:37 ` [Xen-devel] [PATCH v10 11/13] x86emul: add an AES/VAES test case to the harness Jan Beulich
` (2 subsequent siblings)
12 siblings, 0 replies; 15+ messages in thread
From: Jan Beulich @ 2019-07-17 6:36 UTC (permalink / raw)
To: xen-devel; +Cc: Andrew Cooper, Wei Liu, Roger Pau Monne
Incremental additions and/or mistakes have led to some code blocks
sitting in "unexpected" places. Re-sort the case blocks (opcode space;
major opcode; 66/F3/F2 prefix; legacy/VEX/EVEX encoding).
As an exception the opcode space 0x0f EVEX-encoded VPEXTRW is left at
its current place, to keep it close to the "pextr" label.
Pure code movement.
Signed-off-by: Jan Beulich <jbeulich@suse.com>
Acked-by: Andrew Cooper <andrew.cooper3@citrix.com>
---
v7: New.
--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -7105,15 +7105,6 @@ x86_emulate(
ASSERT(!state->simd_size);
break;
- case X86EMUL_OPC_EVEX_F3(0x0f, 0x7e): /* vmovq xmm/m64,xmm */
- case X86EMUL_OPC_EVEX_66(0x0f, 0xd6): /* vmovq xmm,xmm/m64 */
- generate_exception_if(evex.lr || !evex.w || evex.opmsk || evex.brs,
- EXC_UD);
- host_and_vcpu_must_have(avx512f);
- d |= TwoOp;
- op_bytes = 8;
- goto simd_zmm;
-
case X86EMUL_OPC_66(0x0f, 0xe7): /* movntdq xmm,m128 */
case X86EMUL_OPC_VEX_66(0x0f, 0xe7): /* vmovntdq {x,y}mm,mem */
generate_exception_if(ea.type != OP_MEM, EXC_UD);
@@ -7511,6 +7502,15 @@ x86_emulate(
op_bytes = 8;
goto simd_0f_int;
+ case X86EMUL_OPC_EVEX_F3(0x0f, 0x7e): /* vmovq xmm/m64,xmm */
+ case X86EMUL_OPC_EVEX_66(0x0f, 0xd6): /* vmovq xmm,xmm/m64 */
+ generate_exception_if(evex.lr || !evex.w || evex.opmsk || evex.brs,
+ EXC_UD);
+ host_and_vcpu_must_have(avx512f);
+ d |= TwoOp;
+ op_bytes = 8;
+ goto simd_zmm;
+
case X86EMUL_OPC(0x0f, 0x80) ... X86EMUL_OPC(0x0f, 0x8f): /* jcc (near) */
if ( test_cc(b, _regs.eflags) )
jmp_rel((int32_t)src.val);
@@ -8611,63 +8611,6 @@ x86_emulate(
dst.type = OP_NONE;
break;
- case X86EMUL_OPC_EVEX_66(0x0f38, 0x10): /* vpsrlvw [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
- case X86EMUL_OPC_EVEX_66(0x0f38, 0x11): /* vpsravw [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
- case X86EMUL_OPC_EVEX_66(0x0f38, 0x12): /* vpsllvw [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
- host_and_vcpu_must_have(avx512bw);
- generate_exception_if(!evex.w || evex.brs, EXC_UD);
- elem_bytes = 2;
- goto avx512f_no_sae;
-
- case X86EMUL_OPC_EVEX_66(0x0f38, 0x18): /* vbroadcastss xmm/m32,[xyz]mm{k} */
- case X86EMUL_OPC_EVEX_66(0x0f38, 0x58): /* vpbroadcastd xmm/m32,[xyz]mm{k} */
- op_bytes = elem_bytes;
- generate_exception_if(evex.w || evex.brs, EXC_UD);
- avx512_broadcast:
- /*
- * For the respective code below the main switch() to work we need to
- * fold op_mask here: A source element gets read whenever any of its
- * respective destination elements' mask bits is set.
- */
- if ( fault_suppression )
- {
- n = 1 << ((b & 3) - evex.w);
- EXPECT(elem_bytes > 0);
- ASSERT(op_bytes == n * elem_bytes);
- for ( i = n; i < (16 << evex.lr) / elem_bytes; i += n )
- op_mask |= (op_mask >> i) & ((1 << n) - 1);
- }
- goto avx512f_no_sae;
-
- case X86EMUL_OPC_EVEX_66(0x0f38, 0x1b): /* vbroadcastf32x8 m256,zmm{k} */
- /* vbroadcastf64x4 m256,zmm{k} */
- case X86EMUL_OPC_EVEX_66(0x0f38, 0x5b): /* vbroadcasti32x8 m256,zmm{k} */
- /* vbroadcasti64x4 m256,zmm{k} */
- generate_exception_if(ea.type != OP_MEM || evex.lr != 2, EXC_UD);
- /* fall through */
- case X86EMUL_OPC_EVEX_66(0x0f38, 0x19): /* vbroadcastsd xmm/m64,{y,z}mm{k} */
- /* vbroadcastf32x2 xmm/m64,{y,z}mm{k} */
- generate_exception_if(!evex.lr, EXC_UD);
- /* fall through */
- case X86EMUL_OPC_EVEX_66(0x0f38, 0x59): /* vpbroadcastq xmm/m64,[xyz]mm{k} */
- /* vbroadcasti32x2 xmm/m64,[xyz]mm{k} */
- if ( b == 0x59 )
- op_bytes = 8;
- generate_exception_if(evex.brs, EXC_UD);
- if ( !evex.w )
- host_and_vcpu_must_have(avx512dq);
- goto avx512_broadcast;
-
- case X86EMUL_OPC_EVEX_66(0x0f38, 0x1a): /* vbroadcastf32x4 m128,{y,z}mm{k} */
- /* vbroadcastf64x2 m128,{y,z}mm{k} */
- case X86EMUL_OPC_EVEX_66(0x0f38, 0x5a): /* vbroadcasti32x4 m128,{y,z}mm{k} */
- /* vbroadcasti64x2 m128,{y,z}mm{k} */
- generate_exception_if(ea.type != OP_MEM || !evex.lr || evex.brs,
- EXC_UD);
- if ( evex.w )
- host_and_vcpu_must_have(avx512dq);
- goto avx512_broadcast;
-
case X86EMUL_OPC_66(0x0f38, 0x20): /* pmovsxbw xmm/m64,xmm */
case X86EMUL_OPC_66(0x0f38, 0x21): /* pmovsxbd xmm/m32,xmm */
case X86EMUL_OPC_66(0x0f38, 0x22): /* pmovsxbq xmm/m16,xmm */
@@ -8701,47 +8644,14 @@ x86_emulate(
host_and_vcpu_must_have(sse4_1);
goto simd_0f38_common;
- case X86EMUL_OPC_VEX_66(0x0f38, 0x13): /* vcvtph2ps xmm/mem,{x,y}mm */
- generate_exception_if(vex.w, EXC_UD);
- host_and_vcpu_must_have(f16c);
- op_bytes = 8 << vex.l;
- goto simd_0f_ymm;
-
- case X86EMUL_OPC_EVEX_66(0x0f38, 0x13): /* vcvtph2ps {x,y}mm/mem,[xyz]mm{k} */
- generate_exception_if(evex.w || (ea.type != OP_REG && evex.brs), EXC_UD);
- host_and_vcpu_must_have(avx512f);
- if ( !evex.brs )
- avx512_vlen_check(false);
- op_bytes = 8 << evex.lr;
+ case X86EMUL_OPC_EVEX_66(0x0f38, 0x10): /* vpsrlvw [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+ case X86EMUL_OPC_EVEX_66(0x0f38, 0x11): /* vpsravw [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+ case X86EMUL_OPC_EVEX_66(0x0f38, 0x12): /* vpsllvw [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+ host_and_vcpu_must_have(avx512bw);
+ generate_exception_if(!evex.w || evex.brs, EXC_UD);
elem_bytes = 2;
- goto simd_zmm;
-
- case X86EMUL_OPC_VEX_66(0x0f38, 0x16): /* vpermps ymm/m256,ymm,ymm */
- case X86EMUL_OPC_VEX_66(0x0f38, 0x36): /* vpermd ymm/m256,ymm,ymm */
- generate_exception_if(!vex.l || vex.w, EXC_UD);
- goto simd_0f_avx2;
-
- case X86EMUL_OPC_EVEX_66(0x0f38, 0x16): /* vpermp{s,d} {y,z}mm/mem,{y,z}mm,{y,z}mm{k} */
- case X86EMUL_OPC_EVEX_66(0x0f38, 0x36): /* vperm{d,q} {y,z}mm/mem,{y,z}mm,{y,z}mm{k} */
- generate_exception_if(!evex.lr, EXC_UD);
- fault_suppression = false;
goto avx512f_no_sae;
- case X86EMUL_OPC_VEX_66(0x0f38, 0x20): /* vpmovsxbw xmm/mem,{x,y}mm */
- case X86EMUL_OPC_VEX_66(0x0f38, 0x21): /* vpmovsxbd xmm/mem,{x,y}mm */
- case X86EMUL_OPC_VEX_66(0x0f38, 0x22): /* vpmovsxbq xmm/mem,{x,y}mm */
- case X86EMUL_OPC_VEX_66(0x0f38, 0x23): /* vpmovsxwd xmm/mem,{x,y}mm */
- case X86EMUL_OPC_VEX_66(0x0f38, 0x24): /* vpmovsxwq xmm/mem,{x,y}mm */
- case X86EMUL_OPC_VEX_66(0x0f38, 0x25): /* vpmovsxdq xmm/mem,{x,y}mm */
- case X86EMUL_OPC_VEX_66(0x0f38, 0x30): /* vpmovzxbw xmm/mem,{x,y}mm */
- case X86EMUL_OPC_VEX_66(0x0f38, 0x31): /* vpmovzxbd xmm/mem,{x,y}mm */
- case X86EMUL_OPC_VEX_66(0x0f38, 0x32): /* vpmovzxbq xmm/mem,{x,y}mm */
- case X86EMUL_OPC_VEX_66(0x0f38, 0x33): /* vpmovzxwd xmm/mem,{x,y}mm */
- case X86EMUL_OPC_VEX_66(0x0f38, 0x34): /* vpmovzxwq xmm/mem,{x,y}mm */
- case X86EMUL_OPC_VEX_66(0x0f38, 0x35): /* vpmovzxdq xmm/mem,{x,y}mm */
- op_bytes = 16 >> (pmov_convert_delta[b & 7] - vex.l);
- goto simd_0f_int;
-
case X86EMUL_OPC_EVEX_F3(0x0f38, 0x10): /* vpmovuswb [xyz]mm,{x,y}mm/mem{k} */
case X86EMUL_OPC_EVEX_66(0x0f38, 0x20): /* vpmovsxbw {x,y}mm/mem,[xyz]mm{k} */
case X86EMUL_OPC_EVEX_F3(0x0f38, 0x20): /* vpmovswb [xyz]mm,{x,y}mm/mem{k} */
@@ -8787,6 +8697,96 @@ x86_emulate(
elem_bytes = (b & 7) < 3 ? 1 : (b & 7) != 5 ? 2 : 4;
goto avx512f_no_sae;
+ case X86EMUL_OPC_VEX_66(0x0f38, 0x13): /* vcvtph2ps xmm/mem,{x,y}mm */
+ generate_exception_if(vex.w, EXC_UD);
+ host_and_vcpu_must_have(f16c);
+ op_bytes = 8 << vex.l;
+ goto simd_0f_ymm;
+
+ case X86EMUL_OPC_EVEX_66(0x0f38, 0x13): /* vcvtph2ps {x,y}mm/mem,[xyz]mm{k} */
+ generate_exception_if(evex.w || (ea.type != OP_REG && evex.brs), EXC_UD);
+ host_and_vcpu_must_have(avx512f);
+ if ( !evex.brs )
+ avx512_vlen_check(false);
+ op_bytes = 8 << evex.lr;
+ elem_bytes = 2;
+ goto simd_zmm;
+
+ case X86EMUL_OPC_VEX_66(0x0f38, 0x16): /* vpermps ymm/m256,ymm,ymm */
+ case X86EMUL_OPC_VEX_66(0x0f38, 0x36): /* vpermd ymm/m256,ymm,ymm */
+ generate_exception_if(!vex.l || vex.w, EXC_UD);
+ goto simd_0f_avx2;
+
+ case X86EMUL_OPC_EVEX_66(0x0f38, 0x16): /* vpermp{s,d} {y,z}mm/mem,{y,z}mm,{y,z}mm{k} */
+ case X86EMUL_OPC_EVEX_66(0x0f38, 0x36): /* vperm{d,q} {y,z}mm/mem,{y,z}mm,{y,z}mm{k} */
+ generate_exception_if(!evex.lr, EXC_UD);
+ fault_suppression = false;
+ goto avx512f_no_sae;
+
+ case X86EMUL_OPC_EVEX_66(0x0f38, 0x18): /* vbroadcastss xmm/m32,[xyz]mm{k} */
+ case X86EMUL_OPC_EVEX_66(0x0f38, 0x58): /* vpbroadcastd xmm/m32,[xyz]mm{k} */
+ op_bytes = elem_bytes;
+ generate_exception_if(evex.w || evex.brs, EXC_UD);
+ avx512_broadcast:
+ /*
+ * For the respective code below the main switch() to work we need to
+ * fold op_mask here: A source element gets read whenever any of its
+ * respective destination elements' mask bits is set.
+ */
+ if ( fault_suppression )
+ {
+ n = 1 << ((b & 3) - evex.w);
+ EXPECT(elem_bytes > 0);
+ ASSERT(op_bytes == n * elem_bytes);
+ for ( i = n; i < (16 << evex.lr) / elem_bytes; i += n )
+ op_mask |= (op_mask >> i) & ((1 << n) - 1);
+ }
+ goto avx512f_no_sae;
+
+ case X86EMUL_OPC_EVEX_66(0x0f38, 0x1b): /* vbroadcastf32x8 m256,zmm{k} */
+ /* vbroadcastf64x4 m256,zmm{k} */
+ case X86EMUL_OPC_EVEX_66(0x0f38, 0x5b): /* vbroadcasti32x8 m256,zmm{k} */
+ /* vbroadcasti64x4 m256,zmm{k} */
+ generate_exception_if(ea.type != OP_MEM || evex.lr != 2, EXC_UD);
+ /* fall through */
+ case X86EMUL_OPC_EVEX_66(0x0f38, 0x19): /* vbroadcastsd xmm/m64,{y,z}mm{k} */
+ /* vbroadcastf32x2 xmm/m64,{y,z}mm{k} */
+ generate_exception_if(!evex.lr, EXC_UD);
+ /* fall through */
+ case X86EMUL_OPC_EVEX_66(0x0f38, 0x59): /* vpbroadcastq xmm/m64,[xyz]mm{k} */
+ /* vbroadcasti32x2 xmm/m64,[xyz]mm{k} */
+ if ( b == 0x59 )
+ op_bytes = 8;
+ generate_exception_if(evex.brs, EXC_UD);
+ if ( !evex.w )
+ host_and_vcpu_must_have(avx512dq);
+ goto avx512_broadcast;
+
+ case X86EMUL_OPC_EVEX_66(0x0f38, 0x1a): /* vbroadcastf32x4 m128,{y,z}mm{k} */
+ /* vbroadcastf64x2 m128,{y,z}mm{k} */
+ case X86EMUL_OPC_EVEX_66(0x0f38, 0x5a): /* vbroadcasti32x4 m128,{y,z}mm{k} */
+ /* vbroadcasti64x2 m128,{y,z}mm{k} */
+ generate_exception_if(ea.type != OP_MEM || !evex.lr || evex.brs,
+ EXC_UD);
+ if ( evex.w )
+ host_and_vcpu_must_have(avx512dq);
+ goto avx512_broadcast;
+
+ case X86EMUL_OPC_VEX_66(0x0f38, 0x20): /* vpmovsxbw xmm/mem,{x,y}mm */
+ case X86EMUL_OPC_VEX_66(0x0f38, 0x21): /* vpmovsxbd xmm/mem,{x,y}mm */
+ case X86EMUL_OPC_VEX_66(0x0f38, 0x22): /* vpmovsxbq xmm/mem,{x,y}mm */
+ case X86EMUL_OPC_VEX_66(0x0f38, 0x23): /* vpmovsxwd xmm/mem,{x,y}mm */
+ case X86EMUL_OPC_VEX_66(0x0f38, 0x24): /* vpmovsxwq xmm/mem,{x,y}mm */
+ case X86EMUL_OPC_VEX_66(0x0f38, 0x25): /* vpmovsxdq xmm/mem,{x,y}mm */
+ case X86EMUL_OPC_VEX_66(0x0f38, 0x30): /* vpmovzxbw xmm/mem,{x,y}mm */
+ case X86EMUL_OPC_VEX_66(0x0f38, 0x31): /* vpmovzxbd xmm/mem,{x,y}mm */
+ case X86EMUL_OPC_VEX_66(0x0f38, 0x32): /* vpmovzxbq xmm/mem,{x,y}mm */
+ case X86EMUL_OPC_VEX_66(0x0f38, 0x33): /* vpmovzxwd xmm/mem,{x,y}mm */
+ case X86EMUL_OPC_VEX_66(0x0f38, 0x34): /* vpmovzxwq xmm/mem,{x,y}mm */
+ case X86EMUL_OPC_VEX_66(0x0f38, 0x35): /* vpmovzxdq xmm/mem,{x,y}mm */
+ op_bytes = 16 >> (pmov_convert_delta[b & 7] - vex.l);
+ goto simd_0f_int;
+
case X86EMUL_OPC_EVEX_F3(0x0f38, 0x29): /* vpmov{b,w}2m [xyz]mm,k */
case X86EMUL_OPC_EVEX_F3(0x0f38, 0x39): /* vpmov{d,q}2m [xyz]mm,k */
generate_exception_if(!evex.r || !evex.R, EXC_UD);
@@ -8894,6 +8894,52 @@ x86_emulate(
break;
}
+ case X86EMUL_OPC_EVEX_66(0x0f38, 0x2c): /* vscalefp{s,d} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+ case X86EMUL_OPC_EVEX_66(0x0f38, 0x42): /* vgetexpp{s,d} [xyz]mm/mem,[xyz]mm{k} */
+ case X86EMUL_OPC_EVEX_66(0x0f38, 0x96): /* vfmaddsub132p{s,d} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+ case X86EMUL_OPC_EVEX_66(0x0f38, 0x97): /* vfmsubadd132p{s,d} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+ case X86EMUL_OPC_EVEX_66(0x0f38, 0x98): /* vfmadd132p{s,d} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+ case X86EMUL_OPC_EVEX_66(0x0f38, 0x9a): /* vfmsub132p{s,d} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+ case X86EMUL_OPC_EVEX_66(0x0f38, 0x9c): /* vfnmadd132p{s,d} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+ case X86EMUL_OPC_EVEX_66(0x0f38, 0x9e): /* vfnmsub132p{s,d} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+ case X86EMUL_OPC_EVEX_66(0x0f38, 0xa6): /* vfmaddsub213p{s,d} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+ case X86EMUL_OPC_EVEX_66(0x0f38, 0xa7): /* vfmsubadd213p{s,d} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+ case X86EMUL_OPC_EVEX_66(0x0f38, 0xa8): /* vfmadd213p{s,d} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+ case X86EMUL_OPC_EVEX_66(0x0f38, 0xaa): /* vfmsub213p{s,d} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+ case X86EMUL_OPC_EVEX_66(0x0f38, 0xac): /* vfnmadd213p{s,d} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+ case X86EMUL_OPC_EVEX_66(0x0f38, 0xae): /* vfnmsub213p{s,d} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+ case X86EMUL_OPC_EVEX_66(0x0f38, 0xb6): /* vfmaddsub231p{s,d} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+ case X86EMUL_OPC_EVEX_66(0x0f38, 0xb7): /* vfmsubadd231p{s,d} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+ case X86EMUL_OPC_EVEX_66(0x0f38, 0xb8): /* vfmadd231p{s,d} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+ case X86EMUL_OPC_EVEX_66(0x0f38, 0xba): /* vfmsub231p{s,d} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+ case X86EMUL_OPC_EVEX_66(0x0f38, 0xbc): /* vfnmadd231p{s,d} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+ case X86EMUL_OPC_EVEX_66(0x0f38, 0xbe): /* vfnmsub231p{s,d} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+ host_and_vcpu_must_have(avx512f);
+ if ( ea.type != OP_REG || !evex.brs )
+ avx512_vlen_check(false);
+ goto simd_zmm;
+
+ case X86EMUL_OPC_EVEX_66(0x0f38, 0x2d): /* vscalefs{s,d} xmm/mem,xmm,xmm{k} */
+ case X86EMUL_OPC_EVEX_66(0x0f38, 0x43): /* vgetexps{s,d} xmm/mem,xmm,xmm{k} */
+ case X86EMUL_OPC_EVEX_66(0x0f38, 0x99): /* vfmadd132s{s,d} xmm/mem,xmm,xmm{k} */
+ case X86EMUL_OPC_EVEX_66(0x0f38, 0x9b): /* vfmsub132s{s,d} xmm/mem,xmm,xmm{k} */
+ case X86EMUL_OPC_EVEX_66(0x0f38, 0x9d): /* vfnmadd132s{s,d} xmm/mem,xmm,xmm{k} */
+ case X86EMUL_OPC_EVEX_66(0x0f38, 0x9f): /* vfnmsub132s{s,d} xmm/mem,xmm,xmm{k} */
+ case X86EMUL_OPC_EVEX_66(0x0f38, 0xa9): /* vfmadd213s{s,d} xmm/mem,xmm,xmm{k} */
+ case X86EMUL_OPC_EVEX_66(0x0f38, 0xab): /* vfmsub213s{s,d} xmm/mem,xmm,xmm{k} */
+ case X86EMUL_OPC_EVEX_66(0x0f38, 0xad): /* vfnmadd213s{s,d} xmm/mem,xmm,xmm{k} */
+ case X86EMUL_OPC_EVEX_66(0x0f38, 0xaf): /* vfnmsub213s{s,d} xmm/mem,xmm,xmm{k} */
+ case X86EMUL_OPC_EVEX_66(0x0f38, 0xb9): /* vfmadd231s{s,d} xmm/mem,xmm,xmm{k} */
+ case X86EMUL_OPC_EVEX_66(0x0f38, 0xbb): /* vfmsub231s{s,d} xmm/mem,xmm,xmm{k} */
+ case X86EMUL_OPC_EVEX_66(0x0f38, 0xbd): /* vfnmadd231s{s,d} xmm/mem,xmm,xmm{k} */
+ case X86EMUL_OPC_EVEX_66(0x0f38, 0xbf): /* vfnmsub231s{s,d} xmm/mem,xmm,xmm{k} */
+ host_and_vcpu_must_have(avx512f);
+ simd_zmm_scalar_sae:
+ generate_exception_if(ea.type != OP_REG && evex.brs, EXC_UD);
+ if ( !evex.brs )
+ avx512_vlen_check(true);
+ goto simd_zmm;
+
case X86EMUL_OPC_66(0x0f38, 0x37): /* pcmpgtq xmm/m128,xmm */
host_and_vcpu_must_have(sse4_2);
goto simd_0f38_common;
@@ -8926,6 +8972,31 @@ x86_emulate(
generate_exception_if(vex.l, EXC_UD);
goto simd_0f_avx;
+ case X86EMUL_OPC_EVEX_66(0x0f38, 0x50): /* vpdpbusd [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+ case X86EMUL_OPC_EVEX_66(0x0f38, 0x51): /* vpdpbusds [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+ case X86EMUL_OPC_EVEX_66(0x0f38, 0x52): /* vpdpwssd [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+ case X86EMUL_OPC_EVEX_66(0x0f38, 0x53): /* vpdpwssds [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+ host_and_vcpu_must_have(avx512_vnni);
+ generate_exception_if(evex.w, EXC_UD);
+ goto avx512f_no_sae;
+
+ case X86EMUL_OPC_VEX_66(0x0f38, 0x58): /* vpbroadcastd xmm/m32,{x,y}mm */
+ case X86EMUL_OPC_VEX_66(0x0f38, 0x59): /* vpbroadcastq xmm/m64,{x,y}mm */
+ case X86EMUL_OPC_VEX_66(0x0f38, 0x78): /* vpbroadcastb xmm/m8,{x,y}mm */
+ case X86EMUL_OPC_VEX_66(0x0f38, 0x79): /* vpbroadcastw xmm/m16,{x,y}mm */
+ op_bytes = 1 << ((!(b & 0x20) * 2) + (b & 1));
+ /* fall through */
+ case X86EMUL_OPC_VEX_66(0x0f38, 0x46): /* vpsravd {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ generate_exception_if(vex.w, EXC_UD);
+ goto simd_0f_avx2;
+
+ case X86EMUL_OPC_EVEX_66(0x0f38, 0x4d): /* vrcp14s{s,d} xmm/mem,xmm,xmm{k} */
+ case X86EMUL_OPC_EVEX_66(0x0f38, 0x4f): /* vrsqrt14s{s,d} xmm/mem,xmm,xmm{k} */
+ host_and_vcpu_must_have(avx512f);
+ generate_exception_if(evex.brs, EXC_UD);
+ avx512_vlen_check(true);
+ goto simd_zmm;
+
case X86EMUL_OPC_EVEX_F2(0x0f38, 0x52): /* vp4dpwssd m128,zmm+3,zmm{k} */
case X86EMUL_OPC_EVEX_F2(0x0f38, 0x53): /* vp4dpwssds m128,zmm+3,zmm{k} */
host_and_vcpu_must_have(avx512_4vnniw);
@@ -8948,23 +9019,6 @@ x86_emulate(
host_and_vcpu_must_have(avx512_vpopcntdq);
goto avx512f_no_sae;
- case X86EMUL_OPC_VEX_66(0x0f38, 0x58): /* vpbroadcastd xmm/m32,{x,y}mm */
- case X86EMUL_OPC_VEX_66(0x0f38, 0x59): /* vpbroadcastq xmm/m64,{x,y}mm */
- case X86EMUL_OPC_VEX_66(0x0f38, 0x78): /* vpbroadcastb xmm/m8,{x,y}mm */
- case X86EMUL_OPC_VEX_66(0x0f38, 0x79): /* vpbroadcastw xmm/m16,{x,y}mm */
- op_bytes = 1 << ((!(b & 0x20) * 2) + (b & 1));
- /* fall through */
- case X86EMUL_OPC_VEX_66(0x0f38, 0x46): /* vpsravd {x,y}mm/mem,{x,y}mm,{x,y}mm */
- generate_exception_if(vex.w, EXC_UD);
- goto simd_0f_avx2;
-
- case X86EMUL_OPC_EVEX_66(0x0f38, 0x4d): /* vrcp14s{s,d} xmm/mem,xmm,xmm{k} */
- case X86EMUL_OPC_EVEX_66(0x0f38, 0x4f): /* vrsqrt14s{s,d} xmm/mem,xmm,xmm{k} */
- host_and_vcpu_must_have(avx512f);
- generate_exception_if(evex.brs, EXC_UD);
- avx512_vlen_check(true);
- goto simd_zmm;
-
case X86EMUL_OPC_VEX_66(0x0f38, 0x5a): /* vbroadcasti128 m128,ymm */
generate_exception_if(ea.type != OP_MEM || !vex.l || vex.w, EXC_UD);
goto simd_0f_avx2;
@@ -9352,60 +9406,6 @@ x86_emulate(
host_and_vcpu_must_have(fma);
goto simd_0f_ymm;
- case X86EMUL_OPC_EVEX_66(0x0f38, 0x2c): /* vscalefp{s,d} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
- case X86EMUL_OPC_EVEX_66(0x0f38, 0x42): /* vgetexpp{s,d} [xyz]mm/mem,[xyz]mm{k} */
- case X86EMUL_OPC_EVEX_66(0x0f38, 0x96): /* vfmaddsub132p{s,d} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
- case X86EMUL_OPC_EVEX_66(0x0f38, 0x97): /* vfmsubadd132p{s,d} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
- case X86EMUL_OPC_EVEX_66(0x0f38, 0x98): /* vfmadd132p{s,d} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
- case X86EMUL_OPC_EVEX_66(0x0f38, 0x9a): /* vfmsub132p{s,d} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
- case X86EMUL_OPC_EVEX_66(0x0f38, 0x9c): /* vfnmadd132p{s,d} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
- case X86EMUL_OPC_EVEX_66(0x0f38, 0x9e): /* vfnmsub132p{s,d} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
- case X86EMUL_OPC_EVEX_66(0x0f38, 0xa6): /* vfmaddsub213p{s,d} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
- case X86EMUL_OPC_EVEX_66(0x0f38, 0xa7): /* vfmsubadd213p{s,d} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
- case X86EMUL_OPC_EVEX_66(0x0f38, 0xa8): /* vfmadd213p{s,d} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
- case X86EMUL_OPC_EVEX_66(0x0f38, 0xaa): /* vfmsub213p{s,d} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
- case X86EMUL_OPC_EVEX_66(0x0f38, 0xac): /* vfnmadd213p{s,d} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
- case X86EMUL_OPC_EVEX_66(0x0f38, 0xae): /* vfnmsub213p{s,d} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
- case X86EMUL_OPC_EVEX_66(0x0f38, 0xb6): /* vfmaddsub231p{s,d} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
- case X86EMUL_OPC_EVEX_66(0x0f38, 0xb7): /* vfmsubadd231p{s,d} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
- case X86EMUL_OPC_EVEX_66(0x0f38, 0xb8): /* vfmadd231p{s,d} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
- case X86EMUL_OPC_EVEX_66(0x0f38, 0xba): /* vfmsub231p{s,d} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
- case X86EMUL_OPC_EVEX_66(0x0f38, 0xbc): /* vfnmadd231p{s,d} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
- case X86EMUL_OPC_EVEX_66(0x0f38, 0xbe): /* vfnmsub231p{s,d} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
- host_and_vcpu_must_have(avx512f);
- if ( ea.type != OP_REG || !evex.brs )
- avx512_vlen_check(false);
- goto simd_zmm;
-
- case X86EMUL_OPC_EVEX_66(0x0f38, 0x2d): /* vscalefs{s,d} xmm/mem,xmm,xmm{k} */
- case X86EMUL_OPC_EVEX_66(0x0f38, 0x43): /* vgetexps{s,d} xmm/mem,xmm,xmm{k} */
- case X86EMUL_OPC_EVEX_66(0x0f38, 0x99): /* vfmadd132s{s,d} xmm/mem,xmm,xmm{k} */
- case X86EMUL_OPC_EVEX_66(0x0f38, 0x9b): /* vfmsub132s{s,d} xmm/mem,xmm,xmm{k} */
- case X86EMUL_OPC_EVEX_66(0x0f38, 0x9d): /* vfnmadd132s{s,d} xmm/mem,xmm,xmm{k} */
- case X86EMUL_OPC_EVEX_66(0x0f38, 0x9f): /* vfnmsub132s{s,d} xmm/mem,xmm,xmm{k} */
- case X86EMUL_OPC_EVEX_66(0x0f38, 0xa9): /* vfmadd213s{s,d} xmm/mem,xmm,xmm{k} */
- case X86EMUL_OPC_EVEX_66(0x0f38, 0xab): /* vfmsub213s{s,d} xmm/mem,xmm,xmm{k} */
- case X86EMUL_OPC_EVEX_66(0x0f38, 0xad): /* vfnmadd213s{s,d} xmm/mem,xmm,xmm{k} */
- case X86EMUL_OPC_EVEX_66(0x0f38, 0xaf): /* vfnmsub213s{s,d} xmm/mem,xmm,xmm{k} */
- case X86EMUL_OPC_EVEX_66(0x0f38, 0xb9): /* vfmadd231s{s,d} xmm/mem,xmm,xmm{k} */
- case X86EMUL_OPC_EVEX_66(0x0f38, 0xbb): /* vfmsub231s{s,d} xmm/mem,xmm,xmm{k} */
- case X86EMUL_OPC_EVEX_66(0x0f38, 0xbd): /* vfnmadd231s{s,d} xmm/mem,xmm,xmm{k} */
- case X86EMUL_OPC_EVEX_66(0x0f38, 0xbf): /* vfnmsub231s{s,d} xmm/mem,xmm,xmm{k} */
- host_and_vcpu_must_have(avx512f);
- simd_zmm_scalar_sae:
- generate_exception_if(ea.type != OP_REG && evex.brs, EXC_UD);
- if ( !evex.brs )
- avx512_vlen_check(true);
- goto simd_zmm;
-
- case X86EMUL_OPC_EVEX_66(0x0f38, 0x50): /* vpdpbusd [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
- case X86EMUL_OPC_EVEX_66(0x0f38, 0x51): /* vpdpbusds [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
- case X86EMUL_OPC_EVEX_66(0x0f38, 0x52): /* vpdpwssd [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
- case X86EMUL_OPC_EVEX_66(0x0f38, 0x53): /* vpdpwssds [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
- host_and_vcpu_must_have(avx512_vnni);
- generate_exception_if(evex.w, EXC_UD);
- goto avx512f_no_sae;
-
case X86EMUL_OPC_EVEX_F2(0x0f38, 0x9a): /* v4fmaddps m128,zmm+3,zmm{k} */
case X86EMUL_OPC_EVEX_F2(0x0f38, 0xaa): /* v4fnmaddps m128,zmm+3,zmm{k} */
host_and_vcpu_must_have(avx512_4fmaps);
@@ -10254,11 +10254,6 @@ x86_emulate(
generate_exception_if(evex.brs || evex.opmsk, EXC_UD);
goto avx512f_imm8_no_sae;
- case X86EMUL_OPC_VEX_66(0x0f3a, 0x4a): /* vblendvps {x,y}mm,{x,y}mm/mem,{x,y}mm,{x,y}mm */
- case X86EMUL_OPC_VEX_66(0x0f3a, 0x4b): /* vblendvpd {x,y}mm,{x,y}mm/mem,{x,y}mm,{x,y}mm */
- generate_exception_if(vex.w, EXC_UD);
- goto simd_0f_imm8_avx;
-
case X86EMUL_OPC_VEX_66(0x0f3a, 0x48): /* vpermil2ps $imm,{x,y}mm/mem,{x,y}mm,{x,y}mm,{x,y}mm */
/* vpermil2ps $imm,{x,y}mm,{x,y}mm/mem,{x,y}mm,{x,y}mm */
case X86EMUL_OPC_VEX_66(0x0f3a, 0x49): /* vpermil2pd $imm,{x,y}mm/mem,{x,y}mm,{x,y}mm,{x,y}mm */
@@ -10266,6 +10261,11 @@ x86_emulate(
host_and_vcpu_must_have(xop);
goto simd_0f_imm8_ymm;
+ case X86EMUL_OPC_VEX_66(0x0f3a, 0x4a): /* vblendvps {x,y}mm,{x,y}mm/mem,{x,y}mm,{x,y}mm */
+ case X86EMUL_OPC_VEX_66(0x0f3a, 0x4b): /* vblendvpd {x,y}mm,{x,y}mm/mem,{x,y}mm,{x,y}mm */
+ generate_exception_if(vex.w, EXC_UD);
+ goto simd_0f_imm8_avx;
+
case X86EMUL_OPC_VEX_66(0x0f3a, 0x4c): /* vpblendvb {x,y}mm,{x,y}mm/mem,{x,y}mm,{x,y}mm */
generate_exception_if(vex.w, EXC_UD);
goto simd_0f_int_imm8;
_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel
^ permalink raw reply [flat|nested] 15+ messages in thread
* [Xen-devel] [PATCH v10 11/13] x86emul: add an AES/VAES test case to the harness
2019-07-17 6:27 [Xen-devel] [PATCH v10 00/13] x86emul: remaining AVX512 support Jan Beulich
` (9 preceding siblings ...)
2019-07-17 6:36 ` [Xen-devel] [PATCH v10 10/13] x86emul: restore ordering within main switch statement Jan Beulich
@ 2019-07-17 6:37 ` Jan Beulich
2019-07-17 6:38 ` [Xen-devel] [PATCH v10 12/13] x86emul: add a SHA " Jan Beulich
2019-07-17 6:38 ` [Xen-devel] [PATCH v10 13/13] x86emul: add a PCLMUL/VPCLMUL " Jan Beulich
12 siblings, 0 replies; 15+ messages in thread
From: Jan Beulich @ 2019-07-17 6:37 UTC (permalink / raw)
To: xen-devel; +Cc: Andrew Cooper, Wei Liu, Roger Pau Monne
Signed-off-by: Jan Beulich <jbeulich@suse.com>
Acked-by: Andrew Cooper <andrew.cooper3@citrix.com>
---
v8: New.
--- a/tools/tests/x86_emulator/Makefile
+++ b/tools/tests/x86_emulator/Makefile
@@ -19,8 +19,9 @@ CFLAGS += $(CFLAGS_xeninclude)
SIMD := 3dnow sse sse2 sse4 avx avx2 xop avx512f avx512bw avx512dq avx512er avx512vbmi
FMA := fma4 fma
SG := avx2-sg avx512f-sg avx512vl-sg
+AES := ssse3-aes avx-aes avx2-vaes avx512bw-vaes
GF := sse2-gf avx2-gf avx512bw-gf
-TESTCASES := blowfish $(SIMD) $(FMA) $(SG) $(GF)
+TESTCASES := blowfish $(SIMD) $(FMA) $(SG) $(AES) $(GF)
OPMASK := avx512f avx512dq avx512bw
@@ -143,6 +144,10 @@ $(1)-cflags := \
$(foreach flt,$($(1)-flts), \
"-D_$(vec)x$(idx)f$(flt) -m$(1:-sg=) $(call non-sse,$(1)) -Os -DVEC_MAX=$(vec) -DIDX_SIZE=$(idx) -DFLOAT_SIZE=$(flt)")))
endef
+define simd-aes-defs
+$(1)-cflags := $(foreach vec,$($(patsubst %-aes,sse,$(1))-vecs) $($(patsubst %-vaes,%,$(1))-vecs), \
+ "-D_$(vec) -maes $(addprefix -m,$(subst -,$(space),$(1))) $(call non-sse,$(1)) -Os -DVEC_SIZE=$(vec)")
+endef
define simd-gf-defs
$(1)-cflags := $(foreach vec,$($(1:-gf=)-vecs), \
"-D_$(vec) -mgfni -m$(1:-gf=) $(call non-sse,$(1)) -Os -DVEC_SIZE=$(vec)")
@@ -153,6 +158,7 @@ endef
$(foreach flavor,$(SIMD) $(FMA),$(eval $(call simd-defs,$(flavor))))
$(foreach flavor,$(SG),$(eval $(call simd-sg-defs,$(flavor))))
+$(foreach flavor,$(AES),$(eval $(call simd-aes-defs,$(flavor))))
$(foreach flavor,$(GF),$(eval $(call simd-gf-defs,$(flavor))))
$(foreach flavor,$(OPMASK),$(eval $(call opmask-defs,$(flavor))))
@@ -203,10 +209,13 @@ $(addsuffix .c,$(FMA)):
$(addsuffix .c,$(SG)):
ln -sf simd-sg.c $@
+$(addsuffix .c,$(AES)):
+ ln -sf simd-aes.c $@
+
$(addsuffix .c,$(GF)):
ln -sf simd-gf.c $@
-$(addsuffix .h,$(SIMD) $(FMA) $(SG) $(GF)): simd.h
+$(addsuffix .h,$(SIMD) $(FMA) $(SG) $(AES) $(GF)): simd.h
xop.h avx512f.h: simd-fma.c
--- /dev/null
+++ b/tools/tests/x86_emulator/simd-aes.c
@@ -0,0 +1,102 @@
+#define UINT_SIZE 1
+
+#include "simd.h"
+ENTRY(aes_test);
+
+#if VEC_SIZE == 16
+# define AES(op, a...) __builtin_ia32_vaes ## op ## _v16qi(a)
+# define imc(x) ((vec_t)__builtin_ia32_aesimc128((vdi_t)(x)))
+#elif VEC_SIZE == 32
+# define AES(op, a...) __builtin_ia32_vaes ## op ## _v32qi(a)
+# define imc(x) ({ \
+ vec_t r_; \
+ unsigned char __attribute__((vector_size(16))) t_; \
+ asm ( "vaesimc (%3), %x0\n\t" \
+ "vaesimc 16(%3), %1\n\t" \
+ "vinserti128 $1, %1, %0, %0" \
+ : "=&v" (r_), "=&v" (t_) \
+ : "m" (x), "r" (&(x)) ); \
+ r_; \
+})
+#elif VEC_SIZE == 64
+# define AES(op, a...) __builtin_ia32_vaes ## op ## _v64qi(a)
+# define imc(x) ({ \
+ vec_t r_; \
+ unsigned char __attribute__((vector_size(16))) t_; \
+ asm ( "vaesimc (%3), %x0\n\t" \
+ "vaesimc 1*16(%3), %1\n\t" \
+ "vinserti32x4 $1, %1, %0, %0\n\t" \
+ "vaesimc 2*16(%3), %1\n\t" \
+ "vinserti32x4 $2, %1, %0, %0\n\t" \
+ "vaesimc 3*16(%3), %1\n\t" \
+ "vinserti32x4 $3, %1, %0, %0" \
+ : "=&v" (r_), "=&v" (t_) \
+ : "m" (x), "r" (&(x)) ); \
+ r_; \
+})
+#endif
+
+#ifdef __AVX512BW__
+# define ALL_TRUE (~0ULL >> (64 - ELEM_COUNT))
+# define eq(x, y) (B(pcmpeqb, _mask, (vqi_t)(x), (vqi_t)(y), -1) == ALL_TRUE)
+# define aes(op, x, y) ((vec_t)AES(op, (vqi_t)(x), (vqi_t)(y)))
+#else
+# if defined(__AVX2__) && VEC_SIZE == 32
+# define to_bool(cmp) B(ptestc, , cmp, (vdi_t){} == 0)
+# define aes(op, x, y) ((vec_t)AES(op, (vqi_t)(x), (vqi_t)(y)))
+# else
+# define to_bool(cmp) (__builtin_ia32_pmovmskb128(cmp) == 0xffff)
+# define aes(op, x, y) ((vec_t)__builtin_ia32_aes ## op ## 128((vdi_t)(x), (vdi_t)(y)))
+# endif
+# define eq(x, y) to_bool((x) == (y))
+#endif
+
+int aes_test(void)
+{
+ unsigned int i;
+ vec_t src, zero = {};
+
+ for ( i = 0; i < ELEM_COUNT; ++i )
+ src[i] = i;
+
+ do {
+ vec_t x, y;
+
+ touch(src);
+ x = imc(src);
+ touch(src);
+
+ touch(zero);
+ y = aes(enclast, src, zero);
+ touch(zero);
+ y = aes(dec, y, zero);
+
+ if ( !eq(x, y) ) return __LINE__;
+
+ touch(zero);
+ x = aes(declast, src, zero);
+ touch(zero);
+ y = aes(enc, x, zero);
+ touch(y);
+ x = imc(y);
+
+ if ( !eq(x, src) ) return __LINE__;
+
+#if VEC_SIZE == 16
+ touch(src);
+ x = (vec_t)__builtin_ia32_aeskeygenassist128((vdi_t)src, 0);
+ touch(src);
+ y = (vec_t)__builtin_ia32_pshufb128((vqi_t)x,
+ (vqi_t){ 7, 4, 5, 6,
+ 1, 2, 3, 0,
+ 15, 12, 13, 14,
+ 9, 10, 11, 8 });
+ if ( !eq(x, y) ) return __LINE__;
+#endif
+
+ src += ELEM_COUNT;
+ i += ELEM_COUNT;
+ } while ( i <= 256 );
+
+ return 0;
+}
--- a/tools/tests/x86_emulator/simd.h
+++ b/tools/tests/x86_emulator/simd.h
@@ -340,6 +340,10 @@ REN(pandn, , d);
REN(por, , d);
REN(pxor, , d);
# endif
+OVR(aesdec);
+OVR(aesdeclast);
+OVR(aesenc);
+OVR(aesenclast);
OVR(cvtpd2dqx);
OVR(cvtpd2dqy);
OVR(cvtpd2psx);
--- a/tools/tests/x86_emulator/test_x86_emulator.c
+++ b/tools/tests/x86_emulator/test_x86_emulator.c
@@ -12,12 +12,15 @@ asm ( ".pushsection .test, \"ax\", @prog
#include "sse.h"
#include "sse2.h"
#include "sse2-gf.h"
+#include "ssse3-aes.h"
#include "sse4.h"
#include "avx.h"
+#include "avx-aes.h"
#include "fma4.h"
#include "fma.h"
#include "avx2.h"
#include "avx2-sg.h"
+#include "avx2-vaes.h"
#include "avx2-gf.h"
#include "xop.h"
#include "avx512f-opmask.h"
@@ -27,6 +30,7 @@ asm ( ".pushsection .test, \"ax\", @prog
#include "avx512f-sg.h"
#include "avx512vl-sg.h"
#include "avx512bw.h"
+#include "avx512bw-vaes.h"
#include "avx512bw-gf.h"
#include "avx512dq.h"
#include "avx512er.h"
@@ -91,6 +95,16 @@ static bool simd_check_xop(void)
return cpu_has_xop;
}
+static bool simd_check_ssse3_aes(void)
+{
+ return cpu_has_aesni && cpu_has_ssse3;
+}
+
+static bool simd_check_avx_aes(void)
+{
+ return cpu_has_aesni && cpu_has_avx;
+}
+
static bool simd_check_avx512f(void)
{
return cpu_has_avx512f;
@@ -141,6 +155,22 @@ static bool simd_check_avx512vbmi_vl(voi
return cpu_has_avx512_vbmi && cpu_has_avx512vl;
}
+static bool simd_check_avx2_vaes(void)
+{
+ return cpu_has_aesni && cpu_has_vaes && cpu_has_avx2;
+}
+
+static bool simd_check_avx512bw_vaes(void)
+{
+ return cpu_has_aesni && cpu_has_vaes && cpu_has_avx512bw;
+}
+
+static bool simd_check_avx512bw_vaes_vl(void)
+{
+ return cpu_has_aesni && cpu_has_vaes &&
+ cpu_has_avx512bw && cpu_has_avx512vl;
+}
+
static bool simd_check_sse2_gf(void)
{
return cpu_has_gfni && cpu_has_sse2;
@@ -319,6 +349,8 @@ static const struct {
SIMD(XOP i16x16, xop, 32i2),
SIMD(XOP i32x8, xop, 32i4),
SIMD(XOP i64x4, xop, 32i8),
+ SIMD(AES (legacy), ssse3_aes, 16),
+ SIMD(AES (VEX/x16), avx_aes, 16),
SIMD(OPMASK/w, avx512f_opmask, 2),
SIMD(OPMASK+DQ/b, avx512dq_opmask, 1),
SIMD(OPMASK+DQ/w, avx512dq_opmask, 2),
@@ -418,6 +450,10 @@ static const struct {
AVX512VL(_VBMI+VL u16x8, avx512vbmi, 16u2),
AVX512VL(_VBMI+VL s16x16, avx512vbmi, 32i2),
AVX512VL(_VBMI+VL u16x16, avx512vbmi, 32u2),
+ SIMD(VAES (VEX/x32), avx2_vaes, 32),
+ SIMD(VAES (EVEX/x64), avx512bw_vaes, 64),
+ AVX512VL(VL+VAES (x16), avx512bw_vaes, 16),
+ AVX512VL(VL+VAES (x32), avx512bw_vaes, 32),
SIMD(GFNI (legacy), sse2_gf, 16),
SIMD(GFNI (VEX/x16), avx2_gf, 16),
SIMD(GFNI (VEX/x32), avx2_gf, 32),
--- a/tools/tests/x86_emulator/x86-emulate.h
+++ b/tools/tests/x86_emulator/x86-emulate.h
@@ -125,10 +125,12 @@ static inline bool xcr0_mask(uint64_t ma
#define cpu_has_sse cp.basic.sse
#define cpu_has_sse2 cp.basic.sse2
#define cpu_has_sse3 cp.basic.sse3
+#define cpu_has_ssse3 cp.basic.ssse3
#define cpu_has_fma (cp.basic.fma && xcr0_mask(6))
#define cpu_has_sse4_1 cp.basic.sse4_1
#define cpu_has_sse4_2 cp.basic.sse4_2
#define cpu_has_popcnt cp.basic.popcnt
+#define cpu_has_aesni cp.basic.aesni
#define cpu_has_avx (cp.basic.avx && xcr0_mask(6))
#define cpu_has_f16c (cp.basic.f16c && xcr0_mask(6))
_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel
^ permalink raw reply [flat|nested] 15+ messages in thread
* [Xen-devel] [PATCH v10 12/13] x86emul: add a SHA test case to the harness
2019-07-17 6:27 [Xen-devel] [PATCH v10 00/13] x86emul: remaining AVX512 support Jan Beulich
` (10 preceding siblings ...)
2019-07-17 6:37 ` [Xen-devel] [PATCH v10 11/13] x86emul: add an AES/VAES test case to the harness Jan Beulich
@ 2019-07-17 6:38 ` Jan Beulich
2019-07-17 6:38 ` [Xen-devel] [PATCH v10 13/13] x86emul: add a PCLMUL/VPCLMUL " Jan Beulich
12 siblings, 0 replies; 15+ messages in thread
From: Jan Beulich @ 2019-07-17 6:38 UTC (permalink / raw)
To: xen-devel; +Cc: Andrew Cooper, Wei Liu, Roger Pau Monne
Also use this for AVX512VL VPRO{L,R}{,V}D as well as some further shifts
testing.
Signed-off-by: Jan Beulich <jbeulich@suse.com>
Acked-by: Andrew Cooper <andrew.cooper3@citrix.com>
---
v8: New.
--- a/tools/tests/x86_emulator/Makefile
+++ b/tools/tests/x86_emulator/Makefile
@@ -20,8 +20,9 @@ SIMD := 3dnow sse sse2 sse4 avx avx2 xop
FMA := fma4 fma
SG := avx2-sg avx512f-sg avx512vl-sg
AES := ssse3-aes avx-aes avx2-vaes avx512bw-vaes
+SHA := sse4-sha avx-sha avx512f-sha
GF := sse2-gf avx2-gf avx512bw-gf
-TESTCASES := blowfish $(SIMD) $(FMA) $(SG) $(AES) $(GF)
+TESTCASES := blowfish $(SIMD) $(FMA) $(SG) $(AES) $(SHA) $(GF)
OPMASK := avx512f avx512dq avx512bw
@@ -148,6 +149,10 @@ define simd-aes-defs
$(1)-cflags := $(foreach vec,$($(patsubst %-aes,sse,$(1))-vecs) $($(patsubst %-vaes,%,$(1))-vecs), \
"-D_$(vec) -maes $(addprefix -m,$(subst -,$(space),$(1))) $(call non-sse,$(1)) -Os -DVEC_SIZE=$(vec)")
endef
+define simd-sha-defs
+$(1)-cflags := $(foreach vec,$(sse-vecs), \
+ "-D_$(vec) $(addprefix -m,$(subst -,$(space),$(1))) -Os -DVEC_SIZE=$(vec)")
+endef
define simd-gf-defs
$(1)-cflags := $(foreach vec,$($(1:-gf=)-vecs), \
"-D_$(vec) -mgfni -m$(1:-gf=) $(call non-sse,$(1)) -Os -DVEC_SIZE=$(vec)")
@@ -159,6 +164,7 @@ endef
$(foreach flavor,$(SIMD) $(FMA),$(eval $(call simd-defs,$(flavor))))
$(foreach flavor,$(SG),$(eval $(call simd-sg-defs,$(flavor))))
$(foreach flavor,$(AES),$(eval $(call simd-aes-defs,$(flavor))))
+$(foreach flavor,$(SHA),$(eval $(call simd-sha-defs,$(flavor))))
$(foreach flavor,$(GF),$(eval $(call simd-gf-defs,$(flavor))))
$(foreach flavor,$(OPMASK),$(eval $(call opmask-defs,$(flavor))))
@@ -212,10 +218,13 @@ $(addsuffix .c,$(SG)):
$(addsuffix .c,$(AES)):
ln -sf simd-aes.c $@
+$(addsuffix .c,$(SHA)):
+ ln -sf simd-sha.c $@
+
$(addsuffix .c,$(GF)):
ln -sf simd-gf.c $@
-$(addsuffix .h,$(SIMD) $(FMA) $(SG) $(AES) $(GF)): simd.h
+$(addsuffix .h,$(SIMD) $(FMA) $(SG) $(AES) $(SHA) $(GF)): simd.h
xop.h avx512f.h: simd-fma.c
--- /dev/null
+++ b/tools/tests/x86_emulator/simd-sha.c
@@ -0,0 +1,392 @@
+#define INT_SIZE 4
+
+#include "simd.h"
+ENTRY(sha_test);
+
+#define SHA(op, a...) __builtin_ia32_sha ## op(a)
+
+#ifdef __AVX512F__
+# define ALL_TRUE (~0ULL >> (64 - ELEM_COUNT))
+# define eq(x, y) (B(pcmpeqd, _mask, x, y, -1) == ALL_TRUE)
+# define blend(x, y, sel) B(movdqa32_, _mask, y, x, sel)
+# define rot_c(f, r, x, n) B(pro ## f ## d, _mask, x, n, undef(), ~0)
+# define rot_s(f, r, x, n) ({ /* gcc does not support embedded broadcast */ \
+ vec_t r_; \
+ asm ( "vpro" #f "vd %2%{1to%c3%}, %1, %0" \
+ : "=v" (r_) \
+ : "v" (x), "m" (n), "i" (ELEM_COUNT) ); \
+ r_; \
+})
+# define rot_v(d, x, n) B(pro ## d ## vd, _mask, x, n, undef(), ~0)
+# define shift_s(d, x, n) ({ \
+ vec_t r_; \
+ asm ( "vps" #d "lvd %2%{1to%c3%}, %1, %0" \
+ : "=v" (r_) \
+ : "v" (x), "m" (n), "i" (ELEM_COUNT) ); \
+ r_; \
+})
+# define vshift(d, x, n) ({ /* gcc does not allow memory operands */ \
+ vec_t r_; \
+ asm ( "vps" #d "ldq %2, %1, %0" \
+ : "=v" (r_) : "m" (x), "i" ((n) * ELEM_SIZE) ); \
+ r_; \
+})
+#else
+# define to_bool(cmp) (__builtin_ia32_pmovmskb128(cmp) == 0xffff)
+# define eq(x, y) to_bool((x) == (y))
+# define blend(x, y, sel) \
+ ((vec_t)__builtin_ia32_pblendw128((vhi_t)(x), (vhi_t)(y), \
+ ((sel) & 1 ? 0x03 : 0) | \
+ ((sel) & 2 ? 0x0c : 0) | \
+ ((sel) & 4 ? 0x30 : 0) | \
+ ((sel) & 8 ? 0xc0 : 0)))
+# define rot_c(f, r, x, n) (sh ## f ## _c(x, n) | sh ## r ## _c(x, 32 - (n)))
+# define rot_s(f, r, x, n) ({ /* gcc does not allow memory operands */ \
+ vec_t r_, t_, n_ = (vec_t){ 32 } - (n); \
+ asm ( "ps" #f "ld %2, %0; ps" #r "ld %3, %1; por %1, %0" \
+ : "=&x" (r_), "=&x" (t_) \
+ : "m" (n), "m" (n_), "0" (x), "1" (x) ); \
+ r_; \
+})
+static inline unsigned int rotl(unsigned int x, unsigned int n)
+{
+ return (x << (n & 0x1f)) | (x >> ((32 - n) & 0x1f));
+}
+static inline unsigned int rotr(unsigned int x, unsigned int n)
+{
+ return (x >> (n & 0x1f)) | (x << ((32 - n) & 0x1f));
+}
+# define rot_v(d, x, n) ({ \
+ vec_t t_; \
+ unsigned int i_; \
+ for ( i_ = 0; i_ < ELEM_COUNT; ++i_ ) \
+ t_[i_] = rot ## d((x)[i_], (n)[i_]); \
+ t_; \
+})
+# define shift_s(d, x, n) ({ \
+ vec_t r_; \
+ asm ( "ps" #d "ld %1, %0" : "=&x" (r_) : "m" (n), "0" (x) ); \
+ r_; \
+})
+# define vshift(d, x, n) \
+ (vec_t)(__builtin_ia32_ps ## d ## ldqi128((vdi_t)(x), (n) * ELEM_SIZE * 8))
+#endif
+
+#define alignr(x, y, n) ((vec_t)__builtin_ia32_palignr128((vdi_t)(x), (vdi_t)(y), (n) * 8))
+#define hadd(x, y) __builtin_ia32_phaddd128(x, y)
+#define rol_c(x, n) rot_c(l, r, x, n)
+#define rol_s(x, n) rot_s(l, r, x, n)
+#define rol_v(x, n...) rot_v(l, x, n)
+#define ror_c(x, n) rot_c(r, l, x, n)
+#define ror_s(x, n) rot_s(r, l, x, n)
+#define ror_v(x, n...) rot_v(r, x, n)
+#define shl_c(x, n) __builtin_ia32_pslldi128(x, n)
+#define shl_s(x, n) shift_s(l, x, n)
+#define shr_c(x, n) __builtin_ia32_psrldi128(x, n)
+#define shr_s(x, n) shift_s(r, x, n)
+#define shuf(x, s) __builtin_ia32_pshufd(x, s)
+#define swap(x) shuf(x, 0b00011011)
+#define vshl(x, n) vshift(l, x, n)
+#define vshr(x, n) vshift(r, x, n)
+
+static inline vec_t sha256_sigma0(vec_t w)
+{
+ vec_t res;
+
+ touch(w);
+ res = ror_c(w, 7);
+ touch(w);
+ res ^= rol_c(w, 14);
+ touch(w);
+ res ^= shr_c(w, 3);
+ touch(w);
+
+ return res;
+}
+
+static inline vec_t sha256_sigma1(vec_t w)
+{
+ vec_t _17 = { 17 }, _19 = { 19 }, _10 = { 10 };
+
+ return ror_s(w, _17) ^ ror_s(w, _19) ^ shr_s(w, _10);
+}
+
+static inline vec_t sha256_Sigma0(vec_t w)
+{
+ vec_t res, n1 = { 0, 0, 2, 2 }, n2 = { 0, 0, 13, 13 }, n3 = { 0, 0, 10, 10 };
+
+ touch(n1);
+ res = ror_v(w, n1);
+ touch(n2);
+ res ^= ror_v(w, n2);
+ touch(n3);
+
+ return res ^ rol_v(w, n3);
+}
+
+static inline vec_t sha256_Sigma1(vec_t w)
+{
+ return ror_c(w, 6) ^ ror_c(w, 11) ^ rol_c(w, 7);
+}
+
+int sha_test(void)
+{
+ unsigned int i;
+ vec_t src, one = { 1 };
+ vqi_t raw = {};
+
+ for ( i = 1; i < VEC_SIZE; ++i )
+ raw[i] = i;
+ src = (vec_t)raw;
+
+ for ( i = 0; i < 256; i += VEC_SIZE )
+ {
+ vec_t x, y, tmp, hash = -src;
+ vec_t a, b, c, d, e, g, h;
+ unsigned int k, r;
+
+ touch(src);
+ x = SHA(1msg1, hash, src);
+ touch(src);
+ y = hash ^ alignr(hash, src, 8);
+ touch(src);
+
+ if ( !eq(x, y) ) return __LINE__;
+
+ touch(src);
+ x = SHA(1msg2, hash, src);
+ touch(src);
+ tmp = hash ^ alignr(src, hash, 12);
+ touch(tmp);
+ y = rol_c(tmp, 1);
+ tmp = hash ^ alignr(src, y, 12);
+ touch(tmp);
+ y = rol_c(tmp, 1);
+
+ if ( !eq(x, y) ) return __LINE__;
+
+ touch(src);
+ x = SHA(1msg2, hash, src);
+ touch(src);
+ tmp = rol_s(hash ^ alignr(src, hash, 12), one);
+ y = rol_s(hash ^ alignr(src, tmp, 12), one);
+
+ if ( !eq(x, y) ) return __LINE__;
+
+ touch(src);
+ x = SHA(1nexte, hash, src);
+ touch(src);
+ touch(hash);
+ tmp = rol_c(hash, 30);
+ tmp[2] = tmp[1] = tmp[0] = 0;
+
+ if ( !eq(x, src + tmp) ) return __LINE__;
+
+ /*
+ * SHA1RNDS4
+ *
+ * SRC1 = { A0, B0, C0, D0 }
+ * SRC2 = W' = { W[0]E0, W[1], W[2], W[3] }
+ *
+ * (NB that the notation is not C-like, i.e. elements are listed
+ * high-to-low everywhere in this comment.)
+ *
+ * In order to pick a simple rounds function, an immediate value of
+ * 1 is used; 3 would also be a possibility.
+ *
+ * Applying
+ *
+ * A1 = ROL5(A0) + (B0 ^ C0 ^ D0) + W'[0] + K
+ * E1 = D0
+ * D1 = C0
+ * C1 = ROL30(B0)
+ * B1 = A0
+ *
+ * iteratively four times and resolving round variable values to
+ * A<n> and B0, C0, and D0 we get
+ *
+ * A4 = ROL5(A3) + (A2 ^ ROL30(A1) ^ ROL30(A0)) + W'[3] + ROL30(B0) + K
+ * A3 = ROL5(A2) + (A1 ^ ROL30(A0) ^ ROL30(B0)) + W'[2] + C0 + K
+ * A2 = ROL5(A1) + (A0 ^ ROL30(B0) ^ C0 ) + W'[1] + D0 + K
+ * A1 = ROL5(A0) + (B0 ^ C0 ^ D0 ) + W'[0] + K
+ *
+ * (respective per-column variable names:
+ * y a b c d src e k
+ * )
+ *
+ * with
+ *
+ * B4 = A3
+ * C4 = ROL30(A2)
+ * D4 = ROL30(A1)
+ * E4 = ROL30(A0)
+ *
+ * and hence
+ *
+ * DST = { A4, A3, ROL30(A2), ROL30(A1) }
+ */
+
+ touch(src);
+ x = SHA(1rnds4, hash, src, 1);
+ touch(src);
+
+ a = vshr(hash, 3);
+ b = vshr(hash, 2);
+ touch(hash);
+ d = rol_c(hash, 30);
+ touch(hash);
+ d = blend(d, hash, 0b0011);
+ c = vshr(d, 1);
+ e = vshl(d, 1);
+ tmp = (vec_t){};
+ k = rol_c(SHA(1rnds4, tmp, tmp, 1), 2)[0];
+
+ for ( r = 0; r < 4; ++r )
+ {
+ y = rol_c(a, 5) + (b ^ c ^ d) + swap(src) + e + k;
+
+ switch ( r )
+ {
+ case 0:
+ c[3] = rol_c(y, 30)[0];
+ /* fall through */
+ case 1:
+ b[r + 2] = y[r];
+ /* fall through */
+ case 2:
+ a[r + 1] = y[r];
+ break;
+ }
+
+ switch ( r )
+ {
+ case 3:
+ if ( a[3] != y[2] ) return __LINE__;
+ /* fall through */
+ case 2:
+ if ( a[2] != y[1] ) return __LINE__;
+ if ( b[3] != y[1] ) return __LINE__;
+ /* fall through */
+ case 1:
+ if ( a[1] != y[0] ) return __LINE__;
+ if ( b[2] != y[0] ) return __LINE__;
+ if ( c[3] != rol_c(y, 30)[0] ) return __LINE__;
+ break;
+ }
+ }
+
+ a = blend(rol_c(y, 30), y, 0b1100);
+
+ if ( !eq(x, a) ) return __LINE__;
+
+ touch(src);
+ x = SHA(256msg1, hash, src);
+ touch(src);
+ y = hash + sha256_sigma0(alignr(src, hash, 4));
+
+ if ( !eq(x, y) ) return __LINE__;
+
+ touch(src);
+ x = SHA(256msg2, hash, src);
+ touch(src);
+ tmp = hash + sha256_sigma1(alignr(hash, src, 8));
+ y = hash + sha256_sigma1(alignr(tmp, src, 8));
+
+ if ( !eq(x, y) ) return __LINE__;
+
+ /*
+ * SHA256RNDS2
+ *
+ * SRC1 = { C0, D0, G0, H0 }
+ * SRC2 = { A0, B0, E0, F0 }
+ * XMM0 = W' = { ?, ?, WK1, WK0 }
+ *
+ * (NB that the notation again is not C-like, i.e. elements are listed
+ * high-to-low everywhere in this comment.)
+ *
+ * Ch(E,F,G) = (E & F) ^ (~E & G)
+ * Maj(A,B,C) = (A & B) ^ (A & C) ^ (B & C)
+ *
+ * Σ0(A) = ROR2(A) ^ ROR13(A) ^ ROR22(A)
+ * Σ1(E) = ROR6(E) ^ ROR11(E) ^ ROR25(E)
+ *
+ * Applying
+ *
+ * A1 = Ch(E0, F0, G0) + Σ1(E0) + WK0 + H0 + Maj(A0, B0, C0) + Σ0(A0)
+ * B1 = A0
+ * C1 = B0
+ * D1 = C0
+ * E1 = Ch(E0, F0, G0) + Σ1(E0) + WK0 + H0 + D0
+ * F1 = E0
+ * G1 = F0
+ * H1 = G0
+ *
+ * iteratively four times and resolving round variable values to
+ * A<n> / E<n> and B0, C0, D0, F0, G0, and H0 we get
+ *
+ * A2 = Ch(E1, E0, F0) + Σ1(E1) + WK1 + G0 + Maj(A1, A0, B0) + Σ0(A1)
+ * A1 = Ch(E0, F0, G0) + Σ1(E0) + WK0 + H0 + Maj(A0, B0, C0) + Σ0(A0)
+ * E2 = Ch(E1, E0, F0) + Σ1(E1) + WK1 + G0 + C0
+ * E1 = Ch(E0, F0, G0) + Σ1(E0) + WK0 + H0 + D0
+ *
+ * with
+ *
+ * B2 = A1
+ * F2 = E1
+ *
+ * and hence
+ *
+ * DST = { A2, A1, E2, E1 }
+ *
+ * which we can simplify a little, by letting A0, B0, and E0 be zero
+ * and F0 = ~G0, and by then utilizing
+ *
+ * Ch(0, 0, x) = x
+ * Ch(x, 0, y) = ~x & y
+ * Maj(x, 0, 0) = Maj(0, x, 0) = Maj(0, 0, x) = 0
+ *
+ * A2 = (~E1 & F0) + Σ1(E1) + WK1 + G0 + Σ0(A1)
+ * A1 = (~E0 & G0) + Σ1(E0) + WK0 + H0 + Σ0(A0)
+ * E2 = (~E1 & F0) + Σ1(E1) + WK1 + G0 + C0
+ * E1 = (~E0 & G0) + Σ1(E0) + WK0 + H0 + D0
+ *
+ * (respective per-column variable names:
+ * y e g e src h d
+ * )
+ */
+
+ tmp = (vec_t){ ~hash[1] };
+ touch(tmp);
+ x = SHA(256rnds2, hash, tmp, src);
+ touch(tmp);
+
+ e = y = (vec_t){};
+ d = alignr(y, hash, 8);
+ g = (vec_t){ hash[1], tmp[0], hash[1], tmp[0] };
+ h = shuf(hash, 0b01000100);
+
+ for ( r = 0; r < 2; ++r )
+ {
+ y = (~e & g) + sha256_Sigma1(e) + shuf(src, 0b01000100) +
+ h + sha256_Sigma0(d);
+
+ if ( !r )
+ {
+ d[3] = y[2];
+ e[3] = e[1] = y[0];
+ }
+ else if ( d[3] != y[2] )
+ return __LINE__;
+ else if ( e[1] != y[0] )
+ return __LINE__;
+ else if ( e[3] != y[0] )
+ return __LINE__;
+ }
+
+ if ( !eq(x, y) ) return __LINE__;
+
+ src += 0x01010101 * VEC_SIZE;
+ }
+
+ return 0;
+}
--- a/tools/tests/x86_emulator/test_x86_emulator.c
+++ b/tools/tests/x86_emulator/test_x86_emulator.c
@@ -14,8 +14,10 @@ asm ( ".pushsection .test, \"ax\", @prog
#include "sse2-gf.h"
#include "ssse3-aes.h"
#include "sse4.h"
+#include "sse4-sha.h"
#include "avx.h"
#include "avx-aes.h"
+#include "avx-sha.h"
#include "fma4.h"
#include "fma.h"
#include "avx2.h"
@@ -28,6 +30,7 @@ asm ( ".pushsection .test, \"ax\", @prog
#include "avx512bw-opmask.h"
#include "avx512f.h"
#include "avx512f-sg.h"
+#include "avx512f-sha.h"
#include "avx512vl-sg.h"
#include "avx512bw.h"
#include "avx512bw-vaes.h"
@@ -155,6 +158,21 @@ static bool simd_check_avx512vbmi_vl(voi
return cpu_has_avx512_vbmi && cpu_has_avx512vl;
}
+static bool simd_check_sse4_sha(void)
+{
+ return cpu_has_sha && cpu_has_sse4_2;
+}
+
+static bool simd_check_avx_sha(void)
+{
+ return cpu_has_sha && cpu_has_avx;
+}
+
+static bool simd_check_avx512f_sha_vl(void)
+{
+ return cpu_has_sha && cpu_has_avx512vl;
+}
+
static bool simd_check_avx2_vaes(void)
{
return cpu_has_aesni && cpu_has_vaes && cpu_has_avx2;
@@ -450,6 +468,9 @@ static const struct {
AVX512VL(_VBMI+VL u16x8, avx512vbmi, 16u2),
AVX512VL(_VBMI+VL s16x16, avx512vbmi, 32i2),
AVX512VL(_VBMI+VL u16x16, avx512vbmi, 32u2),
+ SIMD(SHA, sse4_sha, 16),
+ SIMD(AVX+SHA, avx_sha, 16),
+ AVX512VL(VL+SHA, avx512f_sha, 16),
SIMD(VAES (VEX/x32), avx2_vaes, 32),
SIMD(VAES (EVEX/x64), avx512bw_vaes, 64),
AVX512VL(VL+VAES (x16), avx512bw_vaes, 16),
--- a/tools/tests/x86_emulator/x86-emulate.h
+++ b/tools/tests/x86_emulator/x86-emulate.h
@@ -142,6 +142,7 @@ static inline bool xcr0_mask(uint64_t ma
#define cpu_has_avx512_ifma (cp.feat.avx512_ifma && xcr0_mask(0xe6))
#define cpu_has_avx512er (cp.feat.avx512er && xcr0_mask(0xe6))
#define cpu_has_avx512cd (cp.feat.avx512cd && xcr0_mask(0xe6))
+#define cpu_has_sha cp.feat.sha
#define cpu_has_avx512bw (cp.feat.avx512bw && xcr0_mask(0xe6))
#define cpu_has_avx512vl (cp.feat.avx512vl && xcr0_mask(0xe6))
#define cpu_has_avx512_vbmi (cp.feat.avx512_vbmi && xcr0_mask(0xe6))
_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel
^ permalink raw reply [flat|nested] 15+ messages in thread
* [Xen-devel] [PATCH v10 13/13] x86emul: add a PCLMUL/VPCLMUL test case to the harness
2019-07-17 6:27 [Xen-devel] [PATCH v10 00/13] x86emul: remaining AVX512 support Jan Beulich
` (11 preceding siblings ...)
2019-07-17 6:38 ` [Xen-devel] [PATCH v10 12/13] x86emul: add a SHA " Jan Beulich
@ 2019-07-17 6:38 ` Jan Beulich
12 siblings, 0 replies; 15+ messages in thread
From: Jan Beulich @ 2019-07-17 6:38 UTC (permalink / raw)
To: xen-devel; +Cc: Andrew Cooper, Wei Liu, Roger Pau Monne
Also use this for AVX512_VBMI2 VPSH{L,R}D{,V}{D,Q,W} testing (only the
quad word right shifts get actually used; the assumption is that their
"left" counterparts as well as the double word and word forms then work
as well).
Signed-off-by: Jan Beulich <jbeulich@suse.com>
Acked-by: Andrew Cooper <andrew.cooper3@citrix.com>
---
v8: New.
--- a/tools/tests/x86_emulator/Makefile
+++ b/tools/tests/x86_emulator/Makefile
@@ -20,9 +20,10 @@ SIMD := 3dnow sse sse2 sse4 avx avx2 xop
FMA := fma4 fma
SG := avx2-sg avx512f-sg avx512vl-sg
AES := ssse3-aes avx-aes avx2-vaes avx512bw-vaes
+CLMUL := ssse3-pclmul avx-pclmul avx2-vpclmulqdq avx512bw-vpclmulqdq avx512vbmi2-vpclmulqdq
SHA := sse4-sha avx-sha avx512f-sha
GF := sse2-gf avx2-gf avx512bw-gf
-TESTCASES := blowfish $(SIMD) $(FMA) $(SG) $(AES) $(SHA) $(GF)
+TESTCASES := blowfish $(SIMD) $(FMA) $(SG) $(AES) $(CLMUL) $(SHA) $(GF)
OPMASK := avx512f avx512dq avx512bw
@@ -89,6 +90,7 @@ avx512er-flts := 4 8
avx512vbmi-vecs := $(avx512bw-vecs)
avx512vbmi-ints := $(avx512bw-ints)
avx512vbmi-flts := $(avx512bw-flts)
+avx512vbmi2-vecs := $(avx512bw-vecs)
avx512f-opmask-vecs := 2
avx512dq-opmask-vecs := 1 2
@@ -149,6 +151,10 @@ define simd-aes-defs
$(1)-cflags := $(foreach vec,$($(patsubst %-aes,sse,$(1))-vecs) $($(patsubst %-vaes,%,$(1))-vecs), \
"-D_$(vec) -maes $(addprefix -m,$(subst -,$(space),$(1))) $(call non-sse,$(1)) -Os -DVEC_SIZE=$(vec)")
endef
+define simd-clmul-defs
+$(1)-cflags := $(foreach vec,$($(patsubst %-pclmul,sse,$(1))-vecs) $($(patsubst %-vpclmulqdq,%,$(1))-vecs), \
+ "-D_$(vec) -mpclmul $(addprefix -m,$(subst -,$(space),$(1))) $(call non-sse,$(1)) -Os -DVEC_SIZE=$(vec)")
+endef
define simd-sha-defs
$(1)-cflags := $(foreach vec,$(sse-vecs), \
"-D_$(vec) $(addprefix -m,$(subst -,$(space),$(1))) -Os -DVEC_SIZE=$(vec)")
@@ -164,6 +170,7 @@ endef
$(foreach flavor,$(SIMD) $(FMA),$(eval $(call simd-defs,$(flavor))))
$(foreach flavor,$(SG),$(eval $(call simd-sg-defs,$(flavor))))
$(foreach flavor,$(AES),$(eval $(call simd-aes-defs,$(flavor))))
+$(foreach flavor,$(CLMUL),$(eval $(call simd-clmul-defs,$(flavor))))
$(foreach flavor,$(SHA),$(eval $(call simd-sha-defs,$(flavor))))
$(foreach flavor,$(GF),$(eval $(call simd-gf-defs,$(flavor))))
$(foreach flavor,$(OPMASK),$(eval $(call opmask-defs,$(flavor))))
@@ -218,13 +225,16 @@ $(addsuffix .c,$(SG)):
$(addsuffix .c,$(AES)):
ln -sf simd-aes.c $@
+$(addsuffix .c,$(CLMUL)):
+ ln -sf simd-clmul.c $@
+
$(addsuffix .c,$(SHA)):
ln -sf simd-sha.c $@
$(addsuffix .c,$(GF)):
ln -sf simd-gf.c $@
-$(addsuffix .h,$(SIMD) $(FMA) $(SG) $(AES) $(SHA) $(GF)): simd.h
+$(addsuffix .h,$(SIMD) $(FMA) $(SG) $(AES) $(CLMUL) $(SHA) $(GF)): simd.h
xop.h avx512f.h: simd-fma.c
--- /dev/null
+++ b/tools/tests/x86_emulator/simd-clmul.c
@@ -0,0 +1,150 @@
+#define UINT_SIZE 8
+
+#include "simd.h"
+ENTRY(clmul_test);
+
+#ifdef __AVX512F__ /* AVX512BW may get enabled only below */
+# define ALL_TRUE (~0ULL >> (64 - ELEM_COUNT))
+# define eq(x, y) (B(pcmpeqq, _mask, (vdi_t)(x), (vdi_t)(y), -1) == ALL_TRUE)
+# define lane_shr_unit(x) \
+ ((vec_t)B(palignr, _mask, (vdi_t)(x), (vdi_t)(x), 64, (vdi_t){}, \
+ 0x00ff00ff00ff00ffULL & (~0ULL >> (64 - VEC_SIZE))))
+#else
+# if defined(__AVX2__) && VEC_SIZE == 32
+# define to_bool(cmp) B(ptestc, , cmp, (vdi_t){} == 0)
+# else
+# define to_bool(cmp) (__builtin_ia32_pmovmskb128(cmp) == 0xffff)
+# endif
+# define eq(x, y) to_bool((x) == (y))
+# define lane_shr_unit(x) ((vec_t)B(palignr, , (vdi_t){}, (vdi_t)(x), 64))
+#endif
+
+#define CLMUL(op, x, y, c) (vec_t)(__builtin_ia32_ ## op((vdi_t)(x), (vdi_t)(y), c))
+
+#if VEC_SIZE == 16
+# define clmul(x, y, c) CLMUL(pclmulqdq128, x, y, c)
+# define vpshrd __builtin_ia32_vpshrd_v2di
+#elif VEC_SIZE == 32
+# define clmul(x, y, c) CLMUL(vpclmulqdq_v4di, x, y, c)
+# define vpshrd __builtin_ia32_vpshrd_v4di
+#elif VEC_SIZE == 64
+# define clmul(x, y, c) CLMUL(vpclmulqdq_v8di, x, y, c)
+# define vpshrd __builtin_ia32_vpshrd_v8di
+#endif
+
+#define clmul_ll(x, y) clmul(x, y, 0x00)
+#define clmul_hl(x, y) clmul(x, y, 0x01)
+#define clmul_lh(x, y) clmul(x, y, 0x10)
+#define clmul_hh(x, y) clmul(x, y, 0x11)
+
+#if defined(__AVX512VBMI2__)
+# pragma GCC target ( "avx512bw" )
+# define lane_shr_i(x, n) ({ \
+ vec_t h_ = lane_shr_unit(x); \
+ touch(h_); \
+ (n) < 64 ? (vec_t)vpshrd((vdi_t)(x), (vdi_t)(h_), n) : h_ >> ((n) - 64); \
+})
+# define lane_shr_v(x, n) ({ \
+ vec_t t_ = (x), h_ = lane_shr_unit(x); \
+ typeof(t_[0]) n_ = (n); \
+ if ( (n) < 64 ) \
+ /* gcc does not support embedded broadcast */ \
+ asm ( "vpshrdvq %2%{1to%c3%}, %1, %0" \
+ : "+v" (t_) : "v" (h_), "m" (n_), "i" (ELEM_COUNT) ); \
+ else \
+ t_ = h_ >> ((n) - 64); \
+ t_; \
+})
+#else
+# define lane_shr_i lane_shr_v
+# define lane_shr_v(x, n) ({ \
+ vec_t t_ = (n) > 0 ? lane_shr_unit(x) : (x); \
+ (n) < 64 ? ((x) >> (n)) | (t_ << (-(n) & 0x3f)) \
+ : t_ >> ((n) - 64); \
+})
+#endif
+
+int clmul_test(void)
+{
+ unsigned int i;
+ vec_t src;
+ vqi_t raw = {};
+
+ for ( i = 1; i < VEC_SIZE; ++i )
+ raw[i] = i;
+ src = (vec_t)raw;
+
+ for ( i = 0; i < 256; i += VEC_SIZE )
+ {
+ vec_t x = {}, y, z, lo, hi;
+ unsigned int j;
+
+ touch(x);
+ y = clmul_ll(src, x);
+ touch(x);
+
+ if ( !eq(y, x) ) return __LINE__;
+
+ for ( j = 0; j < ELEM_COUNT; j += 2 )
+ x[j] = 1;
+
+ touch(src);
+ y = clmul_ll(x, src);
+ touch(src);
+ z = clmul_lh(x, src);
+ touch(src);
+
+ for ( j = 0; j < ELEM_COUNT; j += 2 )
+ y[j + 1] = z[j];
+
+ if ( !eq(y, src) ) return __LINE__;
+
+ /*
+ * Besides the obvious property of the low and high half products
+ * being the same either direction, the "square" of a number has the
+ * property of simply being the original bit pattern with a zero bit
+ * inserted between any two bits. This is what the code below checks.
+ */
+
+ x = src;
+ touch(src);
+ y = clmul_lh(x, src);
+ touch(src);
+ z = clmul_hl(x, src);
+
+ if ( !eq(y, z) ) return __LINE__;
+
+ touch(src);
+ y = lo = clmul_ll(x, src);
+ touch(src);
+ z = hi = clmul_hh(x, src);
+ touch(src);
+
+ for ( j = 0; j < 64; ++j )
+ {
+ vec_t l = lane_shr_v(lo, 2 * j);
+ vec_t h = lane_shr_v(hi, 2 * j);
+ unsigned int n;
+
+ if ( !eq(l, y) ) return __LINE__;
+ if ( !eq(h, z) ) return __LINE__;
+
+ x = src >> j;
+
+ for ( n = 0; n < ELEM_COUNT; n += 2 )
+ {
+ if ( (x[n + 0] & 1) != (l[n] & 3) ) return __LINE__;
+ if ( (x[n + 1] & 1) != (h[n] & 3) ) return __LINE__;
+ }
+
+ touch(y);
+ y = lane_shr_i(y, 2);
+ touch(z);
+ z = lane_shr_i(z, 2);
+ }
+
+ src += 0x0101010101010101ULL * VEC_SIZE;
+ }
+
+ return 0;
+}
--- a/tools/tests/x86_emulator/simd.h
+++ b/tools/tests/x86_emulator/simd.h
@@ -381,6 +381,7 @@ OVR(movntdq);
OVR(movntdqa);
OVR(movshdup);
OVR(movsldup);
+OVR(pclmulqdq);
OVR(permd);
OVR(permq);
OVR(pmovsxbd);
--- a/tools/tests/x86_emulator/test_x86_emulator.c
+++ b/tools/tests/x86_emulator/test_x86_emulator.c
@@ -13,16 +13,19 @@ asm ( ".pushsection .test, \"ax\", @prog
#include "sse2.h"
#include "sse2-gf.h"
#include "ssse3-aes.h"
+#include "ssse3-pclmul.h"
#include "sse4.h"
#include "sse4-sha.h"
#include "avx.h"
#include "avx-aes.h"
+#include "avx-pclmul.h"
#include "avx-sha.h"
#include "fma4.h"
#include "fma.h"
#include "avx2.h"
#include "avx2-sg.h"
#include "avx2-vaes.h"
+#include "avx2-vpclmulqdq.h"
#include "avx2-gf.h"
#include "xop.h"
#include "avx512f-opmask.h"
@@ -34,10 +37,12 @@ asm ( ".pushsection .test, \"ax\", @prog
#include "avx512vl-sg.h"
#include "avx512bw.h"
#include "avx512bw-vaes.h"
+#include "avx512bw-vpclmulqdq.h"
#include "avx512bw-gf.h"
#include "avx512dq.h"
#include "avx512er.h"
#include "avx512vbmi.h"
+#include "avx512vbmi2-vpclmulqdq.h"
#define verbose false /* Switch to true for far more logging. */
@@ -108,6 +113,16 @@ static bool simd_check_avx_aes(void)
return cpu_has_aesni && cpu_has_avx;
}
+static bool simd_check_ssse3_pclmul(void)
+{
+ return cpu_has_pclmulqdq && cpu_has_ssse3;
+}
+
+static bool simd_check_avx_pclmul(void)
+{
+ return cpu_has_pclmulqdq && cpu_has_avx;
+}
+
static bool simd_check_avx512f(void)
{
return cpu_has_avx512f;
@@ -189,6 +204,31 @@ static bool simd_check_avx512bw_vaes_vl(
cpu_has_avx512bw && cpu_has_avx512vl;
}
+static bool simd_check_avx2_vpclmulqdq(void)
+{
+ return cpu_has_vpclmulqdq && cpu_has_avx2;
+}
+
+static bool simd_check_avx512bw_vpclmulqdq(void)
+{
+ return cpu_has_vpclmulqdq && cpu_has_avx512bw;
+}
+
+static bool simd_check_avx512bw_vpclmulqdq_vl(void)
+{
+ return cpu_has_vpclmulqdq && cpu_has_avx512bw && cpu_has_avx512vl;
+}
+
+static bool simd_check_avx512vbmi2_vpclmulqdq(void)
+{
+ return cpu_has_avx512_vbmi2 && simd_check_avx512bw_vpclmulqdq();
+}
+
+static bool simd_check_avx512vbmi2_vpclmulqdq_vl(void)
+{
+ return cpu_has_avx512_vbmi2 && simd_check_avx512bw_vpclmulqdq_vl();
+}
+
static bool simd_check_sse2_gf(void)
{
return cpu_has_gfni && cpu_has_sse2;
@@ -369,6 +409,8 @@ static const struct {
SIMD(XOP i64x4, xop, 32i8),
SIMD(AES (legacy), ssse3_aes, 16),
SIMD(AES (VEX/x16), avx_aes, 16),
+ SIMD(PCLMUL (legacy), ssse3_pclmul, 16),
+ SIMD(PCLMUL (VEX/x2), avx_pclmul, 16),
SIMD(OPMASK/w, avx512f_opmask, 2),
SIMD(OPMASK+DQ/b, avx512dq_opmask, 1),
SIMD(OPMASK+DQ/w, avx512dq_opmask, 2),
@@ -475,6 +517,13 @@ static const struct {
SIMD(VAES (EVEX/x64), avx512bw_vaes, 64),
AVX512VL(VL+VAES (x16), avx512bw_vaes, 16),
AVX512VL(VL+VAES (x32), avx512bw_vaes, 32),
+ SIMD(VPCLMUL (VEX/x4), avx2_vpclmulqdq, 32),
+ SIMD(VPCLMUL (EVEX/x8), avx512bw_vpclmulqdq, 64),
+ AVX512VL(VL+VPCLMUL (x4), avx512bw_vpclmulqdq, 16),
+ AVX512VL(VL+VPCLMUL (x8), avx512bw_vpclmulqdq, 32),
+ SIMD(AVX512_VBMI2+VPCLMUL (x8), avx512vbmi2_vpclmulqdq, 64),
+ AVX512VL(_VBMI2+VL+VPCLMUL (x2), avx512vbmi2_vpclmulqdq, 16),
+ AVX512VL(_VBMI2+VL+VPCLMUL (x4), avx512vbmi2_vpclmulqdq, 32),
SIMD(GFNI (legacy), sse2_gf, 16),
SIMD(GFNI (VEX/x16), avx2_gf, 16),
SIMD(GFNI (VEX/x32), avx2_gf, 32),
--- a/tools/tests/x86_emulator/x86-emulate.h
+++ b/tools/tests/x86_emulator/x86-emulate.h
@@ -125,6 +125,7 @@ static inline bool xcr0_mask(uint64_t ma
#define cpu_has_sse cp.basic.sse
#define cpu_has_sse2 cp.basic.sse2
#define cpu_has_sse3 cp.basic.sse3
+#define cpu_has_pclmulqdq cp.basic.pclmulqdq
#define cpu_has_ssse3 cp.basic.ssse3
#define cpu_has_fma (cp.basic.fma && xcr0_mask(6))
#define cpu_has_sse4_1 cp.basic.sse4_1
_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel
^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [Xen-devel] [PATCH v10 01/13] x86emul: support of AVX512* population count insns
2019-07-17 6:33 ` [Xen-devel] [PATCH v10 01/13] x86emul: support of AVX512* population count insns Jan Beulich
@ 2019-07-17 11:32 ` Andrew Cooper
0 siblings, 0 replies; 15+ messages in thread
From: Andrew Cooper @ 2019-07-17 11:32 UTC (permalink / raw)
To: Jan Beulich, xen-devel; +Cc: Wei Liu, Roger Pau Monne
On 17/07/2019 07:33, Jan Beulich wrote:
> Plus the only other AVX512_BITALG one.
>
> As in a few cases before, since the insns here and in particular their
> memory access patterns follow the usual scheme, I didn't think it was
> necessary to add a contrived test specifically for them, beyond the
> Disp8 scaling one.
>
> Signed-off-by: Jan Beulich <jbeulich@suse.com>
Acked-by: Andrew Cooper <andrew.cooper3@citrix.com>
_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel
^ permalink raw reply [flat|nested] 15+ messages in thread
end of thread, other threads:[~2019-07-17 11:33 UTC | newest]
Thread overview: 15+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2019-07-17 6:27 [Xen-devel] [PATCH v10 00/13] x86emul: remaining AVX512 support Jan Beulich
2019-07-17 6:33 ` [Xen-devel] [PATCH v10 01/13] x86emul: support of AVX512* population count insns Jan Beulich
2019-07-17 11:32 ` Andrew Cooper
2019-07-17 6:34 ` [Xen-devel] [PATCH v10 02/13] x86emul: support of AVX512_IFMA insns Jan Beulich
2019-07-17 6:34 ` [Xen-devel] [PATCH v10 03/13] x86emul: support remaining AVX512_VBMI2 insns Jan Beulich
2019-07-17 6:34 ` [Xen-devel] [PATCH v10 04/13] x86emul: support AVX512_4FMAPS insns Jan Beulich
2019-07-17 6:35 ` [Xen-devel] [PATCH v10 05/13] x86emul: support AVX512_4VNNIW insns Jan Beulich
2019-07-17 6:35 ` [Xen-devel] [PATCH v10 06/13] x86emul: support AVX512_VNNI insns Jan Beulich
2019-07-17 6:35 ` [Xen-devel] [PATCH v10 07/13] x86emul: support VPCLMULQDQ insns Jan Beulich
2019-07-17 6:36 ` [Xen-devel] [PATCH v10 08/13] x86emul: support VAES insns Jan Beulich
2019-07-17 6:36 ` [Xen-devel] [PATCH v10 09/13] x86emul: support GFNI insns Jan Beulich
2019-07-17 6:36 ` [Xen-devel] [PATCH v10 10/13] x86emul: restore ordering within main switch statement Jan Beulich
2019-07-17 6:37 ` [Xen-devel] [PATCH v10 11/13] x86emul: add an AES/VAES test case to the harness Jan Beulich
2019-07-17 6:38 ` [Xen-devel] [PATCH v10 12/13] x86emul: add a SHA " Jan Beulich
2019-07-17 6:38 ` [Xen-devel] [PATCH v10 13/13] x86emul: add a PCLMUL/VPCLMUL " Jan Beulich
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.