* [Qemu-devel] [RISU 0/2] i386 support for avx
@ 2019-04-09 4:13 Richard Henderson
2019-04-09 4:13 ` [Qemu-devel] [PATCH 1/2] i386: Add avx512 state to reginfo_t Richard Henderson
2019-04-09 4:13 ` [Qemu-devel] [PATCH 2/2] HACK: Test avx2 state Richard Henderson
0 siblings, 2 replies; 3+ messages in thread
From: Richard Henderson @ 2019-04-09 4:13 UTC (permalink / raw)
To: qemu-devel; +Cc: jan.bobek
It's quite plausible that we should add a command-line argument for
risu, similar to the aarch64 --test-sve option, that explicitly
limits the size of the vectors to be recorded or compared. Otherwise
I worry about the host supporting AVX-N and wanting to test TCG with
narrower vector insns.
Anyway, adding a command-line option should be a relatively trivial
extension to the following, which works to extract state from the
Intel XSAVE structure that the kernel borrows for the signal frame.
r~
Richard Henderson (2):
i386: Add avx512 state to reginfo_t
HACK: Test avx2 state
risu_reginfo_i386.h | 15 ++++
risu_reginfo_i386.c | 194 ++++++++++++++++++++++++++++++++++++++++++--
test_i386.S | 39 +++++++++
3 files changed, 243 insertions(+), 5 deletions(-)
--
2.17.1
^ permalink raw reply [flat|nested] 3+ messages in thread
* [Qemu-devel] [PATCH 1/2] i386: Add avx512 state to reginfo_t
2019-04-09 4:13 [Qemu-devel] [RISU 0/2] i386 support for avx Richard Henderson
@ 2019-04-09 4:13 ` Richard Henderson
2019-04-09 4:13 ` [Qemu-devel] [PATCH 2/2] HACK: Test avx2 state Richard Henderson
1 sibling, 0 replies; 3+ messages in thread
From: Richard Henderson @ 2019-04-09 4:13 UTC (permalink / raw)
To: qemu-devel; +Cc: jan.bobek
The state beyond what is present on the running cpu will be zero.
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
risu_reginfo_i386.h | 15 ++++
risu_reginfo_i386.c | 194 ++++++++++++++++++++++++++++++++++++++++++--
test_i386.S | 39 +++++++++
3 files changed, 243 insertions(+), 5 deletions(-)
diff --git a/risu_reginfo_i386.h b/risu_reginfo_i386.h
index 755283a..1d13e41 100644
--- a/risu_reginfo_i386.h
+++ b/risu_reginfo_i386.h
@@ -12,14 +12,29 @@
#ifndef RISU_REGINFO_I386_H
#define RISU_REGINFO_I386_H
+struct avx512_reg {
+ uint64_t q[8];
+};
+
/*
* This is the data structure we pass over the socket.
* It is a simplified and reduced subset of what can
* be obtained with a ucontext_t*
*/
struct reginfo {
+ uint16_t nvecregs;
+ uint16_t nvecquads;
uint32_t faulting_insn;
+ uint32_t mxcsr;
+
gregset_t gregs;
+
+#ifdef __x86_64__
+ struct avx512_reg vregs[32];
+#else
+ struct avx512_reg vregs[8];
+#endif
+ uint64_t kregs[8];
};
/*
diff --git a/risu_reginfo_i386.c b/risu_reginfo_i386.c
index c4dc14a..e1974ec 100644
--- a/risu_reginfo_i386.c
+++ b/risu_reginfo_i386.c
@@ -11,13 +11,17 @@
#include <stdio.h>
#include <stdlib.h>
+#include <stddef.h>
#include <string.h>
#include <ucontext.h>
#include <assert.h>
+#include <cpuid.h>
#include "risu.h"
#include "risu_reginfo_i386.h"
+#include <asm/sigcontext.h>
+
const struct option * const arch_long_opts;
const char * const arch_extra_help;
@@ -31,10 +35,30 @@ const int reginfo_size(void)
return sizeof(struct reginfo);
}
+static void *xsave_feature_buf(struct _xstate *xs, int feature)
+{
+ unsigned int eax, ebx, ecx, edx;
+ int ok;
+
+ /*
+ * Get the location of the XSAVE feature from the cpuid leaf.
+ * Given that we know the xfeature bit is set, this must succeed.
+ */
+ ok = __get_cpuid_count(0xd, feature, &eax, &ebx, &ecx, &edx);
+ assert(ok);
+
+ /* Sanity check that the frame stored by the kernel contains the data. */
+ assert(xs->fpstate.sw_reserved.extended_size >= eax + ebx);
+
+ return (void *)xs + ebx;
+}
+
/* reginfo_init: initialize with a ucontext */
void reginfo_init(struct reginfo *ri, ucontext_t *uc)
{
- int i;
+ int i, nvecregs, nvecquads;
+ struct _fpstate *fp;
+ struct _xstate *xs;
memset(ri, 0, sizeof(*ri));
@@ -79,12 +103,100 @@ void reginfo_init(struct reginfo *ri, ucontext_t *uc)
* distinguish 'do compare' from 'stop'.
*/
ri->faulting_insn = *(uint32_t *)uc->uc_mcontext.gregs[REG_E(IP)];
+
+ /*
+ * FP state is omitted if unused (aka in init state).
+ * Use the <asm/sigcontext.h> struct for access to AVX state.
+ */
+ fp = (struct _fpstate *)uc->uc_mcontext.fpregs;
+ if (fp == NULL) {
+ return;
+ }
+
+#ifdef __x86_64__
+ nvecregs = 16;
+#else
+ /* We don't (currently) care about the 80387 state, only SSE+. */
+ if (fp->magic != X86_FXSR_MAGIC) {
+ return;
+ }
+ nvecregs = 8;
+#endif
+ nvecquads = 2;
+
+ /*
+ * Now we know that _fpstate contains FXSAVE data.
+ */
+ ri->mxcsr = fp->mxcsr;
+
+ for (i = 0; i < nvecregs; ++i) {
+#ifdef __x86_64__
+ memcpy(&ri->vregs[i], &fp->xmm_space[i], 16);
+#else
+ memcpy(&ri->vregs[i], &fp->_xmm[i * 4], 16);
+#endif
+ }
+
+ if (fp->sw_reserved.magic1 != FP_XSTATE_MAGIC1) {
+ return;
+ }
+ xs = (struct _xstate *)fp;
+
+ /*
+ * Now we know that _fpstate contains XSAVE data.
+ */
+
+ if (xs->xstate_hdr.xfeatures & (1 << 2)) {
+ /* YMM_Hi128 state */
+ void *buf = xsave_feature_buf(xs, 2);
+ for (i = 0; i < nvecregs; ++i) {
+ memcpy(&ri->vregs[i].q[2], buf + 16 * i, 16);
+ }
+ nvecquads = 4;
+ }
+
+ if (xs->xstate_hdr.xfeatures & (1 << 5)) {
+ /* Opmask state */
+ uint64_t *buf = xsave_feature_buf(xs, 5);
+ for (i = 0; i < 8; ++i) {
+ ri->kregs[i] = buf[i];
+ }
+ }
+
+ if (xs->xstate_hdr.xfeatures & (1 << 6)) {
+ /* ZMM_Hi256 state */
+ void *buf = xsave_feature_buf(xs, 6);
+ for (i = 0; i < nvecregs; ++i) {
+ memcpy(&ri->vregs[i].q[4], buf + 32 * i, 32);
+ }
+ nvecquads = 8;
+ }
+
+#ifdef __x86_64__
+ if (xs->xstate_hdr.xfeatures & (1 << 7)) {
+ /* Hi16_ZMM state */
+ void *buf = xsave_feature_buf(xs, 7);
+ for (i = 0; i < 16; ++i) {
+ memcpy(&ri->vregs[i + 16], buf + 64 * i, 64);
+ }
+ nvecregs = 32;
+ }
+#endif
+
+ ri->nvecregs = nvecregs;
+ ri->nvecquads = nvecquads;
}
/* reginfo_is_eq: compare the reginfo structs, returns nonzero if equal */
int reginfo_is_eq(struct reginfo *m, struct reginfo *a)
{
- return 0 == memcmp(m, a, sizeof(*m));
+ /*
+ * Do not compare nvecregs & nvecquads. This allows master
+ * and apprentice to have different vector widths, so long
+ * as we're only interested in the low N bits of the register.
+ */
+ return !memcmp(&m->faulting_insn, &a->faulting_insn,
+ sizeof(*m) - offsetof(struct reginfo, faulting_insn));
}
static const char *const regname[NGREG] = {
@@ -129,25 +241,97 @@ static const char *const regname[NGREG] = {
/* reginfo_dump: print state to a stream, returns nonzero on success */
int reginfo_dump(struct reginfo *ri, FILE *f)
{
- int i;
+ int i, j, n, w;
+ char r;
+
fprintf(f, " faulting insn %x\n", ri->faulting_insn);
for (i = 0; i < NGREG; i++) {
if (regname[i]) {
fprintf(f, " %-6s: " PRIxREG "\n", regname[i], ri->gregs[i]);
}
}
+
+ fprintf(f, " mxcsr : %x\n", ri->mxcsr);
+
+ n = ri->nvecregs;
+ w = ri->nvecquads;
+ r = (w <= 2 ? 'x' : w <= 4 ? 'y' : 'z');
+
+ for (i = 0; i < n; i++) {
+ fprintf(f, " %cmm%-3d: ", r, i);
+ for (j = w - 1; j >= 0; j--) {
+ fprintf(f, "%016" PRIx64 "%c",
+ ri->vregs[i].q[j], j == 0 ? '\n' : ' ');
+ }
+ }
+
+ if (n >= 32 || w >= 8) {
+ for (i = 0; i < 8; i++) {
+ fprintf(f, " k%-5d: %016" PRIx64 "\n", i, ri->kregs[i]);
+ }
+ }
+
return !ferror(f);
}
+static void reginfo_dump_mismatch_vreg(struct reginfo *m, struct reginfo *a,
+ FILE *f, int i, int w)
+{
+ char r = (w <= 2 ? 'x' : w <= 4 ? 'y' : 'z');
+ int j;
+
+ fprintf(f, " %cmm%-3d: ", r, i);
+ for (j = w - 1; j >= 0; j--) {
+ fprintf(f, "%016" PRIx64 "%c",
+ m->vregs[i].q[j], j == 0 ? '\n' : ' ');
+ }
+ fprintf(f, " vs: ");
+ for (j = w - 1; j >= 0; j--) {
+ fprintf(f, "%016" PRIx64 "%c",
+ a->vregs[i].q[j], j == 0 ? '\n' : ' ');
+ }
+}
+
int reginfo_dump_mismatch(struct reginfo *m, struct reginfo *a, FILE *f)
{
- int i;
+ int i, n, wmin, wmax;
+
+ fprintf(f, "Mismatch (master v apprentice):\n");
+
for (i = 0; i < NGREG; i++) {
if (m->gregs[i] != a->gregs[i]) {
assert(regname[i]);
- fprintf(f, "Mismatch: %s: " PRIxREG " v " PRIxREG "\n",
+ fprintf(f, " %-6s: " PRIxREG " v " PRIxREG "\n",
regname[i], m->gregs[i], a->gregs[i]);
}
}
+
+ if (m->mxcsr != a->mxcsr) {
+ fprintf(f, " mxcsr : %x v %x\n", m->mxcsr, a->mxcsr);
+ }
+
+ n = sizeof(m->vregs) / sizeof(m->vregs[0]);
+ wmax = sizeof(m->vregs[0]) / sizeof(m->vregs[0].q[0]);
+ wmin = m->nvecquads;
+
+ for (i = 0; i < n; i++) {
+ if (memcmp(&m->vregs[i], &a->vregs[i], wmax * 8)) {
+ if (memcmp(&m->vregs[i], &a->vregs[i], wmin * 8)) {
+ /* This is expected behaviour. */
+ reginfo_dump_mismatch_vreg(m, a, f, i, wmin);
+ } else {
+ /* This probably means different vector widths. */
+ reginfo_dump_mismatch_vreg(m, a, f, i, wmax);
+ }
+ }
+ }
+
+ for (i = 0; i < 8; i++) {
+ if (m->kregs[i] != a->kregs[i]) {
+ fprintf(f, " k%-5d: %016" PRIx64 " v %016" PRIx64 "\n",
+ i, m->kregs[i], a->kregs[i]);
+ }
+ }
+
return !ferror(f);
}
diff --git a/test_i386.S b/test_i386.S
index 456b99c..05344d7 100644
--- a/test_i386.S
+++ b/test_i386.S
@@ -12,6 +12,37 @@
/* A trivial test image for x86 */
/* Initialise the registers to avoid spurious mismatches */
+
+#ifdef __x86_64__
+#define BASE %rax
+ lea 2f(%rip), BASE
+#else
+#define BASE %eax
+ call 1f
+1: pop BASE
+ add $2f-1b, BASE
+#endif
+
+ movdqa 0(BASE), %xmm0
+ movdqa 1*16(BASE), %xmm1
+ movdqa 2*16(BASE), %xmm2
+ movdqa 3*16(BASE), %xmm3
+ movdqa 4*16(BASE), %xmm4
+ movdqa 5*16(BASE), %xmm5
+ movdqa 6*16(BASE), %xmm6
+ movdqa 7*16(BASE), %xmm7
+
+#ifdef __x86_64__
+ movdqa 8*16(BASE), %xmm8
+ movdqa 9*16(BASE), %xmm9
+ movdqa 10*16(BASE), %xmm10
+ movdqa 11*16(BASE), %xmm11
+ movdqa 12*16(BASE), %xmm12
+ movdqa 13*16(BASE), %xmm13
+ movdqa 14*16(BASE), %xmm14
+ movdqa 15*16(BASE), %xmm15
+#endif
+
xor %eax, %eax
sahf /* init eflags */
@@ -39,3 +70,11 @@
/* exit test */
ud1 %ecx, %eax
+
+ .p2align 16
+2:
+ .set i, 0
+ .rept 256
+ .byte i
+ .set i, i + 1
+ .endr
--
2.17.1
^ permalink raw reply related [flat|nested] 3+ messages in thread
* [Qemu-devel] [PATCH 2/2] HACK: Test avx2 state
2019-04-09 4:13 [Qemu-devel] [RISU 0/2] i386 support for avx Richard Henderson
2019-04-09 4:13 ` [Qemu-devel] [PATCH 1/2] i386: Add avx512 state to reginfo_t Richard Henderson
@ 2019-04-09 4:13 ` Richard Henderson
1 sibling, 0 replies; 3+ messages in thread
From: Richard Henderson @ 2019-04-09 4:13 UTC (permalink / raw)
To: qemu-devel; +Cc: jan.bobek
In order to not be a hack, we'd have to test whether avx2 exists.
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
test_i386.S | 34 +++++++++++++++++-----------------
1 file changed, 17 insertions(+), 17 deletions(-)
diff --git a/test_i386.S b/test_i386.S
index 05344d7..55815fe 100644
--- a/test_i386.S
+++ b/test_i386.S
@@ -23,24 +23,24 @@
add $2f-1b, BASE
#endif
- movdqa 0(BASE), %xmm0
- movdqa 1*16(BASE), %xmm1
- movdqa 2*16(BASE), %xmm2
- movdqa 3*16(BASE), %xmm3
- movdqa 4*16(BASE), %xmm4
- movdqa 5*16(BASE), %xmm5
- movdqa 6*16(BASE), %xmm6
- movdqa 7*16(BASE), %xmm7
+ vmovdqa 0(BASE), %ymm0
+ vmovdqa 1*32(BASE), %ymm1
+ vmovdqa 2*32(BASE), %ymm2
+ vmovdqa 3*32(BASE), %ymm3
+ vmovdqa 4*32(BASE), %ymm4
+ vmovdqa 5*32(BASE), %ymm5
+ vmovdqa 6*32(BASE), %ymm6
+ vmovdqa 7*32(BASE), %ymm7
#ifdef __x86_64__
- movdqa 8*16(BASE), %xmm8
- movdqa 9*16(BASE), %xmm9
- movdqa 10*16(BASE), %xmm10
- movdqa 11*16(BASE), %xmm11
- movdqa 12*16(BASE), %xmm12
- movdqa 13*16(BASE), %xmm13
- movdqa 14*16(BASE), %xmm14
- movdqa 15*16(BASE), %xmm15
+ vmovdqa 8*32(BASE), %ymm8
+ vmovdqa 9*32(BASE), %ymm9
+ vmovdqa 10*32(BASE), %ymm10
+ vmovdqa 11*32(BASE), %ymm11
+ vmovdqa 12*32(BASE), %ymm12
+ vmovdqa 13*32(BASE), %ymm13
+ vmovdqa 14*32(BASE), %ymm14
+ vmovdqa 15*32(BASE), %ymm15
#endif
xor %eax, %eax
@@ -75,6 +75,6 @@
2:
.set i, 0
.rept 256
- .byte i
+ .byte i, i, i, i
.set i, i + 1
.endr
--
2.17.1
^ permalink raw reply related [flat|nested] 3+ messages in thread
end of thread, other threads:[~2019-04-09 4:13 UTC | newest]
Thread overview: 3+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2019-04-09 4:13 [Qemu-devel] [RISU 0/2] i386 support for avx Richard Henderson
2019-04-09 4:13 ` [Qemu-devel] [PATCH 1/2] i386: Add avx512 state to reginfo_t Richard Henderson
2019-04-09 4:13 ` [Qemu-devel] [PATCH 2/2] HACK: Test avx2 state Richard Henderson
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.