From: Jan Beulich <jbeulich@suse.com>
To: "xen-devel@lists.xenproject.org" <xen-devel@lists.xenproject.org>
Cc: "Andrew Cooper" <andrew.cooper3@citrix.com>,
"George Dunlap" <george.dunlap@citrix.com>,
"Wei Liu" <wl@xen.org>, "Roger Pau Monné" <roger.pau@citrix.com>
Subject: [PATCH v3 21/22] x86emul: test AMX insns
Date: Thu, 22 Apr 2021 16:57:04 +0200 [thread overview]
Message-ID: <19925d83-ae92-cdda-f5be-193cfb05c775@suse.com> (raw)
In-Reply-To: <322de6db-e01f-0b57-5777-5d94a13c441a@suse.com>
Carry out some basic matrix operations on 2x2, 3x3, and 4x4 matrices.
To also exercise non-square tiles, additionally transpose matrices of said
square formats via linearization and multiplication by the respective
transposition permutation matrix. To generate the latter, introduce a
small helper tool. This is mainly to avoid creating / populating a
rather large matrix (up to 16x16) in a stack variable.
Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v3: New.
--- a/tools/tests/x86_emulator/Makefile
+++ b/tools/tests/x86_emulator/Makefile
@@ -25,6 +25,12 @@ SHA := sse4-sha avx-sha avx512f-sha
GF := sse2-gf avx2-gf avx512bw-gf
TESTCASES := blowfish $(SIMD) $(FMA) $(SG) $(AES) $(CLMUL) $(SHA) $(GF)
+# AMX (matrix) test flavors; they are only added to TESTCASES when
+# XEN_COMPILE_ARCH is x86_64.
+MATRIX := amx-bf16 amx-int8
+
+ifeq ($(XEN_COMPILE_ARCH),x86_64)
+TESTCASES += $(MATRIX)
+endif
+
OPMASK := avx512f avx512dq avx512bw
ifeq ($(origin XEN_COMPILE_ARCH),override)
@@ -96,6 +102,13 @@ avx512f-opmask-vecs := 2
avx512dq-opmask-vecs := 1 2
avx512bw-opmask-vecs := 4 8
+# Per-flavor parameters consumed by matrix-defs: the square matrix dimensions
+# to build blobs for, plus the integer / floating point element sizes (passed
+# to the compiler as INT_SIZE / UINT_SIZE and FLOAT_SIZE respectively).
+amx-bf16-dims := 2 3 4
+amx-bf16-ints :=
+amx-bf16-flts := 2
+amx-int8-dims := 2 3 4
+amx-int8-ints := 1
+amx-int8-flts :=
+
# Suppress building by default of the harness if the compiler can't deal
# with some of the extensions used. Don't alter the "run" target dependencies
# though, as this target needs to be specified manually, and things may work
@@ -170,6 +183,18 @@ endef
define opmask-defs
$(1)-opmask-cflags := $(foreach vec,$($(1)-opmask-vecs), "-D_$(vec) -m$(1) -Os -DSIZE=$(vec)")
endef
+# Flags shared by all AMX test blobs. $(1) is the matrix dimension; it is
+# passed on as N and selects the matching tpm-<dim>x<dim>.h via TPM_H.
+amx-cflags-common = $(CFLAGS_xeninclude) -Os -DN=$(1) -DTPM_H=tpm-$(1)x$(1).h
+# $(1) is a flavor from $(MATRIX). Makes $(1).h depend on the generated tpm
+# headers for all of the flavor's dimensions, and sets up (for x86_64 only)
+# one flags string per dimension for every entry of $(1)-flts, and two (the
+# signed "i" and the unsigned "u" variant) for every entry of $(1)-ints.
+define matrix-defs
+$(1).h: $(foreach dim,$($(1)-dims),tpm-$(dim)x$(dim).h)
+$(1)-cflags :=
+$(1)-cflags-x86_64 := \
+ $(foreach dim,$($(1)-dims), \
+ $(foreach flt,$($(1)-flts), \
+ "-D_$(dim)x$(dim) -DFLOAT_SIZE=$(flt) $(call amx-cflags-common,$(dim))") \
+ $(foreach int,$($(1)-ints), \
+ "-Di_$(dim)x$(dim) -DINT_SIZE=$(int) $(call amx-cflags-common,$(dim))" \
+ "-Du_$(dim)x$(dim) -DUINT_SIZE=$(int) $(call amx-cflags-common,$(dim))"))
+endef
$(foreach flavor,$(SIMD) $(FMA),$(eval $(call simd-defs,$(flavor))))
$(foreach flavor,$(SG),$(eval $(call simd-sg-defs,$(flavor))))
@@ -178,6 +203,7 @@ $(foreach flavor,$(CLMUL),$(eval $(call
$(foreach flavor,$(SHA),$(eval $(call simd-sha-defs,$(flavor))))
$(foreach flavor,$(GF),$(eval $(call simd-gf-defs,$(flavor))))
$(foreach flavor,$(OPMASK),$(eval $(call opmask-defs,$(flavor))))
+$(foreach flavor,$(MATRIX),$(eval $(call matrix-defs,$(flavor))))
first-string = $(shell for s in $(1); do echo "$$s"; break; done)
@@ -248,6 +274,9 @@ $(addsuffix .h,$(SIMD) $(FMA) $(SG) $(AE
xop.h avx512f.h: simd-fma.c
+# All matrix flavors are built from one and the same source file, matrix.c,
+# which each <flavor>.c is a symlink to.
+$(addsuffix .c,$(MATRIX)):
+ ln -sf matrix.c $@
+
endif # 32-bit override
$(TARGET): x86-emulate.o cpuid.o test_x86_emulator.o evex-disp8.o predicates.o wrappers.o
@@ -295,6 +324,12 @@ x86-emulate.o cpuid.o test_x86_emulator.
x86-emulate.o: x86_emulate/x86_emulate.c
x86-emulate.o: HOSTCFLAGS += -D__XEN_TOOLS__
+# Generate a transposition permutation matrix header by running mktpm. The
+# stem (e.g. "2x2") has its "x" replaced by $(space) to form the tool's
+# arguments (presumably $(space) is a single blank, defined elsewhere in the
+# build system - TODO confirm).
+tpm-%.h: mktpm Makefile
+ set -x; ./$< $(subst x,$(space),$*) >$@
+
+# The generator runs at build time and hence is built with the host compiler.
+mktpm: mktpm.c
+ $(HOSTCC) $(HOSTCFLAGS) -o $@ $<
+
# In order for our custom .type assembler directives to reliably land after
# gcc's, we need to keep it from re-ordering top-level constructs.
$(call cc-option-add,HOSTCFLAGS-toplevel,HOSTCC,-fno-toplevel-reorder)
--- /dev/null
+++ b/tools/tests/x86_emulator/matrix.c
@@ -0,0 +1,229 @@
+#include <stdbool.h>
+
+typedef unsigned int __attribute__((mode(QI))) uint8_t;
+typedef unsigned int __attribute__((mode(HI))) uint16_t;
+
+#define stringify_(x...) #x
+#define stringify(x...) stringify_(x)
+
+#include <xen/asm/x86-types.h>
+
+asm ( "\t.text\n" /* Entry point of the blob: simply branch to matrix_test(). */
+ "\t.globl _start\n"
+ "_start:\n"
+ "\tjmp matrix_test" );
+
+/*
+ * For the purposes here we consider the 32-bit elements to hold just a single
+ * value, with the other slots zero-filled. This way the 2- or 4-way dot
+ * products really end up as simple multiplications, allowing us to treat the
+ * underlying insns as simple matrix multiply-and-accumulate ones. With
+ * suitably in-range numbers, this also allows us to have the compiler deal
+ * with, in particular, the bf16 fields without it actually knowing of such a
+ * type.
+ *
+ * Notation in comments:
+ * I - identity matrix (all ones on the main diagonal)
+ * AI - all ones on the antidiagonal
+ */
+
+typedef union { /* One 32-bit tile element; see the notes in the comment above. */
+#ifdef FLOAT_SIZE
+# define MACC "tdpbf16ps" /* multiply-accumulate insn emitted by macc() */
+ float val; /* value to put into a source tile */
+ float res; /* value read back from a result tile */
+ struct {
+ unsigned int zero:16; /* slot expected to be zero in source elements */
+ unsigned int bf16:16; /* presumably the upper half of val, i.e. its bf16 form - TODO confirm bit-field order */
+ };
+#else
+# ifdef INT_SIZE
+# define SIGNED signed
+# define MACC "tdpbssd" /* signed variant of the multiply-accumulate insn */
+# else
+# define MACC "tdpbuud" /* unsigned variant of the multiply-accumulate insn */
+# define SIGNED unsigned
+# endif
+ SIGNED int res; /* full 32-bit value read back from a result tile */
+ struct {
+ SIGNED int val :8; /* the single 8-bit value put into a source element */
+ unsigned int zero:24; /* remaining slots, expected to be zero */
+ };
+#endif
+} elem_t;
+
+typedef elem_t tile_t[N][N];
+
+/* Make *cfg the active tile configuration by executing LDTILECFG on it. */
+static void ldtilecfg(const struct x86_tilecfg *cfg)
+{
+    asm volatile ( "ldtilecfg %[cfg]"
+                   :: [cfg] "m" (*cfg) );
+}
+
+#define load_diag(r, v) ({ /* Load tile r with v on the main diagonal, 0 elsewhere. */ \
+ struct { \
+ elem_t arr[2 * N - 1]; \
+ } in = { .arr[N - 1].val = (v) }; /* only the middle element is non-zero */ \
+ asm volatile ( "tileloadd -%c[scale](%[base],%[stride],%c[scale]), %%" #r \
+ :: [base] "r" (&in.arr[N]), /* with the -scale displacement row 0 starts at arr[N - 1] */ \
+ [stride] "r" (-1L), /* scaled by sizeof(elem_t): each row starts one element earlier */ \
+ [scale] "i" (sizeof(elem_t)), \
+ "m" (in) ); /* makes the asm's read of in visible to the compiler */ \
+})
+
+#define load_antidiag(r, v) ({ /* Load tile r with v on the antidiagonal, 0 elsewhere. */ \
+ struct { \
+ elem_t arr[2 * N - 1]; \
+ } in = { .arr[N - 1].val = (v) }; /* only the middle element is non-zero */ \
+ asm volatile ( "tileloadd (%[base],%[stride]), %%" #r \
+ :: [base] "r" (&in.arr), /* row 0 starts at arr[0] */ \
+ [stride] "r" (sizeof(elem_t)), /* each row starts one element later */ \
+ "m" (in) ); /* makes the asm's read of in visible to the compiler */ \
+})
+
+#define load_linear(r, t) ({ /* Load tile r from the contiguous memory at t. */ \
+ (void)((t) == (const tile_t *)0); /* type check of t only, no runtime effect intended */ \
+ asm volatile ( "tileloadd (%[base]), %%" #r \
+ :: [base] "r" (t), /* no stride register; the only use in this file targets a single-row tile */ \
+ "m" (*(t)) ); /* makes the asm's read of *t visible to the compiler */ \
+})
+
+static const elem_t tpm[N * N][N * N] = { /* transposition permutation matrix; rows come from the generated TPM_H */
+#include stringify(TPM_H)
+};
+
+#define load_tpm(r) \
+ asm volatile ( "tileloadd (%[base],%[stride],%c[scale]), %%" #r \
+ :: [base] "r" (&tpm), \
+ [stride] "r" (N * N * 1L), \
+ [scale] "i" (sizeof(elem_t)), \
+ "m" (tpm) ); \
+
+#define store(t, r) ({ /* Store tile r to *t, using a row stride of N elements. */ \
+ (void)((t) == (tile_t *)0); /* type check of t only, no runtime effect intended */ \
+ asm volatile ( "tilestored %%" #r ", (%[base],%[stride],%c[scale])" \
+ /* "+m" to keep the compiler from eliminating fill(). */ \
+ : "+m" (*(t)) \
+ : [base] "r" (t), \
+ [stride] "r" (N * 1L), /* in elements; scaled by sizeof(elem_t) in the memory operand */ \
+ [scale] "i" (sizeof(elem_t)) ); \
+})
+
+#define macc(srcdst, src1, src2) /* srcdst += src1 * src2, all operands being tile register names */ \
+ asm volatile ( MACC " %" #src2 ", %" #src1 ", %" #srcdst ) /* operand-less asm(), hence a single % per register */
+
+#define mul(dst, src1, src2) ({ /* dst = src1 * src2 */ \
+ asm volatile ( "tilezero %" #dst ); /* clear the accumulator first */ \
+ macc(dst, src1, src2); \
+})
+
+#define add(dst, src1, src2, scratch) ({ /* dst = src1 + src2; scratch gets overwritten */ \
+ load_diag(scratch, 1); /* scratch = I */ \
+ mul(dst, src1, scratch); /* dst = src1 * I */ \
+ macc(dst, scratch, src2); /* dst += I * src2 */ \
+})
+
+/*
+ * Set all N * N 32-bit elements of *t to all ones, so that data stored there
+ * subsequently is distinguishable from what was there before.
+ */
+static inline void fill(tile_t *t)
+{
+    unsigned int count = N * N;
+
+    asm ( "repe stosl"
+          : "+c" (count), "+D" (t), "=m" (*t)
+          : "a" (~0) );
+}
+
+/*
+ * Scan the N * N 32-bit elements of *t, comparing each against zero.
+ * Returns the state of the zero flag after the scan, i.e. true when no
+ * non-zero element was encountered.
+ */
+static inline bool zero(const tile_t *t)
+{
+    unsigned int count = N * N;
+    bool all_zero;
+
+    asm ( "repe scasl"
+          : "=@ccz" (all_zero), "+c" (count), "+D" (t)
+          : "a" (0), "m" (*t) );
+
+    return all_zero;
+}
+
+#define C(cols) ((cols) * sizeof(elem_t)) /* column count -> bytes per row, for cfg.colsb[] */
+#define R(rows) (rows) /* row count, for cfg.rows[] */
+
+/*
+ * Test entry point, branched to from _start.
+ *
+ * Returns 0 if all checks pass, else the source line number of the first
+ * failing check.
+ *
+ * Tile usage (see the initializer of cfg): tmm0...tmm3 are N x N, tmm4 is left
+ * unconfigured, tmm5 and tmm6 are single rows of N * N elements, and tmm7 is
+ * N * N by N * N.
+ */
+int matrix_test(void)
+{
+    struct x86_tilecfg cfg = {
+        .palette = 1,
+        .colsb = { C(N), C(N), C(N), C(N), 0, C(N * N), C(N * N), C(N * N) },
+        .rows  = { R(N), R(N), R(N), R(N), 0, R(1),     R(1),     R(N * N) },
+    };
+    tile_t x;
+    unsigned int i, j;
+
+    ldtilecfg(&cfg);
+
+    /* A freshly configured tile is expected to read back as all zeros. */
+    fill(&x);
+    store(&x, tmm0);
+    if ( !zero(&x) ) return __LINE__;
+
+    /* Load and store I. */
+    fill(&x);
+    load_diag(tmm0, 1);
+    store(&x, tmm0);
+    for ( i = 0; i < N; ++i )
+        for ( j = 0; j < N; ++j )
+            if ( x[i][j].res != (i == j) )
+                return __LINE__;
+
+    /* I + AI */
+    fill(&x);
+    load_antidiag(tmm1, 1);
+    add(tmm2, tmm0, tmm1, tmm3);
+    store(&x, tmm2);
+    for ( i = 0; i < N; ++i )
+        for ( j = 0; j < N; ++j )
+            if ( i == j && i + j == N - 1 )
+            {
+                /* Center element of an odd-sized matrix: on both diagonals. */
+                if ( x[i][j].res != 2 )
+                    return __LINE__;
+            }
+            else if ( i == j || i + j == N - 1 )
+            {
+                if ( x[i][j].res != 1 )
+                    return __LINE__;
+            }
+            else if ( x[i][j].res )
+                return __LINE__;
+
+#ifndef UINT_SIZE
+    /* I + AI * -AI == 0 */
+    fill(&x);
+    load_antidiag(tmm2, -1);
+    macc(tmm0, tmm1, tmm2);
+    store(&x, tmm0);
+    if ( !zero(&x) ) return __LINE__;
+#endif
+
+    /*
+     * Transpose a matrix via linearization and multiplication by the
+     * respective transposition permutation matrix. Note that linearization
+     * merely requires a different tile layout (see the initializer of cfg
+     * above).
+     *
+     * The loop indexes are unsigned, so the signed flavor of VAL() needs to
+     * convert to int before negating: otherwise the "negative" values would
+     * wrap to huge positive ones, which for the floating point variant would
+     * all end up as the same value (2^32) instead of -1, -2, etc.
+     */
+#ifdef UINT_SIZE
+# define VAL(r, c) ((c) < (r) ? (c) : (r) + (c))
+#else
+# define VAL(r, c) ((c) < (r) ? -(int)(r) : (int)((r) + (c)))
+#endif
+    for ( i = 0; i < N; ++i )
+        for ( j = 0; j < N; ++j )
+            x[i][j].val = VAL(i, j);
+    load_linear(tmm6, &x);
+    load_tpm(tmm7);
+    mul(tmm5, tmm6, tmm7);
+    /* There's just a single row, so re-use plain store() here. */
+    store(&x, tmm5);
+    for ( i = 0; i < N; ++i )
+        for ( j = 0; j < N; ++j )
+            if ( x[i][j].res != VAL(j, i) )
+                return __LINE__;
+#undef VAL
+
+    return 0;
+}
--- /dev/null
+++ b/tools/tests/x86_emulator/mktpm.c
@@ -0,0 +1,41 @@
+/* make Transposition Permutation Matrix */
+
+#include <stdio.h>
+#include <stdlib.h>
+
+/*
+ * Print one matrix row as a C initializer: cols brace-enclosed elements, all
+ * of them 0 except for the one at index "one", which is 1.
+ */
+static void line(unsigned one, unsigned cols)
+{
+    unsigned col = 0;
+
+    printf(" { ");
+    while ( col < cols - 1 )
+    {
+        printf("{ %d }, ", col == one);
+        ++col;
+    }
+    /* The last element additionally closes the row. */
+    printf("{ %d } },\n", col == one);
+}
+
+/*
+ * Parse one dimension argument. Returns 0 and stores the value through *val
+ * on success, or -1 if str isn't entirely a non-zero number fitting an
+ * unsigned int.
+ */
+static int parse_dim(const char *str, unsigned *val)
+{
+    char *end;
+    unsigned long v = strtoul(str, &end, 0);
+
+    if ( end == str || *end || !v || v > ~0u )
+        return -1;
+
+    *val = v;
+
+    return 0;
+}
+
+/*
+ * Emit, on stdout, the rows of the permutation matrix which transposes a
+ * linearized <rows> x <cols> matrix (<cols> defaulting to <rows>).
+ *
+ * Exit status is 0 on success (and when invoked without any argument, in
+ * which case just the usage message gets printed), non-zero on bad usage or
+ * bad arguments. Failing on bad arguments keeps the build from silently
+ * continuing with an empty or bogus header.
+ */
+int main(int argc, char*argv[])
+{
+    unsigned i, j, m, n;
+
+    switch ( argc )
+    {
+    default:
+        fprintf(stderr, "Usage: %s <rows> [<cols>]\n", argv[0]);
+        return argc != 1;
+
+    case 3:
+        if ( parse_dim(argv[2], &n) )
+        {
+            fprintf(stderr, "%s: invalid column count '%s'\n",
+                    argv[0], argv[2]);
+            return 1;
+        }
+        /* fall-through */
+    case 2:
+        if ( parse_dim(argv[1], &m) )
+        {
+            fprintf(stderr, "%s: invalid row count '%s'\n",
+                    argv[0], argv[1]);
+            return 1;
+        }
+        if ( argc == 2 )
+            n = m;
+        break;
+    }
+
+    /* m * n is used as loop bound and row width below; it must not wrap. */
+    if ( m > ~0u / n )
+    {
+        fprintf(stderr, "%s: %ux%u matrix is too large\n", argv[0], m, n);
+        return 1;
+    }
+
+    /*
+     * Output row k * n + l (k < m, l < n) gets its single 1 in column
+     * l * m + k: the outer loop advances k, the inner one l.
+     */
+    for ( i = 0; i < m * n; )
+        for ( j = i / n; j < m * n; j += m, ++i )
+            line(j, m * n);
+
+    return 0;
+}
--- a/tools/tests/x86_emulator/test_x86_emulator.c
+++ b/tools/tests/x86_emulator/test_x86_emulator.c
@@ -44,6 +44,11 @@ asm ( ".pushsection .test, \"ax\", @prog
#include "avx512vbmi.h"
#include "avx512vbmi2-vpclmulqdq.h"
+#ifdef __x86_64__
+#include "amx-bf16.h"
+#include "amx-int8.h"
+#endif
+
#define verbose false /* Switch to true for far more logging. */
static void blowfish_set_regs(struct cpu_user_regs *regs)
@@ -263,6 +268,33 @@ static bool simd_check_regs(const struct
return false;
}
+#ifdef __x86_64__
+
+static bool amx_check_bf16(void)
+{
+ return cp.feat.amx_bf16; /* .check_cpu hook of the AMX-BF16 entries: report the amx_bf16 bit of cp (defined outside this hunk) */
+}
+
+static bool amx_check_int8(void)
+{
+ return cp.feat.amx_int8; /* .check_cpu hook of the AMX-INT8 entries: report the amx_int8 bit of cp (defined outside this hunk) */
+}
+
+static void amx_set_regs(struct cpu_user_regs *regs)
+{ /* Intentionally empty: used as the .set_regs hook of the AMX entries, which need no register setup here. */
+}
+
+/*
+ * Result check for the AMX test blobs: releases the tile state, then treats
+ * a zero in regs->eax as success. Otherwise the value is printed as the line
+ * number of the failing check and false is returned.
+ */
+static bool amx_check_regs(const struct cpu_user_regs *regs)
+{
+    /* tilerelease */
+    asm volatile ( ".byte 0xc4, 0xe2, 0x78, 0x49, 0xc0" );
+
+    if ( regs->eax )
+    {
+        printf("[line %u] ", (unsigned int)regs->eax);
+        return false;
+    }
+
+    return true;
+}
+
+#endif
+
static const struct {
const void *code;
size_t size;
@@ -534,6 +566,25 @@ static const struct {
#undef AVX512VL
#undef SIMD_
#undef SIMD
+#ifdef __x86_64__
+# define AMX(desc, feat, t, dim) \
+ { .code = amx_ ## feat ## _x86_64_D ## t ## _ ## dim ## x ## dim, \
+ .size = sizeof(amx_ ## feat ## _x86_64_D ## t ## _ ## dim ## x ## dim), \
+ .bitness = 64, .name = "AMX-" #desc " (" #t #dim "x" #dim ")", \
+ .check_cpu = amx_check_ ## feat, \
+ .set_regs = amx_set_regs, \
+ .check_regs = amx_check_regs }
+ AMX(BF16, bf16, , 2),
+ AMX(BF16, bf16, , 3),
+ AMX(BF16, bf16, , 4),
+ AMX(INT8, int8, i, 2),
+ AMX(INT8, int8, i, 3),
+ AMX(INT8, int8, i, 4),
+ AMX(INT8, int8, u, 2),
+ AMX(INT8, int8, u, 3),
+ AMX(INT8, int8, u, 4),
+# undef AMX
+#endif
};
static unsigned int bytes_read;
next prev parent reply other threads:[~2021-04-22 14:57 UTC|newest]
Thread overview: 40+ messages / expand[flat|nested] mbox.gz Atom feed top
2021-04-22 14:38 [PATCH v3 00/22] xvmalloc() / x86 xstate area / x86 CPUID / AMX+XFD Jan Beulich
2021-04-22 14:43 ` [PATCH v3 01/22] mm: introduce xvmalloc() et al and use for grant table allocations Jan Beulich
2021-05-03 11:31 ` Roger Pau Monné
2021-05-03 13:50 ` Jan Beulich
2021-05-03 14:54 ` Roger Pau Monné
2021-05-03 15:21 ` Jan Beulich
2021-05-03 16:39 ` Roger Pau Monné
2021-04-22 14:44 ` [PATCH v3 02/22] x86/xstate: use xvzalloc() for save area allocation Jan Beulich
2021-05-05 13:29 ` Roger Pau Monné
2021-04-22 14:44 ` [PATCH v3 03/22] x86/xstate: re-size save area when CPUID policy changes Jan Beulich
2021-05-03 13:57 ` Andrew Cooper
2021-05-03 14:22 ` Jan Beulich
2021-05-11 16:41 ` Andrew Cooper
2021-05-17 7:33 ` Jan Beulich
2021-04-22 14:45 ` [PATCH v3 04/22] x86/xstate: re-use valid_xcr0() for boot-time checks Jan Beulich
2021-05-03 11:53 ` Andrew Cooper
2021-04-22 14:45 ` [PATCH v3 05/22] x86/xstate: drop xstate_offsets[] and xstate_sizes[] Jan Beulich
2021-05-03 16:10 ` Andrew Cooper
2021-05-04 7:57 ` Jan Beulich
2021-04-22 14:46 ` [PATCH v3 06/22] x86/xstate: replace xsave_cntxt_size and drop XCNTXT_MASK Jan Beulich
2021-04-22 14:47 ` [PATCH v3 07/22] x86/xstate: avoid accounting for unsupported components Jan Beulich
2021-04-22 14:47 ` [PATCH v3 08/22] x86: use xvmalloc() for extended context buffer allocations Jan Beulich
2021-04-22 14:48 ` [PATCH v3 09/22] x86/xstate: enable AMX components Jan Beulich
2021-04-22 14:50 ` [PATCH v3 10/22] x86/CPUID: adjust extended leaves out of range clearing Jan Beulich
2021-04-22 14:50 ` [PATCH v3 11/22] x86/CPUID: move bounding of max_{,sub}leaf fields to library code Jan Beulich
2021-04-22 14:51 ` [PATCH v3 12/22] x86/CPUID: enable AMX leaves Jan Beulich
2021-04-22 14:52 ` [PATCH v3 13/22] x86: XFD enabling Jan Beulich
2021-04-22 14:53 ` [PATCH v3 14/22] x86emul: introduce X86EMUL_FPU_{tilecfg,tile} Jan Beulich
2021-04-22 14:53 ` [PATCH v3 15/22] x86emul: support TILERELEASE Jan Beulich
2021-04-22 14:53 ` [PATCH v3 16/22] x86: introduce struct for TILECFG register Jan Beulich
2021-04-22 14:54 ` [PATCH v3 17/22] x86emul: support {LD,ST}TILECFG Jan Beulich
2021-04-22 14:55 ` [PATCH v3 18/22] x86emul: support TILEZERO Jan Beulich
2021-04-22 14:55 ` [PATCH v3 19/22] x86emul: support TILELOADD{,T1} and TILESTORE Jan Beulich
2021-04-22 15:06 ` Jan Beulich
2021-04-22 15:11 ` Jan Beulich
2021-04-26 7:12 ` Paul Durrant
2021-04-29 9:40 ` Jan Beulich
2021-04-22 14:56 ` [PATCH v3 20/22] x86emul: support tile multiplication insns Jan Beulich
2021-04-22 14:57 ` Jan Beulich [this message]
2021-04-22 14:57 ` [PATCH v3 22/22] x86: permit guests to use AMX and XFD Jan Beulich
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=19925d83-ae92-cdda-f5be-193cfb05c775@suse.com \
--to=jbeulich@suse.com \
--cc=andrew.cooper3@citrix.com \
--cc=george.dunlap@citrix.com \
--cc=roger.pau@citrix.com \
--cc=wl@xen.org \
--cc=xen-devel@lists.xenproject.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).