All of lore.kernel.org
 help / color / mirror / Atom feed
From: Jan Beulich <jbeulich@suse.com>
To: "xen-devel@lists.xenproject.org" <xen-devel@lists.xenproject.org>
Cc: "Andrew Cooper" <andrew.cooper3@citrix.com>,
	"George Dunlap" <george.dunlap@citrix.com>,
	"Wei Liu" <wl@xen.org>, "Roger Pau Monné" <roger.pau@citrix.com>
Subject: [PATCH v3 21/22] x86emul: test AMX insns
Date: Thu, 22 Apr 2021 16:57:04 +0200	[thread overview]
Message-ID: <19925d83-ae92-cdda-f5be-193cfb05c775@suse.com> (raw)
In-Reply-To: <322de6db-e01f-0b57-5777-5d94a13c441a@suse.com>

Carry out some basic matrix operations on 2x2, 3x3, and 4x4 matrixes.

To also have a use of a non-square matrix, also transpose ones of said
square formats via linearization and multiplication by the respective
transposition permutation matrix. To generate the latter, introduce a
small helper tool. This is mainly to avoid creating / populating a
rather large matrix (up to 16x16) in a stack variable.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v3: New.

--- a/tools/tests/x86_emulator/Makefile
+++ b/tools/tests/x86_emulator/Makefile
@@ -25,6 +25,12 @@ SHA := sse4-sha avx-sha avx512f-sha
 GF := sse2-gf avx2-gf avx512bw-gf
 TESTCASES := blowfish $(SIMD) $(FMA) $(SG) $(AES) $(CLMUL) $(SHA) $(GF)
 
+MATRIX := amx-bf16 amx-int8
+
+ifeq ($(XEN_COMPILE_ARCH),x86_64)
+TESTCASES += $(MATRIX)
+endif
+
 OPMASK := avx512f avx512dq avx512bw
 
 ifeq ($(origin XEN_COMPILE_ARCH),override)
@@ -96,6 +102,13 @@ avx512f-opmask-vecs := 2
 avx512dq-opmask-vecs := 1 2
 avx512bw-opmask-vecs := 4 8
 
+amx-bf16-dims := 2 3 4
+amx-bf16-ints :=
+amx-bf16-flts := 2
+amx-int8-dims := 2 3 4
+amx-int8-ints := 1
+amx-int8-flts :=
+
 # Suppress building by default of the harness if the compiler can't deal
 # with some of the extensions used.  Don't alter the "run" target dependencies
 # though, as this target needs to be specified manually, and things may work
@@ -170,6 +183,18 @@ endef
 define opmask-defs
 $(1)-opmask-cflags := $(foreach vec,$($(1)-opmask-vecs), "-D_$(vec) -m$(1) -Os -DSIZE=$(vec)")
 endef
+amx-cflags-common = $(CFLAGS_xeninclude) -Os -DN=$(1) -DTPM_H=tpm-$(1)x$(1).h
+define matrix-defs
+$(1).h: $(foreach dim,$($(1)-dims),tpm-$(dim)x$(dim).h)
+$(1)-cflags :=
+$(1)-cflags-x86_64 := \
+	$(foreach dim,$($(1)-dims), \
+	  $(foreach flt,$($(1)-flts), \
+	    "-D_$(dim)x$(dim) -DFLOAT_SIZE=$(flt) $(call amx-cflags-common,$(dim))") \
+	  $(foreach int,$($(1)-ints), \
+	    "-Di_$(dim)x$(dim) -DINT_SIZE=$(int) $(call amx-cflags-common,$(dim))" \
+	    "-Du_$(dim)x$(dim) -DUINT_SIZE=$(int) $(call amx-cflags-common,$(dim))"))
+endef
 
 $(foreach flavor,$(SIMD) $(FMA),$(eval $(call simd-defs,$(flavor))))
 $(foreach flavor,$(SG),$(eval $(call simd-sg-defs,$(flavor))))
@@ -178,6 +203,7 @@ $(foreach flavor,$(CLMUL),$(eval $(call
 $(foreach flavor,$(SHA),$(eval $(call simd-sha-defs,$(flavor))))
 $(foreach flavor,$(GF),$(eval $(call simd-gf-defs,$(flavor))))
 $(foreach flavor,$(OPMASK),$(eval $(call opmask-defs,$(flavor))))
+$(foreach flavor,$(MATRIX),$(eval $(call matrix-defs,$(flavor))))
 
 first-string = $(shell for s in $(1); do echo "$$s"; break; done)
 
@@ -248,6 +274,9 @@ $(addsuffix .h,$(SIMD) $(FMA) $(SG) $(AE
 
 xop.h avx512f.h: simd-fma.c
 
+$(addsuffix .c,$(MATRIX)):
+	ln -sf matrix.c $@
+
 endif # 32-bit override
 
 $(TARGET): x86-emulate.o cpuid.o test_x86_emulator.o evex-disp8.o predicates.o wrappers.o
@@ -295,6 +324,12 @@ x86-emulate.o cpuid.o test_x86_emulator.
 x86-emulate.o: x86_emulate/x86_emulate.c
 x86-emulate.o: HOSTCFLAGS += -D__XEN_TOOLS__
 
+tpm-%.h: mktpm Makefile
+	set -x; ./$< $(subst x,$(space),$*) >$@
+
+mktpm: mktpm.c
+	$(HOSTCC) $(HOSTCFLAGS) -o $@ $<
+
 # In order for our custom .type assembler directives to reliably land after
 # gcc's, we need to keep it from re-ordering top-level constructs.
 $(call cc-option-add,HOSTCFLAGS-toplevel,HOSTCC,-fno-toplevel-reorder)
--- /dev/null
+++ b/tools/tests/x86_emulator/matrix.c
@@ -0,0 +1,229 @@
+#include <stdbool.h>
+
+typedef unsigned int __attribute__((mode(QI))) uint8_t;
+typedef unsigned int __attribute__((mode(HI))) uint16_t;
+
+#define stringify_(x...) #x
+#define stringify(x...)  stringify_(x)
+
+#include <xen/asm/x86-types.h>
+
+asm ( "\t.text\n"
+      "\t.globl _start\n"
+      "_start:\n"
+      "\tjmp matrix_test" );
+
+/*
+ * For the purposes here we consider the 32-bit elements to hold just a single
+ * value, with the other slots zero-filled. This way the 2- or 4-way dot
+ * products really end up as simple multiplications, allowing us to treat the
+ * underlying insns as simple matrix multiply-and-accumulate ones. With
+ * suitably in-range numbers, this also allows us to have the compiler deal
+ * with, in particular, the bf16 fields without it actually knowing of such a
+ * type.
+ *
+ * Notation in comments:
+ * I  - identity matrix (all ones on the main diagonal)
+ * AI - all ones on the antidiagonal
+ */
+
+typedef union {
+#ifdef FLOAT_SIZE
+# define MACC "tdpbf16ps"
+    float val;
+    float res;
+    struct {
+        unsigned int zero:16;
+        unsigned int bf16:16;
+    };
+#else
+# ifdef INT_SIZE
+#  define SIGNED signed
+#  define MACC "tdpbssd"
+# else
+#  define MACC "tdpbuud"
+#  define SIGNED unsigned
+# endif
+    SIGNED int res;
+    struct {
+        SIGNED   int val :8;
+        unsigned int zero:24;
+    };
+#endif
+} elem_t;
+
+typedef elem_t tile_t[N][N];
+
+static void ldtilecfg(const struct x86_tilecfg *cfg)
+{
+    asm volatile ( "ldtilecfg %0" :: "m" (*cfg) );
+}
+
+#define load_diag(r, v) ({ \
+    struct { \
+        elem_t arr[2 * N - 1]; \
+    } in = { .arr[N - 1].val = (v) }; \
+    asm volatile ( "tileloadd -%c[scale](%[base],%[stride],%c[scale]), %%" #r \
+                   :: [base] "r" (&in.arr[N]), \
+                      [stride] "r" (-1L), \
+                      [scale] "i" (sizeof(elem_t)), \
+                      "m" (in) ); \
+})
+
+#define load_antidiag(r, v) ({ \
+    struct { \
+        elem_t arr[2 * N - 1]; \
+    } in = { .arr[N - 1].val = (v) }; \
+    asm volatile ( "tileloadd (%[base],%[stride]), %%" #r \
+                   :: [base] "r" (&in.arr), \
+                      [stride] "r" (sizeof(elem_t)), \
+                      "m" (in) ); \
+})
+
+#define load_linear(r, t) ({ \
+    (void)((t) == (const tile_t *)0); \
+    asm volatile ( "tileloadd (%[base]), %%" #r \
+                   :: [base] "r" (t), \
+                      "m" (*(t)) ); \
+})
+
+static const elem_t tpm[N * N][N * N] = {
+#include stringify(TPM_H)
+};
+
+#define load_tpm(r) \
+    asm volatile ( "tileloadd (%[base],%[stride],%c[scale]), %%" #r \
+                   :: [base] "r" (&tpm), \
+                      [stride] "r" (N * N * 1L), \
+                      [scale] "i" (sizeof(elem_t)), \
+                      "m" (tpm) ); \
+
+#define store(t, r) ({ \
+    (void)((t) == (tile_t *)0); \
+    asm volatile ( "tilestored %%" #r ", (%[base],%[stride],%c[scale])" \
+                   /* "+m" to keep the compiler from eliminating fill(). */ \
+                   : "+m" (*(t)) \
+                   : [base] "r" (t), \
+                     [stride] "r" (N * 1L), \
+                     [scale] "i" (sizeof(elem_t)) ); \
+})
+
+#define macc(srcdst, src1, src2) \
+    asm volatile ( MACC " %" #src2 ", %" #src1 ", %" #srcdst )
+
+#define mul(dst, src1, src2) ({ \
+    asm volatile ( "tilezero %" #dst ); \
+    macc(dst, src1, src2); \
+})
+
+#define add(dst, src1, src2, scratch) ({ \
+    load_diag(scratch, 1); \
+    mul(dst, src1, scratch); \
+    macc(dst, scratch, src2); \
+})
+
+static inline void fill(tile_t *t)
+{
+    unsigned int cnt = N * N;
+
+    asm ( "repe stosl"
+          : "=m" (*t), "+D" (t), "+c" (cnt)
+          : "a" (~0) );
+}
+
+static inline bool zero(const tile_t *t)
+{
+    unsigned int cnt = N * N;
+    bool zf;
+
+    asm ( "repe scasl"
+          : "=@ccz" (zf), "+D" (t), "+c" (cnt)
+          : "m" (*t), "a" (0) );
+
+    return zf;
+}
+
+#define C(cols) ((cols) * sizeof(elem_t))
+#define R(rows) (rows)
+
+int matrix_test(void)
+{
+    struct x86_tilecfg cfg = {
+        .palette = 1,
+        .colsb   = { C(N), C(N), C(N), C(N), 0, C(N * N), C(N * N), C(N * N) },
+        .rows    = { R(N), R(N), R(N), R(N), 0, R(1),     R(1),     R(N * N) },
+    };
+    tile_t x;
+    unsigned int i, j;
+
+    ldtilecfg(&cfg);
+
+    fill(&x);
+    store(&x, tmm0);
+    if ( !zero(&x) ) return __LINE__;
+
+    /* Load and store I. */
+    fill(&x);
+    load_diag(tmm0, 1);
+    store(&x, tmm0);
+    for ( i = 0; i < N; ++i )
+        for ( j = 0; j < N; ++j )
+            if ( x[i][j].res != (i == j) )
+                return __LINE__;
+
+    /* I + AI */
+    fill(&x);
+    load_antidiag(tmm1, 1);
+    add(tmm2, tmm0, tmm1, tmm3);
+    store(&x, tmm2);
+    for ( i = 0; i < N; ++i )
+        for ( j = 0; j < N; ++j )
+            if ( i == j && i + j == N - 1 )
+            {
+                if ( x[i][j].res != 2 )
+                    return __LINE__;
+            }
+            else if ( i == j || i + j == N - 1 )
+            {
+                if ( x[i][j].res != 1 )
+                    return __LINE__;
+            }
+            else if ( x[i][j].res )
+                return __LINE__;
+
+#ifndef UINT_SIZE
+    /* I + AI * -AI == 0 */
+    fill(&x);
+    load_antidiag(tmm2, -1);
+    macc(tmm0, tmm1, tmm2);
+    store(&x, tmm0);
+    if ( !zero(&x) ) return __LINE__;
+#endif
+
+    /*
+     * Transpose a matrix via linearization and multiplication by the
+     * respective transpostion permutation matrix. Note that linearization
+     * merely requires a different tile layout (see the initializer of cfg
+     * above).
+     */
+#ifdef UINT_SIZE
+# define VAL(r, c) ((c) < (r) ? (c) : (r) + (c) )
+#else
+# define VAL(r, c) ((c) < (r) ? -(r) : (r) + (c) )
+#endif
+    for ( i = 0; i < N; ++i )
+        for (j = 0; j < N; ++j )
+            x[i][j].val = VAL(i, j);
+    load_linear(tmm6, &x);
+    load_tpm(tmm7);
+    mul(tmm5, tmm6, tmm7);
+    /* There's just a single row, so re-use plain store() here. */
+    store(&x, tmm5);
+    for ( i = 0; i < N; ++i )
+        for (j = 0; j < N; ++j )
+            if ( x[i][j].res != VAL(j, i) )
+                return __LINE__;
+#undef VAL
+
+    return 0;
+}
--- /dev/null
+++ b/tools/tests/x86_emulator/mktpm.c
@@ -0,0 +1,41 @@
+/* make Transposition Permutation Matrix */
+
+#include <stdio.h>
+#include <stdlib.h>
+
+static void line(unsigned one, unsigned cols)
+{
+    unsigned i;
+
+    printf("    { ");
+    for ( i = 0; i < cols - 1; ++i )
+        printf("{ %d }, ", i == one);
+    printf("{ %d } },\n", i == one);
+}
+
+int main(int argc, char*argv[])
+{
+    unsigned i, j, m, n;
+
+    switch ( argc )
+    {
+    default:
+        fprintf(stderr, "Usage: %s <rows> [<cols>]\n", argv[0]);
+        return argc != 1;
+
+    case 3:
+        n = strtoul(argv[2], NULL, 0);
+        /* fall-through */
+    case 2:
+        m = strtoul(argv[1], NULL, 0);
+        if ( argc == 2 )
+            n = m;
+        break;
+    }
+
+    for ( i = 0; i < m * n; )
+        for ( j = i / n; j < m * n; j += m, ++i )
+            line(j, m * n);
+
+    return 0;
+}
--- a/tools/tests/x86_emulator/test_x86_emulator.c
+++ b/tools/tests/x86_emulator/test_x86_emulator.c
@@ -44,6 +44,11 @@ asm ( ".pushsection .test, \"ax\", @prog
 #include "avx512vbmi.h"
 #include "avx512vbmi2-vpclmulqdq.h"
 
+#ifdef __x86_64__
+#include "amx-bf16.h"
+#include "amx-int8.h"
+#endif
+
 #define verbose false /* Switch to true for far more logging. */
 
 static void blowfish_set_regs(struct cpu_user_regs *regs)
@@ -263,6 +268,33 @@ static bool simd_check_regs(const struct
     return false;
 }
 
+#ifdef __x86_64__
+
+static bool amx_check_bf16(void)
+{
+    return cp.feat.amx_bf16;
+}
+
+static bool amx_check_int8(void)
+{
+    return cp.feat.amx_int8;
+}
+
+static void amx_set_regs(struct cpu_user_regs *regs)
+{
+}
+
+static bool amx_check_regs(const struct cpu_user_regs *regs)
+{
+    asm volatile ( ".byte 0xc4, 0xe2, 0x78, 0x49, 0xc0" ); /* tilerelease */
+    if ( !regs->eax )
+        return true;
+    printf("[line %u] ", (unsigned int)regs->eax);
+    return false;
+}
+
+#endif
+
 static const struct {
     const void *code;
     size_t size;
@@ -534,6 +566,25 @@ static const struct {
 #undef AVX512VL
 #undef SIMD_
 #undef SIMD
+#ifdef __x86_64__
+# define AMX(desc, feat, t, dim)                                              \
+    { .code = amx_ ## feat ## _x86_64_D ## t ## _ ## dim ## x ## dim,         \
+      .size = sizeof(amx_ ## feat ## _x86_64_D ## t ## _ ## dim ## x ## dim), \
+      .bitness = 64, .name = "AMX-" #desc " (" #t #dim "x" #dim ")",          \
+      .check_cpu = amx_check_ ## feat,                                        \
+      .set_regs = amx_set_regs,                                               \
+      .check_regs = amx_check_regs }
+    AMX(BF16, bf16, , 2),
+    AMX(BF16, bf16, , 3),
+    AMX(BF16, bf16, , 4),
+    AMX(INT8, int8, i, 2),
+    AMX(INT8, int8, i, 3),
+    AMX(INT8, int8, i, 4),
+    AMX(INT8, int8, u, 2),
+    AMX(INT8, int8, u, 3),
+    AMX(INT8, int8, u, 4),
+# undef AMX
+#endif
 };
 
 static unsigned int bytes_read;



  parent reply	other threads:[~2021-04-22 14:57 UTC|newest]

Thread overview: 40+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2021-04-22 14:38 [PATCH v3 00/22] xvmalloc() / x86 xstate area / x86 CPUID / AMX+XFD Jan Beulich
2021-04-22 14:43 ` [PATCH v3 01/22] mm: introduce xvmalloc() et al and use for grant table allocations Jan Beulich
2021-05-03 11:31   ` Roger Pau Monné
2021-05-03 13:50     ` Jan Beulich
2021-05-03 14:54       ` Roger Pau Monné
2021-05-03 15:21         ` Jan Beulich
2021-05-03 16:39           ` Roger Pau Monné
2021-04-22 14:44 ` [PATCH v3 02/22] x86/xstate: use xvzalloc() for save area allocation Jan Beulich
2021-05-05 13:29   ` Roger Pau Monné
2021-04-22 14:44 ` [PATCH v3 03/22] x86/xstate: re-size save area when CPUID policy changes Jan Beulich
2021-05-03 13:57   ` Andrew Cooper
2021-05-03 14:22     ` Jan Beulich
2021-05-11 16:41       ` Andrew Cooper
2021-05-17  7:33         ` Jan Beulich
2021-04-22 14:45 ` [PATCH v3 04/22] x86/xstate: re-use valid_xcr0() for boot-time checks Jan Beulich
2021-05-03 11:53   ` Andrew Cooper
2021-04-22 14:45 ` [PATCH v3 05/22] x86/xstate: drop xstate_offsets[] and xstate_sizes[] Jan Beulich
2021-05-03 16:10   ` Andrew Cooper
2021-05-04  7:57     ` Jan Beulich
2021-04-22 14:46 ` [PATCH v3 06/22] x86/xstate: replace xsave_cntxt_size and drop XCNTXT_MASK Jan Beulich
2021-04-22 14:47 ` [PATCH v3 07/22] x86/xstate: avoid accounting for unsupported components Jan Beulich
2021-04-22 14:47 ` [PATCH v3 08/22] x86: use xvmalloc() for extended context buffer allocations Jan Beulich
2021-04-22 14:48 ` [PATCH v3 09/22] x86/xstate: enable AMX components Jan Beulich
2021-04-22 14:50 ` [PATCH v3 10/22] x86/CPUID: adjust extended leaves out of range clearing Jan Beulich
2021-04-22 14:50 ` [PATCH v3 11/22] x86/CPUID: move bounding of max_{,sub}leaf fields to library code Jan Beulich
2021-04-22 14:51 ` [PATCH v3 12/22] x86/CPUID: enable AMX leaves Jan Beulich
2021-04-22 14:52 ` [PATCH v3 13/22] x86: XFD enabling Jan Beulich
2021-04-22 14:53 ` [PATCH v3 14/22] x86emul: introduce X86EMUL_FPU_{tilecfg,tile} Jan Beulich
2021-04-22 14:53 ` [PATCH v3 15/22] x86emul: support TILERELEASE Jan Beulich
2021-04-22 14:53 ` [PATCH v3 16/22] x86: introduce struct for TILECFG register Jan Beulich
2021-04-22 14:54 ` [PATCH v3 17/22] x86emul: support {LD,ST}TILECFG Jan Beulich
2021-04-22 14:55 ` [PATCH v3 18/22] x86emul: support TILEZERO Jan Beulich
2021-04-22 14:55 ` [PATCH v3 19/22] x86emul: support TILELOADD{,T1} and TILESTORE Jan Beulich
2021-04-22 15:06   ` Jan Beulich
2021-04-22 15:11     ` Jan Beulich
2021-04-26  7:12       ` Paul Durrant
2021-04-29  9:40         ` Jan Beulich
2021-04-22 14:56 ` [PATCH v3 20/22] x86emul: support tile multiplication insns Jan Beulich
2021-04-22 14:57 ` Jan Beulich [this message]
2021-04-22 14:57 ` [PATCH v3 22/22] x86: permit guests to use AMX and XFD Jan Beulich

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=19925d83-ae92-cdda-f5be-193cfb05c775@suse.com \
    --to=jbeulich@suse.com \
    --cc=andrew.cooper3@citrix.com \
    --cc=george.dunlap@citrix.com \
    --cc=roger.pau@citrix.com \
    --cc=wl@xen.org \
    --cc=xen-devel@lists.xenproject.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.