* [PATCH] target/arm: Implement SVE2 FMMLA
@ 2020-04-20 15:10 Stephen Long
2020-04-22 3:21 ` Richard Henderson
0 siblings, 1 reply; 2+ messages in thread
From: Stephen Long @ 2020-04-20 15:10 UTC (permalink / raw)
To: qemu-devel; +Cc: qemu-arm, richard.henderson, apazos
I'm not sure whether this insn counts as an SVE2 insn, since the manual [1]
lists it under "SVE floating point matrix multiply accumulate". I'm
submitting it for early review anyway.
[1] https://static.docs.arm.com/ddi0602/d/ISA_A64_xml_futureA-2019-12_OPT.pdf (page 2955-2956)
Signed-off-by: Stephen Long <steplong@quicinc.com>
---
target/arm/cpu.h | 10 ++++++++++
target/arm/helper-sve.h | 3 +++
target/arm/sve.decode | 4 ++++
target/arm/sve_helper.c | 35 +++++++++++++++++++++++++++++++++++
target/arm/translate-sve.c | 29 +++++++++++++++++++++++++++++
5 files changed, 81 insertions(+)
diff --git a/target/arm/cpu.h b/target/arm/cpu.h
index b7c7946771..d41c4a08c0 100644
--- a/target/arm/cpu.h
+++ b/target/arm/cpu.h
@@ -3870,6 +3870,16 @@ static inline bool isar_feature_aa64_sve2_bitperm(const ARMISARegisters *id)
return FIELD_EX64(id->id_aa64zfr0, ID_AA64ZFR0, BITPERM) != 0;
}
+/* Report whether the F32MM field of ID_AA64ZFR0 is non-zero. */
+static inline bool isar_feature_aa64_sve2_f32mm(const ARMISARegisters *id)
+{
+    uint64_t f32mm = FIELD_EX64(id->id_aa64zfr0, ID_AA64ZFR0, F32MM);
+
+    return f32mm != 0;
+}
+
+/* Report whether the F64MM field of ID_AA64ZFR0 is non-zero. */
+static inline bool isar_feature_aa64_sve2_f64mm(const ARMISARegisters *id)
+{
+    uint64_t f64mm = FIELD_EX64(id->id_aa64zfr0, ID_AA64ZFR0, F64MM);
+
+    return f64mm != 0;
+}
+
/*
* Feature tests for "does this exist in either 32-bit or 64-bit?"
*/
diff --git a/target/arm/helper-sve.h b/target/arm/helper-sve.h
index ea53750141..8104d23c5f 100644
--- a/target/arm/helper-sve.h
+++ b/target/arm/helper-sve.h
@@ -2683,3 +2683,6 @@ DEF_HELPER_FLAGS_5(sve2_sqrdcmlah_zzzz_s, TCG_CALL_NO_RWG,
void, ptr, ptr, ptr, ptr, i32)
DEF_HELPER_FLAGS_5(sve2_sqrdcmlah_zzzz_d, TCG_CALL_NO_RWG,
void, ptr, ptr, ptr, ptr, i32)
+
+/* FMMLA helpers: five pointer arguments followed by the i32 descriptor. */
+DEF_HELPER_FLAGS_6(fmmla_s, TCG_CALL_NO_RWG,
+                   void, ptr, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_6(fmmla_d, TCG_CALL_NO_RWG,
+                   void, ptr, ptr, ptr, ptr, ptr, i32)
diff --git a/target/arm/sve.decode b/target/arm/sve.decode
index 95c73c665a..dd987da648 100644
--- a/target/arm/sve.decode
+++ b/target/arm/sve.decode
@@ -1383,3 +1383,7 @@ UMLSLT_zzzw 01000100 .. 0 ..... 010 111 ..... ..... @rda_rn_rm
CMLA_zzzz 01000100 esz:2 0 rm:5 0010 rot:2 rn:5 rd:5 ra=%reg_movprfx
SQRDCMLAH_zzzz 01000100 esz:2 0 rm:5 0011 rot:2 rn:5 rd:5 ra=%reg_movprfx
+
+### SVE2 floating point matrix multiply accumulate
+
+# The '..' field after the fixed 01100100 prefix is left unconstrained here
+# (presumably the element size taken by @rda_rn_rm -- format not shown in
+# this hunk, confirm).  trans_FMMLA rejects a->esz below MO_32 and checks
+# the feature bit that matches the remaining sizes.
+FMMLA 01100100 .. 1 ..... 111001 ..... ..... @rda_rn_rm
diff --git a/target/arm/sve_helper.c b/target/arm/sve_helper.c
index b392a87aef..0d8fe856d0 100644
--- a/target/arm/sve_helper.c
+++ b/target/arm/sve_helper.c
@@ -7389,3 +7389,38 @@ void HELPER(sve2_histseg)(void *vd, void *vn, void *vm, uint32_t desc)
*(uint64_t *)(vd + i + 8) = out1;
}
}
+
+/*
+ * Floating-point matrix multiply-accumulate (FMMLA).
+ *
+ * The vectors are processed in segments of four TYPE elements; each
+ * segment is indexed as a 2x2 matrix.  For every segment and i, j in {0, 1}:
+ *
+ *     d[2*i + j] = a[2*i + j] + (n[2*i] * m[2*j] + n[2*i + 1] * m[2*j + 1])
+ *
+ * vd is the destination, va the addend, vn and vm the multiplicands.
+ * status is passed to every TYPE##_mul / TYPE##_add call, and the operand
+ * size in bytes is taken from simd_oprsz(desc).
+ *
+ * H is the host element-index adjustment for TYPE and must be used for
+ * every element access (it is empty for 64-bit elements).
+ *
+ * vd may overlap any of the inputs, so all twelve input elements of a
+ * segment are loaded before the first result element is stored.
+ */
+#define DO_FP_MATRIX_MUL(NAME, TYPE, H)                                   \
+void HELPER(NAME)(void *vd, void *va, void *vn, void *vm,                 \
+                  void *status, uint32_t desc)                            \
+{                                                                         \
+    /* A segment is one 2x2 matrix: four TYPE elements. */                \
+    const intptr_t seg_sz = 4 * sizeof(TYPE);                             \
+    intptr_t s, segs = simd_oprsz(desc) / seg_sz;                         \
+                                                                          \
+    for (s = 0; s < segs; ++s) {                                          \
+        TYPE *n = vn + s * seg_sz;                                        \
+        TYPE *m = vm + s * seg_sz;                                        \
+        TYPE *a = va + s * seg_sz;                                        \
+        TYPE *d = vd + s * seg_sz;                                        \
+        /* Load every input before storing: d may alias n, m or a. */     \
+        TYPE n00 = n[H(0)], n01 = n[H(1)], n10 = n[H(2)], n11 = n[H(3)];  \
+        TYPE m00 = m[H(0)], m01 = m[H(1)], m10 = m[H(2)], m11 = m[H(3)];  \
+        TYPE a00 = a[H(0)], a01 = a[H(1)], a10 = a[H(2)], a11 = a[H(3)];  \
+        TYPE p0, p1;                                                      \
+                                                                          \
+        /* i = 0, j = 0 */                                                \
+        p0 = TYPE##_mul(n00, m00, status);                                \
+        p1 = TYPE##_mul(n01, m01, status);                                \
+        d[H(0)] = TYPE##_add(a00, TYPE##_add(p0, p1, status), status);    \
+                                                                          \
+        /* i = 0, j = 1 */                                                \
+        p0 = TYPE##_mul(n00, m10, status);                                \
+        p1 = TYPE##_mul(n01, m11, status);                                \
+        d[H(1)] = TYPE##_add(a01, TYPE##_add(p0, p1, status), status);    \
+                                                                          \
+        /* i = 1, j = 0 */                                                \
+        p0 = TYPE##_mul(n10, m00, status);                                \
+        p1 = TYPE##_mul(n11, m01, status);                                \
+        d[H(2)] = TYPE##_add(a10, TYPE##_add(p0, p1, status), status);    \
+                                                                          \
+        /* i = 1, j = 1 */                                                \
+        p0 = TYPE##_mul(n10, m10, status);                                \
+        p1 = TYPE##_mul(n11, m11, status);                                \
+        d[H(3)] = TYPE##_add(a11, TYPE##_add(p0, p1, status), status);    \
+    }                                                                     \
+}
+
+DO_FP_MATRIX_MUL(fmmla_s, float32, H4)
+DO_FP_MATRIX_MUL(fmmla_d, float64, )
diff --git a/target/arm/translate-sve.c b/target/arm/translate-sve.c
index 0cbb35c691..29532424c1 100644
--- a/target/arm/translate-sve.c
+++ b/target/arm/translate-sve.c
@@ -7615,6 +7615,35 @@ static bool do_sve2_zzzz_fn(DisasContext *s, int rd, int rn, int rm, int ra,
return true;
}
+/*
+ * Translate FMMLA (floating-point matrix multiply-accumulate).
+ *
+ * Returns false for element sizes other than MO_32 and MO_64, and when the
+ * feature bit matching the element size (F32MM or F64MM) is absent.
+ * Otherwise returns true, emitting the helper call only if
+ * sve_access_check() succeeds.
+ */
+static bool trans_FMMLA(DisasContext *s, arg_rrrr_esz *a)
+{
+    /* Indexed by a->esz - MO_32. */
+    static gen_helper_gvec_4_ptr * const fns[2] = {
+        gen_helper_fmmla_s, gen_helper_fmmla_d
+    };
+
+    switch (a->esz) {
+    case MO_32:
+        if (!dc_isar_feature(aa64_sve2_f32mm, s)) {
+            return false;
+        }
+        break;
+    case MO_64:
+        if (!dc_isar_feature(aa64_sve2_f64mm, s)) {
+            return false;
+        }
+        break;
+    default:
+        return false;
+    }
+
+    if (sve_access_check(s)) {
+        unsigned vsz = vec_full_reg_size(s);
+        /* a->esz == MO_16 cannot hold here, so the argument is constant. */
+        TCGv_ptr status = get_fpstatus_ptr(false);
+
+        /* Offsets go in d, a, n, m order to match the helper's arguments. */
+        tcg_gen_gvec_4_ptr(vec_full_reg_offset(s, a->rd),
+                           vec_full_reg_offset(s, a->ra),
+                           vec_full_reg_offset(s, a->rn),
+                           vec_full_reg_offset(s, a->rm),
+                           status, vsz, vsz, 0, fns[a->esz - MO_32]);
+        /* Release the temporary obtained from get_fpstatus_ptr(). */
+        tcg_temp_free_ptr(status);
+    }
+    return true;
+}
+
static bool do_sqdmlal_zzzw(DisasContext *s, arg_rrrr_esz *a,
bool sel1, bool sel2)
{
--
2.17.1
^ permalink raw reply related [flat|nested] 2+ messages in thread
* Re: [PATCH] target/arm: Implement SVE2 FMMLA
2020-04-20 15:10 [PATCH] target/arm: Implement SVE2 FMMLA Stephen Long
@ 2020-04-22 3:21 ` Richard Henderson
0 siblings, 0 replies; 2+ messages in thread
From: Richard Henderson @ 2020-04-22 3:21 UTC (permalink / raw)
To: Stephen Long, qemu-devel; +Cc: qemu-arm, apazos
On 4/20/20 8:10 AM, Stephen Long wrote:
> +#define DO_FP_MATRIX_MUL(NAME, TYPE, H) \
> +void HELPER(NAME)(void *vd, void *va, void *vn, void *vm, \
> + void *status, uint32_t desc) \
> +{ \
> + intptr_t s, i, j; \
> + intptr_t opr_sz = simd_oprsz(desc) / (sizeof(TYPE) >> 2); \
> + \
> + for (s = 0; s < opr_sz; ++s) { \
> + TYPE *n = vn + s * (sizeof(TYPE) >> 2); \
> + TYPE *m = vm + s * (sizeof(TYPE) >> 2); \
> + TYPE *a = va + s * (sizeof(TYPE) >> 2); \
> + TYPE *d = vd + s * (sizeof(TYPE) >> 2); \
> + \
> + for (i = 0; i < 1; ++i) { \
> + for (j = 0; j < 1; ++j) { \
> + TYPE addend = a[H(2*i + j)]; \
> + \
> + TYPE nn0 = n[H(2*i)]; \
> + TYPE mm0 = m[H(2*j)]; \
> + TYPE prod0 = TYPE##_mul(nn0, mm0, status); \
> + \
> + TYPE nn1 = n[H4(2*i + 1)]; \
> + TYPE mm1 = m[H4(2*j + 1)]; \
> + TYPE prod1 = TYPE##_mul(nn1, mm1, status); \
> + \
> + TYPE sum = TYPE##_add(prod0, prod1, status); \
> + d[H(2*i + j)] = TYPE##_add(sum, addend, status); \
> + } \
> + } \
This has a read-after-write problem, when D overlaps any of the inputs. You
need to read all of the inputs before writing anything.
It might be easiest to just unroll these two inner loops:
TYPE n00 = n[0], n01 = n[1], n10 = n[2], n11 = n[3];
TYPE m00 = m[0], m01 = m[1], m10 = m[2], m11 = m[3];
TYPE p0, p1;
// i = 0, j = 0
p0 = mul(n00, m00, status);
p1 = mul(n01, m01, status);
a[0] = add(a[0], add(p0, p1, status), status);
// i = 0, j = 1
p0 = mul(n00, m10, status);
p1 = mul(n01, m11, status);
a[1] = add(a[1], add(p0, p1, status), status);
...
r~
^ permalink raw reply [flat|nested] 2+ messages in thread
end of thread, other threads:[~2020-04-22 3:22 UTC | newest]
Thread overview: 2+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2020-04-20 15:10 [PATCH] target/arm: Implement SVE2 FMMLA Stephen Long
2020-04-22 3:21 ` Richard Henderson
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.