* [PATCH v1 01/11] target/arm: Add isar_feature_{aa32, aa64, aa64_sve}_bf16
2021-04-16 23:59 [PATCH v1 for-6.1 00/11] target/arm: Implement BFloat16 Richard Henderson
@ 2021-04-16 23:59 ` Richard Henderson
2021-05-18 10:43 ` Peter Maydell
2021-04-16 23:59 ` [PATCH v1 02/11] target/arm: Unify unallocated path in disas_fp_1src Richard Henderson
` (9 subsequent siblings)
10 siblings, 1 reply; 30+ messages in thread
From: Richard Henderson @ 2021-04-16 23:59 UTC (permalink / raw)
To: qemu-devel; +Cc: qemu-arm
Note that the SVE BFLOAT16 support does not require SVE2,
it is an independent extension.
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
target/arm/cpu.h | 15 +++++++++++++++
1 file changed, 15 insertions(+)
diff --git a/target/arm/cpu.h b/target/arm/cpu.h
index 134dc65e34..38db20c721 100644
--- a/target/arm/cpu.h
+++ b/target/arm/cpu.h
@@ -3783,6 +3783,11 @@ static inline bool isar_feature_aa32_predinv(const ARMISARegisters *id)
return FIELD_EX32(id->id_isar6, ID_ISAR6, SPECRES) != 0;
}
+static inline bool isar_feature_aa32_bf16(const ARMISARegisters *id)
+{
+ return FIELD_EX32(id->id_isar6, ID_ISAR6, BF16) != 0;
+}
+
static inline bool isar_feature_aa32_i8mm(const ARMISARegisters *id)
{
return FIELD_EX32(id->id_isar6, ID_ISAR6, I8MM) != 0;
@@ -4112,6 +4117,11 @@ static inline bool isar_feature_aa64_dcpodp(const ARMISARegisters *id)
return FIELD_EX64(id->id_aa64isar1, ID_AA64ISAR1, DPB) >= 2;
}
+static inline bool isar_feature_aa64_bf16(const ARMISARegisters *id)
+{
+ return FIELD_EX64(id->id_aa64isar1, ID_AA64ISAR1, BF16) != 0;
+}
+
static inline bool isar_feature_aa64_fp_simd(const ARMISARegisters *id)
{
/* We always set the AdvSIMD and FP fields identically. */
@@ -4256,6 +4266,11 @@ static inline bool isar_feature_aa64_sve2_bitperm(const ARMISARegisters *id)
return FIELD_EX64(id->id_aa64zfr0, ID_AA64ZFR0, BITPERM) != 0;
}
+static inline bool isar_feature_aa64_sve_bf16(const ARMISARegisters *id)
+{
+ return FIELD_EX64(id->id_aa64zfr0, ID_AA64ZFR0, BFLOAT16) != 0;
+}
+
static inline bool isar_feature_aa64_sve2_sha3(const ARMISARegisters *id)
{
return FIELD_EX64(id->id_aa64zfr0, ID_AA64ZFR0, SHA3) != 0;
--
2.25.1
^ permalink raw reply related [flat|nested] 30+ messages in thread
* [PATCH v1 02/11] target/arm: Unify unallocated path in disas_fp_1src
2021-04-16 23:59 [PATCH v1 for-6.1 00/11] target/arm: Implement BFloat16 Richard Henderson
2021-04-16 23:59 ` [PATCH v1 01/11] target/arm: Add isar_feature_{aa32, aa64, aa64_sve}_bf16 Richard Henderson
@ 2021-04-16 23:59 ` Richard Henderson
2021-05-18 10:43 ` Peter Maydell
2021-04-16 23:59 ` [PATCH v1 03/11] target/arm: Implement scalar float32 to bfloat16 conversion Richard Henderson
` (8 subsequent siblings)
10 siblings, 1 reply; 30+ messages in thread
From: Richard Henderson @ 2021-04-16 23:59 UTC (permalink / raw)
To: qemu-devel; +Cc: qemu-arm
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
target/arm/translate-a64.c | 15 ++++++---------
1 file changed, 6 insertions(+), 9 deletions(-)
diff --git a/target/arm/translate-a64.c b/target/arm/translate-a64.c
index 668edf3a00..d8ec219bb2 100644
--- a/target/arm/translate-a64.c
+++ b/target/arm/translate-a64.c
@@ -6509,8 +6509,7 @@ static void disas_fp_1src(DisasContext *s, uint32_t insn)
int rd = extract32(insn, 0, 5);
if (mos) {
- unallocated_encoding(s);
- return;
+ goto do_unallocated;
}
switch (opcode) {
@@ -6519,8 +6518,7 @@ static void disas_fp_1src(DisasContext *s, uint32_t insn)
/* FCVT between half, single and double precision */
int dtype = extract32(opcode, 0, 2);
if (type == 2 || dtype == type) {
- unallocated_encoding(s);
- return;
+ goto do_unallocated;
}
if (!fp_access_check(s)) {
return;
@@ -6532,8 +6530,7 @@ static void disas_fp_1src(DisasContext *s, uint32_t insn)
case 0x10 ... 0x13: /* FRINT{32,64}{X,Z} */
if (type > 1 || !dc_isar_feature(aa64_frint, s)) {
- unallocated_encoding(s);
- return;
+ goto do_unallocated;
}
/* fall through */
case 0x0 ... 0x3:
@@ -6555,8 +6552,7 @@ static void disas_fp_1src(DisasContext *s, uint32_t insn)
break;
case 3:
if (!dc_isar_feature(aa64_fp16, s)) {
- unallocated_encoding(s);
- return;
+ goto do_unallocated;
}
if (!fp_access_check(s)) {
@@ -6565,11 +6561,12 @@ static void disas_fp_1src(DisasContext *s, uint32_t insn)
handle_fp_1src_half(s, opcode, rd, rn);
break;
default:
- unallocated_encoding(s);
+ goto do_unallocated;
}
break;
default:
+ do_unallocated:
unallocated_encoding(s);
break;
}
--
2.25.1
^ permalink raw reply related [flat|nested] 30+ messages in thread
* [PATCH v1 03/11] target/arm: Implement scalar float32 to bfloat16 conversion
2021-04-16 23:59 [PATCH v1 for-6.1 00/11] target/arm: Implement BFloat16 Richard Henderson
2021-04-16 23:59 ` [PATCH v1 01/11] target/arm: Add isar_feature_{aa32, aa64, aa64_sve}_bf16 Richard Henderson
2021-04-16 23:59 ` [PATCH v1 02/11] target/arm: Unify unallocated path in disas_fp_1src Richard Henderson
@ 2021-04-16 23:59 ` Richard Henderson
2021-05-18 10:53 ` Peter Maydell
2021-04-16 23:59 ` [PATCH v1 04/11] target/arm: Implement vector " Richard Henderson
` (7 subsequent siblings)
10 siblings, 1 reply; 30+ messages in thread
From: Richard Henderson @ 2021-04-16 23:59 UTC (permalink / raw)
To: qemu-devel; +Cc: qemu-arm
This is the 64-bit BFCVT and the 32-bit VCVT{B,T}.BF16.F32.
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
target/arm/helper.h | 1 +
target/arm/vfp.decode | 2 ++
target/arm/translate-a64.c | 19 +++++++++++++++++++
target/arm/vfp_helper.c | 5 +++++
target/arm/translate-vfp.c.inc | 24 ++++++++++++++++++++++++
5 files changed, 51 insertions(+)
diff --git a/target/arm/helper.h b/target/arm/helper.h
index 33df62f44d..0892207f80 100644
--- a/target/arm/helper.h
+++ b/target/arm/helper.h
@@ -143,6 +143,7 @@ DEF_HELPER_3(vfp_cmped, void, f64, f64, env)
DEF_HELPER_2(vfp_fcvtds, f64, f32, env)
DEF_HELPER_2(vfp_fcvtsd, f32, f64, env)
+DEF_HELPER_FLAGS_2(bfcvt, TCG_CALL_NO_RWG, i32, f32, ptr)
DEF_HELPER_2(vfp_uitoh, f16, i32, ptr)
DEF_HELPER_2(vfp_uitos, f32, i32, ptr)
diff --git a/target/arm/vfp.decode b/target/arm/vfp.decode
index 6f7f28f9a4..52535d9b0b 100644
--- a/target/arm/vfp.decode
+++ b/target/arm/vfp.decode
@@ -205,6 +205,8 @@ VCVT_f64_f16 ---- 1110 1.11 0010 .... 1011 t:1 1.0 .... \
# VCVTB and VCVTT to f16: Vd format is always vd_sp;
# Vm format depends on size bit
+VCVT_b16_f32 ---- 1110 1.11 0011 .... 1001 t:1 1.0 .... \
+ vd=%vd_sp vm=%vm_sp
VCVT_f16_f32 ---- 1110 1.11 0011 .... 1010 t:1 1.0 .... \
vd=%vd_sp vm=%vm_sp
VCVT_f16_f64 ---- 1110 1.11 0011 .... 1011 t:1 1.0 .... \
diff --git a/target/arm/translate-a64.c b/target/arm/translate-a64.c
index d8ec219bb2..d767194cc7 100644
--- a/target/arm/translate-a64.c
+++ b/target/arm/translate-a64.c
@@ -6288,6 +6288,9 @@ static void handle_fp_1src_single(DisasContext *s, int opcode, int rd, int rn)
case 0x3: /* FSQRT */
gen_helper_vfp_sqrts(tcg_res, tcg_op, cpu_env);
goto done;
+ case 0x6: /* BFCVT */
+ gen_fpst = gen_helper_bfcvt;
+ break;
case 0x8: /* FRINTN */
case 0x9: /* FRINTP */
case 0xa: /* FRINTM */
@@ -6565,6 +6568,22 @@ static void disas_fp_1src(DisasContext *s, uint32_t insn)
}
break;
+ case 0x6:
+ switch (type) {
+ case 1: /* BFCVT */
+ if (!dc_isar_feature(aa64_bf16, s)) {
+ goto do_unallocated;
+ }
+ if (!fp_access_check(s)) {
+ return;
+ }
+ handle_fp_1src_single(s, opcode, rd, rn);
+ break;
+ default:
+ goto do_unallocated;
+ }
+ break;
+
default:
do_unallocated:
unallocated_encoding(s);
diff --git a/target/arm/vfp_helper.c b/target/arm/vfp_helper.c
index 01b9d8557f..fe7a2a5daa 100644
--- a/target/arm/vfp_helper.c
+++ b/target/arm/vfp_helper.c
@@ -408,6 +408,11 @@ float32 VFP_HELPER(fcvts, d)(float64 x, CPUARMState *env)
return float64_to_float32(x, &env->vfp.fp_status);
}
+uint32_t HELPER(bfcvt)(float32 x, void *status)
+{
+ return float32_to_bfloat16(x, status);
+}
+
/*
* VFP3 fixed point conversion. The AArch32 versions of fix-to-float
* must always round-to-nearest; the AArch64 ones honour the FPSCR
diff --git a/target/arm/translate-vfp.c.inc b/target/arm/translate-vfp.c.inc
index e20d9c7ba6..709d1fddcf 100644
--- a/target/arm/translate-vfp.c.inc
+++ b/target/arm/translate-vfp.c.inc
@@ -3003,6 +3003,30 @@ static bool trans_VCVT_f64_f16(DisasContext *s, arg_VCVT_f64_f16 *a)
return true;
}
+static bool trans_VCVT_b16_f32(DisasContext *s, arg_VCVT_b16_f32 *a)
+{
+ TCGv_ptr fpst;
+ TCGv_i32 tmp;
+
+ if (!dc_isar_feature(aa32_bf16, s)) {
+ return false;
+ }
+
+ if (!vfp_access_check(s)) {
+ return true;
+ }
+
+ fpst = fpstatus_ptr(FPST_FPCR);
+ tmp = tcg_temp_new_i32();
+
+ vfp_load_reg32(tmp, a->vm);
+ gen_helper_bfcvt(tmp, tmp, fpst);
+ tcg_gen_st16_i32(tmp, cpu_env, vfp_f16_offset(a->vd, a->t));
+ tcg_temp_free_ptr(fpst);
+ tcg_temp_free_i32(tmp);
+ return true;
+}
+
static bool trans_VCVT_f16_f32(DisasContext *s, arg_VCVT_f16_f32 *a)
{
TCGv_ptr fpst;
--
2.25.1
^ permalink raw reply related [flat|nested] 30+ messages in thread
* [PATCH v1 04/11] target/arm: Implement vector float32 to bfloat16 conversion
2021-04-16 23:59 [PATCH v1 for-6.1 00/11] target/arm: Implement BFloat16 Richard Henderson
` (2 preceding siblings ...)
2021-04-16 23:59 ` [PATCH v1 03/11] target/arm: Implement scalar float32 to bfloat16 conversion Richard Henderson
@ 2021-04-16 23:59 ` Richard Henderson
2021-05-18 11:10 ` Peter Maydell
2021-04-16 23:59 ` [PATCH v1 05/11] fpu: Add float_round_to_odd_inf Richard Henderson
` (6 subsequent siblings)
10 siblings, 1 reply; 30+ messages in thread
From: Richard Henderson @ 2021-04-16 23:59 UTC (permalink / raw)
To: qemu-devel; +Cc: qemu-arm
This is BFCVT{N,T} for both AArch64 AdvSIMD and SVE,
and VCVT.BF16.F32 for AArch32 NEON.
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
target/arm/helper-sve.h | 4 +++
target/arm/helper.h | 1 +
target/arm/neon-dp.decode | 1 +
target/arm/sve.decode | 2 ++
target/arm/sve_helper.c | 2 ++
target/arm/translate-a64.c | 17 +++++++++++++
target/arm/translate-sve.c | 16 ++++++++++++
target/arm/vfp_helper.c | 7 +++++
target/arm/translate-neon.c.inc | 45 +++++++++++++++++++++++++++++++++
9 files changed, 95 insertions(+)
diff --git a/target/arm/helper-sve.h b/target/arm/helper-sve.h
index fa7418e706..9287e6f26c 100644
--- a/target/arm/helper-sve.h
+++ b/target/arm/helper-sve.h
@@ -1197,6 +1197,8 @@ DEF_HELPER_FLAGS_5(sve_fcvt_hd, TCG_CALL_NO_RWG,
void, ptr, ptr, ptr, ptr, i32)
DEF_HELPER_FLAGS_5(sve_fcvt_sd, TCG_CALL_NO_RWG,
void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve_bfcvt, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, ptr, i32)
DEF_HELPER_FLAGS_5(sve_fcvtzs_hh, TCG_CALL_NO_RWG,
void, ptr, ptr, ptr, ptr, i32)
@@ -2744,6 +2746,8 @@ DEF_HELPER_FLAGS_5(sve2_fcvtnt_sh, TCG_CALL_NO_RWG,
void, ptr, ptr, ptr, ptr, i32)
DEF_HELPER_FLAGS_5(sve2_fcvtnt_ds, TCG_CALL_NO_RWG,
void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(sve_bfcvtnt, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, ptr, i32)
DEF_HELPER_FLAGS_5(sve2_fcvtlt_hs, TCG_CALL_NO_RWG,
void, ptr, ptr, ptr, ptr, i32)
diff --git a/target/arm/helper.h b/target/arm/helper.h
index 0892207f80..0b52ee6256 100644
--- a/target/arm/helper.h
+++ b/target/arm/helper.h
@@ -144,6 +144,7 @@ DEF_HELPER_3(vfp_cmped, void, f64, f64, env)
DEF_HELPER_2(vfp_fcvtds, f64, f32, env)
DEF_HELPER_2(vfp_fcvtsd, f32, f64, env)
DEF_HELPER_FLAGS_2(bfcvt, TCG_CALL_NO_RWG, i32, f32, ptr)
+DEF_HELPER_FLAGS_2(bfcvt_pair, TCG_CALL_NO_RWG, i32, i64, ptr)
DEF_HELPER_2(vfp_uitoh, f16, i32, ptr)
DEF_HELPER_2(vfp_uitos, f32, i32, ptr)
diff --git a/target/arm/neon-dp.decode b/target/arm/neon-dp.decode
index ec83f10ab3..fd3a01bfa0 100644
--- a/target/arm/neon-dp.decode
+++ b/target/arm/neon-dp.decode
@@ -521,6 +521,7 @@ Vimm_1r 1111 001 . 1 . 000 ... .... cmode:4 0 . op:1 1 .... @1reg_imm
VRINTZ 1111 001 11 . 11 .. 10 .... 0 1011 . . 0 .... @2misc
VCVT_F16_F32 1111 001 11 . 11 .. 10 .... 0 1100 0 . 0 .... @2misc_q0
+ VCVT_B16_F32 1111 001 11 . 11 .. 10 .... 0 1100 1 . 0 .... @2misc_q0
VRINTM 1111 001 11 . 11 .. 10 .... 0 1101 . . 0 .... @2misc
diff --git a/target/arm/sve.decode b/target/arm/sve.decode
index 3d7c4fa6e5..bad81580c5 100644
--- a/target/arm/sve.decode
+++ b/target/arm/sve.decode
@@ -987,6 +987,7 @@ FNMLS_zpzzz 01100101 .. 1 ..... 111 ... ..... ..... @rdn_pg_rm_ra
# SVE floating-point convert precision
FCVT_sh 01100101 10 0010 00 101 ... ..... ..... @rd_pg_rn_e0
FCVT_hs 01100101 10 0010 01 101 ... ..... ..... @rd_pg_rn_e0
+BFCVT 01100101 10 0010 10 101 ... ..... ..... @rd_pg_rn_e0
FCVT_dh 01100101 11 0010 00 101 ... ..... ..... @rd_pg_rn_e0
FCVT_hd 01100101 11 0010 01 101 ... ..... ..... @rd_pg_rn_e0
FCVT_ds 01100101 11 0010 10 101 ... ..... ..... @rd_pg_rn_e0
@@ -1561,6 +1562,7 @@ RAX1 01000101 00 1 ..... 11110 1 ..... ..... @rd_rn_rm_e0
FCVTXNT_ds 01100100 00 0010 10 101 ... ..... ..... @rd_pg_rn_e0
FCVTX_ds 01100101 00 0010 10 101 ... ..... ..... @rd_pg_rn_e0
FCVTNT_sh 01100100 10 0010 00 101 ... ..... ..... @rd_pg_rn_e0
+BFCVTNT 01100100 10 0010 10 101 ... ..... ..... @rd_pg_rn_e0
FCVTLT_hs 01100100 10 0010 01 101 ... ..... ..... @rd_pg_rn_e0
FCVTNT_ds 01100100 11 0010 10 101 ... ..... ..... @rd_pg_rn_e0
FCVTLT_sd 01100100 11 0010 11 101 ... ..... ..... @rd_pg_rn_e0
diff --git a/target/arm/sve_helper.c b/target/arm/sve_helper.c
index c5c3017745..ae3db11c0d 100644
--- a/target/arm/sve_helper.c
+++ b/target/arm/sve_helper.c
@@ -4570,6 +4570,7 @@ static inline uint64_t vfp_float64_to_uint64_rtz(float64 f, float_status *s)
DO_ZPZ_FP(sve_fcvt_sh, uint32_t, H1_4, sve_f32_to_f16)
DO_ZPZ_FP(sve_fcvt_hs, uint32_t, H1_4, sve_f16_to_f32)
+DO_ZPZ_FP(sve_bfcvt, uint32_t, H1_4, float32_to_bfloat16)
DO_ZPZ_FP(sve_fcvt_dh, uint64_t, , sve_f64_to_f16)
DO_ZPZ_FP(sve_fcvt_hd, uint64_t, , sve_f16_to_f64)
DO_ZPZ_FP(sve_fcvt_ds, uint64_t, , float64_to_float32)
@@ -7567,6 +7568,7 @@ void HELPER(NAME)(void *vd, void *vn, void *vg, void *status, uint32_t desc) \
}
DO_FCVTNT(sve2_fcvtnt_sh, uint32_t, uint16_t, H1_4, H1_2, sve_f32_to_f16)
+DO_FCVTNT(sve_bfcvtnt, uint32_t, uint16_t, H1_4, H1_2, float32_to_bfloat16)
DO_FCVTNT(sve2_fcvtnt_ds, uint64_t, uint32_t, H1_4, H1_2, float64_to_float32)
#define DO_FCVTLT(NAME, TYPEW, TYPEN, HW, HN, OP) \
diff --git a/target/arm/translate-a64.c b/target/arm/translate-a64.c
index d767194cc7..c528fb2cf0 100644
--- a/target/arm/translate-a64.c
+++ b/target/arm/translate-a64.c
@@ -10361,6 +10361,13 @@ static void handle_2misc_narrow(DisasContext *s, bool scalar,
tcg_temp_free_i32(ahp);
}
break;
+ case 0x36: /* BFCVTN, BFCVTN2 */
+ {
+ TCGv_ptr fpst = fpstatus_ptr(FPST_FPCR);
+ gen_helper_bfcvt_pair(tcg_res[pass], tcg_op, fpst);
+ tcg_temp_free_ptr(fpst);
+ }
+ break;
case 0x56: /* FCVTXN, FCVTXN2 */
/* 64 bit to 32 bit float conversion
* with von Neumann rounding (round to odd)
@@ -12761,6 +12768,16 @@ static void disas_simd_two_reg_misc(DisasContext *s, uint32_t insn)
}
handle_2misc_narrow(s, false, opcode, 0, is_q, size - 1, rn, rd);
return;
+ case 0x36: /* BFCVTN, BFCVTN2 */
+ if (!dc_isar_feature(aa64_bf16, s) || size != 2) {
+ unallocated_encoding(s);
+ return;
+ }
+ if (!fp_access_check(s)) {
+ return;
+ }
+ handle_2misc_narrow(s, false, opcode, 0, is_q, size - 1, rn, rd);
+ return;
case 0x17: /* FCVTL, FCVTL2 */
if (!fp_access_check(s)) {
return;
diff --git a/target/arm/translate-sve.c b/target/arm/translate-sve.c
index cb0e7a1f68..aacbabd11e 100644
--- a/target/arm/translate-sve.c
+++ b/target/arm/translate-sve.c
@@ -4715,6 +4715,14 @@ static bool trans_FCVT_hs(DisasContext *s, arg_rpr_esz *a)
return do_zpz_ptr(s, a->rd, a->rn, a->pg, false, gen_helper_sve_fcvt_hs);
}
+static bool trans_BFCVT(DisasContext *s, arg_rpr_esz *a)
+{
+ if (!dc_isar_feature(aa64_sve_bf16, s)) {
+ return false;
+ }
+ return do_zpz_ptr(s, a->rd, a->rn, a->pg, false, gen_helper_sve_bfcvt);
+}
+
static bool trans_FCVT_dh(DisasContext *s, arg_rpr_esz *a)
{
return do_zpz_ptr(s, a->rd, a->rn, a->pg, false, gen_helper_sve_fcvt_dh);
@@ -8405,6 +8413,14 @@ static bool trans_FCVTNT_sh(DisasContext *s, arg_rpr_esz *a)
return do_zpz_ptr(s, a->rd, a->rn, a->pg, false, gen_helper_sve2_fcvtnt_sh);
}
+static bool trans_BFCVTNT(DisasContext *s, arg_rpr_esz *a)
+{
+ if (!dc_isar_feature(aa64_sve_bf16, s)) {
+ return false;
+ }
+ return do_zpz_ptr(s, a->rd, a->rn, a->pg, false, gen_helper_sve_bfcvtnt);
+}
+
static bool trans_FCVTNT_ds(DisasContext *s, arg_rpr_esz *a)
{
if (!dc_isar_feature(aa64_sve2, s)) {
diff --git a/target/arm/vfp_helper.c b/target/arm/vfp_helper.c
index fe7a2a5daa..3328423cec 100644
--- a/target/arm/vfp_helper.c
+++ b/target/arm/vfp_helper.c
@@ -413,6 +413,13 @@ uint32_t HELPER(bfcvt)(float32 x, void *status)
return float32_to_bfloat16(x, status);
}
+uint32_t HELPER(bfcvt_pair)(uint64_t pair, void *status)
+{
+ bfloat16 lo = float32_to_bfloat16(extract64(pair, 0, 32), status);
+ bfloat16 hi = float32_to_bfloat16(extract64(pair, 32, 32), status);
+ return deposit32(lo, 16, 16, hi);
+}
+
/*
* VFP3 fixed point conversion. The AArch32 versions of fix-to-float
* must always round-to-nearest; the AArch64 ones honour the FPSCR
diff --git a/target/arm/translate-neon.c.inc b/target/arm/translate-neon.c.inc
index f1893b1dc8..8cc53892d6 100644
--- a/target/arm/translate-neon.c.inc
+++ b/target/arm/translate-neon.c.inc
@@ -3413,6 +3413,51 @@ static bool trans_VSHLL(DisasContext *s, arg_2misc *a)
return true;
}
+static bool trans_VCVT_B16_F32(DisasContext *s, arg_2misc *a)
+{
+ TCGv_ptr fpst;
+ TCGv_i64 tmp;
+ TCGv_i32 dst0, dst1;
+
+ if (!dc_isar_feature(aa32_bf16, s)) {
+ return false;
+ }
+
+ /* UNDEF accesses to D16-D31 if they don't exist. */
+ if (!dc_isar_feature(aa32_simd_r32, s) &&
+ ((a->vd | a->vm) & 0x10)) {
+ return false;
+ }
+
+ if ((a->vm & 1) || (a->size != 1)) {
+ return false;
+ }
+
+ if (!vfp_access_check(s)) {
+ return true;
+ }
+
+ fpst = fpstatus_ptr(FPST_STD);
+ tmp = tcg_temp_new_i64();
+ dst0 = tcg_temp_new_i32();
+ dst1 = tcg_temp_new_i32();
+
+ read_neon_element64(tmp, a->vm, 0, MO_64);
+ gen_helper_bfcvt_pair(dst0, tmp, fpst);
+
+ read_neon_element64(tmp, a->vm, 1, MO_64);
+ gen_helper_bfcvt_pair(dst1, tmp, fpst);
+
+ write_neon_element32(dst0, a->vd, 0, MO_32);
+ write_neon_element32(dst1, a->vd, 1, MO_32);
+
+ tcg_temp_free_i64(tmp);
+ tcg_temp_free_i32(dst0);
+ tcg_temp_free_i32(dst1);
+ tcg_temp_free_ptr(fpst);
+ return true;
+}
+
static bool trans_VCVT_F16_F32(DisasContext *s, arg_2misc *a)
{
TCGv_ptr fpst;
--
2.25.1
^ permalink raw reply related [flat|nested] 30+ messages in thread
* Re: [PATCH v1 04/11] target/arm: Implement vector float32 to bfloat16 conversion
2021-04-16 23:59 ` [PATCH v1 04/11] target/arm: Implement vector " Richard Henderson
@ 2021-05-18 11:10 ` Peter Maydell
2021-05-18 14:32 ` Richard Henderson
0 siblings, 1 reply; 30+ messages in thread
From: Peter Maydell @ 2021-05-18 11:10 UTC (permalink / raw)
To: Richard Henderson; +Cc: qemu-arm, QEMU Developers
On Sat, 17 Apr 2021 at 01:03, Richard Henderson
<richard.henderson@linaro.org> wrote:
>
> This is BFCVT{N,T} for both AArch64 AdvSIMD and SVE,
> and VCVT.BF16.F32 for AArch32 NEON.
>
> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
> @@ -7567,6 +7568,7 @@ void HELPER(NAME)(void *vd, void *vn, void *vg, void *status, uint32_t desc) \
> }
>
> DO_FCVTNT(sve2_fcvtnt_sh, uint32_t, uint16_t, H1_4, H1_2, sve_f32_to_f16)
> +DO_FCVTNT(sve_bfcvtnt, uint32_t, uint16_t, H1_4, H1_2, float32_to_bfloat16)
> DO_FCVTNT(sve2_fcvtnt_ds, uint64_t, uint32_t, H1_4, H1_2, float64_to_float32)
Not related to this patch, but are the H macros for sve2_fcvtnt_ds definitely
right? Just noticed they're the same as the ones being used for the f32->f16
helpers despite the types being different sizes.
> diff --git a/target/arm/translate-neon.c.inc b/target/arm/translate-neon.c.inc
> index f1893b1dc8..8cc53892d6 100644
> --- a/target/arm/translate-neon.c.inc
> +++ b/target/arm/translate-neon.c.inc
> @@ -3413,6 +3413,51 @@ static bool trans_VSHLL(DisasContext *s, arg_2misc *a)
> return true;
> }
>
> +static bool trans_VCVT_B16_F32(DisasContext *s, arg_2misc *a)
> +{
> + TCGv_ptr fpst;
> + TCGv_i64 tmp;
> + TCGv_i32 dst0, dst1;
> +
> + if (!dc_isar_feature(aa32_bf16, s)) {
> + return false;
> + }
Do we need to also check ARM_FEATURE_NEON here ?
> +
> + /* UNDEF accesses to D16-D31 if they don't exist. */
> + if (!dc_isar_feature(aa32_simd_r32, s) &&
> + ((a->vd | a->vm) & 0x10)) {
> + return false;
> + }
> +
> + if ((a->vm & 1) || (a->size != 1)) {
> + return false;
> + }
> +
> + if (!vfp_access_check(s)) {
> + return true;
> + }
> +
> + fpst = fpstatus_ptr(FPST_STD);
> + tmp = tcg_temp_new_i64();
> + dst0 = tcg_temp_new_i32();
> + dst1 = tcg_temp_new_i32();
> +
> + read_neon_element64(tmp, a->vm, 0, MO_64);
> + gen_helper_bfcvt_pair(dst0, tmp, fpst);
> +
> + read_neon_element64(tmp, a->vm, 1, MO_64);
> + gen_helper_bfcvt_pair(dst1, tmp, fpst);
> +
> + write_neon_element32(dst0, a->vd, 0, MO_32);
> + write_neon_element32(dst1, a->vd, 1, MO_32);
> +
> + tcg_temp_free_i64(tmp);
> + tcg_temp_free_i32(dst0);
> + tcg_temp_free_i32(dst1);
> + tcg_temp_free_ptr(fpst);
> + return true;
> +}
> +
> static bool trans_VCVT_F16_F32(DisasContext *s, arg_2misc *a)
> {
> TCGv_ptr fpst;
> --
> 2.25.1
Otherwise
Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
thanks
-- PMM
^ permalink raw reply [flat|nested] 30+ messages in thread
* Re: [PATCH v1 04/11] target/arm: Implement vector float32 to bfloat16 conversion
2021-05-18 11:10 ` Peter Maydell
@ 2021-05-18 14:32 ` Richard Henderson
0 siblings, 0 replies; 30+ messages in thread
From: Richard Henderson @ 2021-05-18 14:32 UTC (permalink / raw)
To: Peter Maydell; +Cc: qemu-arm, QEMU Developers
On 5/18/21 6:10 AM, Peter Maydell wrote:
> On Sat, 17 Apr 2021 at 01:03, Richard Henderson
> <richard.henderson@linaro.org> wrote:
>>
>> This is BFCVT{N,T} for both AArch64 AdvSIMD and SVE,
>> and VCVT.BF16.F32 for AArch32 NEON.
>>
>> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
>
>> @@ -7567,6 +7568,7 @@ void HELPER(NAME)(void *vd, void *vn, void *vg, void *status, uint32_t desc) \
>> }
>>
>> DO_FCVTNT(sve2_fcvtnt_sh, uint32_t, uint16_t, H1_4, H1_2, sve_f32_to_f16)
>> +DO_FCVTNT(sve_bfcvtnt, uint32_t, uint16_t, H1_4, H1_2, float32_to_bfloat16)
>> DO_FCVTNT(sve2_fcvtnt_ds, uint64_t, uint32_t, H1_4, H1_2, float64_to_float32)
>
> Not related to this patch, but are the H macros for sve2_fcvtnt_ds definitely
> right? Just noticed they're the same as the ones being used for the f32->f16
> helpers despite the types being different sizes.
Definitely wrong, and now fixed in the sve2 patch set (need to fix some
regressions there before re-posting).
>> + if (!dc_isar_feature(aa32_bf16, s)) {
>> + return false;
>> + }
>
> Do we need to also check ARM_FEATURE_NEON here ?
Hmm, I dunno. Since FEAT_AA32BF16 has both VFP and NEON instructions, I guess
we could turn off one without the other.
I'll add it.
r~
^ permalink raw reply [flat|nested] 30+ messages in thread
* [PATCH v1 05/11] fpu: Add float_round_to_odd_inf
2021-04-16 23:59 [PATCH v1 for-6.1 00/11] target/arm: Implement BFloat16 Richard Henderson
` (3 preceding siblings ...)
2021-04-16 23:59 ` [PATCH v1 04/11] target/arm: Implement vector " Richard Henderson
@ 2021-04-16 23:59 ` Richard Henderson
2021-05-18 11:20 ` Peter Maydell
2021-04-16 23:59 ` [PATCH v1 06/11] target/arm: Implement bfloat16 dot product (vector) Richard Henderson
` (5 subsequent siblings)
10 siblings, 1 reply; 30+ messages in thread
From: Richard Henderson @ 2021-04-16 23:59 UTC (permalink / raw)
To: qemu-devel; +Cc: qemu-arm
For Arm BFDOT and BFMMLA, we need a version of round-to-odd
that overflows to infinity, instead of the max normal number.
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
include/fpu/softfloat-types.h | 4 +++-
fpu/softfloat.c | 8 ++++++--
2 files changed, 9 insertions(+), 3 deletions(-)
diff --git a/include/fpu/softfloat-types.h b/include/fpu/softfloat-types.h
index 8a3f20fae9..3b757c3d6a 100644
--- a/include/fpu/softfloat-types.h
+++ b/include/fpu/softfloat-types.h
@@ -134,8 +134,10 @@ typedef enum __attribute__((__packed__)) {
float_round_up = 2,
float_round_to_zero = 3,
float_round_ties_away = 4,
- /* Not an IEEE rounding mode: round to the closest odd mantissa value */
+ /* Not an IEEE rounding mode: round to closest odd, overflow to max */
float_round_to_odd = 5,
+ /* Not an IEEE rounding mode: round to closest odd, overflow to inf */
+ float_round_to_odd_inf = 6,
} FloatRoundMode;
/*
diff --git a/fpu/softfloat.c b/fpu/softfloat.c
index 67cfa0fd82..76097679b0 100644
--- a/fpu/softfloat.c
+++ b/fpu/softfloat.c
@@ -694,13 +694,12 @@ static FloatParts round_canonical(FloatParts p, float_status *s,
switch (p.cls) {
case float_class_normal:
+ overflow_norm = false;
switch (s->float_rounding_mode) {
case float_round_nearest_even:
- overflow_norm = false;
inc = ((frac & roundeven_mask) != frac_lsbm1 ? frac_lsbm1 : 0);
break;
case float_round_ties_away:
- overflow_norm = false;
inc = frac_lsbm1;
break;
case float_round_to_zero:
@@ -717,6 +716,8 @@ static FloatParts round_canonical(FloatParts p, float_status *s,
break;
case float_round_to_odd:
overflow_norm = true;
+ /* fall through */
+ case float_round_to_odd_inf:
inc = frac & frac_lsb ? 0 : round_mask;
break;
default:
@@ -771,6 +772,7 @@ static FloatParts round_canonical(FloatParts p, float_status *s,
? frac_lsbm1 : 0);
break;
case float_round_to_odd:
+ case float_round_to_odd_inf:
inc = frac & frac_lsb ? 0 : round_mask;
break;
default:
@@ -6860,6 +6862,8 @@ float128 float128_round_to_int(float128 a, float_status *status)
case float_round_to_zero:
break;
+ default:
+ g_assert_not_reached();
}
return packFloat128( aSign, 0, 0, 0 );
}
--
2.25.1
^ permalink raw reply related [flat|nested] 30+ messages in thread
* Re: [PATCH v1 05/11] fpu: Add float_round_to_odd_inf
2021-04-16 23:59 ` [PATCH v1 05/11] fpu: Add float_round_to_odd_inf Richard Henderson
@ 2021-05-18 11:20 ` Peter Maydell
2021-05-18 14:24 ` Richard Henderson
0 siblings, 1 reply; 30+ messages in thread
From: Peter Maydell @ 2021-05-18 11:20 UTC (permalink / raw)
To: Richard Henderson; +Cc: qemu-arm, QEMU Developers
On Sat, 17 Apr 2021 at 01:02, Richard Henderson
<richard.henderson@linaro.org> wrote:
>
> For Arm BFDOT and BFMMLA, we need a version of round-to-odd
> that overflows to infinity, instead of the max normal number.
>
> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
> ---
> include/fpu/softfloat-types.h | 4 +++-
> fpu/softfloat.c | 8 ++++++--
> 2 files changed, 9 insertions(+), 3 deletions(-)
>
> diff --git a/include/fpu/softfloat-types.h b/include/fpu/softfloat-types.h
> index 8a3f20fae9..3b757c3d6a 100644
> --- a/include/fpu/softfloat-types.h
> +++ b/include/fpu/softfloat-types.h
> @@ -134,8 +134,10 @@ typedef enum __attribute__((__packed__)) {
> float_round_up = 2,
> float_round_to_zero = 3,
> float_round_ties_away = 4,
> - /* Not an IEEE rounding mode: round to the closest odd mantissa value */
> + /* Not an IEEE rounding mode: round to closest odd, overflow to max */
> float_round_to_odd = 5,
> + /* Not an IEEE rounding mode: round to closest odd, overflow to inf */
> + float_round_to_odd_inf = 6,
> } FloatRoundMode;
>
> /*
> diff --git a/fpu/softfloat.c b/fpu/softfloat.c
> index 67cfa0fd82..76097679b0 100644
> --- a/fpu/softfloat.c
> +++ b/fpu/softfloat.c
> @@ -694,13 +694,12 @@ static FloatParts round_canonical(FloatParts p, float_status *s,
>
> switch (p.cls) {
> case float_class_normal:
> + overflow_norm = false;
> switch (s->float_rounding_mode) {
> case float_round_nearest_even:
> - overflow_norm = false;
> inc = ((frac & roundeven_mask) != frac_lsbm1 ? frac_lsbm1 : 0);
> break;
> case float_round_ties_away:
> - overflow_norm = false;
> inc = frac_lsbm1;
> break;
> case float_round_to_zero:
> @@ -717,6 +716,8 @@ static FloatParts round_canonical(FloatParts p, float_status *s,
> break;
> case float_round_to_odd:
> overflow_norm = true;
> + /* fall through */
> + case float_round_to_odd_inf:
> inc = frac & frac_lsb ? 0 : round_mask;
> break;
> default:
> @@ -771,6 +772,7 @@ static FloatParts round_canonical(FloatParts p, float_status *s,
> ? frac_lsbm1 : 0);
> break;
> case float_round_to_odd:
> + case float_round_to_odd_inf:
> inc = frac & frac_lsb ? 0 : round_mask;
> break;
> default:
> @@ -6860,6 +6862,8 @@ float128 float128_round_to_int(float128 a, float_status *status)
>
> case float_round_to_zero:
> break;
> + default:
> + g_assert_not_reached();
> }
> return packFloat128( aSign, 0, 0, 0 );
> }
This code change looks OK as far as it goes, but there are a bunch
of other places in softfloat.c which switch on the float rounding mode.
If this rounding mode is only supported for a particular subset of
operations we should at least document that in the comment.
thanks
-- PMM
^ permalink raw reply [flat|nested] 30+ messages in thread
* Re: [PATCH v1 05/11] fpu: Add float_round_to_odd_inf
2021-05-18 11:20 ` Peter Maydell
@ 2021-05-18 14:24 ` Richard Henderson
0 siblings, 0 replies; 30+ messages in thread
From: Richard Henderson @ 2021-05-18 14:24 UTC (permalink / raw)
To: Peter Maydell; +Cc: qemu-arm, QEMU Developers
On 5/18/21 6:20 AM, Peter Maydell wrote:
> This code change looks OK as far as it goes, but there are a bunch
> of other places in softfloat.c which switch on the float rounding mode.
> If this rounding mode is only supported for a particular subset of
> operations we should at least document that in the comment.
Once the softfloat reorg is complete, there will be only one place to add this.
I didn't want to go overboard on that here, or necessarily depend on more
than just SVE2 for now.
r~
^ permalink raw reply [flat|nested] 30+ messages in thread
* [PATCH v1 06/11] target/arm: Implement bfloat16 dot product (vector)
2021-04-16 23:59 [PATCH v1 for-6.1 00/11] target/arm: Implement BFloat16 Richard Henderson
` (4 preceding siblings ...)
2021-04-16 23:59 ` [PATCH v1 05/11] fpu: Add float_round_to_odd_inf Richard Henderson
@ 2021-04-16 23:59 ` Richard Henderson
2021-05-18 12:15 ` Peter Maydell
2021-04-16 23:59 ` [PATCH v1 07/11] target/arm: Implement bfloat16 dot product (indexed) Richard Henderson
` (4 subsequent siblings)
10 siblings, 1 reply; 30+ messages in thread
From: Richard Henderson @ 2021-04-16 23:59 UTC (permalink / raw)
To: qemu-devel; +Cc: qemu-arm
This is BFDOT for both AArch64 AdvSIMD and SVE,
and VDOT.BF16 for AArch32 NEON.
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
target/arm/helper.h | 3 +++
target/arm/neon-shared.decode | 2 ++
target/arm/sve.decode | 3 +++
target/arm/translate-a64.c | 20 +++++++++++++++++
target/arm/translate-sve.c | 12 ++++++++++
target/arm/vec_helper.c | 40 +++++++++++++++++++++++++++++++++
target/arm/translate-neon.c.inc | 9 ++++++++
7 files changed, 89 insertions(+)
diff --git a/target/arm/helper.h b/target/arm/helper.h
index 0b52ee6256..eb4cb2b65b 100644
--- a/target/arm/helper.h
+++ b/target/arm/helper.h
@@ -1003,6 +1003,9 @@ DEF_HELPER_FLAGS_5(gvec_ummla_b, TCG_CALL_NO_RWG,
DEF_HELPER_FLAGS_5(gvec_usmmla_b, TCG_CALL_NO_RWG,
void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(gvec_bfdot, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, ptr, i32)
+
#ifdef TARGET_AARCH64
#include "helper-a64.h"
#include "helper-sve.h"
diff --git a/target/arm/neon-shared.decode b/target/arm/neon-shared.decode
index cc9f4cdd85..31a0839bbb 100644
--- a/target/arm/neon-shared.decode
+++ b/target/arm/neon-shared.decode
@@ -52,6 +52,8 @@ VUDOT 1111 110 00 . 10 .... .... 1101 . q:1 . 1 .... \
vm=%vm_dp vn=%vn_dp vd=%vd_dp
VUSDOT 1111 110 01 . 10 .... .... 1101 . q:1 . 0 .... \
vm=%vm_dp vn=%vn_dp vd=%vd_dp
+VDOT_b16 1111 110 00 . 00 .... .... 1101 . q:1 . 0 .... \
+ vm=%vm_dp vn=%vn_dp vd=%vd_dp
# VFM[AS]L
VFML 1111 110 0 s:1 . 10 .... .... 1000 . 0 . 1 .... \
diff --git a/target/arm/sve.decode b/target/arm/sve.decode
index bad81580c5..523140ca56 100644
--- a/target/arm/sve.decode
+++ b/target/arm/sve.decode
@@ -1576,6 +1576,9 @@ FMLALT_zzzw 01100100 10 1 ..... 10 0 00 1 ..... ..... @rda_rn_rm_e0
FMLSLB_zzzw 01100100 10 1 ..... 10 1 00 0 ..... ..... @rda_rn_rm_e0
FMLSLT_zzzw 01100100 10 1 ..... 10 1 00 1 ..... ..... @rda_rn_rm_e0
+### SVE2 floating-point bfloat16 dot-product
+BFDOT_zzzz 01100100 01 1 ..... 10 0 00 0 ..... ..... @rda_rn_rm_e0
+
### SVE2 floating-point multiply-add long (indexed)
FMLALB_zzxw 01100100 10 1 ..... 0100.0 ..... ..... @rrxr_3a esz=2
FMLALT_zzxw 01100100 10 1 ..... 0100.1 ..... ..... @rrxr_3a esz=2
diff --git a/target/arm/translate-a64.c b/target/arm/translate-a64.c
index c528fb2cf0..fc16e0a126 100644
--- a/target/arm/translate-a64.c
+++ b/target/arm/translate-a64.c
@@ -12243,6 +12243,16 @@ static void disas_simd_three_reg_same_extra(DisasContext *s, uint32_t insn)
}
feature = dc_isar_feature(aa64_fcma, s);
break;
+ case 0x1f: /* BFDOT */
+ switch (size) {
+ case 1:
+ feature = dc_isar_feature(aa64_bf16, s);
+ break;
+ default:
+ unallocated_encoding(s);
+ return;
+ }
+ break;
default:
unallocated_encoding(s);
return;
@@ -12326,6 +12336,16 @@ static void disas_simd_three_reg_same_extra(DisasContext *s, uint32_t insn)
}
return;
+ case 0xf: /* BFDOT */
+ switch (size) {
+ case 1:
+ gen_gvec_op4_ool(s, is_q, rd, rn, rm, rd, 0, gen_helper_gvec_bfdot);
+ break;
+ default:
+ g_assert_not_reached();
+ }
+ return;
+
default:
g_assert_not_reached();
}
diff --git a/target/arm/translate-sve.c b/target/arm/translate-sve.c
index aacbabd11e..3527430c1a 100644
--- a/target/arm/translate-sve.c
+++ b/target/arm/translate-sve.c
@@ -8586,3 +8586,15 @@ static bool trans_UMMLA(DisasContext *s, arg_rrrr_esz *a)
{
return do_i8mm_zzzz_ool(s, a, gen_helper_gvec_ummla_b, 0);
}
+
+static bool trans_BFDOT_zzzz(DisasContext *s, arg_rrrr_esz *a)
+{
+ if (!dc_isar_feature(aa64_sve_bf16, s)) {
+ return false;
+ }
+ if (sve_access_check(s)) {
+ gen_gvec_ool_zzzz(s, gen_helper_gvec_bfdot,
+ a->rd, a->rn, a->rm, a->ra, 0);
+ }
+ return true;
+}
diff --git a/target/arm/vec_helper.c b/target/arm/vec_helper.c
index 6c9f1e5146..e227ba6590 100644
--- a/target/arm/vec_helper.c
+++ b/target/arm/vec_helper.c
@@ -2655,3 +2655,43 @@ static void do_mmla_b(void *vd, void *vn, void *vm, void *va, uint32_t desc,
DO_MMLA_B(gvec_smmla_b, do_smmla_b)
DO_MMLA_B(gvec_ummla_b, do_ummla_b)
DO_MMLA_B(gvec_usmmla_b, do_usmmla_b)
+
+/*
+ * BFloat16 Dot Product
+ */
+
+static float32 bfdotadd(float32 sum, uint32_t e1, uint32_t e2)
+{
+ /* FPCR is ignored for BFDOT and BFMMLA. */
+ float_status bf_status = {
+ .tininess_before_rounding = float_tininess_before_rounding,
+ .float_rounding_mode = float_round_to_odd_inf,
+ .flush_to_zero = true,
+ .flush_inputs_to_zero = true,
+ .default_nan_mode = true,
+ };
+ float32 t1, t2;
+
+ /*
+ * Extract each BFloat16 from the element pair, and shift
+ * them such that they become float32.
+ */
+ t1 = float32_mul(e1 << 16, e2 << 16, &bf_status);
+ t2 = float32_mul(e1 & 0xffff0000u, e2 & 0xffff0000u, &bf_status);
+ t1 = float32_add(t1, t2, &bf_status);
+ t1 = float32_add(sum, t1, &bf_status);
+
+ return t1;
+}
+
+void HELPER(gvec_bfdot)(void *vd, void *vn, void *vm, void *va, uint32_t desc)
+{
+ intptr_t i, opr_sz = simd_oprsz(desc);
+ float32 *d = vd, *a = va;
+ uint32_t *n = vn, *m = vm;
+
+ for (i = 0; i < opr_sz / 4; ++i) {
+ d[i] = bfdotadd(a[i], n[i], m[i]);
+ }
+ clear_tail(d, opr_sz, simd_maxsz(desc));
+}
diff --git a/target/arm/translate-neon.c.inc b/target/arm/translate-neon.c.inc
index 8cc53892d6..aed8a565e0 100644
--- a/target/arm/translate-neon.c.inc
+++ b/target/arm/translate-neon.c.inc
@@ -287,6 +287,15 @@ static bool trans_VUSDOT(DisasContext *s, arg_VUSDOT *a)
gen_helper_gvec_usdot_b);
}
+static bool trans_VDOT_b16(DisasContext *s, arg_VDOT_b16 *a)
+{
+ if (!dc_isar_feature(aa32_bf16, s)) {
+ return false;
+ }
+ return do_neon_ddda(s, a->q * 7, a->vd, a->vn, a->vm, 0,
+ gen_helper_gvec_bfdot);
+}
+
static bool trans_VFML(DisasContext *s, arg_VFML *a)
{
int opr_sz;
--
2.25.1
^ permalink raw reply related [flat|nested] 30+ messages in thread
* Re: [PATCH v1 06/11] target/arm: Implement bfloat16 dot product (vector)
2021-04-16 23:59 ` [PATCH v1 06/11] target/arm: Implement bfloat16 dot product (vector) Richard Henderson
@ 2021-05-18 12:15 ` Peter Maydell
2021-05-18 14:27 ` Richard Henderson
0 siblings, 1 reply; 30+ messages in thread
From: Peter Maydell @ 2021-05-18 12:15 UTC (permalink / raw)
To: Richard Henderson; +Cc: qemu-arm, QEMU Developers
On Sat, 17 Apr 2021 at 01:02, Richard Henderson
<richard.henderson@linaro.org> wrote:
>
> This is BFDOT for both AArch64 AdvSIMD and SVE,
> and VDOT.BF16 for AArch32 NEON.
>
> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
> @@ -12326,6 +12336,16 @@ static void disas_simd_three_reg_same_extra(DisasContext *s, uint32_t insn)
> }
> return;
>
> + case 0xf: /* BFDOT */
> + switch (size) {
> + case 1:
> + gen_gvec_op4_ool(s, is_q, rd, rn, rm, rd, 0, gen_helper_gvec_bfdot);
> + break;
> + default:
> + g_assert_not_reached();
> + }
> + return;
The switch on size here seems unnecessary to me given we've already
decoded size earlier in the function.
> +
> default:
> g_assert_not_reached();
> }
> diff --git a/target/arm/translate-neon.c.inc b/target/arm/translate-neon.c.inc
> index 8cc53892d6..aed8a565e0 100644
> --- a/target/arm/translate-neon.c.inc
> +++ b/target/arm/translate-neon.c.inc
> @@ -287,6 +287,15 @@ static bool trans_VUSDOT(DisasContext *s, arg_VUSDOT *a)
> gen_helper_gvec_usdot_b);
> }
>
> +static bool trans_VDOT_b16(DisasContext *s, arg_VDOT_b16 *a)
> +{
> + if (!dc_isar_feature(aa32_bf16, s)) {
> + return false;
> + }
Again, not sure if we need a FEATURE_NEON check ?
> + return do_neon_ddda(s, a->q * 7, a->vd, a->vn, a->vm, 0,
> + gen_helper_gvec_bfdot);
> +}
> +
> static bool trans_VFML(DisasContext *s, arg_VFML *a)
> {
> int opr_sz;
Otherwise
Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
thanks
-- PMM
^ permalink raw reply [flat|nested] 30+ messages in thread
* Re: [PATCH v1 06/11] target/arm: Implement bfloat16 dot product (vector)
2021-05-18 12:15 ` Peter Maydell
@ 2021-05-18 14:27 ` Richard Henderson
0 siblings, 0 replies; 30+ messages in thread
From: Richard Henderson @ 2021-05-18 14:27 UTC (permalink / raw)
To: Peter Maydell; +Cc: qemu-arm, QEMU Developers
On 5/18/21 7:15 AM, Peter Maydell wrote:
> On Sat, 17 Apr 2021 at 01:02, Richard Henderson
> <richard.henderson@linaro.org> wrote:
>>
>> This is BFDOT for both AArch64 AdvSIMD and SVE,
>> and VDOT.BF16 for AArch32 NEON.
>>
>> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
>
>> @@ -12326,6 +12336,16 @@ static void disas_simd_three_reg_same_extra(DisasContext *s, uint32_t insn)
>> }
>> return;
>>
>> + case 0xf: /* BFDOT */
>> + switch (size) {
>> + case 1:
>> + gen_gvec_op4_ool(s, is_q, rd, rn, rm, rd, 0, gen_helper_gvec_bfdot);
>> + break;
>> + default:
>> + g_assert_not_reached();
>> + }
>> + return;
>
> The switch on size here seems unnecessary to me given we've already
> decoded size earlier in the function.
Size is opcode here, and the switch gains extra members as we continue. I'm
establishing here the form that we need to continue with.
r~
^ permalink raw reply [flat|nested] 30+ messages in thread
* [PATCH v1 07/11] target/arm: Implement bfloat16 dot product (indexed)
2021-04-16 23:59 [PATCH v1 for-6.1 00/11] target/arm: Implement BFloat16 Richard Henderson
` (5 preceding siblings ...)
2021-04-16 23:59 ` [PATCH v1 06/11] target/arm: Implement bfloat16 dot product (vector) Richard Henderson
@ 2021-04-16 23:59 ` Richard Henderson
2021-05-18 12:24 ` Peter Maydell
2021-04-16 23:59 ` [PATCH v1 08/11] target/arm: Implement bfloat16 matrix multiply accumulate Richard Henderson
` (3 subsequent siblings)
10 siblings, 1 reply; 30+ messages in thread
From: Richard Henderson @ 2021-04-16 23:59 UTC (permalink / raw)
To: qemu-devel; +Cc: qemu-arm
This is BFDOT for both AArch64 AdvSIMD and SVE,
and VDOT.BF16 for AArch32 NEON.
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
target/arm/helper.h | 2 ++
target/arm/neon-shared.decode | 2 ++
target/arm/sve.decode | 3 +++
target/arm/translate-a64.c | 41 +++++++++++++++++++++++++--------
target/arm/translate-sve.c | 12 ++++++++++
target/arm/vec_helper.c | 20 ++++++++++++++++
target/arm/translate-neon.c.inc | 9 ++++++++
7 files changed, 80 insertions(+), 9 deletions(-)
diff --git a/target/arm/helper.h b/target/arm/helper.h
index eb4cb2b65b..af0ee8f693 100644
--- a/target/arm/helper.h
+++ b/target/arm/helper.h
@@ -1005,6 +1005,8 @@ DEF_HELPER_FLAGS_5(gvec_usmmla_b, TCG_CALL_NO_RWG,
DEF_HELPER_FLAGS_5(gvec_bfdot, TCG_CALL_NO_RWG,
void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(gvec_bfdot_idx, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, ptr, i32)
#ifdef TARGET_AARCH64
#include "helper-a64.h"
diff --git a/target/arm/neon-shared.decode b/target/arm/neon-shared.decode
index 31a0839bbb..fa3cf14e3a 100644
--- a/target/arm/neon-shared.decode
+++ b/target/arm/neon-shared.decode
@@ -81,6 +81,8 @@ VUSDOT_scalar 1111 1110 1 . 00 .... .... 1101 . q:1 index:1 0 vm:4 \
vn=%vn_dp vd=%vd_dp
VSUDOT_scalar 1111 1110 1 . 00 .... .... 1101 . q:1 index:1 1 vm:4 \
vn=%vn_dp vd=%vd_dp
+VDOT_b16_scal 1111 1110 0 . 00 .... .... 1101 . q:1 index:1 0 vm:4 \
+ vn=%vn_dp vd=%vd_dp
%vfml_scalar_q0_rm 0:3 5:1
%vfml_scalar_q1_index 5:1 3:1
diff --git a/target/arm/sve.decode b/target/arm/sve.decode
index 523140ca56..d5e1e5d400 100644
--- a/target/arm/sve.decode
+++ b/target/arm/sve.decode
@@ -1584,3 +1584,6 @@ FMLALB_zzxw 01100100 10 1 ..... 0100.0 ..... ..... @rrxr_3a esz=2
FMLALT_zzxw 01100100 10 1 ..... 0100.1 ..... ..... @rrxr_3a esz=2
FMLSLB_zzxw 01100100 10 1 ..... 0110.0 ..... ..... @rrxr_3a esz=2
FMLSLT_zzxw 01100100 10 1 ..... 0110.1 ..... ..... @rrxr_3a esz=2
+
+### SVE2 floating-point bfloat16 dot-product (indexed)
+BFDOT_zzxz 01100100 01 1 ..... 010000 ..... ..... @rrxr_2 esz=2
diff --git a/target/arm/translate-a64.c b/target/arm/translate-a64.c
index fc16e0a126..f60afbbd06 100644
--- a/target/arm/translate-a64.c
+++ b/target/arm/translate-a64.c
@@ -13457,8 +13457,22 @@ static void disas_simd_indexed(DisasContext *s, uint32_t insn)
return;
}
break;
- case 0x0f: /* SUDOT, USDOT */
- if (is_scalar || (size & 1) || !dc_isar_feature(aa64_i8mm, s)) {
+ case 0x0f:
+ switch (size) {
+ case 0: /* SUDOT */
+ case 2: /* USDOT */
+ if (is_scalar || !dc_isar_feature(aa64_i8mm, s)) {
+ unallocated_encoding(s);
+ return;
+ }
+ break;
+ case 1: /* BFDOT */
+ if (is_scalar || !dc_isar_feature(aa64_bf16, s)) {
+ unallocated_encoding(s);
+ return;
+ }
+ break;
+ default:
unallocated_encoding(s);
return;
}
@@ -13578,13 +13592,22 @@ static void disas_simd_indexed(DisasContext *s, uint32_t insn)
u ? gen_helper_gvec_udot_idx_b
: gen_helper_gvec_sdot_idx_b);
return;
- case 0x0f: /* SUDOT, USDOT */
- gen_gvec_op4_ool(s, is_q, rd, rn, rm, rd, index,
- extract32(insn, 23, 1)
- ? gen_helper_gvec_usdot_idx_b
- : gen_helper_gvec_sudot_idx_b);
- return;
-
+ case 0x0f:
+ switch (extract32(insn, 22, 2)) {
+ case 0: /* SUDOT */
+ gen_gvec_op4_ool(s, is_q, rd, rn, rm, rd, index,
+ gen_helper_gvec_sudot_idx_b);
+ return;
+ case 1: /* BFDOT */
+ gen_gvec_op4_ool(s, is_q, rd, rn, rm, rd, index,
+ gen_helper_gvec_bfdot_idx);
+ return;
+ case 2: /* USDOT */
+ gen_gvec_op4_ool(s, is_q, rd, rn, rm, rd, index,
+ gen_helper_gvec_usdot_idx_b);
+ return;
+ }
+ g_assert_not_reached();
case 0x11: /* FCMLA #0 */
case 0x13: /* FCMLA #90 */
case 0x15: /* FCMLA #180 */
diff --git a/target/arm/translate-sve.c b/target/arm/translate-sve.c
index 3527430c1a..ef6828c632 100644
--- a/target/arm/translate-sve.c
+++ b/target/arm/translate-sve.c
@@ -8598,3 +8598,15 @@ static bool trans_BFDOT_zzzz(DisasContext *s, arg_rrrr_esz *a)
}
return true;
}
+
+static bool trans_BFDOT_zzxz(DisasContext *s, arg_rrxr_esz *a)
+{
+ if (!dc_isar_feature(aa64_sve_bf16, s)) {
+ return false;
+ }
+ if (sve_access_check(s)) {
+ gen_gvec_ool_zzzz(s, gen_helper_gvec_bfdot_idx,
+ a->rd, a->rn, a->rm, a->ra, a->index);
+ }
+ return true;
+}
diff --git a/target/arm/vec_helper.c b/target/arm/vec_helper.c
index e227ba6590..3e26fb0e5f 100644
--- a/target/arm/vec_helper.c
+++ b/target/arm/vec_helper.c
@@ -2695,3 +2695,23 @@ void HELPER(gvec_bfdot)(void *vd, void *vn, void *vm, void *va, uint32_t desc)
}
clear_tail(d, opr_sz, simd_maxsz(desc));
}
+
+void HELPER(gvec_bfdot_idx)(void *vd, void *vn, void *vm,
+ void *va, uint32_t desc)
+{
+ intptr_t i, j, opr_sz = simd_oprsz(desc);
+ intptr_t index = simd_data(desc);
+ intptr_t elements = opr_sz / 4;
+ intptr_t eltspersegment = MIN(16 / 4, elements);
+ float32 *d = vd, *a = va;
+ uint32_t *n = vn, *m = vm;
+
+ for (i = 0; i < elements; i += eltspersegment) {
+ uint32_t m_idx = m[i + H4(index)];
+
+ for (j = i; j < i + eltspersegment; j++) {
+ d[j] = bfdotadd(a[j], n[j], m_idx);
+ }
+ }
+ clear_tail(d, opr_sz, simd_maxsz(desc));
+}
diff --git a/target/arm/translate-neon.c.inc b/target/arm/translate-neon.c.inc
index aed8a565e0..bb0adf4756 100644
--- a/target/arm/translate-neon.c.inc
+++ b/target/arm/translate-neon.c.inc
@@ -381,6 +381,15 @@ static bool trans_VSUDOT_scalar(DisasContext *s, arg_VSUDOT_scalar *a)
gen_helper_gvec_sudot_idx_b);
}
+static bool trans_VDOT_b16_scal(DisasContext *s, arg_VDOT_b16_scal *a)
+{
+ if (!dc_isar_feature(aa32_bf16, s)) {
+ return false;
+ }
+ return do_neon_ddda(s, a->q * 6, a->vd, a->vn, a->vm, a->index,
+ gen_helper_gvec_bfdot_idx);
+}
+
static bool trans_VFML_scalar(DisasContext *s, arg_VFML_scalar *a)
{
int opr_sz;
--
2.25.1
^ permalink raw reply related [flat|nested] 30+ messages in thread
* Re: [PATCH v1 07/11] target/arm: Implement bfloat16 dot product (indexed)
2021-04-16 23:59 ` [PATCH v1 07/11] target/arm: Implement bfloat16 dot product (indexed) Richard Henderson
@ 2021-05-18 12:24 ` Peter Maydell
2021-05-18 14:38 ` Richard Henderson
0 siblings, 1 reply; 30+ messages in thread
From: Peter Maydell @ 2021-05-18 12:24 UTC (permalink / raw)
To: Richard Henderson; +Cc: qemu-arm, QEMU Developers
On Sat, 17 Apr 2021 at 01:06, Richard Henderson
<richard.henderson@linaro.org> wrote:
>
> This is BFDOT for both AArch64 AdvSIMD and SVE,
> and VDOT.BF16 for AArch32 NEON.
>
> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
> ---
> target/arm/helper.h | 2 ++
> target/arm/neon-shared.decode | 2 ++
> target/arm/sve.decode | 3 +++
> target/arm/translate-a64.c | 41 +++++++++++++++++++++++++--------
> target/arm/translate-sve.c | 12 ++++++++++
> target/arm/vec_helper.c | 20 ++++++++++++++++
> target/arm/translate-neon.c.inc | 9 ++++++++
> 7 files changed, 80 insertions(+), 9 deletions(-)
>
> @@ -13578,13 +13592,22 @@ static void disas_simd_indexed(DisasContext *s, uint32_t insn)
> u ? gen_helper_gvec_udot_idx_b
> : gen_helper_gvec_sdot_idx_b);
> return;
> - case 0x0f: /* SUDOT, USDOT */
> - gen_gvec_op4_ool(s, is_q, rd, rn, rm, rd, index,
> - extract32(insn, 23, 1)
> - ? gen_helper_gvec_usdot_idx_b
> - : gen_helper_gvec_sudot_idx_b);
> - return;
> -
> + case 0x0f:
> + switch (extract32(insn, 22, 2)) {
You already have bits [23:22] in 'size' at this point, I think.
> + case 0: /* SUDOT */
> + gen_gvec_op4_ool(s, is_q, rd, rn, rm, rd, index,
> + gen_helper_gvec_sudot_idx_b);
> + return;
> + case 1: /* BFDOT */
> + gen_gvec_op4_ool(s, is_q, rd, rn, rm, rd, index,
> + gen_helper_gvec_bfdot_idx);
> + return;
> + case 2: /* USDOT */
> + gen_gvec_op4_ool(s, is_q, rd, rn, rm, rd, index,
> + gen_helper_gvec_usdot_idx_b);
> + return;
> + }
> + g_assert_not_reached();
otherwise
Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
thanks
-- PMM
^ permalink raw reply [flat|nested] 30+ messages in thread
* Re: [PATCH v1 07/11] target/arm: Implement bfloat16 dot product (indexed)
2021-05-18 12:24 ` Peter Maydell
@ 2021-05-18 14:38 ` Richard Henderson
0 siblings, 0 replies; 30+ messages in thread
From: Richard Henderson @ 2021-05-18 14:38 UTC (permalink / raw)
To: Peter Maydell; +Cc: qemu-arm, QEMU Developers
On 5/18/21 7:24 AM, Peter Maydell wrote:
>> - case 0x0f: /* SUDOT, USDOT */
>> - gen_gvec_op4_ool(s, is_q, rd, rn, rm, rd, index,
>> - extract32(insn, 23, 1)
>> - ? gen_helper_gvec_usdot_idx_b
>> - : gen_helper_gvec_sudot_idx_b);
>> - return;
>> -
>> + case 0x0f:
>> + switch (extract32(insn, 22, 2)) {
>
> You already have bits [23:22] in 'size' at this point, I think.
Irritatingly not. For 0xf [23:22] is opcode, and size has been squashed to
MO_32 in order to get the indexing correct in the middle of this function.
It's just outside the patch context here. Later it will be moved inside a
switch for BFMLAL.
One of the many ways in which decodetree would be better here.
r~
^ permalink raw reply [flat|nested] 30+ messages in thread
* [PATCH v1 08/11] target/arm: Implement bfloat16 matrix multiply accumulate
2021-04-16 23:59 [PATCH v1 for-6.1 00/11] target/arm: Implement BFloat16 Richard Henderson
` (6 preceding siblings ...)
2021-04-16 23:59 ` [PATCH v1 07/11] target/arm: Implement bfloat16 dot product (indexed) Richard Henderson
@ 2021-04-16 23:59 ` Richard Henderson
2021-05-18 12:37 ` Peter Maydell
2021-04-16 23:59 ` [PATCH v1 09/11] target/arm: Implement bfloat widening fma (vector) Richard Henderson
` (2 subsequent siblings)
10 siblings, 1 reply; 30+ messages in thread
From: Richard Henderson @ 2021-04-16 23:59 UTC (permalink / raw)
To: qemu-devel; +Cc: qemu-arm
This is BFMMLA for both AArch64 AdvSIMD and SVE,
and VMMLA.BF16 for AArch32 NEON.
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
target/arm/helper.h | 3 +++
target/arm/neon-shared.decode | 2 ++
target/arm/sve.decode | 6 +++--
target/arm/translate-a64.c | 10 +++++++++
target/arm/translate-sve.c | 12 ++++++++++
target/arm/vec_helper.c | 40 +++++++++++++++++++++++++++++++++
target/arm/translate-neon.c.inc | 9 ++++++++
7 files changed, 80 insertions(+), 2 deletions(-)
diff --git a/target/arm/helper.h b/target/arm/helper.h
index af0ee8f693..74f8bc766f 100644
--- a/target/arm/helper.h
+++ b/target/arm/helper.h
@@ -1008,6 +1008,9 @@ DEF_HELPER_FLAGS_5(gvec_bfdot, TCG_CALL_NO_RWG,
DEF_HELPER_FLAGS_5(gvec_bfdot_idx, TCG_CALL_NO_RWG,
void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(gvec_bfmmla, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, ptr, i32)
+
#ifdef TARGET_AARCH64
#include "helper-a64.h"
#include "helper-sve.h"
diff --git a/target/arm/neon-shared.decode b/target/arm/neon-shared.decode
index fa3cf14e3a..4e0a25d27c 100644
--- a/target/arm/neon-shared.decode
+++ b/target/arm/neon-shared.decode
@@ -67,6 +67,8 @@ VUMMLA 1111 1100 0.10 .... .... 1100 .1.1 .... \
vm=%vm_dp vn=%vn_dp vd=%vd_dp
VUSMMLA 1111 1100 1.10 .... .... 1100 .1.0 .... \
vm=%vm_dp vn=%vn_dp vd=%vd_dp
+VMMLA_b16 1111 1100 0.00 .... .... 1100 .1.0 .... \
+ vm=%vm_dp vn=%vn_dp vd=%vd_dp
VCMLA_scalar 1111 1110 0 . rot:2 .... .... 1000 . q:1 index:1 0 vm:4 \
vn=%vn_dp vd=%vd_dp size=1
diff --git a/target/arm/sve.decode b/target/arm/sve.decode
index d5e1e5d400..aa8d5e4b8f 100644
--- a/target/arm/sve.decode
+++ b/target/arm/sve.decode
@@ -1519,8 +1519,10 @@ SQRDCMLAH_zzzz 01000100 esz:2 0 rm:5 0011 rot:2 rn:5 rd:5 ra=%reg_movprfx
USDOT_zzzz 01000100 .. 0 ..... 011 110 ..... ..... @rda_rn_rm
### SVE2 floating point matrix multiply accumulate
-
-FMMLA 01100100 .. 1 ..... 111001 ..... ..... @rda_rn_rm
+{
+ BFMMLA 01100100 01 1 ..... 111 001 ..... ..... @rda_rn_rm_e0
+ FMMLA 01100100 .. 1 ..... 111 001 ..... ..... @rda_rn_rm
+}
### SVE2 Memory Gather Load Group
diff --git a/target/arm/translate-a64.c b/target/arm/translate-a64.c
index f60afbbd06..8636eac4a8 100644
--- a/target/arm/translate-a64.c
+++ b/target/arm/translate-a64.c
@@ -12243,6 +12243,13 @@ static void disas_simd_three_reg_same_extra(DisasContext *s, uint32_t insn)
}
feature = dc_isar_feature(aa64_fcma, s);
break;
+ case 0x1d: /* BFMMLA */
+ if (size != MO_16 || !is_q) {
+ unallocated_encoding(s);
+ return;
+ }
+ feature = dc_isar_feature(aa64_bf16, s);
+ break;
case 0x1f: /* BFDOT */
switch (size) {
case 1:
@@ -12336,6 +12343,9 @@ static void disas_simd_three_reg_same_extra(DisasContext *s, uint32_t insn)
}
return;
+ case 0xd: /* BFMMLA */
+ gen_gvec_op4_ool(s, is_q, rd, rn, rm, rd, 0, gen_helper_gvec_bfmmla);
+ return;
case 0xf: /* BFDOT */
switch (size) {
case 1:
diff --git a/target/arm/translate-sve.c b/target/arm/translate-sve.c
index ef6828c632..9ade521705 100644
--- a/target/arm/translate-sve.c
+++ b/target/arm/translate-sve.c
@@ -8610,3 +8610,15 @@ static bool trans_BFDOT_zzxz(DisasContext *s, arg_rrxr_esz *a)
}
return true;
}
+
+static bool trans_BFMMLA(DisasContext *s, arg_rrrr_esz *a)
+{
+ if (!dc_isar_feature(aa64_sve_bf16, s)) {
+ return false;
+ }
+ if (sve_access_check(s)) {
+ gen_gvec_ool_zzzz(s, gen_helper_gvec_bfmmla,
+ a->rd, a->rn, a->rm, a->ra, 0);
+ }
+ return true;
+}
diff --git a/target/arm/vec_helper.c b/target/arm/vec_helper.c
index 3e26fb0e5f..623a0872f3 100644
--- a/target/arm/vec_helper.c
+++ b/target/arm/vec_helper.c
@@ -2715,3 +2715,43 @@ void HELPER(gvec_bfdot_idx)(void *vd, void *vn, void *vm,
}
clear_tail(d, opr_sz, simd_maxsz(desc));
}
+
+void HELPER(gvec_bfmmla)(void *vd, void *vn, void *vm, void *va, uint32_t desc)
+{
+ intptr_t s, opr_sz = simd_oprsz(desc);
+ float32 *d = vd, *a = va;
+ uint32_t *n = vn, *m = vm;
+
+ for (s = 0; s < opr_sz / 4; s += 4) {
+ float32 sum00, sum01, sum10, sum11;
+
+ /*
+ * Process the entire segment at once, writing back the
+ * results only after we've consumed all of the inputs.
+ *
+ * Key to indicies by column:
+ * i j i k j k
+ */
+ sum00 = a[s + H4(0 + 0)];
+ sum00 = bfdotadd(sum00, n[s + H4(0 + 0)], m[s + H4(0 + 0)]);
+ sum00 = bfdotadd(sum00, n[s + H4(0 + 1)], m[s + H4(0 + 1)]);
+
+ sum01 = a[s + H4(0 + 1)];
+ sum01 = bfdotadd(sum01, n[s + H4(0 + 0)], m[s + H4(2 + 0)]);
+ sum01 = bfdotadd(sum01, n[s + H4(0 + 1)], m[s + H4(2 + 1)]);
+
+ sum10 = a[s + H4(2 + 0)];
+ sum10 = bfdotadd(sum10, n[s + H4(2 + 0)], m[s + H4(0 + 0)]);
+ sum10 = bfdotadd(sum10, n[s + H4(2 + 1)], m[s + H4(0 + 1)]);
+
+ sum11 = a[s + H4(2 + 1)];
+ sum11 = bfdotadd(sum11, n[s + H4(2 + 0)], m[s + H4(2 + 0)]);
+ sum11 = bfdotadd(sum11, n[s + H4(2 + 1)], m[s + H4(2 + 1)]);
+
+ d[s + H4(0 + 0)] = sum00;
+ d[s + H4(0 + 1)] = sum01;
+ d[s + H4(2 + 0)] = sum10;
+ d[s + H4(2 + 1)] = sum11;
+ }
+ clear_tail(d, opr_sz, simd_maxsz(desc));
+}
diff --git a/target/arm/translate-neon.c.inc b/target/arm/translate-neon.c.inc
index bb0adf4756..7ce65f691f 100644
--- a/target/arm/translate-neon.c.inc
+++ b/target/arm/translate-neon.c.inc
@@ -4117,3 +4117,12 @@ static bool trans_VUSMMLA(DisasContext *s, arg_VUSMMLA *a)
return do_neon_ddda(s, 7, a->vd, a->vn, a->vm, 0,
gen_helper_gvec_usmmla_b);
}
+
+static bool trans_VMMLA_b16(DisasContext *s, arg_VMMLA_b16 *a)
+{
+ if (!dc_isar_feature(aa32_bf16, s)) {
+ return false;
+ }
+ return do_neon_ddda(s, 7, a->vd, a->vn, a->vm, 0,
+ gen_helper_gvec_bfmmla);
+}
--
2.25.1
^ permalink raw reply related [flat|nested] 30+ messages in thread
* Re: [PATCH v1 08/11] target/arm: Implement bfloat16 matrix multiply accumulate
2021-04-16 23:59 ` [PATCH v1 08/11] target/arm: Implement bfloat16 matrix multiply accumulate Richard Henderson
@ 2021-05-18 12:37 ` Peter Maydell
2021-05-18 14:45 ` Richard Henderson
0 siblings, 1 reply; 30+ messages in thread
From: Peter Maydell @ 2021-05-18 12:37 UTC (permalink / raw)
To: Richard Henderson; +Cc: qemu-arm, QEMU Developers
On Sat, 17 Apr 2021 at 01:00, Richard Henderson
<richard.henderson@linaro.org> wrote:
>
> This is BFMMLA for both AArch64 AdvSIMD and SVE,
> and VMMLA.BF16 for AArch32 NEON.
>
> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
> +void HELPER(gvec_bfmmla)(void *vd, void *vn, void *vm, void *va, uint32_t desc)
> +{
> + intptr_t s, opr_sz = simd_oprsz(desc);
> + float32 *d = vd, *a = va;
> + uint32_t *n = vn, *m = vm;
> +
> + for (s = 0; s < opr_sz / 4; s += 4) {
> + float32 sum00, sum01, sum10, sum11;
> +
> + /*
> + * Process the entire segment at once, writing back the
> + * results only after we've consumed all of the inputs.
> + *
> + * Key to indicies by column:
"indices"
> + * i j i k j k
> + */
> + sum00 = a[s + H4(0 + 0)];
> + sum00 = bfdotadd(sum00, n[s + H4(0 + 0)], m[s + H4(0 + 0)]);
> + sum00 = bfdotadd(sum00, n[s + H4(0 + 1)], m[s + H4(0 + 1)]);
I can't make these indices match up with the arm arm pseudocode ones,
which index by "4*i + 2*k + 0" and "4*i + 2*k + 1", not "2*i + k";
are we hiding a division by 2 somewhere?
> +
> + sum01 = a[s + H4(0 + 1)];
> + sum01 = bfdotadd(sum01, n[s + H4(0 + 0)], m[s + H4(2 + 0)]);
> + sum01 = bfdotadd(sum01, n[s + H4(0 + 1)], m[s + H4(2 + 1)]);
> +
> + sum10 = a[s + H4(2 + 0)];
> + sum10 = bfdotadd(sum10, n[s + H4(2 + 0)], m[s + H4(0 + 0)]);
> + sum10 = bfdotadd(sum10, n[s + H4(2 + 1)], m[s + H4(0 + 1)]);
> +
> + sum11 = a[s + H4(2 + 1)];
> + sum11 = bfdotadd(sum11, n[s + H4(2 + 0)], m[s + H4(2 + 0)]);
> + sum11 = bfdotadd(sum11, n[s + H4(2 + 1)], m[s + H4(2 + 1)]);
> +
> + d[s + H4(0 + 0)] = sum00;
> + d[s + H4(0 + 1)] = sum01;
> + d[s + H4(2 + 0)] = sum10;
> + d[s + H4(2 + 1)] = sum11;
> + }
> + clear_tail(d, opr_sz, simd_maxsz(desc));
Otherwise
Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
thanks
-- PMM
^ permalink raw reply [flat|nested] 30+ messages in thread
* Re: [PATCH v1 08/11] target/arm: Implement bfloat16 matrix multiply accumulate
2021-05-18 12:37 ` Peter Maydell
@ 2021-05-18 14:45 ` Richard Henderson
0 siblings, 0 replies; 30+ messages in thread
From: Richard Henderson @ 2021-05-18 14:45 UTC (permalink / raw)
To: Peter Maydell; +Cc: qemu-arm, QEMU Developers
On 5/18/21 7:37 AM, Peter Maydell wrote:
> On Sat, 17 Apr 2021 at 01:00, Richard Henderson
> <richard.henderson@linaro.org> wrote:
>>
>> This is BFMMLA for both AArch64 AdvSIMD and SVE,
>> and VMMLA.BF16 for AArch32 NEON.
>>
>> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
>
>> +void HELPER(gvec_bfmmla)(void *vd, void *vn, void *vm, void *va, uint32_t desc)
>> +{
>> + intptr_t s, opr_sz = simd_oprsz(desc);
>> + float32 *d = vd, *a = va;
>> + uint32_t *n = vn, *m = vm;
>> +
>> + for (s = 0; s < opr_sz / 4; s += 4) {
>> + float32 sum00, sum01, sum10, sum11;
>> +
>> + /*
>> + * Process the entire segment at once, writing back the
>> + * results only after we've consumed all of the inputs.
>> + *
>> + * Key to indicies by column:
>
> "indices"
>
>> + * i j i k j k
>> + */
>> + sum00 = a[s + H4(0 + 0)];
>> + sum00 = bfdotadd(sum00, n[s + H4(0 + 0)], m[s + H4(0 + 0)]);
>> + sum00 = bfdotadd(sum00, n[s + H4(0 + 1)], m[s + H4(0 + 1)]);
>
> I can't make these indices match up with the arm arm pseudocode ones,
> which index by "4*i + 2*k + 0" and "4*i + 2*k + 1", not "2*i + k";
> are we hiding a division by 2 somewhere?
Yes. We're passing BFloat16 pairs via uint32_t[] to bfdotadd().
r~
^ permalink raw reply [flat|nested] 30+ messages in thread
* [PATCH v1 09/11] target/arm: Implement bfloat widening fma (vector)
2021-04-16 23:59 [PATCH v1 for-6.1 00/11] target/arm: Implement BFloat16 Richard Henderson
` (7 preceding siblings ...)
2021-04-16 23:59 ` [PATCH v1 08/11] target/arm: Implement bfloat16 matrix multiply accumulate Richard Henderson
@ 2021-04-16 23:59 ` Richard Henderson
2021-05-18 12:42 ` Peter Maydell
2021-04-16 23:59 ` [PATCH v1 10/11] target/arm: Implement bfloat widening fma (indexed) Richard Henderson
2021-04-16 23:59 ` [PATCH v1 11/11] target/arm: Enable BFloat16 extensions Richard Henderson
10 siblings, 1 reply; 30+ messages in thread
From: Richard Henderson @ 2021-04-16 23:59 UTC (permalink / raw)
To: qemu-devel; +Cc: qemu-arm
This is BFMLAL{B,T} for both AArch64 AdvSIMD and SVE,
and VFMA{B,T}.BF16 for AArch32 NEON.
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
target/arm/helper.h | 3 +++
target/arm/neon-shared.decode | 3 +++
target/arm/sve.decode | 3 +++
target/arm/translate-a64.c | 13 +++++++++----
target/arm/translate-sve.c | 30 ++++++++++++++++++++++++++++++
target/arm/vec_helper.c | 16 ++++++++++++++++
target/arm/translate-neon.c.inc | 9 +++++++++
7 files changed, 73 insertions(+), 4 deletions(-)
diff --git a/target/arm/helper.h b/target/arm/helper.h
index 74f8bc766f..2c6f0cecfa 100644
--- a/target/arm/helper.h
+++ b/target/arm/helper.h
@@ -1011,6 +1011,9 @@ DEF_HELPER_FLAGS_5(gvec_bfdot_idx, TCG_CALL_NO_RWG,
DEF_HELPER_FLAGS_5(gvec_bfmmla, TCG_CALL_NO_RWG,
void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_6(gvec_bfmlal, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, ptr, ptr, i32)
+
#ifdef TARGET_AARCH64
#include "helper-a64.h"
#include "helper-sve.h"
diff --git a/target/arm/neon-shared.decode b/target/arm/neon-shared.decode
index 4e0a25d27c..b61addd98b 100644
--- a/target/arm/neon-shared.decode
+++ b/target/arm/neon-shared.decode
@@ -70,6 +70,9 @@ VUSMMLA 1111 1100 1.10 .... .... 1100 .1.0 .... \
VMMLA_b16 1111 1100 0.00 .... .... 1100 .1.0 .... \
vm=%vm_dp vn=%vn_dp vd=%vd_dp
+VFMA_b16 1111 110 0 0.11 .... .... 1000 . q:1 . 1 .... \
+ vm=%vm_dp vn=%vn_dp vd=%vd_dp
+
VCMLA_scalar 1111 1110 0 . rot:2 .... .... 1000 . q:1 index:1 0 vm:4 \
vn=%vn_dp vd=%vd_dp size=1
VCMLA_scalar 1111 1110 1 . rot:2 .... .... 1000 . q:1 . 0 .... \
diff --git a/target/arm/sve.decode b/target/arm/sve.decode
index aa8d5e4b8f..322bef24cf 100644
--- a/target/arm/sve.decode
+++ b/target/arm/sve.decode
@@ -1578,6 +1578,9 @@ FMLALT_zzzw 01100100 10 1 ..... 10 0 00 1 ..... ..... @rda_rn_rm_e0
FMLSLB_zzzw 01100100 10 1 ..... 10 1 00 0 ..... ..... @rda_rn_rm_e0
FMLSLT_zzzw 01100100 10 1 ..... 10 1 00 1 ..... ..... @rda_rn_rm_e0
+BFMLALB_zzzw 01100100 11 1 ..... 10 0 00 0 ..... ..... @rda_rn_rm_e0
+BFMLALT_zzzw 01100100 11 1 ..... 10 0 00 1 ..... ..... @rda_rn_rm_e0
+
### SVE2 floating-point bfloat16 dot-product
BFDOT_zzzz 01100100 01 1 ..... 10 0 00 0 ..... ..... @rda_rn_rm_e0
diff --git a/target/arm/translate-a64.c b/target/arm/translate-a64.c
index 8636eac4a8..74794e3da3 100644
--- a/target/arm/translate-a64.c
+++ b/target/arm/translate-a64.c
@@ -12250,9 +12250,10 @@ static void disas_simd_three_reg_same_extra(DisasContext *s, uint32_t insn)
}
feature = dc_isar_feature(aa64_bf16, s);
break;
- case 0x1f: /* BFDOT */
+ case 0x1f:
switch (size) {
- case 1:
+ case 1: /* BFDOT */
+ case 3: /* BFMLAL{B,T} */
feature = dc_isar_feature(aa64_bf16, s);
break;
default:
@@ -12346,11 +12347,15 @@ static void disas_simd_three_reg_same_extra(DisasContext *s, uint32_t insn)
case 0xd: /* BFMMLA */
gen_gvec_op4_ool(s, is_q, rd, rn, rm, rd, 0, gen_helper_gvec_bfmmla);
return;
- case 0xf: /* BFDOT */
+ case 0xf:
switch (size) {
- case 1:
+ case 1: /* BFDOT */
gen_gvec_op4_ool(s, is_q, rd, rn, rm, rd, 0, gen_helper_gvec_bfdot);
break;
+ case 3: /* BFMLAL{B,T} */
+ gen_gvec_op4_fpst(s, 1, rd, rn, rm, rd, false, is_q,
+ gen_helper_gvec_bfmlal);
+ break;
default:
g_assert_not_reached();
}
diff --git a/target/arm/translate-sve.c b/target/arm/translate-sve.c
index 9ade521705..3af980caba 100644
--- a/target/arm/translate-sve.c
+++ b/target/arm/translate-sve.c
@@ -8622,3 +8622,33 @@ static bool trans_BFMMLA(DisasContext *s, arg_rrrr_esz *a)
}
return true;
}
+
+static bool do_BFMLAL_zzzw(DisasContext *s, arg_rrrr_esz *a, bool sel)
+{
+ if (!dc_isar_feature(aa64_sve_bf16, s)) {
+ return false;
+ }
+ if (sve_access_check(s)) {
+ TCGv_ptr status = fpstatus_ptr(FPST_FPCR);
+ unsigned vsz = vec_full_reg_size(s);
+
+ tcg_gen_gvec_4_ptr(vec_full_reg_offset(s, a->rd),
+ vec_full_reg_offset(s, a->rn),
+ vec_full_reg_offset(s, a->rm),
+ vec_full_reg_offset(s, a->ra),
+ status, vsz, vsz, sel,
+ gen_helper_gvec_bfmlal);
+ tcg_temp_free_ptr(status);
+ }
+ return true;
+}
+
+static bool trans_BFMLALB_zzzw(DisasContext *s, arg_rrrr_esz *a)
+{
+ return do_BFMLAL_zzzw(s, a, false);
+}
+
+static bool trans_BFMLALT_zzzw(DisasContext *s, arg_rrrr_esz *a)
+{
+ return do_BFMLAL_zzzw(s, a, true);
+}
diff --git a/target/arm/vec_helper.c b/target/arm/vec_helper.c
index 623a0872f3..646a364c94 100644
--- a/target/arm/vec_helper.c
+++ b/target/arm/vec_helper.c
@@ -2755,3 +2755,19 @@ void HELPER(gvec_bfmmla)(void *vd, void *vn, void *vm, void *va, uint32_t desc)
}
clear_tail(d, opr_sz, simd_maxsz(desc));
}
+
+void HELPER(gvec_bfmlal)(void *vd, void *vn, void *vm, void *va,
+ void *stat, uint32_t desc)
+{
+ intptr_t i, opr_sz = simd_oprsz(desc);
+ intptr_t sel = simd_data(desc);
+ float32 *d = vd, *a = va;
+ bfloat16 *n = vn, *m = vm;
+
+ for (i = 0; i < opr_sz / 4; ++i) {
+ float32 nn = n[H2(i * 2 + sel)] << 16;
+ float32 mm = m[H2(i * 2 + sel)] << 16;
+ d[H4(i)] = float32_muladd(nn, mm, a[H4(i)], 0, stat);
+ }
+ clear_tail(d, opr_sz, simd_maxsz(desc));
+}
diff --git a/target/arm/translate-neon.c.inc b/target/arm/translate-neon.c.inc
index 7ce65f691f..dd710c8450 100644
--- a/target/arm/translate-neon.c.inc
+++ b/target/arm/translate-neon.c.inc
@@ -4126,3 +4126,12 @@ static bool trans_VMMLA_b16(DisasContext *s, arg_VMMLA_b16 *a)
return do_neon_ddda(s, 7, a->vd, a->vn, a->vm, 0,
gen_helper_gvec_bfmmla);
}
+
+static bool trans_VFMA_b16(DisasContext *s, arg_VFMA_b16 *a)
+{
+ if (!dc_isar_feature(aa32_bf16, s)) {
+ return false;
+ }
+ return do_neon_ddda_fpst(s, 7, a->vd, a->vn, a->vm, a->q, FPST_STD,
+ gen_helper_gvec_bfmlal);
+}
--
2.25.1
^ permalink raw reply related [flat|nested] 30+ messages in thread
* [PATCH v1 10/11] target/arm: Implement bfloat widening fma (indexed)
2021-04-16 23:59 [PATCH v1 for-6.1 00/11] target/arm: Implement BFloat16 Richard Henderson
` (8 preceding siblings ...)
2021-04-16 23:59 ` [PATCH v1 09/11] target/arm: Implement bfloat widening fma (vector) Richard Henderson
@ 2021-04-16 23:59 ` Richard Henderson
2021-05-18 12:46 ` Peter Maydell
2021-04-16 23:59 ` [PATCH v1 11/11] target/arm: Enable BFloat16 extensions Richard Henderson
10 siblings, 1 reply; 30+ messages in thread
From: Richard Henderson @ 2021-04-16 23:59 UTC (permalink / raw)
To: qemu-devel; +Cc: qemu-arm
This is BFMLAL{B,T} for both AArch64 AdvSIMD and SVE,
and VFMA{B,T}.BF16 for AArch32 NEON.
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
target/arm/helper.h | 2 ++
target/arm/neon-shared.decode | 2 ++
target/arm/sve.decode | 2 ++
target/arm/translate-a64.c | 15 ++++++++++++++-
target/arm/translate-sve.c | 30 ++++++++++++++++++++++++++++++
target/arm/vec_helper.c | 22 ++++++++++++++++++++++
target/arm/translate-neon.c.inc | 10 ++++++++++
7 files changed, 82 insertions(+), 1 deletion(-)
diff --git a/target/arm/helper.h b/target/arm/helper.h
index 2c6f0cecfa..cbcaab2ce0 100644
--- a/target/arm/helper.h
+++ b/target/arm/helper.h
@@ -1013,6 +1013,8 @@ DEF_HELPER_FLAGS_5(gvec_bfmmla, TCG_CALL_NO_RWG,
DEF_HELPER_FLAGS_6(gvec_bfmlal, TCG_CALL_NO_RWG,
void, ptr, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_6(gvec_bfmlal_idx, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, ptr, ptr, i32)
#ifdef TARGET_AARCH64
#include "helper-a64.h"
diff --git a/target/arm/neon-shared.decode b/target/arm/neon-shared.decode
index b61addd98b..df80e6ebf6 100644
--- a/target/arm/neon-shared.decode
+++ b/target/arm/neon-shared.decode
@@ -95,3 +95,5 @@ VFML_scalar 1111 1110 0 . 0 s:1 .... .... 1000 . 0 . 1 index:1 ... \
rm=%vfml_scalar_q0_rm vn=%vn_sp vd=%vd_dp q=0
VFML_scalar 1111 1110 0 . 0 s:1 .... .... 1000 . 1 . 1 . rm:3 \
index=%vfml_scalar_q1_index vn=%vn_dp vd=%vd_dp q=1
+VFMA_b16_scal 1111 1110 0.11 .... .... 1000 . q:1 . 1 . vm:3 \
+ index=%vfml_scalar_q1_index vn=%vn_dp vd=%vd_dp
diff --git a/target/arm/sve.decode b/target/arm/sve.decode
index 322bef24cf..69f979fb47 100644
--- a/target/arm/sve.decode
+++ b/target/arm/sve.decode
@@ -1589,6 +1589,8 @@ FMLALB_zzxw 01100100 10 1 ..... 0100.0 ..... ..... @rrxr_3a esz=2
FMLALT_zzxw 01100100 10 1 ..... 0100.1 ..... ..... @rrxr_3a esz=2
FMLSLB_zzxw 01100100 10 1 ..... 0110.0 ..... ..... @rrxr_3a esz=2
FMLSLT_zzxw 01100100 10 1 ..... 0110.1 ..... ..... @rrxr_3a esz=2
+BFMLALB_zzxw 01100100 11 1 ..... 0100.0 ..... ..... @rrxr_3a esz=2
+BFMLALT_zzxw 01100100 11 1 ..... 0100.1 ..... ..... @rrxr_3a esz=2
### SVE2 floating-point bfloat16 dot-product (indexed)
BFDOT_zzxz 01100100 01 1 ..... 010000 ..... ..... @rrxr_2 esz=2
diff --git a/target/arm/translate-a64.c b/target/arm/translate-a64.c
index 74794e3da3..7842dd51be 100644
--- a/target/arm/translate-a64.c
+++ b/target/arm/translate-a64.c
@@ -13480,18 +13480,27 @@ static void disas_simd_indexed(DisasContext *s, uint32_t insn)
unallocated_encoding(s);
return;
}
+ size = MO_32;
break;
case 1: /* BFDOT */
if (is_scalar || !dc_isar_feature(aa64_bf16, s)) {
unallocated_encoding(s);
return;
}
+ size = MO_32;
+ break;
+ case 3: /* BFMLAL{B,T} */
+ if (is_scalar || !dc_isar_feature(aa64_bf16, s)) {
+ unallocated_encoding(s);
+ return;
+ }
+ /* can't set is_fp without other incorrect size checks */
+ size = MO_16;
break;
default:
unallocated_encoding(s);
return;
}
- size = MO_32;
break;
case 0x11: /* FCMLA #0 */
case 0x13: /* FCMLA #90 */
@@ -13621,6 +13630,10 @@ static void disas_simd_indexed(DisasContext *s, uint32_t insn)
gen_gvec_op4_ool(s, is_q, rd, rn, rm, rd, index,
gen_helper_gvec_usdot_idx_b);
return;
+ case 3: /* BFMLAL{B,T} */
+ gen_gvec_op4_fpst(s, 1, rd, rn, rm, rd, 0, (index << 1) | is_q,
+ gen_helper_gvec_bfmlal_idx);
+ return;
}
g_assert_not_reached();
case 0x11: /* FCMLA #0 */
diff --git a/target/arm/translate-sve.c b/target/arm/translate-sve.c
index 3af980caba..7f33bc4682 100644
--- a/target/arm/translate-sve.c
+++ b/target/arm/translate-sve.c
@@ -8652,3 +8652,33 @@ static bool trans_BFMLALT_zzzw(DisasContext *s, arg_rrrr_esz *a)
{
return do_BFMLAL_zzzw(s, a, true);
}
+
+static bool do_BFMLAL_zzxw(DisasContext *s, arg_rrxr_esz *a, bool sel)
+{
+ if (!dc_isar_feature(aa64_sve_bf16, s)) {
+ return false;
+ }
+ if (sve_access_check(s)) {
+ TCGv_ptr status = fpstatus_ptr(FPST_FPCR);
+ unsigned vsz = vec_full_reg_size(s);
+
+ tcg_gen_gvec_4_ptr(vec_full_reg_offset(s, a->rd),
+ vec_full_reg_offset(s, a->rn),
+ vec_full_reg_offset(s, a->rm),
+ vec_full_reg_offset(s, a->ra),
+ status, vsz, vsz, (a->index << 1) | sel,
+ gen_helper_gvec_bfmlal_idx);
+ tcg_temp_free_ptr(status);
+ }
+ return true;
+}
+
+static bool trans_BFMLALB_zzxw(DisasContext *s, arg_rrxr_esz *a)
+{
+ return do_BFMLAL_zzxw(s, a, false);
+}
+
+static bool trans_BFMLALT_zzxw(DisasContext *s, arg_rrxr_esz *a)
+{
+ return do_BFMLAL_zzxw(s, a, true);
+}
diff --git a/target/arm/vec_helper.c b/target/arm/vec_helper.c
index 646a364c94..0906c5c148 100644
--- a/target/arm/vec_helper.c
+++ b/target/arm/vec_helper.c
@@ -2771,3 +2771,25 @@ void HELPER(gvec_bfmlal)(void *vd, void *vn, void *vm, void *va,
}
clear_tail(d, opr_sz, simd_maxsz(desc));
}
+
+void HELPER(gvec_bfmlal_idx)(void *vd, void *vn, void *vm,
+ void *va, void *stat, uint32_t desc)
+{
+ intptr_t i, j, opr_sz = simd_oprsz(desc);
+ intptr_t sel = extract32(desc, SIMD_DATA_SHIFT, 1);
+ intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 1, 3);
+ intptr_t elements = opr_sz / 4;
+ intptr_t eltspersegment = MIN(16 / 4, elements);
+ float32 *d = vd, *a = va;
+ bfloat16 *n = vn, *m = vm;
+
+ for (i = 0; i < elements; i += eltspersegment) {
+ float32 m_idx = m[H2(2 * i + index)] << 16;
+
+ for (j = i; j < i + eltspersegment; j++) {
+ float32 n_j = n[H2(2 * j + sel)] << 16;
+ d[H4(j)] = float32_muladd(n_j, m_idx, a[H4(j)], 0, stat);
+ }
+ }
+ clear_tail(d, opr_sz, simd_maxsz(desc));
+}
diff --git a/target/arm/translate-neon.c.inc b/target/arm/translate-neon.c.inc
index dd710c8450..bd1068e4d4 100644
--- a/target/arm/translate-neon.c.inc
+++ b/target/arm/translate-neon.c.inc
@@ -4135,3 +4135,13 @@ static bool trans_VFMA_b16(DisasContext *s, arg_VFMA_b16 *a)
return do_neon_ddda_fpst(s, 7, a->vd, a->vn, a->vm, a->q, FPST_STD,
gen_helper_gvec_bfmlal);
}
+
+static bool trans_VFMA_b16_scal(DisasContext *s, arg_VFMA_b16_scal *a)
+{
+ if (!dc_isar_feature(aa32_bf16, s)) {
+ return false;
+ }
+ return do_neon_ddda_fpst(s, 6, a->vd, a->vn, a->vm,
+ (a->index << 1) | a->q, FPST_STD,
+ gen_helper_gvec_bfmlal_idx);
+}
--
2.25.1
^ permalink raw reply related [flat|nested] 30+ messages in thread
* [PATCH v1 11/11] target/arm: Enable BFloat16 extensions
2021-04-16 23:59 [PATCH v1 for-6.1 00/11] target/arm: Implement BFloat16 Richard Henderson
` (9 preceding siblings ...)
2021-04-16 23:59 ` [PATCH v1 10/11] target/arm: Implement bfloat widening fma (indexed) Richard Henderson
@ 2021-04-16 23:59 ` Richard Henderson
2021-05-18 12:47 ` Peter Maydell
10 siblings, 1 reply; 30+ messages in thread
From: Richard Henderson @ 2021-04-16 23:59 UTC (permalink / raw)
To: qemu-devel; +Cc: qemu-arm
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
target/arm/cpu64.c | 3 +++
target/arm/cpu_tcg.c | 1 +
2 files changed, 4 insertions(+)
diff --git a/target/arm/cpu64.c b/target/arm/cpu64.c
index 379f90fab8..db4f48edcf 100644
--- a/target/arm/cpu64.c
+++ b/target/arm/cpu64.c
@@ -660,6 +660,7 @@ static void aarch64_max_initfn(Object *obj)
t = FIELD_DP64(t, ID_AA64ISAR1, FCMA, 1);
t = FIELD_DP64(t, ID_AA64ISAR1, SB, 1);
t = FIELD_DP64(t, ID_AA64ISAR1, SPECRES, 1);
+ t = FIELD_DP64(t, ID_AA64ISAR1, BF16, 1);
t = FIELD_DP64(t, ID_AA64ISAR1, FRINTTS, 1);
t = FIELD_DP64(t, ID_AA64ISAR1, LRCPC, 2); /* ARMv8.4-RCPC */
t = FIELD_DP64(t, ID_AA64ISAR1, I8MM, 1);
@@ -707,6 +708,7 @@ static void aarch64_max_initfn(Object *obj)
t = FIELD_DP64(t, ID_AA64ZFR0, SVEVER, 1);
t = FIELD_DP64(t, ID_AA64ZFR0, AES, 2); /* PMULL */
t = FIELD_DP64(t, ID_AA64ZFR0, BITPERM, 1);
+ t = FIELD_DP64(t, ID_AA64ZFR0, BFLOAT16, 1);
t = FIELD_DP64(t, ID_AA64ZFR0, SHA3, 1);
t = FIELD_DP64(t, ID_AA64ZFR0, SM4, 1);
t = FIELD_DP64(t, ID_AA64ZFR0, I8MM, 1);
@@ -730,6 +732,7 @@ static void aarch64_max_initfn(Object *obj)
u = FIELD_DP32(u, ID_ISAR6, FHM, 1);
u = FIELD_DP32(u, ID_ISAR6, SB, 1);
u = FIELD_DP32(u, ID_ISAR6, SPECRES, 1);
+ u = FIELD_DP32(u, ID_ISAR6, BF16, 1);
u = FIELD_DP32(u, ID_ISAR6, I8MM, 1);
cpu->isar.id_isar6 = u;
diff --git a/target/arm/cpu_tcg.c b/target/arm/cpu_tcg.c
index 046e476f65..b2463cf109 100644
--- a/target/arm/cpu_tcg.c
+++ b/target/arm/cpu_tcg.c
@@ -968,6 +968,7 @@ static void arm_max_initfn(Object *obj)
t = FIELD_DP32(t, ID_ISAR6, FHM, 1);
t = FIELD_DP32(t, ID_ISAR6, SB, 1);
t = FIELD_DP32(t, ID_ISAR6, SPECRES, 1);
+ t = FIELD_DP32(t, ID_ISAR6, BF16, 1);
cpu->isar.id_isar6 = t;
t = cpu->isar.mvfr1;
--
2.25.1
^ permalink raw reply related [flat|nested] 30+ messages in thread
* Re: [PATCH v1 11/11] target/arm: Enable BFloat16 extensions
2021-04-16 23:59 ` [PATCH v1 11/11] target/arm: Enable BFloat16 extensions Richard Henderson
@ 2021-05-18 12:47 ` Peter Maydell
2021-05-18 14:47 ` Richard Henderson
0 siblings, 1 reply; 30+ messages in thread
From: Peter Maydell @ 2021-05-18 12:47 UTC (permalink / raw)
To: Richard Henderson; +Cc: qemu-arm, QEMU Developers
On Sat, 17 Apr 2021 at 01:05, Richard Henderson
<richard.henderson@linaro.org> wrote:
>
> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
> ---
> target/arm/cpu64.c | 3 +++
> target/arm/cpu_tcg.c | 1 +
> 2 files changed, 4 insertions(+)
>
> diff --git a/target/arm/cpu64.c b/target/arm/cpu64.c
> index 379f90fab8..db4f48edcf 100644
> --- a/target/arm/cpu64.c
> +++ b/target/arm/cpu64.c
> @@ -660,6 +660,7 @@ static void aarch64_max_initfn(Object *obj)
> t = FIELD_DP64(t, ID_AA64ISAR1, FCMA, 1);
> t = FIELD_DP64(t, ID_AA64ISAR1, SB, 1);
> t = FIELD_DP64(t, ID_AA64ISAR1, SPECRES, 1);
> + t = FIELD_DP64(t, ID_AA64ISAR1, BF16, 1);
> t = FIELD_DP64(t, ID_AA64ISAR1, FRINTTS, 1);
> t = FIELD_DP64(t, ID_AA64ISAR1, LRCPC, 2); /* ARMv8.4-RCPC */
> t = FIELD_DP64(t, ID_AA64ISAR1, I8MM, 1);
> @@ -707,6 +708,7 @@ static void aarch64_max_initfn(Object *obj)
> t = FIELD_DP64(t, ID_AA64ZFR0, SVEVER, 1);
> t = FIELD_DP64(t, ID_AA64ZFR0, AES, 2); /* PMULL */
> t = FIELD_DP64(t, ID_AA64ZFR0, BITPERM, 1);
> + t = FIELD_DP64(t, ID_AA64ZFR0, BFLOAT16, 1);
> t = FIELD_DP64(t, ID_AA64ZFR0, SHA3, 1);
> t = FIELD_DP64(t, ID_AA64ZFR0, SM4, 1);
> t = FIELD_DP64(t, ID_AA64ZFR0, I8MM, 1);
> @@ -730,6 +732,7 @@ static void aarch64_max_initfn(Object *obj)
> u = FIELD_DP32(u, ID_ISAR6, FHM, 1);
> u = FIELD_DP32(u, ID_ISAR6, SB, 1);
> u = FIELD_DP32(u, ID_ISAR6, SPECRES, 1);
> + u = FIELD_DP32(u, ID_ISAR6, BF16, 1);
> u = FIELD_DP32(u, ID_ISAR6, I8MM, 1);
> cpu->isar.id_isar6 = u;
>
> diff --git a/target/arm/cpu_tcg.c b/target/arm/cpu_tcg.c
> index 046e476f65..b2463cf109 100644
> --- a/target/arm/cpu_tcg.c
> +++ b/target/arm/cpu_tcg.c
> @@ -968,6 +968,7 @@ static void arm_max_initfn(Object *obj)
> t = FIELD_DP32(t, ID_ISAR6, FHM, 1);
> t = FIELD_DP32(t, ID_ISAR6, SB, 1);
> t = FIELD_DP32(t, ID_ISAR6, SPECRES, 1);
> + t = FIELD_DP32(t, ID_ISAR6, BF16, 1);
> cpu->isar.id_isar6 = t;
>
> t = cpu->isar.mvfr1;
Same query as with SVE: do we need to clear these in the "!has_vfp"
and "!has_neon" handling code in arm_cpu_realizefn() ?
thanks
-- PMM
^ permalink raw reply [flat|nested] 30+ messages in thread
* Re: [PATCH v1 11/11] target/arm: Enable BFloat16 extensions
2021-05-18 12:47 ` Peter Maydell
@ 2021-05-18 14:47 ` Richard Henderson
2021-05-25 16:57 ` Richard Henderson
0 siblings, 1 reply; 30+ messages in thread
From: Richard Henderson @ 2021-05-18 14:47 UTC (permalink / raw)
To: Peter Maydell; +Cc: qemu-arm, QEMU Developers
On 5/18/21 7:47 AM, Peter Maydell wrote:
>> diff --git a/target/arm/cpu_tcg.c b/target/arm/cpu_tcg.c
>> index 046e476f65..b2463cf109 100644
>> --- a/target/arm/cpu_tcg.c
>> +++ b/target/arm/cpu_tcg.c
>> @@ -968,6 +968,7 @@ static void arm_max_initfn(Object *obj)
>> t = FIELD_DP32(t, ID_ISAR6, FHM, 1);
>> t = FIELD_DP32(t, ID_ISAR6, SB, 1);
>> t = FIELD_DP32(t, ID_ISAR6, SPECRES, 1);
>> + t = FIELD_DP32(t, ID_ISAR6, BF16, 1);
>> cpu->isar.id_isar6 = t;
>>
>> t = cpu->isar.mvfr1;
>
> Same query as with SVE: do we need to clear these in the "!has_vfp"
> and "!has_neon" handling code in arm_cpu_realizefn() ?
I *think* we want to clear ID_ISAR6 only when !has_vfp && !has_neon, as
FEAT_AA32BF16 should still be usable to the other one. Which also means adding
the NEON/VFP check you suggested.
r~
^ permalink raw reply [flat|nested] 30+ messages in thread
* Re: [PATCH v1 11/11] target/arm: Enable BFloat16 extensions
2021-05-18 14:47 ` Richard Henderson
@ 2021-05-25 16:57 ` Richard Henderson
0 siblings, 0 replies; 30+ messages in thread
From: Richard Henderson @ 2021-05-25 16:57 UTC (permalink / raw)
To: Peter Maydell; +Cc: qemu-arm, QEMU Developers
On 5/18/21 7:47 AM, Richard Henderson wrote:
> On 5/18/21 7:47 AM, Peter Maydell wrote:
>>> diff --git a/target/arm/cpu_tcg.c b/target/arm/cpu_tcg.c
>>> index 046e476f65..b2463cf109 100644
>>> --- a/target/arm/cpu_tcg.c
>>> +++ b/target/arm/cpu_tcg.c
>>> @@ -968,6 +968,7 @@ static void arm_max_initfn(Object *obj)
>>> t = FIELD_DP32(t, ID_ISAR6, FHM, 1);
>>> t = FIELD_DP32(t, ID_ISAR6, SB, 1);
>>> t = FIELD_DP32(t, ID_ISAR6, SPECRES, 1);
>>> + t = FIELD_DP32(t, ID_ISAR6, BF16, 1);
>>> cpu->isar.id_isar6 = t;
>>>
>>> t = cpu->isar.mvfr1;
>>
>> Same query as with SVE: do we need to clear these in the "!has_vfp"
>> and "!has_neon" handling code in arm_cpu_realizefn() ?
>
> I *think* we want to clear ID_ISAR6 only when !has_vfp && !has_neon, as
> FEAT_AA32BF16 should still be usable to the other one. Which also means adding
> the NEON/VFP check you suggested.
Alternately, we can clear BF16 when either !vfp or !neon, and then we don't
have to add the extra checks.
Unless we're presented with a real cpu that has vfp but not neon, and does have
bf16, this seems like a head-scratcher corner case. Just so long as we don't
do something actively against the rules I guess we're ok.
r~
^ permalink raw reply [flat|nested] 30+ messages in thread