* [PATCH] target/arm: Implement SVE2 fp multiply-add long
From: Stephen Long @ 2020-05-04 17:12 UTC
  To: qemu-devel; +Cc: qemu-arm, richard.henderson, apazos

Implement both the vector and indexed forms of FMLALB, FMLALT, FMLSLB
and FMLSLT.
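
The bottom (B) forms use the even-numbered fp16 elements of the two
source vectors and the top (T) forms the odd-numbered elements; the
FMLSL forms negate the product before accumulating.  Informally, per
32-bit lane:

    Zda.S[i] = Zda.S[i] +/- fp32(Zn.H[2*i + top]) * fp32(Zm.H[2*i + top])

The indexed forms instead take the multiplicand from a single fp16
element of Zm, selected within each 128-bit segment by a 3-bit
immediate.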

Signed-off-by: Stephen Long <steplong@quicinc.com>
---
 target/arm/helper-sve.h    | 10 +++++
 target/arm/sve.decode      | 12 ++++++
 target/arm/sve_helper.c    | 76 +++++++++++++++++++++++++++++++++
 target/arm/translate-sve.c | 86 ++++++++++++++++++++++++++++++++++++++
 4 files changed, 184 insertions(+)

diff --git a/target/arm/helper-sve.h b/target/arm/helper-sve.h
index 0a62eef94e..2e5f1d810e 100644
--- a/target/arm/helper-sve.h
+++ b/target/arm/helper-sve.h
@@ -2731,3 +2731,13 @@ DEF_HELPER_FLAGS_5(sve2_sqrdcmlah_idx_h, TCG_CALL_NO_RWG,
                    void, ptr, ptr, ptr, ptr, i32)
 DEF_HELPER_FLAGS_5(sve2_sqrdcmlah_idx_s, TCG_CALL_NO_RWG,
                    void, ptr, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_6(sve2_fmlal_zzzw_s, TCG_CALL_NO_RWG,
+                   void, ptr, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_6(sve2_fmlsl_zzzw_s, TCG_CALL_NO_RWG,
+                   void, ptr, ptr, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_6(sve2_fmlal_zzxw_s, TCG_CALL_NO_RWG,
+                   void, ptr, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_6(sve2_fmlsl_zzxw_s, TCG_CALL_NO_RWG,
+                   void, ptr, ptr, ptr, ptr, ptr, i32)
diff --git a/target/arm/sve.decode b/target/arm/sve.decode
index 3cf824bac5..7602ba4e3f 100644
--- a/target/arm/sve.decode
+++ b/target/arm/sve.decode
@@ -1568,3 +1568,15 @@ SM4E            01000101 00 10001 1 11100 0 ..... .....  @rdn_rm_e0
 # SVE2 crypto constructive binary operations
 SM4EKEY         01000101 00 1 ..... 11110 0 ..... .....  @rd_rn_rm_e0
 RAX1            01000101 00 1 ..... 11110 1 ..... .....  @rd_rn_rm_e0
+
+### SVE2 floating-point multiply-add long (vectors)
+FMLALB_zzzw     01100100 .. 1 ..... 10 0 00 0 ..... .....  @rda_rn_rm
+FMLALT_zzzw     01100100 .. 1 ..... 10 0 00 1 ..... .....  @rda_rn_rm
+FMLSLB_zzzw     01100100 .. 1 ..... 10 1 00 0 ..... .....  @rda_rn_rm
+FMLSLT_zzzw     01100100 .. 1 ..... 10 1 00 1 ..... .....  @rda_rn_rm
+
+### SVE2 floating-point multiply-add long (indexed)
+FMLALB_zzxw     01100100 .. 1 ..... 0100.0 ..... .....          @rrxw_s
+FMLALT_zzxw     01100100 .. 1 ..... 0100.1 ..... .....          @rrxw_s
+FMLSLB_zzxw     01100100 .. 1 ..... 0110.0 ..... .....          @rrxw_s
+FMLSLT_zzxw     01100100 .. 1 ..... 0110.1 ..... .....          @rrxw_s
diff --git a/target/arm/sve_helper.c b/target/arm/sve_helper.c
index aa94df302a..ae1321225a 100644
--- a/target/arm/sve_helper.c
+++ b/target/arm/sve_helper.c
@@ -7622,3 +7622,79 @@ void HELPER(fmmla_d)(void *vd, void *va, void *vn, void *vm,
         d[3] = float64_add(a[3], float64_add(p0, p1, status), status);
     }
 }
+
+/* SVE2 Floating Point Multiply-Add (Vectors) Helpers */
+
+void HELPER(sve2_fmlal_zzzw_s)(void *vd, void *vn, void *vm, void *va,
+                               void *status, uint32_t desc)
+{
+    intptr_t i, opr_sz = simd_oprsz(desc);
+    intptr_t sel1 = simd_data(desc) * sizeof(float16);
+    for (i = 0; i < opr_sz; i += sizeof(float32)) {
+        float16 nn_16 = *(float16 *)(vn + H1_2(i + sel1));
+        float16 mm_16 = *(float16 *)(vm + H1_2(i + sel1));
+
+        float32 nn = float16_to_float32(nn_16, true, status);
+        float32 mm = float16_to_float32(mm_16, true, status);
+        float32 aa = *(float32 *)(va + H1_4(i));
+        *(float32 *)(vd + H1_4(i)) = float32_muladd(nn, mm, aa, 0, status);
+    }
+}
+
+void HELPER(sve2_fmlsl_zzzw_s)(void *vd, void *vn, void *vm, void *va,
+                               void *status, uint32_t desc)
+{
+    intptr_t i, opr_sz = simd_oprsz(desc);
+    intptr_t sel1 = simd_data(desc) * sizeof(float16);
+    for (i = 0; i < opr_sz; i += sizeof(float32)) {
+        float16 nn_16 = *(float16 *)(vn + H1_2(i + sel1));
+        float16 mm_16 = *(float16 *)(vm + H1_2(i + sel1));
+
+        float32 nn = float16_to_float32(nn_16, true, status);
+        float32 mm = float16_to_float32(mm_16, true, status);
+        float32 aa = *(float32 *)(va + H1_4(i));
+        nn = float32_set_sign(nn, float32_is_neg(nn) ^ 1);
+        *(float32 *)(vd + H1_4(i)) = float32_muladd(nn, mm, aa, 0, status);
+    }
+}
+
+/* SVE2 Floating Point Multiply-Add (Indexed) Helpers */
+
+void HELPER(sve2_fmlal_zzxw_s)(void *vd, void *vn, void *vm, void *va,
+                               void *status, uint32_t desc)
+{
+    intptr_t i, j, oprsz = simd_oprsz(desc);
+    intptr_t sel = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(float16);
+    intptr_t idx = extract32(desc, SIMD_DATA_SHIFT + 1, 3) * sizeof(float16);
+    for (i = 0; i < oprsz; i += 16) {
+        float16 mm_16 = *(float16 *)(vm + H1_2(i + idx));
+        float32 mm = float16_to_float32(mm_16, true, status);
+        for (j = 0; j < 16; j += sizeof(float32)) {
+            float16 nn_16 = *(float16 *)(vn + H1_2(i + j + sel));
+            float32 nn = float16_to_float32(nn_16, true, status);
+            float32 aa = *(float32 *)(va + H1_4(i + j));
+            *(float32 *)(vd + H1_4(i + j)) =
+                float32_muladd(nn, mm, aa, 0, status);
+        }
+    }
+}
+
+void HELPER(sve2_fmlsl_zzxw_s)(void *vd, void *vn, void *vm, void *va,
+                               void *status, uint32_t desc)
+{
+    intptr_t i, j, oprsz = simd_oprsz(desc);
+    intptr_t sel = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(float16);
+    intptr_t idx = extract32(desc, SIMD_DATA_SHIFT + 1, 3) * sizeof(float16);
+    for (i = 0; i < oprsz; i += 16) {
+        float16 mm_16 = *(float16 *)(vm + H1_2(i + idx));
+        float32 mm = float16_to_float32(mm_16, true, status);
+        for (j = 0; j < 16; j += sizeof(float32)) {
+            float16 nn_16 = *(float16 *)(vn + H1_2(i + j + sel));
+            float32 nn = float16_to_float32(nn_16, true, status);
+            float32 aa = *(float32 *)(va + H1_4(i + j));
+            nn = float32_set_sign(nn, float32_is_neg(nn) ^ 1);
+            *(float32 *)(vd + H1_4(i + j)) =
+                float32_muladd(nn, mm, aa, 0, status);
+        }
+    }
+}
diff --git a/target/arm/translate-sve.c b/target/arm/translate-sve.c
index a8e57ea5f4..3d7382a7de 100644
--- a/target/arm/translate-sve.c
+++ b/target/arm/translate-sve.c
@@ -8253,3 +8253,89 @@ static bool trans_RAX1(DisasContext *s, arg_rrr_esz *a)
     }
     return true;
 }
+
+static bool do_zzzz_fp(DisasContext *s, arg_rrrr_esz *a,
+                       gen_helper_gvec_4_ptr *fn, int data)
+{
+    if (fn == NULL) {
+        return false;
+    }
+    if (sve_access_check(s)) {
+        unsigned vsz = vec_full_reg_size(s);
+        TCGv_ptr status = get_fpstatus_ptr(a->esz == MO_16);
+        tcg_gen_gvec_4_ptr(vec_full_reg_offset(s, a->rd),
+                           vec_full_reg_offset(s, a->rn),
+                           vec_full_reg_offset(s, a->rm),
+                           vec_full_reg_offset(s, a->ra),
+                           status, vsz, vsz, data, fn);
+        tcg_temp_free_ptr(status);
+    }
+    return true;
+}
+
+static bool do_zzxz_fp(DisasContext *s, arg_rrxr_esz *a,
+                       gen_helper_gvec_4_ptr *fn, int data)
+{
+    if (fn == NULL) {
+        return false;
+    }
+    if (sve_access_check(s)) {
+        unsigned vsz = vec_full_reg_size(s);
+        TCGv_ptr status = get_fpstatus_ptr(a->esz == MO_16);
+        tcg_gen_gvec_4_ptr(vec_full_reg_offset(s, a->rd),
+                           vec_full_reg_offset(s, a->rn),
+                           vec_full_reg_offset(s, a->rm),
+                           vec_full_reg_offset(s, a->ra),
+                           status, vsz, vsz, data, fn);
+        tcg_temp_free_ptr(status);
+    }
+    return true;
+}
+
+static bool do_sve2_zzzz_fp(DisasContext *s, arg_rrrr_esz *a,
+                            gen_helper_gvec_4_ptr *fn, int data)
+{
+    if (!dc_isar_feature(aa64_sve2, s) || a->esz != 2) {
+        return false;
+    }
+    return do_zzzz_fp(s, a, fn, data);
+}
+
+static bool do_sve2_zzxz_fp(DisasContext *s, arg_rrxr_esz *a,
+                            gen_helper_gvec_4_ptr *fn, int data)
+{
+    if (!dc_isar_feature(aa64_sve2, s) || a->esz != 2) {
+        return false;
+    }
+    return do_zzxz_fp(s, a, fn, data);
+}
+
+#define DO_SVE2_FP_MULADD_VEC(NAME, FUNC, TOP)                            \
+static bool trans_##NAME(DisasContext *s, arg_rrrr_esz *a)                \
+{                                                                         \
+    return do_sve2_zzzz_fp(s, a, FUNC, TOP);                              \
+}
+
+#define DO_SVE2_FP_MULADD_IDX(NAME, FUNC, TOP)                            \
+static bool trans_##NAME(DisasContext *s, arg_rrxr_esz *a)                \
+{                                                                         \
+    return do_sve2_zzxz_fp(s, a, FUNC, (a->index << 1) | TOP);            \
+}
+
+/*
+ * SVE2 Floating Point Multiply-Add Vector Group
+ */
+
+DO_SVE2_FP_MULADD_VEC(FMLALB_zzzw, gen_helper_sve2_fmlal_zzzw_s, false)
+DO_SVE2_FP_MULADD_VEC(FMLALT_zzzw, gen_helper_sve2_fmlal_zzzw_s, true)
+DO_SVE2_FP_MULADD_VEC(FMLSLB_zzzw, gen_helper_sve2_fmlsl_zzzw_s, false)
+DO_SVE2_FP_MULADD_VEC(FMLSLT_zzzw, gen_helper_sve2_fmlsl_zzzw_s, true)
+
+/*
+ * SVE2 Floating Point Multiply-Add Indexed Group
+ */
+
+DO_SVE2_FP_MULADD_IDX(FMLALB_zzxw, gen_helper_sve2_fmlal_zzxw_s, false)
+DO_SVE2_FP_MULADD_IDX(FMLALT_zzxw, gen_helper_sve2_fmlal_zzxw_s, true)
+DO_SVE2_FP_MULADD_IDX(FMLSLB_zzxw, gen_helper_sve2_fmlsl_zzxw_s, false)
+DO_SVE2_FP_MULADD_IDX(FMLSLT_zzxw, gen_helper_sve2_fmlsl_zzxw_s, true)
-- 
2.17.1




* Re: [PATCH] target/arm: Implement SVE2 fp multiply-add long
From: Richard Henderson @ 2020-06-18  4:05 UTC
  To: Stephen Long, qemu-devel; +Cc: qemu-arm, apazos

On 5/4/20 10:12 AM, Stephen Long wrote:
> +void HELPER(sve2_fmlal_zzzw_s)(void *vd, void *vn, void *vm, void *va,
> +                               void *status, uint32_t desc)
> +{
> +    intptr_t i, opr_sz = simd_oprsz(desc);
> +    intptr_t sel1 = simd_data(desc) * sizeof(float16);
> +    for (i = 0; i < opr_sz; i += sizeof(float32)) {
> +        float16 nn_16 = *(float16 *)(vn + H1_2(i + sel1));
> +        float16 mm_16 = *(float16 *)(vm + H1_2(i + sel1));
> +
> +        float32 nn = float16_to_float32(nn_16, true, status);
> +        float32 mm = float16_to_float32(mm_16, true, status);
> +        float32 aa = *(float32 *)(va + H1_4(i));
> +        *(float32 *)(vd + H1_4(i)) = float32_muladd(nn, mm, aa, 0, status);
> +    }
> +}

This isn't quite right.  The float16 to float32 conversion cannot raise any
exceptions -- this is an exact operation.

We already have an implementation of this function in vec_helper.c --
float16_to_float32_by_bits, used by the AdvSIMD version of this same operation.
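
A minimal sketch of that conversion in the loop above (assuming
float16_to_float32_by_bits is visible to sve_helper.c, and with fz16
derived from FPCR.FZ16, e.g. via get_flush_inputs_to_zero on the fp16
float_status):

    /* Exact fp16->fp32 widening: no float_status argument needed,
       only the FZ16 control. */
    float32 nn = float16_to_float32_by_bits(nn_16, fz16);
    float32 mm = float16_to_float32_by_bits(mm_16, fz16);
    *(float32 *)(vd + H1_4(i)) = float32_muladd(nn, mm, aa, 0, status);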

I've rearranged the helpers to match AdvSIMD, and queued the patch.


r~

