All of lore.kernel.org
 help / color / mirror / Atom feed
* [Qemu-devel] [PATCH 0/4] target/arm: Implement ARMv8.2-FHM
@ 2019-02-14  3:43 Richard Henderson
  2019-02-14  3:43 ` [Qemu-devel] [PATCH 1/4] target/arm: Add helpers for FMLAL and FMLSL Richard Henderson
                   ` (5 more replies)
  0 siblings, 6 replies; 10+ messages in thread
From: Richard Henderson @ 2019-02-14  3:43 UTC (permalink / raw)
  To: qemu-devel; +Cc: peter.maydell, qemu-arm

Tested with RISU vs Foundation Model, both a32 and a64.

I am aware of a trivial conflict with the ARMv8.3-JSConv patch set.
I just thought it was easier to manage separately based on master.


r~


Richard Henderson (4):
  target/arm: Add helpers for FMLAL and FMLSL
  target/arm: Implement FMLAL and FMLSL for aarch64
  target/arm: Implement VFMAL and VFMSL for aarch32
  target/arm: Enable ARMv8.2-FHM for -cpu max

 target/arm/cpu.h           |  10 +++
 target/arm/helper.h        |   9 +++
 target/arm/cpu.c           |   1 +
 target/arm/cpu64.c         |   2 +
 target/arm/translate-a64.c |  49 +++++++++++-
 target/arm/translate.c     | 104 ++++++++++++++++++-------
 target/arm/vec_helper.c    | 154 +++++++++++++++++++++++++++++++++++++
 7 files changed, 299 insertions(+), 30 deletions(-)

-- 
2.17.2

^ permalink raw reply	[flat|nested] 10+ messages in thread

* [Qemu-devel] [PATCH 1/4] target/arm: Add helpers for FMLAL and FMLSL
  2019-02-14  3:43 [Qemu-devel] [PATCH 0/4] target/arm: Implement ARMv8.2-FHM Richard Henderson
@ 2019-02-14  3:43 ` Richard Henderson
  2019-02-14  9:16   ` Laurent Desnogues
  2019-02-14  3:43 ` [Qemu-devel] [PATCH 2/4] target/arm: Implement FMLAL and FMLSL for aarch64 Richard Henderson
                   ` (4 subsequent siblings)
  5 siblings, 1 reply; 10+ messages in thread
From: Richard Henderson @ 2019-02-14  3:43 UTC (permalink / raw)
  To: qemu-devel; +Cc: peter.maydell, qemu-arm

Note that float16_to_float32 rightly squashes SNaN to QNaN.
But of course pickNaNMulAdd, for ARM, selects SNaNs first.
So we have to preserve SNaN long enough for the correct NaN
to be selected.  Thus float16_to_float32_by_bits.

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 target/arm/helper.h     |   9 +++
 target/arm/vec_helper.c | 154 ++++++++++++++++++++++++++++++++++++++++
 2 files changed, 163 insertions(+)

diff --git a/target/arm/helper.h b/target/arm/helper.h
index 53a38188c6..0302e13604 100644
--- a/target/arm/helper.h
+++ b/target/arm/helper.h
@@ -653,6 +653,15 @@ DEF_HELPER_FLAGS_6(gvec_fmla_idx_s, TCG_CALL_NO_RWG,
 DEF_HELPER_FLAGS_6(gvec_fmla_idx_d, TCG_CALL_NO_RWG,
                    void, ptr, ptr, ptr, ptr, ptr, i32)
 
+DEF_HELPER_FLAGS_5(gvec_fmlal_h, TCG_CALL_NO_RWG,
+                   void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(gvec_fmlsl_h, TCG_CALL_NO_RWG,
+                   void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(gvec_fmlal_idx_h, TCG_CALL_NO_RWG,
+                   void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(gvec_fmlsl_idx_h, TCG_CALL_NO_RWG,
+                   void, ptr, ptr, ptr, ptr, i32)
+
 #ifdef TARGET_AARCH64
 #include "helper-a64.h"
 #include "helper-sve.h"
diff --git a/target/arm/vec_helper.c b/target/arm/vec_helper.c
index 37f338732e..0c3b3de961 100644
--- a/target/arm/vec_helper.c
+++ b/target/arm/vec_helper.c
@@ -766,3 +766,157 @@ DO_FMLA_IDX(gvec_fmla_idx_s, float32, H4)
 DO_FMLA_IDX(gvec_fmla_idx_d, float64, )
 
 #undef DO_FMLA_IDX
+
+/*
+ * Convert float16 to float32, raising no exceptions and
+ * preserving exceptional values, including SNaN.
+ * This is effectively an unpack+repack operation.
+ */
+static float32 float16_to_float32_by_bits(uint32_t f16)
+{
+    const int f16_bias = 15;
+    const int f32_bias = 127;
+    uint32_t sign = extract32(f16, 15, 1);
+    uint32_t exp = extract32(f16, 10, 5);
+    uint32_t frac = extract32(f16, 0, 10);
+
+    if (exp == 0x1f) {
+        /* Inf or NaN */
+        exp = 0xff;
+    } else if (exp == 0) {
+        /* Zero or denormal.  */
+        if (frac != 0) {
+            /*
+             * Denormal; these are all normal float32.
+             * Shift the fraction so that the msb is at bit 10,
+             * then remove bit 10 as the implicit bit of the
+             * normalized float32.  Note that we still go through
+             * the shift for normal numbers below, to put the
+             * float32 fraction at the right place.
+             */
+            int shift = clz32(frac) - 21;
+            frac = (frac << shift) & 0x3ff;
+            exp = f32_bias - f16_bias - shift + 1;
+        }
+    } else {
+        /* Normal number; adjust the bias.  */
+        exp += f32_bias - f16_bias;
+    }
+    sign <<= 31;
+    exp <<= 23;
+    frac <<= 23 - 10;
+
+    return sign | exp | frac;
+}
+
+static float32 fmlal(float32 a, float16 n16, float16 m16, float_status *fpst)
+{
+    float32 n = float16_to_float32_by_bits(n16);
+    float32 m = float16_to_float32_by_bits(m16);
+    return float32_muladd(n, m, a, 0, fpst);
+}
+
+static float32 fmlsl(float32 a, float16 n16, float16 m16, float_status *fpst)
+{
+    float32 n = float16_to_float32_by_bits(n16);
+    float32 m = float16_to_float32_by_bits(m16);
+    return float32_muladd(float32_chs(n), m, a, 0, fpst);
+}
+
+static inline uint64_t load4_f16(uint64_t *ptr, int is_q, int is_2)
+{
+    /*
+     * Branchless load of u32[0], u64[0], u32[1], or u64[1].
+     * Load the 2nd qword iff is_q & is_2.
+     * Shift to the 2nd dword iff !is_q & is_2.
+     * For !is_q & !is_2, the upper bits of the result are garbage.
+     */
+    return ptr[is_q & is_2] >> ((is_2 & ~is_q) << 5);
+}
+
+/*
+ * Note that FMLAL and FMLSL require oprsz == 8 or oprsz == 16,
+ * as there are not yet SVE versions that might use blocking.
+ */
+
+void HELPER(gvec_fmlal_h)(void *vd, void *vn, void *vm,
+                          void *fpst, uint32_t desc)
+{
+    intptr_t i, oprsz = simd_oprsz(desc);
+    int is_2 = extract32(desc, SIMD_DATA_SHIFT, 1);
+    int is_q = oprsz == 16;
+    float32 *d = vd;
+    uint64_t n_4, m_4;
+
+    /* Pre-load all of the f16 data, avoiding overlap issues.  */
+    n_4 = load4_f16(vn, is_q, is_2);
+    m_4 = load4_f16(vm, is_q, is_2);
+
+    for (i = 0; i < oprsz / 4; i++) {
+        d[H4(i)] = fmlal(d[H4(i)], extract64(n_4, i*16, 16),
+                         extract64(m_4, i*16, 16), fpst);
+    }
+    clear_tail(d, oprsz, simd_maxsz(desc));
+}
+
+void HELPER(gvec_fmlsl_h)(void *vd, void *vn, void *vm,
+                          void *fpst, uint32_t desc)
+{
+    intptr_t i, oprsz = simd_oprsz(desc);
+    int is_2 = extract32(desc, SIMD_DATA_SHIFT, 1);
+    int is_q = oprsz == 16;
+    float32 *d = vd;
+    uint64_t n_4, m_4;
+
+    /* Pre-load all of the f16 data, avoiding overlap issues.  */
+    n_4 = load4_f16(vn, is_q, is_2);
+    m_4 = load4_f16(vm, is_q, is_2);
+
+    for (i = 0; i < oprsz / 4; i++) {
+        d[H4(i)] = fmlsl(d[H4(i)], extract64(n_4, i*16, 16),
+                         extract64(m_4, i*16, 16), fpst);
+    }
+    clear_tail(d, oprsz, simd_maxsz(desc));
+}
+
+void HELPER(gvec_fmlal_idx_h)(void *vd, void *vn, void *vm,
+                              void *fpst, uint32_t desc)
+{
+    intptr_t i, oprsz = simd_oprsz(desc);
+    int is_2 = extract32(desc, SIMD_DATA_SHIFT, 1);
+    int index = extract32(desc, SIMD_DATA_SHIFT + 1, 3);
+    int is_q = oprsz == 16;
+    float32 *d = vd;
+    uint64_t n_4;
+    float16 m_1;
+
+    /* Pre-load all of the f16 data, avoiding overlap issues.  */
+    n_4 = load4_f16(vn, is_q, is_2);
+    m_1 = ((float16 *)vm)[H2(index)];
+
+    for (i = 0; i < oprsz / 4; i++) {
+        d[H4(i)] = fmlal(d[H4(i)], extract64(n_4, i * 16, 16), m_1, fpst);
+    }
+    clear_tail(d, oprsz, simd_maxsz(desc));
+}
+
+void HELPER(gvec_fmlsl_idx_h)(void *vd, void *vn, void *vm,
+                              void *fpst, uint32_t desc)
+{
+    intptr_t i, oprsz = simd_oprsz(desc);
+    int is_2 = extract32(desc, SIMD_DATA_SHIFT, 1);
+    int index = extract32(desc, SIMD_DATA_SHIFT + 1, 3);
+    int is_q = oprsz == 16;
+    float32 *d = vd;
+    uint64_t n_4;
+    float16 m_1;
+
+    /* Pre-load all of the f16 data, avoiding overlap issues.  */
+    n_4 = load4_f16(vn, is_q, is_2);
+    m_1 = ((float16 *)vm)[H2(index)];
+
+    for (i = 0; i < oprsz / 4; i++) {
+        d[H4(i)] = fmlsl(d[H4(i)], extract64(n_4, i*16, 16), m_1, fpst);
+    }
+    clear_tail(d, oprsz, simd_maxsz(desc));
+}
-- 
2.17.2

^ permalink raw reply related	[flat|nested] 10+ messages in thread

* [Qemu-devel] [PATCH 2/4] target/arm: Implement FMLAL and FMLSL for aarch64
  2019-02-14  3:43 [Qemu-devel] [PATCH 0/4] target/arm: Implement ARMv8.2-FHM Richard Henderson
  2019-02-14  3:43 ` [Qemu-devel] [PATCH 1/4] target/arm: Add helpers for FMLAL and FMLSL Richard Henderson
@ 2019-02-14  3:43 ` Richard Henderson
  2019-02-14  3:43 ` [Qemu-devel] [PATCH 3/4] target/arm: Implement VFMAL and VFMSL for aarch32 Richard Henderson
                   ` (3 subsequent siblings)
  5 siblings, 0 replies; 10+ messages in thread
From: Richard Henderson @ 2019-02-14  3:43 UTC (permalink / raw)
  To: qemu-devel; +Cc: peter.maydell, qemu-arm

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 target/arm/cpu.h           |  5 ++++
 target/arm/translate-a64.c | 49 +++++++++++++++++++++++++++++++++++++-
 2 files changed, 53 insertions(+), 1 deletion(-)

diff --git a/target/arm/cpu.h b/target/arm/cpu.h
index 47238e4245..15085a94ff 100644
--- a/target/arm/cpu.h
+++ b/target/arm/cpu.h
@@ -3305,6 +3305,11 @@ static inline bool isar_feature_aa64_dp(const ARMISARegisters *id)
     return FIELD_EX64(id->id_aa64isar0, ID_AA64ISAR0, DP) != 0;
 }
 
+static inline bool isar_feature_aa64_fhm(const ARMISARegisters *id)
+{
+    return FIELD_EX64(id->id_aa64isar0, ID_AA64ISAR0, FHM) != 0;
+}
+
 static inline bool isar_feature_aa64_fcma(const ARMISARegisters *id)
 {
     return FIELD_EX64(id->id_aa64isar1, ID_AA64ISAR1, FCMA) != 0;
diff --git a/target/arm/translate-a64.c b/target/arm/translate-a64.c
index e002251ac6..d2ee811489 100644
--- a/target/arm/translate-a64.c
+++ b/target/arm/translate-a64.c
@@ -10891,9 +10891,26 @@ static void disas_simd_3same_float(DisasContext *s, uint32_t insn)
         if (!fp_access_check(s)) {
             return;
         }
-
         handle_3same_float(s, size, elements, fpopcode, rd, rn, rm);
         return;
+
+    case 0x1d: /* FMLAL  */
+    case 0x3d: /* FMLSL  */
+    case 0x59: /* FMLAL2 */
+    case 0x79: /* FMLSL2 */
+        if (size & 1 || !dc_isar_feature(aa64_fhm, s)) {
+            unallocated_encoding(s);
+            return;
+        }
+        if (fp_access_check(s)) {
+            gen_gvec_op3_fpst(s, is_q, rd, rn, rm, false,
+                              extract32(insn, 29, 1),
+                              extract32(insn, 23, 1)
+                              ? gen_helper_gvec_fmlsl_h
+                              : gen_helper_gvec_fmlal_h);
+        }
+        return;
+
     default:
         unallocated_encoding(s);
         return;
@@ -12724,6 +12741,17 @@ static void disas_simd_indexed(DisasContext *s, uint32_t insn)
         }
         is_fp = 2;
         break;
+    case 0x00: /* FMLAL */
+    case 0x04: /* FMLSL */
+    case 0x18: /* FMLAL2 */
+    case 0x1c: /* FMLSL2 */
+        if (is_scalar || size != MO_32 || !dc_isar_feature(aa64_fhm, s)) {
+            unallocated_encoding(s);
+            return;
+        }
+        size = MO_16;
+        is_fp = 3;
+        break;
     default:
         unallocated_encoding(s);
         return;
@@ -12765,6 +12793,9 @@ static void disas_simd_indexed(DisasContext *s, uint32_t insn)
         }
         break;
 
+    case 3: /* other fp, size already set and verified. */
+        break;
+
     default: /* integer */
         switch (size) {
         case MO_8:
@@ -12834,6 +12865,22 @@ static void disas_simd_indexed(DisasContext *s, uint32_t insn)
             tcg_temp_free_ptr(fpst);
         }
         return;
+
+    case 0x00: /* FMLAL */
+    case 0x04: /* FMLSL */
+    case 0x18: /* FMLAL2 */
+    case 0x1c: /* FMLSL2 */
+        {
+            int data = (index << 1) | u;
+            tcg_gen_gvec_3_ptr(vec_full_reg_offset(s, rd),
+                               vec_full_reg_offset(s, rn),
+                               vec_full_reg_offset(s, rm), fpst,
+                               is_q ? 16 : 8, vec_full_reg_size(s), data,
+                               opcode & 4 ? gen_helper_gvec_fmlsl_idx_h
+                               :  gen_helper_gvec_fmlal_idx_h);
+            tcg_temp_free_ptr(fpst);
+        }
+        return;
     }
 
     if (size == 3) {
-- 
2.17.2

^ permalink raw reply related	[flat|nested] 10+ messages in thread

* [Qemu-devel] [PATCH 3/4] target/arm: Implement VFMAL and VFMSL for aarch32
  2019-02-14  3:43 [Qemu-devel] [PATCH 0/4] target/arm: Implement ARMv8.2-FHM Richard Henderson
  2019-02-14  3:43 ` [Qemu-devel] [PATCH 1/4] target/arm: Add helpers for FMLAL and FMLSL Richard Henderson
  2019-02-14  3:43 ` [Qemu-devel] [PATCH 2/4] target/arm: Implement FMLAL and FMLSL for aarch64 Richard Henderson
@ 2019-02-14  3:43 ` Richard Henderson
  2019-02-14  3:43 ` [Qemu-devel] [PATCH 4/4] target/arm: Enable ARMv8.2-FHM for -cpu max Richard Henderson
                   ` (2 subsequent siblings)
  5 siblings, 0 replies; 10+ messages in thread
From: Richard Henderson @ 2019-02-14  3:43 UTC (permalink / raw)
  To: qemu-devel; +Cc: peter.maydell, qemu-arm

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 target/arm/cpu.h       |   5 ++
 target/arm/translate.c | 104 +++++++++++++++++++++++++++++------------
 2 files changed, 80 insertions(+), 29 deletions(-)

diff --git a/target/arm/cpu.h b/target/arm/cpu.h
index 15085a94ff..84d24044fe 100644
--- a/target/arm/cpu.h
+++ b/target/arm/cpu.h
@@ -3232,6 +3232,11 @@ static inline bool isar_feature_aa32_dp(const ARMISARegisters *id)
     return FIELD_EX32(id->id_isar6, ID_ISAR6, DP) != 0;
 }
 
+static inline bool isar_feature_aa32_fhm(const ARMISARegisters *id)
+{
+    return FIELD_EX32(id->id_isar6, ID_ISAR6, FHM) != 0;
+}
+
 static inline bool isar_feature_aa32_fp16_arith(const ARMISARegisters *id)
 {
     /*
diff --git a/target/arm/translate.c b/target/arm/translate.c
index 66cf28c8cb..0ed4768080 100644
--- a/target/arm/translate.c
+++ b/target/arm/translate.c
@@ -8236,15 +8236,8 @@ static int disas_neon_insn_3same_ext(DisasContext *s, uint32_t insn)
     gen_helper_gvec_3_ptr *fn_gvec_ptr = NULL;
     int rd, rn, rm, opr_sz;
     int data = 0;
-    bool q;
-
-    q = extract32(insn, 6, 1);
-    VFP_DREG_D(rd, insn);
-    VFP_DREG_N(rn, insn);
-    VFP_DREG_M(rm, insn);
-    if ((rd | rn | rm) & q) {
-        return 1;
-    }
+    int off_rn, off_rm;
+    bool is_long = false, q = extract32(insn, 6, 1);
 
     if ((insn & 0xfe200f10) == 0xfc200800) {
         /* VCMLA -- 1111 110R R.1S .... .... 1000 ...0 .... */
@@ -8271,10 +8264,38 @@ static int disas_neon_insn_3same_ext(DisasContext *s, uint32_t insn)
             return 1;
         }
         fn_gvec = u ? gen_helper_gvec_udot_b : gen_helper_gvec_sdot_b;
+    } else if ((insn & 0xff300f10) == 0xfc200810) {
+        /* VFM[AS]L -- 1111 1100 S.10 .... .... 1000 .Q.1 .... */
+        int sub = extract32(insn, 23, 1);
+        if (!dc_isar_feature(aa32_fhm, s)) {
+            return 1;
+        }
+        is_long = true;
+        fn_gvec_ptr = sub ? gen_helper_gvec_fmlsl_h : gen_helper_gvec_fmlal_h;
+        data = 0; /* is_2 == 0 */
     } else {
         return 1;
     }
 
+    VFP_DREG_D(rd, insn);
+    if (rd & q) {
+        return 1;
+    }
+    if (q || !is_long) {
+        VFP_DREG_N(rn, insn);
+        VFP_DREG_M(rm, insn);
+        if ((rn | rm) & q & !is_long) {
+            return 1;
+        }
+        off_rn = vfp_reg_offset(1, rn);
+        off_rm = vfp_reg_offset(1, rm);
+    } else {
+        rn = VFP_SREG_N(insn);
+        rm = VFP_SREG_M(insn);
+        off_rn = vfp_reg_offset(0, rn);
+        off_rm = vfp_reg_offset(0, rm);
+    }
+
     if (s->fp_excp_el) {
         gen_exception_insn(s, 4, EXCP_UDEF,
                            syn_simd_access_trap(1, 0xe, false), s->fp_excp_el);
@@ -8287,15 +8308,11 @@ static int disas_neon_insn_3same_ext(DisasContext *s, uint32_t insn)
     opr_sz = (1 + q) * 8;
     if (fn_gvec_ptr) {
         TCGv_ptr fpst = get_fpstatus_ptr(1);
-        tcg_gen_gvec_3_ptr(vfp_reg_offset(1, rd),
-                           vfp_reg_offset(1, rn),
-                           vfp_reg_offset(1, rm), fpst,
+        tcg_gen_gvec_3_ptr(vfp_reg_offset(1, rd), off_rn, off_rm, fpst,
                            opr_sz, opr_sz, data, fn_gvec_ptr);
         tcg_temp_free_ptr(fpst);
     } else {
-        tcg_gen_gvec_3_ool(vfp_reg_offset(1, rd),
-                           vfp_reg_offset(1, rn),
-                           vfp_reg_offset(1, rm),
+        tcg_gen_gvec_3_ool(vfp_reg_offset(1, rd), off_rn, off_rm,
                            opr_sz, opr_sz, data, fn_gvec);
     }
     return 0;
@@ -8314,14 +8331,8 @@ static int disas_neon_insn_2reg_scalar_ext(DisasContext *s, uint32_t insn)
     gen_helper_gvec_3 *fn_gvec = NULL;
     gen_helper_gvec_3_ptr *fn_gvec_ptr = NULL;
     int rd, rn, rm, opr_sz, data;
-    bool q;
-
-    q = extract32(insn, 6, 1);
-    VFP_DREG_D(rd, insn);
-    VFP_DREG_N(rn, insn);
-    if ((rd | rn) & q) {
-        return 1;
-    }
+    int off_rn, off_rm;
+    bool is_long = false, q = extract32(insn, 6, 1);
 
     if ((insn & 0xff000f10) == 0xfe000800) {
         /* VCMLA (indexed) -- 1111 1110 S.RR .... .... 1000 ...0 .... */
@@ -8350,6 +8361,7 @@ static int disas_neon_insn_2reg_scalar_ext(DisasContext *s, uint32_t insn)
     } else if ((insn & 0xffb00f00) == 0xfe200d00) {
         /* V[US]DOT -- 1111 1110 0.10 .... .... 1101 .Q.U .... */
         int u = extract32(insn, 4, 1);
+
         if (!dc_isar_feature(aa32_dp, s)) {
             return 1;
         }
@@ -8357,10 +8369,48 @@ static int disas_neon_insn_2reg_scalar_ext(DisasContext *s, uint32_t insn)
         /* rm is just Vm, and index is M.  */
         data = extract32(insn, 5, 1); /* index */
         rm = extract32(insn, 0, 4);
+    } else if ((insn & 0xffa00f10) == 0xfe000810) {
+        /* VFM[AS]L -- 1111 1110 0.0S .... .... 1000 .Q.1 .... */
+        int sub = extract32(insn, 20, 1);
+        int vm20 = extract32(insn, 0, 3);
+        int vm3 = extract32(insn, 3, 1);
+        int m = extract32(insn, 5, 1);
+        int index;
+
+        if (!dc_isar_feature(aa32_fhm, s)) {
+            return 1;
+        }
+        if (q) {
+            rm = vm20;
+            index = m * 2 + vm3;
+        } else {
+            rm = vm20 * 2 + m;
+            index = vm3;
+        }
+        is_long = true;
+        data = index << 1; /* is_2 == 0 */
+        fn_gvec_ptr = (sub ? gen_helper_gvec_fmlsl_idx_h
+                       : gen_helper_gvec_fmlal_idx_h);
     } else {
         return 1;
     }
 
+    VFP_DREG_D(rd, insn);
+    if (rd & q) {
+        return 1;
+    }
+    if (q || !is_long) {
+        VFP_DREG_N(rn, insn);
+        if (rn & q & !is_long) {
+            return 1;
+        }
+        off_rn = vfp_reg_offset(1, rn);
+        off_rm = vfp_reg_offset(1, rm);
+    } else {
+        rn = VFP_SREG_N(insn);
+        off_rn = vfp_reg_offset(0, rn);
+        off_rm = vfp_reg_offset(0, rm);
+    }
     if (s->fp_excp_el) {
         gen_exception_insn(s, 4, EXCP_UDEF,
                            syn_simd_access_trap(1, 0xe, false), s->fp_excp_el);
@@ -8373,15 +8423,11 @@ static int disas_neon_insn_2reg_scalar_ext(DisasContext *s, uint32_t insn)
     opr_sz = (1 + q) * 8;
     if (fn_gvec_ptr) {
         TCGv_ptr fpst = get_fpstatus_ptr(1);
-        tcg_gen_gvec_3_ptr(vfp_reg_offset(1, rd),
-                           vfp_reg_offset(1, rn),
-                           vfp_reg_offset(1, rm), fpst,
+        tcg_gen_gvec_3_ptr(vfp_reg_offset(1, rd), off_rn, off_rm, fpst,
                            opr_sz, opr_sz, data, fn_gvec_ptr);
         tcg_temp_free_ptr(fpst);
     } else {
-        tcg_gen_gvec_3_ool(vfp_reg_offset(1, rd),
-                           vfp_reg_offset(1, rn),
-                           vfp_reg_offset(1, rm),
+        tcg_gen_gvec_3_ool(vfp_reg_offset(1, rd), off_rn, off_rm,
                            opr_sz, opr_sz, data, fn_gvec);
     }
     return 0;
-- 
2.17.2

^ permalink raw reply related	[flat|nested] 10+ messages in thread

* [Qemu-devel] [PATCH 4/4] target/arm: Enable ARMv8.2-FHM for -cpu max
  2019-02-14  3:43 [Qemu-devel] [PATCH 0/4] target/arm: Implement ARMv8.2-FHM Richard Henderson
                   ` (2 preceding siblings ...)
  2019-02-14  3:43 ` [Qemu-devel] [PATCH 3/4] target/arm: Implement VFMAL and VFMSL for aarch32 Richard Henderson
@ 2019-02-14  3:43 ` Richard Henderson
  2019-02-14  4:14 ` [Qemu-devel] [PATCH 0/4] target/arm: Implement ARMv8.2-FHM no-reply
  2019-02-14  4:18 ` no-reply
  5 siblings, 0 replies; 10+ messages in thread
From: Richard Henderson @ 2019-02-14  3:43 UTC (permalink / raw)
  To: qemu-devel; +Cc: peter.maydell, qemu-arm

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 target/arm/cpu.c   | 1 +
 target/arm/cpu64.c | 2 ++
 2 files changed, 3 insertions(+)

diff --git a/target/arm/cpu.c b/target/arm/cpu.c
index edf6e0e1f1..f4aa6202f5 100644
--- a/target/arm/cpu.c
+++ b/target/arm/cpu.c
@@ -2002,6 +2002,7 @@ static void arm_max_initfn(Object *obj)
 
             t = cpu->isar.id_isar6;
             t = FIELD_DP32(t, ID_ISAR6, DP, 1);
+            t = FIELD_DP32(t, ID_ISAR6, FHM, 1);
             cpu->isar.id_isar6 = t;
 
             t = cpu->id_mmfr4;
diff --git a/target/arm/cpu64.c b/target/arm/cpu64.c
index eff0f164dd..bffce337a4 100644
--- a/target/arm/cpu64.c
+++ b/target/arm/cpu64.c
@@ -308,6 +308,7 @@ static void aarch64_max_initfn(Object *obj)
         t = FIELD_DP64(t, ID_AA64ISAR0, SM3, 1);
         t = FIELD_DP64(t, ID_AA64ISAR0, SM4, 1);
         t = FIELD_DP64(t, ID_AA64ISAR0, DP, 1);
+        t = FIELD_DP64(t, ID_AA64ISAR0, FHM, 1);
         cpu->isar.id_aa64isar0 = t;
 
         t = cpu->isar.id_aa64isar1;
@@ -345,6 +346,7 @@ static void aarch64_max_initfn(Object *obj)
 
         u = cpu->isar.id_isar6;
         u = FIELD_DP32(u, ID_ISAR6, DP, 1);
+        u = FIELD_DP32(u, ID_ISAR6, FHM, 1);
         cpu->isar.id_isar6 = u;
 
         /*
-- 
2.17.2

^ permalink raw reply related	[flat|nested] 10+ messages in thread

* Re: [Qemu-devel] [PATCH 0/4] target/arm: Implement ARMv8.2-FHM
  2019-02-14  3:43 [Qemu-devel] [PATCH 0/4] target/arm: Implement ARMv8.2-FHM Richard Henderson
                   ` (3 preceding siblings ...)
  2019-02-14  3:43 ` [Qemu-devel] [PATCH 4/4] target/arm: Enable ARMv8.2-FHM for -cpu max Richard Henderson
@ 2019-02-14  4:14 ` no-reply
  2019-02-14  4:18 ` no-reply
  5 siblings, 0 replies; 10+ messages in thread
From: no-reply @ 2019-02-14  4:14 UTC (permalink / raw)
  To: richard.henderson; +Cc: fam, qemu-devel, peter.maydell, qemu-arm

Patchew URL: https://patchew.org/QEMU/20190214034345.24722-1-richard.henderson@linaro.org/



Hi,

This series seems to have some coding style problems. See output below for
more information:

Subject: [Qemu-devel] [PATCH 0/4] target/arm: Implement ARMv8.2-FHM
Type: series
Message-id: 20190214034345.24722-1-richard.henderson@linaro.org

=== TEST SCRIPT BEGIN ===
#!/bin/bash
git config --local diff.renamelimit 0
git config --local diff.renames True
git config --local diff.algorithm histogram
./scripts/checkpatch.pl --mailback base..
=== TEST SCRIPT END ===

Updating 3c8cf5a9c21ff8782164d1def7f44bd888713384
From https://github.com/patchew-project/qemu
 * [new tag]               patchew/20190214034345.24722-1-richard.henderson@linaro.org -> patchew/20190214034345.24722-1-richard.henderson@linaro.org
Switched to a new branch 'test'
10aad4eb81 target/arm: Enable ARMv8.2-FHM for -cpu max
ff95a88ae6 target/arm: Implement VFMAL and VFMSL for aarch32
598963ad45 target/arm: Implement FMLAL and FMLSL for aarch64
4399afe237 target/arm: Add helpers for FMLAL and FMLSL

=== OUTPUT BEGIN ===
1/4 Checking commit 4399afe237f0 (target/arm: Add helpers for FMLAL and FMLSL)
ERROR: spaces required around that '*' (ctx:VxV)
#130: FILE: target/arm/vec_helper.c:856:
+        d[H4(i)] = fmlal(d[H4(i)], extract64(n_4, i*16, 16),
                                                    ^

ERROR: spaces required around that '*' (ctx:VxV)
#131: FILE: target/arm/vec_helper.c:857:
+                         extract64(m_4, i*16, 16), fpst);
                                          ^

ERROR: spaces required around that '*' (ctx:VxV)
#150: FILE: target/arm/vec_helper.c:876:
+        d[H4(i)] = fmlsl(d[H4(i)], extract64(n_4, i*16, 16),
                                                    ^

ERROR: spaces required around that '*' (ctx:VxV)
#151: FILE: target/arm/vec_helper.c:877:
+                         extract64(m_4, i*16, 16), fpst);
                                          ^

ERROR: spaces required around that '*' (ctx:VxV)
#193: FILE: target/arm/vec_helper.c:919:
+        d[H4(i)] = fmlsl(d[H4(i)], extract64(n_4, i*16, 16), m_1, fpst);
                                                    ^

total: 5 errors, 0 warnings, 172 lines checked

Patch 1/4 has style problems, please review.  If any of these errors
are false positives report them to the maintainer, see
CHECKPATCH in MAINTAINERS.

2/4 Checking commit 598963ad4504 (target/arm: Implement FMLAL and FMLSL for aarch64)
3/4 Checking commit ff95a88ae603 (target/arm: Implement VFMAL and VFMSL for aarch32)
4/4 Checking commit 10aad4eb81e8 (target/arm: Enable ARMv8.2-FHM for -cpu max)
=== OUTPUT END ===

Test command exited with code: 1


The full log is available at
http://patchew.org/logs/20190214034345.24722-1-richard.henderson@linaro.org/testing.checkpatch/?type=message.
---
Email generated automatically by Patchew [http://patchew.org/].
Please send your feedback to patchew-devel@redhat.com

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [Qemu-devel] [PATCH 0/4] target/arm: Implement ARMv8.2-FHM
  2019-02-14  3:43 [Qemu-devel] [PATCH 0/4] target/arm: Implement ARMv8.2-FHM Richard Henderson
                   ` (4 preceding siblings ...)
  2019-02-14  4:14 ` [Qemu-devel] [PATCH 0/4] target/arm: Implement ARMv8.2-FHM no-reply
@ 2019-02-14  4:18 ` no-reply
  5 siblings, 0 replies; 10+ messages in thread
From: no-reply @ 2019-02-14  4:18 UTC (permalink / raw)
  To: richard.henderson; +Cc: fam, qemu-devel, peter.maydell, qemu-arm

Patchew URL: https://patchew.org/QEMU/20190214034345.24722-1-richard.henderson@linaro.org/



Hi,

This series seems to have some coding style problems. See output below for
more information:

Subject: [Qemu-devel] [PATCH 0/4] target/arm: Implement ARMv8.2-FHM
Message-id: 20190214034345.24722-1-richard.henderson@linaro.org
Type: series

=== TEST SCRIPT BEGIN ===
#!/bin/bash
git config --local diff.renamelimit 0
git config --local diff.renames True
git config --local diff.algorithm histogram
./scripts/checkpatch.pl --mailback base..
=== TEST SCRIPT END ===

Updating 3c8cf5a9c21ff8782164d1def7f44bd888713384
From https://github.com/patchew-project/qemu
 * [new tag]         patchew/20190214034345.24722-1-richard.henderson@linaro.org -> patchew/20190214034345.24722-1-richard.henderson@linaro.org
Submodule 'capstone' (https://git.qemu.org/git/capstone.git) registered for path 'capstone'
Submodule 'dtc' (https://git.qemu.org/git/dtc.git) registered for path 'dtc'
Submodule 'roms/QemuMacDrivers' (https://git.qemu.org/git/QemuMacDrivers.git) registered for path 'roms/QemuMacDrivers'
Submodule 'roms/SLOF' (https://git.qemu.org/git/SLOF.git) registered for path 'roms/SLOF'
Submodule 'roms/ipxe' (https://git.qemu.org/git/ipxe.git) registered for path 'roms/ipxe'
Submodule 'roms/openbios' (https://git.qemu.org/git/openbios.git) registered for path 'roms/openbios'
Submodule 'roms/openhackware' (https://git.qemu.org/git/openhackware.git) registered for path 'roms/openhackware'
Submodule 'roms/qemu-palcode' (https://git.qemu.org/git/qemu-palcode.git) registered for path 'roms/qemu-palcode'
Submodule 'roms/seabios' (https://git.qemu.org/git/seabios.git/) registered for path 'roms/seabios'
Submodule 'roms/seabios-hppa' (https://github.com/hdeller/seabios-hppa.git) registered for path 'roms/seabios-hppa'
Submodule 'roms/sgabios' (https://git.qemu.org/git/sgabios.git) registered for path 'roms/sgabios'
Submodule 'roms/skiboot' (https://git.qemu.org/git/skiboot.git) registered for path 'roms/skiboot'
Submodule 'roms/u-boot' (https://git.qemu.org/git/u-boot.git) registered for path 'roms/u-boot'
Submodule 'roms/u-boot-sam460ex' (https://git.qemu.org/git/u-boot-sam460ex.git) registered for path 'roms/u-boot-sam460ex'
Submodule 'tests/fp/berkeley-softfloat-3' (https://github.com/cota/berkeley-softfloat-3) registered for path 'tests/fp/berkeley-softfloat-3'
Submodule 'tests/fp/berkeley-testfloat-3' (https://github.com/cota/berkeley-testfloat-3) registered for path 'tests/fp/berkeley-testfloat-3'
Submodule 'ui/keycodemapdb' (https://git.qemu.org/git/keycodemapdb.git) registered for path 'ui/keycodemapdb'
Cloning into 'capstone'...
Submodule path 'capstone': checked out '22ead3e0bfdb87516656453336160e0a37b066bf'
Cloning into 'dtc'...
Submodule path 'dtc': checked out '88f18909db731a627456f26d779445f84e449536'
Cloning into 'roms/QemuMacDrivers'...
Submodule path 'roms/QemuMacDrivers': checked out '90c488d5f4a407342247b9ea869df1c2d9c8e266'
Cloning into 'roms/SLOF'...
Submodule path 'roms/SLOF': checked out 'a5b428e1c1eae703bdd62a3f527223c291ee3fdc'
Cloning into 'roms/ipxe'...
Submodule path 'roms/ipxe': checked out 'de4565cbe76ea9f7913a01f331be3ee901bb6e17'
Cloning into 'roms/openbios'...
Submodule path 'roms/openbios': checked out '441a84d3a642a10b948369c63f32367e8ff6395b'
Cloning into 'roms/openhackware'...
Submodule path 'roms/openhackware': checked out 'c559da7c8eec5e45ef1f67978827af6f0b9546f5'
Cloning into 'roms/qemu-palcode'...
Submodule path 'roms/qemu-palcode': checked out '51c237d7e20d05100eacadee2f61abc17e6bc097'
Cloning into 'roms/seabios'...
Submodule path 'roms/seabios': checked out 'a698c8995ffb2838296ec284fe3c4ad33dfca307'
Cloning into 'roms/seabios-hppa'...
Submodule path 'roms/seabios-hppa': checked out '1ef99a01572c2581c30e16e6fe69e9ea2ef92ce0'
Cloning into 'roms/sgabios'...
Submodule path 'roms/sgabios': checked out 'cbaee52287e5f32373181cff50a00b6c4ac9015a'
Cloning into 'roms/skiboot'...
Submodule path 'roms/skiboot': checked out 'e0ee24c27a172bcf482f6f2bc905e6211c134bcc'
Cloning into 'roms/u-boot'...
Submodule path 'roms/u-boot': checked out 'd85ca029f257b53a96da6c2fb421e78a003a9943'
Cloning into 'roms/u-boot-sam460ex'...
Submodule path 'roms/u-boot-sam460ex': checked out '60b3916f33e617a815973c5a6df77055b2e3a588'
Cloning into 'tests/fp/berkeley-softfloat-3'...
Submodule path 'tests/fp/berkeley-softfloat-3': checked out 'b64af41c3276f97f0e181920400ee056b9c88037'
Cloning into 'tests/fp/berkeley-testfloat-3'...
Submodule path 'tests/fp/berkeley-testfloat-3': checked out '5a59dcec19327396a011a17fd924aed4fec416b3'
Cloning into 'ui/keycodemapdb'...
Submodule path 'ui/keycodemapdb': checked out '6b3d716e2b6472eb7189d3220552280ef3d832ce'
Switched to a new branch 'test'
10aad4e target/arm: Enable ARMv8.2-FHM for -cpu max
ff95a88 target/arm: Implement VFMAL and VFMSL for aarch32
598963a target/arm: Implement FMLAL and FMLSL for aarch64
4399afe target/arm: Add helpers for FMLAL and FMLSL

=== OUTPUT BEGIN ===
1/4 Checking commit 4399afe237f0 (target/arm: Add helpers for FMLAL and FMLSL)
ERROR: spaces required around that '*' (ctx:VxV)
#130: FILE: target/arm/vec_helper.c:856:
+        d[H4(i)] = fmlal(d[H4(i)], extract64(n_4, i*16, 16),
                                                    ^

ERROR: spaces required around that '*' (ctx:VxV)
#131: FILE: target/arm/vec_helper.c:857:
+                         extract64(m_4, i*16, 16), fpst);
                                          ^

ERROR: spaces required around that '*' (ctx:VxV)
#150: FILE: target/arm/vec_helper.c:876:
+        d[H4(i)] = fmlsl(d[H4(i)], extract64(n_4, i*16, 16),
                                                    ^

ERROR: spaces required around that '*' (ctx:VxV)
#151: FILE: target/arm/vec_helper.c:877:
+                         extract64(m_4, i*16, 16), fpst);
                                          ^

ERROR: spaces required around that '*' (ctx:VxV)
#193: FILE: target/arm/vec_helper.c:919:
+        d[H4(i)] = fmlsl(d[H4(i)], extract64(n_4, i*16, 16), m_1, fpst);
                                                    ^

total: 5 errors, 0 warnings, 172 lines checked

Patch 1/4 has style problems, please review.  If any of these errors
are false positives report them to the maintainer, see
CHECKPATCH in MAINTAINERS.

2/4 Checking commit 598963ad4504 (target/arm: Implement FMLAL and FMLSL for aarch64)
3/4 Checking commit ff95a88ae603 (target/arm: Implement VFMAL and VFMSL for aarch32)
4/4 Checking commit 10aad4eb81e8 (target/arm: Enable ARMv8.2-FHM for -cpu max)
=== OUTPUT END ===

Test command exited with code: 1


The full log is available at
http://patchew.org/logs/20190214034345.24722-1-richard.henderson@linaro.org/testing.checkpatch/?type=message.
---
Email generated automatically by Patchew [http://patchew.org/].
Please send your feedback to patchew-devel@redhat.com

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [Qemu-devel] [PATCH 1/4] target/arm: Add helpers for FMLAL and FMLSL
  2019-02-14  3:43 ` [Qemu-devel] [PATCH 1/4] target/arm: Add helpers for FMLAL and FMLSL Richard Henderson
@ 2019-02-14  9:16   ` Laurent Desnogues
  2019-02-14 14:56     ` Richard Henderson
  0 siblings, 1 reply; 10+ messages in thread
From: Laurent Desnogues @ 2019-02-14  9:16 UTC (permalink / raw)
  To: Richard Henderson; +Cc: qemu-devel, Peter Maydell, qemu-arm

Hello,

On Thu, Feb 14, 2019 at 5:00 AM Richard Henderson
<richard.henderson@linaro.org> wrote:
>
> Note that float16_to_float32 rightly squashes SNaN to QNaN.
> But of course pickNaNMulAdd, for ARM, selects SNaNs first.
> So we have to preserve SNaN long enough for the correct NaN
> to be selected.  Thus float16_to_float32_by_bits.
>
> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
> ---
>  target/arm/helper.h     |   9 +++
>  target/arm/vec_helper.c | 154 ++++++++++++++++++++++++++++++++++++++++
>  2 files changed, 163 insertions(+)
>
> diff --git a/target/arm/helper.h b/target/arm/helper.h
> index 53a38188c6..0302e13604 100644
> --- a/target/arm/helper.h
> +++ b/target/arm/helper.h
> @@ -653,6 +653,15 @@ DEF_HELPER_FLAGS_6(gvec_fmla_idx_s, TCG_CALL_NO_RWG,
>  DEF_HELPER_FLAGS_6(gvec_fmla_idx_d, TCG_CALL_NO_RWG,
>                     void, ptr, ptr, ptr, ptr, ptr, i32)
>
> +DEF_HELPER_FLAGS_5(gvec_fmlal_h, TCG_CALL_NO_RWG,
> +                   void, ptr, ptr, ptr, ptr, i32)
> +DEF_HELPER_FLAGS_5(gvec_fmlsl_h, TCG_CALL_NO_RWG,
> +                   void, ptr, ptr, ptr, ptr, i32)
> +DEF_HELPER_FLAGS_5(gvec_fmlal_idx_h, TCG_CALL_NO_RWG,
> +                   void, ptr, ptr, ptr, ptr, i32)
> +DEF_HELPER_FLAGS_5(gvec_fmlsl_idx_h, TCG_CALL_NO_RWG,
> +                   void, ptr, ptr, ptr, ptr, i32)
> +
>  #ifdef TARGET_AARCH64
>  #include "helper-a64.h"
>  #include "helper-sve.h"
> diff --git a/target/arm/vec_helper.c b/target/arm/vec_helper.c
> index 37f338732e..0c3b3de961 100644
> --- a/target/arm/vec_helper.c
> +++ b/target/arm/vec_helper.c
> @@ -766,3 +766,157 @@ DO_FMLA_IDX(gvec_fmla_idx_s, float32, H4)
>  DO_FMLA_IDX(gvec_fmla_idx_d, float64, )
>
>  #undef DO_FMLA_IDX
> +
> +/*
> + * Convert float16 to float32, raising no exceptions and
> + * preserving exceptional values, including SNaN.
> + * This is effectively an unpack+repack operation.
> + */
> +static float32 float16_to_float32_by_bits(uint32_t f16)
> +{
> +    const int f16_bias = 15;
> +    const int f32_bias = 127;
> +    uint32_t sign = extract32(f16, 15, 1);
> +    uint32_t exp = extract32(f16, 10, 5);
> +    uint32_t frac = extract32(f16, 0, 10);
> +
> +    if (exp == 0x1f) {
> +        /* Inf or NaN */
> +        exp = 0xff;
> +    } else if (exp == 0) {
> +        /* Zero or denormal.  */
> +        if (frac != 0) {
> +            /*
> +             * Denormal; these are all normal float32.
> +             * Shift the fraction so that the msb is at bit 10,
> +             * then remove bit 10 as the implicit bit of the
> +             * normalized float32.  Note that we still go through
> +             * the shift for normal numbers below, to put the
> +             * float32 fraction at the right place.
> +             */
> +            int shift = clz32(frac) - 21;
> +            frac = (frac << shift) & 0x3ff;
> +            exp = f32_bias - f16_bias - shift + 1;

If FZ16 is set, this should flush to zero.

This means you will have to use both fp_status (for the muladd) and
fp_status_f16 (for this function) and so you should pass cpu_env to
the helpers rather than the fp_status.

Thanks,

Laurent

> +        }
> +    } else {
> +        /* Normal number; adjust the bias.  */
> +        exp += f32_bias - f16_bias;
> +    }
> +    sign <<= 31;
> +    exp <<= 23;
> +    frac <<= 23 - 10;
> +
> +    return sign | exp | frac;
> +}
> +
> +static float32 fmlal(float32 a, float16 n16, float16 m16, float_status *fpst)
> +{
> +    float32 n = float16_to_float32_by_bits(n16);
> +    float32 m = float16_to_float32_by_bits(m16);
> +    return float32_muladd(n, m, a, 0, fpst);
> +}
> +
> +static float32 fmlsl(float32 a, float16 n16, float16 m16, float_status *fpst)
> +{
> +    float32 n = float16_to_float32_by_bits(n16);
> +    float32 m = float16_to_float32_by_bits(m16);
> +    return float32_muladd(float32_chs(n), m, a, 0, fpst);
> +}
> +
> +static inline uint64_t load4_f16(uint64_t *ptr, int is_q, int is_2)
> +{
> +    /*
> +     * Branchless load of u32[0], u64[0], u32[1], or u64[1].
> +     * Load the 2nd qword iff is_q & is_2.
> +     * Shift to the 2nd dword iff !is_q & is_2.
> +     * For !is_q & !is_2, the upper bits of the result are garbage.
> +     */
> +    return ptr[is_q & is_2] >> ((is_2 & ~is_q) << 5);
> +}
> +
> +/*
> + * Note that FMLAL and FMLSL require oprsz == 8 or oprsz == 16,
> + * as there are not yet SVE versions that might use blocking.
> + */
> +
> +void HELPER(gvec_fmlal_h)(void *vd, void *vn, void *vm,
> +                          void *fpst, uint32_t desc)
> +{
> +    intptr_t i, oprsz = simd_oprsz(desc);
> +    int is_2 = extract32(desc, SIMD_DATA_SHIFT, 1);
> +    int is_q = oprsz == 16;
> +    float32 *d = vd;
> +    uint64_t n_4, m_4;
> +
> +    /* Pre-load all of the f16 data, avoiding overlap issues.  */
> +    n_4 = load4_f16(vn, is_q, is_2);
> +    m_4 = load4_f16(vm, is_q, is_2);
> +
> +    for (i = 0; i < oprsz / 4; i++) {
> +        d[H4(i)] = fmlal(d[H4(i)], extract64(n_4, i*16, 16),
> +                         extract64(m_4, i*16, 16), fpst);
> +    }
> +    clear_tail(d, oprsz, simd_maxsz(desc));
> +}
> +
> +void HELPER(gvec_fmlsl_h)(void *vd, void *vn, void *vm,
> +                          void *fpst, uint32_t desc)
> +{
> +    intptr_t i, oprsz = simd_oprsz(desc);
> +    int is_2 = extract32(desc, SIMD_DATA_SHIFT, 1);
> +    int is_q = oprsz == 16;
> +    float32 *d = vd;
> +    uint64_t n_4, m_4;
> +
> +    /* Pre-load all of the f16 data, avoiding overlap issues.  */
> +    n_4 = load4_f16(vn, is_q, is_2);
> +    m_4 = load4_f16(vm, is_q, is_2);
> +
> +    for (i = 0; i < oprsz / 4; i++) {
> +        d[H4(i)] = fmlsl(d[H4(i)], extract64(n_4, i*16, 16),
> +                         extract64(m_4, i*16, 16), fpst);
> +    }
> +    clear_tail(d, oprsz, simd_maxsz(desc));
> +}
> +
> +void HELPER(gvec_fmlal_idx_h)(void *vd, void *vn, void *vm,
> +                              void *fpst, uint32_t desc)
> +{
> +    intptr_t i, oprsz = simd_oprsz(desc);
> +    int is_2 = extract32(desc, SIMD_DATA_SHIFT, 1);
> +    int index = extract32(desc, SIMD_DATA_SHIFT + 1, 3);
> +    int is_q = oprsz == 16;
> +    float32 *d = vd;
> +    uint64_t n_4;
> +    float16 m_1;
> +
> +    /* Pre-load all of the f16 data, avoiding overlap issues.  */
> +    n_4 = load4_f16(vn, is_q, is_2);
> +    m_1 = ((float16 *)vm)[H2(index)];
> +
> +    for (i = 0; i < oprsz / 4; i++) {
> +        d[H4(i)] = fmlal(d[H4(i)], extract64(n_4, i * 16, 16), m_1, fpst);
> +    }
> +    clear_tail(d, oprsz, simd_maxsz(desc));
> +}
> +
> +void HELPER(gvec_fmlsl_idx_h)(void *vd, void *vn, void *vm,
> +                              void *fpst, uint32_t desc)
> +{
> +    intptr_t i, oprsz = simd_oprsz(desc);
> +    int is_2 = extract32(desc, SIMD_DATA_SHIFT, 1);
> +    int index = extract32(desc, SIMD_DATA_SHIFT + 1, 3);
> +    int is_q = oprsz == 16;
> +    float32 *d = vd;
> +    uint64_t n_4;
> +    float16 m_1;
> +
> +    /* Pre-load all of the f16 data, avoiding overlap issues.  */
> +    n_4 = load4_f16(vn, is_q, is_2);
> +    m_1 = ((float16 *)vm)[H2(index)];
> +
> +    for (i = 0; i < oprsz / 4; i++) {
> +        d[H4(i)] = fmlsl(d[H4(i)], extract64(n_4, i*16, 16), m_1, fpst);
> +    }
> +    clear_tail(d, oprsz, simd_maxsz(desc));
> +}
> --
> 2.17.2
>
>

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [Qemu-devel] [PATCH 1/4] target/arm: Add helpers for FMLAL and FMLSL
  2019-02-14  9:16   ` Laurent Desnogues
@ 2019-02-14 14:56     ` Richard Henderson
  2019-02-14 14:58       ` Laurent Desnogues
  0 siblings, 1 reply; 10+ messages in thread
From: Richard Henderson @ 2019-02-14 14:56 UTC (permalink / raw)
  To: Laurent Desnogues; +Cc: qemu-devel, Peter Maydell, qemu-arm

On 2/14/19 1:16 AM, Laurent Desnogues wrote:
> Hello,
> 
> On Thu, Feb 14, 2019 at 5:00 AM Richard Henderson
> <richard.henderson@linaro.org> wrote:
>>
>> Note that float16_to_float32 rightly squashes SNaN to QNaN.
>> But of course pickNaNMulAdd, for ARM, selects SNaNs first.
>> So we have to preserve SNaN long enough for the correct NaN
>> to be selected.  Thus float16_to_float32_by_bits.
>>
>> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
>> ---
>>  target/arm/helper.h     |   9 +++
>>  target/arm/vec_helper.c | 154 ++++++++++++++++++++++++++++++++++++++++
>>  2 files changed, 163 insertions(+)
>>
>> diff --git a/target/arm/helper.h b/target/arm/helper.h
>> index 53a38188c6..0302e13604 100644
>> --- a/target/arm/helper.h
>> +++ b/target/arm/helper.h
>> @@ -653,6 +653,15 @@ DEF_HELPER_FLAGS_6(gvec_fmla_idx_s, TCG_CALL_NO_RWG,
>>  DEF_HELPER_FLAGS_6(gvec_fmla_idx_d, TCG_CALL_NO_RWG,
>>                     void, ptr, ptr, ptr, ptr, ptr, i32)
>>
>> +DEF_HELPER_FLAGS_5(gvec_fmlal_h, TCG_CALL_NO_RWG,
>> +                   void, ptr, ptr, ptr, ptr, i32)
>> +DEF_HELPER_FLAGS_5(gvec_fmlsl_h, TCG_CALL_NO_RWG,
>> +                   void, ptr, ptr, ptr, ptr, i32)
>> +DEF_HELPER_FLAGS_5(gvec_fmlal_idx_h, TCG_CALL_NO_RWG,
>> +                   void, ptr, ptr, ptr, ptr, i32)
>> +DEF_HELPER_FLAGS_5(gvec_fmlsl_idx_h, TCG_CALL_NO_RWG,
>> +                   void, ptr, ptr, ptr, ptr, i32)
>> +
>>  #ifdef TARGET_AARCH64
>>  #include "helper-a64.h"
>>  #include "helper-sve.h"
>> diff --git a/target/arm/vec_helper.c b/target/arm/vec_helper.c
>> index 37f338732e..0c3b3de961 100644
>> --- a/target/arm/vec_helper.c
>> +++ b/target/arm/vec_helper.c
>> @@ -766,3 +766,157 @@ DO_FMLA_IDX(gvec_fmla_idx_s, float32, H4)
>>  DO_FMLA_IDX(gvec_fmla_idx_d, float64, )
>>
>>  #undef DO_FMLA_IDX
>> +
>> +/*
>> + * Convert float16 to float32, raising no exceptions and
>> + * preserving exceptional values, including SNaN.
>> + * This is effectively an unpack+repack operation.
>> + */
>> +static float32 float16_to_float32_by_bits(uint32_t f16)
>> +{
>> +    const int f16_bias = 15;
>> +    const int f32_bias = 127;
>> +    uint32_t sign = extract32(f16, 15, 1);
>> +    uint32_t exp = extract32(f16, 10, 5);
>> +    uint32_t frac = extract32(f16, 0, 10);
>> +
>> +    if (exp == 0x1f) {
>> +        /* Inf or NaN */
>> +        exp = 0xff;
>> +    } else if (exp == 0) {
>> +        /* Zero or denormal.  */
>> +        if (frac != 0) {
>> +            /*
>> +             * Denormal; these are all normal float32.
>> +             * Shift the fraction so that the msb is at bit 10,
>> +             * then remove bit 10 as the implicit bit of the
>> +             * normalized float32.  Note that we still go through
>> +             * the shift for normal numbers below, to put the
>> +             * float32 fraction at the right place.
>> +             */
>> +            int shift = clz32(frac) - 21;
>> +            frac = (frac << shift) & 0x3ff;
>> +            exp = f32_bias - f16_bias - shift + 1;
> 
> If FZ16 is set, this should flush to zero.

Ho, hum, yes it should.

> This means you will have to use both fp_status (for the muladd) and
> fp_status_f16 (for this function) and so you should pass cpu_env to
> the helpers rather than the fp_status.

It's not quite as simple as that, because aa32 mode would pass
standard_fp_status.  I'll figure something out...


r~

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [Qemu-devel] [PATCH 1/4] target/arm: Add helpers for FMLAL and FMLSL
  2019-02-14 14:56     ` Richard Henderson
@ 2019-02-14 14:58       ` Laurent Desnogues
  0 siblings, 0 replies; 10+ messages in thread
From: Laurent Desnogues @ 2019-02-14 14:58 UTC (permalink / raw)
  To: Richard Henderson; +Cc: qemu-devel, Peter Maydell, qemu-arm

On Thu, Feb 14, 2019 at 3:56 PM Richard Henderson
<richard.henderson@linaro.org> wrote:
>
> On 2/14/19 1:16 AM, Laurent Desnogues wrote:
> > Hello,
> >
> > On Thu, Feb 14, 2019 at 5:00 AM Richard Henderson
> > <richard.henderson@linaro.org> wrote:
> >>
> >> Note that float16_to_float32 rightly squashes SNaN to QNaN.
> >> But of course pickNaNMulAdd, for ARM, selects SNaNs first.
> >> So we have to preserve SNaN long enough for the correct NaN
> >> to be selected.  Thus float16_to_float32_by_bits.
> >>
> >> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
> >> ---
> >>  target/arm/helper.h     |   9 +++
> >>  target/arm/vec_helper.c | 154 ++++++++++++++++++++++++++++++++++++++++
> >>  2 files changed, 163 insertions(+)
> >>
> >> diff --git a/target/arm/helper.h b/target/arm/helper.h
> >> index 53a38188c6..0302e13604 100644
> >> --- a/target/arm/helper.h
> >> +++ b/target/arm/helper.h
> >> @@ -653,6 +653,15 @@ DEF_HELPER_FLAGS_6(gvec_fmla_idx_s, TCG_CALL_NO_RWG,
> >>  DEF_HELPER_FLAGS_6(gvec_fmla_idx_d, TCG_CALL_NO_RWG,
> >>                     void, ptr, ptr, ptr, ptr, ptr, i32)
> >>
> >> +DEF_HELPER_FLAGS_5(gvec_fmlal_h, TCG_CALL_NO_RWG,
> >> +                   void, ptr, ptr, ptr, ptr, i32)
> >> +DEF_HELPER_FLAGS_5(gvec_fmlsl_h, TCG_CALL_NO_RWG,
> >> +                   void, ptr, ptr, ptr, ptr, i32)
> >> +DEF_HELPER_FLAGS_5(gvec_fmlal_idx_h, TCG_CALL_NO_RWG,
> >> +                   void, ptr, ptr, ptr, ptr, i32)
> >> +DEF_HELPER_FLAGS_5(gvec_fmlsl_idx_h, TCG_CALL_NO_RWG,
> >> +                   void, ptr, ptr, ptr, ptr, i32)
> >> +
> >>  #ifdef TARGET_AARCH64
> >>  #include "helper-a64.h"
> >>  #include "helper-sve.h"
> >> diff --git a/target/arm/vec_helper.c b/target/arm/vec_helper.c
> >> index 37f338732e..0c3b3de961 100644
> >> --- a/target/arm/vec_helper.c
> >> +++ b/target/arm/vec_helper.c
> >> @@ -766,3 +766,157 @@ DO_FMLA_IDX(gvec_fmla_idx_s, float32, H4)
> >>  DO_FMLA_IDX(gvec_fmla_idx_d, float64, )
> >>
> >>  #undef DO_FMLA_IDX
> >> +
> >> +/*
> >> + * Convert float16 to float32, raising no exceptions and
> >> + * preserving exceptional values, including SNaN.
> >> + * This is effectively an unpack+repack operation.
> >> + */
> >> +static float32 float16_to_float32_by_bits(uint32_t f16)
> >> +{
> >> +    const int f16_bias = 15;
> >> +    const int f32_bias = 127;
> >> +    uint32_t sign = extract32(f16, 15, 1);
> >> +    uint32_t exp = extract32(f16, 10, 5);
> >> +    uint32_t frac = extract32(f16, 0, 10);
> >> +
> >> +    if (exp == 0x1f) {
> >> +        /* Inf or NaN */
> >> +        exp = 0xff;
> >> +    } else if (exp == 0) {
> >> +        /* Zero or denormal.  */
> >> +        if (frac != 0) {
> >> +            /*
> >> +             * Denormal; these are all normal float32.
> >> +             * Shift the fraction so that the msb is at bit 10,
> >> +             * then remove bit 10 as the implicit bit of the
> >> +             * normalized float32.  Note that we still go through
> >> +             * the shift for normal numbers below, to put the
> >> +             * float32 fraction at the right place.
> >> +             */
> >> +            int shift = clz32(frac) - 21;
> >> +            frac = (frac << shift) & 0x3ff;
> >> +            exp = f32_bias - f16_bias - shift + 1;
> >
> > If FZ16 is set, this should flush to zero.
>
> Ho, hum, yes it should.
>
> > This means you will have to use both fp_status (for the muladd) and
> > fp_status_f16 (for this function) and so you should pass cpu_env to
> > the helpers rather than the fp_status.
>
> It's not quite as simple as that, because aa32 mode would pass
> standard_fp_status.  I'll figure something out...

Ha yes, I only looked at AArch64... as usual :-(

Thanks,

Laurent

>
> r~

^ permalink raw reply	[flat|nested] 10+ messages in thread

end of thread, other threads:[~2019-02-14 15:13 UTC | newest]

Thread overview: 10+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2019-02-14  3:43 [Qemu-devel] [PATCH 0/4] target/arm: Implement ARMv8.2-FHM Richard Henderson
2019-02-14  3:43 ` [Qemu-devel] [PATCH 1/4] target/arm: Add helpers for FMLAL and FMLSL Richard Henderson
2019-02-14  9:16   ` Laurent Desnogues
2019-02-14 14:56     ` Richard Henderson
2019-02-14 14:58       ` Laurent Desnogues
2019-02-14  3:43 ` [Qemu-devel] [PATCH 2/4] target/arm: Implement FMLAL and FMLSL for aarch64 Richard Henderson
2019-02-14  3:43 ` [Qemu-devel] [PATCH 3/4] target/arm: Implement VFMAL and VFMSL for aarch32 Richard Henderson
2019-02-14  3:43 ` [Qemu-devel] [PATCH 4/4] target/arm: Enable ARMv8.2-FHM for -cpu max Richard Henderson
2019-02-14  4:14 ` [Qemu-devel] [PATCH 0/4] target/arm: Implement ARMv8.2-FHM no-reply
2019-02-14  4:18 ` no-reply

This is an external index of several public inboxes;
see the mirroring instructions for how to clone and mirror
all data and code used by this external index.