All of lore.kernel.org
 help / color / mirror / Atom feed
From: Peter Maydell <peter.maydell@linaro.org>
To: qemu-devel@nongnu.org
Subject: [PULL 027/114] target/arm: Split out saturating/rounding shifts from neon
Date: Tue, 25 May 2021 16:01:57 +0100	[thread overview]
Message-ID: <20210525150324.32370-28-peter.maydell@linaro.org> (raw)
In-Reply-To: <20210525150324.32370-1-peter.maydell@linaro.org>

From: Richard Henderson <richard.henderson@linaro.org>

Split these operations out into a header that can be shared
between neon and sve.  The "sat" pointer acts both as a boolean
for control of saturating behavior and controls the difference
in behavior between neon and sve -- QC bit or no QC bit.

Widen the shift operand in the new helpers, as the SVE2 insns treat
the whole input element as significant.  For the neon uses, truncate
the shift to int8_t while passing the parameter.

Implement right-shift rounding as

    tmp = src >> (shift - 1);
    dst = (tmp >> 1) + (tmp & 1);

This is the same number of instructions as the current

    tmp = 1 << (shift - 1);
    dst = (src + tmp) >> shift;

without any possibility of intermediate overflow.

Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
Message-id: 20210525010358.152808-6-richard.henderson@linaro.org
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
---
 target/arm/vec_internal.h | 138 +++++++++++
 target/arm/neon_helper.c  | 507 +++++++-------------------------------
 2 files changed, 221 insertions(+), 424 deletions(-)

diff --git a/target/arm/vec_internal.h b/target/arm/vec_internal.h
index e3eb3e7a6bc..5b78e79329d 100644
--- a/target/arm/vec_internal.h
+++ b/target/arm/vec_internal.h
@@ -30,4 +30,142 @@ static inline void clear_tail(void *vd, uintptr_t opr_sz, uintptr_t max_sz)
     }
 }
 
+static inline int32_t do_sqrshl_bhs(int32_t src, int32_t shift, int bits,
+                                    bool round, uint32_t *sat)
+{
+    if (shift <= -bits) {
+        /* Rounding the sign bit always produces 0. */
+        if (round) {
+            return 0;
+        }
+        return src >> 31;
+    } else if (shift < 0) {
+        if (round) {
+            src >>= -shift - 1;
+            return (src >> 1) + (src & 1);
+        }
+        return src >> -shift;
+    } else if (shift < bits) {
+        int32_t val = src << shift;
+        if (bits == 32) {
+            if (!sat || val >> shift == src) {
+                return val;
+            }
+        } else {
+            int32_t extval = sextract32(val, 0, bits);
+            if (!sat || val == extval) {
+                return extval;
+            }
+        }
+    } else if (!sat || src == 0) {
+        return 0;
+    }
+
+    *sat = 1;
+    return (1u << (bits - 1)) - (src >= 0);
+}
+
+static inline uint32_t do_uqrshl_bhs(uint32_t src, int32_t shift, int bits,
+                                     bool round, uint32_t *sat)
+{
+    if (shift <= -(bits + round)) {
+        return 0;
+    } else if (shift < 0) {
+        if (round) {
+            src >>= -shift - 1;
+            return (src >> 1) + (src & 1);
+        }
+        return src >> -shift;
+    } else if (shift < bits) {
+        uint32_t val = src << shift;
+        if (bits == 32) {
+            if (!sat || val >> shift == src) {
+                return val;
+            }
+        } else {
+            uint32_t extval = extract32(val, 0, bits);
+            if (!sat || val == extval) {
+                return extval;
+            }
+        }
+    } else if (!sat || src == 0) {
+        return 0;
+    }
+
+    *sat = 1;
+    return MAKE_64BIT_MASK(0, bits);
+}
+
+static inline int32_t do_suqrshl_bhs(int32_t src, int32_t shift, int bits,
+                                     bool round, uint32_t *sat)
+{
+    if (sat && src < 0) {
+        *sat = 1;
+        return 0;
+    }
+    return do_uqrshl_bhs(src, shift, bits, round, sat);
+}
+
+static inline int64_t do_sqrshl_d(int64_t src, int64_t shift,
+                                  bool round, uint32_t *sat)
+{
+    if (shift <= -64) {
+        /* Rounding the sign bit always produces 0. */
+        if (round) {
+            return 0;
+        }
+        return src >> 63;
+    } else if (shift < 0) {
+        if (round) {
+            src >>= -shift - 1;
+            return (src >> 1) + (src & 1);
+        }
+        return src >> -shift;
+    } else if (shift < 64) {
+        int64_t val = src << shift;
+        if (!sat || val >> shift == src) {
+            return val;
+        }
+    } else if (!sat || src == 0) {
+        return 0;
+    }
+
+    *sat = 1;
+    return src < 0 ? INT64_MIN : INT64_MAX;
+}
+
+static inline uint64_t do_uqrshl_d(uint64_t src, int64_t shift,
+                                   bool round, uint32_t *sat)
+{
+    if (shift <= -(64 + round)) {
+        return 0;
+    } else if (shift < 0) {
+        if (round) {
+            src >>= -shift - 1;
+            return (src >> 1) + (src & 1);
+        }
+        return src >> -shift;
+    } else if (shift < 64) {
+        uint64_t val = src << shift;
+        if (!sat || val >> shift == src) {
+            return val;
+        }
+    } else if (!sat || src == 0) {
+        return 0;
+    }
+
+    *sat = 1;
+    return UINT64_MAX;
+}
+
+static inline int64_t do_suqrshl_d(int64_t src, int64_t shift,
+                                   bool round, uint32_t *sat)
+{
+    if (sat && src < 0) {
+        *sat = 1;
+        return 0;
+    }
+    return do_uqrshl_d(src, shift, round, sat);
+}
+
 #endif /* TARGET_ARM_VEC_INTERNALS_H */
diff --git a/target/arm/neon_helper.c b/target/arm/neon_helper.c
index b637265691a..338b9189d5b 100644
--- a/target/arm/neon_helper.c
+++ b/target/arm/neon_helper.c
@@ -11,6 +11,7 @@
 #include "cpu.h"
 #include "exec/helper-proto.h"
 #include "fpu/softfloat.h"
+#include "vec_internal.h"
 
 #define SIGNBIT (uint32_t)0x80000000
 #define SIGNBIT64 ((uint64_t)1 << 63)
@@ -576,496 +577,154 @@ NEON_POP(pmax_s16, neon_s16, 2)
 NEON_POP(pmax_u16, neon_u16, 2)
 #undef NEON_FN
 
-#define NEON_FN(dest, src1, src2) do { \
-    int8_t tmp; \
-    tmp = (int8_t)src2; \
-    if (tmp >= (ssize_t)sizeof(src1) * 8 || \
-        tmp <= -(ssize_t)sizeof(src1) * 8) { \
-        dest = 0; \
-    } else if (tmp < 0) { \
-        dest = src1 >> -tmp; \
-    } else { \
-        dest = src1 << tmp; \
-    }} while (0)
+#define NEON_FN(dest, src1, src2) \
+    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 16, false, NULL))
 NEON_VOP(shl_u16, neon_u16, 2)
 #undef NEON_FN
 
-#define NEON_FN(dest, src1, src2) do { \
-    int8_t tmp; \
-    tmp = (int8_t)src2; \
-    if (tmp >= (ssize_t)sizeof(src1) * 8) { \
-        dest = 0; \
-    } else if (tmp <= -(ssize_t)sizeof(src1) * 8) { \
-        dest = src1 >> (sizeof(src1) * 8 - 1); \
-    } else if (tmp < 0) { \
-        dest = src1 >> -tmp; \
-    } else { \
-        dest = src1 << tmp; \
-    }} while (0)
+#define NEON_FN(dest, src1, src2) \
+    (dest = do_sqrshl_bhs(src1, (int8_t)src2, 16, false, NULL))
 NEON_VOP(shl_s16, neon_s16, 2)
 #undef NEON_FN
 
-#define NEON_FN(dest, src1, src2) do { \
-    int8_t tmp; \
-    tmp = (int8_t)src2; \
-    if ((tmp >= (ssize_t)sizeof(src1) * 8) \
-        || (tmp <= -(ssize_t)sizeof(src1) * 8)) { \
-        dest = 0; \
-    } else if (tmp < 0) { \
-        dest = (src1 + (1 << (-1 - tmp))) >> -tmp; \
-    } else { \
-        dest = src1 << tmp; \
-    }} while (0)
+#define NEON_FN(dest, src1, src2) \
+    (dest = do_sqrshl_bhs(src1, (int8_t)src2, 8, true, NULL))
 NEON_VOP(rshl_s8, neon_s8, 4)
+#undef NEON_FN
+
+#define NEON_FN(dest, src1, src2) \
+    (dest = do_sqrshl_bhs(src1, (int8_t)src2, 16, true, NULL))
 NEON_VOP(rshl_s16, neon_s16, 2)
 #undef NEON_FN
 
-/* The addition of the rounding constant may overflow, so we use an
- * intermediate 64 bit accumulator.  */
-uint32_t HELPER(neon_rshl_s32)(uint32_t valop, uint32_t shiftop)
+uint32_t HELPER(neon_rshl_s32)(uint32_t val, uint32_t shift)
 {
-    int32_t dest;
-    int32_t val = (int32_t)valop;
-    int8_t shift = (int8_t)shiftop;
-    if ((shift >= 32) || (shift <= -32)) {
-        dest = 0;
-    } else if (shift < 0) {
-        int64_t big_dest = ((int64_t)val + (1 << (-1 - shift)));
-        dest = big_dest >> -shift;
-    } else {
-        dest = val << shift;
-    }
-    return dest;
+    return do_sqrshl_bhs(val, (int8_t)shift, 32, true, NULL);
 }
 
-/* Handling addition overflow with 64 bit input values is more
- * tricky than with 32 bit values.  */
-uint64_t HELPER(neon_rshl_s64)(uint64_t valop, uint64_t shiftop)
+uint64_t HELPER(neon_rshl_s64)(uint64_t val, uint64_t shift)
 {
-    int8_t shift = (int8_t)shiftop;
-    int64_t val = valop;
-    if ((shift >= 64) || (shift <= -64)) {
-        val = 0;
-    } else if (shift < 0) {
-        val >>= (-shift - 1);
-        if (val == INT64_MAX) {
-            /* In this case, it means that the rounding constant is 1,
-             * and the addition would overflow. Return the actual
-             * result directly.  */
-            val = 0x4000000000000000LL;
-        } else {
-            val++;
-            val >>= 1;
-        }
-    } else {
-        val <<= shift;
-    }
-    return val;
+    return do_sqrshl_d(val, (int8_t)shift, true, NULL);
 }
 
-#define NEON_FN(dest, src1, src2) do { \
-    int8_t tmp; \
-    tmp = (int8_t)src2; \
-    if (tmp >= (ssize_t)sizeof(src1) * 8 || \
-        tmp < -(ssize_t)sizeof(src1) * 8) { \
-        dest = 0; \
-    } else if (tmp == -(ssize_t)sizeof(src1) * 8) { \
-        dest = src1 >> (-tmp - 1); \
-    } else if (tmp < 0) { \
-        dest = (src1 + (1 << (-1 - tmp))) >> -tmp; \
-    } else { \
-        dest = src1 << tmp; \
-    }} while (0)
+#define NEON_FN(dest, src1, src2) \
+    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 8, true, NULL))
 NEON_VOP(rshl_u8, neon_u8, 4)
+#undef NEON_FN
+
+#define NEON_FN(dest, src1, src2) \
+    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 16, true, NULL))
 NEON_VOP(rshl_u16, neon_u16, 2)
 #undef NEON_FN
 
-/* The addition of the rounding constant may overflow, so we use an
- * intermediate 64 bit accumulator.  */
-uint32_t HELPER(neon_rshl_u32)(uint32_t val, uint32_t shiftop)
+uint32_t HELPER(neon_rshl_u32)(uint32_t val, uint32_t shift)
 {
-    uint32_t dest;
-    int8_t shift = (int8_t)shiftop;
-    if (shift >= 32 || shift < -32) {
-        dest = 0;
-    } else if (shift == -32) {
-        dest = val >> 31;
-    } else if (shift < 0) {
-        uint64_t big_dest = ((uint64_t)val + (1 << (-1 - shift)));
-        dest = big_dest >> -shift;
-    } else {
-        dest = val << shift;
-    }
-    return dest;
+    return do_uqrshl_bhs(val, (int8_t)shift, 32, true, NULL);
 }
 
-/* Handling addition overflow with 64 bit input values is more
- * tricky than with 32 bit values.  */
-uint64_t HELPER(neon_rshl_u64)(uint64_t val, uint64_t shiftop)
+uint64_t HELPER(neon_rshl_u64)(uint64_t val, uint64_t shift)
 {
-    int8_t shift = (uint8_t)shiftop;
-    if (shift >= 64 || shift < -64) {
-        val = 0;
-    } else if (shift == -64) {
-        /* Rounding a 1-bit result just preserves that bit.  */
-        val >>= 63;
-    } else if (shift < 0) {
-        val >>= (-shift - 1);
-        if (val == UINT64_MAX) {
-            /* In this case, it means that the rounding constant is 1,
-             * and the addition would overflow. Return the actual
-             * result directly.  */
-            val = 0x8000000000000000ULL;
-        } else {
-            val++;
-            val >>= 1;
-        }
-    } else {
-        val <<= shift;
-    }
-    return val;
+    return do_uqrshl_d(val, (int8_t)shift, true, NULL);
 }
 
-#define NEON_FN(dest, src1, src2) do { \
-    int8_t tmp; \
-    tmp = (int8_t)src2; \
-    if (tmp >= (ssize_t)sizeof(src1) * 8) { \
-        if (src1) { \
-            SET_QC(); \
-            dest = ~0; \
-        } else { \
-            dest = 0; \
-        } \
-    } else if (tmp <= -(ssize_t)sizeof(src1) * 8) { \
-        dest = 0; \
-    } else if (tmp < 0) { \
-        dest = src1 >> -tmp; \
-    } else { \
-        dest = src1 << tmp; \
-        if ((dest >> tmp) != src1) { \
-            SET_QC(); \
-            dest = ~0; \
-        } \
-    }} while (0)
+#define NEON_FN(dest, src1, src2) \
+    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 8, false, env->vfp.qc))
 NEON_VOP_ENV(qshl_u8, neon_u8, 4)
+#undef NEON_FN
+
+#define NEON_FN(dest, src1, src2) \
+    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 16, false, env->vfp.qc))
 NEON_VOP_ENV(qshl_u16, neon_u16, 2)
-NEON_VOP_ENV(qshl_u32, neon_u32, 1)
 #undef NEON_FN
 
-uint64_t HELPER(neon_qshl_u64)(CPUARMState *env, uint64_t val, uint64_t shiftop)
+uint32_t HELPER(neon_qshl_u32)(CPUARMState *env, uint32_t val, uint32_t shift)
 {
-    int8_t shift = (int8_t)shiftop;
-    if (shift >= 64) {
-        if (val) {
-            val = ~(uint64_t)0;
-            SET_QC();
-        }
-    } else if (shift <= -64) {
-        val = 0;
-    } else if (shift < 0) {
-        val >>= -shift;
-    } else {
-        uint64_t tmp = val;
-        val <<= shift;
-        if ((val >> shift) != tmp) {
-            SET_QC();
-            val = ~(uint64_t)0;
-        }
-    }
-    return val;
+    return do_uqrshl_bhs(val, (int8_t)shift, 32, false, env->vfp.qc);
 }
 
-#define NEON_FN(dest, src1, src2) do { \
-    int8_t tmp; \
-    tmp = (int8_t)src2; \
-    if (tmp >= (ssize_t)sizeof(src1) * 8) { \
-        if (src1) { \
-            SET_QC(); \
-            dest = (uint32_t)(1 << (sizeof(src1) * 8 - 1)); \
-            if (src1 > 0) { \
-                dest--; \
-            } \
-        } else { \
-            dest = src1; \
-        } \
-    } else if (tmp <= -(ssize_t)sizeof(src1) * 8) { \
-        dest = src1 >> 31; \
-    } else if (tmp < 0) { \
-        dest = src1 >> -tmp; \
-    } else { \
-        dest = src1 << tmp; \
-        if ((dest >> tmp) != src1) { \
-            SET_QC(); \
-            dest = (uint32_t)(1 << (sizeof(src1) * 8 - 1)); \
-            if (src1 > 0) { \
-                dest--; \
-            } \
-        } \
-    }} while (0)
+uint64_t HELPER(neon_qshl_u64)(CPUARMState *env, uint64_t val, uint64_t shift)
+{
+    return do_uqrshl_d(val, (int8_t)shift, false, env->vfp.qc);
+}
+
+#define NEON_FN(dest, src1, src2) \
+    (dest = do_sqrshl_bhs(src1, (int8_t)src2, 8, false, env->vfp.qc))
 NEON_VOP_ENV(qshl_s8, neon_s8, 4)
+#undef NEON_FN
+
+#define NEON_FN(dest, src1, src2) \
+    (dest = do_sqrshl_bhs(src1, (int8_t)src2, 16, false, env->vfp.qc))
 NEON_VOP_ENV(qshl_s16, neon_s16, 2)
-NEON_VOP_ENV(qshl_s32, neon_s32, 1)
 #undef NEON_FN
 
-uint64_t HELPER(neon_qshl_s64)(CPUARMState *env, uint64_t valop, uint64_t shiftop)
+uint32_t HELPER(neon_qshl_s32)(CPUARMState *env, uint32_t val, uint32_t shift)
 {
-    int8_t shift = (uint8_t)shiftop;
-    int64_t val = valop;
-    if (shift >= 64) {
-        if (val) {
-            SET_QC();
-            val = (val >> 63) ^ ~SIGNBIT64;
-        }
-    } else if (shift <= -64) {
-        val >>= 63;
-    } else if (shift < 0) {
-        val >>= -shift;
-    } else {
-        int64_t tmp = val;
-        val <<= shift;
-        if ((val >> shift) != tmp) {
-            SET_QC();
-            val = (tmp >> 63) ^ ~SIGNBIT64;
-        }
-    }
-    return val;
+    return do_sqrshl_bhs(val, (int8_t)shift, 32, false, env->vfp.qc);
 }
 
-#define NEON_FN(dest, src1, src2) do { \
-    if (src1 & (1 << (sizeof(src1) * 8 - 1))) { \
-        SET_QC(); \
-        dest = 0; \
-    } else { \
-        int8_t tmp; \
-        tmp = (int8_t)src2; \
-        if (tmp >= (ssize_t)sizeof(src1) * 8) { \
-            if (src1) { \
-                SET_QC(); \
-                dest = ~0; \
-            } else { \
-                dest = 0; \
-            } \
-        } else if (tmp <= -(ssize_t)sizeof(src1) * 8) { \
-            dest = 0; \
-        } else if (tmp < 0) { \
-            dest = src1 >> -tmp; \
-        } else { \
-            dest = src1 << tmp; \
-            if ((dest >> tmp) != src1) { \
-                SET_QC(); \
-                dest = ~0; \
-            } \
-        } \
-    }} while (0)
-NEON_VOP_ENV(qshlu_s8, neon_u8, 4)
-NEON_VOP_ENV(qshlu_s16, neon_u16, 2)
+uint64_t HELPER(neon_qshl_s64)(CPUARMState *env, uint64_t val, uint64_t shift)
+{
+    return do_sqrshl_d(val, (int8_t)shift, false, env->vfp.qc);
+}
+
+#define NEON_FN(dest, src1, src2) \
+    (dest = do_suqrshl_bhs(src1, (int8_t)src2, 8, false, env->vfp.qc))
+NEON_VOP_ENV(qshlu_s8, neon_s8, 4)
 #undef NEON_FN
 
-uint32_t HELPER(neon_qshlu_s32)(CPUARMState *env, uint32_t valop, uint32_t shiftop)
+#define NEON_FN(dest, src1, src2) \
+    (dest = do_suqrshl_bhs(src1, (int8_t)src2, 16, false, env->vfp.qc))
+NEON_VOP_ENV(qshlu_s16, neon_s16, 2)
+#undef NEON_FN
+
+uint32_t HELPER(neon_qshlu_s32)(CPUARMState *env, uint32_t val, uint32_t shift)
 {
-    if ((int32_t)valop < 0) {
-        SET_QC();
-        return 0;
-    }
-    return helper_neon_qshl_u32(env, valop, shiftop);
+    return do_suqrshl_bhs(val, (int8_t)shift, 32, false, env->vfp.qc);
 }
 
-uint64_t HELPER(neon_qshlu_s64)(CPUARMState *env, uint64_t valop, uint64_t shiftop)
+uint64_t HELPER(neon_qshlu_s64)(CPUARMState *env, uint64_t val, uint64_t shift)
 {
-    if ((int64_t)valop < 0) {
-        SET_QC();
-        return 0;
-    }
-    return helper_neon_qshl_u64(env, valop, shiftop);
+    return do_suqrshl_d(val, (int8_t)shift, false, env->vfp.qc);
 }
 
-#define NEON_FN(dest, src1, src2) do { \
-    int8_t tmp; \
-    tmp = (int8_t)src2; \
-    if (tmp >= (ssize_t)sizeof(src1) * 8) { \
-        if (src1) { \
-            SET_QC(); \
-            dest = ~0; \
-        } else { \
-            dest = 0; \
-        } \
-    } else if (tmp < -(ssize_t)sizeof(src1) * 8) { \
-        dest = 0; \
-    } else if (tmp == -(ssize_t)sizeof(src1) * 8) { \
-        dest = src1 >> (sizeof(src1) * 8 - 1); \
-    } else if (tmp < 0) { \
-        dest = (src1 + (1 << (-1 - tmp))) >> -tmp; \
-    } else { \
-        dest = src1 << tmp; \
-        if ((dest >> tmp) != src1) { \
-            SET_QC(); \
-            dest = ~0; \
-        } \
-    }} while (0)
+#define NEON_FN(dest, src1, src2) \
+    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 8, true, env->vfp.qc))
 NEON_VOP_ENV(qrshl_u8, neon_u8, 4)
+#undef NEON_FN
+
+#define NEON_FN(dest, src1, src2) \
+    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 16, true, env->vfp.qc))
 NEON_VOP_ENV(qrshl_u16, neon_u16, 2)
 #undef NEON_FN
 
-/* The addition of the rounding constant may overflow, so we use an
- * intermediate 64 bit accumulator.  */
-uint32_t HELPER(neon_qrshl_u32)(CPUARMState *env, uint32_t val, uint32_t shiftop)
+uint32_t HELPER(neon_qrshl_u32)(CPUARMState *env, uint32_t val, uint32_t shift)
 {
-    uint32_t dest;
-    int8_t shift = (int8_t)shiftop;
-    if (shift >= 32) {
-        if (val) {
-            SET_QC();
-            dest = ~0;
-        } else {
-            dest = 0;
-        }
-    } else if (shift < -32) {
-        dest = 0;
-    } else if (shift == -32) {
-        dest = val >> 31;
-    } else if (shift < 0) {
-        uint64_t big_dest = ((uint64_t)val + (1 << (-1 - shift)));
-        dest = big_dest >> -shift;
-    } else {
-        dest = val << shift;
-        if ((dest >> shift) != val) {
-            SET_QC();
-            dest = ~0;
-        }
-    }
-    return dest;
+    return do_uqrshl_bhs(val, (int8_t)shift, 32, true, env->vfp.qc);
 }
 
-/* Handling addition overflow with 64 bit input values is more
- * tricky than with 32 bit values.  */
-uint64_t HELPER(neon_qrshl_u64)(CPUARMState *env, uint64_t val, uint64_t shiftop)
+uint64_t HELPER(neon_qrshl_u64)(CPUARMState *env, uint64_t val, uint64_t shift)
 {
-    int8_t shift = (int8_t)shiftop;
-    if (shift >= 64) {
-        if (val) {
-            SET_QC();
-            val = ~0;
-        }
-    } else if (shift < -64) {
-        val = 0;
-    } else if (shift == -64) {
-        val >>= 63;
-    } else if (shift < 0) {
-        val >>= (-shift - 1);
-        if (val == UINT64_MAX) {
-            /* In this case, it means that the rounding constant is 1,
-             * and the addition would overflow. Return the actual
-             * result directly.  */
-            val = 0x8000000000000000ULL;
-        } else {
-            val++;
-            val >>= 1;
-        }
-    } else { \
-        uint64_t tmp = val;
-        val <<= shift;
-        if ((val >> shift) != tmp) {
-            SET_QC();
-            val = ~0;
-        }
-    }
-    return val;
+    return do_uqrshl_d(val, (int8_t)shift, true, env->vfp.qc);
 }
 
-#define NEON_FN(dest, src1, src2) do { \
-    int8_t tmp; \
-    tmp = (int8_t)src2; \
-    if (tmp >= (ssize_t)sizeof(src1) * 8) { \
-        if (src1) { \
-            SET_QC(); \
-            dest = (typeof(dest))(1 << (sizeof(src1) * 8 - 1)); \
-            if (src1 > 0) { \
-                dest--; \
-            } \
-        } else { \
-            dest = 0; \
-        } \
-    } else if (tmp <= -(ssize_t)sizeof(src1) * 8) { \
-        dest = 0; \
-    } else if (tmp < 0) { \
-        dest = (src1 + (1 << (-1 - tmp))) >> -tmp; \
-    } else { \
-        dest = src1 << tmp; \
-        if ((dest >> tmp) != src1) { \
-            SET_QC(); \
-            dest = (uint32_t)(1 << (sizeof(src1) * 8 - 1)); \
-            if (src1 > 0) { \
-                dest--; \
-            } \
-        } \
-    }} while (0)
+#define NEON_FN(dest, src1, src2) \
+    (dest = do_sqrshl_bhs(src1, (int8_t)src2, 8, true, env->vfp.qc))
 NEON_VOP_ENV(qrshl_s8, neon_s8, 4)
+#undef NEON_FN
+
+#define NEON_FN(dest, src1, src2) \
+    (dest = do_sqrshl_bhs(src1, (int8_t)src2, 16, true, env->vfp.qc))
 NEON_VOP_ENV(qrshl_s16, neon_s16, 2)
 #undef NEON_FN
 
-/* The addition of the rounding constant may overflow, so we use an
- * intermediate 64 bit accumulator.  */
-uint32_t HELPER(neon_qrshl_s32)(CPUARMState *env, uint32_t valop, uint32_t shiftop)
+uint32_t HELPER(neon_qrshl_s32)(CPUARMState *env, uint32_t val, uint32_t shift)
 {
-    int32_t dest;
-    int32_t val = (int32_t)valop;
-    int8_t shift = (int8_t)shiftop;
-    if (shift >= 32) {
-        if (val) {
-            SET_QC();
-            dest = (val >> 31) ^ ~SIGNBIT;
-        } else {
-            dest = 0;
-        }
-    } else if (shift <= -32) {
-        dest = 0;
-    } else if (shift < 0) {
-        int64_t big_dest = ((int64_t)val + (1 << (-1 - shift)));
-        dest = big_dest >> -shift;
-    } else {
-        dest = val << shift;
-        if ((dest >> shift) != val) {
-            SET_QC();
-            dest = (val >> 31) ^ ~SIGNBIT;
-        }
-    }
-    return dest;
+    return do_sqrshl_bhs(val, (int8_t)shift, 32, true, env->vfp.qc);
 }
 
-/* Handling addition overflow with 64 bit input values is more
- * tricky than with 32 bit values.  */
-uint64_t HELPER(neon_qrshl_s64)(CPUARMState *env, uint64_t valop, uint64_t shiftop)
+uint64_t HELPER(neon_qrshl_s64)(CPUARMState *env, uint64_t val, uint64_t shift)
 {
-    int8_t shift = (uint8_t)shiftop;
-    int64_t val = valop;
-
-    if (shift >= 64) {
-        if (val) {
-            SET_QC();
-            val = (val >> 63) ^ ~SIGNBIT64;
-        }
-    } else if (shift <= -64) {
-        val = 0;
-    } else if (shift < 0) {
-        val >>= (-shift - 1);
-        if (val == INT64_MAX) {
-            /* In this case, it means that the rounding constant is 1,
-             * and the addition would overflow. Return the actual
-             * result directly.  */
-            val = 0x4000000000000000ULL;
-        } else {
-            val++;
-            val >>= 1;
-        }
-    } else {
-        int64_t tmp = val;
-        val <<= shift;
-        if ((val >> shift) != tmp) {
-            SET_QC();
-            val = (tmp >> 63) ^ ~SIGNBIT64;
-        }
-    }
-    return val;
+    return do_sqrshl_d(val, (int8_t)shift, true, env->vfp.qc);
 }
 
 uint32_t HELPER(neon_add_u8)(uint32_t a, uint32_t b)
-- 
2.20.1



  parent reply	other threads:[~2021-05-25 15:26 UTC|newest]

Thread overview: 96+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2021-05-25 15:01 [PULL 000/114] target-arm queue Peter Maydell
2021-05-25 15:01 ` [PULL 001/114] hw/arm/smmuv3: Another range invalidation fix Peter Maydell
2021-05-25 15:01 ` [PULL 002/114] hw/intc/arm_gicv3_cpuif: Fix EOIR write access check logic Peter Maydell
2021-05-25 15:01 ` [PULL 003/114] hw/arm/mps2-tz: Don't duplicate modelling of SRAM in AN524 Peter Maydell
2021-05-25 15:01 ` [PULL 004/114] hw/arm/mps2-tz: Make SRAM_ADDR_WIDTH board-specific Peter Maydell
2021-05-25 15:01 ` [PULL 005/114] hw/arm/armsse.c: Correct modelling of SSE-300 internal SRAMs Peter Maydell
2021-05-25 15:01 ` [PULL 006/114] hw/arm/armsse: Convert armsse_realize() to use ERRP_GUARD Peter Maydell
2021-05-25 15:01 ` [PULL 007/114] hw/arm/mps2-tz: Allow board to specify a boot RAM size Peter Maydell
2021-05-25 15:01 ` [PULL 008/114] hw/arm: Model TCMs in the SSE-300, not the AN547 Peter Maydell
2021-05-25 15:01 ` [PULL 009/114] target/arm: Use correct SP in M-profile exception return Peter Maydell
2021-05-25 15:01 ` [PULL 010/114] accel/tcg: Replace g_new() + memcpy() by g_memdup() Peter Maydell
2021-05-25 15:01 ` [PULL 011/114] accel/tcg: Pass length argument to tlb_flush_range_locked() Peter Maydell
2021-05-25 15:01 ` [PULL 012/114] accel/tlb: Rename TLBFlushPageBitsByMMUIdxData -> TLBFlushRangeData Peter Maydell
2021-05-25 15:01 ` [PULL 013/114] accel/tcg: Remove {encode,decode}_pbm_to_runon Peter Maydell
2021-05-25 15:01 ` [PULL 014/114] accel/tcg: Add tlb_flush_range_by_mmuidx() Peter Maydell
2021-05-25 15:01 ` [PULL 015/114] accel/tcg: Add tlb_flush_range_by_mmuidx_all_cpus() Peter Maydell
2021-05-25 15:01 ` [PULL 016/114] accel/tlb: Add tlb_flush_range_by_mmuidx_all_cpus_synced() Peter Maydell
2021-05-25 15:01 ` [PULL 017/114] accel/tcg: Rename tlb_flush_page_bits -> range]_by_mmuidx_async_0 Peter Maydell
2021-05-25 15:01 ` [PULL 018/114] accel/tlb: Rename tlb_flush_[page_bits > range]_by_mmuidx_async_[2 > 1] Peter Maydell
2021-05-25 15:01 ` [PULL 019/114] target/arm: Add support for FEAT_TLBIRANGE Peter Maydell
2021-05-25 15:01 ` [PULL 020/114] target/arm: Add support for FEAT_TLBIOS Peter Maydell
2021-05-25 15:01 ` [PULL 021/114] target/arm: set ID_AA64ISAR0.TLB to 2 for max AARCH64 CPU type Peter Maydell
2021-05-25 15:01 ` [PULL 022/114] disas/libvixl: Protect C system header for C++ compiler Peter Maydell
2021-05-25 15:01 ` [PULL 023/114] target/arm: Add ID_AA64ZFR0 fields and isar_feature_aa64_sve2 Peter Maydell
2021-05-25 15:01 ` [PULL 024/114] target/arm: Implement SVE2 Integer Multiply - Unpredicated Peter Maydell
2021-05-25 15:01 ` [PULL 025/114] target/arm: Implement SVE2 integer pairwise add and accumulate long Peter Maydell
2021-05-25 15:01 ` [PULL 026/114] target/arm: Implement SVE2 integer unary operations (predicated) Peter Maydell
2021-05-25 15:01 ` Peter Maydell [this message]
2021-05-25 15:01 ` [PULL 028/114] target/arm: Implement SVE2 saturating/rounding bitwise shift left (predicated) Peter Maydell
2021-05-25 15:01 ` [PULL 029/114] target/arm: Implement SVE2 integer halving add/subtract (predicated) Peter Maydell
2021-05-25 15:02 ` [PULL 030/114] target/arm: Implement SVE2 integer pairwise arithmetic Peter Maydell
2021-05-25 15:02 ` [PULL 031/114] target/arm: Implement SVE2 saturating add/subtract (predicated) Peter Maydell
2021-05-25 15:02 ` [PULL 032/114] target/arm: Implement SVE2 integer add/subtract long Peter Maydell
2021-05-25 15:02 ` [PULL 033/114] target/arm: Implement SVE2 integer add/subtract interleaved long Peter Maydell
2021-05-25 15:02 ` [PULL 034/114] target/arm: Implement SVE2 integer add/subtract wide Peter Maydell
2021-05-25 15:02 ` [PULL 035/114] target/arm: Implement SVE2 integer multiply long Peter Maydell
2021-05-25 15:02 ` [PULL 036/114] target/arm: Implement SVE2 PMULLB, PMULLT Peter Maydell
2021-05-25 15:02 ` [PULL 037/114] target/arm: Implement SVE2 bitwise shift left long Peter Maydell
2021-05-25 15:02 ` [PULL 038/114] target/arm: Implement SVE2 bitwise exclusive-or interleaved Peter Maydell
2021-05-25 15:02 ` [PULL 039/114] target/arm: Implement SVE2 bitwise permute Peter Maydell
2021-05-25 15:02 ` [PULL 040/114] target/arm: Implement SVE2 complex integer add Peter Maydell
2021-05-25 15:02 ` [PULL 041/114] target/arm: Implement SVE2 integer absolute difference and accumulate long Peter Maydell
2021-05-25 15:02 ` [PULL 042/114] target/arm: Implement SVE2 integer add/subtract long with carry Peter Maydell
2021-05-25 15:02 ` [PULL 043/114] target/arm: Implement SVE2 bitwise shift right and accumulate Peter Maydell
2021-05-25 15:02 ` [PULL 044/114] target/arm: Implement SVE2 bitwise shift and insert Peter Maydell
2021-05-25 15:02 ` [PULL 045/114] target/arm: Implement SVE2 integer absolute difference and accumulate Peter Maydell
2021-05-25 15:02 ` [PULL 046/114] target/arm: Implement SVE2 saturating extract narrow Peter Maydell
2021-05-25 15:02 ` [PULL 047/114] target/arm: Implement SVE2 floating-point pairwise Peter Maydell
2021-05-25 15:02 ` [PULL 048/114] target/arm: Implement SVE2 SHRN, RSHRN Peter Maydell
2021-05-25 15:02 ` [PULL 049/114] target/arm: Implement SVE2 SQSHRUN, SQRSHRUN Peter Maydell
2021-05-25 15:02 ` [PULL 050/114] target/arm: Implement SVE2 UQSHRN, UQRSHRN Peter Maydell
2021-05-25 15:02 ` [PULL 051/114] target/arm: Implement SVE2 SQSHRN, SQRSHRN Peter Maydell
2021-05-25 15:02 ` [PULL 052/114] target/arm: Implement SVE2 WHILEGT, WHILEGE, WHILEHI, WHILEHS Peter Maydell
2021-05-25 15:02 ` [PULL 053/114] target/arm: Implement SVE2 WHILERW, WHILEWR Peter Maydell
2021-05-25 15:02 ` [PULL 054/114] target/arm: Implement SVE2 bitwise ternary operations Peter Maydell
2021-05-25 15:02 ` [PULL 055/114] target/arm: Implement SVE2 MATCH, NMATCH Peter Maydell
2021-05-25 15:02 ` [PULL 056/114] target/arm: Implement SVE2 saturating multiply-add long Peter Maydell
2021-05-25 15:02 ` [PULL 057/114] target/arm: Implement SVE2 saturating multiply-add high Peter Maydell
2021-05-25 15:02 ` [PULL 058/114] target/arm: Implement SVE2 integer multiply-add long Peter Maydell
2021-05-25 15:02 ` [PULL 059/114] target/arm: Implement SVE2 complex integer multiply-add Peter Maydell
2021-05-25 15:02 ` [PULL 060/114] target/arm: Implement SVE2 ADDHNB, ADDHNT Peter Maydell
2021-05-25 15:02 ` [PULL 061/114] target/arm: Implement SVE2 RADDHNB, RADDHNT Peter Maydell
2021-05-25 15:02 ` [PULL 062/114] target/arm: Implement SVE2 SUBHNB, SUBHNT Peter Maydell
2021-05-25 15:02 ` [PULL 063/114] target/arm: Implement SVE2 RSUBHNB, RSUBHNT Peter Maydell
2021-05-25 15:02 ` [PULL 064/114] target/arm: Implement SVE2 HISTCNT, HISTSEG Peter Maydell
2021-05-25 15:02 ` [PULL 065/114] target/arm: Implement SVE2 XAR Peter Maydell
2021-05-25 15:02 ` [PULL 066/114] target/arm: Implement SVE2 scatter store insns Peter Maydell
2021-05-25 15:02 ` [PULL 067/114] target/arm: Implement SVE2 gather load insns Peter Maydell
2021-05-25 15:02 ` [PULL 068/114] target/arm: Implement SVE2 FMMLA Peter Maydell
2021-05-25 15:02 ` [PULL 069/114] target/arm: Implement SVE2 SPLICE, EXT Peter Maydell
2021-05-25 15:02 ` [PULL 070/114] target/arm: Use correct output type for gvec_sdot_*_b Peter Maydell
2021-05-25 15:02 ` [PULL 071/114] target/arm: Pass separate addend to {U, S}DOT helpers Peter Maydell
2021-05-25 15:02 ` [PULL 072/114] target/arm: Pass separate addend to FCMLA helpers Peter Maydell
2021-05-25 15:02 ` [PULL 073/114] target/arm: Split out formats for 2 vectors + 1 index Peter Maydell
2021-05-25 15:02 ` [PULL 074/114] target/arm: Split out formats for 3 " Peter Maydell
2021-05-25 15:02 ` [PULL 075/114] target/arm: Implement SVE2 integer multiply (indexed) Peter Maydell
2021-05-25 15:02 ` [PULL 076/114] target/arm: Implement SVE2 integer multiply-add (indexed) Peter Maydell
2021-05-25 15:02 ` [PULL 077/114] target/arm: Implement SVE2 saturating multiply-add high (indexed) Peter Maydell
2021-05-25 15:02 ` [PULL 078/114] target/arm: Implement SVE2 saturating multiply-add (indexed) Peter Maydell
2021-05-25 15:02 ` [PULL 079/114] target/arm: Implement SVE2 saturating multiply (indexed) Peter Maydell
2021-05-25 15:02 ` [PULL 080/114] target/arm: Implement SVE2 signed saturating doubling multiply high Peter Maydell
2021-05-25 15:02 ` [PULL 081/114] target/arm: Implement SVE2 saturating multiply high (indexed) Peter Maydell
2021-05-25 15:02 ` [PULL 082/114] target/arm: Implement SVE2 multiply-add long (indexed) Peter Maydell
2021-05-25 15:02 ` [PULL 083/114] target/arm: Implement SVE2 integer multiply " Peter Maydell
2021-05-25 15:02 ` [PULL 084/114] target/arm: Implement SVE2 complex integer multiply-add (indexed) Peter Maydell
2021-05-25 15:02 ` [PULL 085/114] target/arm: Implement SVE2 complex integer dot product Peter Maydell
2021-05-25 15:02 ` [PULL 086/114] target/arm: Macroize helper_gvec_{s,u}dot_{b,h} Peter Maydell
2021-05-25 15:02 ` [PULL 087/114] target/arm: Macroize helper_gvec_{s,u}dot_idx_{b,h} Peter Maydell
2021-05-25 15:02 ` [PULL 088/114] target/arm: Implement SVE mixed sign dot product (indexed) Peter Maydell
2021-05-25 15:02 ` [PULL 089/114] target/arm: Implement SVE mixed sign dot product Peter Maydell
2021-05-25 15:03 ` [PULL 090/114] target/arm: Implement SVE2 crypto unary operations Peter Maydell
2021-05-25 15:03 ` [PULL 091/114] target/arm: Implement SVE2 crypto destructive binary operations Peter Maydell
2021-05-25 15:03 ` [PULL 092/114] target/arm: Implement SVE2 crypto constructive " Peter Maydell
2021-05-25 15:03 ` [PULL 093/114] target/arm: Implement SVE2 TBL, TBX Peter Maydell
2021-05-25 15:03 ` [PULL 094/114] target/arm: Implement SVE2 FCVTNT Peter Maydell
2021-05-25 16:30 ` [PULL 000/114] target-arm queue Peter Maydell

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20210525150324.32370-28-peter.maydell@linaro.org \
    --to=peter.maydell@linaro.org \
    --cc=qemu-devel@nongnu.org \
    --subject='Re: [PULL 027/114] target/arm: Split out saturating/rounding shifts from neon' \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.