All of lore.kernel.org
 help / color / mirror / Atom feed
* Re: [Qemu-devel] [PATCH] target-i386: implement FPREM and FPREM1 using softfloat only
       [not found] ` <4FEDAA07.6030600@suse.de>
@ 2012-07-02 14:51   ` Catalin Patulea
  2012-07-02 15:25     ` Catalin Patulea
  0 siblings, 1 reply; 5+ messages in thread
From: Catalin Patulea @ 2012-07-02 14:51 UTC (permalink / raw)
  To: Andreas Färber; +Cc: peter.maydell, qemu-devel

On Fri, Jun 29, 2012 at 9:13 AM, Andreas Färber <afaerber@suse.de> wrote:
> Please run scripts/checkpatch.pl for CODING_STYLE issues, I spotted one
> slightly misplaced if brace, and one empty line at the bottom seemed to
> have indentation (git-am complains about that, too).
Done. The only issue left is a false positive:

ERROR: "foo * bar" should be "foo *bar"
#248: FILE: fpu/softfloat.h:507:
+int floatx80_remainder(floatx80, floatx80, floatx80 *, uint64_t *
STATUS_PARAM);

(STATUS_PARAM is not the name of the param, it's a macro.)

> The patch description will need to be cleaned up (not be letter-style).
Done.

> I'm not too thrilled to introduce more uses of int32 (we started
> converting int16 to int_fast16_t) but I won't veto. It would be nice if
> you could review for any potential int32 vs. int32_t breakages though,
> to not make things worse than they are.
I did notice one place where int32 was used where int should be used
(the return variable of do_fprem), and that's now fixed. But for the
other uses of int32, as I was saying to Peter, they are tied to the
return type of extractFloatx80Exp. I wouldn't give the variable a
different type than the function, and especially not in this case
because the function should become int16_t, not int32_t.

I'll be sending a PATCHv2 momentarily.

^ permalink raw reply	[flat|nested] 5+ messages in thread

* [Qemu-devel] [PATCH] target-i386: implement FPREM and FPREM1 using softfloat only
  2012-07-02 14:51   ` [Qemu-devel] [PATCH] target-i386: implement FPREM and FPREM1 using softfloat only Catalin Patulea
@ 2012-07-02 15:25     ` Catalin Patulea
  2012-07-02 17:33       ` Catalin Patulea
                         ` (2 more replies)
  0 siblings, 3 replies; 5+ messages in thread
From: Catalin Patulea @ 2012-07-02 15:25 UTC (permalink / raw)
  To: qemu-devel; +Cc: sw, Catalin Patulea, afaerber, peter.maydell

FPREM1 now passes the TestFloat floatx80_rem suite (and FPREM is implemented very
similarly).

The code (the bulk of which is remainder_kernel and do_fprem) is derived from
Bochs SVN revision 11224 dated 2012-06-21 10:33:37 -0700, with conversions to
Qemu type aliases, C features only, etc. as needed.

Signed-off-by: Catalin Patulea <catalinp@google.com>
---
 fpu/softfloat.c         |  195 +++++++++++++++++++++++++++++++++++++++++++++++
 fpu/softfloat.h         |    4 +
 target-i386/op_helper.c |  166 ++++++++++++++++------------------------
 3 files changed, 266 insertions(+), 99 deletions(-)

diff --git a/fpu/softfloat.c b/fpu/softfloat.c
index b29256a..bd1879d 100644
--- a/fpu/softfloat.c
+++ b/fpu/softfloat.c
@@ -5234,6 +5234,16 @@ int floatx80_unordered_quiet( floatx80 a, floatx80 b STATUS_PARAM )
 }
 
 /*----------------------------------------------------------------------------
+| Returns 1 if the extended double-precision floating-point value `a' is an
+| unsupported value; otherwise returns 0.
+*----------------------------------------------------------------------------*/
+int floatx80_is_unsupported(floatx80 a)
+{
+    return extractFloatx80Exp(a) &&
+           !(extractFloatx80Frac(a) & LIT64(0x8000000000000000));
+}
+
+/*----------------------------------------------------------------------------
 | Returns the result of converting the quadruple-precision floating-point
 | value `a' to the 32-bit two's complement integer format.  The conversion
 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
@@ -6828,6 +6838,191 @@ floatx80 floatx80_scalbn( floatx80 a, int n STATUS_PARAM )
                                           aSign, aExp, aSig, 0 STATUS_VAR );
 }
 
+/* executes single exponent reduction cycle */
+static uint64_t remainder_kernel(uint64_t aSig0, uint64_t bSig, int expDiff,
+                                 uint64_t *zSig0, uint64_t *zSig1)
+{
+    uint64_t term0, term1;
+    uint64_t aSig1 = 0;
+
+    shortShift128Left(aSig1, aSig0, expDiff, &aSig1, &aSig0);
+    uint64_t q = estimateDiv128To64(aSig1, aSig0, bSig);
+    mul64To128(bSig, q, &term0, &term1);
+    sub128(aSig1, aSig0, term0, term1, zSig1, zSig0);
+    while ((int64)(*zSig1) < 0) {
+        --q;
+        add128(*zSig1, *zSig0, 0, bSig, zSig1, zSig0);
+    }
+    return q;
+}
+
+static int do_fprem(floatx80 a, floatx80 b, floatx80 *r, uint64_t *q,
+                    int rounding_mode STATUS_PARAM)
+{
+    int32 aExp, bExp, zExp, expDiff;
+    uint64_t aSig0, aSig1, bSig;
+    flag aSign;
+    *q = 0;
+
+    /* handle unsupported extended double-precision floating encodings */
+    if (floatx80_is_unsupported(a) || floatx80_is_unsupported(b)) {
+        float_raise(float_flag_invalid, status);
+        *r = floatx80_default_nan;
+        return -1;
+    }
+
+    aSig0 = extractFloatx80Frac(a);
+    aExp = extractFloatx80Exp(a);
+    aSign = extractFloatx80Sign(a);
+    bSig = extractFloatx80Frac(b);
+    bExp = extractFloatx80Exp(b);
+
+    if (aExp == 0x7FFF) {
+        if ((uint64_t) (aSig0<<1) || ((bExp == 0x7FFF) &&
+                                      (uint64_t) (bSig<<1))) {
+            *r = propagateFloatx80NaN(a, b, status);
+            return -1;
+        }
+        float_raise(float_flag_invalid, status);
+        *r = floatx80_default_nan;
+        return -1;
+    }
+    if (bExp == 0x7FFF) {
+        if ((uint64_t) (bSig<<1)) {
+            *r = propagateFloatx80NaN(a, b, status);
+            return -1;
+        }
+        if (aExp == 0 && aSig0) {
+            float_raise(float_flag_input_denormal, status);
+            normalizeFloatx80Subnormal(aSig0, &aExp, &aSig0);
+            *r = (extractFloatx80Frac(a) & LIT64(0x8000000000000000)) ?
+                    packFloatx80(aSign, aExp, aSig0) : a;
+            return 0;
+        }
+        *r = a;
+        return 0;
+
+    }
+    if (bExp == 0) {
+        if (bSig == 0) {
+            float_raise(float_flag_invalid, status);
+            *r = floatx80_default_nan;
+            return -1;
+        }
+        float_raise(float_flag_input_denormal, status);
+        normalizeFloatx80Subnormal(bSig, &bExp, &bSig);
+    }
+    if (aExp == 0) {
+        if (aSig0 == 0) {
+            *r = a;
+            return 0;
+        }
+        float_raise(float_flag_input_denormal, status);
+        normalizeFloatx80Subnormal(aSig0, &aExp, &aSig0);
+    }
+    expDiff = aExp - bExp;
+    aSig1 = 0;
+
+    int overflow = 0;
+
+    if (expDiff >= 64) {
+        int n = (expDiff & 0x1f) | 0x20;
+        remainder_kernel(aSig0, bSig, n, &aSig0, &aSig1);
+        zExp = aExp - n;
+        overflow = 1;
+    } else {
+        zExp = bExp;
+
+        if (expDiff < 0) {
+            if (expDiff < -1) {
+                *r = (extractFloatx80Frac(a) & LIT64(0x8000000000000000)) ?
+                     packFloatx80(aSign, aExp, aSig0) : a;
+                return 0;
+            }
+            shift128Right(aSig0, 0, 1, &aSig0, &aSig1);
+            expDiff = 0;
+        }
+
+        if (expDiff > 0) {
+            *q = remainder_kernel(aSig0, bSig, expDiff, &aSig0, &aSig1);
+        } else {
+            if (bSig <= aSig0) {
+                aSig0 -= bSig;
+                *q = 1;
+            }
+        }
+
+        if (rounding_mode == float_round_nearest_even) {
+            uint64_t term0, term1;
+            shift128Right(bSig, 0, 1, &term0, &term1);
+
+            if (!lt128(aSig0, aSig1, term0, term1)) {
+                int lt = lt128(term0, term1, aSig0, aSig1);
+                int eq = eq128(aSig0, aSig1, term0, term1);
+
+                if ((eq && (*q & 1)) || lt) {
+                    aSign = !aSign;
+                    ++*q;
+                }
+                if (lt) {
+                    sub128(bSig, 0, aSig0, aSig1, &aSig0, &aSig1);
+                }
+            }
+        }
+    }
+
+    *r = normalizeRoundAndPackFloatx80(80, aSign, zExp, aSig0, aSig1, status);
+    return overflow;
+}
+
+/*----------------------------------------------------------------------------
+| Computes the remainder of the extended double-precision floating-point value
+| `a' with respect to the corresponding value `b'.  Stores the remainder in
+| `*r'.
+|
+| Returns one of the following:
+|   -1   The operands are invalid. Exceptions were raised and no remainder was
+|        computed. `*r' was set to a NaN value.
+|    0   The remainder was computed successfully and completely.
+|    1   Only a partial remainder was computed and stored in `*r'. Call this
+|        function again with `a' set to the returned partial remainder and the
+|        same `b' to continue calculating the remainder.
+|
+| The operation is performed according to the IEC/IEEE Standard for Binary
+| Floating-Point Arithmetic.
+*----------------------------------------------------------------------------*/
+
+int floatx80_ieee754_remainder(floatx80 a, floatx80 b, floatx80 *r, uint64_t *q
+                               STATUS_PARAM)
+{
+    return do_fprem(a, b, r, q, float_round_nearest_even STATUS_VAR);
+}
+
+/*----------------------------------------------------------------------------
+| Computes the remainder of the extended double-precision floating-point value
+| `a' with  respect to  the corresponding value `b'. Stores the remainder in
+| `*r'.
+|
+| Returns one of the following:
+|   -1   The operands are invalid. Exceptions were raised and no remainder was
+|        computed. `*r' was set to a NaN value.
+|    0   The remainder was computed successfully and completely.
+|    1   Only a partial remainder was computed and stored in `*r'. Call this
+|        function again with `a' set to the returned partial remainder and the
+|        same `b' to continue calculating the remainder.
+|
+| Unlike previous function the  function  does not compute  the remainder
+| specified  in  the IEC/IEEE Standard  for Binary  Floating-Point  Arithmetic.
+| This  function  operates differently  from the  previous  function in  the way
+| that it  rounds  the quotient of 'a' divided by 'b' to an integer.
+*----------------------------------------------------------------------------*/
+
+int floatx80_remainder(floatx80 a, floatx80 b, floatx80 *r, uint64_t *q
+                       STATUS_PARAM)
+{
+    return do_fprem(a, b, r, q, float_round_to_zero STATUS_VAR);
+}
+
 float128 float128_scalbn( float128 a, int n STATUS_PARAM )
 {
     flag aSign;
diff --git a/fpu/softfloat.h b/fpu/softfloat.h
index feec3a1..e6f5b87 100644
--- a/fpu/softfloat.h
+++ b/fpu/softfloat.h
@@ -497,10 +497,14 @@ int floatx80_lt_quiet( floatx80, floatx80 STATUS_PARAM );
 int floatx80_unordered_quiet( floatx80, floatx80 STATUS_PARAM );
 int floatx80_compare( floatx80, floatx80 STATUS_PARAM );
 int floatx80_compare_quiet( floatx80, floatx80 STATUS_PARAM );
+int floatx80_is_unsupported(floatx80);
 int floatx80_is_quiet_nan( floatx80 );
 int floatx80_is_signaling_nan( floatx80 );
 floatx80 floatx80_maybe_silence_nan( floatx80 );
 floatx80 floatx80_scalbn( floatx80, int STATUS_PARAM );
+int floatx80_ieee754_remainder(floatx80, floatx80, floatx80 *, uint64_t *
+                               STATUS_PARAM);
+int floatx80_remainder(floatx80, floatx80, floatx80 *, uint64_t * STATUS_PARAM);
 
 INLINE floatx80 floatx80_abs(floatx80 a)
 {
diff --git a/target-i386/op_helper.c b/target-i386/op_helper.c
index 2862ea4..53f46fd 100644
--- a/target-i386/op_helper.c
+++ b/target-i386/op_helper.c
@@ -3620,6 +3620,23 @@ static void fpu_set_exception(int mask)
         env->fpus |= FPUS_SE | FPUS_B;
 }
 
+static int fpu_exception_from_fp_status(const float_status *fp_status)
+{
+    int mask;
+
+    const int float_flag_denormals = float_flag_input_denormal |
+                                     float_flag_output_denormal;
+
+    /* Most flags have the same value in the enum and 387 status word, except
+     * the denormal flags. */
+    mask = fp_status->float_exception_flags & ~float_flag_denormals;
+    if (fp_status->float_exception_flags & float_flag_denormals) {
+        mask |= FPUS_DE;
+    }
+
+    return mask;
+}
+
 static inline floatx80 helper_fdiv(floatx80 a, floatx80 b)
 {
     if (floatx80_is_zero(b)) {
@@ -4208,121 +4225,72 @@ void helper_fxtract(void)
     }
 }
 
+static void fprem_maybe_update_flags(int overflow, uint64_t quotient)
+{
+    if (overflow >= 0) {  /* overflow < 0 means error */
+        env->fpus &= ~0x4700; /* (C3,C2,C1,C0) <-- 0000 */
+        if (overflow) {
+            env->fpus |= 1 << 10;
+        } else {            /* (C0,C3,C1) <-- (q2,q1,q0) */
+            if (quotient & 1) {
+                env->fpus |= 1 << 9;
+            }
+            if (quotient & 2) {
+                env->fpus |= 1 << 14;
+            }
+            if (quotient & 4) {
+                env->fpus |= 1 << 8;
+            }
+        }
+    }
+}
+
 void helper_fprem1(void)
 {
-    double st0, st1, dblq, fpsrcop, fptemp;
-    CPU_LDoubleU fpsrcop1, fptemp1;
-    int expdif;
-    signed long long int q;
+    floatx80 result;
+    uint64_t quotient;
 
-    st0 = floatx80_to_double(ST0);
-    st1 = floatx80_to_double(ST1);
+    /* Use a copy of the current fp_status so that we can detect new exceptions
+     * and pass them to fpu_set_exception. TODO(catalinp): Remove this once
+     * exceptions are reported directly into env->fp_status. */
+    float_status local_status = env->fp_status;
 
-    if (isinf(st0) || isnan(st0) || isnan(st1) || (st1 == 0.0)) {
-        ST0 = double_to_floatx80(0.0 / 0.0); /* NaN */
-        env->fpus &= (~0x4700); /* (C3,C2,C1,C0) <-- 0000 */
-        return;
-    }
-
-    fpsrcop = st0;
-    fptemp = st1;
-    fpsrcop1.d = ST0;
-    fptemp1.d = ST1;
-    expdif = EXPD(fpsrcop1) - EXPD(fptemp1);
+    /* TODO(catalinp): Bochs' FPU_pre_exception_handling resets a few more
+     * fields in fp_status before doing the operation. What is the purpose of
+     * that and is it necessary here? */
+    local_status.float_exception_flags = 0;
 
-    if (expdif < 0) {
-        /* optimisation? taken from the AMD docs */
-        env->fpus &= (~0x4700); /* (C3,C2,C1,C0) <-- 0000 */
-        /* ST0 is unchanged */
-        return;
-    }
+    int overflow = floatx80_ieee754_remainder(ST0, ST1, &result, &quotient,
+                                              &local_status);
 
-    if (expdif < 53) {
-        dblq = fpsrcop / fptemp;
-        /* round dblq towards nearest integer */
-        dblq = rint(dblq);
-        st0 = fpsrcop - fptemp * dblq;
+    /* TODO(catalinp): Bochs also checks for unmasked exceptions before storing
+     * these flags. Should we also do this?
+     */
+    fprem_maybe_update_flags(overflow, quotient);
 
-        /* convert dblq to q by truncating towards zero */
-        if (dblq < 0.0)
-           q = (signed long long int)(-dblq);
-        else
-           q = (signed long long int)dblq;
+    ST0 = result;
 
-        env->fpus &= (~0x4700); /* (C3,C2,C1,C0) <-- 0000 */
-                                /* (C0,C3,C1) <-- (q2,q1,q0) */
-        env->fpus |= (q & 0x4) << (8 - 2);  /* (C0) <-- q2 */
-        env->fpus |= (q & 0x2) << (14 - 1); /* (C3) <-- q1 */
-        env->fpus |= (q & 0x1) << (9 - 0);  /* (C1) <-- q0 */
-    } else {
-        env->fpus |= 0x400;  /* C2 <-- 1 */
-        fptemp = pow(2.0, expdif - 50);
-        fpsrcop = (st0 / st1) / fptemp;
-        /* fpsrcop = integer obtained by chopping */
-        fpsrcop = (fpsrcop < 0.0) ?
-                  -(floor(fabs(fpsrcop))) : floor(fpsrcop);
-        st0 -= (st1 * fpsrcop * fptemp);
-    }
-    ST0 = double_to_floatx80(st0);
+    fpu_set_exception(fpu_exception_from_fp_status(&local_status));
 }
 
 void helper_fprem(void)
 {
-    double st0, st1, dblq, fpsrcop, fptemp;
-    CPU_LDoubleU fpsrcop1, fptemp1;
-    int expdif;
-    signed long long int q;
+    floatx80 result;
+    uint64_t quotient;
 
-    st0 = floatx80_to_double(ST0);
-    st1 = floatx80_to_double(ST1);
+    /* TODO(catalinp): See comments in helper_fprem1() about lots of potential
+     * semantic ambiguities/differences between this and Bochs' implementation.
+     */
+    float_status local_status = env->fp_status;
+    local_status.float_exception_flags = 0;
+    int overflow = floatx80_remainder(ST0, ST1, &result, &quotient,
+                                      &local_status);
 
-    if (isinf(st0) || isnan(st0) || isnan(st1) || (st1 == 0.0)) {
-       ST0 = double_to_floatx80(0.0 / 0.0); /* NaN */
-       env->fpus &= (~0x4700); /* (C3,C2,C1,C0) <-- 0000 */
-       return;
-    }
-
-    fpsrcop = st0;
-    fptemp = st1;
-    fpsrcop1.d = ST0;
-    fptemp1.d = ST1;
-    expdif = EXPD(fpsrcop1) - EXPD(fptemp1);
-
-    if (expdif < 0) {
-        /* optimisation? taken from the AMD docs */
-        env->fpus &= (~0x4700); /* (C3,C2,C1,C0) <-- 0000 */
-        /* ST0 is unchanged */
-        return;
-    }
-
-    if ( expdif < 53 ) {
-        dblq = fpsrcop/*ST0*/ / fptemp/*ST1*/;
-        /* round dblq towards zero */
-        dblq = (dblq < 0.0) ? ceil(dblq) : floor(dblq);
-        st0 = fpsrcop/*ST0*/ - fptemp * dblq;
+    fprem_maybe_update_flags(overflow, quotient);
 
-        /* convert dblq to q by truncating towards zero */
-        if (dblq < 0.0)
-           q = (signed long long int)(-dblq);
-        else
-           q = (signed long long int)dblq;
+    ST0 = result;
 
-        env->fpus &= (~0x4700); /* (C3,C2,C1,C0) <-- 0000 */
-                                /* (C0,C3,C1) <-- (q2,q1,q0) */
-        env->fpus |= (q & 0x4) << (8 - 2);  /* (C0) <-- q2 */
-        env->fpus |= (q & 0x2) << (14 - 1); /* (C3) <-- q1 */
-        env->fpus |= (q & 0x1) << (9 - 0);  /* (C1) <-- q0 */
-    } else {
-        int N = 32 + (expdif % 32); /* as per AMD docs */
-        env->fpus |= 0x400;  /* C2 <-- 1 */
-        fptemp = pow(2.0, (double)(expdif - N));
-        fpsrcop = (st0 / st1) / fptemp;
-        /* fpsrcop = integer obtained by chopping */
-        fpsrcop = (fpsrcop < 0.0) ?
-                  -(floor(fabs(fpsrcop))) : floor(fpsrcop);
-        st0 -= (st1 * fpsrcop * fptemp);
-    }
-    ST0 = double_to_floatx80(st0);
+    fpu_set_exception(fpu_exception_from_fp_status(&local_status));
 }
 
 void helper_fyl2xp1(void)
-- 
1.7.7.3

^ permalink raw reply related	[flat|nested] 5+ messages in thread

* Re: [Qemu-devel] [PATCH] target-i386: implement FPREM and FPREM1 using softfloat only
  2012-07-02 15:25     ` Catalin Patulea
@ 2012-07-02 17:33       ` Catalin Patulea
  2012-07-09 20:33       ` Catalin Patulea
  2012-07-09 22:30       ` Peter Maydell
  2 siblings, 0 replies; 5+ messages in thread
From: Catalin Patulea @ 2012-07-02 17:33 UTC (permalink / raw)
  To: qemu-devel

On Mon, Jul 2, 2012 at 11:25 AM, Catalin Patulea <catalinp@google.com> wrote:
>
> FPREM1 now passes the TestFloat floatx80_rem suite (and FPREM is implemented very
> similarly).
>
> The code (the bulk of which is remainder_kernel and do_fprem) is derived from
> Bochs SVN revision 11224 dated 2012-06-21 10:33:37 -0700, with conversions to
> Qemu type aliases, C features only, etc. as needed.
Sorry about that, forgot to fix the subject line to read "PATCHv2".
Will fix for future patches.

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [Qemu-devel] [PATCH] target-i386: implement FPREM and FPREM1 using softfloat only
  2012-07-02 15:25     ` Catalin Patulea
  2012-07-02 17:33       ` Catalin Patulea
@ 2012-07-09 20:33       ` Catalin Patulea
  2012-07-09 22:30       ` Peter Maydell
  2 siblings, 0 replies; 5+ messages in thread
From: Catalin Patulea @ 2012-07-09 20:33 UTC (permalink / raw)
  To: qemu-devel

On Mon, Jul 2, 2012 at 11:25 AM, Catalin Patulea <catalinp@google.com> wrote:
> FPREM1 now passes the TestFloat floatx80_rem suite (and FPREM is implemented very
> similarly).
>
> The code (the bulk of which is remainder_kernel and do_fprem) is derived from
> Bochs SVN revision 11224 dated 2012-06-21 10:33:37 -0700, with conversions to
> Qemu type aliases, C features only, etc. as needed.
>
> Signed-off-by: Catalin Patulea <catalinp@google.com>
> ---
>  fpu/softfloat.c         |  195 +++++++++++++++++++++++++++++++++++++++++++++++
>  fpu/softfloat.h         |    4 +
>  target-i386/op_helper.c |  166 ++++++++++++++++------------------------
>  3 files changed, 266 insertions(+), 99 deletions(-)
>
> diff --git a/fpu/softfloat.c b/fpu/softfloat.c
> index b29256a..bd1879d 100644
> [...]
Ping - how do people feel about the latest patch?

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [Qemu-devel] [PATCH] target-i386: implement FPREM and FPREM1 using softfloat only
  2012-07-02 15:25     ` Catalin Patulea
  2012-07-02 17:33       ` Catalin Patulea
  2012-07-09 20:33       ` Catalin Patulea
@ 2012-07-09 22:30       ` Peter Maydell
  2 siblings, 0 replies; 5+ messages in thread
From: Peter Maydell @ 2012-07-09 22:30 UTC (permalink / raw)
  To: Catalin Patulea; +Cc: sw, qemu-devel, afaerber

On 2 July 2012 16:25, Catalin Patulea <catalinp@google.com> wrote:
> FPREM1 now passes the TestFloat floatx80_rem suite (and FPREM is implemented very
> similarly).
>
> The code (the bulk of which is remainder_kernel and do_fprem) is derived from
> Bochs SVN revision 11224 dated 2012-06-21 10:33:37 -0700, with conversions to
> Qemu type aliases, C features only, etc. as needed.

"QEMU" is the official capitalization.

>
> Signed-off-by: Catalin Patulea <catalinp@google.com>
> ---
>  fpu/softfloat.c         |  195 +++++++++++++++++++++++++++++++++++++++++++++++
>  fpu/softfloat.h         |    4 +
>  target-i386/op_helper.c |  166 ++++++++++++++++------------------------
>  3 files changed, 266 insertions(+), 99 deletions(-)
>
> diff --git a/fpu/softfloat.c b/fpu/softfloat.c
> index b29256a..bd1879d 100644
> --- a/fpu/softfloat.c
> +++ b/fpu/softfloat.c
> @@ -5234,6 +5234,16 @@ int floatx80_unordered_quiet( floatx80 a, floatx80 b STATUS_PARAM )
>  }
>
>  /*----------------------------------------------------------------------------
> +| Returns 1 if the extended double-precision floating-point value `a' is an
> +| unsupported value; otherwise returns 0.

Let's try for something a little less cryptic to save future readers having
to dig out the intel architecture manuals:
"an unsupported value (ie the bit pattern does not represent a valid
IEEE number)".

> +*----------------------------------------------------------------------------*/
> +int floatx80_is_unsupported(floatx80 a)
> +{
> +    return extractFloatx80Exp(a) &&
> +           !(extractFloatx80Frac(a) & LIT64(0x8000000000000000));
> +}

This doesn't match up with all the cases in the Intel Software Developer's
Manual table 8.3: it catches "exponent non-zero but explicit integer bit
is zero" (pseudo-NaNs, pseudo-infinities, and unnormals) but not the case
of "exponent is zero but explicit integer bit is one" (pseudo-denormals).

[For those following along at home, it is because floatx80 has an explicit
integer bit rather than the implicit bit used in IEEE single and double
that you can get 'unsupported' bit patterns, where the explicit bit is
a value different from what the implicit bit would be under IEEE rules.]

> +
> +/*----------------------------------------------------------------------------
>  | Returns the result of converting the quadruple-precision floating-point
>  | value `a' to the 32-bit two's complement integer format.  The conversion
>  | is performed according to the IEC/IEEE Standard for Binary Floating-Point
> @@ -6828,6 +6838,191 @@ floatx80 floatx80_scalbn( floatx80 a, int n STATUS_PARAM )
>                                            aSign, aExp, aSig, 0 STATUS_VAR );
>  }
>
> +/* executes single exponent reduction cycle */
> +static uint64_t remainder_kernel(uint64_t aSig0, uint64_t bSig, int expDiff,
> +                                 uint64_t *zSig0, uint64_t *zSig1)
> +{
> +    uint64_t term0, term1;
> +    uint64_t aSig1 = 0;
> +
> +    shortShift128Left(aSig1, aSig0, expDiff, &aSig1, &aSig0);
> +    uint64_t q = estimateDiv128To64(aSig1, aSig0, bSig);

Declaring variables in the middle of code isn't QEMU coding style;
top of the function, please. [Other cases below; I haven't bothered
to call them all out.]

> +    mul64To128(bSig, q, &term0, &term1);
> +    sub128(aSig1, aSig0, term0, term1, zSig1, zSig0);
> +    while ((int64)(*zSig1) < 0) {

Cast to int64 is almost certainly wrong: this conditional will
give different results if int64 is exactly 64 bits vs if it is
more than 64 bits. You probably wanted int64_t.

> +        --q;
> +        add128(*zSig1, *zSig0, 0, bSig, zSig1, zSig0);
> +    }
> +    return q;
> +}
> +
> +static int do_fprem(floatx80 a, floatx80 b, floatx80 *r, uint64_t *q,
> +                    int rounding_mode STATUS_PARAM)
> +{
> +    int32 aExp, bExp, zExp, expDiff;
> +    uint64_t aSig0, aSig1, bSig;
> +    flag aSign;
> +    *q = 0;
> +
> +    /* handle unsupported extended double-precision floating encodings */
> +    if (floatx80_is_unsupported(a) || floatx80_is_unsupported(b)) {
> +        float_raise(float_flag_invalid, status);

Use STATUS_VAR. It's stupid, but let's be consistently stupid until
we get round to globally fixing it.

> +        *r = floatx80_default_nan;
> +        return -1;
> +    }
> +
> +    aSig0 = extractFloatx80Frac(a);
> +    aExp = extractFloatx80Exp(a);
> +    aSign = extractFloatx80Sign(a);
> +    bSig = extractFloatx80Frac(b);
> +    bExp = extractFloatx80Exp(b);
> +
> +    if (aExp == 0x7FFF) {
> +        if ((uint64_t) (aSig0<<1) || ((bExp == 0x7FFF) &&
> +                                      (uint64_t) (bSig<<1))) {

aSig0 and bSig are already uint64_t, why the casts?

> +            *r = propagateFloatx80NaN(a, b, status);
> +            return -1;
> +        }
> +        float_raise(float_flag_invalid, status);
> +        *r = floatx80_default_nan;
> +        return -1;
> +    }
> +    if (bExp == 0x7FFF) {
> +        if ((uint64_t) (bSig<<1)) {
> +            *r = propagateFloatx80NaN(a, b, status);
> +            return -1;
> +        }
> +        if (aExp == 0 && aSig0) {
> +            float_raise(float_flag_input_denormal, status);
> +            normalizeFloatx80Subnormal(aSig0, &aExp, &aSig0);
> +            *r = (extractFloatx80Frac(a) & LIT64(0x8000000000000000)) ?
> +                    packFloatx80(aSign, aExp, aSig0) : a;

If you rearrange this to do the "return a if high bit of
a's significand is zero" check before calling the normalize..()
function then you wouldn't need to call extractFloatx80Frac()
again. Incidentally, exponent zero but high bit one is the
other kind of 'unsupported' value that your current test
function doesn't catch [see comments earlier] -- does the
h/w really return a here rather than asserting invalid?

> +            return 0;
> +        }
> +        *r = a;
> +        return 0;
> +
> +    }
> +    if (bExp == 0) {
> +        if (bSig == 0) {
> +            float_raise(float_flag_invalid, status);
> +            *r = floatx80_default_nan;
> +            return -1;
> +        }
> +        float_raise(float_flag_input_denormal, status);
> +        normalizeFloatx80Subnormal(bSig, &bExp, &bSig);
> +    }
> +    if (aExp == 0) {
> +        if (aSig0 == 0) {
> +            *r = a;
> +            return 0;
> +        }
> +        float_raise(float_flag_input_denormal, status);
> +        normalizeFloatx80Subnormal(aSig0, &aExp, &aSig0);
> +    }
> +    expDiff = aExp - bExp;
> +    aSig1 = 0;
> +
> +    int overflow = 0;
> +
> +    if (expDiff >= 64) {
> +        int n = (expDiff & 0x1f) | 0x20;
> +        remainder_kernel(aSig0, bSig, n, &aSig0, &aSig1);
> +        zExp = aExp - n;
> +        overflow = 1;
> +    } else {
> +        zExp = bExp;
> +
> +        if (expDiff < 0) {
> +            if (expDiff < -1) {
> +                *r = (extractFloatx80Frac(a) & LIT64(0x8000000000000000)) ?
> +                     packFloatx80(aSign, aExp, aSig0) : a;
> +                return 0;
> +            }
> +            shift128Right(aSig0, 0, 1, &aSig0, &aSig1);
> +            expDiff = 0;
> +        }
> +
> +        if (expDiff > 0) {
> +            *q = remainder_kernel(aSig0, bSig, expDiff, &aSig0, &aSig1);
> +        } else {
> +            if (bSig <= aSig0) {
> +                aSig0 -= bSig;
> +                *q = 1;
> +            }
> +        }
> +
> +        if (rounding_mode == float_round_nearest_even) {
> +            uint64_t term0, term1;
> +            shift128Right(bSig, 0, 1, &term0, &term1);
> +
> +            if (!lt128(aSig0, aSig1, term0, term1)) {
> +                int lt = lt128(term0, term1, aSig0, aSig1);
> +                int eq = eq128(aSig0, aSig1, term0, term1);

If there's some clever reason (probably involving NaN...) that we
need to calculate three separate comparisons on the same pair
of 128 bit numbers, it would be nice to have a coment about it.


> +
> +                if ((eq && (*q & 1)) || lt) {
> +                    aSign = !aSign;
> +                    ++*q;
> +                }
> +                if (lt) {
> +                    sub128(bSig, 0, aSig0, aSig1, &aSig0, &aSig1);
> +                }
> +            }
> +        }
> +    }
> +
> +    *r = normalizeRoundAndPackFloatx80(80, aSign, zExp, aSig0, aSig1, status);
> +    return overflow;
> +}
> +
> +/*----------------------------------------------------------------------------
> +| Computes the remainder of the extended double-precision floating-point value
> +| `a' with respect to the corresponding value `b'.  Stores the remainder in
> +| `*r'.
> +|
> +| Returns one of the following:
> +|   -1   The operands are invalid. Exceptions were raised and no remainder was
> +|        computed. `*r' was set to a NaN value.
> +|    0   The remainder was computed successfully and completely.
> +|    1   Only a partial remainder was computed and stored in `*r'. Call this
> +|        function again with `a' set to the returned partial remainder and the
> +|        same `b' to continue calculating the remainder.
> +|
> +| The operation is performed according to the IEC/IEEE Standard for Binary
> +| Floating-Point Arithmetic.
> +*----------------------------------------------------------------------------*/

These aren't actually computing the remainder, they're computing a potentially
partial remainder. (The actual remainder is what floatx80_rem() does.) We
should give them a name and summary comment that reflects that. Couple
of options:
 floatx80_part_rem() [ieee should be the default]
 floatx80_x87_part_rem()
or just have one function
 floatx86_part_rem() and have the x86 code pass in the rounding mode flag,
rather than two wrappers.

(Incidentally the signature for this function is annoyingly different
from the other softfloat functions, but I guess that's kind of unavoidable.
The only thing I could think of was having them return the floatx80 and
have the 'ok/partial/error' info in the sideband channel; not sure that's
really better though.)

> +
> +int floatx80_ieee754_remainder(floatx80 a, floatx80 b, floatx80 *r, uint64_t *q
> +                               STATUS_PARAM)
> +{
> +    return do_fprem(a, b, r, q, float_round_nearest_even STATUS_VAR);
> +}
> +
> +/*----------------------------------------------------------------------------
> +| Computes the remainder of the extended double-precision floating-point value
> +| `a' with  respect to  the corresponding value `b'. Stores the remainder in
> +| `*r'.
> +|
> +| Returns one of the following:
> +|   -1   The operands are invalid. Exceptions were raised and no remainder was
> +|        computed. `*r' was set to a NaN value.
> +|    0   The remainder was computed successfully and completely.
> +|    1   Only a partial remainder was computed and stored in `*r'. Call this
> +|        function again with `a' set to the returned partial remainder and the
> +|        same `b' to continue calculating the remainder.
> +|
> +| Unlike previous function the  function  does not compute  the remainder
> +| specified  in  the IEC/IEEE Standard  for Binary  Floating-Point  Arithmetic.
> +| This  function  operates differently  from the  previous  function in  the way
> +| that it  rounds  the quotient of 'a' divided by 'b' to an integer.
> +*----------------------------------------------------------------------------*/
> +
> +int floatx80_remainder(floatx80 a, floatx80 b, floatx80 *r, uint64_t *q
> +                       STATUS_PARAM)
> +{
> +    return do_fprem(a, b, r, q, float_round_to_zero STATUS_VAR);
> +}
> +
>  float128 float128_scalbn( float128 a, int n STATUS_PARAM )
>  {
>      flag aSign;
> diff --git a/fpu/softfloat.h b/fpu/softfloat.h
> index feec3a1..e6f5b87 100644
> --- a/fpu/softfloat.h
> +++ b/fpu/softfloat.h
> @@ -497,10 +497,14 @@ int floatx80_lt_quiet( floatx80, floatx80 STATUS_PARAM );
>  int floatx80_unordered_quiet( floatx80, floatx80 STATUS_PARAM );
>  int floatx80_compare( floatx80, floatx80 STATUS_PARAM );
>  int floatx80_compare_quiet( floatx80, floatx80 STATUS_PARAM );
> +int floatx80_is_unsupported(floatx80);
>  int floatx80_is_quiet_nan( floatx80 );
>  int floatx80_is_signaling_nan( floatx80 );
>  floatx80 floatx80_maybe_silence_nan( floatx80 );
>  floatx80 floatx80_scalbn( floatx80, int STATUS_PARAM );
> +int floatx80_ieee754_remainder(floatx80, floatx80, floatx80 *, uint64_t *
> +                               STATUS_PARAM);
> +int floatx80_remainder(floatx80, floatx80, floatx80 *, uint64_t * STATUS_PARAM);
>
>  INLINE floatx80 floatx80_abs(floatx80 a)
>  {
> diff --git a/target-i386/op_helper.c b/target-i386/op_helper.c
> index 2862ea4..53f46fd 100644
> --- a/target-i386/op_helper.c
> +++ b/target-i386/op_helper.c
> @@ -3620,6 +3620,23 @@ static void fpu_set_exception(int mask)
>          env->fpus |= FPUS_SE | FPUS_B;
>  }
>
> +static int fpu_exception_from_fp_status(const float_status *fp_status)
> +{
> +    int mask;
> +
> +    const int float_flag_denormals = float_flag_input_denormal |
> +                                     float_flag_output_denormal;
> +
> +    /* Most flags have the same value in the enum and 387 status word, except
> +     * the denormal flags. */
> +    mask = fp_status->float_exception_flags & ~float_flag_denormals;
> +    if (fp_status->float_exception_flags & float_flag_denormals) {

You shouldn't be reaching into the fp_status like this -- there are
accessor functions get_float_exception_flags() and set_float_exception_flags().

> +        mask |= FPUS_DE;

This looks suspicious -- we started with mask containing float_flag_*
bits (which are softfloat-defined and target-independent) and we've just
ORed FPUS_DE, which is x86-specific and defined by the layout of a
target x87 register.

> +    }
> +
> +    return mask;
> +}
> +
>  static inline floatx80 helper_fdiv(floatx80 a, floatx80 b)
>  {
>      if (floatx80_is_zero(b)) {
> @@ -4208,121 +4225,72 @@ void helper_fxtract(void)
>      }
>  }
>
> +static void fprem_maybe_update_flags(int overflow, uint64_t quotient)
> +{
> +    if (overflow >= 0) {  /* overflow < 0 means error */
> +        env->fpus &= ~0x4700; /* (C3,C2,C1,C0) <-- 0000 */
> +        if (overflow) {
> +            env->fpus |= 1 << 10;
> +        } else {            /* (C0,C3,C1) <-- (q2,q1,q0) */
> +            if (quotient & 1) {
> +                env->fpus |= 1 << 9;
> +            }
> +            if (quotient & 2) {
> +                env->fpus |= 1 << 14;
> +            }
> +            if (quotient & 4) {
> +                env->fpus |= 1 << 8;
> +            }
> +        }
> +    }
> +}
> +
>  void helper_fprem1(void)
>  {
> -    double st0, st1, dblq, fpsrcop, fptemp;
> -    CPU_LDoubleU fpsrcop1, fptemp1;
> -    int expdif;
> -    signed long long int q;
> +    floatx80 result;
> +    uint64_t quotient;
>
> -    st0 = floatx80_to_double(ST0);
> -    st1 = floatx80_to_double(ST1);
> +    /* Use a copy of the current fp_status so that we can detect new exceptions
> +     * and pass them to fpu_set_exception. TODO(catalinp): Remove this once
> +     * exceptions are reported directly into env->fp_status. */

You definitely need to fix your TODOs...
(Certainly if you can do so your patch has a much higher chance of
acceptance than if you require a reviewer to spend ~30-60mins reading
enough of the Intel docs, target-i386 implementation and Bochs sources
to figure out the answers for you, especially since target-i386 is not
really maintained.)

> +    float_status local_status = env->fp_status;
>
> -    if (isinf(st0) || isnan(st0) || isnan(st1) || (st1 == 0.0)) {
> -        ST0 = double_to_floatx80(0.0 / 0.0); /* NaN */
> -        env->fpus &= (~0x4700); /* (C3,C2,C1,C0) <-- 0000 */
> -        return;
> -    }
> -
> -    fpsrcop = st0;
> -    fptemp = st1;
> -    fpsrcop1.d = ST0;
> -    fptemp1.d = ST1;
> -    expdif = EXPD(fpsrcop1) - EXPD(fptemp1);
> +    /* TODO(catalinp): Bochs' FPU_pre_exception_handling resets a few more
> +     * fields in fp_status before doing the operation. What is the purpose of
> +     * that and is it necessary here? */
> +    local_status.float_exception_flags = 0;
>
> -    if (expdif < 0) {
> -        /* optimisation? taken from the AMD docs */
> -        env->fpus &= (~0x4700); /* (C3,C2,C1,C0) <-- 0000 */
> -        /* ST0 is unchanged */
> -        return;
> -    }
> +    int overflow = floatx80_ieee754_remainder(ST0, ST1, &result, &quotient,
> +                                              &local_status);
>
> -    if (expdif < 53) {
> -        dblq = fpsrcop / fptemp;
> -        /* round dblq towards nearest integer */
> -        dblq = rint(dblq);
> -        st0 = fpsrcop - fptemp * dblq;
> +    /* TODO(catalinp): Bochs also checks for unmasked exceptions before storing
> +     * these flags. Should we also do this?
> +     */
> +    fprem_maybe_update_flags(overflow, quotient);
>
> -        /* convert dblq to q by truncating towards zero */
> -        if (dblq < 0.0)
> -           q = (signed long long int)(-dblq);
> -        else
> -           q = (signed long long int)dblq;
> +    ST0 = result;
>
> -        env->fpus &= (~0x4700); /* (C3,C2,C1,C0) <-- 0000 */
> -                                /* (C0,C3,C1) <-- (q2,q1,q0) */
> -        env->fpus |= (q & 0x4) << (8 - 2);  /* (C0) <-- q2 */
> -        env->fpus |= (q & 0x2) << (14 - 1); /* (C3) <-- q1 */
> -        env->fpus |= (q & 0x1) << (9 - 0);  /* (C1) <-- q0 */
> -    } else {
> -        env->fpus |= 0x400;  /* C2 <-- 1 */
> -        fptemp = pow(2.0, expdif - 50);
> -        fpsrcop = (st0 / st1) / fptemp;
> -        /* fpsrcop = integer obtained by chopping */
> -        fpsrcop = (fpsrcop < 0.0) ?
> -                  -(floor(fabs(fpsrcop))) : floor(fpsrcop);
> -        st0 -= (st1 * fpsrcop * fptemp);
> -    }
> -    ST0 = double_to_floatx80(st0);
> +    fpu_set_exception(fpu_exception_from_fp_status(&local_status));
>  }
>
>  void helper_fprem(void)
>  {
> -    double st0, st1, dblq, fpsrcop, fptemp;
> -    CPU_LDoubleU fpsrcop1, fptemp1;
> -    int expdif;
> -    signed long long int q;
> +    floatx80 result;
> +    uint64_t quotient;
>
> -    st0 = floatx80_to_double(ST0);
> -    st1 = floatx80_to_double(ST1);
> +    /* TODO(catalinp): See comments in helper_fprem1() about lots of potential
> +     * semantic ambiguities/differences between this and Bochs' implementation.
> +     */
> +    float_status local_status = env->fp_status;
> +    local_status.float_exception_flags = 0;
> +    int overflow = floatx80_remainder(ST0, ST1, &result, &quotient,
> +                                      &local_status);
>
> -    if (isinf(st0) || isnan(st0) || isnan(st1) || (st1 == 0.0)) {
> -       ST0 = double_to_floatx80(0.0 / 0.0); /* NaN */
> -       env->fpus &= (~0x4700); /* (C3,C2,C1,C0) <-- 0000 */
> -       return;
> -    }
> -
> -    fpsrcop = st0;
> -    fptemp = st1;
> -    fpsrcop1.d = ST0;
> -    fptemp1.d = ST1;
> -    expdif = EXPD(fpsrcop1) - EXPD(fptemp1);
> -
> -    if (expdif < 0) {
> -        /* optimisation? taken from the AMD docs */
> -        env->fpus &= (~0x4700); /* (C3,C2,C1,C0) <-- 0000 */
> -        /* ST0 is unchanged */
> -        return;
> -    }
> -
> -    if ( expdif < 53 ) {
> -        dblq = fpsrcop/*ST0*/ / fptemp/*ST1*/;
> -        /* round dblq towards zero */
> -        dblq = (dblq < 0.0) ? ceil(dblq) : floor(dblq);
> -        st0 = fpsrcop/*ST0*/ - fptemp * dblq;
> +    fprem_maybe_update_flags(overflow, quotient);
>
> -        /* convert dblq to q by truncating towards zero */
> -        if (dblq < 0.0)
> -           q = (signed long long int)(-dblq);
> -        else
> -           q = (signed long long int)dblq;
> +    ST0 = result;
>
> -        env->fpus &= (~0x4700); /* (C3,C2,C1,C0) <-- 0000 */
> -                                /* (C0,C3,C1) <-- (q2,q1,q0) */
> -        env->fpus |= (q & 0x4) << (8 - 2);  /* (C0) <-- q2 */
> -        env->fpus |= (q & 0x2) << (14 - 1); /* (C3) <-- q1 */
> -        env->fpus |= (q & 0x1) << (9 - 0);  /* (C1) <-- q0 */
> -    } else {
> -        int N = 32 + (expdif % 32); /* as per AMD docs */
> -        env->fpus |= 0x400;  /* C2 <-- 1 */
> -        fptemp = pow(2.0, (double)(expdif - N));
> -        fpsrcop = (st0 / st1) / fptemp;
> -        /* fpsrcop = integer obtained by chopping */
> -        fpsrcop = (fpsrcop < 0.0) ?
> -                  -(floor(fabs(fpsrcop))) : floor(fpsrcop);
> -        st0 -= (st1 * fpsrcop * fptemp);
> -    }
> -    ST0 = double_to_floatx80(st0);
> +    fpu_set_exception(fpu_exception_from_fp_status(&local_status));
>  }
>
>  void helper_fyl2xp1(void)
> --
> 1.7.7.3
>
>

-- PMM

^ permalink raw reply	[flat|nested] 5+ messages in thread

end of thread, other threads:[~2012-07-09 22:30 UTC | newest]

Thread overview: 5+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
     [not found] <1340838036-10467-1-git-send-email-catalinp@google.com>
     [not found] ` <4FEDAA07.6030600@suse.de>
2012-07-02 14:51   ` [Qemu-devel] [PATCH] target-i386: implement FPREM and FPREM1 using softfloat only Catalin Patulea
2012-07-02 15:25     ` Catalin Patulea
2012-07-02 17:33       ` Catalin Patulea
2012-07-09 20:33       ` Catalin Patulea
2012-07-09 22:30       ` Peter Maydell

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.