From mboxrd@z Thu Jan  1 00:00:00 1970
Received: from eggs.gnu.org ([208.118.235.92]:36510)
	by lists.gnu.org with esmtp (Exim 4.71)
	(envelope-from <catalinp@dhcp-172-17-9-207.mtv.corp.google.com>)
	id 1SlkIW-0005ey-Jf
	for qemu-devel@nongnu.org; Mon, 02 Jul 2012 13:20:30 -0400
Received: from Debian-exim by eggs.gnu.org with spam-scanned (Exim 4.71)
	(envelope-from <catalinp@dhcp-172-17-9-207.mtv.corp.google.com>)
	id 1SlkIQ-0003P6-Te
	for qemu-devel@nongnu.org; Mon, 02 Jul 2012 13:20:28 -0400
Received: from 24-212-240-69.cable.teksavvy.com ([24.212.240.69]:55284
	helo=glucid-laptop-imager.hot.corp.google.com)
	by eggs.gnu.org with esmtp (Exim 4.71)
	(envelope-from <catalinp@dhcp-172-17-9-207.mtv.corp.google.com>)
	id 1SlkIQ-0003OK-JL
	for qemu-devel@nongnu.org; Mon, 02 Jul 2012 13:20:22 -0400
From: Catalin Patulea <catalinp@google.com>
Date: Mon,  2 Jul 2012 11:25:35 -0400
Message-Id: <1341242735-19875-1-git-send-email-catalinp@google.com>
In-Reply-To: <CAJB0wA-dLJPKKHCjX91FAHbp6r9i7+q08ZBJ2Rfc5LOkFaTAaA@mail.gmail.com>
References: <CAJB0wA-dLJPKKHCjX91FAHbp6r9i7+q08ZBJ2Rfc5LOkFaTAaA@mail.gmail.com>
Subject: [Qemu-devel] [PATCH] target-i386: implement FPREM and FPREM1 using
	softfloat only
List-Id: <qemu-devel.nongnu.org>
List-Unsubscribe: <https://lists.nongnu.org/mailman/options/qemu-devel>,
	<mailto:qemu-devel-request@nongnu.org?subject=unsubscribe>
List-Archive: <http://lists.nongnu.org/archive/html/qemu-devel>
List-Post: <mailto:qemu-devel@nongnu.org>
List-Help: <mailto:qemu-devel-request@nongnu.org?subject=help>
List-Subscribe: <https://lists.nongnu.org/mailman/listinfo/qemu-devel>,
	<mailto:qemu-devel-request@nongnu.org?subject=subscribe>
To: qemu-devel@nongnu.org
Cc: sw@weilnetz.de, Catalin Patulea <catalinp@google.com>, afaerber@suse.de, peter.maydell@linaro.org

FPREM1 now passes the TestFloat floatx80_rem suite (and FPREM is implemented very
similarly).

The code (the bulk of which is remainder_kernel and do_fprem) is derived from
Bochs SVN revision 11224 dated 2012-06-21 10:33:37 -0700, with conversions to
Qemu type aliases, C features only, etc. as needed.

Signed-off-by: Catalin Patulea <catalinp@google.com>
---
 fpu/softfloat.c         |  195 +++++++++++++++++++++++++++++++++++++++++++++++
 fpu/softfloat.h         |    4 +
 target-i386/op_helper.c |  166 ++++++++++++++++------------------------
 3 files changed, 266 insertions(+), 99 deletions(-)

diff --git a/fpu/softfloat.c b/fpu/softfloat.c
index b29256a..bd1879d 100644
--- a/fpu/softfloat.c
+++ b/fpu/softfloat.c
@@ -5234,6 +5234,16 @@ int floatx80_unordered_quiet( floatx80 a, floatx80 b STATUS_PARAM )
 }
 
 /*----------------------------------------------------------------------------
+| Returns 1 if the extended double-precision floating-point value `a' is an
+| unsupported value; otherwise returns 0.
+*----------------------------------------------------------------------------*/
+int floatx80_is_unsupported(floatx80 a)
+{
+    return extractFloatx80Exp(a) &&
+           !(extractFloatx80Frac(a) & LIT64(0x8000000000000000));
+}
+
+/*----------------------------------------------------------------------------
 | Returns the result of converting the quadruple-precision floating-point
 | value `a' to the 32-bit two's complement integer format.  The conversion
 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
@@ -6828,6 +6838,191 @@ floatx80 floatx80_scalbn( floatx80 a, int n STATUS_PARAM )
                                           aSign, aExp, aSig, 0 STATUS_VAR );
 }
 
+/* executes single exponent reduction cycle */
+static uint64_t remainder_kernel(uint64_t aSig0, uint64_t bSig, int expDiff,
+                                 uint64_t *zSig0, uint64_t *zSig1)
+{
+    uint64_t term0, term1;
+    uint64_t aSig1 = 0;
+
+    shortShift128Left(aSig1, aSig0, expDiff, &aSig1, &aSig0);
+    uint64_t q = estimateDiv128To64(aSig1, aSig0, bSig);
+    mul64To128(bSig, q, &term0, &term1);
+    sub128(aSig1, aSig0, term0, term1, zSig1, zSig0);
+    while ((int64)(*zSig1) < 0) {
+        --q;
+        add128(*zSig1, *zSig0, 0, bSig, zSig1, zSig0);
+    }
+    return q;
+}
+
+static int do_fprem(floatx80 a, floatx80 b, floatx80 *r, uint64_t *q,
+                    int rounding_mode STATUS_PARAM)
+{
+    int32 aExp, bExp, zExp, expDiff;
+    uint64_t aSig0, aSig1, bSig;
+    flag aSign;
+    *q = 0;
+
+    /* handle unsupported extended double-precision floating encodings */
+    if (floatx80_is_unsupported(a) || floatx80_is_unsupported(b)) {
+        float_raise(float_flag_invalid, status);
+        *r = floatx80_default_nan;
+        return -1;
+    }
+
+    aSig0 = extractFloatx80Frac(a);
+    aExp = extractFloatx80Exp(a);
+    aSign = extractFloatx80Sign(a);
+    bSig = extractFloatx80Frac(b);
+    bExp = extractFloatx80Exp(b);
+
+    if (aExp == 0x7FFF) {
+        if ((uint64_t) (aSig0<<1) || ((bExp == 0x7FFF) &&
+                                      (uint64_t) (bSig<<1))) {
+            *r = propagateFloatx80NaN(a, b, status);
+            return -1;
+        }
+        float_raise(float_flag_invalid, status);
+        *r = floatx80_default_nan;
+        return -1;
+    }
+    if (bExp == 0x7FFF) {
+        if ((uint64_t) (bSig<<1)) {
+            *r = propagateFloatx80NaN(a, b, status);
+            return -1;
+        }
+        if (aExp == 0 && aSig0) {
+            float_raise(float_flag_input_denormal, status);
+            normalizeFloatx80Subnormal(aSig0, &aExp, &aSig0);
+            *r = (extractFloatx80Frac(a) & LIT64(0x8000000000000000)) ?
+                    packFloatx80(aSign, aExp, aSig0) : a;
+            return 0;
+        }
+        *r = a;
+        return 0;
+
+    }
+    if (bExp == 0) {
+        if (bSig == 0) {
+            float_raise(float_flag_invalid, status);
+            *r = floatx80_default_nan;
+            return -1;
+        }
+        float_raise(float_flag_input_denormal, status);
+        normalizeFloatx80Subnormal(bSig, &bExp, &bSig);
+    }
+    if (aExp == 0) {
+        if (aSig0 == 0) {
+            *r = a;
+            return 0;
+        }
+        float_raise(float_flag_input_denormal, status);
+        normalizeFloatx80Subnormal(aSig0, &aExp, &aSig0);
+    }
+    expDiff = aExp - bExp;
+    aSig1 = 0;
+
+    int overflow = 0;
+
+    if (expDiff >= 64) {
+        int n = (expDiff & 0x1f) | 0x20;
+        remainder_kernel(aSig0, bSig, n, &aSig0, &aSig1);
+        zExp = aExp - n;
+        overflow = 1;
+    } else {
+        zExp = bExp;
+
+        if (expDiff < 0) {
+            if (expDiff < -1) {
+                *r = (extractFloatx80Frac(a) & LIT64(0x8000000000000000)) ?
+                     packFloatx80(aSign, aExp, aSig0) : a;
+                return 0;
+            }
+            shift128Right(aSig0, 0, 1, &aSig0, &aSig1);
+            expDiff = 0;
+        }
+
+        if (expDiff > 0) {
+            *q = remainder_kernel(aSig0, bSig, expDiff, &aSig0, &aSig1);
+        } else {
+            if (bSig <= aSig0) {
+                aSig0 -= bSig;
+                *q = 1;
+            }
+        }
+
+        if (rounding_mode == float_round_nearest_even) {
+            uint64_t term0, term1;
+            shift128Right(bSig, 0, 1, &term0, &term1);
+
+            if (!lt128(aSig0, aSig1, term0, term1)) {
+                int lt = lt128(term0, term1, aSig0, aSig1);
+                int eq = eq128(aSig0, aSig1, term0, term1);
+
+                if ((eq && (*q & 1)) || lt) {
+                    aSign = !aSign;
+                    ++*q;
+                }
+                if (lt) {
+                    sub128(bSig, 0, aSig0, aSig1, &aSig0, &aSig1);
+                }
+            }
+        }
+    }
+
+    *r = normalizeRoundAndPackFloatx80(80, aSign, zExp, aSig0, aSig1, status);
+    return overflow;
+}
+
+/*----------------------------------------------------------------------------
+| Computes the remainder of the extended double-precision floating-point value
+| `a' with respect to the corresponding value `b'.  Stores the remainder in
+| `*r'.
+|
+| Returns one of the following:
+|   -1   The operands are invalid. Exceptions were raised and no remainder was
+|        computed. `*r' was set to a NaN value.
+|    0   The remainder was computed successfully and completely.
+|    1   Only a partial remainder was computed and stored in `*r'. Call this
+|        function again with `a' set to the returned partial remainder and the
+|        same `b' to continue calculating the remainder.
+|
+| The operation is performed according to the IEC/IEEE Standard for Binary
+| Floating-Point Arithmetic.
+*----------------------------------------------------------------------------*/
+
+int floatx80_ieee754_remainder(floatx80 a, floatx80 b, floatx80 *r, uint64_t *q
+                               STATUS_PARAM)
+{
+    return do_fprem(a, b, r, q, float_round_nearest_even STATUS_VAR);
+}
+
+/*----------------------------------------------------------------------------
+| Computes the remainder of the extended double-precision floating-point value
+| `a' with  respect to  the corresponding value `b'. Stores the remainder in
+| `*r'.
+|
+| Returns one of the following:
+|   -1   The operands are invalid. Exceptions were raised and no remainder was
+|        computed. `*r' was set to a NaN value.
+|    0   The remainder was computed successfully and completely.
+|    1   Only a partial remainder was computed and stored in `*r'. Call this
+|        function again with `a' set to the returned partial remainder and the
+|        same `b' to continue calculating the remainder.
+|
+| Unlike previous function the  function  does not compute  the remainder
+| specified  in  the IEC/IEEE Standard  for Binary  Floating-Point  Arithmetic.
+| This  function  operates differently  from the  previous  function in  the way
+| that it  rounds  the quotient of 'a' divided by 'b' to an integer.
+*----------------------------------------------------------------------------*/
+
+int floatx80_remainder(floatx80 a, floatx80 b, floatx80 *r, uint64_t *q
+                       STATUS_PARAM)
+{
+    return do_fprem(a, b, r, q, float_round_to_zero STATUS_VAR);
+}
+
 float128 float128_scalbn( float128 a, int n STATUS_PARAM )
 {
     flag aSign;
diff --git a/fpu/softfloat.h b/fpu/softfloat.h
index feec3a1..e6f5b87 100644
--- a/fpu/softfloat.h
+++ b/fpu/softfloat.h
@@ -497,10 +497,14 @@ int floatx80_lt_quiet( floatx80, floatx80 STATUS_PARAM );
 int floatx80_unordered_quiet( floatx80, floatx80 STATUS_PARAM );
 int floatx80_compare( floatx80, floatx80 STATUS_PARAM );
 int floatx80_compare_quiet( floatx80, floatx80 STATUS_PARAM );
+int floatx80_is_unsupported(floatx80);
 int floatx80_is_quiet_nan( floatx80 );
 int floatx80_is_signaling_nan( floatx80 );
 floatx80 floatx80_maybe_silence_nan( floatx80 );
 floatx80 floatx80_scalbn( floatx80, int STATUS_PARAM );
+int floatx80_ieee754_remainder(floatx80, floatx80, floatx80 *, uint64_t *
+                               STATUS_PARAM);
+int floatx80_remainder(floatx80, floatx80, floatx80 *, uint64_t * STATUS_PARAM);
 
 INLINE floatx80 floatx80_abs(floatx80 a)
 {
diff --git a/target-i386/op_helper.c b/target-i386/op_helper.c
index 2862ea4..53f46fd 100644
--- a/target-i386/op_helper.c
+++ b/target-i386/op_helper.c
@@ -3620,6 +3620,23 @@ static void fpu_set_exception(int mask)
         env->fpus |= FPUS_SE | FPUS_B;
 }
 
+static int fpu_exception_from_fp_status(const float_status *fp_status)
+{
+    int mask;
+
+    const int float_flag_denormals = float_flag_input_denormal |
+                                     float_flag_output_denormal;
+
+    /* Most flags have the same value in the enum and 387 status word, except
+     * the denormal flags. */
+    mask = fp_status->float_exception_flags & ~float_flag_denormals;
+    if (fp_status->float_exception_flags & float_flag_denormals) {
+        mask |= FPUS_DE;
+    }
+
+    return mask;
+}
+
 static inline floatx80 helper_fdiv(floatx80 a, floatx80 b)
 {
     if (floatx80_is_zero(b)) {
@@ -4208,121 +4225,72 @@ void helper_fxtract(void)
     }
 }
 
+static void fprem_maybe_update_flags(int overflow, uint64_t quotient)
+{
+    if (overflow >= 0) {  /* overflow < 0 means error */
+        env->fpus &= ~0x4700; /* (C3,C2,C1,C0) <-- 0000 */
+        if (overflow) {
+            env->fpus |= 1 << 10;
+        } else {            /* (C0,C3,C1) <-- (q2,q1,q0) */
+            if (quotient & 1) {
+                env->fpus |= 1 << 9;
+            }
+            if (quotient & 2) {
+                env->fpus |= 1 << 14;
+            }
+            if (quotient & 4) {
+                env->fpus |= 1 << 8;
+            }
+        }
+    }
+}
+
 void helper_fprem1(void)
 {
-    double st0, st1, dblq, fpsrcop, fptemp;
-    CPU_LDoubleU fpsrcop1, fptemp1;
-    int expdif;
-    signed long long int q;
+    floatx80 result;
+    uint64_t quotient;
 
-    st0 = floatx80_to_double(ST0);
-    st1 = floatx80_to_double(ST1);
+    /* Use a copy of the current fp_status so that we can detect new exceptions
+     * and pass them to fpu_set_exception. TODO(catalinp): Remove this once
+     * exceptions are reported directly into env->fp_status. */
+    float_status local_status = env->fp_status;
 
-    if (isinf(st0) || isnan(st0) || isnan(st1) || (st1 == 0.0)) {
-        ST0 = double_to_floatx80(0.0 / 0.0); /* NaN */
-        env->fpus &= (~0x4700); /* (C3,C2,C1,C0) <-- 0000 */
-        return;
-    }
-
-    fpsrcop = st0;
-    fptemp = st1;
-    fpsrcop1.d = ST0;
-    fptemp1.d = ST1;
-    expdif = EXPD(fpsrcop1) - EXPD(fptemp1);
+    /* TODO(catalinp): Bochs' FPU_pre_exception_handling resets a few more
+     * fields in fp_status before doing the operation. What is the purpose of
+     * that and is it necessary here? */
+    local_status.float_exception_flags = 0;
 
-    if (expdif < 0) {
-        /* optimisation? taken from the AMD docs */
-        env->fpus &= (~0x4700); /* (C3,C2,C1,C0) <-- 0000 */
-        /* ST0 is unchanged */
-        return;
-    }
+    int overflow = floatx80_ieee754_remainder(ST0, ST1, &result, &quotient,
+                                              &local_status);
 
-    if (expdif < 53) {
-        dblq = fpsrcop / fptemp;
-        /* round dblq towards nearest integer */
-        dblq = rint(dblq);
-        st0 = fpsrcop - fptemp * dblq;
+    /* TODO(catalinp): Bochs also checks for unmasked exceptions before storing
+     * these flags. Should we also do this?
+     */
+    fprem_maybe_update_flags(overflow, quotient);
 
-        /* convert dblq to q by truncating towards zero */
-        if (dblq < 0.0)
-           q = (signed long long int)(-dblq);
-        else
-           q = (signed long long int)dblq;
+    ST0 = result;
 
-        env->fpus &= (~0x4700); /* (C3,C2,C1,C0) <-- 0000 */
-                                /* (C0,C3,C1) <-- (q2,q1,q0) */
-        env->fpus |= (q & 0x4) << (8 - 2);  /* (C0) <-- q2 */
-        env->fpus |= (q & 0x2) << (14 - 1); /* (C3) <-- q1 */
-        env->fpus |= (q & 0x1) << (9 - 0);  /* (C1) <-- q0 */
-    } else {
-        env->fpus |= 0x400;  /* C2 <-- 1 */
-        fptemp = pow(2.0, expdif - 50);
-        fpsrcop = (st0 / st1) / fptemp;
-        /* fpsrcop = integer obtained by chopping */
-        fpsrcop = (fpsrcop < 0.0) ?
-                  -(floor(fabs(fpsrcop))) : floor(fpsrcop);
-        st0 -= (st1 * fpsrcop * fptemp);
-    }
-    ST0 = double_to_floatx80(st0);
+    fpu_set_exception(fpu_exception_from_fp_status(&local_status));
 }
 
 void helper_fprem(void)
 {
-    double st0, st1, dblq, fpsrcop, fptemp;
-    CPU_LDoubleU fpsrcop1, fptemp1;
-    int expdif;
-    signed long long int q;
+    floatx80 result;
+    uint64_t quotient;
 
-    st0 = floatx80_to_double(ST0);
-    st1 = floatx80_to_double(ST1);
+    /* TODO(catalinp): See comments in helper_fprem1() about lots of potential
+     * semantic ambiguities/differences between this and Bochs' implementation.
+     */
+    float_status local_status = env->fp_status;
+    local_status.float_exception_flags = 0;
+    int overflow = floatx80_remainder(ST0, ST1, &result, &quotient,
+                                      &local_status);
 
-    if (isinf(st0) || isnan(st0) || isnan(st1) || (st1 == 0.0)) {
-       ST0 = double_to_floatx80(0.0 / 0.0); /* NaN */
-       env->fpus &= (~0x4700); /* (C3,C2,C1,C0) <-- 0000 */
-       return;
-    }
-
-    fpsrcop = st0;
-    fptemp = st1;
-    fpsrcop1.d = ST0;
-    fptemp1.d = ST1;
-    expdif = EXPD(fpsrcop1) - EXPD(fptemp1);
-
-    if (expdif < 0) {
-        /* optimisation? taken from the AMD docs */
-        env->fpus &= (~0x4700); /* (C3,C2,C1,C0) <-- 0000 */
-        /* ST0 is unchanged */
-        return;
-    }
-
-    if ( expdif < 53 ) {
-        dblq = fpsrcop/*ST0*/ / fptemp/*ST1*/;
-        /* round dblq towards zero */
-        dblq = (dblq < 0.0) ? ceil(dblq) : floor(dblq);
-        st0 = fpsrcop/*ST0*/ - fptemp * dblq;
+    fprem_maybe_update_flags(overflow, quotient);
 
-        /* convert dblq to q by truncating towards zero */
-        if (dblq < 0.0)
-           q = (signed long long int)(-dblq);
-        else
-           q = (signed long long int)dblq;
+    ST0 = result;
 
-        env->fpus &= (~0x4700); /* (C3,C2,C1,C0) <-- 0000 */
-                                /* (C0,C3,C1) <-- (q2,q1,q0) */
-        env->fpus |= (q & 0x4) << (8 - 2);  /* (C0) <-- q2 */
-        env->fpus |= (q & 0x2) << (14 - 1); /* (C3) <-- q1 */
-        env->fpus |= (q & 0x1) << (9 - 0);  /* (C1) <-- q0 */
-    } else {
-        int N = 32 + (expdif % 32); /* as per AMD docs */
-        env->fpus |= 0x400;  /* C2 <-- 1 */
-        fptemp = pow(2.0, (double)(expdif - N));
-        fpsrcop = (st0 / st1) / fptemp;
-        /* fpsrcop = integer obtained by chopping */
-        fpsrcop = (fpsrcop < 0.0) ?
-                  -(floor(fabs(fpsrcop))) : floor(fpsrcop);
-        st0 -= (st1 * fpsrcop * fptemp);
-    }
-    ST0 = double_to_floatx80(st0);
+    fpu_set_exception(fpu_exception_from_fp_status(&local_status));
 }
 
 void helper_fyl2xp1(void)
-- 
1.7.7.3