From mboxrd@z Thu Jan 1 00:00:00 1970 Received: from eggs.gnu.org ([208.118.235.92]:36510) by lists.gnu.org with esmtp (Exim 4.71) (envelope-from ) id 1SlkIW-0005ey-Jf for qemu-devel@nongnu.org; Mon, 02 Jul 2012 13:20:30 -0400 Received: from Debian-exim by eggs.gnu.org with spam-scanned (Exim 4.71) (envelope-from ) id 1SlkIQ-0003P6-Te for qemu-devel@nongnu.org; Mon, 02 Jul 2012 13:20:28 -0400 Received: from 24-212-240-69.cable.teksavvy.com ([24.212.240.69]:55284 helo=glucid-laptop-imager.hot.corp.google.com) by eggs.gnu.org with esmtp (Exim 4.71) (envelope-from ) id 1SlkIQ-0003OK-JL for qemu-devel@nongnu.org; Mon, 02 Jul 2012 13:20:22 -0400 From: Catalin Patulea Date: Mon, 2 Jul 2012 11:25:35 -0400 Message-Id: <1341242735-19875-1-git-send-email-catalinp@google.com> In-Reply-To: References: Subject: [Qemu-devel] [PATCH] target-i386: implement FPREM and FPREM1 using softfloat only List-Id: List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , To: qemu-devel@nongnu.org Cc: sw@weilnetz.de, Catalin Patulea , afaerber@suse.de, peter.maydell@linaro.org FPREM1 now passes the TestFloat floatx80_rem suite (and FPREM is implemented very similarly). The code (the bulk of which is remainder_kernel and do_fprem) is derived from Bochs SVN revision 11224 dated 2012-06-21 10:33:37 -0700, with conversions to Qemu type aliases, C features only, etc. as needed. Signed-off-by: Catalin Patulea --- fpu/softfloat.c | 195 +++++++++++++++++++++++++++++++++++++++++++++++ fpu/softfloat.h | 4 + target-i386/op_helper.c | 166 ++++++++++++++++------------------------ 3 files changed, 266 insertions(+), 99 deletions(-) diff --git a/fpu/softfloat.c b/fpu/softfloat.c index b29256a..bd1879d 100644 --- a/fpu/softfloat.c +++ b/fpu/softfloat.c @@ -5234,6 +5234,16 @@ int floatx80_unordered_quiet( floatx80 a, floatx80 b STATUS_PARAM ) } /*---------------------------------------------------------------------------- +| Returns 1 if the extended double-precision floating-point value `a' is an +| unsupported value; otherwise returns 0. +*----------------------------------------------------------------------------*/ +int floatx80_is_unsupported(floatx80 a) +{ + return extractFloatx80Exp(a) && + !(extractFloatx80Frac(a) & LIT64(0x8000000000000000)); +} + +/*---------------------------------------------------------------------------- | Returns the result of converting the quadruple-precision floating-point | value `a' to the 32-bit two's complement integer format. The conversion | is performed according to the IEC/IEEE Standard for Binary Floating-Point @@ -6828,6 +6838,191 @@ floatx80 floatx80_scalbn( floatx80 a, int n STATUS_PARAM ) aSign, aExp, aSig, 0 STATUS_VAR ); } +/* executes single exponent reduction cycle */ +static uint64_t remainder_kernel(uint64_t aSig0, uint64_t bSig, int expDiff, + uint64_t *zSig0, uint64_t *zSig1) +{ + uint64_t term0, term1; + uint64_t aSig1 = 0; + + shortShift128Left(aSig1, aSig0, expDiff, &aSig1, &aSig0); + uint64_t q = estimateDiv128To64(aSig1, aSig0, bSig); + mul64To128(bSig, q, &term0, &term1); + sub128(aSig1, aSig0, term0, term1, zSig1, zSig0); + while ((int64)(*zSig1) < 0) { + --q; + add128(*zSig1, *zSig0, 0, bSig, zSig1, zSig0); + } + return q; +} + +static int do_fprem(floatx80 a, floatx80 b, floatx80 *r, uint64_t *q, + int rounding_mode STATUS_PARAM) +{ + int32 aExp, bExp, zExp, expDiff; + uint64_t aSig0, aSig1, bSig; + flag aSign; + *q = 0; + + /* handle unsupported extended double-precision floating encodings */ + if (floatx80_is_unsupported(a) || floatx80_is_unsupported(b)) { + float_raise(float_flag_invalid, status); + *r = floatx80_default_nan; + return -1; + } + + aSig0 = extractFloatx80Frac(a); + aExp = extractFloatx80Exp(a); + aSign = extractFloatx80Sign(a); + bSig = extractFloatx80Frac(b); + bExp = extractFloatx80Exp(b); + + if (aExp == 0x7FFF) { + if ((uint64_t) (aSig0<<1) || ((bExp == 0x7FFF) && + (uint64_t) (bSig<<1))) { + *r = propagateFloatx80NaN(a, b, status); + return -1; + } + float_raise(float_flag_invalid, status); + *r = floatx80_default_nan; + return -1; + } + if (bExp == 0x7FFF) { + if ((uint64_t) (bSig<<1)) { + *r = propagateFloatx80NaN(a, b, status); + return -1; + } + if (aExp == 0 && aSig0) { + float_raise(float_flag_input_denormal, status); + normalizeFloatx80Subnormal(aSig0, &aExp, &aSig0); + *r = (extractFloatx80Frac(a) & LIT64(0x8000000000000000)) ? + packFloatx80(aSign, aExp, aSig0) : a; + return 0; + } + *r = a; + return 0; + + } + if (bExp == 0) { + if (bSig == 0) { + float_raise(float_flag_invalid, status); + *r = floatx80_default_nan; + return -1; + } + float_raise(float_flag_input_denormal, status); + normalizeFloatx80Subnormal(bSig, &bExp, &bSig); + } + if (aExp == 0) { + if (aSig0 == 0) { + *r = a; + return 0; + } + float_raise(float_flag_input_denormal, status); + normalizeFloatx80Subnormal(aSig0, &aExp, &aSig0); + } + expDiff = aExp - bExp; + aSig1 = 0; + + int overflow = 0; + + if (expDiff >= 64) { + int n = (expDiff & 0x1f) | 0x20; + remainder_kernel(aSig0, bSig, n, &aSig0, &aSig1); + zExp = aExp - n; + overflow = 1; + } else { + zExp = bExp; + + if (expDiff < 0) { + if (expDiff < -1) { + *r = (extractFloatx80Frac(a) & LIT64(0x8000000000000000)) ? + packFloatx80(aSign, aExp, aSig0) : a; + return 0; + } + shift128Right(aSig0, 0, 1, &aSig0, &aSig1); + expDiff = 0; + } + + if (expDiff > 0) { + *q = remainder_kernel(aSig0, bSig, expDiff, &aSig0, &aSig1); + } else { + if (bSig <= aSig0) { + aSig0 -= bSig; + *q = 1; + } + } + + if (rounding_mode == float_round_nearest_even) { + uint64_t term0, term1; + shift128Right(bSig, 0, 1, &term0, &term1); + + if (!lt128(aSig0, aSig1, term0, term1)) { + int lt = lt128(term0, term1, aSig0, aSig1); + int eq = eq128(aSig0, aSig1, term0, term1); + + if ((eq && (*q & 1)) || lt) { + aSign = !aSign; + ++*q; + } + if (lt) { + sub128(bSig, 0, aSig0, aSig1, &aSig0, &aSig1); + } + } + } + } + + *r = normalizeRoundAndPackFloatx80(80, aSign, zExp, aSig0, aSig1, status); + return overflow; +} + +/*---------------------------------------------------------------------------- +| Computes the remainder of the extended double-precision floating-point value +| `a' with respect to the corresponding value `b'. Stores the remainder in +| `*r'. +| +| Returns one of the following: +| -1 The operands are invalid. Exceptions were raised and no remainder was +| computed. `*r' was set to a NaN value. +| 0 The remainder was computed successfully and completely. +| 1 Only a partial remainder was computed and stored in `*r'. Call this +| function again with `a' set to the returned partial remainder and the +| same `b' to continue calculating the remainder. +| +| The operation is performed according to the IEC/IEEE Standard for Binary +| Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +int floatx80_ieee754_remainder(floatx80 a, floatx80 b, floatx80 *r, uint64_t *q + STATUS_PARAM) +{ + return do_fprem(a, b, r, q, float_round_nearest_even STATUS_VAR); +} + +/*---------------------------------------------------------------------------- +| Computes the remainder of the extended double-precision floating-point value +| `a' with respect to the corresponding value `b'. Stores the remainder in +| `*r'. +| +| Returns one of the following: +| -1 The operands are invalid. Exceptions were raised and no remainder was +| computed. `*r' was set to a NaN value. +| 0 The remainder was computed successfully and completely. +| 1 Only a partial remainder was computed and stored in `*r'. Call this +| function again with `a' set to the returned partial remainder and the +| same `b' to continue calculating the remainder. +| +| Unlike previous function the function does not compute the remainder +| specified in the IEC/IEEE Standard for Binary Floating-Point Arithmetic. +| This function operates differently from the previous function in the way +| that it rounds the quotient of 'a' divided by 'b' to an integer. +*----------------------------------------------------------------------------*/ + +int floatx80_remainder(floatx80 a, floatx80 b, floatx80 *r, uint64_t *q + STATUS_PARAM) +{ + return do_fprem(a, b, r, q, float_round_to_zero STATUS_VAR); +} + float128 float128_scalbn( float128 a, int n STATUS_PARAM ) { flag aSign; diff --git a/fpu/softfloat.h b/fpu/softfloat.h index feec3a1..e6f5b87 100644 --- a/fpu/softfloat.h +++ b/fpu/softfloat.h @@ -497,10 +497,14 @@ int floatx80_lt_quiet( floatx80, floatx80 STATUS_PARAM ); int floatx80_unordered_quiet( floatx80, floatx80 STATUS_PARAM ); int floatx80_compare( floatx80, floatx80 STATUS_PARAM ); int floatx80_compare_quiet( floatx80, floatx80 STATUS_PARAM ); +int floatx80_is_unsupported(floatx80); int floatx80_is_quiet_nan( floatx80 ); int floatx80_is_signaling_nan( floatx80 ); floatx80 floatx80_maybe_silence_nan( floatx80 ); floatx80 floatx80_scalbn( floatx80, int STATUS_PARAM ); +int floatx80_ieee754_remainder(floatx80, floatx80, floatx80 *, uint64_t * + STATUS_PARAM); +int floatx80_remainder(floatx80, floatx80, floatx80 *, uint64_t * STATUS_PARAM); INLINE floatx80 floatx80_abs(floatx80 a) { diff --git a/target-i386/op_helper.c b/target-i386/op_helper.c index 2862ea4..53f46fd 100644 --- a/target-i386/op_helper.c +++ b/target-i386/op_helper.c @@ -3620,6 +3620,23 @@ static void fpu_set_exception(int mask) env->fpus |= FPUS_SE | FPUS_B; } +static int fpu_exception_from_fp_status(const float_status *fp_status) +{ + int mask; + + const int float_flag_denormals = float_flag_input_denormal | + float_flag_output_denormal; + + /* Most flags have the same value in the enum and 387 status word, except + * the denormal flags. */ + mask = fp_status->float_exception_flags & ~float_flag_denormals; + if (fp_status->float_exception_flags & float_flag_denormals) { + mask |= FPUS_DE; + } + + return mask; +} + static inline floatx80 helper_fdiv(floatx80 a, floatx80 b) { if (floatx80_is_zero(b)) { @@ -4208,121 +4225,72 @@ void helper_fxtract(void) } } +static void fprem_maybe_update_flags(int overflow, uint64_t quotient) +{ + if (overflow >= 0) { /* overflow < 0 means error */ + env->fpus &= ~0x4700; /* (C3,C2,C1,C0) <-- 0000 */ + if (overflow) { + env->fpus |= 1 << 10; + } else { /* (C0,C3,C1) <-- (q2,q1,q0) */ + if (quotient & 1) { + env->fpus |= 1 << 9; + } + if (quotient & 2) { + env->fpus |= 1 << 14; + } + if (quotient & 4) { + env->fpus |= 1 << 8; + } + } + } +} + void helper_fprem1(void) { - double st0, st1, dblq, fpsrcop, fptemp; - CPU_LDoubleU fpsrcop1, fptemp1; - int expdif; - signed long long int q; + floatx80 result; + uint64_t quotient; - st0 = floatx80_to_double(ST0); - st1 = floatx80_to_double(ST1); + /* Use a copy of the current fp_status so that we can detect new exceptions + * and pass them to fpu_set_exception. TODO(catalinp): Remove this once + * exceptions are reported directly into env->fp_status. */ + float_status local_status = env->fp_status; - if (isinf(st0) || isnan(st0) || isnan(st1) || (st1 == 0.0)) { - ST0 = double_to_floatx80(0.0 / 0.0); /* NaN */ - env->fpus &= (~0x4700); /* (C3,C2,C1,C0) <-- 0000 */ - return; - } - - fpsrcop = st0; - fptemp = st1; - fpsrcop1.d = ST0; - fptemp1.d = ST1; - expdif = EXPD(fpsrcop1) - EXPD(fptemp1); + /* TODO(catalinp): Bochs' FPU_pre_exception_handling resets a few more + * fields in fp_status before doing the operation. What is the purpose of + * that and is it necessary here? */ + local_status.float_exception_flags = 0; - if (expdif < 0) { - /* optimisation? taken from the AMD docs */ - env->fpus &= (~0x4700); /* (C3,C2,C1,C0) <-- 0000 */ - /* ST0 is unchanged */ - return; - } + int overflow = floatx80_ieee754_remainder(ST0, ST1, &result, "ient, + &local_status); - if (expdif < 53) { - dblq = fpsrcop / fptemp; - /* round dblq towards nearest integer */ - dblq = rint(dblq); - st0 = fpsrcop - fptemp * dblq; + /* TODO(catalinp): Bochs also checks for unmasked exceptions before storing + * these flags. Should we also do this? + */ + fprem_maybe_update_flags(overflow, quotient); - /* convert dblq to q by truncating towards zero */ - if (dblq < 0.0) - q = (signed long long int)(-dblq); - else - q = (signed long long int)dblq; + ST0 = result; - env->fpus &= (~0x4700); /* (C3,C2,C1,C0) <-- 0000 */ - /* (C0,C3,C1) <-- (q2,q1,q0) */ - env->fpus |= (q & 0x4) << (8 - 2); /* (C0) <-- q2 */ - env->fpus |= (q & 0x2) << (14 - 1); /* (C3) <-- q1 */ - env->fpus |= (q & 0x1) << (9 - 0); /* (C1) <-- q0 */ - } else { - env->fpus |= 0x400; /* C2 <-- 1 */ - fptemp = pow(2.0, expdif - 50); - fpsrcop = (st0 / st1) / fptemp; - /* fpsrcop = integer obtained by chopping */ - fpsrcop = (fpsrcop < 0.0) ? - -(floor(fabs(fpsrcop))) : floor(fpsrcop); - st0 -= (st1 * fpsrcop * fptemp); - } - ST0 = double_to_floatx80(st0); + fpu_set_exception(fpu_exception_from_fp_status(&local_status)); } void helper_fprem(void) { - double st0, st1, dblq, fpsrcop, fptemp; - CPU_LDoubleU fpsrcop1, fptemp1; - int expdif; - signed long long int q; + floatx80 result; + uint64_t quotient; - st0 = floatx80_to_double(ST0); - st1 = floatx80_to_double(ST1); + /* TODO(catalinp): See comments in helper_fprem1() about lots of potential + * semantic ambiguities/differences between this and Bochs' implementation. + */ + float_status local_status = env->fp_status; + local_status.float_exception_flags = 0; + int overflow = floatx80_remainder(ST0, ST1, &result, "ient, + &local_status); - if (isinf(st0) || isnan(st0) || isnan(st1) || (st1 == 0.0)) { - ST0 = double_to_floatx80(0.0 / 0.0); /* NaN */ - env->fpus &= (~0x4700); /* (C3,C2,C1,C0) <-- 0000 */ - return; - } - - fpsrcop = st0; - fptemp = st1; - fpsrcop1.d = ST0; - fptemp1.d = ST1; - expdif = EXPD(fpsrcop1) - EXPD(fptemp1); - - if (expdif < 0) { - /* optimisation? taken from the AMD docs */ - env->fpus &= (~0x4700); /* (C3,C2,C1,C0) <-- 0000 */ - /* ST0 is unchanged */ - return; - } - - if ( expdif < 53 ) { - dblq = fpsrcop/*ST0*/ / fptemp/*ST1*/; - /* round dblq towards zero */ - dblq = (dblq < 0.0) ? ceil(dblq) : floor(dblq); - st0 = fpsrcop/*ST0*/ - fptemp * dblq; + fprem_maybe_update_flags(overflow, quotient); - /* convert dblq to q by truncating towards zero */ - if (dblq < 0.0) - q = (signed long long int)(-dblq); - else - q = (signed long long int)dblq; + ST0 = result; - env->fpus &= (~0x4700); /* (C3,C2,C1,C0) <-- 0000 */ - /* (C0,C3,C1) <-- (q2,q1,q0) */ - env->fpus |= (q & 0x4) << (8 - 2); /* (C0) <-- q2 */ - env->fpus |= (q & 0x2) << (14 - 1); /* (C3) <-- q1 */ - env->fpus |= (q & 0x1) << (9 - 0); /* (C1) <-- q0 */ - } else { - int N = 32 + (expdif % 32); /* as per AMD docs */ - env->fpus |= 0x400; /* C2 <-- 1 */ - fptemp = pow(2.0, (double)(expdif - N)); - fpsrcop = (st0 / st1) / fptemp; - /* fpsrcop = integer obtained by chopping */ - fpsrcop = (fpsrcop < 0.0) ? - -(floor(fabs(fpsrcop))) : floor(fpsrcop); - st0 -= (st1 * fpsrcop * fptemp); - } - ST0 = double_to_floatx80(st0); + fpu_set_exception(fpu_exception_from_fp_status(&local_status)); } void helper_fyl2xp1(void) -- 1.7.7.3