All of lore.kernel.org
 help / color / mirror / Atom feed
* [Qemu-devel] [PATCH] softfloat: enforce softfloat if the host's FMA is broken
@ 2018-12-24 19:15 Emilio G. Cota
  2018-12-24 21:38 ` Peter Maydell
  0 siblings, 1 reply; 3+ messages in thread
From: Emilio G. Cota @ 2018-12-24 19:15 UTC (permalink / raw)
  To: qemu-devel; +Cc: Richard Henderson, Alex Bennée, laurent.desnogues

The added branch to the FMA ops is marked as unlikely and therefore
its impact on performance (measured with fp-bench) is within noise range
when measured on an Intel(R) Xeon(R) Gold 6142 CPU @ 2.60GHz.

In addition, when the host doesn't have a hardware FMA instruction
we force the use of softfloat, since whatever the libc does (e.g. checking
the host's FP flags) is unlikely to be faster than our softfloat
implementation. For instance, on an i386 machine with no hardware
support for FMA, we get:

  $ for precision in single double; do
        ./fp-bench -o mulAdd -p $precision
    done

- before:
5.07 MFlops
1.85 MFlops

- after:
12.65 MFlops
10.05 MFlops

Reported-by: Laurent Desnogues <laurent.desnogues@gmail.com>
Suggested-by: Richard Henderson <richard.henderson@linaro.org>
Signed-off-by: Emilio G. Cota <cota@braap.org>
---
 include/qemu/cpuid.h |  6 ++++
 fpu/softfloat.c      | 85 ++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 91 insertions(+)

diff --git a/include/qemu/cpuid.h b/include/qemu/cpuid.h
index 69301700bd..320926ffe0 100644
--- a/include/qemu/cpuid.h
+++ b/include/qemu/cpuid.h
@@ -25,6 +25,9 @@
 #endif
 
 /* Leaf 1, %ecx */
+#ifndef bit_FMA3
+#define bit_FMA3        (1 << 12)
+#endif
 #ifndef bit_SSE4_1
 #define bit_SSE4_1      (1 << 19)
 #endif
@@ -53,5 +56,8 @@
 #ifndef bit_LZCNT
 #define bit_LZCNT       (1 << 5)
 #endif
+#ifndef bit_FMA4
+#define bit_FMA4        (1 << 16)
+#endif
 
 #endif /* QEMU_CPUID_H */
diff --git a/fpu/softfloat.c b/fpu/softfloat.c
index 59eac97d10..ccaed85b0f 100644
--- a/fpu/softfloat.c
+++ b/fpu/softfloat.c
@@ -1542,6 +1542,8 @@ soft_f64_muladd(float64 a, float64 b, float64 c, int flags,
     return float64_round_pack_canonical(pr, status);
 }
 
+static bool force_soft_fma;
+
 float32 QEMU_FLATTEN
 float32_muladd(float32 xa, float32 xb, float32 xc, int flags, float_status *s)
 {
@@ -1562,6 +1564,11 @@ float32_muladd(float32 xa, float32 xb, float32 xc, int flags, float_status *s)
     if (unlikely(!f32_is_zon3(ua, ub, uc))) {
         goto soft;
     }
+
+    if (unlikely(force_soft_fma)) {
+        goto soft;
+    }
+
     /*
      * When (a || b) == 0, there's no need to check for under/over flow,
      * since we know the addend is (normal || 0) and the product is 0.
@@ -1623,6 +1630,11 @@ float64_muladd(float64 xa, float64 xb, float64 xc, int flags, float_status *s)
     if (unlikely(!f64_is_zon3(ua, ub, uc))) {
         goto soft;
     }
+
+    if (unlikely(force_soft_fma)) {
+        goto soft;
+    }
+
     /*
      * When (a || b) == 0, there's no need to check for under/over flow,
      * since we know the addend is (normal || 0) and the product is 0.
@@ -7974,3 +7986,76 @@ float128 float128_scalbn(float128 a, int n, float_status *status)
                                          , status);
 
 }
+
+#ifdef CONFIG_CPUID_H
+#include "qemu/cpuid.h"
+#endif
+
+static void check_host_hw_fma(void)
+{
+#ifdef CONFIG_CPUID_H
+    int max = __get_cpuid_max(0, NULL);
+    int a, b, c, d;
+    bool has_fma3 = false;
+    bool has_fma4 = false;
+    bool has_avx = false;
+
+    if (max >= 1) {
+        __cpuid(1, a, b, c, d);
+
+        /* check whether avx is usable */
+        if (c & bit_OSXSAVE) {
+            int bv;
+
+            __asm("xgetbv" : "=a"(bv), "=d"(d) : "c"(0));
+            if ((bv & 6) == 6) {
+                has_avx = c & bit_AVX;
+            }
+        }
+
+        if (has_avx) {
+            /* fma3 */
+            has_fma3 = c & bit_FMA3;
+
+            /* fma4 */
+            __cpuid(0x80000000, a, b, c, d);
+            if (a >= 0x80000001) {
+                __cpuid(0x80000001, a, b, c, d);
+
+                has_fma4 = c & bit_FMA4;
+            }
+        }
+    }
+    /*
+     * Without HW FMA, whatever the libc does is probably slower than our
+     * softfloat implementation.
+     */
+    if (!has_fma3 && !has_fma4) {
+        force_soft_fma = true;
+    }
+#endif
+}
+
+static void __attribute__((constructor)) softfloat_init(void)
+{
+    union_float64 ua, ub, uc, ur;
+
+    if (QEMU_NO_HARDFLOAT) {
+        return;
+    }
+
+    /*
+     * Test that the host's FMA is not obviously broken. For example,
+     * glibc < 2.23 can perform an incorrect FMA on certain hosts; see
+     *   https://sourceware.org/bugzilla/show_bug.cgi?id=13304
+     */
+    ua.s = 0x0020000000000001ULL;
+    ub.s = 0x3ca0000000000000ULL;
+    uc.s = 0x0020000000000000ULL;
+    ur.h = fma(ua.h, ub.h, uc.h);
+    if (ur.s != 0x0020000000000001ULL) {
+        force_soft_fma = true;
+    }
+
+    check_host_hw_fma();
+}
-- 
2.17.1

^ permalink raw reply related	[flat|nested] 3+ messages in thread

* Re: [Qemu-devel] [PATCH] softfloat: enforce softfloat if the host's FMA is broken
  2018-12-24 19:15 [Qemu-devel] [PATCH] softfloat: enforce softfloat if the host's FMA is broken Emilio G. Cota
@ 2018-12-24 21:38 ` Peter Maydell
  2018-12-25  6:55   ` Emilio G. Cota
  0 siblings, 1 reply; 3+ messages in thread
From: Peter Maydell @ 2018-12-24 21:38 UTC (permalink / raw)
  To: Emilio G. Cota
  Cc: QEMU Developers, Laurent Desnogues, Alex Bennée, Richard Henderson

On Mon, 24 Dec 2018 at 19:16, Emilio G. Cota <cota@braap.org> wrote:
> The added branch to the FMA ops is marked as unlikely and therefore
> its impact on performance (measured with fp-bench) is within noise range
> when measured on an Intel(R) Xeon(R) Gold 6142 CPU @ 2.60GHz.

> +        /* check whether avx is usable */
> +        if (c & bit_OSXSAVE) {
> +            int bv;
> +
> +            __asm("xgetbv" : "=a"(bv), "=d"(d) : "c"(0));
> +            if ((bv & 6) == 6) {
> +                has_avx = c & bit_AVX;
> +            }
> +        }

I note that the code in tcg/i386/tcg-target.inc.c
that does something similar has the note:
            /* The xgetbv instruction is not available to older versions of
             * the assembler, so we encode the instruction manually.
             */
 -- does that concern apply here?

This will also be the fourth piece of code that has an
inline xgetbv asm insn to try to query a property of the
CPU (the other 3 are target/i386/hvf/x86_cpuid.c, util/bufferiszero.c
and tcg/i386/tcg-target.inc.c), which strongly suggests that we
should abstract this out a bit better. That also might help
avoid bugs which exist in one version but not the other:
see for instance https://bugs.launchpad.net/bugs/1758819

thanks
-- PMM

^ permalink raw reply	[flat|nested] 3+ messages in thread

* Re: [Qemu-devel] [PATCH] softfloat: enforce softfloat if the host's FMA is broken
  2018-12-24 21:38 ` Peter Maydell
@ 2018-12-25  6:55   ` Emilio G. Cota
  0 siblings, 0 replies; 3+ messages in thread
From: Emilio G. Cota @ 2018-12-25  6:55 UTC (permalink / raw)
  To: Peter Maydell
  Cc: QEMU Developers, Laurent Desnogues, Alex Bennée, Richard Henderson

On Mon, Dec 24, 2018 at 21:38:52 +0000, Peter Maydell wrote:
> On Mon, 24 Dec 2018 at 19:16, Emilio G. Cota <cota@braap.org> wrote:
> > The added branch to the FMA ops is marked as unlikely and therefore
> > its impact on performance (measured with fp-bench) is within noise range
> > when measured on an Intel(R) Xeon(R) Gold 6142 CPU @ 2.60GHz.
> 
> > +        /* check whether avx is usable */
> > +        if (c & bit_OSXSAVE) {
> > +            int bv;
> > +
> > +            __asm("xgetbv" : "=a"(bv), "=d"(d) : "c"(0));
> > +            if ((bv & 6) == 6) {
> > +                has_avx = c & bit_AVX;
> > +            }
> > +        }
> 
> I note that the code in tcg/i386/tcg-target.inc.c
> that does something similar has the note:
>             /* The xgetbv instruction is not available to older versions of
>              * the assembler, so we encode the instruction manually.
>              */
>  -- does that concern apply here?

Yes.

> This will also be the fourth piece of code that has an
> inline xgetbv asm insn to try to query a property of the
> CPU (the other 3 are target/i386/hvf/x86_cpuid.c, util/bufferiszero.c
> and tcg/i386/tcg-target.inc.c), which strongly suggests that we
> should abstract this out a bit better. That also might help
> avoid bugs which exist in one version but not the other:
> see for instance https://bugs.launchpad.net/bugs/1758819

Agreed.

I'll be away until mid-January starting tomorrow, so I'll
send a v2 with the cpuid bits removed, which aren't needed
to fix the bug that Laurent hit.

We can later on address cpuid consolidation.

Thanks,

		Emilio

^ permalink raw reply	[flat|nested] 3+ messages in thread

end of thread, other threads:[~2018-12-25  6:55 UTC | newest]

Thread overview: 3+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2018-12-24 19:15 [Qemu-devel] [PATCH] softfloat: enforce softfloat if the host's FMA is broken Emilio G. Cota
2018-12-24 21:38 ` Peter Maydell
2018-12-25  6:55   ` Emilio G. Cota

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.