All of lore.kernel.org
 help / color / mirror / Atom feed
From: Taylor Simpson <tsimpson@quicinc.com>
To: qemu-devel@nongnu.org
Cc: ale@rev.ng, riku.voipio@iki.fi, richard.henderson@linaro.org,
	laurent@vivier.eu, tsimpson@quicinc.com, philmd@redhat.com,
	aleksandar.m.mail@gmail.com
Subject: [RFC PATCH v4 15/29] Hexagon (target/hexagon) utility functions
Date: Mon, 28 Sep 2020 12:28:44 -0500	[thread overview]
Message-ID: <1601314138-9930-16-git-send-email-tsimpson@quicinc.com> (raw)
In-Reply-To: <1601314138-9930-1-git-send-email-tsimpson@quicinc.com>

Utility functions called by various instructions

Signed-off-by: Taylor Simpson <tsimpson@quicinc.com>
---
 target/hexagon/arch.h     |  42 +++
 target/hexagon/conv_emu.h |  50 +++
 target/hexagon/fma_emu.h  |  27 ++
 target/hexagon/arch.c     | 354 +++++++++++++++++++++
 target/hexagon/conv_emu.c | 369 ++++++++++++++++++++++
 target/hexagon/fma_emu.c  | 777 ++++++++++++++++++++++++++++++++++++++++++++++
 6 files changed, 1619 insertions(+)
 create mode 100644 target/hexagon/arch.h
 create mode 100644 target/hexagon/conv_emu.h
 create mode 100644 target/hexagon/fma_emu.h
 create mode 100644 target/hexagon/arch.c
 create mode 100644 target/hexagon/conv_emu.c
 create mode 100644 target/hexagon/fma_emu.c

diff --git a/target/hexagon/arch.h b/target/hexagon/arch.h
new file mode 100644
index 0000000..82644ac
--- /dev/null
+++ b/target/hexagon/arch.h
@@ -0,0 +1,42 @@
+/*
+ *  Copyright(c) 2019-2020 Qualcomm Innovation Center, Inc. All Rights Reserved.
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef HEXAGON_ARCH_H
+#define HEXAGON_ARCH_H
+
+#include "cpu.h"
+#include "hex_arch_types.h"
+
+extern uint64_t interleave(uint32_t odd, uint32_t even);
+extern uint64_t deinterleave(uint64_t src);
+extern uint32_t carry_from_add64(uint64_t a, uint64_t b, uint32_t c);
+extern int32_t conv_round(int32_t a, int n);
+extern size16s_t cast8s_to_16s(int64_t a);
+extern int64_t cast16s_to_8s(size16s_t a);
+extern size16s_t add128(size16s_t a, size16s_t b);
+extern size16s_t sub128(size16s_t a, size16s_t b);
+extern size16s_t shiftr128(size16s_t a, uint32_t n);
+extern size16s_t shiftl128(size16s_t a, uint32_t n);
+extern size16s_t and128(size16s_t a, size16s_t b);
+extern void arch_fpop_start(CPUHexagonState *env);
+extern void arch_fpop_end(CPUHexagonState *env);
+extern void arch_raise_fpflag(unsigned int flags);
+extern int arch_sf_recip_common(int32_t *Rs, int32_t *Rt, int32_t *Rd,
+                                int *adjust);
+extern int arch_sf_invsqrt_common(int32_t *Rs, int32_t *Rd, int *adjust);
+
+#endif
diff --git a/target/hexagon/conv_emu.h b/target/hexagon/conv_emu.h
new file mode 100644
index 0000000..e713a12
--- /dev/null
+++ b/target/hexagon/conv_emu.h
@@ -0,0 +1,50 @@
+/*
+ *  Copyright(c) 2019-2020 Qualcomm Innovation Center, Inc. All Rights Reserved.
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef HEXAGON_CONV_EMU_H
+#define HEXAGON_CONV_EMU_H
+
+#include "hex_arch_types.h"
+
+extern uint64_t conv_sf_to_8u(float in);
+extern uint32_t conv_sf_to_4u(float in);
+extern int64_t conv_sf_to_8s(float in);
+extern int32_t conv_sf_to_4s(float in);
+
+extern uint64_t conv_df_to_8u(double in);
+extern uint32_t conv_df_to_4u(double in);
+extern int64_t conv_df_to_8s(double in);
+extern int32_t conv_df_to_4s(double in);
+
+extern double conv_8u_to_df(uint64_t in);
+extern double conv_4u_to_df(uint32_t in);
+extern double conv_8s_to_df(int64_t in);
+extern double conv_4s_to_df(int32_t in);
+
+extern float conv_8u_to_sf(uint64_t in);
+extern float conv_4u_to_sf(uint32_t in);
+extern float conv_8s_to_sf(int64_t in);
+extern float conv_4s_to_sf(int32_t in);
+
+extern float conv_df_to_sf(double in);
+
+/* Widen float to double; every float is exactly representable as a double. */
+static inline double conv_sf_to_df(float in)
+{
+    return in;
+}
+
+#endif
diff --git a/target/hexagon/fma_emu.h b/target/hexagon/fma_emu.h
new file mode 100644
index 0000000..181b1f6
--- /dev/null
+++ b/target/hexagon/fma_emu.h
@@ -0,0 +1,27 @@
+/*
+ *  Copyright(c) 2019-2020 Qualcomm Innovation Center, Inc. All Rights Reserved.
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef HEXAGON_FMA_EMU_H
+#define HEXAGON_FMA_EMU_H
+
+extern float internal_fmafx(float a_in, float b_in, float c_in, int scale);
+extern float internal_fmaf(float a_in, float b_in, float c_in);
+extern float internal_mpyf(float a_in, float b_in);
+extern double internal_mpyhh(double a_in, double b_in,
+                             unsigned long long int accumulated);
+
+#endif
diff --git a/target/hexagon/arch.c b/target/hexagon/arch.c
new file mode 100644
index 0000000..a9f864d
--- /dev/null
+++ b/target/hexagon/arch.c
@@ -0,0 +1,354 @@
+/*
+ *  Copyright(c) 2019-2020 Qualcomm Innovation Center, Inc. All Rights Reserved.
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "qemu/osdep.h"
+#include <math.h>
+#include "fma_emu.h"
+#include "arch.h"
+#include "macros.h"
+
+/* Masks for the successive bit-spreading stages below */
+#define BITS_MASK_8 0x5555555555555555ULL
+#define PAIR_MASK_8 0x3333333333333333ULL
+#define NYBL_MASK_8 0x0f0f0f0f0f0f0f0fULL
+#define BYTE_MASK_8 0x00ff00ff00ff00ffULL
+#define HALF_MASK_8 0x0000ffff0000ffffULL
+#define WORD_MASK_8 0x00000000ffffffffULL
+
+/*
+ * Bit interleave (Morton encode): bit i of "even" becomes bit 2i of the
+ * result and bit i of "odd" becomes bit 2i+1.
+ */
+uint64_t interleave(uint32_t odd, uint32_t even)
+{
+    /* Convert to long long */
+    uint64_t myodd = odd;
+    uint64_t myeven = even;
+    /* First, spread bits out: each stage doubles the gap between bits */
+    myodd = (myodd | (myodd << 16)) & HALF_MASK_8;
+    myeven = (myeven | (myeven << 16)) & HALF_MASK_8;
+    myodd = (myodd | (myodd << 8)) & BYTE_MASK_8;
+    myeven = (myeven | (myeven << 8)) & BYTE_MASK_8;
+    myodd = (myodd | (myodd << 4)) & NYBL_MASK_8;
+    myeven = (myeven | (myeven << 4)) & NYBL_MASK_8;
+    myodd = (myodd | (myodd << 2)) & PAIR_MASK_8;
+    myeven = (myeven | (myeven << 2)) & PAIR_MASK_8;
+    myodd = (myodd | (myodd << 1)) & BITS_MASK_8;
+    myeven = (myeven | (myeven << 1)) & BITS_MASK_8;
+    /* Now OR together, odd bits shifted into the odd positions */
+    return myeven | (myodd << 1);
+}
+
+/*
+ * Inverse of interleave (Morton decode): the even-position bits of src
+ * are compacted into the low word of the result, the odd-position bits
+ * into the high word.
+ */
+uint64_t deinterleave(uint64_t src)
+{
+    /* Get odd and even bits */
+    uint64_t myodd = ((src >> 1) & BITS_MASK_8);
+    uint64_t myeven = (src & BITS_MASK_8);
+
+    /* Unspread bits: each stage halves the gap between bits */
+    myeven = (myeven | (myeven >> 1)) & PAIR_MASK_8;
+    myodd = (myodd | (myodd >> 1)) & PAIR_MASK_8;
+    myeven = (myeven | (myeven >> 2)) & NYBL_MASK_8;
+    myodd = (myodd | (myodd >> 2)) & NYBL_MASK_8;
+    myeven = (myeven | (myeven >> 4)) & BYTE_MASK_8;
+    myodd = (myodd | (myodd >> 4)) & BYTE_MASK_8;
+    myeven = (myeven | (myeven >> 8)) & HALF_MASK_8;
+    myodd = (myodd | (myodd >> 8)) & HALF_MASK_8;
+    myeven = (myeven | (myeven >> 16)) & WORD_MASK_8;
+    myodd = (myodd | (myodd >> 16)) & WORD_MASK_8;
+
+    /* Return odd bits in upper half */
+    return myeven | (myodd << 32);
+}
+
+/*
+ * Carry-out (0 or 1) of the 64-bit addition a + b + c, computed as two
+ * 32-bit half-adds so the carry never escapes the uint64_t temporaries.
+ * fGETUWORD(i, x) extracts 32-bit word i of x (project macro; word 1 is
+ * presumably the upper half -- confirm against macros.h).
+ */
+uint32_t carry_from_add64(uint64_t a, uint64_t b, uint32_t c)
+{
+    uint64_t tmpa, tmpb, tmpc;
+    tmpa = fGETUWORD(0, a);
+    tmpb = fGETUWORD(0, b);
+    tmpc = tmpa + tmpb + c;
+    /* Carry of the low half is in word 1 of tmpc; fold it into the high half */
+    tmpa = fGETUWORD(1, a);
+    tmpb = fGETUWORD(1, b);
+    tmpc = tmpa + tmpb + fGETUWORD(1, tmpc);
+    tmpc = fGETUWORD(1, tmpc);
+    return tmpc;
+}
+
+/*
+ * Convergent rounding: compute round(a / 2^n) with ties rounded to the
+ * nearest even value.  If the discarded fraction is exactly one half
+ * (all bits below n-1 are zero), half of the integer part's LSB is
+ * added so odd quotients round up and even quotients round down.
+ *
+ * Fix: the tie case used (1 << n), which is undefined behavior for
+ * n == 31 (signed left-shift overflow); use an unsigned shift instead.
+ */
+int32_t conv_round(int32_t a, int n)
+{
+    int64_t val;
+
+    if (n == 0) {
+        val = a;
+    } else if ((a & ((1 << (n - 1)) - 1)) == 0) {    /* N-1..0 all zero? */
+        /* Fraction is 0 or exactly .5: add LSB from int part (round to even) */
+        val = ((fSE32_64(a)) + (int64_t) (((uint32_t) ((1u << n) & a)) >> 1));
+    } else {
+        /* Ordinary case: add .5 and truncate */
+        val = ((fSE32_64(a)) + (1 << (n - 1)));
+    }
+
+    val = val >> n;
+    return (int32_t)val;
+}
+
+/* Sign-extend a 64-bit value into a 128-bit (hi:lo) pair. */
+size16s_t cast8s_to_16s(int64_t a)
+{
+    size16s_t result = {.hi = 0, .lo = 0};
+    result.lo = a;
+    if (a < 0) {
+        /* Negative: fill the upper half with the sign */
+        result.hi = -1;
+    }
+    return result;
+}
+
+/* Truncate a 128-bit value to its low 64 bits. */
+int64_t cast16s_to_8s(size16s_t a)
+{
+    return a.lo;
+}
+
+/*
+ * 128-bit addition of (hi:lo) pairs.  A carry out of the low half is
+ * detected by unsigned wrap-around: lo(a) + lo(b) < lo(b) iff the sum
+ * wrapped (assumes .lo is an unsigned 64-bit field -- see
+ * hex_arch_types.h).
+ */
+size16s_t add128(size16s_t a, size16s_t b)
+{
+    size16s_t result = {.hi = 0, .lo = 0};
+    result.lo = a.lo + b.lo;
+    result.hi = a.hi + b.hi;
+
+    if (result.lo < b.lo) {
+        /* Low half wrapped: propagate the carry */
+        result.hi++;
+    }
+
+    return result;
+}
+
+/*
+ * 128-bit subtraction.  A borrow out of the low half is detected by the
+ * difference being larger than the minuend (unsigned wrap).
+ */
+size16s_t sub128(size16s_t a, size16s_t b)
+{
+    size16s_t result = {.hi = 0, .lo = 0};
+    result.lo = a.lo - b.lo;
+    result.hi = a.hi - b.hi;
+    if (result.lo > a.lo) {
+        /* Low half wrapped: propagate the borrow */
+        result.hi--;
+    }
+
+    return result;
+}
+
+/*
+ * Arithmetic right shift of a 128-bit (hi:lo) pair by n bits.
+ *
+ * Fix: the previous single-expression form shifted by (64 - n), which
+ * is undefined behavior in C when n == 0, and by n when n >= 64.
+ * Handle n == 0, 64..127, and >= 128 explicitly; shifts of 64 or more
+ * sign-fill from the high word.
+ */
+size16s_t shiftr128(size16s_t a, uint32_t n)
+{
+    size16s_t result;
+    if (n == 0) {
+        result.lo = a.lo;
+        result.hi = a.hi;
+    } else if (n > 127) {
+        /* Everything shifted out: result is all sign bits */
+        result.lo = a.hi >> 63;
+        result.hi = a.hi >> 63;
+    } else if (n > 63) {
+        result.lo = a.hi >> (n - 64);
+        result.hi = a.hi >> 63;
+    } else {
+        result.lo = (a.lo >> n) | (a.hi << (64 - n));
+        result.hi = a.hi >> n;
+    }
+    return result;
+}
+
+/*
+ * Left shift of a 128-bit (hi:lo) pair by n bits.
+ *
+ * Fix: the previous single-expression form shifted by (64 - n), which
+ * is undefined behavior in C when n == 0, and by n when n >= 64.
+ * Handle n == 0, 64..127, and >= 128 explicitly.
+ */
+size16s_t shiftl128(size16s_t a, uint32_t n)
+{
+    size16s_t result;
+    if (n == 0) {
+        result.lo = a.lo;
+        result.hi = a.hi;
+    } else if (n > 127) {
+        /* Everything shifted out */
+        result.lo = 0;
+        result.hi = 0;
+    } else if (n > 63) {
+        result.lo = 0;
+        result.hi = a.lo << (n - 64);
+    } else {
+        result.lo = a.lo << n;
+        result.hi = (a.hi << n) | (a.lo >> (64 - n));
+    }
+    return result;
+}
+
+/* Bitwise AND of two 128-bit (hi:lo) pairs. */
+size16s_t and128(size16s_t a, size16s_t b)
+{
+    size16s_t result;
+    result.lo = a.lo & b.lo;
+    result.hi = a.hi & b.hi;
+    return result;
+}
+
+/* Floating Point Stuff */
+
+/*
+ * Table index is the 2-bit USR.FPRND field (see arch_fpop_start), so
+ * the order here defines the architectural encoding of rounding modes.
+ */
+static const int roundingmodes[] = {
+    FE_TONEAREST,
+    FE_TOWARDZERO,
+    FE_DOWNWARD,
+    FE_UPWARD
+};
+
+/*
+ * Bracket for a guest FP operation: save the host FP environment into
+ * env->fenv, clear the accrued exception flags, and install the guest's
+ * rounding mode from the USR.FPRND field.  Must be paired with
+ * arch_fpop_end.
+ */
+void arch_fpop_start(CPUHexagonState *env)
+{
+    fegetenv(&env->fenv);
+    feclearexcept(FE_ALL_EXCEPT);
+    fesetround(roundingmodes[fREAD_REG_FIELD(USR, USR_FPRND)]);
+}
+
+#define NOTHING             /* Don't do anything */
+
+/*
+ * If host exception LIBCF is pending, set the sticky USR flag MYF
+ * (only if not already set).  The MYE enable-bit check is a placeholder
+ * for trap delivery, which is not implemented here.
+ */
+#define TEST_FLAG(LIBCF, MYF, MYE) \
+    do { \
+        if (fetestexcept(LIBCF)) { \
+            if (GET_USR_FIELD(USR_##MYF) == 0) { \
+                SET_USR_FIELD(USR_##MYF, 1); \
+                if (GET_USR_FIELD(USR_##MYE)) { \
+                    NOTHING \
+                } \
+            } \
+        } \
+    } while (0)
+
+/*
+ * Close the bracket opened by arch_fpop_start: propagate any host FP
+ * exceptions raised by the operation into the guest USR flags, then
+ * restore the saved host FP environment.
+ */
+void arch_fpop_end(CPUHexagonState *env)
+{
+    if (fetestexcept(FE_ALL_EXCEPT)) {
+        TEST_FLAG(FE_INEXACT, FPINPF, FPINPE);
+        TEST_FLAG(FE_DIVBYZERO, FPDBZF, FPDBZE);
+        TEST_FLAG(FE_INVALID, FPINVF, FPINVE);
+        TEST_FLAG(FE_OVERFLOW, FPOVFF, FPOVFE);
+        TEST_FLAG(FE_UNDERFLOW, FPUNFF, FPUNFE);
+    }
+    fesetenv(&env->fenv);
+}
+
+#undef TEST_FLAG
+
+
+/* Raise the given host FP exception flags (between fpop start/end). */
+void arch_raise_fpflag(unsigned int flags)
+{
+    feraiseexcept(flags);
+}
+
+/*
+ * Common fixup step for the single-precision reciprocal instructions.
+ *
+ * Special cases (NaN, Inf, zero numerator/denominator) are resolved
+ * here: the fixed-up numerator, denominator, and result are written
+ * back through Rs, Rt, and Rd, and 0 is returned.  For the ordinary
+ * case the operands are pre-scaled by powers of two so the subsequent
+ * iteration cannot overflow/underflow; the scaling code is returned in
+ * *adjust (PeV) and 1 is returned to tell the caller to proceed.
+ *
+ * Bit 22 of a NaN's mantissa distinguishes quiet from signaling
+ * (signaling raises FE_INVALID).
+ */
+int arch_sf_recip_common(int32_t *Rs, int32_t *Rt, int32_t *Rd, int *adjust)
+{
+    int n_class;
+    int d_class;
+    int n_exp;
+    int d_exp;
+    int ret = 0;
+    int32_t RsV, RtV, RdV;
+    int PeV = 0;
+    RsV = *Rs;
+    RtV = *Rt;
+    n_class = fpclassify(fFLOAT(RsV));
+    d_class = fpclassify(fFLOAT(RtV));
+    if ((n_class == FP_NAN) && (d_class == FP_NAN)) {
+        if (fGETBIT(22, RsV & RtV) == 0) {
+            fRAISEFLAGS(FE_INVALID);
+        }
+        RdV = RsV = RtV = fSFNANVAL();
+    } else if (n_class == FP_NAN) {
+        if (fGETBIT(22, RsV) == 0) {
+            fRAISEFLAGS(FE_INVALID);
+        }
+        RdV = RsV = RtV = fSFNANVAL();
+    } else if (d_class == FP_NAN) {
+        /* or put NaN in num/den fixup? */
+        if (fGETBIT(22, RtV) == 0) {
+            fRAISEFLAGS(FE_INVALID);
+        }
+        RdV = RsV = RtV = fSFNANVAL();
+    } else if ((n_class == FP_INFINITE) && (d_class == FP_INFINITE)) {
+        /* Inf/Inf is invalid; or put Inf in num fixup? */
+        RdV = RsV = RtV = fSFNANVAL();
+        fRAISEFLAGS(FE_INVALID);
+    } else if ((n_class == FP_ZERO) && (d_class == FP_ZERO)) {
+        /* 0/0 is invalid; or put zero in num fixup? */
+        RdV = RsV = RtV = fSFNANVAL();
+        fRAISEFLAGS(FE_INVALID);
+    } else if (d_class == FP_ZERO) {
+        /* x/0: signed infinity; or put Inf in num fixup? */
+        RsV = fSFINFVAL(RsV ^ RtV);
+        RtV = fSFONEVAL(0);
+        RdV = fSFONEVAL(0);
+        if (n_class != FP_INFINITE) {
+            fRAISEFLAGS(FE_DIVBYZERO);
+        }
+    } else if (d_class == FP_INFINITE) {
+        /* x/Inf: signed zero */
+        RsV = 0x80000000 & (RsV ^ RtV);
+        RtV = fSFONEVAL(0);
+        RdV = fSFONEVAL(0);
+    } else if (n_class == FP_ZERO) {
+        /* Does this just work itself out? */
+        /* No, 0/Inf causes problems. */
+        RsV = 0x80000000 & (RsV ^ RtV);
+        RtV = fSFONEVAL(0);
+        RdV = fSFONEVAL(0);
+    } else if (n_class == FP_INFINITE) {
+        /* Does this just work itself out? */
+        RsV = fSFINFVAL(RsV ^ RtV);
+        RtV = fSFONEVAL(0);
+        RdV = fSFONEVAL(0);
+    } else {
+        PeV = 0x00;
+        /* Basic checks passed; pre-scale to keep the quotient in range */
+        n_exp = fSF_GETEXP(RsV);
+        d_exp = fSF_GETEXP(RtV);
+        if ((n_exp - d_exp + fSF_BIAS()) <= fSF_MANTBITS()) {
+            /* Near quotient underflow / inexact Q */
+            PeV = 0x80;
+            RtV = fSF_MUL_POW2(RtV, -64);
+            RsV = fSF_MUL_POW2(RsV, 64);
+        } else if ((n_exp - d_exp + fSF_BIAS()) > (fSF_MAXEXP() - 24)) {
+            /* Near quotient overflow */
+            PeV = 0x40;
+            RtV = fSF_MUL_POW2(RtV, 32);
+            RsV = fSF_MUL_POW2(RsV, -32);
+        } else if (n_exp <= fSF_MANTBITS() + 2) {
+            RtV = fSF_MUL_POW2(RtV, 64);
+            RsV = fSF_MUL_POW2(RsV, 64);
+        } else if (d_exp <= 1) {
+            RtV = fSF_MUL_POW2(RtV, 32);
+            RsV = fSF_MUL_POW2(RsV, 32);
+        } else if (d_exp > 252) {
+            RtV = fSF_MUL_POW2(RtV, -32);
+            RsV = fSF_MUL_POW2(RsV, -32);
+        }
+        RdV = 0;
+        ret = 1;
+    }
+    *Rs = RsV;
+    *Rt = RtV;
+    *Rd = RdV;
+    *adjust = PeV;
+    return ret;
+}
+
+/*
+ * Common fixup step for the single-precision inverse-square-root
+ * instructions.  Mirrors arch_sf_recip_common: special cases (NaN,
+ * negative, Inf, zero) are resolved here and 0 is returned; otherwise
+ * the operand is pre-scaled, the scale code is stored in *adjust, and
+ * 1 is returned so the caller performs the actual computation.
+ */
+int arch_sf_invsqrt_common(int32_t *Rs, int32_t *Rd, int *adjust)
+{
+    int r_class;
+    int32_t RsV, RdV;
+    int PeV = 0;
+    int r_exp;
+    int ret = 0;
+    RsV = *Rs;
+    r_class = fpclassify(fFLOAT(RsV));
+    if (r_class == FP_NAN) {
+        /* Signaling NaN (bit 22 clear) raises invalid */
+        if (fGETBIT(22, RsV) == 0) {
+            fRAISEFLAGS(FE_INVALID);
+        }
+        RdV = RsV = fSFNANVAL();
+    } else if (fFLOAT(RsV) < 0.0) {
+        /* Negative nonzero values are NaN */
+        fRAISEFLAGS(FE_INVALID);
+        RsV = fSFNANVAL();
+        RdV = fSFNANVAL();
+    } else if (r_class == FP_INFINITE) {
+        /* or put Inf in num fixup? */
+        RsV = fSFINFVAL(-1);
+        RdV = fSFINFVAL(-1);
+    } else if (r_class == FP_ZERO) {
+        /* or put zero in num fixup? */
+        RdV = fSFONEVAL(0);
+    } else {
+        PeV = 0x00;
+        /* Basic checks passed; scale tiny inputs up by 2^64 */
+        r_exp = fSF_GETEXP(RsV);
+        if (r_exp <= 24) {
+            RsV = fSF_MUL_POW2(RsV, 64);
+            PeV = 0xe0;
+        }
+        RdV = 0;
+        ret = 1;
+    }
+    *Rs = RsV;
+    *Rd = RdV;
+    *adjust = PeV;
+    return ret;
+}
+
diff --git a/target/hexagon/conv_emu.c b/target/hexagon/conv_emu.c
new file mode 100644
index 0000000..22fc647
--- /dev/null
+++ b/target/hexagon/conv_emu.c
@@ -0,0 +1,369 @@
+/*
+ *  Copyright(c) 2019-2020 Qualcomm Innovation Center, Inc. All Rights Reserved.
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <math.h>
+#include "qemu/osdep.h"
+#include "hex_arch_types.h"
+#include "macros.h"
+#include "conv_emu.h"
+
+/* True when X is a (positive or negative) floating-point zero */
+#define isz(X) (fpclassify(X) == FP_ZERO)
+#define DF_BIAS 1023
+#define SF_BIAS 127
+
+/* Largest positive values of int64_t / int32_t, as unsigned constants */
+#define LL_MAX_POS 0x7fffffffffffffffULL
+#define MAX_POS 0x7fffffffU
+
+#ifdef VCPP
+/*
+ * Visual C isn't GNU C and doesn't have __builtin_clzll
+ */
+
+/* Count leading zeros of a 64-bit value by binary search; 64 for input 0 */
+static int __builtin_clzll(unsigned long long int input)
+{
+    int total = 0;
+    if (input == 0) {
+        return 64;
+    }
+    total += ((input >> (total + 32)) != 0) ? 32 : 0;
+    total += ((input >> (total + 16)) != 0) ? 16 : 0;
+    total += ((input >> (total +  8)) != 0) ?  8 : 0;
+    total += ((input >> (total +  4)) != 0) ?  4 : 0;
+    total += ((input >> (total +  2)) != 0) ?  2 : 0;
+    total += ((input >> (total +  1)) != 0) ?  1 : 0;
+    return 63 - total;
+}
+#endif
+
+/*
+ * IEEE-754 double viewed as raw bits or as mant/exp/sign fields.
+ * NOTE(review): bit-field order is implementation-defined; this layout
+ * assumes a little-endian host ABI -- confirm for all supported hosts.
+ */
+typedef union {
+    double f;
+    uint64_t i;
+    struct {
+        uint64_t mant:52;
+        uint64_t exp:11;
+        uint64_t sign:1;
+    } x;
+} df_t;
+
+
+/* IEEE-754 single viewed as raw bits or as mant/exp/sign fields */
+typedef union {
+    float f;
+    uint32_t i;
+    struct {
+        uint32_t mant:23;
+        uint32_t exp:8;
+        uint32_t sign:1;
+    } x;
+} sf_t;
+
+
+/*
+ * Generate conv_8u_to_<df|sf>_n(in, negate): convert a 64-bit unsigned
+ * integer to double/float with correct rounding in the current
+ * fegetround() mode.  The "negate" flag tells the rounding logic the
+ * caller will negate the result (so FE_DOWNWARD/FE_UPWARD swap roles)
+ * and is also written to the sign bit.  The default case implements
+ * round-to-nearest-even.  Raises FE_INEXACT when bits are discarded.
+ */
+#define MAKE_CONV_8U_TO_XF_N(FLOATID, BIGFLOATID, RETTYPE) \
+static RETTYPE conv_8u_to_##FLOATID##_n(uint64_t in, int negate) \
+{ \
+    FLOATID##_t x; \
+    uint64_t tmp, truncbits, shamt; \
+    int leading_zeros; \
+    if (in == 0) { \
+        return 0.0; \
+    } \
+    leading_zeros = __builtin_clzll(in); \
+    tmp = in << (leading_zeros); \
+    tmp <<= 1; \
+    shamt = 64 - f##BIGFLOATID##_MANTBITS(); \
+    truncbits = tmp & ((1ULL << (shamt)) - 1); \
+    tmp >>= shamt; \
+    if (truncbits != 0) { \
+        feraiseexcept(FE_INEXACT); \
+        switch (fegetround()) { \
+        case FE_TOWARDZERO: \
+            break; \
+        case FE_DOWNWARD: \
+            if (negate) { \
+                tmp += 1; \
+            } \
+            break; \
+        case FE_UPWARD: \
+            if (!negate) { \
+                tmp += 1; \
+            } \
+            break; \
+        default: \
+            if ((truncbits & ((1ULL << (shamt - 1)) - 1)) == 0) { \
+                tmp += (tmp & 1); \
+            } else { \
+                tmp += ((truncbits >> (shamt - 1)) & 1); \
+            } \
+            break; \
+        } \
+    } \
+    if (((tmp << shamt) >> shamt) != tmp) { \
+        leading_zeros--; \
+    } \
+    x.x.mant = tmp; \
+    x.x.exp = BIGFLOATID##_BIAS + f##BIGFLOATID##_MANTBITS() - \
+              leading_zeros + shamt - 1; \
+    x.x.sign = negate; \
+    return x.f; \
+}
+
+MAKE_CONV_8U_TO_XF_N(df, DF, double)
+MAKE_CONV_8U_TO_XF_N(sf, SF, float)
+
+/* uint64 -> double, rounded per the current rounding mode */
+double conv_8u_to_df(uint64_t in)
+{
+    return conv_8u_to_df_n(in, 0);
+}
+
+/* int64 -> double; negative inputs go through the unsigned path negated */
+double conv_8s_to_df(int64_t in)
+{
+    /* INT64_MIN cannot be negated; -2^63 is exactly representable */
+    if (in == 0x8000000000000000) {
+        return -0x1p63;
+    }
+    if (in < 0) {
+        return conv_8u_to_df_n(-in, 1);
+    } else {
+        return conv_8u_to_df_n(in, 0);
+    }
+}
+
+/* uint32 -> double (always exact) */
+double conv_4u_to_df(uint32_t in)
+{
+    return conv_8u_to_df((uint64_t) in);
+}
+
+/* int32 -> double (always exact) */
+double conv_4s_to_df(int32_t in)
+{
+    return conv_8s_to_df(in);
+}
+
+/* uint64 -> float, rounded per the current rounding mode */
+float conv_8u_to_sf(uint64_t in)
+{
+    return conv_8u_to_sf_n(in, 0);
+}
+
+/* int64 -> float; negative inputs go through the unsigned path negated */
+float conv_8s_to_sf(int64_t in)
+{
+    /* INT64_MIN cannot be negated; -2^63 is exactly representable */
+    if (in == 0x8000000000000000) {
+        return -0x1p63;
+    }
+    if (in < 0) {
+        return conv_8u_to_sf_n(-in, 1);
+    } else {
+        return conv_8u_to_sf_n(in, 0);
+    }
+}
+
+/* uint32 -> float, rounded per the current rounding mode */
+float conv_4u_to_sf(uint32_t in)
+{
+    return conv_8u_to_sf(in);
+}
+
+/* int32 -> float, rounded per the current rounding mode */
+float conv_4s_to_sf(int32_t in)
+{
+    return conv_8s_to_sf(in);
+}
+
+
+/*
+ * double -> uint64 with Hexagon semantics: out-of-range, NaN, and
+ * negative inputs raise FE_INVALID and saturate (to ~0 or 0) rather
+ * than being undefined.  "will_negate" tells the rounding logic the
+ * caller will negate the result, so FE_DOWNWARD/FE_UPWARD swap roles.
+ * Raises FE_INEXACT whenever fraction bits are discarded.
+ */
+static uint64_t conv_df_to_8u_n(double in, int will_negate)
+{
+    df_t x;
+    int fracshift, endshift;
+    uint64_t tmp, truncbits;
+    x.f = in;
+    if (isinf(in)) {
+        feraiseexcept(FE_INVALID);
+        if (in > 0.0) {
+            return ~0ULL;
+        } else {
+            return 0ULL;
+        }
+    }
+    if (isnan(in)) {
+        feraiseexcept(FE_INVALID);
+        return ~0ULL;
+    }
+    if (isz(in)) {
+        return 0;
+    }
+    if (x.x.sign) {
+        feraiseexcept(FE_INVALID);
+        return 0;
+    }
+    if (in < 0.5) {
+        /* Near zero, captures large fracshifts, denorms, etc */
+        feraiseexcept(FE_INEXACT);
+        switch (fegetround()) {
+        case FE_DOWNWARD:
+            if (will_negate) {
+                return 1;
+            } else {
+                return 0;
+            }
+        case FE_UPWARD:
+            if (!will_negate) {
+                return 1;
+            } else {
+                return 0;
+            }
+        default:
+            return 0;    /* nearest or towards zero */
+        }
+    }
+    if ((x.x.exp - DF_BIAS) >= 64) {
+        /* way too big */
+        feraiseexcept(FE_INVALID);
+        return ~0ULL;
+    }
+    /* fracshift: bits below the binary point; endshift: bits above bit 52 */
+    fracshift = fMAX(0, (fDF_MANTBITS() - (x.x.exp - DF_BIAS)));
+    endshift = fMAX(0, ((x.x.exp - DF_BIAS - fDF_MANTBITS())));
+    /* Make the implicit integer bit explicit */
+    tmp = x.x.mant | (1ULL << fDF_MANTBITS());
+    truncbits = tmp & ((1ULL << fracshift) - 1);
+    tmp >>= fracshift;
+    if (truncbits) {
+        /* Apply Rounding */
+        feraiseexcept(FE_INEXACT);
+        switch (fegetround()) {
+        case FE_TOWARDZERO:
+            break;
+        case FE_DOWNWARD:
+            if (will_negate) {
+                tmp += 1;
+            }
+            break;
+        case FE_UPWARD:
+            if (!will_negate) {
+                tmp += 1;
+            }
+            break;
+        default:
+            /* Round to nearest, ties to even */
+            if ((truncbits & ((1ULL << (fracshift - 1)) - 1)) == 0) {
+                /* Exactly .5 */
+                tmp += (tmp & 1);
+            } else {
+                tmp += ((truncbits >> (fracshift - 1)) & 1);
+            }
+        }
+    }
+    /*
+     * If we added one and it carried all the way out,
+     * check to see if overflow
+     */
+    if ((tmp & ((1ULL << (fDF_MANTBITS() + 1)) - 1)) == 0) {
+        if ((x.x.exp - DF_BIAS) == 63) {
+            feclearexcept(FE_INEXACT);
+            feraiseexcept(FE_INVALID);
+            return ~0ULL;
+        }
+    }
+    tmp <<= endshift;
+    return tmp;
+}
+
+/*
+ * double -> uint32 via the 64-bit conversion; results that don't fit
+ * in 32 bits raise FE_INVALID (replacing any FE_INEXACT) and saturate.
+ */
+static uint32_t conv_df_to_4u_n(double in, int will_negate)
+{
+    uint64_t tmp;
+    tmp = conv_df_to_8u_n(in, will_negate);
+    if (tmp > 0x00000000ffffffffULL) {
+        feclearexcept(FE_INEXACT);
+        feraiseexcept(FE_INVALID);
+        return ~0U;
+    }
+    return (uint32_t)tmp;
+}
+
+/* double -> uint64, saturating */
+uint64_t conv_df_to_8u(double in)
+{
+    return conv_df_to_8u_n(in, 0);
+}
+
+/* double -> uint32, saturating */
+uint32_t conv_df_to_4u(double in)
+{
+    return conv_df_to_4u_n(in, 0);
+}
+
+/*
+ * double -> int64, saturating.  The magnitude is converted through the
+ * unsigned path; the bound is LL_MAX_POS for positive inputs and
+ * LL_MAX_POS + 1 (== |INT64_MIN|) for negative ones.  NaN raises
+ * FE_INVALID and returns -1.
+ */
+int64_t conv_df_to_8s(double in)
+{
+    uint64_t tmp;
+    df_t x;
+    x.f = in;
+    if (isnan(in)) {
+        feraiseexcept(FE_INVALID);
+        return -1;
+    }
+    if (x.x.sign) {
+        tmp = conv_df_to_8u_n(-in, 1);
+    } else {
+        tmp = conv_df_to_8u_n(in, 0);
+    }
+    /* sign is 0 or 1, so this allows one extra unit of negative range */
+    if (tmp > (LL_MAX_POS + x.x.sign)) {
+        feclearexcept(FE_INEXACT);
+        feraiseexcept(FE_INVALID);
+        tmp = (LL_MAX_POS + x.x.sign);
+    }
+    if (x.x.sign) {
+        return -tmp;
+    } else {
+        return tmp;
+    }
+}
+
+/*
+ * double -> int32, saturating; same scheme as conv_df_to_8s with the
+ * 32-bit bound MAX_POS (+1 for negative inputs).
+ */
+int32_t conv_df_to_4s(double in)
+{
+    uint64_t tmp;
+    df_t x;
+    x.f = in;
+    if (isnan(in)) {
+        feraiseexcept(FE_INVALID);
+        return -1;
+    }
+    if (x.x.sign) {
+        tmp = conv_df_to_8u_n(-in, 1);
+    } else {
+        tmp = conv_df_to_8u_n(in, 0);
+    }
+    if (tmp > (MAX_POS + x.x.sign)) {
+        feclearexcept(FE_INEXACT);
+        feraiseexcept(FE_INVALID);
+        tmp = (MAX_POS + x.x.sign);
+    }
+    if (x.x.sign) {
+        return -tmp;
+    } else {
+        return tmp;
+    }
+}
+
+/* float -> integer conversions: widen to double (exact) and reuse the
+ * double paths, which handle saturation and exception flags. */
+uint64_t conv_sf_to_8u(float in)
+{
+    return conv_df_to_8u(in);
+}
+
+uint32_t conv_sf_to_4u(float in)
+{
+    return conv_df_to_4u(in);
+}
+
+int64_t conv_sf_to_8s(float in)
+{
+    return conv_df_to_8s(in);
+}
+
+int32_t conv_sf_to_4s(float in)
+{
+    return conv_df_to_4s(in);
+}
+
diff --git a/target/hexagon/fma_emu.c b/target/hexagon/fma_emu.c
new file mode 100644
index 0000000..6a4d4e1
--- /dev/null
+++ b/target/hexagon/fma_emu.c
@@ -0,0 +1,777 @@
+/*
+ *  Copyright(c) 2019-2020 Qualcomm Innovation Center, Inc. All Rights Reserved.
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <math.h>
+#include "qemu/osdep.h"
+#include "macros.h"
+#include "conv_emu.h"
+#include "fma_emu.h"
+
+/* IEEE-754 exponent-field constants for double/single/half precision */
+#define DF_INF_EXP 0x7ff
+#define DF_BIAS 1023
+
+#define SF_INF_EXP 0xff
+#define SF_BIAS 127
+
+#define HF_INF_EXP 0x1f
+#define HF_BIAS 15
+
+/* Sentinel exponent far outside any representable range */
+#define WAY_BIG_EXP 4096
+
+/* True when X is a (positive or negative) floating-point zero */
+#define isz(X) (fpclassify(X) == FP_ZERO)
+
+/*
+ * IEEE-754 double viewed as raw bits or as mant/exp/sign fields.
+ * NOTE(review): bit-field order is implementation-defined; this layout
+ * assumes a little-endian host ABI -- confirm for all supported hosts.
+ */
+typedef union {
+    double f;
+    uint64_t i;
+    struct {
+        uint64_t mant:52;
+        uint64_t exp:11;
+        uint64_t sign:1;
+    } x;
+} df_t;
+
+/* IEEE-754 single viewed as raw bits or as mant/exp/sign fields */
+typedef union {
+    float f;
+    uint32_t i;
+    struct {
+        uint32_t mant:23;
+        uint32_t exp:8;
+        uint32_t sign:1;
+    } x;
+} sf_t;
+
+/*
+ * 128-bit value as two 64-bit halves, with 32-bit word views used by
+ * int128_mul_6464.  NOTE(review): in the high union w3 is declared
+ * before w2, the reverse of the w0/w1 order in the low union -- only
+ * w0/w1 are used below, but confirm the intended naming.
+ */
+typedef struct {
+    union {
+        uint64_t low;
+        struct {
+            uint32_t w0;
+            uint32_t w1;
+        };
+    };
+    union {
+        uint64_t high;
+        struct {
+            uint32_t w3;
+            uint32_t w2;
+        };
+    };
+} int128_t;
+/*
+ * Extended-precision float accumulator: 128-bit mantissa plus explicit
+ * guard/round/sticky bits for rounding.
+ */
+typedef struct {
+    int128_t mant;
+    int32_t exp;
+    uint8_t sign;
+    uint8_t guard;
+    uint8_t round;
+    uint8_t sticky;
+} xf_t;
+
+/* Reset an extended-precision accumulator to +0 with no sticky state. */
+static inline void xf_init(xf_t *p)
+{
+    p->mant.low = 0;
+    p->mant.high = 0;
+    p->exp = 0;
+    p->sign = 0;
+    p->guard = 0;
+    p->round = 0;
+    p->sticky = 0;
+}
+
+/*
+ * Significand of a double with the implicit integer bit made explicit
+ * for normal numbers; 0 for zeros, the raw fraction for denormals, and
+ * all-ones (-1) for NaN/infinity.
+ *
+ * Fix: the FP_NORMAL return statement was mis-indented (flush with the
+ * case label instead of the case body).
+ */
+static inline uint64_t df_getmant(df_t a)
+{
+    int class = fpclassify(a.f);
+    switch (class) {
+    case FP_NORMAL:
+        return a.x.mant | 1ULL << 52;
+    case FP_ZERO:
+        return 0;
+    case FP_SUBNORMAL:
+        return a.x.mant;
+    default:
+        return -1;
+    };
+}
+
+/*
+ * Biased exponent of a double; denormals report exp + 1 so their
+ * significand (without implicit bit) aligns with normals.  -1 for
+ * zero/NaN/infinity.
+ */
+static inline int32_t df_getexp(df_t a)
+{
+    int class = fpclassify(a.f);
+    switch (class) {
+    case FP_NORMAL:
+        return a.x.exp;
+    case FP_SUBNORMAL:
+        return a.x.exp + 1;
+    default:
+        return -1;
+    };
+}
+
+/* Single-precision analogue of df_getmant (implicit bit at position 23). */
+static inline uint64_t sf_getmant(sf_t a)
+{
+    int class = fpclassify(a.f);
+    switch (class) {
+    case FP_NORMAL:
+        return a.x.mant | 1ULL << 23;
+    case FP_ZERO:
+        return 0;
+    case FP_SUBNORMAL:
+        return a.x.mant | 0ULL;
+    default:
+        return -1;
+    };
+}
+
+/* Single-precision analogue of df_getexp. */
+static inline int32_t sf_getexp(sf_t a)
+{
+    int class = fpclassify(a.f);
+    switch (class) {
+    case FP_NORMAL:
+        return a.x.exp;
+    case FP_SUBNORMAL:
+        return a.x.exp + 1;
+    default:
+        return -1;
+    };
+}
+
+/*
+ * Full 64x64 -> 128-bit unsigned multiply, built from four 32x32
+ * partial products: pp0 (low x low), pp1a/pp1b (cross terms), and
+ * pp2 (high x high).  Carries out of the partial-product sums are
+ * detected by unsigned wrap-around and folded into pp2.
+ */
+static inline int128_t int128_mul_6464(uint64_t ai, uint64_t bi)
+{
+    int128_t ret;
+    int128_t a, b;
+    uint64_t pp0, pp1a, pp1b, pp1s, pp2;
+
+    a.high = b.high = 0;
+    a.low = ai;
+    b.low = bi;
+    pp0 = (uint64_t)a.w0 * (uint64_t)b.w0;
+    pp1a = (uint64_t)a.w1 * (uint64_t)b.w0;
+    pp1b = (uint64_t)b.w1 * (uint64_t)a.w0;
+    pp2 = (uint64_t)a.w1 * (uint64_t)b.w1;
+
+    pp1s = pp1a + pp1b;
+    if ((pp1s < pp1a) || (pp1s < pp1b)) {
+        /* Cross-term sum wrapped: carry lands 32 bits up in pp2 */
+        pp2 += (1ULL << 32);
+    }
+    ret.low = pp0 + (pp1s << 32);
+    if ((ret.low < pp0) || (ret.low < (pp1s << 32))) {
+        pp2 += 1;
+    }
+    /* High word: high product plus the upper half of the cross terms */
+    ret.high = pp2 + (pp1s >> 32);
+
+    return ret;
+}
+
+/*
+ * 128-bit left shift.
+ *
+ * Fixes: (1) the overflow guard was "amt > 128", so amt == 128 fell
+ * through to a shift by 64 (undefined behavior); (2) when amt was
+ * exactly 64, the word-move left amt == 0 and the cross shift
+ * "a.low >> (64 - amt)" became a shift by 64 (also UB).
+ */
+static inline int128_t int128_shl(int128_t a, uint32_t amt)
+{
+    int128_t ret;
+    if (amt == 0) {
+        return a;
+    }
+    if (amt >= 128) {
+        ret.high = 0;
+        ret.low = 0;
+        return ret;
+    }
+    if (amt >= 64) {
+        amt -= 64;
+        a.high = a.low;
+        a.low = 0;
+        if (amt == 0) {
+            return a;
+        }
+    }
+    ret.high = a.high << amt;
+    ret.high |= (a.low >> (64 - amt));
+    ret.low = a.low << amt;
+    return ret;
+}
+
+/*
+ * 128-bit logical right shift.
+ *
+ * Fixes: (1) the overflow guard was "amt > 128", so amt == 128 fell
+ * through to a shift by 64 (undefined behavior); (2) when amt was
+ * exactly 64, the word-move left amt == 0 and the cross shift
+ * "a.high << (64 - amt)" became a shift by 64 (also UB).
+ */
+static inline int128_t int128_shr(int128_t a, uint32_t amt)
+{
+    int128_t ret;
+    if (amt == 0) {
+        return a;
+    }
+    if (amt >= 128) {
+        ret.high = 0;
+        ret.low = 0;
+        return ret;
+    }
+    if (amt >= 64) {
+        amt -= 64;
+        a.low = a.high;
+        a.high = 0;
+        if (amt == 0) {
+            return a;
+        }
+    }
+    ret.low = a.low >> amt;
+    ret.low |= (a.high << (64 - amt));
+    ret.high = a.high >> amt;
+    return ret;
+}
+
+/*
+ * 128-bit addition; a carry out of the low half is detected by
+ * unsigned wrap-around and folded into the high half (via a.high,
+ * which is a local copy).
+ */
+static inline int128_t int128_add(int128_t a, int128_t b)
+{
+    int128_t ret;
+    ret.low = a.low + b.low;
+    if ((ret.low < a.low) || (ret.low < b.low)) {
+        /* carry into high part */
+        a.high += 1;
+    }
+    ret.high = a.high + b.high;
+    return ret;
+}
+
+/*
+ * 128-bit subtraction a - b, minus one more when "borrow" is nonzero
+ * (implemented by recursing once to subtract the constant 1).
+ */
+static inline int128_t int128_sub(int128_t a, int128_t b, int borrow)
+{
+    int128_t ret;
+    ret.low = a.low - b.low;
+    if (ret.low > a.low) {
+        /* borrow into high part */
+        a.high -= 1;
+    }
+    ret.high = a.high - b.high;
+    if (borrow == 0) {
+        return ret;
+    } else {
+        /* Reuse a as the constant 1 and subtract it from the result */
+        a.high = 0;
+        a.low = 1;
+        return int128_sub(ret, a, 0);
+    }
+}
+
+/* Unsigned 128-bit compare: nonzero iff a > b. */
+static inline int int128_gt(int128_t a, int128_t b)
+{
+    if (a.high == b.high) {
+        return a.low > b.low;
+    }
+    return a.high > b.high;
+}
+
+/*
+ * Shift the accumulator's mantissa left one bit (decrementing the
+ * exponent to keep the value unchanged), pulling the guard bit into
+ * the mantissa and promoting round -> guard and sticky -> round.
+ */
+static inline xf_t xf_norm_left(xf_t a)
+{
+    a.exp--;
+    a.mant = int128_shl(a.mant, 1);
+    a.mant.low |= a.guard;
+    a.guard = a.round;
+    a.round = a.sticky;
+    return a;
+}
+
+/*
+ * Shift the accumulator's mantissa right "amt" bits (incrementing the
+ * exponent to match), demoting bits through guard and round and OR-ing
+ * everything shifted past them into sticky.  Shifts larger than the
+ * whole significand (amt > 130, i.e. 128 mantissa + guard + round bits)
+ * collapse entirely into sticky.
+ */
+static inline xf_t xf_norm_right(xf_t a, int amt)
+{
+    if (amt > 130) {
+        a.sticky |=
+            a.round | a.guard | (a.mant.low != 0) | (a.mant.high != 0);
+        a.guard = a.round = a.mant.high = a.mant.low = 0;
+        a.exp += amt;
+        return a;
+
+    }
+    while (amt >= 64) {
+        /* Drop a whole word: its top two bits become guard/round */
+        a.sticky |= a.round | a.guard | (a.mant.low != 0);
+        a.guard = (a.mant.low >> 63) & 1;
+        a.round = (a.mant.low >> 62) & 1;
+        a.mant.low = a.mant.high;
+        a.mant.high = 0;
+        a.exp += 64;
+        amt -= 64;
+    }
+    while (amt > 0) {
+        /* Single-bit shift: mant LSB -> guard -> round -> sticky */
+        a.exp++;
+        a.sticky |= a.round;
+        a.round = a.guard;
+        a.guard = a.mant.low & 1;
+        a.mant = int128_shr(a.mant, 1);
+        amt--;
+    }
+    return a;
+}
+
+
+/*
+ * On the add/sub paths, aligning exponents can shift out many mantissa
+ * bits; a sticky bit records whether anything nonzero was discarded so
+ * that the final rounding still accounts for it.
+ */
+static xf_t xf_add(xf_t a, xf_t b);
+
+/*
+ * Subtract same-signed xf_t values: a - b, with the result negated when
+ * 'negate' is set (negate tracks operand swaps through the recursion).
+ * Opposite-signed operands are turned into an addition.
+ */
+static inline xf_t xf_sub(xf_t a, xf_t b, int negate)
+{
+    xf_t ret;
+    xf_init(&ret);
+    int borrow;
+
+    if (a.sign != b.sign) {
+        /* a - (-b) == a + b */
+        b.sign = !b.sign;
+        return xf_add(a, b);
+    }
+    if (b.exp > a.exp) {
+        /* small - big == - (big - small) */
+        return xf_sub(b, a, !negate);
+    }
+    if ((b.exp == a.exp) && (int128_gt(b.mant, a.mant))) {
+        /* small - big == - (big - small) */
+        return xf_sub(b, a, !negate);
+    }
+
+    while (a.exp > b.exp) {
+        /* Try to normalize exponents: shrink a exponent and grow mantissa */
+        if (a.mant.high & (1ULL << 62)) {
+            /* Can't grow a any more */
+            break;
+        } else {
+            a = xf_norm_left(a);
+        }
+    }
+
+    while (a.exp > b.exp) {
+        /* Try to normalize exponents: grow b exponent and shrink mantissa */
+        /* Keep around shifted out bits... we might need those later */
+        b = xf_norm_right(b, a.exp - b.exp);
+    }
+
+    if ((int128_gt(b.mant, a.mant))) {
+        /* Alignment can still leave b larger in magnitude: swap again */
+        return xf_sub(b, a, !negate);
+    }
+
+    /* OK, now things should be normalized! */
+    ret.sign = a.sign;
+    ret.exp = a.exp;
+    assert(!int128_gt(b.mant, a.mant));
+    /* b's guard/round/sticky sit below the mantissa and borrow into it */
+    borrow = (b.round << 2) | (b.guard << 1) | b.sticky;
+    ret.mant = int128_sub(a.mant, b.mant, (borrow != 0));
+    borrow = 0 - borrow;
+    ret.guard = (borrow >> 2) & 1;
+    ret.round = (borrow >> 1) & 1;
+    ret.sticky = (borrow >> 0) & 1;
+    if (negate) {
+        ret.sign = !ret.sign;
+    }
+    return ret;
+}
+
+/*
+ * Add two xf_t values.  Opposite-signed operands are routed to xf_sub;
+ * otherwise the operands are swapped as needed so the larger magnitude is
+ * in 'a', b is aligned to a's exponent (capturing shifted-out bits in its
+ * guard/round/sticky), and the mantissas are added.
+ *
+ * Also drops the stray empty statement (';' after the if block) that was
+ * in the original.
+ */
+static xf_t xf_add(xf_t a, xf_t b)
+{
+    xf_t ret;
+    xf_init(&ret);
+    if (a.sign != b.sign) {
+        /* a + (-b) == a - b */
+        b.sign = !b.sign;
+        return xf_sub(a, b, 0);
+    }
+    if (b.exp > a.exp) {
+        /* small + big ==  (big + small) */
+        return xf_add(b, a);
+    }
+    if ((b.exp == a.exp) && int128_gt(b.mant, a.mant)) {
+        /* small + big ==  (big + small) */
+        return xf_add(b, a);
+    }
+
+    while (a.exp > b.exp) {
+        /* Try to normalize exponents: shrink a exponent and grow mantissa */
+        if (a.mant.high & (1ULL << 62)) {
+            /* Can't grow a any more */
+            break;
+        } else {
+            a = xf_norm_left(a);
+        }
+    }
+
+    while (a.exp > b.exp) {
+        /* Try to normalize exponents: grow b exponent and shrink mantissa */
+        /* Keep around shifted out bits... we might need those later */
+        b = xf_norm_right(b, a.exp - b.exp);
+    }
+
+    /* OK, now things should be normalized! */
+    if (int128_gt(b.mant, a.mant)) {
+        return xf_add(b, a);
+    }
+    ret.sign = a.sign;
+    ret.exp = a.exp;
+    assert(!int128_gt(b.mant, a.mant));
+    ret.mant = int128_add(a.mant, b.mant);
+    ret.guard = b.guard;
+    ret.round = b.round;
+    ret.sticky = b.sticky;
+    return ret;
+}
+
+/* Build a double infinity that carries the sign of a */
+static inline df_t infinite_df_t(xf_t a)
+{
+    df_t inf;
+    inf.x.mant = 0ULL;
+    inf.x.exp = DF_INF_EXP;
+    inf.x.sign = a.sign;
+    return inf;
+}
+
+/* Build the largest finite double, carrying the sign of a */
+static inline df_t maxfinite_df_t(xf_t a)
+{
+    df_t max;
+    max.x.mant = 0x000fffffffffffffULL;
+    max.x.exp = DF_INF_EXP - 1;
+    max.x.sign = a.sign;
+    return max;
+}
+
+/* View a host double through the df_t union */
+static inline df_t f2df_t(double in)
+{
+    df_t ret = { .f = in };
+    return ret;
+}
+
+/* Build a single-precision infinity that carries the sign of a */
+static inline sf_t infinite_sf_t(xf_t a)
+{
+    sf_t inf;
+    inf.x.mant = 0ULL;
+    inf.x.exp = SF_INF_EXP;
+    inf.x.sign = a.sign;
+    return inf;
+}
+
+/* Build the largest finite float, carrying the sign of a */
+static inline sf_t maxfinite_sf_t(xf_t a)
+{
+    sf_t max;
+    max.x.mant = 0x007fffffUL;
+    max.x.exp = SF_INF_EXP - 1;
+    max.x.sign = a.sign;
+    return max;
+}
+
+/* View a host float through the sf_t union */
+static inline sf_t f2sf_t(float in)
+{
+    sf_t ret = { .f = in };
+    return ret;
+}
+
+/*
+ * Round an xf_t to the target format (TYPE with MANTBITS mantissa bits and
+ * biased-infinity exponent INF_EXP), honoring the host rounding mode from
+ * fegetround() and raising INEXACT/UNDERFLOW/OVERFLOW as needed.
+ *
+ * Changes vs. the original: the leftover debug printf before the final
+ * assert is removed, and comment typos/labels are corrected.
+ */
+#define GEN_XF_ROUND(TYPE, MANTBITS, INF_EXP) \
+static inline TYPE xf_round_##TYPE(xf_t a) \
+{ \
+    TYPE ret; \
+    ret.i = 0; \
+    ret.x.sign = a.sign; \
+    if ((a.mant.high == 0) && (a.mant.low == 0) \
+        && ((a.guard | a.round | a.sticky) == 0)) { \
+        /* result zero */ \
+        switch (fegetround()) { \
+        case FE_DOWNWARD: \
+            return f2##TYPE(-0.0); \
+        default: \
+            return f2##TYPE(0.0); \
+        } \
+    } \
+    /* Normalize right */ \
+    /* We want MANTBITS bits of mantissa plus the leading one, i.e. \
+     * MANTBITS + 1 significant bits.  So shift right while the high word \
+     * is non-zero or any bit above bit MANTBITS of the low word is set. */ \
+    while ((a.mant.high != 0) || ((a.mant.low >> (MANTBITS + 1)) != 0)) { \
+        a = xf_norm_right(a, 1); \
+    } \
+    /* \
+     * OK, now normalize left \
+     * We want to normalize left until we have a leading one in bit MANTBITS \
+     * Theoretically, we only need to shift a maximum of one to the left if we \
+     * shifted out lots of bits from B, or if we had no shift / 1 shift sticky \
+     * should be 0  \
+     */ \
+    while ((a.mant.low & (1ULL << MANTBITS)) == 0) { \
+        a = xf_norm_left(a); \
+    } \
+    /* \
+     * OK, now we might need to denormalize because of potential underflow. \
+     * We need to do this before rounding, and rounding might make us normal \
+     * again \
+     */ \
+    while (a.exp <= 0) { \
+        a = xf_norm_right(a, 1 - a.exp); \
+        /* \
+         * Do we have underflow? \
+         * That's when we get an inexact answer because we ran out of bits \
+         * in a denormal. \
+         */ \
+        if (a.guard || a.round || a.sticky) { \
+            feraiseexcept(FE_UNDERFLOW); \
+        } \
+    } \
+    /* OK, we're relatively canonical... now we need to round */ \
+    if (a.guard || a.round || a.sticky) { \
+        feraiseexcept(FE_INEXACT); \
+        switch (fegetround()) { \
+        case FE_TOWARDZERO: \
+            /* Chop and we're done */ \
+            break; \
+        case FE_UPWARD: \
+            if (a.sign == 0) { \
+                a.mant.low += 1; \
+            } \
+            break; \
+        case FE_DOWNWARD: \
+            if (a.sign != 0) { \
+                a.mant.low += 1; \
+            } \
+            break; \
+        default: \
+            if (a.round || a.sticky) { \
+                /* round up if guard is 1, down if guard is zero */ \
+                a.mant.low += a.guard; \
+            } else if (a.guard) { \
+                /* exactly .5, round up if odd */ \
+                a.mant.low += (a.mant.low & 1); \
+            } \
+            break; \
+        } \
+    } \
+    /* \
+     * OK, now we might have carried all the way up. \
+     * So we might need to shr once \
+     * at least we know that the lsb should be zero if we rounded and \
+     * got a carry out... \
+     */ \
+    if ((a.mant.low >> (MANTBITS + 1)) != 0) { \
+        a = xf_norm_right(a, 1); \
+    } \
+    /* Overflow? */ \
+    if (a.exp >= INF_EXP) { \
+        /* Yep, inf result */ \
+        feraiseexcept(FE_OVERFLOW); \
+        feraiseexcept(FE_INEXACT); \
+        switch (fegetround()) { \
+        case FE_TOWARDZERO: \
+            return maxfinite_##TYPE(a); \
+        case FE_UPWARD: \
+            if (a.sign == 0) { \
+                return infinite_##TYPE(a); \
+            } else { \
+                return maxfinite_##TYPE(a); \
+            } \
+        case FE_DOWNWARD: \
+            if (a.sign != 0) { \
+                return infinite_##TYPE(a); \
+            } else { \
+                return maxfinite_##TYPE(a); \
+            } \
+        default: \
+            return infinite_##TYPE(a); \
+        } \
+    } \
+    /* Still normal (leading one present)?  Then we're done. */ \
+    if (a.mant.low & (1ULL << MANTBITS)) { \
+        ret.x.exp = a.exp; \
+        ret.x.mant = a.mant.low; \
+        return ret; \
+    } \
+    /* Denormal result: the biased exponent must have been forced to 1 */ \
+    assert(a.exp == 1); \
+    ret.x.exp = 0; \
+    ret.x.mant = a.mant.low; \
+    return ret; \
+}
+
+GEN_XF_ROUND(df_t, fDF_MANTBITS(), DF_INF_EXP)
+GEN_XF_ROUND(sf_t, fSF_MANTBITS(), SF_INF_EXP)
+
+/*
+ * Handle the IEEE special cases for fused multiply-add a * b + c when at
+ * least one operand is an infinity or NaN: returns the mandated result and
+ * raises FE_INVALID where required.  Must not be called with all-finite
+ * operands (the trailing g_assert_not_reached() enforces this).
+ */
+static inline double special_fma(df_t a, df_t b, df_t c)
+{
+    df_t ret;
+    ret.i = 0;
+
+    /*
+     * If A multiplied by B is an exact infinity and C is also an infinity
+     * but with the opposite sign, FMA returns NaN and raises invalid.
+     */
+    if (fISINFPROD(a.f, b.f) && isinf(c.f)) {
+        if ((a.x.sign ^ b.x.sign) != c.x.sign) {
+            ret.i = fDFNANVAL();
+            feraiseexcept(FE_INVALID);
+            return ret.f;
+        }
+    }
+    /* inf * 0 (either order) is invalid and yields NaN */
+    if ((isinf(a.f) && isz(b.f)) || (isz(a.f) && isinf(b.f))) {
+        ret.i = fDFNANVAL();
+        feraiseexcept(FE_INVALID);
+        return ret.f;
+    }
+    /*
+     * If none of the above checks are true and C is a NaN,
+     * a NaN shall be returned
+     * If A or B are NaN, a NAN shall be returned.
+     */
+    if (isnan(a.f) || isnan(b.f) || (isnan(c.f))) {
+        /* Mantissa bit 51 clear marks a signaling NaN: raise invalid */
+        if (isnan(a.f) && (fGETBIT(51, a.i) == 0)) {
+            feraiseexcept(FE_INVALID);
+        }
+        if (isnan(b.f) && (fGETBIT(51, b.i) == 0)) {
+            feraiseexcept(FE_INVALID);
+        }
+        if (isnan(c.f) && (fGETBIT(51, c.i) == 0)) {
+            feraiseexcept(FE_INVALID);
+        }
+        ret.i = fDFNANVAL();
+        return ret.f;
+    }
+    /*
+     * We have checked for adding opposite-signed infinities.
+     * Other infinities return infinity with the correct sign
+     */
+    if (isinf(c.f)) {
+        ret.x.exp = DF_INF_EXP;
+        ret.x.mant = 0;
+        ret.x.sign = c.x.sign;
+        return ret.f;
+    }
+    if (isinf(a.f) || isinf(b.f)) {
+        ret.x.exp = DF_INF_EXP;
+        ret.x.mant = 0;
+        ret.x.sign = (a.x.sign ^ b.x.sign);
+        return ret.f;
+    }
+    g_assert_not_reached();
+}
+
+/* Widen the single-precision operands to double and reuse special_fma */
+static inline float special_fmaf(sf_t a, sf_t b, sf_t c)
+{
+    df_t da, db, dc;
+    da.f = a.f;
+    db.f = b.f;
+    dc.f = c.f;
+    return special_fma(da, db, dc);
+}
+
+/*
+ * Compute round(a_in * b_in * 2**scale + c_in) with a single rounding.
+ * NaN/Inf operands are delegated to special_fmaf; everything else is done
+ * in the internal xf_t accumulator format and rounded at the end.
+ */
+float internal_fmafx(float a_in, float b_in, float c_in, int scale)
+{
+    sf_t a, b, c;
+    xf_t prod;
+    xf_t acc;
+    xf_t result;
+    xf_init(&prod);
+    xf_init(&acc);
+    xf_init(&result);
+    a.f = a_in;
+    b.f = b_in;
+    c.f = c_in;
+
+    if (isinf(a.f) || isinf(b.f) || isinf(c.f)) {
+        return special_fmaf(a, b, c);
+    }
+    if (isnan(a.f) || isnan(b.f) || isnan(c.f)) {
+        return special_fmaf(a, b, c);
+    }
+    if ((scale == 0) && (isz(a.f) || isz(b.f))) {
+        /* Zero product and no scaling: the host arithmetic is exact */
+        return a.f * b.f + c.f;
+    }
+
+    /* (a * 2**b) * (c * 2**d) == a*c * 2**(b+d) */
+    prod.mant = int128_mul_6464(sf_getmant(a), sf_getmant(b));
+
+    /*
+     * Note: extracting the mantissa into an int is multiplying by
+     * 2**23, so adjust here
+     */
+    prod.exp = sf_getexp(a) + sf_getexp(b) - SF_BIAS - 23;
+    prod.sign = a.x.sign ^ b.x.sign;
+    if (isz(a.f) || isz(b.f)) {
+        /* Force the zero product far below any addend */
+        prod.exp = -2 * WAY_BIG_EXP;
+    }
+    if ((scale > 0) && (fpclassify(c.f) == FP_SUBNORMAL)) {
+        /* Replace a subnormal addend with an inexact (sticky) zero */
+        acc.mant = int128_mul_6464(0, 0);
+        acc.exp = -WAY_BIG_EXP;
+        acc.sign = c.x.sign;
+        acc.sticky = 1;
+        result = xf_add(prod, acc);
+    } else if (!isz(c.f)) {
+        acc.mant = int128_mul_6464(sf_getmant(c), 1);
+        acc.exp = sf_getexp(c);
+        acc.sign = c.x.sign;
+        result = xf_add(prod, acc);
+    } else {
+        result = prod;
+    }
+    result.exp += scale;
+    return xf_round_sf_t(result).f;
+}
+
+
+/* Fused multiply-add a_in * b_in + c_in with no extra scaling */
+float internal_fmaf(float a_in, float b_in, float c_in)
+{
+    return internal_fmafx(a_in, b_in, c_in, 0);
+}
+
+/* Multiply via the fused path; a zero operand takes the exact host product */
+float internal_mpyf(float a_in, float b_in)
+{
+    if (isz(a_in) || isz(b_in)) {
+        /* Host multiply already produces the correctly-signed zero */
+        return a_in * b_in;
+    }
+    /* Otherwise multiply as an FMA with a zero addend and no scaling */
+    return internal_fmafx(a_in, b_in, 0.0, 0);
+}
+
+/* Zero/NaN/Inf operands: the plain host multiply gives the required result */
+static inline double internal_mpyhh_special(double a, double b)
+{
+    return a * b;
+}
+
+/*
+ * Multiply the high words of the two double mantissas and fold the product
+ * into 'accumulated' (whose low bit carries the sticky state from earlier
+ * partial products), then round the sum back to a double.
+ */
+double internal_mpyhh(double a_in, double b_in,
+                      unsigned long long int accumulated)
+{
+    df_t a, b;
+    xf_t x;
+    unsigned long long int prod;
+    unsigned int sticky;
+
+    a.f = a_in;
+    b.f = b_in;
+    /* The low bit of 'accumulated' is the incoming sticky bit */
+    sticky = accumulated & 1;
+    accumulated >>= 1;
+    xf_init(&x);
+    if (isz(a_in) || isnan(a_in) || isinf(a_in)) {
+        return internal_mpyhh_special(a_in, b_in);
+    }
+    if (isz(b_in) || isnan(b_in) || isinf(b_in)) {
+        return internal_mpyhh_special(a_in, b_in);
+    }
+    x.mant = int128_mul_6464(accumulated, 1);
+    x.sticky = sticky;
+    /* High-word times high-word partial product, shifted up 32 bits */
+    prod = fGETUWORD(1, df_getmant(a)) * fGETUWORD(1, df_getmant(b));
+    x.mant = int128_add(x.mant, int128_mul_6464(prod, 0x100000000ULL));
+    x.exp = df_getexp(a) + df_getexp(b) - DF_BIAS - 20;
+    if (!isnormal(a_in) || !isnormal(b_in)) {
+        /* crush to inexact zero */
+        x.sticky = 1;
+        x.exp = -4096;
+    }
+    x.sign = a.x.sign ^ b.x.sign;
+    return xf_round_df_t(x).f;
+}
+
+/*
+ * Convert a double to a float by rebuilding the value in the internal
+ * xf_t format and rounding it to single precision with xf_round_sf_t.
+ * Zero, NaN and infinity pass straight through the host conversion.
+ */
+float conv_df_to_sf(double in_f)
+{
+    xf_t x;
+    df_t in;
+    if (isz(in_f) || isnan(in_f) || isinf(in_f)) {
+        return in_f;
+    }
+    xf_init(&x);
+    in.f = in_f;
+    x.mant = int128_mul_6464(df_getmant(in), 1);
+    /* Rebias the exponent and account for the mantissa widths (52 vs 23) */
+    x.exp = df_getexp(in) - DF_BIAS + SF_BIAS - 52 + 23;
+    x.sign = in.x.sign;
+    return xf_round_sf_t(x).f;
+}
+
-- 
2.7.4


  parent reply	other threads:[~2020-09-28 17:41 UTC|newest]

Thread overview: 56+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2020-09-28 17:28 [RFC PATCH v4 00/29] Hexagon patch series Taylor Simpson
2020-09-28 17:28 ` [RFC PATCH v4 01/29] Hexagon Update MAINTAINERS file Taylor Simpson
2020-09-28 17:28 ` [RFC PATCH v4 02/29] Hexagon (target/hexagon) README Taylor Simpson
2020-09-28 17:28 ` [RFC PATCH v4 03/29] Hexagon (include/elf.h) ELF machine definition Taylor Simpson
2020-09-28 17:28 ` [RFC PATCH v4 04/29] Hexagon (target/hexagon) scalar core definition Taylor Simpson
2020-09-28 17:28 ` [RFC PATCH v4 05/29] Hexagon (disas) disassembler Taylor Simpson
2020-09-28 17:28 ` [RFC PATCH v4 06/29] Hexagon (target/hexagon) register names Taylor Simpson
2020-09-28 17:28 ` [RFC PATCH v4 07/29] Hexagon (target/hexagon) scalar core helpers Taylor Simpson
2020-09-28 17:28 ` [RFC PATCH v4 08/29] Hexagon (target/hexagon) GDB Stub Taylor Simpson
2020-09-28 17:28 ` [RFC PATCH v4 09/29] Hexagon (target/hexagon) architecture types Taylor Simpson
2020-09-28 17:28 ` [RFC PATCH v4 10/29] Hexagon (target/hexagon) instruction and packet types Taylor Simpson
2020-09-28 17:28 ` [RFC PATCH v4 11/29] Hexagon (target/hexagon) register fields Taylor Simpson
2020-09-28 17:28 ` [RFC PATCH v4 12/29] Hexagon (target/hexagon) instruction attributes Taylor Simpson
2020-09-28 17:28 ` [RFC PATCH v4 13/29] Hexagon (target/hexagon) instruction/packet decode Taylor Simpson
2020-09-28 17:28 ` [RFC PATCH v4 14/29] Hexagon (target/hexagon) instruction printing Taylor Simpson
2020-09-28 17:28 ` Taylor Simpson [this message]
2020-09-29 11:25   ` [RFC PATCH v4 15/29] Hexagon (target/hexagon) utility functions Philippe Mathieu-Daudé
2020-09-29 15:48     ` Taylor Simpson
2020-09-28 17:28 ` [RFC PATCH v4 16/29] Hexagon (target/hexagon/imported) arch import Taylor Simpson
2020-09-28 17:28 ` [RFC PATCH v4 17/29] Hexagon (target/hexagon) generator phase 1 - C preprocessor for semantics Taylor Simpson
2020-09-29 11:31   ` Philippe Mathieu-Daudé
2020-09-28 17:28 ` [RFC PATCH v4 18/29] Hexagon (target/hexagon) generator phase 2 - generate header files Taylor Simpson
2020-09-28 17:28 ` [RFC PATCH v4 19/29] Hexagon (target/hexagon) generator phase 3 - C preprocessor for decode tree Taylor Simpson
2020-09-29 11:37   ` Philippe Mathieu-Daudé
2020-10-01 19:54     ` Richard Henderson
2020-10-01 23:31       ` Taylor Simpson
2020-09-28 17:28 ` [RFC PATCH v4 20/29] Hexagon (target/hexagon) generater phase 4 - " Taylor Simpson
2020-09-28 17:28 ` [RFC PATCH v4 21/29] Hexagon (target/hexagon) opcode data structures Taylor Simpson
2020-09-28 17:28 ` [RFC PATCH v4 22/29] Hexagon (target/hexagon) macros Taylor Simpson
2020-09-29 12:34   ` Philippe Mathieu-Daudé
2020-09-30 21:22     ` Taylor Simpson
2020-10-01  8:53       ` Philippe Mathieu-Daudé
2020-09-28 17:28 ` [RFC PATCH v4 23/29] Hexagon (target/hexagon) instruction classes Taylor Simpson
2020-09-28 17:28 ` [RFC PATCH v4 24/29] Hexagon (target/hexagon) TCG generation Taylor Simpson
2020-09-28 17:28 ` [RFC PATCH v4 25/29] Hexagon (target/hexagon) TCG for instructions with multiple definitions Taylor Simpson
2020-09-28 17:28 ` [RFC PATCH v4 26/29] Hexagon (target/hexagon) translation Taylor Simpson
2020-09-28 17:28 ` [RFC PATCH v4 27/29] Hexagon (linux-user/hexagon) Linux user emulation Taylor Simpson
2020-09-28 21:02   ` Laurent Vivier
2020-09-30 20:47     ` Taylor Simpson
2020-10-01  7:53       ` Laurent Vivier
2020-10-01 15:40         ` Taylor Simpson
2020-10-01 18:08           ` Laurent Vivier
2020-09-28 17:28 ` [RFC PATCH v4 28/29] Hexagon (tests/tcg/hexagon) TCG tests Taylor Simpson
2020-09-28 17:28 ` [RFC PATCH v4 29/29] Hexagon build infrastructure Taylor Simpson
2020-09-29  1:12 ` [RFC PATCH v4 00/29] Hexagon patch series no-reply
2020-09-29  1:21 ` no-reply
2020-09-29 12:22 ` Philippe Mathieu-Daudé
2020-09-29 15:53   ` Taylor Simpson
2020-09-29 17:01     ` Philippe Mathieu-Daudé
2020-09-29 20:11       ` Taylor Simpson
2020-09-29 20:41         ` Philippe Mathieu-Daudé
2020-09-29 21:28         ` Eric Blake
2020-09-29 22:16           ` Taylor Simpson
2020-09-30  4:08       ` Brad Smith
2020-10-02 17:16 ` Richard Henderson
     [not found] <1601296608-29390-1-git-send-email-tsimpson@quicinc.com>
2020-09-28 12:36 ` [RFC PATCH v4 15/29] Hexagon (target/hexagon) utility functions Taylor Simpson

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=1601314138-9930-16-git-send-email-tsimpson@quicinc.com \
    --to=tsimpson@quicinc.com \
    --cc=ale@rev.ng \
    --cc=aleksandar.m.mail@gmail.com \
    --cc=laurent@vivier.eu \
    --cc=philmd@redhat.com \
    --cc=qemu-devel@nongnu.org \
    --cc=richard.henderson@linaro.org \
    --cc=riku.voipio@iki.fi \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.