[RFC PATCH v2 7/7] target/ppc: Implemented [pm]xvbf16ger2*

From: "Lucas Mateus Castro(alqotel)" <lucas.araujo@eldorado.org.br>
To: qemu-ppc@nongnu.org
Cc: richard.henderson@linaro.org, "Joel Stanley" <joel@jms.id.au>,
	"Lucas Mateus Castro (alqotel)" <lucas.araujo@eldorado.org.br>,
	"Cédric Le Goater" <clg@kaod.org>,
	"Daniel Henrique Barboza" <danielhb413@gmail.com>,
	"David Gibson" <david@gibson.dropbear.id.au>,
	"Greg Kurz" <groug@kaod.org>,
	qemu-devel@nongnu.org (open list:All patches CC here)
Subject: [RFC PATCH v2 7/7] target/ppc: Implemented [pm]xvbf16ger2*
Date: Fri,  6 May 2022 09:18:44 -0300	[thread overview]
Message-ID: <20220506121844.18969-8-lucas.araujo@eldorado.org.br> (raw)
In-Reply-To: <20220506121844.18969-1-lucas.araujo@eldorado.org.br>

From: "Lucas Mateus Castro (alqotel)" <lucas.araujo@eldorado.org.br>

Implement the following PowerISA v3.1 instructions:
xvbf16ger2:   VSX Vector bfloat16 GER (rank-2 update)
xvbf16ger2nn: VSX Vector bfloat16 GER (rank-2 update) Negative multiply,
Negative accumulate
xvbf16ger2np: VSX Vector bfloat16 GER (rank-2 update) Negative multiply,
Positive accumulate
xvbf16ger2pn: VSX Vector bfloat16 GER (rank-2 update) Positive multiply,
Negative accumulate
xvbf16ger2pp: VSX Vector bfloat16 GER (rank-2 update) Positive multiply,
Positive accumulate
pmxvbf16ger2:   Prefixed Masked VSX Vector bfloat16 GER (rank-2 update)
pmxvbf16ger2nn: Prefixed Masked VSX Vector bfloat16 GER (rank-2 update)
Negative multiply, Negative accumulate
pmxvbf16ger2np: Prefixed Masked VSX Vector bfloat16 GER (rank-2 update)
Negative multiply, Positive accumulate
pmxvbf16ger2pn: Prefixed Masked VSX Vector bfloat16 GER (rank-2 update)
Positive multiply, Negative accumulate
pmxvbf16ger2pp: Prefixed Masked VSX Vector bfloat16 GER (rank-2 update)
Positive multiply, Positive accumulate

Signed-off-by: Lucas Mateus Castro (alqotel) <lucas.araujo@eldorado.org.br>
---
There's a discrepancy between this implementation and mambo/the
hardware where implementing it with float64_mul then float64r32_muladd
sometimes results in an incorrect result after an underflow, but
implementing with float32_mul then float32_muladd results in incorrect
signal in some 0 or infinite results. I've not been able to solve this
---
 target/ppc/fpu_helper.c             | 40 +++++++++++++++++++++++++++++
 target/ppc/helper.h                 |  5 ++++
 target/ppc/insn32.decode            |  6 +++++
 target/ppc/insn64.decode            | 11 ++++++++
 target/ppc/translate/vsx-impl.c.inc | 12 +++++++++
 5 files changed, 74 insertions(+)

diff --git a/target/ppc/fpu_helper.c b/target/ppc/fpu_helper.c
index 6857be6ccc..0882702301 100644
--- a/target/ppc/fpu_helper.c
+++ b/target/ppc/fpu_helper.c
@@ -3491,6 +3491,11 @@ static float64 extract_hf16(float16 in, float_status *fp_status)
     return float16_to_float64(in, true, fp_status);
 }
 
+static float64 extract_bf16(bfloat16 in, float_status *fp_status)
+{
+    return bfloat16_to_float64(in, fp_status);
+}
+
 static void vsxger16(CPUPPCState *env, ppc_vsr_t *a, ppc_vsr_t *b,
                      ppc_acc_t  *at, uint32_t mask, bool acc,
                      bool neg_mul, bool neg_acc, extract_f16 extract)
@@ -3611,6 +3616,41 @@ static void vsxger(CPUPPCState *env, ppc_vsr_t *a, ppc_vsr_t *b, ppc_acc_t  *at,
     do_float_check_status(env, GETPC());
 }
 
+QEMU_FLATTEN
+void helper_XVBF16GER2(CPUPPCState *env, ppc_vsr_t *a, ppc_vsr_t *b,
+                       ppc_acc_t *at, uint32_t mask)
+{
+    vsxger16(env, a, b, at, mask, false, false, false, extract_bf16);
+}
+
+QEMU_FLATTEN
+void helper_XVBF16GER2PP(CPUPPCState *env, ppc_vsr_t *a, ppc_vsr_t *b,
+                         ppc_acc_t *at, uint32_t mask)
+{
+    vsxger16(env, a, b, at, mask, true, false, false, extract_bf16);
+}
+
+QEMU_FLATTEN
+void helper_XVBF16GER2PN(CPUPPCState *env, ppc_vsr_t *a, ppc_vsr_t *b,
+                         ppc_acc_t *at, uint32_t mask)
+{
+    vsxger16(env, a, b, at, mask, true, false, true, extract_bf16);
+}
+
+QEMU_FLATTEN
+void helper_XVBF16GER2NP(CPUPPCState *env, ppc_vsr_t *a, ppc_vsr_t *b,
+                         ppc_acc_t *at, uint32_t mask)
+{
+    vsxger16(env, a, b, at, mask, true, true, false, extract_bf16);
+}
+
+QEMU_FLATTEN
+void helper_XVBF16GER2NN(CPUPPCState *env, ppc_vsr_t *a, ppc_vsr_t *b,
+                         ppc_acc_t *at, uint32_t mask)
+{
+    vsxger16(env, a, b, at, mask, true, true, true, extract_bf16);
+}
+
 QEMU_FLATTEN
 void helper_XVF16GER2(CPUPPCState *env, ppc_vsr_t *a, ppc_vsr_t *b,
                      ppc_acc_t *at, uint32_t mask)
diff --git a/target/ppc/helper.h b/target/ppc/helper.h
index 5f2f574d30..59e6b74f94 100644
--- a/target/ppc/helper.h
+++ b/target/ppc/helper.h
@@ -551,6 +551,11 @@ DEF_HELPER_5(XVF16GER2PP, void, env, vsr, vsr, vsr, i32)
 DEF_HELPER_5(XVF16GER2PN, void, env, vsr, vsr, vsr, i32)
 DEF_HELPER_5(XVF16GER2NP, void, env, vsr, vsr, vsr, i32)
 DEF_HELPER_5(XVF16GER2NN, void, env, vsr, vsr, vsr, i32)
+DEF_HELPER_5(XVBF16GER2, void, env, vsr, vsr, vsr, i32)
+DEF_HELPER_5(XVBF16GER2PP, void, env, vsr, vsr, vsr, i32)
+DEF_HELPER_5(XVBF16GER2PN, void, env, vsr, vsr, vsr, i32)
+DEF_HELPER_5(XVBF16GER2NP, void, env, vsr, vsr, vsr, i32)
+DEF_HELPER_5(XVBF16GER2NN, void, env, vsr, vsr, vsr, i32)
 DEF_HELPER_5(XVF32GER, void, env, vsr, vsr, vsr, i32)
 DEF_HELPER_5(XVF32GERPP, void, env, vsr, vsr, vsr, i32)
 DEF_HELPER_5(XVF32GERPN, void, env, vsr, vsr, vsr, i32)
diff --git a/target/ppc/insn32.decode b/target/ppc/insn32.decode
index bbd4bc80f8..2090c17268 100644
--- a/target/ppc/insn32.decode
+++ b/target/ppc/insn32.decode
@@ -736,6 +736,12 @@ XVI8GER4SPP     111011 ... -- ..... ..... 01100011 ..-  @XX3_at xa=%xx_xa
 XVI16GER2S      111011 ... -- ..... ..... 00101011 ..-  @XX3_at xa=%xx_xa
 XVI16GER2SPP    111011 ... -- ..... ..... 00101010 ..-  @XX3_at xa=%xx_xa
 
+XVBF16GER2      111011 ... -- ..... ..... 00110011 ..-  @XX3_at xa=%xx_xa
+XVBF16GER2PP    111011 ... -- ..... ..... 00110010 ..-  @XX3_at xa=%xx_xa
+XVBF16GER2PN    111011 ... -- ..... ..... 10110010 ..-  @XX3_at xa=%xx_xa
+XVBF16GER2NP    111011 ... -- ..... ..... 01110010 ..-  @XX3_at xa=%xx_xa
+XVBF16GER2NN    111011 ... -- ..... ..... 11110010 ..-  @XX3_at xa=%xx_xa
+
 XVF16GER2       111011 ... -- ..... ..... 00010011 ..-  @XX3_at xa=%xx_xa
 XVF16GER2PP     111011 ... -- ..... ..... 00010010 ..-  @XX3_at xa=%xx_xa
 XVF16GER2PN     111011 ... -- ..... ..... 10010010 ..-  @XX3_at xa=%xx_xa
diff --git a/target/ppc/insn64.decode b/target/ppc/insn64.decode
index a12f11044c..78738924c6 100644
--- a/target/ppc/insn64.decode
+++ b/target/ppc/insn64.decode
@@ -150,6 +150,17 @@ PMXVI16GER2S    000001 11 1001 -- - - pmsk:2 ------ ........       \
 PMXVI16GER2SPP  000001 11 1001 -- - - pmsk:2 ------ ........       \
                 111011 ... -- ..... ..... 00101010 ..-  @MMIRR_XX3
 
+PMXVBF16GER2    000001 11 1001 -- - - pmsk:2 ------ ........ \
+                111011 ... -- ..... ..... 00110011 ..-  @MMIRR_XX3
+PMXVBF16GER2PP  000001 11 1001 -- - - pmsk:2 ------ ........ \
+                111011 ... -- ..... ..... 00110010 ..-  @MMIRR_XX3
+PMXVBF16GER2PN  000001 11 1001 -- - - pmsk:2 ------ ........ \
+                111011 ... -- ..... ..... 10110010 ..-  @MMIRR_XX3
+PMXVBF16GER2NP  000001 11 1001 -- - - pmsk:2 ------ ........ \
+                111011 ... -- ..... ..... 01110010 ..-  @MMIRR_XX3
+PMXVBF16GER2NN  000001 11 1001 -- - - pmsk:2 ------ ........ \
+                111011 ... -- ..... ..... 11110010 ..-  @MMIRR_XX3
+
 PMXVF16GER2     000001 11 1001 -- - - pmsk:2 ------ ........ \
                 111011 ... -- ..... ..... 00010011 ..-  @MMIRR_XX3
 PMXVF16GER2PP   000001 11 1001 -- - - pmsk:2 ------ ........ \
diff --git a/target/ppc/translate/vsx-impl.c.inc b/target/ppc/translate/vsx-impl.c.inc
index 00eed2b1b9..bb5b68ee06 100644
--- a/target/ppc/translate/vsx-impl.c.inc
+++ b/target/ppc/translate/vsx-impl.c.inc
@@ -2884,6 +2884,12 @@ TRANS64(PMXVI16GER2PP, do_ger_MMIRR_XX3, gen_helper_XVI16GER2PP)
 TRANS64(PMXVI16GER2S, do_ger_MMIRR_XX3, gen_helper_XVI16GER2S)
 TRANS64(PMXVI16GER2SPP, do_ger_MMIRR_XX3, gen_helper_XVI16GER2SPP)
 
+TRANS(XVBF16GER2, do_ger_XX3, gen_helper_XVBF16GER2)
+TRANS(XVBF16GER2PP, do_ger_XX3, gen_helper_XVBF16GER2PP)
+TRANS(XVBF16GER2PN, do_ger_XX3, gen_helper_XVBF16GER2PN)
+TRANS(XVBF16GER2NP, do_ger_XX3, gen_helper_XVBF16GER2NP)
+TRANS(XVBF16GER2NN, do_ger_XX3, gen_helper_XVBF16GER2NN)
+
 TRANS(XVF16GER2, do_ger_XX3, gen_helper_XVF16GER2)
 TRANS(XVF16GER2PP, do_ger_XX3, gen_helper_XVF16GER2PP)
 TRANS(XVF16GER2PN, do_ger_XX3, gen_helper_XVF16GER2PN)
@@ -2902,6 +2908,12 @@ TRANS(XVF64GERPN, do_ger_XX3, gen_helper_XVF64GERPN)
 TRANS(XVF64GERNP, do_ger_XX3, gen_helper_XVF64GERNP)
 TRANS(XVF64GERNN, do_ger_XX3, gen_helper_XVF64GERNN)
 
+TRANS64(PMXVBF16GER2, do_ger_MMIRR_XX3, gen_helper_XVBF16GER2)
+TRANS64(PMXVBF16GER2PP, do_ger_MMIRR_XX3, gen_helper_XVBF16GER2PP)
+TRANS64(PMXVBF16GER2PN, do_ger_MMIRR_XX3, gen_helper_XVBF16GER2PN)
+TRANS64(PMXVBF16GER2NP, do_ger_MMIRR_XX3, gen_helper_XVBF16GER2NP)
+TRANS64(PMXVBF16GER2NN, do_ger_MMIRR_XX3, gen_helper_XVBF16GER2NN)
+
 TRANS64(PMXVF16GER2, do_ger_MMIRR_XX3, gen_helper_XVF16GER2)
 TRANS64(PMXVF16GER2PP, do_ger_MMIRR_XX3, gen_helper_XVF16GER2PP)
 TRANS64(PMXVF16GER2PN, do_ger_MMIRR_XX3, gen_helper_XVF16GER2PN)
-- 
2.31.1