All of lore.kernel.org
 help / color / mirror / Atom feed
From: Peter Maydell <peter.maydell@linaro.org>
To: qemu-devel@nongnu.org
Subject: [PULL 10/51] target/arm: Implement MVE VCMUL and VCMLA
Date: Wed,  1 Sep 2021 11:36:12 +0100	[thread overview]
Message-ID: <20210901103653.13435-11-peter.maydell@linaro.org> (raw)
In-Reply-To: <20210901103653.13435-1-peter.maydell@linaro.org>

Implement the MVE VCMUL and VCMLA insns.

Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
---
 target/arm/helper-mve.h    | 18 ++++++++
 target/arm/mve.decode      | 35 ++++++++++++----
 target/arm/mve_helper.c    | 86 ++++++++++++++++++++++++++++++++++++++
 target/arm/translate-mve.c |  8 ++++
 4 files changed, 139 insertions(+), 8 deletions(-)

diff --git a/target/arm/helper-mve.h b/target/arm/helper-mve.h
index c230610d25c..73950403bc3 100644
--- a/target/arm/helper-mve.h
+++ b/target/arm/helper-mve.h
@@ -440,6 +440,24 @@ DEF_HELPER_FLAGS_4(mve_vfmas, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr)
 DEF_HELPER_FLAGS_4(mve_vfmsh, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr)
 DEF_HELPER_FLAGS_4(mve_vfmss, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr)
 
+DEF_HELPER_FLAGS_4(mve_vcmul0h, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr)
+DEF_HELPER_FLAGS_4(mve_vcmul0s, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr)
+DEF_HELPER_FLAGS_4(mve_vcmul90h, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr)
+DEF_HELPER_FLAGS_4(mve_vcmul90s, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr)
+DEF_HELPER_FLAGS_4(mve_vcmul180h, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr)
+DEF_HELPER_FLAGS_4(mve_vcmul180s, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr)
+DEF_HELPER_FLAGS_4(mve_vcmul270h, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr)
+DEF_HELPER_FLAGS_4(mve_vcmul270s, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr)
+
+DEF_HELPER_FLAGS_4(mve_vcmla0h, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr)
+DEF_HELPER_FLAGS_4(mve_vcmla0s, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr)
+DEF_HELPER_FLAGS_4(mve_vcmla90h, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr)
+DEF_HELPER_FLAGS_4(mve_vcmla90s, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr)
+DEF_HELPER_FLAGS_4(mve_vcmla180h, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr)
+DEF_HELPER_FLAGS_4(mve_vcmla180s, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr)
+DEF_HELPER_FLAGS_4(mve_vcmla270h, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr)
+DEF_HELPER_FLAGS_4(mve_vcmla270s, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr)
+
 DEF_HELPER_FLAGS_4(mve_vadd_scalarb, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
 DEF_HELPER_FLAGS_4(mve_vadd_scalarh, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
 DEF_HELPER_FLAGS_4(mve_vadd_scalarw, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
diff --git a/target/arm/mve.decode b/target/arm/mve.decode
index 3a2056f6b34..403381eef61 100644
--- a/target/arm/mve.decode
+++ b/target/arm/mve.decode
@@ -286,15 +286,29 @@ VQSHL_U          111 1 1111 0 . .. ... 0 ... 0 0100 . 1 . 1 ... 0 @2op_rev
 VQRSHL_S         111 0 1111 0 . .. ... 0 ... 0 0101 . 1 . 1 ... 0 @2op_rev
 VQRSHL_U         111 1 1111 0 . .. ... 0 ... 0 0101 . 1 . 1 ... 0 @2op_rev
 
-VQDMLADH         1110 1110 0 . .. ... 0 ... 0 1110 . 0 . 0 ... 0 @2op
-VQDMLADHX        1110 1110 0 . .. ... 0 ... 1 1110 . 0 . 0 ... 0 @2op
-VQRDMLADH        1110 1110 0 . .. ... 0 ... 0 1110 . 0 . 0 ... 1 @2op
-VQRDMLADHX       1110 1110 0 . .. ... 0 ... 1 1110 . 0 . 0 ... 1 @2op
+{
+  VCMUL0         111 . 1110 0 . 11 ... 0 ... 0 1110 . 0 . 0 ... 0 @2op_sz28
+  VQDMLADH       1110  1110 0 . .. ... 0 ... 0 1110 . 0 . 0 ... 0 @2op
+  VQDMLSDH       1111  1110 0 . .. ... 0 ... 0 1110 . 0 . 0 ... 0 @2op
+}
 
-VQDMLSDH         1111 1110 0 . .. ... 0 ... 0 1110 . 0 . 0 ... 0 @2op
-VQDMLSDHX        1111 1110 0 . .. ... 0 ... 1 1110 . 0 . 0 ... 0 @2op
-VQRDMLSDH        1111 1110 0 . .. ... 0 ... 0 1110 . 0 . 0 ... 1 @2op
-VQRDMLSDHX       1111 1110 0 . .. ... 0 ... 1 1110 . 0 . 0 ... 1 @2op
+{
+  VCMUL180       111 . 1110 0 . 11 ... 0 ... 1 1110 . 0 . 0 ... 0 @2op_sz28
+  VQDMLADHX      111 0 1110 0 . .. ... 0 ... 1 1110 . 0 . 0 ... 0 @2op
+  VQDMLSDHX      111 1 1110 0 . .. ... 0 ... 1 1110 . 0 . 0 ... 0 @2op
+}
+
+{
+  VCMUL90        111 . 1110 0 . 11 ... 0 ... 0 1110 . 0 . 0 ... 1 @2op_sz28
+  VQRDMLADH      111 0 1110 0 . .. ... 0 ... 0 1110 . 0 . 0 ... 1 @2op
+  VQRDMLSDH      111 1 1110 0 . .. ... 0 ... 0 1110 . 0 . 0 ... 1 @2op
+}
+
+{
+  VCMUL270       111 . 1110 0 . 11 ... 0 ... 1 1110 . 0 . 0 ... 1 @2op_sz28
+  VQRDMLADHX     111 0 1110 0 . .. ... 0 ... 1 1110 . 0 . 0 ... 1 @2op
+  VQRDMLSDHX     111 1 1110 0 . .. ... 0 ... 1 1110 . 0 . 0 ... 1 @2op
+}
 
 VQDMULLB         111 . 1110 0 . 11 ... 0 ... 0 1111 . 0 . 0 ... 1 @2op_sz28
 VQDMULLT         111 . 1110 0 . 11 ... 0 ... 1 1111 . 0 . 0 ... 1 @2op_sz28
@@ -642,3 +656,8 @@ VCADD270_fp       1111 1101 1 . 0 . ... 0 ... 0 1000 . 1 . 0 ... 0 @2op_fp_size_
 
 VFMA              1110 1111 0 . 0 . ... 0 ... 0 1100 . 1 . 1 ... 0 @2op_fp
 VFMS              1110 1111 0 . 1 . ... 0 ... 0 1100 . 1 . 1 ... 0 @2op_fp
+
+VCMLA0            1111 110 00 . 1 . ... 0 ... 0 1000 . 1 . 0 ... 0 @2op_fp_size_rev
+VCMLA90           1111 110 01 . 1 . ... 0 ... 0 1000 . 1 . 0 ... 0 @2op_fp_size_rev
+VCMLA180          1111 110 10 . 1 . ... 0 ... 0 1000 . 1 . 0 ... 0 @2op_fp_size_rev
+VCMLA270          1111 110 11 . 1 . ... 0 ... 0 1000 . 1 . 0 ... 0 @2op_fp_size_rev
diff --git a/target/arm/mve_helper.c b/target/arm/mve_helper.c
index d7f250a4455..e478408fddd 100644
--- a/target/arm/mve_helper.c
+++ b/target/arm/mve_helper.c
@@ -2931,3 +2931,89 @@ DO_VFMA(vfmah, 2, float16, false)
 DO_VFMA(vfmas, 4, float32, false)
 DO_VFMA(vfmsh, 2, float16, true)
 DO_VFMA(vfmss, 4, float32, true)
+
+#define DO_VCMLA(OP, ESIZE, TYPE, ROT, FN)                              \
+    void HELPER(glue(mve_, OP))(CPUARMState *env,                       \
+                                void *vd, void *vn, void *vm)           \
+    {                                                                   \
+        TYPE *d = vd, *n = vn, *m = vm;                                 \
+        TYPE r0, r1, e1, e2, e3, e4;                                    \
+        uint16_t mask = mve_element_mask(env);                          \
+        unsigned e;                                                     \
+        float_status *fpst0, *fpst1;                                    \
+        float_status scratch_fpst;                                      \
+        /* We loop through pairs of elements at a time */               \
+        for (e = 0; e < 16 / ESIZE; e += 2, mask >>= ESIZE * 2) {       \
+            if ((mask & MAKE_64BIT_MASK(0, ESIZE * 2)) == 0) {          \
+                continue;                                               \
+            }                                                           \
+            fpst0 = (ESIZE == 2) ? &env->vfp.standard_fp_status_f16 :   \
+                &env->vfp.standard_fp_status;                           \
+            fpst1 = fpst0;                                              \
+            if (!(mask & 1)) {                                          \
+                scratch_fpst = *fpst0;                                  \
+                fpst0 = &scratch_fpst;                                  \
+            }                                                           \
+            if (!(mask & (1 << ESIZE))) {                               \
+                scratch_fpst = *fpst1;                                  \
+                fpst1 = &scratch_fpst;                                  \
+            }                                                           \
+            switch (ROT) {                                              \
+            case 0:                                                     \
+                e1 = m[H##ESIZE(e)];                                    \
+                e2 = n[H##ESIZE(e)];                                    \
+                e3 = m[H##ESIZE(e + 1)];                                \
+                e4 = n[H##ESIZE(e)];                                    \
+                break;                                                  \
+            case 1:                                                     \
+                e1 = TYPE##_chs(m[H##ESIZE(e + 1)]);                    \
+                e2 = n[H##ESIZE(e + 1)];                                \
+                e3 = m[H##ESIZE(e)];                                    \
+                e4 = n[H##ESIZE(e + 1)];                                \
+                break;                                                  \
+            case 2:                                                     \
+                e1 = TYPE##_chs(m[H##ESIZE(e)]);                        \
+                e2 = n[H##ESIZE(e)];                                    \
+                e3 = TYPE##_chs(m[H##ESIZE(e + 1)]);                    \
+                e4 = n[H##ESIZE(e)];                                    \
+                break;                                                  \
+            case 3:                                                     \
+                e1 = m[H##ESIZE(e + 1)];                                \
+                e2 = n[H##ESIZE(e + 1)];                                \
+                e3 = TYPE##_chs(m[H##ESIZE(e)]);                        \
+                e4 = n[H##ESIZE(e + 1)];                                \
+                break;                                                  \
+            default:                                                    \
+                g_assert_not_reached();                                 \
+            }                                                           \
+            r0 = FN(e2, e1, d[H##ESIZE(e)], fpst0);                     \
+            r1 = FN(e4, e3, d[H##ESIZE(e + 1)], fpst1);                 \
+            mergemask(&d[H##ESIZE(e)], r0, mask);                       \
+            mergemask(&d[H##ESIZE(e + 1)], r1, mask >> ESIZE);          \
+        }                                                               \
+        mve_advance_vpt(env);                                           \
+    }
+
+#define DO_VCMULH(N, M, D, S) float16_mul(N, M, S)
+#define DO_VCMULS(N, M, D, S) float32_mul(N, M, S)
+
+#define DO_VCMLAH(N, M, D, S) float16_muladd(N, M, D, 0, S)
+#define DO_VCMLAS(N, M, D, S) float32_muladd(N, M, D, 0, S)
+
+DO_VCMLA(vcmul0h, 2, float16, 0, DO_VCMULH)
+DO_VCMLA(vcmul0s, 4, float32, 0, DO_VCMULS)
+DO_VCMLA(vcmul90h, 2, float16, 1, DO_VCMULH)
+DO_VCMLA(vcmul90s, 4, float32, 1, DO_VCMULS)
+DO_VCMLA(vcmul180h, 2, float16, 2, DO_VCMULH)
+DO_VCMLA(vcmul180s, 4, float32, 2, DO_VCMULS)
+DO_VCMLA(vcmul270h, 2, float16, 3, DO_VCMULH)
+DO_VCMLA(vcmul270s, 4, float32, 3, DO_VCMULS)
+
+DO_VCMLA(vcmla0h, 2, float16, 0, DO_VCMLAH)
+DO_VCMLA(vcmla0s, 4, float32, 0, DO_VCMLAS)
+DO_VCMLA(vcmla90h, 2, float16, 1, DO_VCMLAH)
+DO_VCMLA(vcmla90s, 4, float32, 1, DO_VCMLAS)
+DO_VCMLA(vcmla180h, 2, float16, 2, DO_VCMLAH)
+DO_VCMLA(vcmla180s, 4, float32, 2, DO_VCMLAS)
+DO_VCMLA(vcmla270h, 2, float16, 3, DO_VCMLAH)
+DO_VCMLA(vcmla270s, 4, float32, 3, DO_VCMLAS)
diff --git a/target/arm/translate-mve.c b/target/arm/translate-mve.c
index d61abc6d46f..d62ed1fc295 100644
--- a/target/arm/translate-mve.c
+++ b/target/arm/translate-mve.c
@@ -856,6 +856,14 @@ DO_2OP_FP(VCADD90_fp, vfcadd90)
 DO_2OP_FP(VCADD270_fp, vfcadd270)
 DO_2OP_FP(VFMA, vfma)
 DO_2OP_FP(VFMS, vfms)
+DO_2OP_FP(VCMUL0, vcmul0)
+DO_2OP_FP(VCMUL90, vcmul90)
+DO_2OP_FP(VCMUL180, vcmul180)
+DO_2OP_FP(VCMUL270, vcmul270)
+DO_2OP_FP(VCMLA0, vcmla0)
+DO_2OP_FP(VCMLA90, vcmla90)
+DO_2OP_FP(VCMLA180, vcmla180)
+DO_2OP_FP(VCMLA270, vcmla270)
 
 static bool do_2op_scalar(DisasContext *s, arg_2scalar *a,
                           MVEGenTwoOpScalarFn fn)
-- 
2.20.1



  parent reply	other threads:[~2021-09-01 10:49 UTC|newest]

Thread overview: 53+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2021-09-01 10:36 [PULL 00/51] target-arm queue Peter Maydell
2021-09-01 10:36 ` [PULL 01/51] tests: Remove uses of deprecated raspi2/raspi3 machine names Peter Maydell
2021-09-01 10:36 ` [PULL 02/51] hw/arm/raspi: Remove deprecated raspi2/raspi3 aliases Peter Maydell
2021-09-01 10:36 ` [PULL 03/51] hw/intc/arm_gicv3_dist: Rename 64-bit accessors with 'q' suffix Peter Maydell
2021-09-01 10:36 ` [PULL 04/51] hw/intc/arm_gicv3: Replace mis-used MEMTX_* constants by booleans Peter Maydell
2021-09-01 10:36 ` [PULL 05/51] hw: Add compat machines for 6.2 Peter Maydell
2021-09-01 10:36 ` [PULL 06/51] target/arm: Implement MVE VADD (floating-point) Peter Maydell
2021-09-01 10:36 ` [PULL 07/51] target/arm: Implement MVE VSUB, VMUL, VABD, VMAXNM, VMINNM Peter Maydell
2021-09-01 10:36 ` [PULL 08/51] target/arm: Implement MVE VCADD Peter Maydell
2021-09-01 10:36 ` [PULL 09/51] target/arm: Implement MVE VFMA and VFMS Peter Maydell
2021-09-01 10:36 ` Peter Maydell [this message]
2021-09-01 10:36 ` [PULL 11/51] target/arm: Implement MVE VMAXNMA and VMINNMA Peter Maydell
2021-09-01 10:36 ` [PULL 12/51] target/arm: Implement MVE scalar fp insns Peter Maydell
2021-09-01 10:36 ` [PULL 13/51] target/arm: Implement MVE fp-with-scalar VFMA, VFMAS Peter Maydell
2021-09-01 10:36 ` [PULL 14/51] softfloat: Remove assertion preventing silencing of NaN in default-NaN mode Peter Maydell
2021-09-01 10:36 ` [PULL 15/51] target/arm: Implement MVE FP max/min across vector Peter Maydell
2021-09-01 10:36 ` [PULL 16/51] target/arm: Implement MVE fp vector comparisons Peter Maydell
2021-09-01 10:36 ` [PULL 17/51] target/arm: Implement MVE fp scalar comparisons Peter Maydell
2021-09-01 10:36 ` [PULL 18/51] target/arm: Implement MVE VCVT between floating and fixed point Peter Maydell
2021-09-01 10:36 ` [PULL 19/51] target/arm: Implement MVE VCVT between fp and integer Peter Maydell
2021-09-01 10:36 ` [PULL 20/51] target/arm: Implement MVE VCVT with specified rounding mode Peter Maydell
2021-09-01 10:36 ` [PULL 21/51] target/arm: Implement MVE VCVT between single and half precision Peter Maydell
2021-09-01 10:36 ` [PULL 22/51] target/arm: Implement MVE VRINT insns Peter Maydell
2021-09-01 10:36 ` [PULL 23/51] target/arm: Enable MVE in Cortex-M55 Peter Maydell
2021-09-01 10:36 ` [PULL 24/51] target-arm: Add support for Fujitsu A64FX Peter Maydell
2021-09-01 10:36 ` [PULL 25/51] hw/arm/virt: target-arm: Add A64FX processor support to virt machine Peter Maydell
2021-09-01 10:36 ` [PULL 26/51] tests/arm-cpu-features: Add A64FX processor related tests Peter Maydell
2021-09-01 10:36 ` [PULL 27/51] arm: Move M-profile RAS register block into its own device Peter Maydell
2021-09-01 10:36 ` [PULL 28/51] arm: Move systick device creation from NVIC to ARMv7M object Peter Maydell
2021-09-01 10:36 ` [PULL 29/51] arm: Move system PPB container handling to armv7m Peter Maydell
2021-09-01 10:36 ` [PULL 30/51] hw/timer/armv7m_systick: Add usual QEMU interface comment Peter Maydell
2021-09-01 10:36 ` [PULL 31/51] hw/timer/armv7m_systick: Add input clocks Peter Maydell
2021-09-01 10:36 ` [PULL 32/51] hw/arm/armv7m: Create " Peter Maydell
2021-09-01 10:36 ` [PULL 33/51] armsse: Wire up systick cpuclk clock Peter Maydell
2021-09-01 10:36 ` [PULL 34/51] hw/arm/mps2.c: Connect up armv7m clocks Peter Maydell
2021-09-01 10:36 ` [PULL 35/51] clock: Provide builtin multiplier/divider Peter Maydell
2021-09-01 10:36 ` [PULL 36/51] hw/arm: Don't allocate separate MemoryRegions in stm32 SoC realize Peter Maydell
2021-09-01 10:36 ` [PULL 37/51] hw/arm/stm32f100: Wire up sysclk and refclk Peter Maydell
2021-09-01 10:36 ` [PULL 38/51] hw/arm/stm32f205: " Peter Maydell
2021-09-01 10:36 ` [PULL 39/51] hw/arm/stm32f405: " Peter Maydell
2021-09-01 10:36 ` [PULL 40/51] hw/arm/stm32vldiscovery: Delete trailing blank line Peter Maydell
2021-09-01 10:36 ` [PULL 41/51] hw/arm/nrf51: Wire up sysclk Peter Maydell
2021-09-01 10:36 ` [PULL 42/51] hw/arm/stellaris: split stellaris_sys_init() Peter Maydell
2021-09-01 10:36 ` [PULL 43/51] hw/arm/stellaris: Wire sysclk up to armv7m Peter Maydell
2021-09-01 10:36 ` [PULL 44/51] hw/arm/msf2_soc: Don't allocate separate MemoryRegions Peter Maydell
2021-09-01 10:36 ` [PULL 45/51] hw/arm/msf2: Use Clock input to MSF2_SOC instead of m3clk property Peter Maydell
2021-09-01 10:36 ` [PULL 46/51] hw/arm/msf2-soc: Wire up refclk Peter Maydell
2021-09-01 10:36 ` [PULL 47/51] hw/timer/armv7m_systick: Use clock inputs instead of system_clock_scale Peter Maydell
2021-09-01 10:36 ` [PULL 48/51] hw/arm/stellaris: Fix code style issues in GPTM code Peter Maydell
2021-09-01 10:36 ` [PULL 49/51] hw/arm/stellaris: Split stellaris-gptm into its own file Peter Maydell
2021-09-01 10:36 ` [PULL 50/51] hw/timer/stellaris-gptm: Use Clock input instead of system_clock_scale Peter Maydell
2021-09-01 10:36 ` [PULL 51/51] arm: Remove system_clock_scale global Peter Maydell
2021-09-02  7:48 ` [PULL 00/51] target-arm queue Peter Maydell

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20210901103653.13435-11-peter.maydell@linaro.org \
    --to=peter.maydell@linaro.org \
    --cc=qemu-devel@nongnu.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.