All of lore.kernel.org
 help / color / mirror / Atom feed
From: Peter Maydell <peter.maydell@linaro.org>
To: qemu-devel@nongnu.org
Subject: [PULL 10/49] target/arm: Speed up aarch64 TBL/TBX
Date: Fri,  5 Mar 2021 17:14:36 +0000	[thread overview]
Message-ID: <20210305171515.1038-11-peter.maydell@linaro.org> (raw)
In-Reply-To: <20210305171515.1038-1-peter.maydell@linaro.org>

From: Richard Henderson <richard.henderson@linaro.org>

Always perform one call instead of two for 16-byte operands.
Use byte loads/stores directly into the vector register file
instead of extractions and deposits to a 64-bit local variable.

In order to easily receive pointers into the vector register file,
convert the helper to the gvec out-of-line signature.  Move the
helper into vec_helper.c, where it can make use of H1 and clear_tail.

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Tested-by: Alex Bennée <alex.bennee@linaro.org>
Message-id: 20210224230532.276878-1-richard.henderson@linaro.org
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
---
 target/arm/helper-a64.h    |  2 +-
 target/arm/helper-a64.c    | 32 ---------------------
 target/arm/translate-a64.c | 58 +++++---------------------------------
 target/arm/vec_helper.c    | 48 +++++++++++++++++++++++++++++++
 4 files changed, 56 insertions(+), 84 deletions(-)

diff --git a/target/arm/helper-a64.h b/target/arm/helper-a64.h
index 7bd6aed659b..c139fa81f94 100644
--- a/target/arm/helper-a64.h
+++ b/target/arm/helper-a64.h
@@ -28,7 +28,7 @@ DEF_HELPER_3(vfp_cmps_a64, i64, f32, f32, ptr)
 DEF_HELPER_3(vfp_cmpes_a64, i64, f32, f32, ptr)
 DEF_HELPER_3(vfp_cmpd_a64, i64, f64, f64, ptr)
 DEF_HELPER_3(vfp_cmped_a64, i64, f64, f64, ptr)
-DEF_HELPER_FLAGS_5(simd_tbl, TCG_CALL_NO_RWG_SE, i64, env, i64, i64, i32, i32)
+DEF_HELPER_FLAGS_4(simd_tblx, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
 DEF_HELPER_FLAGS_3(vfp_mulxs, TCG_CALL_NO_RWG, f32, f32, f32, ptr)
 DEF_HELPER_FLAGS_3(vfp_mulxd, TCG_CALL_NO_RWG, f64, f64, f64, ptr)
 DEF_HELPER_FLAGS_3(neon_ceq_f64, TCG_CALL_NO_RWG, i64, i64, i64, ptr)
diff --git a/target/arm/helper-a64.c b/target/arm/helper-a64.c
index 7f56c78fa6c..061c8ff846c 100644
--- a/target/arm/helper-a64.c
+++ b/target/arm/helper-a64.c
@@ -179,38 +179,6 @@ float64 HELPER(vfp_mulxd)(float64 a, float64 b, void *fpstp)
     return float64_mul(a, b, fpst);
 }
 
-uint64_t HELPER(simd_tbl)(CPUARMState *env, uint64_t result, uint64_t indices,
-                          uint32_t rn, uint32_t numregs)
-{
-    /* Helper function for SIMD TBL and TBX. We have to do the table
-     * lookup part for the 64 bits worth of indices we're passed in.
-     * result is the initial results vector (either zeroes for TBL
-     * or some guest values for TBX), rn the register number where
-     * the table starts, and numregs the number of registers in the table.
-     * We return the results of the lookups.
-     */
-    int shift;
-
-    for (shift = 0; shift < 64; shift += 8) {
-        int index = extract64(indices, shift, 8);
-        if (index < 16 * numregs) {
-            /* Convert index (a byte offset into the virtual table
-             * which is a series of 128-bit vectors concatenated)
-             * into the correct register element plus a bit offset
-             * into that element, bearing in mind that the table
-             * can wrap around from V31 to V0.
-             */
-            int elt = (rn * 2 + (index >> 3)) % 64;
-            int bitidx = (index & 7) * 8;
-            uint64_t *q = aa64_vfp_qreg(env, elt >> 1);
-            uint64_t val = extract64(q[elt & 1], bitidx, 8);
-
-            result = deposit64(result, shift, 8, val);
-        }
-    }
-    return result;
-}
-
 /* 64bit/double versions of the neon float compare functions */
 uint64_t HELPER(neon_ceq_f64)(float64 a, float64 b, void *fpstp)
 {
diff --git a/target/arm/translate-a64.c b/target/arm/translate-a64.c
index 3aebdb4af9e..b591f096dfa 100644
--- a/target/arm/translate-a64.c
+++ b/target/arm/translate-a64.c
@@ -7532,10 +7532,8 @@ static void disas_simd_tb(DisasContext *s, uint32_t insn)
     int rm = extract32(insn, 16, 5);
     int rn = extract32(insn, 5, 5);
     int rd = extract32(insn, 0, 5);
-    int is_tblx = extract32(insn, 12, 1);
-    int len = extract32(insn, 13, 2);
-    TCGv_i64 tcg_resl, tcg_resh, tcg_idx;
-    TCGv_i32 tcg_regno, tcg_numregs;
+    int is_tbx = extract32(insn, 12, 1);
+    int len = (extract32(insn, 13, 2) + 1) * 16;
 
     if (op2 != 0) {
         unallocated_encoding(s);
@@ -7546,53 +7544,11 @@ static void disas_simd_tb(DisasContext *s, uint32_t insn)
         return;
     }
 
-    /* This does a table lookup: for every byte element in the input
-     * we index into a table formed from up to four vector registers,
-     * and then the output is the result of the lookups. Our helper
-     * function does the lookup operation for a single 64 bit part of
-     * the input.
-     */
-    tcg_resl = tcg_temp_new_i64();
-    tcg_resh = NULL;
-
-    if (is_tblx) {
-        read_vec_element(s, tcg_resl, rd, 0, MO_64);
-    } else {
-        tcg_gen_movi_i64(tcg_resl, 0);
-    }
-
-    if (is_q) {
-        tcg_resh = tcg_temp_new_i64();
-        if (is_tblx) {
-            read_vec_element(s, tcg_resh, rd, 1, MO_64);
-        } else {
-            tcg_gen_movi_i64(tcg_resh, 0);
-        }
-    }
-
-    tcg_idx = tcg_temp_new_i64();
-    tcg_regno = tcg_const_i32(rn);
-    tcg_numregs = tcg_const_i32(len + 1);
-    read_vec_element(s, tcg_idx, rm, 0, MO_64);
-    gen_helper_simd_tbl(tcg_resl, cpu_env, tcg_resl, tcg_idx,
-                        tcg_regno, tcg_numregs);
-    if (is_q) {
-        read_vec_element(s, tcg_idx, rm, 1, MO_64);
-        gen_helper_simd_tbl(tcg_resh, cpu_env, tcg_resh, tcg_idx,
-                            tcg_regno, tcg_numregs);
-    }
-    tcg_temp_free_i64(tcg_idx);
-    tcg_temp_free_i32(tcg_regno);
-    tcg_temp_free_i32(tcg_numregs);
-
-    write_vec_element(s, tcg_resl, rd, 0, MO_64);
-    tcg_temp_free_i64(tcg_resl);
-
-    if (is_q) {
-        write_vec_element(s, tcg_resh, rd, 1, MO_64);
-        tcg_temp_free_i64(tcg_resh);
-    }
-    clear_vec_high(s, is_q, rd);
+    tcg_gen_gvec_2_ptr(vec_full_reg_offset(s, rd),
+                       vec_full_reg_offset(s, rm), cpu_env,
+                       is_q ? 16 : 8, vec_full_reg_size(s),
+                       (len << 6) | (is_tbx << 5) | rn,
+                       gen_helper_simd_tblx);
 }
 
 /* ZIP/UZP/TRN
diff --git a/target/arm/vec_helper.c b/target/arm/vec_helper.c
index 7174030377c..3fbeae87cb3 100644
--- a/target/arm/vec_helper.c
+++ b/target/arm/vec_helper.c
@@ -1937,3 +1937,51 @@ DO_VRINT_RMODE(gvec_vrint_rm_h, helper_rinth, uint16_t)
 DO_VRINT_RMODE(gvec_vrint_rm_s, helper_rints, uint32_t)
 
 #undef DO_VRINT_RMODE
+
+#ifdef TARGET_AARCH64
+void HELPER(simd_tblx)(void *vd, void *vm, void *venv, uint32_t desc)
+{
+    const uint8_t *indices = vm;
+    CPUARMState *env = venv;
+    size_t oprsz = simd_oprsz(desc);
+    uint32_t rn = extract32(desc, SIMD_DATA_SHIFT, 5);
+    bool is_tbx = extract32(desc, SIMD_DATA_SHIFT + 5, 1);
+    uint32_t table_len = desc >> (SIMD_DATA_SHIFT + 6);
+    union {
+        uint8_t b[16];
+        uint64_t d[2];
+    } result;
+
+    /*
+     * We must construct the final result in a temp, lest the output
+     * overlaps the input table.  For TBL, begin with zero; for TBX,
+     * begin with the original register contents.  Note that we always
+     * copy 16 bytes here to avoid an extra branch; clearing the high
+     * bits of the register for oprsz == 8 is handled below.
+     */
+    if (is_tbx) {
+        memcpy(&result, vd, 16);
+    } else {
+        memset(&result, 0, 16);
+    }
+
+    for (size_t i = 0; i < oprsz; ++i) {
+        uint32_t index = indices[H1(i)];
+
+        if (index < table_len) {
+            /*
+             * Convert index (a byte offset into the virtual table
+             * which is a series of 128-bit vectors concatenated)
+             * into the correct register element, bearing in mind
+             * that the table can wrap around from V31 to V0.
+             */
+            const uint8_t *table = (const uint8_t *)
+                aa64_vfp_qreg(env, (rn + (index >> 4)) % 32);
+            result.b[H1(i)] = table[H1(index % 16)];
+        }
+    }
+
+    memcpy(vd, &result, 16);
+    clear_tail(vd, oprsz, simd_maxsz(desc));
+}
+#endif
-- 
2.20.1



  parent reply	other threads:[~2021-03-05 17:47 UTC|newest]

Thread overview: 51+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2021-03-05 17:14 [PULL 00/49] target-arm queue Peter Maydell
2021-03-05 17:14 ` [PULL 01/49] sbsa-ref: remove cortex-a53 from list of supported cpus Peter Maydell
2021-03-05 17:14 ` [PULL 02/49] sbsa-ref: add 'max' to list of allowed cpus Peter Maydell
2021-03-05 17:14 ` [PULL 03/49] target/arm: Add support for FEAT_SSBS, Speculative Store Bypass Safe Peter Maydell
2021-03-05 17:14 ` [PULL 04/49] target/arm: Enable FEAT_SSBS for "max" AARCH64 CPU Peter Maydell
2021-03-05 17:14 ` [PULL 05/49] target/arm: Set ID_PFR2.SSBS to 1 for "max" 32-bit CPU Peter Maydell
2021-03-05 17:14 ` [PULL 06/49] hw/net: Add npcm7xx emc model Peter Maydell
2021-03-05 17:14 ` [PULL 07/49] hw/arm: " Peter Maydell
2021-03-05 17:14 ` [PULL 08/49] tests/qtests: Add npcm7xx emc model test Peter Maydell
2021-03-05 17:14 ` [PULL 09/49] hw/arm/xlnx-zynqmp: Remove obsolete 'has_rpu' property Peter Maydell
2021-03-05 17:14 ` Peter Maydell [this message]
2021-03-05 17:14 ` [PULL 11/49] hw/i2c/npcm7xx_smbus: Simplify npcm7xx_smbus_init() Peter Maydell
2021-03-05 17:14 ` [PULL 12/49] virtio-mmio: improve virtio-mmio get_dev_path alog Peter Maydell
2021-03-05 17:14 ` [PULL 13/49] target/arm: Use TCF0 and TFSRE0 for unprivileged tag checks Peter Maydell
2021-03-05 17:14 ` [PULL 14/49] target/arm: Restrict v8M IDAU to TCG Peter Maydell
2021-03-05 17:14 ` [PULL 15/49] target/arm/cpu: Update coding style to make checkpatch.pl happy Peter Maydell
2021-03-05 17:14 ` [PULL 16/49] hw/arm/musicpal: Remove dead code for non-32-bit-RGB surfaces Peter Maydell
2021-03-05 17:14 ` [PULL 17/49] hw/display/tc6393xb: Remove dead code for handling non-32bpp surfaces Peter Maydell
2021-03-05 17:14 ` [PULL 18/49] hw/display/tc6393xb: Expand out macros in template header Peter Maydell
2021-03-05 17:14 ` [PULL 19/49] hw/display/tc6393xb: Inline tc6393xb_draw_graphic32() at its callsite Peter Maydell
2021-03-05 17:14 ` [PULL 20/49] hw/display/omap_lcdc: Expand out macros in template header Peter Maydell
2021-03-05 17:14 ` [PULL 21/49] hw/display/omap_lcdc: Drop broken bigendian ifdef Peter Maydell
2021-03-05 17:14 ` [PULL 22/49] hw/display/omap_lcdc: Fix coding style issues in template header Peter Maydell
2021-03-05 17:14 ` [PULL 23/49] hw/display/omap_lcdc: Inline template header into C file Peter Maydell
2021-03-05 17:14 ` [PULL 24/49] hw/display/omap_lcdc: Delete unnecessary macro Peter Maydell
2021-03-05 17:14 ` [PULL 25/49] hw/display/tcx: Drop unnecessary code for handling BGR format outputs Peter Maydell
2021-03-05 17:14 ` [PULL 26/49] hw/arm/mps2-tz: Make SYSCLK frequency board-specific Peter Maydell
2021-03-05 17:14 ` [PULL 27/49] hw/misc/mps2-scc: Support configurable number of OSCCLK values Peter Maydell
2021-03-05 17:14 ` [PULL 28/49] hw/arm/mps2-tz: Correct the OSCCLK settings for mps2-an505 and mps2-an511 Peter Maydell
2021-03-05 17:14 ` [PULL 29/49] hw/arm/mps2-tz: Make the OSCCLK settings be configurable per-board Peter Maydell
2021-03-05 17:14 ` [PULL 30/49] hw/misc/mps2-fpgaio: Make number of LEDs configurable by board Peter Maydell
2021-03-05 17:14 ` [PULL 31/49] hw/misc/mps2-fpgaio: Support SWITCH register Peter Maydell
2021-03-05 17:14 ` [PULL 32/49] hw/arm/mps2-tz: Make FPGAIO switch and LED config per-board Peter Maydell
2021-03-05 17:14 ` [PULL 33/49] hw/arm/mps2-tz: Condition IRQ splitting on number of CPUs, not board type Peter Maydell
2021-03-05 17:15 ` [PULL 34/49] hw/arm/mps2-tz: Make number of IRQs board-specific Peter Maydell
2021-03-05 17:15 ` [PULL 35/49] hw/misc/mps2-scc: Implement CFG_REG5 and CFG_REG6 for MPS3 AN524 Peter Maydell
2021-03-05 17:15 ` [PULL 36/49] hw/arm/mps2-tz: Correct wrong interrupt numbers for DMA and SPI Peter Maydell
2021-03-05 17:15 ` [PULL 37/49] hw/arm/mps2-tz: Allow PPCPortInfo structures to specify device interrupts Peter Maydell
2021-03-05 17:15 ` [PULL 38/49] hw/arm/mps2-tz: Move device IRQ info to data structures Peter Maydell
2021-03-05 17:15 ` [PULL 39/49] hw/arm/mps2-tz: Size the uart-irq-orgate based on the number of UARTs Peter Maydell
2021-03-05 17:15 ` [PULL 40/49] hw/arm/mps2-tz: Allow boards to have different PPCInfo data Peter Maydell
2021-03-05 17:15 ` [PULL 41/49] hw/arm/mps2-tz: Make RAM arrangement board-specific Peter Maydell
2021-03-05 17:15 ` [PULL 42/49] hw/arm/mps2-tz: Set MachineClass default_ram info from RAMInfo data Peter Maydell
2021-03-05 17:15 ` [PULL 43/49] hw/arm/mps2-tz: Support ROMs as well as RAMs Peter Maydell
2021-03-05 17:15 ` [PULL 44/49] hw/arm/mps2-tz: Get armv7m_load_kernel() size argument from RAMInfo Peter Maydell
2021-03-05 17:15 ` [PULL 45/49] hw/arm/mps2-tz: Add new mps3-an524 board Peter Maydell
2021-03-05 17:15 ` [PULL 46/49] hw/arm/mps2-tz: Stub out USB controller for mps3-an524 Peter Maydell
2021-03-05 17:15 ` [PULL 47/49] hw/arm/mps2-tz: Provide PL031 RTC on mps3-an524 Peter Maydell
2021-03-05 17:15 ` [PULL 48/49] docs/system/arm/mps2.rst: Document the new mps3-an524 board Peter Maydell
2021-03-05 17:15 ` [PULL 49/49] hw/arm/mps2: Update old infocenter.arm.com URLs Peter Maydell
2021-03-05 18:36 ` [PULL 00/49] target-arm queue no-reply

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20210305171515.1038-11-peter.maydell@linaro.org \
    --to=peter.maydell@linaro.org \
    --cc=qemu-devel@nongnu.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.