* [PATCH 0/6] target/i386: Host vector ops for high-usage SSE
@ 2022-08-22 22:37 Richard Henderson
  2022-08-22 22:37 ` [PATCH 1/6] target/i386: Define XMMReg and access macros Richard Henderson
                   ` (5 more replies)
  0 siblings, 6 replies; 7+ messages in thread
From: Richard Henderson @ 2022-08-22 22:37 UTC (permalink / raw)
  To: qemu-devel

These five sets of conversions take care of all of the helper entries
that show up above 0.1% in a normal Linux boot, i.e. the easy, common
libc usage.


r~


Richard Henderson (6):
  target/i386: Define XMMReg and access macros
  target/i386: Use tcg gvec for gen_op_movo
  target/i386: Use tcg gvec for pcmp{eq,gt}*
  target/i386: Use tcg gvec for p{add,sub}*
  target/i386: Use tcg gvec for pand, pandn, por, pxor
  target/i386: Use tcg gvec ops for pmovmskb

 target/i386/cpu.h            |  57 +++++++---
 target/i386/ops_sse.h        |  49 ---------
 target/i386/ops_sse_header.h |  24 ----
 target/i386/tcg/translate.c  | 208 ++++++++++++++++++++++++++++-------
 4 files changed, 214 insertions(+), 124 deletions(-)

-- 
2.34.1




* [PATCH 1/6] target/i386: Define XMMReg and access macros
  2022-08-22 22:37 [PATCH 0/6] target/i386: Host vector ops for high-usage SSE Richard Henderson
@ 2022-08-22 22:37 ` Richard Henderson
  2022-08-22 22:37 ` [PATCH 2/6] target/i386: Use tcg gvec for gen_op_movo Richard Henderson
                   ` (4 subsequent siblings)
  5 siblings, 0 replies; 7+ messages in thread
From: Richard Henderson @ 2022-08-22 22:37 UTC (permalink / raw)
  To: qemu-devel

This will be used for proper host-endianness adjustment of gvec XMM ops.
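
As a rough illustration (a standalone sketch, not QEMU code; the union,
macro and variable names below are mine), the point of the per-width
access macros is that element N of a guest register is addressed at a
host-endianness-correct offset, so e.g. XMM_Q(0) always names the
architecturally low 64 bits:

    /* Sketch of the indexing idea behind the new XMM_* macros. */
    #include <stddef.h>
    #include <stdint.h>
    #include <stdio.h>

    typedef union XMMRegSketch {
        uint8_t  b[16];
        uint64_t q[2];
    } XMMRegSketch;

    #if defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
    #define XMM_Q_SKETCH(n)  q[1 - (n)]   /* reversed index, as in the big-endian block */
    #else
    #define XMM_Q_SKETCH(n)  q[n]         /* direct index on little-endian hosts */
    #endif

    int main(void)
    {
        /* Guest quadword 0 lives at byte offset 8 on big-endian hosts, 0 otherwise. */
        printf("low quadword at byte offset %zu\n",
               offsetof(XMMRegSketch, XMM_Q_SKETCH(0)));
        printf("high quadword at byte offset %zu\n",
               offsetof(XMMRegSketch, XMM_Q_SKETCH(1)));
        return 0;
    }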

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 target/i386/cpu.h | 53 +++++++++++++++++++++++++++++++++++++----------
 1 file changed, 42 insertions(+), 11 deletions(-)

diff --git a/target/i386/cpu.h b/target/i386/cpu.h
index 82004b65b9..81e5abed86 100644
--- a/target/i386/cpu.h
+++ b/target/i386/cpu.h
@@ -1233,18 +1233,33 @@ typedef struct SegmentCache {
     uint32_t flags;
 } SegmentCache;
 
-#define MMREG_UNION(n, bits)        \
-    union n {                       \
-        uint8_t  _b_##n[(bits)/8];  \
-        uint16_t _w_##n[(bits)/16]; \
-        uint32_t _l_##n[(bits)/32]; \
-        uint64_t _q_##n[(bits)/64]; \
-        float32  _s_##n[(bits)/32]; \
-        float64  _d_##n[(bits)/64]; \
-    }
+typedef union MMXReg {
+    uint8_t  _b_MMXReg[64 / 8];
+    uint16_t _w_MMXReg[64 / 16];
+    uint32_t _l_MMXReg[64 / 32];
+    uint64_t _q_MMXReg[64 / 64];
+    float32  _s_MMXReg[64 / 32];
+    float64  _d_MMXReg[64 / 64];
+} MMXReg;
 
-typedef MMREG_UNION(ZMMReg, 512) ZMMReg;
-typedef MMREG_UNION(MMXReg, 64)  MMXReg;
+typedef union XMMReg {
+    uint8_t  _b_XMMReg[128 / 8];
+    uint16_t _w_XMMReg[128 / 16];
+    uint32_t _l_XMMReg[128 / 32];
+    uint64_t _q_XMMReg[128 / 64];
+    float32  _s_XMMReg[128 / 32];
+    float64  _d_XMMReg[128 / 64];
+} XMMReg;
+
+typedef union ZMMReg {
+    uint8_t  _b_ZMMReg[512 / 8];
+    uint16_t _w_ZMMReg[512 / 16];
+    uint32_t _l_ZMMReg[512 / 32];
+    uint64_t _q_ZMMReg[512 / 64];
+    float32  _s_ZMMReg[512 / 32];
+    float64  _d_ZMMReg[512 / 64];
+    XMMReg   _x_ZMMReg[512 / 128];
+} ZMMReg;
 
 typedef struct BNDReg {
     uint64_t lb;
@@ -1267,6 +1282,14 @@ typedef struct BNDCSReg {
 #define ZMM_S(n) _s_ZMMReg[15 - (n)]
 #define ZMM_Q(n) _q_ZMMReg[7 - (n)]
 #define ZMM_D(n) _d_ZMMReg[7 - (n)]
+#define ZMM_X(n) _x_ZMMReg[3 - (n)]
+
+#define XMM_B(n) _b_XMMReg[15 - (n)]
+#define XMM_W(n) _w_XMMReg[7 - (n)]
+#define XMM_L(n) _l_XMMReg[3 - (n)]
+#define XMM_S(n) _s_XMMReg[3 - (n)]
+#define XMM_Q(n) _q_XMMReg[1 - (n)]
+#define XMM_D(n) _d_XMMReg[1 - (n)]
 
 #define MMX_B(n) _b_MMXReg[7 - (n)]
 #define MMX_W(n) _w_MMXReg[3 - (n)]
@@ -1279,6 +1302,14 @@ typedef struct BNDCSReg {
 #define ZMM_S(n) _s_ZMMReg[n]
 #define ZMM_Q(n) _q_ZMMReg[n]
 #define ZMM_D(n) _d_ZMMReg[n]
+#define ZMM_X(n) _x_ZMMReg[n]
+
+#define XMM_B(n) _b_XMMReg[n]
+#define XMM_W(n) _w_XMMReg[n]
+#define XMM_L(n) _l_XMMReg[n]
+#define XMM_S(n) _s_XMMReg[n]
+#define XMM_Q(n) _q_XMMReg[n]
+#define XMM_D(n) _d_XMMReg[n]
 
 #define MMX_B(n) _b_MMXReg[n]
 #define MMX_W(n) _w_MMXReg[n]
-- 
2.34.1




* [PATCH 2/6] target/i386: Use tcg gvec for gen_op_movo
  2022-08-22 22:37 [PATCH 0/6] target/i386: Host vector ops for high-usage SSE Richard Henderson
  2022-08-22 22:37 ` [PATCH 1/6] target/i386: Define XMMReg and access macros Richard Henderson
@ 2022-08-22 22:37 ` Richard Henderson
  2022-08-22 22:37 ` [PATCH 3/6] target/i386: Use tcg gvec for pcmp{eq,gt}* Richard Henderson
                   ` (3 subsequent siblings)
  5 siblings, 0 replies; 7+ messages in thread
From: Richard Henderson @ 2022-08-22 22:37 UTC (permalink / raw)
  To: qemu-devel

Low-hanging fruit: use gvec to move the 16 bytes of an XMM register.
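
For reference, the tcg_gen_gvec_mov() arguments used below are, in
order: the element size used for the copy, the destination and source
byte offsets within CPUX86State, the number of bytes operated on, and
the maximum vector size to write.  A commented restatement of the new
call (a fragment of the hunk below, not standalone code):

    /*
     * vece  = MO_64: copy in 64-bit chunks
     * dofs  = d_offset + xmm_ofs: destination offset within cpu_env
     * aofs  = s_offset + xmm_ofs: source offset within cpu_env
     * oprsz = 16: one full XMM register
     * maxsz = 16: equal to oprsz, so no tail is cleared
     */
    int xmm_ofs = offsetof(ZMMReg, ZMM_X(0));
    tcg_gen_gvec_mov(MO_64, d_offset + xmm_ofs, s_offset + xmm_ofs, 16, 16);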

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 target/i386/cpu.h           | 4 ++--
 target/i386/tcg/translate.c | 7 +++----
 2 files changed, 5 insertions(+), 6 deletions(-)

diff --git a/target/i386/cpu.h b/target/i386/cpu.h
index 81e5abed86..dbc9a99a3b 100644
--- a/target/i386/cpu.h
+++ b/target/i386/cpu.h
@@ -1587,8 +1587,8 @@ typedef struct CPUArchState {
     float_status mmx_status; /* for 3DNow! float ops */
     float_status sse_status;
     uint32_t mxcsr;
-    ZMMReg xmm_regs[CPU_NB_REGS == 8 ? 8 : 32];
-    ZMMReg xmm_t0;
+    ZMMReg xmm_regs[CPU_NB_REGS == 8 ? 8 : 32] QEMU_ALIGNED(16);
+    ZMMReg xmm_t0 QEMU_ALIGNED(16);
     MMXReg mmx_t0;
 
     uint64_t opmask_regs[NB_OPMASK_REGS];
diff --git a/target/i386/tcg/translate.c b/target/i386/tcg/translate.c
index b7972f0ff5..c1f1f6f66b 100644
--- a/target/i386/tcg/translate.c
+++ b/target/i386/tcg/translate.c
@@ -23,6 +23,7 @@
 #include "disas/disas.h"
 #include "exec/exec-all.h"
 #include "tcg/tcg-op.h"
+#include "tcg/tcg-op-gvec.h"
 #include "exec/cpu_ldst.h"
 #include "exec/translator.h"
 
@@ -2753,10 +2754,8 @@ static inline void gen_sto_env_A0(DisasContext *s, int offset)
 
 static inline void gen_op_movo(DisasContext *s, int d_offset, int s_offset)
 {
-    tcg_gen_ld_i64(s->tmp1_i64, cpu_env, s_offset + offsetof(ZMMReg, ZMM_Q(0)));
-    tcg_gen_st_i64(s->tmp1_i64, cpu_env, d_offset + offsetof(ZMMReg, ZMM_Q(0)));
-    tcg_gen_ld_i64(s->tmp1_i64, cpu_env, s_offset + offsetof(ZMMReg, ZMM_Q(1)));
-    tcg_gen_st_i64(s->tmp1_i64, cpu_env, d_offset + offsetof(ZMMReg, ZMM_Q(1)));
+    int xmm_ofs = offsetof(ZMMReg, ZMM_X(0));
+    tcg_gen_gvec_mov(MO_64, d_offset + xmm_ofs, s_offset + xmm_ofs, 16, 16);
 }
 
 static inline void gen_op_movq(DisasContext *s, int d_offset, int s_offset)
-- 
2.34.1




* [PATCH 3/6] target/i386: Use tcg gvec for pcmp{eq,gt}*
  2022-08-22 22:37 [PATCH 0/6] target/i386: Host vector ops for high-usage SSE Richard Henderson
  2022-08-22 22:37 ` [PATCH 1/6] target/i386: Define XMMReg and access macros Richard Henderson
  2022-08-22 22:37 ` [PATCH 2/6] target/i386: Use tcg gvec for gen_op_movo Richard Henderson
@ 2022-08-22 22:37 ` Richard Henderson
  2022-08-22 22:37 ` [PATCH 4/6] target/i386: Use tcg gvec for p{add,sub}* Richard Henderson
                   ` (2 subsequent siblings)
  5 siblings, 0 replies; 7+ messages in thread
From: Richard Henderson @ 2022-08-22 22:37 UTC (permalink / raw)
  To: qemu-devel

As pcmpeqb is used by strlen et al., this is the highest-overhead SSE
operation, at 2.5%.  It's simple to include the other compares at the
same time.
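
For context, the reason pcmpeqb dominates is the usual SSE2 string-scan
idiom: compare 16 bytes against zero, then extract the byte sign bits
with pmovmskb.  A simplified, standalone sketch of such a loop (the
function name is mine; real libc code aligns the pointer first so the
16-byte loads cannot cross into an unmapped page):

    #include <emmintrin.h>
    #include <stddef.h>

    static size_t sse2_strlen_sketch(const char *s)
    {
        const __m128i zero = _mm_setzero_si128();
        size_t i = 0;

        for (;;) {
            __m128i chunk = _mm_loadu_si128((const __m128i *)(s + i));
            __m128i eq = _mm_cmpeq_epi8(chunk, zero);   /* pcmpeqb */
            int mask = _mm_movemask_epi8(eq);           /* pmovmskb */
            if (mask) {
                return i + __builtin_ctz(mask);
            }
            i += 16;
        }
    }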

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 target/i386/ops_sse.h        |  8 --------
 target/i386/ops_sse_header.h |  8 --------
 target/i386/tcg/translate.c  | 31 +++++++++++++++++++++++++------
 3 files changed, 25 insertions(+), 22 deletions(-)

diff --git a/target/i386/ops_sse.h b/target/i386/ops_sse.h
index 535440f882..94440a9dc5 100644
--- a/target/i386/ops_sse.h
+++ b/target/i386/ops_sse.h
@@ -420,14 +420,6 @@ SSE_HELPER_Q(helper_pandn, FANDN)
 SSE_HELPER_Q(helper_por, FOR)
 SSE_HELPER_Q(helper_pxor, FXOR)
 
-SSE_HELPER_B(helper_pcmpgtb, FCMPGTB)
-SSE_HELPER_W(helper_pcmpgtw, FCMPGTW)
-SSE_HELPER_L(helper_pcmpgtl, FCMPGTL)
-
-SSE_HELPER_B(helper_pcmpeqb, FCMPEQ)
-SSE_HELPER_W(helper_pcmpeqw, FCMPEQ)
-SSE_HELPER_L(helper_pcmpeql, FCMPEQ)
-
 SSE_HELPER_W(helper_pmullw, FMULLW)
 #if SHIFT == 0
 SSE_HELPER_W(helper_pmulhrw, FMULHRW)
diff --git a/target/i386/ops_sse_header.h b/target/i386/ops_sse_header.h
index cef28f2aae..b9f957daf8 100644
--- a/target/i386/ops_sse_header.h
+++ b/target/i386/ops_sse_header.h
@@ -91,14 +91,6 @@ SSE_HELPER_Q(pandn, FANDN)
 SSE_HELPER_Q(por, FOR)
 SSE_HELPER_Q(pxor, FXOR)
 
-SSE_HELPER_B(pcmpgtb, FCMPGTB)
-SSE_HELPER_W(pcmpgtw, FCMPGTW)
-SSE_HELPER_L(pcmpgtl, FCMPGTL)
-
-SSE_HELPER_B(pcmpeqb, FCMPEQ)
-SSE_HELPER_W(pcmpeqw, FCMPEQ)
-SSE_HELPER_L(pcmpeql, FCMPEQ)
-
 SSE_HELPER_W(pmullw, FMULLW)
 #if SHIFT == 0
 SSE_HELPER_W(pmulhrw, FMULHRW)
diff --git a/target/i386/tcg/translate.c b/target/i386/tcg/translate.c
index c1f1f6f66b..467d018b68 100644
--- a/target/i386/tcg/translate.c
+++ b/target/i386/tcg/translate.c
@@ -2847,9 +2847,9 @@ static const SSEFunc_0_epp sse_op_table1[256][4] = {
     [0x61] = MMX_OP2(punpcklwd),
     [0x62] = MMX_OP2(punpckldq),
     [0x63] = MMX_OP2(packsswb),
-    [0x64] = MMX_OP2(pcmpgtb),
-    [0x65] = MMX_OP2(pcmpgtw),
-    [0x66] = MMX_OP2(pcmpgtl),
+    [0x64] = { SSE_DUMMY, SSE_DUMMY },  /* pcmpgtb */
+    [0x65] = { SSE_DUMMY, SSE_DUMMY },  /* pcmpgtw */
+    [0x66] = { SSE_DUMMY, SSE_DUMMY },  /* pcmpgtl */
     [0x67] = MMX_OP2(packuswb),
     [0x68] = MMX_OP2(punpckhbw),
     [0x69] = MMX_OP2(punpckhwd),
@@ -2866,9 +2866,9 @@ static const SSEFunc_0_epp sse_op_table1[256][4] = {
     [0x71] = { SSE_SPECIAL, SSE_SPECIAL }, /* shiftw */
     [0x72] = { SSE_SPECIAL, SSE_SPECIAL }, /* shiftd */
     [0x73] = { SSE_SPECIAL, SSE_SPECIAL }, /* shiftq */
-    [0x74] = MMX_OP2(pcmpeqb),
-    [0x75] = MMX_OP2(pcmpeqw),
-    [0x76] = MMX_OP2(pcmpeql),
+    [0x74] = { SSE_DUMMY, SSE_DUMMY },     /* pcmpeqb */
+    [0x75] = { SSE_DUMMY, SSE_DUMMY },     /* pcmpeqw */
+    [0x76] = { SSE_DUMMY, SSE_DUMMY },     /* pcmpeql */
     [0x77] = { SSE_DUMMY }, /* emms */
     [0x78] = { NULL, SSE_SPECIAL, NULL, SSE_SPECIAL }, /* extrq_i, insertq_i */
     [0x79] = { NULL, gen_helper_extrq_r, NULL, gen_helper_insertq_r },
@@ -4415,6 +4415,9 @@ static void gen_sse(CPUX86State *env, DisasContext *s, int b,
             return;
         }
     } else {
+        int vec_len = is_xmm ? 16 : 8;
+        int xmm_ofs = is_xmm ? offsetof(ZMMReg, ZMM_X(0)) : 0;
+
         /* generic MMX or SSE operation */
         switch(b) {
         case 0x70: /* pshufx insn */
@@ -4532,6 +4535,22 @@ static void gen_sse(CPUX86State *env, DisasContext *s, int b,
             sse_fn_eppt = (SSEFunc_0_eppt)sse_fn_epp;
             sse_fn_eppt(cpu_env, s->ptr0, s->ptr1, s->A0);
             break;
+        case 0x64: /* pcmpgtb */
+        case 0x65: /* pcmpgtw */
+        case 0x66: /* pcmpgtl */
+            op1_offset += xmm_ofs;
+            op2_offset += xmm_ofs;
+            tcg_gen_gvec_cmp(TCG_COND_GT, b - 0x64, op1_offset, op1_offset,
+                             op2_offset, vec_len, vec_len);
+            break;
+        case 0x74: /* pcmpeqb */
+        case 0x75: /* pcmpeqw */
+        case 0x76: /* pcmpeql */
+            op1_offset += xmm_ofs;
+            op2_offset += xmm_ofs;
+            tcg_gen_gvec_cmp(TCG_COND_EQ, b - 0x74, op1_offset, op1_offset,
+                             op2_offset, vec_len, vec_len);
+            break;
         default:
             tcg_gen_addi_ptr(s->ptr0, cpu_env, op1_offset);
             tcg_gen_addi_ptr(s->ptr1, cpu_env, op2_offset);
-- 
2.34.1




* [PATCH 4/6] target/i386: Use tcg gvec for p{add,sub}*
  2022-08-22 22:37 [PATCH 0/6] target/i386: Host vector ops for high-usage SSE Richard Henderson
                   ` (2 preceding siblings ...)
  2022-08-22 22:37 ` [PATCH 3/6] target/i386: Use tcg gvec for pcmp{eq,gt}* Richard Henderson
@ 2022-08-22 22:37 ` Richard Henderson
  2022-08-22 22:37 ` [PATCH 5/6] target/i386: Use tcg gvec for pand, pandn, por, pxor Richard Henderson
  2022-08-22 22:37 ` [PATCH 6/6] target/i386: Use tcg gvec ops for pmovmskb Richard Henderson
  5 siblings, 0 replies; 7+ messages in thread
From: Richard Henderson @ 2022-08-22 22:37 UTC (permalink / raw)
  To: qemu-devel

psubb is the second-highest-overhead SSE operation, at 0.9%.  It's
simple to include add and the other sizes at the same time.
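
One detail relied on below: the opcode byte directly encodes the element
width, and subtracting the base opcode yields the gvec element-size
argument.  A minimal standalone check of that mapping (the local MO_*
values mirror QEMU's MemOp encoding):

    #include <assert.h>

    enum { MO_8 = 0, MO_16 = 1, MO_32 = 2, MO_64 = 3 };

    int main(void)
    {
        assert(0xf8 - 0xf8 == MO_8);   /* psubb: byte elements  */
        assert(0xf9 - 0xf8 == MO_16);  /* psubw: word elements  */
        assert(0xfa - 0xf8 == MO_32);  /* psubl: dword elements */
        assert(0xfb - 0xf8 == MO_64);  /* psubq: qword elements */
        return 0;
    }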

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 target/i386/ops_sse.h        | 10 ---------
 target/i386/ops_sse_header.h | 10 ---------
 target/i386/tcg/translate.c  | 39 ++++++++++++++++++++++++++++--------
 3 files changed, 31 insertions(+), 28 deletions(-)

diff --git a/target/i386/ops_sse.h b/target/i386/ops_sse.h
index 94440a9dc5..6f035b5c16 100644
--- a/target/i386/ops_sse.h
+++ b/target/i386/ops_sse.h
@@ -389,16 +389,6 @@ static inline int satsw(int x)
 #define FAVG(a, b) (((a) + (b) + 1) >> 1)
 #endif
 
-SSE_HELPER_B(helper_paddb, FADD)
-SSE_HELPER_W(helper_paddw, FADD)
-SSE_HELPER_L(helper_paddl, FADD)
-SSE_HELPER_Q(helper_paddq, FADD)
-
-SSE_HELPER_B(helper_psubb, FSUB)
-SSE_HELPER_W(helper_psubw, FSUB)
-SSE_HELPER_L(helper_psubl, FSUB)
-SSE_HELPER_Q(helper_psubq, FSUB)
-
 SSE_HELPER_B(helper_paddusb, FADDUB)
 SSE_HELPER_B(helper_paddsb, FADDSB)
 SSE_HELPER_B(helper_psubusb, FSUBUB)
diff --git a/target/i386/ops_sse_header.h b/target/i386/ops_sse_header.h
index b9f957daf8..da630fbc40 100644
--- a/target/i386/ops_sse_header.h
+++ b/target/i386/ops_sse_header.h
@@ -60,16 +60,6 @@ DEF_HELPER_3(glue(pslldq, SUFFIX), void, env, Reg, Reg)
 #define SSE_HELPER_Q(name, F)\
     DEF_HELPER_3(glue(name, SUFFIX), void, env, Reg, Reg)
 
-SSE_HELPER_B(paddb, FADD)
-SSE_HELPER_W(paddw, FADD)
-SSE_HELPER_L(paddl, FADD)
-SSE_HELPER_Q(paddq, FADD)
-
-SSE_HELPER_B(psubb, FSUB)
-SSE_HELPER_W(psubw, FSUB)
-SSE_HELPER_L(psubl, FSUB)
-SSE_HELPER_Q(psubq, FSUB)
-
 SSE_HELPER_B(paddusb, FADDUB)
 SSE_HELPER_B(paddsb, FADDSB)
 SSE_HELPER_B(psubusb, FSUBUB)
diff --git a/target/i386/tcg/translate.c b/target/i386/tcg/translate.c
index 467d018b68..2a8ea3369a 100644
--- a/target/i386/tcg/translate.c
+++ b/target/i386/tcg/translate.c
@@ -2882,7 +2882,7 @@ static const SSEFunc_0_epp sse_op_table1[256][4] = {
     [0xd1] = MMX_OP2(psrlw),
     [0xd2] = MMX_OP2(psrld),
     [0xd3] = MMX_OP2(psrlq),
-    [0xd4] = MMX_OP2(paddq),
+    [0xd4] = { SSE_DUMMY, SSE_DUMMY },  /* paddq */
     [0xd5] = MMX_OP2(pmullw),
     [0xd6] = { NULL, SSE_SPECIAL, SSE_SPECIAL, SSE_SPECIAL },
     [0xd7] = { SSE_SPECIAL, SSE_SPECIAL }, /* pmovmskb */
@@ -2919,13 +2919,13 @@ static const SSEFunc_0_epp sse_op_table1[256][4] = {
     [0xf6] = MMX_OP2(psadbw),
     [0xf7] = { (SSEFunc_0_epp)gen_helper_maskmov_mmx,
                (SSEFunc_0_epp)gen_helper_maskmov_xmm }, /* XXX: casts */
-    [0xf8] = MMX_OP2(psubb),
-    [0xf9] = MMX_OP2(psubw),
-    [0xfa] = MMX_OP2(psubl),
-    [0xfb] = MMX_OP2(psubq),
-    [0xfc] = MMX_OP2(paddb),
-    [0xfd] = MMX_OP2(paddw),
-    [0xfe] = MMX_OP2(paddl),
+    [0xf8] = { SSE_DUMMY, SSE_DUMMY },  /* psubb */
+    [0xf9] = { SSE_DUMMY, SSE_DUMMY },  /* psubw */
+    [0xfa] = { SSE_DUMMY, SSE_DUMMY },  /* psubl */
+    [0xfb] = { SSE_DUMMY, SSE_DUMMY },  /* psubq */
+    [0xfc] = { SSE_DUMMY, SSE_DUMMY },  /* paddb */
+    [0xfd] = { SSE_DUMMY, SSE_DUMMY },  /* paddw */
+    [0xfe] = { SSE_DUMMY, SSE_DUMMY },  /* paddl */
 };
 
 static const SSEFunc_0_epp sse_op_table2[3 * 8][2] = {
@@ -4551,6 +4551,29 @@ static void gen_sse(CPUX86State *env, DisasContext *s, int b,
             tcg_gen_gvec_cmp(TCG_COND_EQ, b - 0x74, op1_offset, op1_offset,
                              op2_offset, vec_len, vec_len);
             break;
+        case 0xf8: /* psubb */
+        case 0xf9: /* psubw */
+        case 0xfa: /* psubl */
+        case 0xfb: /* psubq */
+            op1_offset += xmm_ofs;
+            op2_offset += xmm_ofs;
+            tcg_gen_gvec_sub(b - 0xf8, op1_offset, op1_offset,
+                             op2_offset, vec_len, vec_len);
+            break;
+        case 0xfc: /* paddb */
+        case 0xfd: /* paddw */
+        case 0xfe: /* paddl */
+            op1_offset += xmm_ofs;
+            op2_offset += xmm_ofs;
+            tcg_gen_gvec_add(b - 0xfc, op1_offset, op1_offset,
+                             op2_offset, vec_len, vec_len);
+            break;
+        case 0xd4: /* paddq */
+            op1_offset += xmm_ofs;
+            op2_offset += xmm_ofs;
+            tcg_gen_gvec_add(MO_64, op1_offset, op1_offset,
+                             op2_offset, vec_len, vec_len);
+            break;
         default:
             tcg_gen_addi_ptr(s->ptr0, cpu_env, op1_offset);
             tcg_gen_addi_ptr(s->ptr1, cpu_env, op2_offset);
-- 
2.34.1




* [PATCH 5/6] target/i386: Use tcg gvec for pand, pandn, por, pxor
  2022-08-22 22:37 [PATCH 0/6] target/i386: Host vector ops for high-usage SSE Richard Henderson
                   ` (3 preceding siblings ...)
  2022-08-22 22:37 ` [PATCH 4/6] target/i386: Use tcg gvec for p{add,sub}* Richard Henderson
@ 2022-08-22 22:37 ` Richard Henderson
  2022-08-22 22:37 ` [PATCH 6/6] target/i386: Use tcg gvec ops for pmovmskb Richard Henderson
  5 siblings, 0 replies; 7+ messages in thread
From: Richard Henderson @ 2022-08-22 22:37 UTC (permalink / raw)
  To: qemu-devel
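
The only subtle case below is pandn: x86 computes (~dst) & src, while
TCG's andc computes a & ~b, so the translation swaps the operands.  A
minimal standalone check of that equivalence (helper names are mine):

    #include <assert.h>
    #include <stdint.h>

    /* TCG andc semantics: a & ~b */
    static uint64_t andc(uint64_t a, uint64_t b) { return a & ~b; }

    /* x86 pandn semantics: (~dst) & src */
    static uint64_t pandn(uint64_t dst, uint64_t src) { return ~dst & src; }

    int main(void)
    {
        uint64_t dst = 0x00ff00ff00ff00ffULL;
        uint64_t src = 0x0f0f0f0f0f0f0f0fULL;

        assert(pandn(dst, src) == andc(src, dst));  /* operands swapped */
        return 0;
    }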

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 target/i386/ops_sse.h        |  5 ----
 target/i386/ops_sse_header.h |  5 ----
 target/i386/tcg/translate.c  | 45 +++++++++++++++++++++++++++++-------
 3 files changed, 37 insertions(+), 18 deletions(-)

diff --git a/target/i386/ops_sse.h b/target/i386/ops_sse.h
index 6f035b5c16..b21f315f37 100644
--- a/target/i386/ops_sse.h
+++ b/target/i386/ops_sse.h
@@ -405,11 +405,6 @@ SSE_HELPER_B(helper_pmaxub, FMAXUB)
 SSE_HELPER_W(helper_pminsw, FMINSW)
 SSE_HELPER_W(helper_pmaxsw, FMAXSW)
 
-SSE_HELPER_Q(helper_pand, FAND)
-SSE_HELPER_Q(helper_pandn, FANDN)
-SSE_HELPER_Q(helper_por, FOR)
-SSE_HELPER_Q(helper_pxor, FXOR)
-
 SSE_HELPER_W(helper_pmullw, FMULLW)
 #if SHIFT == 0
 SSE_HELPER_W(helper_pmulhrw, FMULHRW)
diff --git a/target/i386/ops_sse_header.h b/target/i386/ops_sse_header.h
index da630fbc40..542701720e 100644
--- a/target/i386/ops_sse_header.h
+++ b/target/i386/ops_sse_header.h
@@ -76,11 +76,6 @@ SSE_HELPER_B(pmaxub, FMAXUB)
 SSE_HELPER_W(pminsw, FMINSW)
 SSE_HELPER_W(pmaxsw, FMAXSW)
 
-SSE_HELPER_Q(pand, FAND)
-SSE_HELPER_Q(pandn, FANDN)
-SSE_HELPER_Q(por, FOR)
-SSE_HELPER_Q(pxor, FXOR)
-
 SSE_HELPER_W(pmullw, FMULLW)
 #if SHIFT == 0
 SSE_HELPER_W(pmulhrw, FMULHRW)
diff --git a/target/i386/tcg/translate.c b/target/i386/tcg/translate.c
index 2a8ea3369a..d25d914d63 100644
--- a/target/i386/tcg/translate.c
+++ b/target/i386/tcg/translate.c
@@ -2820,10 +2820,10 @@ static const SSEFunc_0_epp sse_op_table1[256][4] = {
     [0x51] = SSE_FOP(sqrt),
     [0x52] = { gen_helper_rsqrtps, NULL, gen_helper_rsqrtss, NULL },
     [0x53] = { gen_helper_rcpps, NULL, gen_helper_rcpss, NULL },
-    [0x54] = { gen_helper_pand_xmm, gen_helper_pand_xmm }, /* andps, andpd */
-    [0x55] = { gen_helper_pandn_xmm, gen_helper_pandn_xmm }, /* andnps, andnpd */
-    [0x56] = { gen_helper_por_xmm, gen_helper_por_xmm }, /* orps, orpd */
-    [0x57] = { gen_helper_pxor_xmm, gen_helper_pxor_xmm }, /* xorps, xorpd */
+    [0x54] = { SSE_DUMMY, SSE_DUMMY }, /* andps, andpd */
+    [0x55] = { SSE_DUMMY, SSE_DUMMY }, /* andnps, andnpd */
+    [0x56] = { SSE_DUMMY, SSE_DUMMY }, /* orps, orpd */
+    [0x57] = { SSE_DUMMY, SSE_DUMMY }, /* xorps, xorpd */
     [0x58] = SSE_FOP(add),
     [0x59] = SSE_FOP(mul),
     [0x5a] = { gen_helper_cvtps2pd, gen_helper_cvtpd2ps,
@@ -2889,11 +2889,11 @@ static const SSEFunc_0_epp sse_op_table1[256][4] = {
     [0xd8] = MMX_OP2(psubusb),
     [0xd9] = MMX_OP2(psubusw),
     [0xda] = MMX_OP2(pminub),
-    [0xdb] = MMX_OP2(pand),
+    [0xdb] = { SSE_DUMMY, SSE_DUMMY }, /* pand */
     [0xdc] = MMX_OP2(paddusb),
     [0xdd] = MMX_OP2(paddusw),
     [0xde] = MMX_OP2(pmaxub),
-    [0xdf] = MMX_OP2(pandn),
+    [0xdf] = { SSE_DUMMY, SSE_DUMMY }, /* pandn */
     [0xe0] = MMX_OP2(pavgb),
     [0xe1] = MMX_OP2(psraw),
     [0xe2] = MMX_OP2(psrad),
@@ -2905,11 +2905,11 @@ static const SSEFunc_0_epp sse_op_table1[256][4] = {
     [0xe8] = MMX_OP2(psubsb),
     [0xe9] = MMX_OP2(psubsw),
     [0xea] = MMX_OP2(pminsw),
-    [0xeb] = MMX_OP2(por),
+    [0xeb] = { SSE_DUMMY, SSE_DUMMY },  /* por */
     [0xec] = MMX_OP2(paddsb),
     [0xed] = MMX_OP2(paddsw),
     [0xee] = MMX_OP2(pmaxsw),
-    [0xef] = MMX_OP2(pxor),
+    [0xef] = { SSE_DUMMY, SSE_DUMMY },  /* pxor */
     [0xf0] = { NULL, NULL, NULL, SSE_SPECIAL }, /* lddqu */
     [0xf1] = MMX_OP2(psllw),
     [0xf2] = MMX_OP2(pslld),
@@ -4535,6 +4535,35 @@ static void gen_sse(CPUX86State *env, DisasContext *s, int b,
             sse_fn_eppt = (SSEFunc_0_eppt)sse_fn_epp;
             sse_fn_eppt(cpu_env, s->ptr0, s->ptr1, s->A0);
             break;
+        case 0x54: /* andps, andpd */
+        case 0xdb: /* pand */
+            op1_offset += xmm_ofs;
+            op2_offset += xmm_ofs;
+            tcg_gen_gvec_and(MO_64, op1_offset, op1_offset,
+                             op2_offset, vec_len, vec_len);
+            break;
+        case 0x55: /* andnps, andnpd */
+        case 0xdf: /* pandn */
+            op1_offset += xmm_ofs;
+            op2_offset += xmm_ofs;
+            /* x86 inverts the first operand; tcg inverts the second. */
+            tcg_gen_gvec_andc(MO_64, op1_offset, op2_offset,
+                              op1_offset, vec_len, vec_len);
+            break;
+        case 0x56: /* orps, orpd */
+        case 0xeb: /* por */
+            op1_offset += xmm_ofs;
+            op2_offset += xmm_ofs;
+            tcg_gen_gvec_or(MO_64, op1_offset, op1_offset,
+                            op2_offset, vec_len, vec_len);
+            break;
+        case 0x57: /* xorps, xorpd */
+        case 0xef: /* pxor */
+            op1_offset += xmm_ofs;
+            op2_offset += xmm_ofs;
+            tcg_gen_gvec_xor(MO_64, op1_offset, op1_offset,
+                             op2_offset, vec_len, vec_len);
+            break;
         case 0x64: /* pcmpgtb */
         case 0x65: /* pcmpgtw */
         case 0x66: /* pcmpgtl */
-- 
2.34.1




* [PATCH 6/6] target/i386: Use tcg gvec ops for pmovmskb
  2022-08-22 22:37 [PATCH 0/6] target/i386: Host vector ops for high-usage SSE Richard Henderson
                   ` (4 preceding siblings ...)
  2022-08-22 22:37 ` [PATCH 5/6] target/i386: Use tcg gvec for pand, pandn, por, pxor Richard Henderson
@ 2022-08-22 22:37 ` Richard Henderson
  5 siblings, 0 replies; 7+ messages in thread
From: Richard Henderson @ 2022-08-22 22:37 UTC (permalink / raw)
  To: qemu-devel

As pmovmskb is used by strlen et al., this is the third-highest-overhead
SSE operation, at 0.8%.
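
The replacement gathers the 16 byte sign bits with a shift-and-or tree
instead of calling a helper (see the ASCII diagram in gen_pmovmskb_i64
below).  A standalone sketch of the 64-bit step, checked against a
naive bit-by-bit loop (function names are mine):

    #include <assert.h>
    #include <stdint.h>

    static unsigned gather_sketch(uint64_t s)
    {
        uint64_t d = s & 0x8080808080808080ULL;  /* keep per-byte sign bits */

        d |= d << 7;
        d |= d << 14;
        d |= d << 28;
        return d >> 56;                /* all 8 bits end up in the top byte */
    }

    static unsigned gather_naive(uint64_t s)
    {
        unsigned r = 0;

        for (int i = 0; i < 8; i++) {
            r |= ((s >> (i * 8 + 7)) & 1u) << i;
        }
        return r;
    }

    int main(void)
    {
        uint64_t tests[] = { 0, ~0ULL, 0x80ULL, 0x8000000000000000ULL,
                             0x0123456789abcdefULL, 0xfedcba9876543210ULL };

        for (unsigned i = 0; i < sizeof(tests) / sizeof(tests[0]); i++) {
            assert(gather_sketch(tests[i]) == gather_naive(tests[i]));
        }
        return 0;
    }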

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 target/i386/ops_sse.h        | 26 -----------
 target/i386/ops_sse_header.h |  1 -
 target/i386/tcg/translate.c  | 86 +++++++++++++++++++++++++++++++-----
 3 files changed, 74 insertions(+), 39 deletions(-)

diff --git a/target/i386/ops_sse.h b/target/i386/ops_sse.h
index b21f315f37..9f9801be63 100644
--- a/target/i386/ops_sse.h
+++ b/target/i386/ops_sse.h
@@ -1098,32 +1098,6 @@ uint32_t helper_movmskpd(CPUX86State *env, Reg *s)
 
 #endif
 
-uint32_t glue(helper_pmovmskb, SUFFIX)(CPUX86State *env, Reg *s)
-{
-    uint32_t val;
-
-    val = 0;
-    val |= (s->B(0) >> 7);
-    val |= (s->B(1) >> 6) & 0x02;
-    val |= (s->B(2) >> 5) & 0x04;
-    val |= (s->B(3) >> 4) & 0x08;
-    val |= (s->B(4) >> 3) & 0x10;
-    val |= (s->B(5) >> 2) & 0x20;
-    val |= (s->B(6) >> 1) & 0x40;
-    val |= (s->B(7)) & 0x80;
-#if SHIFT == 1
-    val |= (s->B(8) << 1) & 0x0100;
-    val |= (s->B(9) << 2) & 0x0200;
-    val |= (s->B(10) << 3) & 0x0400;
-    val |= (s->B(11) << 4) & 0x0800;
-    val |= (s->B(12) << 5) & 0x1000;
-    val |= (s->B(13) << 6) & 0x2000;
-    val |= (s->B(14) << 7) & 0x4000;
-    val |= (s->B(15) << 8) & 0x8000;
-#endif
-    return val;
-}
-
 void glue(helper_packsswb, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
 {
     Reg r;
diff --git a/target/i386/ops_sse_header.h b/target/i386/ops_sse_header.h
index 542701720e..d6bb10342c 100644
--- a/target/i386/ops_sse_header.h
+++ b/target/i386/ops_sse_header.h
@@ -201,7 +201,6 @@ DEF_HELPER_2(movmskps, i32, env, Reg)
 DEF_HELPER_2(movmskpd, i32, env, Reg)
 #endif
 
-DEF_HELPER_2(glue(pmovmskb, SUFFIX), i32, env, Reg)
 DEF_HELPER_3(glue(packsswb, SUFFIX), void, env, Reg, Reg)
 DEF_HELPER_3(glue(packuswb, SUFFIX), void, env, Reg, Reg)
 DEF_HELPER_3(glue(packssdw, SUFFIX), void, env, Reg, Reg)
diff --git a/target/i386/tcg/translate.c b/target/i386/tcg/translate.c
index d25d914d63..5829c702d6 100644
--- a/target/i386/tcg/translate.c
+++ b/target/i386/tcg/translate.c
@@ -2776,6 +2776,77 @@ static inline void gen_op_movq_env_0(DisasContext *s, int d_offset)
     tcg_gen_st_i64(s->tmp1_i64, cpu_env, d_offset);
 }
 
+static void gen_pmovmskb_i64(TCGv_i64 d, TCGv_i64 s)
+{
+    TCGv_i64 t = tcg_temp_new_i64();
+
+    tcg_gen_andi_i64(d, s, 0x8080808080808080ull);
+
+    /*
+     * After each shift+or pair:
+     * 0:  a.......b.......c.......d.......e.......f.......g.......h.......
+     * 7:  ab......bc......cd......de......ef......fg......gh......h.......
+     * 14: abcd....bcde....cdef....defg....efgh....fgh.....gh......h.......
+     * 28: abcdefghbcdefgh.cdefgh..defgh...efgh....fgh.....gh......h.......
+     * The result is left in the high bits of the word.
+     */
+    tcg_gen_shli_i64(t, d, 7);
+    tcg_gen_or_i64(d, d, t);
+    tcg_gen_shli_i64(t, d, 14);
+    tcg_gen_or_i64(d, d, t);
+    tcg_gen_shli_i64(t, d, 28);
+    tcg_gen_or_i64(d, d, t);
+}
+
+static void gen_pmovmskb_vec(unsigned vece, TCGv_vec d, TCGv_vec s)
+{
+    TCGv_vec t = tcg_temp_new_vec_matching(d);
+    TCGv_vec m = tcg_constant_vec_matching(d, MO_8, 0x80);
+
+    /* See above */
+    tcg_gen_and_vec(vece, d, s, m);
+    tcg_gen_shli_vec(vece, t, d, 7);
+    tcg_gen_or_vec(vece, d, d, t);
+    tcg_gen_shli_vec(vece, t, d, 14);
+    tcg_gen_or_vec(vece, d, d, t);
+    if (vece == MO_64) {
+        tcg_gen_shli_vec(vece, t, d, 28);
+        tcg_gen_or_vec(vece, d, d, t);
+    }
+}
+
+static void gen_gvec_pmovmskb(TCGv out, int s_reg, bool is_xmm)
+{
+    static const TCGOpcode vecop_list[] = { INDEX_op_shli_vec, 0 };
+    static const GVecGen2 g = {
+        .fni8 = gen_pmovmskb_i64,
+        .fniv = gen_pmovmskb_vec,
+        .opt_opc = vecop_list,
+        .vece = MO_64,
+        .prefer_i64 = TCG_TARGET_REG_BITS == 64
+    };
+
+    int s_ofs = (is_xmm
+                 ? offsetof(CPUX86State, xmm_regs[s_reg].ZMM_X(0))
+                 : offsetof(CPUX86State, fpregs[s_reg].mmx));
+    int d_ofs = (is_xmm
+                 ? offsetof(CPUX86State, xmm_t0.ZMM_X(0))
+                 : offsetof(CPUX86State, mmx_t0));
+    int vec_len = is_xmm ? 16 : 8;
+
+    tcg_gen_gvec_2(d_ofs, s_ofs, vec_len, vec_len, &g);
+
+    if (is_xmm) {
+        TCGv t = tcg_temp_new();
+        tcg_gen_ld8u_tl(t, cpu_env, d_ofs + offsetof(XMMReg, XMM_B(15)));
+        tcg_gen_ld8u_tl(out, cpu_env, d_ofs + offsetof(XMMReg, XMM_B(7)));
+        tcg_gen_deposit_tl(out, out, t, 8, TARGET_LONG_BITS - 8);
+        tcg_temp_free(t);
+    } else {
+        tcg_gen_ld8u_tl(out, cpu_env, d_ofs + offsetof(MMXReg, MMX_B(7)));
+    }
+}
+
 typedef void (*SSEFunc_i_ep)(TCGv_i32 val, TCGv_ptr env, TCGv_ptr reg);
 typedef void (*SSEFunc_l_ep)(TCGv_i64 val, TCGv_ptr env, TCGv_ptr reg);
 typedef void (*SSEFunc_0_epi)(TCGv_ptr env, TCGv_ptr reg, TCGv_i32 val);
@@ -3742,21 +3813,12 @@ static void gen_sse(CPUX86State *env, DisasContext *s, int b,
             break;
         case 0xd7: /* pmovmskb */
         case 0x1d7:
-            if (mod != 3)
+            if (mod != 3) {
                 goto illegal_op;
-            if (b1) {
-                rm = (modrm & 7) | REX_B(s);
-                tcg_gen_addi_ptr(s->ptr0, cpu_env,
-                                 offsetof(CPUX86State, xmm_regs[rm]));
-                gen_helper_pmovmskb_xmm(s->tmp2_i32, cpu_env, s->ptr0);
-            } else {
-                rm = (modrm & 7);
-                tcg_gen_addi_ptr(s->ptr0, cpu_env,
-                                 offsetof(CPUX86State, fpregs[rm].mmx));
-                gen_helper_pmovmskb_mmx(s->tmp2_i32, cpu_env, s->ptr0);
             }
+            rm = (modrm & 7) | (is_xmm ? REX_B(s) : 0);
             reg = ((modrm >> 3) & 7) | REX_R(s);
-            tcg_gen_extu_i32_tl(cpu_regs[reg], s->tmp2_i32);
+            gen_gvec_pmovmskb(cpu_regs[reg], rm, is_xmm);
             break;
 
         case 0x138:
-- 
2.34.1



