All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 00/20] tcg: vector improvements
@ 2021-12-18 19:42 Richard Henderson
  2021-12-18 19:42 ` [PATCH 01/20] tcg/optimize: Fix folding of vector ops Richard Henderson
                   ` (21 more replies)
  0 siblings, 22 replies; 51+ messages in thread
From: Richard Henderson @ 2021-12-18 19:42 UTC (permalink / raw)
  To: qemu-devel

Add some opcodes for compound logic operations that were so
far marked as TODO.  Implement those for PPC and S390X.

We do not want to implement 512-bit width operations, because
those trigger a cluster clock slowdown on the current set of
Intel cpus.  But there are new operations in avx512 that apply
to 128 and 256-bit vectors, which do not trigger the slowdown,
and those are very interesting.


r~


Richard Henderson (20):
  tcg/optimize: Fix folding of vector ops
  tcg: Add opcodes for vector nand, nor, eqv
  tcg/ppc: Implement vector NAND, NOR, EQV
  tcg/s390x: Implement vector NAND, NOR, EQV
  tcg/i386: Detect AVX512
  tcg/i386: Add tcg_out_evex_opc
  tcg/i386: Use tcg_can_emit_vec_op in expand_vec_cmp_noinv
  tcg/i386: Implement avx512 variable shifts
  tcg/i386: Implement avx512 scalar shift
  tcg/i386: Implement avx512 immediate sari shift
  tcg/i386: Implement avx512 immediate rotate
  tcg/i386: Implement avx512 variable rotate
  tcg/i386: Support avx512vbmi2 vector shift-double instructions
  tcg/i386: Expand vector word rotate as avx512vbmi2 shift-double
  tcg/i386: Remove rotls_vec from tcg_target_op_def
  tcg/i386: Expand scalar rotate with avx512 insns
  tcg/i386: Implement avx512 min/max/abs
  tcg/i386: Implement avx512 multiply
  tcg/i386: Implement more logical operations for avx512
  tcg/i386: Implement bitsel for avx512

 include/qemu/cpuid.h          |  20 +-
 include/tcg/tcg-opc.h         |   3 +
 include/tcg/tcg.h             |   3 +
 tcg/aarch64/tcg-target.h      |   3 +
 tcg/arm/tcg-target.h          |   3 +
 tcg/i386/tcg-target-con-set.h |   1 +
 tcg/i386/tcg-target.h         |  17 +-
 tcg/i386/tcg-target.opc.h     |   3 +
 tcg/ppc/tcg-target.h          |   3 +
 tcg/s390x/tcg-target.h        |   3 +
 tcg/optimize.c                |  61 ++++--
 tcg/tcg-op-vec.c              |  27 ++-
 tcg/tcg.c                     |   6 +
 tcg/i386/tcg-target.c.inc     | 386 ++++++++++++++++++++++++++++------
 tcg/ppc/tcg-target.c.inc      |  15 ++
 tcg/s390x/tcg-target.c.inc    |  17 ++
 16 files changed, 472 insertions(+), 99 deletions(-)

-- 
2.25.1



^ permalink raw reply	[flat|nested] 51+ messages in thread

* [PATCH 01/20] tcg/optimize: Fix folding of vector ops
  2021-12-18 19:42 [PATCH 00/20] tcg: vector improvements Richard Henderson
@ 2021-12-18 19:42 ` Richard Henderson
  2021-12-19 11:37   ` Philippe Mathieu-Daudé
  2021-12-18 19:42 ` [PATCH 02/20] tcg: Add opcodes for vector nand, nor, eqv Richard Henderson
                   ` (20 subsequent siblings)
  21 siblings, 1 reply; 51+ messages in thread
From: Richard Henderson @ 2021-12-18 19:42 UTC (permalink / raw)
  To: qemu-devel

Bitwise operations are easy to fold, because the operation is
identical regardless of element size.  But add and sub need
extra element size info that is not currently propagated.

Fixes: 2f9f08ba43d
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/optimize.c | 49 ++++++++++++++++++++++++++++++++++++++-----------
 1 file changed, 38 insertions(+), 11 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index 2397f2cf93..e573000951 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -308,13 +308,13 @@ static uint64_t do_constant_folding_2(TCGOpcode op, uint64_t x, uint64_t y)
     CASE_OP_32_64(mul):
         return x * y;
 
-    CASE_OP_32_64(and):
+    CASE_OP_32_64_VEC(and):
         return x & y;
 
-    CASE_OP_32_64(or):
+    CASE_OP_32_64_VEC(or):
         return x | y;
 
-    CASE_OP_32_64(xor):
+    CASE_OP_32_64_VEC(xor):
         return x ^ y;
 
     case INDEX_op_shl_i32:
@@ -347,16 +347,16 @@ static uint64_t do_constant_folding_2(TCGOpcode op, uint64_t x, uint64_t y)
     case INDEX_op_rotl_i64:
         return rol64(x, y & 63);
 
-    CASE_OP_32_64(not):
+    CASE_OP_32_64_VEC(not):
         return ~x;
 
     CASE_OP_32_64(neg):
         return -x;
 
-    CASE_OP_32_64(andc):
+    CASE_OP_32_64_VEC(andc):
         return x & ~y;
 
-    CASE_OP_32_64(orc):
+    CASE_OP_32_64_VEC(orc):
         return x | ~y;
 
     CASE_OP_32_64(eqv):
@@ -751,6 +751,12 @@ static bool fold_const2(OptContext *ctx, TCGOp *op)
     return false;
 }
 
+static bool fold_commutative(OptContext *ctx, TCGOp *op)
+{
+    swap_commutative(op->args[0], &op->args[1], &op->args[2]);
+    return false;
+}
+
 static bool fold_const2_commutative(OptContext *ctx, TCGOp *op)
 {
     swap_commutative(op->args[0], &op->args[1], &op->args[2]);
@@ -905,6 +911,16 @@ static bool fold_add(OptContext *ctx, TCGOp *op)
     return false;
 }
 
+/* We cannot as yet do_constant_folding with vectors. */
+static bool fold_add_vec(OptContext *ctx, TCGOp *op)
+{
+    if (fold_commutative(ctx, op) ||
+        fold_xi_to_x(ctx, op, 0)) {
+        return true;
+    }
+    return false;
+}
+
 static bool fold_addsub2(OptContext *ctx, TCGOp *op, bool add)
 {
     if (arg_is_const(op->args[2]) && arg_is_const(op->args[3]) &&
@@ -1938,10 +1954,10 @@ static bool fold_sub_to_neg(OptContext *ctx, TCGOp *op)
     return false;
 }
 
-static bool fold_sub(OptContext *ctx, TCGOp *op)
+/* We cannot as yet do_constant_folding with vectors. */
+static bool fold_sub_vec(OptContext *ctx, TCGOp *op)
 {
-    if (fold_const2(ctx, op) ||
-        fold_xx_to_i(ctx, op, 0) ||
+    if (fold_xx_to_i(ctx, op, 0) ||
         fold_xi_to_x(ctx, op, 0) ||
         fold_sub_to_neg(ctx, op)) {
         return true;
@@ -1949,6 +1965,11 @@ static bool fold_sub(OptContext *ctx, TCGOp *op)
     return false;
 }
 
+static bool fold_sub(OptContext *ctx, TCGOp *op)
+{
+    return fold_const2(ctx, op) || fold_sub_vec(ctx, op);
+}
+
 static bool fold_sub2(OptContext *ctx, TCGOp *op)
 {
     return fold_addsub2(ctx, op, false);
@@ -2052,9 +2073,12 @@ void tcg_optimize(TCGContext *s)
          * Sorted alphabetically by opcode as much as possible.
          */
         switch (opc) {
-        CASE_OP_32_64_VEC(add):
+        CASE_OP_32_64(add):
             done = fold_add(&ctx, op);
             break;
+        case INDEX_op_add_vec:
+            done = fold_add_vec(&ctx, op);
+            break;
         CASE_OP_32_64(add2):
             done = fold_add2(&ctx, op);
             break;
@@ -2193,9 +2217,12 @@ void tcg_optimize(TCGContext *s)
         CASE_OP_32_64(sextract):
             done = fold_sextract(&ctx, op);
             break;
-        CASE_OP_32_64_VEC(sub):
+        CASE_OP_32_64(sub):
             done = fold_sub(&ctx, op);
             break;
+        case INDEX_op_sub_vec:
+            done = fold_sub_vec(&ctx, op);
+            break;
         CASE_OP_32_64(sub2):
             done = fold_sub2(&ctx, op);
             break;
-- 
2.25.1



^ permalink raw reply related	[flat|nested] 51+ messages in thread

* [PATCH 02/20] tcg: Add opcodes for vector nand, nor, eqv
  2021-12-18 19:42 [PATCH 00/20] tcg: vector improvements Richard Henderson
  2021-12-18 19:42 ` [PATCH 01/20] tcg/optimize: Fix folding of vector ops Richard Henderson
@ 2021-12-18 19:42 ` Richard Henderson
  2021-12-19 11:28   ` Philippe Mathieu-Daudé
  2022-02-01 18:28   ` Alex Bennée
  2021-12-18 19:42 ` [PATCH 03/20] tcg/ppc: Implement vector NAND, NOR, EQV Richard Henderson
                   ` (19 subsequent siblings)
  21 siblings, 2 replies; 51+ messages in thread
From: Richard Henderson @ 2021-12-18 19:42 UTC (permalink / raw)
  To: qemu-devel

We've had placeholders for these opcodes for a while,
and should have support on ppc, s390x and avx512 hosts.

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 include/tcg/tcg-opc.h    |  3 +++
 include/tcg/tcg.h        |  3 +++
 tcg/aarch64/tcg-target.h |  3 +++
 tcg/arm/tcg-target.h     |  3 +++
 tcg/i386/tcg-target.h    |  3 +++
 tcg/ppc/tcg-target.h     |  3 +++
 tcg/s390x/tcg-target.h   |  3 +++
 tcg/optimize.c           | 12 ++++++------
 tcg/tcg-op-vec.c         | 27 ++++++++++++++++++---------
 tcg/tcg.c                |  6 ++++++
 10 files changed, 51 insertions(+), 15 deletions(-)

diff --git a/include/tcg/tcg-opc.h b/include/tcg/tcg-opc.h
index 675873e200..dd444734d9 100644
--- a/include/tcg/tcg-opc.h
+++ b/include/tcg/tcg-opc.h
@@ -245,6 +245,9 @@ DEF(or_vec, 1, 2, 0, IMPLVEC)
 DEF(xor_vec, 1, 2, 0, IMPLVEC)
 DEF(andc_vec, 1, 2, 0, IMPLVEC | IMPL(TCG_TARGET_HAS_andc_vec))
 DEF(orc_vec, 1, 2, 0, IMPLVEC | IMPL(TCG_TARGET_HAS_orc_vec))
+DEF(nand_vec, 1, 2, 0, IMPLVEC | IMPL(TCG_TARGET_HAS_nand_vec))
+DEF(nor_vec, 1, 2, 0, IMPLVEC | IMPL(TCG_TARGET_HAS_nor_vec))
+DEF(eqv_vec, 1, 2, 0, IMPLVEC | IMPL(TCG_TARGET_HAS_eqv_vec))
 DEF(not_vec, 1, 1, 0, IMPLVEC | IMPL(TCG_TARGET_HAS_not_vec))
 
 DEF(shli_vec, 1, 1, 1, IMPLVEC | IMPL(TCG_TARGET_HAS_shi_vec))
diff --git a/include/tcg/tcg.h b/include/tcg/tcg.h
index 42f5b500ed..30c8fe2b83 100644
--- a/include/tcg/tcg.h
+++ b/include/tcg/tcg.h
@@ -183,6 +183,9 @@ typedef uint64_t TCGRegSet;
 #define TCG_TARGET_HAS_not_vec          0
 #define TCG_TARGET_HAS_andc_vec         0
 #define TCG_TARGET_HAS_orc_vec          0
+#define TCG_TARGET_HAS_nand_vec         0
+#define TCG_TARGET_HAS_nor_vec          0
+#define TCG_TARGET_HAS_eqv_vec          0
 #define TCG_TARGET_HAS_roti_vec         0
 #define TCG_TARGET_HAS_rots_vec         0
 #define TCG_TARGET_HAS_rotv_vec         0
diff --git a/tcg/aarch64/tcg-target.h b/tcg/aarch64/tcg-target.h
index 7a93ac8023..1c669cd806 100644
--- a/tcg/aarch64/tcg-target.h
+++ b/tcg/aarch64/tcg-target.h
@@ -131,6 +131,9 @@ typedef enum {
 
 #define TCG_TARGET_HAS_andc_vec         1
 #define TCG_TARGET_HAS_orc_vec          1
+#define TCG_TARGET_HAS_nand_vec         0
+#define TCG_TARGET_HAS_nor_vec          0
+#define TCG_TARGET_HAS_eqv_vec          0
 #define TCG_TARGET_HAS_not_vec          1
 #define TCG_TARGET_HAS_neg_vec          1
 #define TCG_TARGET_HAS_abs_vec          1
diff --git a/tcg/arm/tcg-target.h b/tcg/arm/tcg-target.h
index f41b809554..a9f1b30436 100644
--- a/tcg/arm/tcg-target.h
+++ b/tcg/arm/tcg-target.h
@@ -132,6 +132,9 @@ extern bool use_neon_instructions;
 
 #define TCG_TARGET_HAS_andc_vec         1
 #define TCG_TARGET_HAS_orc_vec          1
+#define TCG_TARGET_HAS_nand_vec         0
+#define TCG_TARGET_HAS_nor_vec          0
+#define TCG_TARGET_HAS_eqv_vec          0
 #define TCG_TARGET_HAS_not_vec          1
 #define TCG_TARGET_HAS_neg_vec          1
 #define TCG_TARGET_HAS_abs_vec          1
diff --git a/tcg/i386/tcg-target.h b/tcg/i386/tcg-target.h
index b00a6da293..64c1013182 100644
--- a/tcg/i386/tcg-target.h
+++ b/tcg/i386/tcg-target.h
@@ -185,6 +185,9 @@ extern bool have_movbe;
 
 #define TCG_TARGET_HAS_andc_vec         1
 #define TCG_TARGET_HAS_orc_vec          0
+#define TCG_TARGET_HAS_nand_vec         0
+#define TCG_TARGET_HAS_nor_vec          0
+#define TCG_TARGET_HAS_eqv_vec          0
 #define TCG_TARGET_HAS_not_vec          0
 #define TCG_TARGET_HAS_neg_vec          0
 #define TCG_TARGET_HAS_abs_vec          1
diff --git a/tcg/ppc/tcg-target.h b/tcg/ppc/tcg-target.h
index 0943192cde..d4fd28c6b0 100644
--- a/tcg/ppc/tcg-target.h
+++ b/tcg/ppc/tcg-target.h
@@ -162,6 +162,9 @@ extern bool have_vsx;
 
 #define TCG_TARGET_HAS_andc_vec         1
 #define TCG_TARGET_HAS_orc_vec          have_isa_2_07
+#define TCG_TARGET_HAS_nand_vec         0
+#define TCG_TARGET_HAS_nor_vec          0
+#define TCG_TARGET_HAS_eqv_vec          0
 #define TCG_TARGET_HAS_not_vec          1
 #define TCG_TARGET_HAS_neg_vec          have_isa_3_00
 #define TCG_TARGET_HAS_abs_vec          0
diff --git a/tcg/s390x/tcg-target.h b/tcg/s390x/tcg-target.h
index 527ada0f63..ad29e62b16 100644
--- a/tcg/s390x/tcg-target.h
+++ b/tcg/s390x/tcg-target.h
@@ -145,6 +145,9 @@ extern uint64_t s390_facilities[3];
 
 #define TCG_TARGET_HAS_andc_vec       1
 #define TCG_TARGET_HAS_orc_vec        HAVE_FACILITY(VECTOR_ENH1)
+#define TCG_TARGET_HAS_nand_vec       0
+#define TCG_TARGET_HAS_nor_vec        0
+#define TCG_TARGET_HAS_eqv_vec        0
 #define TCG_TARGET_HAS_not_vec        1
 #define TCG_TARGET_HAS_neg_vec        1
 #define TCG_TARGET_HAS_abs_vec        1
diff --git a/tcg/optimize.c b/tcg/optimize.c
index e573000951..89a3396c9c 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -359,13 +359,13 @@ static uint64_t do_constant_folding_2(TCGOpcode op, uint64_t x, uint64_t y)
     CASE_OP_32_64_VEC(orc):
         return x | ~y;
 
-    CASE_OP_32_64(eqv):
+    CASE_OP_32_64_VEC(eqv):
         return ~(x ^ y);
 
-    CASE_OP_32_64(nand):
+    CASE_OP_32_64_VEC(nand):
         return ~(x & y);
 
-    CASE_OP_32_64(nor):
+    CASE_OP_32_64_VEC(nor):
         return ~(x | y);
 
     case INDEX_op_clz_i32:
@@ -2119,7 +2119,7 @@ void tcg_optimize(TCGContext *s)
         case INDEX_op_dup2_vec:
             done = fold_dup2(&ctx, op);
             break;
-        CASE_OP_32_64(eqv):
+        CASE_OP_32_64_VEC(eqv):
             done = fold_eqv(&ctx, op);
             break;
         CASE_OP_32_64(extract):
@@ -2170,13 +2170,13 @@ void tcg_optimize(TCGContext *s)
         CASE_OP_32_64(mulu2):
             done = fold_multiply2(&ctx, op);
             break;
-        CASE_OP_32_64(nand):
+        CASE_OP_32_64_VEC(nand):
             done = fold_nand(&ctx, op);
             break;
         CASE_OP_32_64(neg):
             done = fold_neg(&ctx, op);
             break;
-        CASE_OP_32_64(nor):
+        CASE_OP_32_64_VEC(nor):
             done = fold_nor(&ctx, op);
             break;
         CASE_OP_32_64_VEC(not):
diff --git a/tcg/tcg-op-vec.c b/tcg/tcg-op-vec.c
index faf30f9cdd..463dabf515 100644
--- a/tcg/tcg-op-vec.c
+++ b/tcg/tcg-op-vec.c
@@ -371,23 +371,32 @@ void tcg_gen_orc_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b)
 
 void tcg_gen_nand_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b)
 {
-    /* TODO: Add TCG_TARGET_HAS_nand_vec when adding a backend supports it. */
-    tcg_gen_and_vec(0, r, a, b);
-    tcg_gen_not_vec(0, r, r);
+    if (TCG_TARGET_HAS_nand_vec) {
+        vec_gen_op3(INDEX_op_nand_vec, 0, r, a, b);
+    } else {
+        tcg_gen_and_vec(0, r, a, b);
+        tcg_gen_not_vec(0, r, r);
+    }
 }
 
 void tcg_gen_nor_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b)
 {
-    /* TODO: Add TCG_TARGET_HAS_nor_vec when adding a backend supports it. */
-    tcg_gen_or_vec(0, r, a, b);
-    tcg_gen_not_vec(0, r, r);
+    if (TCG_TARGET_HAS_nor_vec) {
+        vec_gen_op3(INDEX_op_nor_vec, 0, r, a, b);
+    } else {
+        tcg_gen_or_vec(0, r, a, b);
+        tcg_gen_not_vec(0, r, r);
+    }
 }
 
 void tcg_gen_eqv_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b)
 {
-    /* TODO: Add TCG_TARGET_HAS_eqv_vec when adding a backend supports it. */
-    tcg_gen_xor_vec(0, r, a, b);
-    tcg_gen_not_vec(0, r, r);
+    if (TCG_TARGET_HAS_eqv_vec) {
+        vec_gen_op3(INDEX_op_eqv_vec, 0, r, a, b);
+    } else {
+        tcg_gen_xor_vec(0, r, a, b);
+        tcg_gen_not_vec(0, r, r);
+    }
 }
 
 static bool do_op2(unsigned vece, TCGv_vec r, TCGv_vec a, TCGOpcode opc)
diff --git a/tcg/tcg.c b/tcg/tcg.c
index 934aa8510b..fca23858d4 100644
--- a/tcg/tcg.c
+++ b/tcg/tcg.c
@@ -1406,6 +1406,12 @@ bool tcg_op_supported(TCGOpcode op)
         return have_vec && TCG_TARGET_HAS_andc_vec;
     case INDEX_op_orc_vec:
         return have_vec && TCG_TARGET_HAS_orc_vec;
+    case INDEX_op_nand_vec:
+        return have_vec && TCG_TARGET_HAS_nand_vec;
+    case INDEX_op_nor_vec:
+        return have_vec && TCG_TARGET_HAS_nor_vec;
+    case INDEX_op_eqv_vec:
+        return have_vec && TCG_TARGET_HAS_eqv_vec;
     case INDEX_op_mul_vec:
         return have_vec && TCG_TARGET_HAS_mul_vec;
     case INDEX_op_shli_vec:
-- 
2.25.1



^ permalink raw reply related	[flat|nested] 51+ messages in thread

* [PATCH 03/20] tcg/ppc: Implement vector NAND, NOR, EQV
  2021-12-18 19:42 [PATCH 00/20] tcg: vector improvements Richard Henderson
  2021-12-18 19:42 ` [PATCH 01/20] tcg/optimize: Fix folding of vector ops Richard Henderson
  2021-12-18 19:42 ` [PATCH 02/20] tcg: Add opcodes for vector nand, nor, eqv Richard Henderson
@ 2021-12-18 19:42 ` Richard Henderson
  2021-12-19  0:15   ` Philippe Mathieu-Daudé
  2022-02-01 18:29   ` Alex Bennée
  2021-12-18 19:42 ` [PATCH 04/20] tcg/s390x: " Richard Henderson
                   ` (18 subsequent siblings)
  21 siblings, 2 replies; 51+ messages in thread
From: Richard Henderson @ 2021-12-18 19:42 UTC (permalink / raw)
  To: qemu-devel

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/ppc/tcg-target.h     |  6 +++---
 tcg/ppc/tcg-target.c.inc | 15 +++++++++++++++
 2 files changed, 18 insertions(+), 3 deletions(-)

diff --git a/tcg/ppc/tcg-target.h b/tcg/ppc/tcg-target.h
index d4fd28c6b0..6a6bc3f480 100644
--- a/tcg/ppc/tcg-target.h
+++ b/tcg/ppc/tcg-target.h
@@ -162,9 +162,9 @@ extern bool have_vsx;
 
 #define TCG_TARGET_HAS_andc_vec         1
 #define TCG_TARGET_HAS_orc_vec          have_isa_2_07
-#define TCG_TARGET_HAS_nand_vec         0
-#define TCG_TARGET_HAS_nor_vec          0
-#define TCG_TARGET_HAS_eqv_vec          0
+#define TCG_TARGET_HAS_nand_vec         have_isa_2_07
+#define TCG_TARGET_HAS_nor_vec          1
+#define TCG_TARGET_HAS_eqv_vec          have_isa_2_07
 #define TCG_TARGET_HAS_not_vec          1
 #define TCG_TARGET_HAS_neg_vec          have_isa_3_00
 #define TCG_TARGET_HAS_abs_vec          0
diff --git a/tcg/ppc/tcg-target.c.inc b/tcg/ppc/tcg-target.c.inc
index 3e4ca2be88..01fd327eb9 100644
--- a/tcg/ppc/tcg-target.c.inc
+++ b/tcg/ppc/tcg-target.c.inc
@@ -3040,6 +3040,9 @@ int tcg_can_emit_vec_op(TCGOpcode opc, TCGType type, unsigned vece)
     case INDEX_op_xor_vec:
     case INDEX_op_andc_vec:
     case INDEX_op_not_vec:
+    case INDEX_op_nor_vec:
+    case INDEX_op_eqv_vec:
+    case INDEX_op_nand_vec:
         return 1;
     case INDEX_op_orc_vec:
         return have_isa_2_07;
@@ -3318,6 +3321,15 @@ static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc,
     case INDEX_op_orc_vec:
         insn = VORC;
         break;
+    case INDEX_op_nand_vec:
+        insn = VNAND;
+        break;
+    case INDEX_op_nor_vec:
+        insn = VNOR;
+        break;
+    case INDEX_op_eqv_vec:
+        insn = VEQV;
+        break;
 
     case INDEX_op_cmp_vec:
         switch (args[3]) {
@@ -3705,6 +3717,9 @@ static TCGConstraintSetIndex tcg_target_op_def(TCGOpcode op)
     case INDEX_op_xor_vec:
     case INDEX_op_andc_vec:
     case INDEX_op_orc_vec:
+    case INDEX_op_nor_vec:
+    case INDEX_op_eqv_vec:
+    case INDEX_op_nand_vec:
     case INDEX_op_cmp_vec:
     case INDEX_op_ssadd_vec:
     case INDEX_op_sssub_vec:
-- 
2.25.1



^ permalink raw reply related	[flat|nested] 51+ messages in thread

* [PATCH 04/20] tcg/s390x: Implement vector NAND, NOR, EQV
  2021-12-18 19:42 [PATCH 00/20] tcg: vector improvements Richard Henderson
                   ` (2 preceding siblings ...)
  2021-12-18 19:42 ` [PATCH 03/20] tcg/ppc: Implement vector NAND, NOR, EQV Richard Henderson
@ 2021-12-18 19:42 ` Richard Henderson
  2021-12-19  0:17   ` Philippe Mathieu-Daudé
                     ` (3 more replies)
  2021-12-18 19:42 ` [PATCH 05/20] tcg/i386: Detect AVX512 Richard Henderson
                   ` (17 subsequent siblings)
  21 siblings, 4 replies; 51+ messages in thread
From: Richard Henderson @ 2021-12-18 19:42 UTC (permalink / raw)
  To: qemu-devel

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/s390x/tcg-target.h     |  6 +++---
 tcg/s390x/tcg-target.c.inc | 17 +++++++++++++++++
 2 files changed, 20 insertions(+), 3 deletions(-)

diff --git a/tcg/s390x/tcg-target.h b/tcg/s390x/tcg-target.h
index ad29e62b16..fef227b0fe 100644
--- a/tcg/s390x/tcg-target.h
+++ b/tcg/s390x/tcg-target.h
@@ -145,9 +145,9 @@ extern uint64_t s390_facilities[3];
 
 #define TCG_TARGET_HAS_andc_vec       1
 #define TCG_TARGET_HAS_orc_vec        HAVE_FACILITY(VECTOR_ENH1)
-#define TCG_TARGET_HAS_nand_vec       0
-#define TCG_TARGET_HAS_nor_vec        0
-#define TCG_TARGET_HAS_eqv_vec        0
+#define TCG_TARGET_HAS_nand_vec       HAVE_FACILITY(VECTOR_ENH1)
+#define TCG_TARGET_HAS_nor_vec        1
+#define TCG_TARGET_HAS_eqv_vec        HAVE_FACILITY(VECTOR_ENH1)
 #define TCG_TARGET_HAS_not_vec        1
 #define TCG_TARGET_HAS_neg_vec        1
 #define TCG_TARGET_HAS_abs_vec        1
diff --git a/tcg/s390x/tcg-target.c.inc b/tcg/s390x/tcg-target.c.inc
index 57e803e339..5a90b892cb 100644
--- a/tcg/s390x/tcg-target.c.inc
+++ b/tcg/s390x/tcg-target.c.inc
@@ -288,7 +288,9 @@ typedef enum S390Opcode {
     VRRc_VMXL   = 0xe7fd,
     VRRc_VN     = 0xe768,
     VRRc_VNC    = 0xe769,
+    VRRc_VNN    = 0xe76e,
     VRRc_VNO    = 0xe76b,
+    VRRc_VNX    = 0xe76c,
     VRRc_VO     = 0xe76a,
     VRRc_VOC    = 0xe76f,
     VRRc_VPKS   = 0xe797,   /* we leave the m5 cs field 0 */
@@ -2750,6 +2752,15 @@ static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc,
     case INDEX_op_xor_vec:
         tcg_out_insn(s, VRRc, VX, a0, a1, a2, 0);
         break;
+    case INDEX_op_nand_vec:
+        tcg_out_insn(s, VRRc, VNN, a0, a1, a2, 0);
+        break;
+    case INDEX_op_nor_vec:
+        tcg_out_insn(s, VRRc, VNO, a0, a1, a2, 0);
+        break;
+    case INDEX_op_eqv_vec:
+        tcg_out_insn(s, VRRc, VNX, a0, a1, a2, 0);
+        break;
 
     case INDEX_op_shli_vec:
         tcg_out_insn(s, VRSa, VESL, a0, a2, TCG_REG_NONE, a1, vece);
@@ -2846,7 +2857,10 @@ int tcg_can_emit_vec_op(TCGOpcode opc, TCGType type, unsigned vece)
     case INDEX_op_and_vec:
     case INDEX_op_andc_vec:
     case INDEX_op_bitsel_vec:
+    case INDEX_op_eqv_vec:
+    case INDEX_op_nand_vec:
     case INDEX_op_neg_vec:
+    case INDEX_op_nor_vec:
     case INDEX_op_not_vec:
     case INDEX_op_or_vec:
     case INDEX_op_orc_vec:
@@ -3191,6 +3205,9 @@ static TCGConstraintSetIndex tcg_target_op_def(TCGOpcode op)
     case INDEX_op_or_vec:
     case INDEX_op_orc_vec:
     case INDEX_op_xor_vec:
+    case INDEX_op_nand_vec:
+    case INDEX_op_nor_vec:
+    case INDEX_op_eqv_vec:
     case INDEX_op_cmp_vec:
     case INDEX_op_mul_vec:
     case INDEX_op_rotlv_vec:
-- 
2.25.1



^ permalink raw reply related	[flat|nested] 51+ messages in thread

* [PATCH 05/20] tcg/i386: Detect AVX512
  2021-12-18 19:42 [PATCH 00/20] tcg: vector improvements Richard Henderson
                   ` (3 preceding siblings ...)
  2021-12-18 19:42 ` [PATCH 04/20] tcg/s390x: " Richard Henderson
@ 2021-12-18 19:42 ` Richard Henderson
  2022-02-01 18:41   ` Alex Bennée
  2021-12-18 19:42 ` [PATCH 06/20] tcg/i386: Add tcg_out_evex_opc Richard Henderson
                   ` (16 subsequent siblings)
  21 siblings, 1 reply; 51+ messages in thread
From: Richard Henderson @ 2021-12-18 19:42 UTC (permalink / raw)
  To: qemu-devel

There are some operation sizes in some subsets of AVX512 that
are missing from previous iterations of AVX.  Detect them.

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 include/qemu/cpuid.h      | 20 +++++++++++++++++---
 tcg/i386/tcg-target.h     |  4 ++++
 tcg/i386/tcg-target.c.inc | 24 ++++++++++++++++++++++--
 3 files changed, 43 insertions(+), 5 deletions(-)

diff --git a/include/qemu/cpuid.h b/include/qemu/cpuid.h
index 09fc245b91..7adb12d320 100644
--- a/include/qemu/cpuid.h
+++ b/include/qemu/cpuid.h
@@ -45,12 +45,26 @@
 #ifndef bit_AVX2
 #define bit_AVX2        (1 << 5)
 #endif
-#ifndef bit_AVX512F
-#define bit_AVX512F        (1 << 16)
-#endif
 #ifndef bit_BMI2
 #define bit_BMI2        (1 << 8)
 #endif
+#ifndef bit_AVX512F
+#define bit_AVX512F     (1 << 16)
+#endif
+#ifndef bit_AVX512DQ
+#define bit_AVX512DQ    (1 << 17)
+#endif
+#ifndef bit_AVX512BW
+#define bit_AVX512BW    (1 << 30)
+#endif
+#ifndef bit_AVX512VL
+#define bit_AVX512VL    (1u << 31)
+#endif
+
+/* Leaf 7, %ecx */
+#ifndef bit_AVX512VBMI2
+#define bit_AVX512VBMI2 (1 << 6)
+#endif
 
 /* Leaf 0x80000001, %ecx */
 #ifndef bit_LZCNT
diff --git a/tcg/i386/tcg-target.h b/tcg/i386/tcg-target.h
index 64c1013182..12d098ad6c 100644
--- a/tcg/i386/tcg-target.h
+++ b/tcg/i386/tcg-target.h
@@ -103,6 +103,10 @@ extern bool have_bmi1;
 extern bool have_popcnt;
 extern bool have_avx1;
 extern bool have_avx2;
+extern bool have_avx512bw;
+extern bool have_avx512dq;
+extern bool have_avx512vbmi2;
+extern bool have_avx512vl;
 extern bool have_movbe;
 
 /* optional instructions */
diff --git a/tcg/i386/tcg-target.c.inc b/tcg/i386/tcg-target.c.inc
index 84b109bb84..e266f937d6 100644
--- a/tcg/i386/tcg-target.c.inc
+++ b/tcg/i386/tcg-target.c.inc
@@ -170,6 +170,10 @@ bool have_bmi1;
 bool have_popcnt;
 bool have_avx1;
 bool have_avx2;
+bool have_avx512bw;
+bool have_avx512dq;
+bool have_avx512vbmi2;
+bool have_avx512vl;
 bool have_movbe;
 
 #ifdef CONFIG_CPUID_H
@@ -3746,12 +3750,12 @@ static void tcg_out_nop_fill(tcg_insn_unit *p, int count)
 static void tcg_target_init(TCGContext *s)
 {
 #ifdef CONFIG_CPUID_H
-    unsigned a, b, c, d, b7 = 0;
+    unsigned a, b, c, d, b7 = 0, c7 = 0;
     int max = __get_cpuid_max(0, 0);
 
     if (max >= 7) {
         /* BMI1 is available on AMD Piledriver and Intel Haswell CPUs.  */
-        __cpuid_count(7, 0, a, b7, c, d);
+        __cpuid_count(7, 0, a, b7, c7, d);
         have_bmi1 = (b7 & bit_BMI) != 0;
         have_bmi2 = (b7 & bit_BMI2) != 0;
     }
@@ -3781,6 +3785,22 @@ static void tcg_target_init(TCGContext *s)
             if ((xcrl & 6) == 6) {
                 have_avx1 = (c & bit_AVX) != 0;
                 have_avx2 = (b7 & bit_AVX2) != 0;
+
+                /*
+                 * There are interesting instructions in AVX512, so long
+                 * as we have AVX512VL, which indicates support for EVEX
+                 * on sizes smaller than 512 bits.  We are required to
+                 * check that OPMASK and all extended ZMM state are enabled
+                 * even if we're not using them -- the insns will fault.
+                 */
+                if ((xcrl & 0xe0) == 0xe0
+                    && (b7 & bit_AVX512F)
+                    && (b7 & bit_AVX512VL)) {
+                    have_avx512vl = true;
+                    have_avx512bw = (b7 & bit_AVX512BW) != 0;
+                    have_avx512dq = (b7 & bit_AVX512DQ) != 0;
+                    have_avx512vbmi2 = (c7 & bit_AVX512VBMI2) != 0;
+                }
             }
         }
     }
-- 
2.25.1



^ permalink raw reply related	[flat|nested] 51+ messages in thread

* [PATCH 06/20] tcg/i386: Add tcg_out_evex_opc
  2021-12-18 19:42 [PATCH 00/20] tcg: vector improvements Richard Henderson
                   ` (4 preceding siblings ...)
  2021-12-18 19:42 ` [PATCH 05/20] tcg/i386: Detect AVX512 Richard Henderson
@ 2021-12-18 19:42 ` Richard Henderson
  2022-02-01 19:20   ` Alex Bennée
  2021-12-18 19:42 ` [PATCH 07/20] tcg/i386: Use tcg_can_emit_vec_op in expand_vec_cmp_noinv Richard Henderson
                   ` (15 subsequent siblings)
  21 siblings, 1 reply; 51+ messages in thread
From: Richard Henderson @ 2021-12-18 19:42 UTC (permalink / raw)
  To: qemu-devel

The evex encoding is added here, for use in a subsequent patch.

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/i386/tcg-target.c.inc | 51 ++++++++++++++++++++++++++++++++++++++-
 1 file changed, 50 insertions(+), 1 deletion(-)

diff --git a/tcg/i386/tcg-target.c.inc b/tcg/i386/tcg-target.c.inc
index e266f937d6..44d2919047 100644
--- a/tcg/i386/tcg-target.c.inc
+++ b/tcg/i386/tcg-target.c.inc
@@ -261,6 +261,7 @@ static bool tcg_target_const_match(int64_t val, TCGType type, int ct)
 #define P_SIMDF3        0x20000         /* 0xf3 opcode prefix */
 #define P_SIMDF2        0x40000         /* 0xf2 opcode prefix */
 #define P_VEXL          0x80000         /* Set VEX.L = 1 */
+#define P_EVEX          0x100000        /* Requires EVEX encoding */
 
 #define OPC_ARITH_EvIz	(0x81)
 #define OPC_ARITH_EvIb	(0x83)
@@ -623,9 +624,57 @@ static void tcg_out_vex_opc(TCGContext *s, int opc, int r, int v,
     tcg_out8(s, opc);
 }
 
+static void tcg_out_evex_opc(TCGContext *s, int opc, int r, int v,
+                             int rm, int index)
+{
+    /* The entire 4-byte evex prefix; with R' and V' set. */
+    uint32_t p = 0x08041062;
+    int mm, pp;
+
+    tcg_debug_assert(have_avx512vl);
+
+    /* EVEX.mm */
+    if (opc & P_EXT3A) {
+        mm = 3;
+    } else if (opc & P_EXT38) {
+        mm = 2;
+    } else if (opc & P_EXT) {
+        mm = 1;
+    } else {
+        g_assert_not_reached();
+    }
+
+    /* EVEX.pp */
+    if (opc & P_DATA16) {
+        pp = 1;                          /* 0x66 */
+    } else if (opc & P_SIMDF3) {
+        pp = 2;                          /* 0xf3 */
+    } else if (opc & P_SIMDF2) {
+        pp = 3;                          /* 0xf2 */
+    } else {
+        pp = 0;
+    }
+
+    p = deposit32(p, 8, 2, mm);
+    p = deposit32(p, 13, 1, (rm & 8) == 0);             /* EVEX.RXB.B */
+    p = deposit32(p, 14, 1, (index & 8) == 0);          /* EVEX.RXB.X */
+    p = deposit32(p, 15, 1, (r & 8) == 0);              /* EVEX.RXB.R */
+    p = deposit32(p, 16, 2, pp);
+    p = deposit32(p, 19, 4, ~v);
+    p = deposit32(p, 23, 1, (opc & P_VEXW) != 0);
+    p = deposit32(p, 29, 2, (opc & P_VEXL) != 0);
+
+    tcg_out32(s, p);
+    tcg_out8(s, opc);
+}
+
 static void tcg_out_vex_modrm(TCGContext *s, int opc, int r, int v, int rm)
 {
-    tcg_out_vex_opc(s, opc, r, v, rm, 0);
+    if (opc & P_EVEX) {
+        tcg_out_evex_opc(s, opc, r, v, rm, 0);
+    } else {
+        tcg_out_vex_opc(s, opc, r, v, rm, 0);
+    }
     tcg_out8(s, 0xc0 | (LOWREGMASK(r) << 3) | LOWREGMASK(rm));
 }
 
-- 
2.25.1



^ permalink raw reply related	[flat|nested] 51+ messages in thread

* [PATCH 07/20] tcg/i386: Use tcg_can_emit_vec_op in expand_vec_cmp_noinv
  2021-12-18 19:42 [PATCH 00/20] tcg: vector improvements Richard Henderson
                   ` (5 preceding siblings ...)
  2021-12-18 19:42 ` [PATCH 06/20] tcg/i386: Add tcg_out_evex_opc Richard Henderson
@ 2021-12-18 19:42 ` Richard Henderson
  2022-02-01 19:21   ` Alex Bennée
  2021-12-18 19:42 ` [PATCH 08/20] tcg/i386: Implement avx512 variable shifts Richard Henderson
                   ` (14 subsequent siblings)
  21 siblings, 1 reply; 51+ messages in thread
From: Richard Henderson @ 2021-12-18 19:42 UTC (permalink / raw)
  To: qemu-devel

The condition for UMIN/UMAX availability is about to change;
use the canonical version.

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/i386/tcg-target.c.inc | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/tcg/i386/tcg-target.c.inc b/tcg/i386/tcg-target.c.inc
index 44d2919047..316e550b38 100644
--- a/tcg/i386/tcg-target.c.inc
+++ b/tcg/i386/tcg-target.c.inc
@@ -3527,28 +3527,28 @@ static bool expand_vec_cmp_noinv(TCGType type, unsigned vece, TCGv_vec v0,
         fixup = NEED_SWAP | NEED_INV;
         break;
     case TCG_COND_LEU:
-        if (vece <= MO_32) {
+        if (tcg_can_emit_vec_op(INDEX_op_umin_vec, type, vece)) {
             fixup = NEED_UMIN;
         } else {
             fixup = NEED_BIAS | NEED_INV;
         }
         break;
     case TCG_COND_GTU:
-        if (vece <= MO_32) {
+        if (tcg_can_emit_vec_op(INDEX_op_umin_vec, type, vece)) {
             fixup = NEED_UMIN | NEED_INV;
         } else {
             fixup = NEED_BIAS;
         }
         break;
     case TCG_COND_GEU:
-        if (vece <= MO_32) {
+        if (tcg_can_emit_vec_op(INDEX_op_umax_vec, type, vece)) {
             fixup = NEED_UMAX;
         } else {
             fixup = NEED_BIAS | NEED_SWAP | NEED_INV;
         }
         break;
     case TCG_COND_LTU:
-        if (vece <= MO_32) {
+        if (tcg_can_emit_vec_op(INDEX_op_umax_vec, type, vece)) {
             fixup = NEED_UMAX | NEED_INV;
         } else {
             fixup = NEED_BIAS | NEED_SWAP;
-- 
2.25.1



^ permalink raw reply related	[flat|nested] 51+ messages in thread

* [PATCH 08/20] tcg/i386: Implement avx512 variable shifts
  2021-12-18 19:42 [PATCH 00/20] tcg: vector improvements Richard Henderson
                   ` (6 preceding siblings ...)
  2021-12-18 19:42 ` [PATCH 07/20] tcg/i386: Use tcg_can_emit_vec_op in expand_vec_cmp_noinv Richard Henderson
@ 2021-12-18 19:42 ` Richard Henderson
  2022-02-01 20:33   ` Alex Bennée
  2021-12-18 19:42 ` [PATCH 09/20] tcg/i386: Implement avx512 scalar shift Richard Henderson
                   ` (13 subsequent siblings)
  21 siblings, 1 reply; 51+ messages in thread
From: Richard Henderson @ 2021-12-18 19:42 UTC (permalink / raw)
  To: qemu-devel

AVX512VL has VPSRAVQ, and
AVX512BW has VPSLLVW, VPSRAVW, VPSRLVW.

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/i386/tcg-target.c.inc | 32 ++++++++++++++++++++++++--------
 1 file changed, 24 insertions(+), 8 deletions(-)

diff --git a/tcg/i386/tcg-target.c.inc b/tcg/i386/tcg-target.c.inc
index 316e550b38..7b9302fcc2 100644
--- a/tcg/i386/tcg-target.c.inc
+++ b/tcg/i386/tcg-target.c.inc
@@ -418,9 +418,13 @@ static bool tcg_target_const_match(int64_t val, TCGType type, int ct)
 #define OPC_VPBROADCASTQ (0x59 | P_EXT38 | P_DATA16)
 #define OPC_VPERMQ      (0x00 | P_EXT3A | P_DATA16 | P_VEXW)
 #define OPC_VPERM2I128  (0x46 | P_EXT3A | P_DATA16 | P_VEXL)
+#define OPC_VPSLLVW     (0x12 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
 #define OPC_VPSLLVD     (0x47 | P_EXT38 | P_DATA16)
 #define OPC_VPSLLVQ     (0x47 | P_EXT38 | P_DATA16 | P_VEXW)
+#define OPC_VPSRAVW     (0x11 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
 #define OPC_VPSRAVD     (0x46 | P_EXT38 | P_DATA16)
+#define OPC_VPSRAVQ     (0x46 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
+#define OPC_VPSRLVW     (0x10 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
 #define OPC_VPSRLVD     (0x45 | P_EXT38 | P_DATA16)
 #define OPC_VPSRLVQ     (0x45 | P_EXT38 | P_DATA16 | P_VEXW)
 #define OPC_VZEROUPPER  (0x77 | P_EXT)
@@ -2742,16 +2746,13 @@ static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc,
         OPC_PMAXUB, OPC_PMAXUW, OPC_PMAXUD, OPC_UD2
     };
     static int const shlv_insn[4] = {
-        /* TODO: AVX512 adds support for MO_16.  */
-        OPC_UD2, OPC_UD2, OPC_VPSLLVD, OPC_VPSLLVQ
+        OPC_UD2, OPC_VPSLLVW, OPC_VPSLLVD, OPC_VPSLLVQ
     };
     static int const shrv_insn[4] = {
-        /* TODO: AVX512 adds support for MO_16.  */
-        OPC_UD2, OPC_UD2, OPC_VPSRLVD, OPC_VPSRLVQ
+        OPC_UD2, OPC_VPSRLVW, OPC_VPSRLVD, OPC_VPSRLVQ
     };
     static int const sarv_insn[4] = {
-        /* TODO: AVX512 adds support for MO_16, MO_64.  */
-        OPC_UD2, OPC_UD2, OPC_VPSRAVD, OPC_UD2
+        OPC_UD2, OPC_VPSRAVW, OPC_VPSRAVD, OPC_VPSRAVQ
     };
     static int const shls_insn[4] = {
         OPC_UD2, OPC_PSLLW, OPC_PSLLD, OPC_PSLLQ
@@ -3242,9 +3243,24 @@ int tcg_can_emit_vec_op(TCGOpcode opc, TCGType type, unsigned vece)
 
     case INDEX_op_shlv_vec:
     case INDEX_op_shrv_vec:
-        return have_avx2 && vece >= MO_32;
+        switch (vece) {
+        case MO_16:
+            return have_avx512bw;
+        case MO_32:
+        case MO_64:
+            return have_avx2;
+        }
+        return 0;
     case INDEX_op_sarv_vec:
-        return have_avx2 && vece == MO_32;
+        switch (vece) {
+        case MO_16:
+            return have_avx512bw;
+        case MO_32:
+            return have_avx2;
+        case MO_64:
+            return have_avx512vl;
+        }
+        return 0;
     case INDEX_op_rotlv_vec:
     case INDEX_op_rotrv_vec:
         return have_avx2 && vece >= MO_32 ? -1 : 0;
-- 
2.25.1



^ permalink raw reply related	[flat|nested] 51+ messages in thread

* [PATCH 09/20] tcg/i386: Implement avx512 scalar shift
  2021-12-18 19:42 [PATCH 00/20] tcg: vector improvements Richard Henderson
                   ` (7 preceding siblings ...)
  2021-12-18 19:42 ` [PATCH 08/20] tcg/i386: Implement avx512 variable shifts Richard Henderson
@ 2021-12-18 19:42 ` Richard Henderson
  2022-02-02 13:48   ` Alex Bennée
  2021-12-18 19:42 ` [PATCH 10/20] tcg/i386: Implement avx512 immediate sari shift Richard Henderson
                   ` (12 subsequent siblings)
  21 siblings, 1 reply; 51+ messages in thread
From: Richard Henderson @ 2021-12-18 19:42 UTC (permalink / raw)
  To: qemu-devel

AVX512VL has VPSRAQ.

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/i386/tcg-target.c.inc | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/tcg/i386/tcg-target.c.inc b/tcg/i386/tcg-target.c.inc
index 7b9302fcc2..69481c188c 100644
--- a/tcg/i386/tcg-target.c.inc
+++ b/tcg/i386/tcg-target.c.inc
@@ -368,6 +368,7 @@ static bool tcg_target_const_match(int64_t val, TCGType type, int ct)
 #define OPC_PSLLQ       (0xf3 | P_EXT | P_DATA16)
 #define OPC_PSRAW       (0xe1 | P_EXT | P_DATA16)
 #define OPC_PSRAD       (0xe2 | P_EXT | P_DATA16)
+#define OPC_VPSRAQ      (0x72 | P_EXT | P_DATA16 | P_VEXW | P_EVEX)
 #define OPC_PSRLW       (0xd1 | P_EXT | P_DATA16)
 #define OPC_PSRLD       (0xd2 | P_EXT | P_DATA16)
 #define OPC_PSRLQ       (0xd3 | P_EXT | P_DATA16)
@@ -2761,7 +2762,7 @@ static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc,
         OPC_UD2, OPC_PSRLW, OPC_PSRLD, OPC_PSRLQ
     };
     static int const sars_insn[4] = {
-        OPC_UD2, OPC_PSRAW, OPC_PSRAD, OPC_UD2
+        OPC_UD2, OPC_PSRAW, OPC_PSRAD, OPC_VPSRAQ
     };
     static int const abs_insn[4] = {
         /* TODO: AVX512 adds support for MO_64.  */
@@ -3237,7 +3238,14 @@ int tcg_can_emit_vec_op(TCGOpcode opc, TCGType type, unsigned vece)
     case INDEX_op_shrs_vec:
         return vece >= MO_16;
     case INDEX_op_sars_vec:
-        return vece >= MO_16 && vece <= MO_32;
+        switch (vece) {
+        case MO_16:
+        case MO_32:
+            return 1;
+        case MO_64:
+            return have_avx512vl;
+        }
+        return 0;
     case INDEX_op_rotls_vec:
         return vece >= MO_16 ? -1 : 0;
 
-- 
2.25.1



^ permalink raw reply related	[flat|nested] 51+ messages in thread

* [PATCH 10/20] tcg/i386: Implement avx512 immediate sari shift
  2021-12-18 19:42 [PATCH 00/20] tcg: vector improvements Richard Henderson
                   ` (8 preceding siblings ...)
  2021-12-18 19:42 ` [PATCH 09/20] tcg/i386: Implement avx512 scalar shift Richard Henderson
@ 2021-12-18 19:42 ` Richard Henderson
  2022-02-02 14:02   ` Alex Bennée
  2021-12-18 19:42 ` [PATCH 11/20] tcg/i386: Implement avx512 immediate rotate Richard Henderson
                   ` (11 subsequent siblings)
  21 siblings, 1 reply; 51+ messages in thread
From: Richard Henderson @ 2021-12-18 19:42 UTC (permalink / raw)
  To: qemu-devel

AVX512VL has VPSRAQ with an immediate operand, in the same form as
the AVX immediate shifts, but it requires EVEX encoding and W1.

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/i386/tcg-target.c.inc | 29 ++++++++++++++++++++---------
 1 file changed, 20 insertions(+), 9 deletions(-)

diff --git a/tcg/i386/tcg-target.c.inc b/tcg/i386/tcg-target.c.inc
index 69481c188c..c4e6f2e5ea 100644
--- a/tcg/i386/tcg-target.c.inc
+++ b/tcg/i386/tcg-target.c.inc
@@ -2893,17 +2893,21 @@ static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc,
         break;
 
     case INDEX_op_shli_vec:
+        insn = shift_imm_insn[vece];
         sub = 6;
         goto gen_shift;
     case INDEX_op_shri_vec:
+        insn = shift_imm_insn[vece];
         sub = 2;
         goto gen_shift;
     case INDEX_op_sari_vec:
-        tcg_debug_assert(vece != MO_64);
+        insn = shift_imm_insn[vece];
+        if (vece == MO_64) {
+            insn |= P_VEXW | P_EVEX;
+        }
         sub = 4;
     gen_shift:
         tcg_debug_assert(vece != MO_8);
-        insn = shift_imm_insn[vece];
         if (type == TCG_TYPE_V256) {
             insn |= P_VEXL;
         }
@@ -3223,16 +3227,23 @@ int tcg_can_emit_vec_op(TCGOpcode opc, TCGType type, unsigned vece)
         return vece == MO_8 ? -1 : 1;
 
     case INDEX_op_sari_vec:
-        /* We must expand the operation for MO_8.  */
-        if (vece == MO_8) {
+        switch (vece) {
+        case MO_8:
             return -1;
-        }
-        /* We can emulate this for MO_64, but it does not pay off
-           unless we're producing at least 4 values.  */
-        if (vece == MO_64) {
+        case MO_16:
+        case MO_32:
+            return 1;
+        case MO_64:
+            if (have_avx512vl) {
+                return 1;
+            }
+            /*
+             * We can emulate this for MO_64, but it does not pay off
+             * unless we're producing at least 4 values.
+             */
             return type >= TCG_TYPE_V256 ? -1 : 0;
         }
-        return 1;
+        return 0;
 
     case INDEX_op_shls_vec:
     case INDEX_op_shrs_vec:
-- 
2.25.1



^ permalink raw reply related	[flat|nested] 51+ messages in thread

* [PATCH 11/20] tcg/i386: Implement avx512 immediate rotate
  2021-12-18 19:42 [PATCH 00/20] tcg: vector improvements Richard Henderson
                   ` (9 preceding siblings ...)
  2021-12-18 19:42 ` [PATCH 10/20] tcg/i386: Implement avx512 immediate sari shift Richard Henderson
@ 2021-12-18 19:42 ` Richard Henderson
  2022-02-02 14:05   ` Alex Bennée
  2021-12-18 19:42 ` [PATCH 12/20] tcg/i386: Implement avx512 variable rotate Richard Henderson
                   ` (10 subsequent siblings)
  21 siblings, 1 reply; 51+ messages in thread
From: Richard Henderson @ 2021-12-18 19:42 UTC (permalink / raw)
  To: qemu-devel

AVX512VL has VPROLD and VPROLQ, layered onto the same opcode as
PSHIFTD (as /1), but they require EVEX encoding, plus W1 for VPROLQ.

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/i386/tcg-target.h     |  2 +-
 tcg/i386/tcg-target.c.inc | 15 +++++++++++++--
 2 files changed, 14 insertions(+), 3 deletions(-)

diff --git a/tcg/i386/tcg-target.h b/tcg/i386/tcg-target.h
index 12d098ad6c..38c09fd66c 100644
--- a/tcg/i386/tcg-target.h
+++ b/tcg/i386/tcg-target.h
@@ -195,7 +195,7 @@ extern bool have_movbe;
 #define TCG_TARGET_HAS_not_vec          0
 #define TCG_TARGET_HAS_neg_vec          0
 #define TCG_TARGET_HAS_abs_vec          1
-#define TCG_TARGET_HAS_roti_vec         0
+#define TCG_TARGET_HAS_roti_vec         have_avx512vl
 #define TCG_TARGET_HAS_rots_vec         0
 #define TCG_TARGET_HAS_rotv_vec         0
 #define TCG_TARGET_HAS_shi_vec          1
diff --git a/tcg/i386/tcg-target.c.inc b/tcg/i386/tcg-target.c.inc
index c4e6f2e5ea..5ab7c4c0fa 100644
--- a/tcg/i386/tcg-target.c.inc
+++ b/tcg/i386/tcg-target.c.inc
@@ -361,7 +361,7 @@ static bool tcg_target_const_match(int64_t val, TCGType type, int ct)
 #define OPC_PSHUFLW     (0x70 | P_EXT | P_SIMDF2)
 #define OPC_PSHUFHW     (0x70 | P_EXT | P_SIMDF3)
 #define OPC_PSHIFTW_Ib  (0x71 | P_EXT | P_DATA16) /* /2 /6 /4 */
-#define OPC_PSHIFTD_Ib  (0x72 | P_EXT | P_DATA16) /* /2 /6 /4 */
+#define OPC_PSHIFTD_Ib  (0x72 | P_EXT | P_DATA16) /* /1 /2 /6 /4 */
 #define OPC_PSHIFTQ_Ib  (0x73 | P_EXT | P_DATA16) /* /2 /6 /4 */
 #define OPC_PSLLW       (0xf1 | P_EXT | P_DATA16)
 #define OPC_PSLLD       (0xf2 | P_EXT | P_DATA16)
@@ -2906,6 +2906,14 @@ static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc,
             insn |= P_VEXW | P_EVEX;
         }
         sub = 4;
+        goto gen_shift;
+    case INDEX_op_rotli_vec:
+        insn = OPC_PSHIFTD_Ib | P_EVEX;  /* VPROL[DQ] */
+        if (vece == MO_64) {
+            insn |= P_VEXW;
+        }
+        sub = 1;
+        goto gen_shift;
     gen_shift:
         tcg_debug_assert(vece != MO_8);
         if (type == TCG_TYPE_V256) {
@@ -3195,6 +3203,7 @@ static TCGConstraintSetIndex tcg_target_op_def(TCGOpcode op)
     case INDEX_op_shli_vec:
     case INDEX_op_shri_vec:
     case INDEX_op_sari_vec:
+    case INDEX_op_rotli_vec:
     case INDEX_op_x86_psrldq_vec:
         return C_O1_I1(x, x);
 
@@ -3216,11 +3225,13 @@ int tcg_can_emit_vec_op(TCGOpcode opc, TCGType type, unsigned vece)
     case INDEX_op_xor_vec:
     case INDEX_op_andc_vec:
         return 1;
-    case INDEX_op_rotli_vec:
     case INDEX_op_cmp_vec:
     case INDEX_op_cmpsel_vec:
         return -1;
 
+    case INDEX_op_rotli_vec:
+        return have_avx512vl && vece >= MO_32 ? 1 : -1;
+
     case INDEX_op_shli_vec:
     case INDEX_op_shri_vec:
         /* We must expand the operation for MO_8.  */
-- 
2.25.1



^ permalink raw reply related	[flat|nested] 51+ messages in thread

* [PATCH 12/20] tcg/i386: Implement avx512 variable rotate
  2021-12-18 19:42 [PATCH 00/20] tcg: vector improvements Richard Henderson
                   ` (10 preceding siblings ...)
  2021-12-18 19:42 ` [PATCH 11/20] tcg/i386: Implement avx512 immediate rotate Richard Henderson
@ 2021-12-18 19:42 ` Richard Henderson
  2022-02-02 14:14   ` Alex Bennée
  2021-12-18 19:42 ` [PATCH 13/20] tcg/i386: Support avx512vbmi2 vector shift-double instructions Richard Henderson
                   ` (9 subsequent siblings)
  21 siblings, 1 reply; 51+ messages in thread
From: Richard Henderson @ 2021-12-18 19:42 UTC (permalink / raw)
  To: qemu-devel

AVX512VL has VPROLVD, VPROLVQ, VPRORVD and VPRORVQ.

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/i386/tcg-target.h     |  2 +-
 tcg/i386/tcg-target.c.inc | 25 ++++++++++++++++++++++++-
 2 files changed, 25 insertions(+), 2 deletions(-)

diff --git a/tcg/i386/tcg-target.h b/tcg/i386/tcg-target.h
index 38c09fd66c..841b1febab 100644
--- a/tcg/i386/tcg-target.h
+++ b/tcg/i386/tcg-target.h
@@ -197,7 +197,7 @@ extern bool have_movbe;
 #define TCG_TARGET_HAS_abs_vec          1
 #define TCG_TARGET_HAS_roti_vec         have_avx512vl
 #define TCG_TARGET_HAS_rots_vec         0
-#define TCG_TARGET_HAS_rotv_vec         0
+#define TCG_TARGET_HAS_rotv_vec         have_avx512vl
 #define TCG_TARGET_HAS_shi_vec          1
 #define TCG_TARGET_HAS_shs_vec          1
 #define TCG_TARGET_HAS_shv_vec          have_avx2
diff --git a/tcg/i386/tcg-target.c.inc b/tcg/i386/tcg-target.c.inc
index 5ab7c4c0fa..7fd6edb887 100644
--- a/tcg/i386/tcg-target.c.inc
+++ b/tcg/i386/tcg-target.c.inc
@@ -419,6 +419,10 @@ static bool tcg_target_const_match(int64_t val, TCGType type, int ct)
 #define OPC_VPBROADCASTQ (0x59 | P_EXT38 | P_DATA16)
 #define OPC_VPERMQ      (0x00 | P_EXT3A | P_DATA16 | P_VEXW)
 #define OPC_VPERM2I128  (0x46 | P_EXT3A | P_DATA16 | P_VEXL)
+#define OPC_VPROLVD     (0x15 | P_EXT38 | P_DATA16 | P_EVEX)
+#define OPC_VPROLVQ     (0x15 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
+#define OPC_VPRORVD     (0x14 | P_EXT38 | P_DATA16 | P_EVEX)
+#define OPC_VPRORVQ     (0x14 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
 #define OPC_VPSLLVW     (0x12 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
 #define OPC_VPSLLVD     (0x47 | P_EXT38 | P_DATA16)
 #define OPC_VPSLLVQ     (0x47 | P_EXT38 | P_DATA16 | P_VEXW)
@@ -2746,6 +2750,12 @@ static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc,
     static int const umax_insn[4] = {
         OPC_PMAXUB, OPC_PMAXUW, OPC_PMAXUD, OPC_UD2
     };
+    static int const rotlv_insn[4] = {
+        OPC_UD2, OPC_UD2, OPC_VPROLVD, OPC_VPROLVQ
+    };
+    static int const rotrv_insn[4] = {
+        OPC_UD2, OPC_UD2, OPC_VPRORVD, OPC_VPRORVQ
+    };
     static int const shlv_insn[4] = {
         OPC_UD2, OPC_VPSLLVW, OPC_VPSLLVD, OPC_VPSLLVQ
     };
@@ -2829,6 +2839,12 @@ static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc,
     case INDEX_op_sarv_vec:
         insn = sarv_insn[vece];
         goto gen_simd;
+    case INDEX_op_rotlv_vec:
+        insn = rotlv_insn[vece];
+        goto gen_simd;
+    case INDEX_op_rotrv_vec:
+        insn = rotrv_insn[vece];
+        goto gen_simd;
     case INDEX_op_shls_vec:
         insn = shls_insn[vece];
         goto gen_simd;
@@ -3181,6 +3197,8 @@ static TCGConstraintSetIndex tcg_target_op_def(TCGOpcode op)
     case INDEX_op_shlv_vec:
     case INDEX_op_shrv_vec:
     case INDEX_op_sarv_vec:
+    case INDEX_op_rotlv_vec:
+    case INDEX_op_rotrv_vec:
     case INDEX_op_shls_vec:
     case INDEX_op_shrs_vec:
     case INDEX_op_sars_vec:
@@ -3293,7 +3311,12 @@ int tcg_can_emit_vec_op(TCGOpcode opc, TCGType type, unsigned vece)
         return 0;
     case INDEX_op_rotlv_vec:
     case INDEX_op_rotrv_vec:
-        return have_avx2 && vece >= MO_32 ? -1 : 0;
+        switch (vece) {
+        case MO_32:
+        case MO_64:
+            return have_avx512vl ? 1 : have_avx2 ? -1 : 0;
+        }
+        return 0;
 
     case INDEX_op_mul_vec:
         if (vece == MO_8) {
-- 
2.25.1



^ permalink raw reply related	[flat|nested] 51+ messages in thread

* [PATCH 13/20] tcg/i386: Support avx512vbmi2 vector shift-double instructions
  2021-12-18 19:42 [PATCH 00/20] tcg: vector improvements Richard Henderson
                   ` (11 preceding siblings ...)
  2021-12-18 19:42 ` [PATCH 12/20] tcg/i386: Implement avx512 variable rotate Richard Henderson
@ 2021-12-18 19:42 ` Richard Henderson
  2022-02-02 14:28   ` Alex Bennée
  2021-12-18 19:42 ` [PATCH 14/20] tcg/i386: Expand vector word rotate as avx512vbmi2 shift-double Richard Henderson
                   ` (8 subsequent siblings)
  21 siblings, 1 reply; 51+ messages in thread
From: Richard Henderson @ 2021-12-18 19:42 UTC (permalink / raw)
  To: qemu-devel

We will use VPSHLD, VPSHLDV and VPSHRDV for 16-bit rotates.

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/i386/tcg-target-con-set.h |  1 +
 tcg/i386/tcg-target.opc.h     |  3 +++
 tcg/i386/tcg-target.c.inc     | 38 +++++++++++++++++++++++++++++++++++
 3 files changed, 42 insertions(+)

diff --git a/tcg/i386/tcg-target-con-set.h b/tcg/i386/tcg-target-con-set.h
index 78774d1005..91ceb0e1da 100644
--- a/tcg/i386/tcg-target-con-set.h
+++ b/tcg/i386/tcg-target-con-set.h
@@ -45,6 +45,7 @@ C_O1_I2(r, r, rI)
 C_O1_I2(x, x, x)
 C_N1_I2(r, r, r)
 C_N1_I2(r, r, rW)
+C_O1_I3(x, 0, x, x)
 C_O1_I3(x, x, x, x)
 C_O1_I4(r, r, re, r, 0)
 C_O1_I4(r, r, r, ri, ri)
diff --git a/tcg/i386/tcg-target.opc.h b/tcg/i386/tcg-target.opc.h
index 1312941800..b5f403e35e 100644
--- a/tcg/i386/tcg-target.opc.h
+++ b/tcg/i386/tcg-target.opc.h
@@ -33,3 +33,6 @@ DEF(x86_psrldq_vec, 1, 1, 1, IMPLVEC)
 DEF(x86_vperm2i128_vec, 1, 2, 1, IMPLVEC)
 DEF(x86_punpckl_vec, 1, 2, 0, IMPLVEC)
 DEF(x86_punpckh_vec, 1, 2, 0, IMPLVEC)
+DEF(x86_vpshldi_vec, 1, 2, 1, IMPLVEC)
+DEF(x86_vpshldv_vec, 1, 3, 0, IMPLVEC)
+DEF(x86_vpshrdv_vec, 1, 3, 0, IMPLVEC)
diff --git a/tcg/i386/tcg-target.c.inc b/tcg/i386/tcg-target.c.inc
index 7fd6edb887..30b9afc1d3 100644
--- a/tcg/i386/tcg-target.c.inc
+++ b/tcg/i386/tcg-target.c.inc
@@ -423,6 +423,15 @@ static bool tcg_target_const_match(int64_t val, TCGType type, int ct)
 #define OPC_VPROLVQ     (0x15 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
 #define OPC_VPRORVD     (0x14 | P_EXT38 | P_DATA16 | P_EVEX)
 #define OPC_VPRORVQ     (0x14 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
+#define OPC_VPSHLDW     (0x70 | P_EXT3A | P_DATA16 | P_VEXW | P_EVEX)
+#define OPC_VPSHLDD     (0x71 | P_EXT3A | P_DATA16 | P_EVEX)
+#define OPC_VPSHLDQ     (0x71 | P_EXT3A | P_DATA16 | P_VEXW | P_EVEX)
+#define OPC_VPSHLDVW    (0x70 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
+#define OPC_VPSHLDVD    (0x71 | P_EXT38 | P_DATA16 | P_EVEX)
+#define OPC_VPSHLDVQ    (0x71 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
+#define OPC_VPSHRDVW    (0x72 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
+#define OPC_VPSHRDVD    (0x73 | P_EXT38 | P_DATA16 | P_EVEX)
+#define OPC_VPSHRDVQ    (0x73 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
 #define OPC_VPSLLVW     (0x12 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
 #define OPC_VPSLLVD     (0x47 | P_EXT38 | P_DATA16)
 #define OPC_VPSLLVQ     (0x47 | P_EXT38 | P_DATA16 | P_VEXW)
@@ -2774,6 +2783,15 @@ static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc,
     static int const sars_insn[4] = {
         OPC_UD2, OPC_PSRAW, OPC_PSRAD, OPC_VPSRAQ
     };
+    static int const vpshldi_insn[4] = {
+        OPC_UD2, OPC_VPSHLDW, OPC_VPSHLDD, OPC_VPSHLDQ
+    };
+    static int const vpshldv_insn[4] = {
+        OPC_UD2, OPC_VPSHLDVW, OPC_VPSHLDVD, OPC_VPSHLDVQ
+    };
+    static int const vpshrdv_insn[4] = {
+        OPC_UD2, OPC_VPSHRDVW, OPC_VPSHRDVD, OPC_VPSHRDVQ
+    };
     static int const abs_insn[4] = {
         /* TODO: AVX512 adds support for MO_64.  */
         OPC_PABSB, OPC_PABSW, OPC_PABSD, OPC_UD2
@@ -2866,6 +2884,16 @@ static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc,
     case INDEX_op_x86_packus_vec:
         insn = packus_insn[vece];
         goto gen_simd;
+    case INDEX_op_x86_vpshldv_vec:
+        insn = vpshldv_insn[vece];
+        a1 = a2;
+        a2 = args[3];
+        goto gen_simd;
+    case INDEX_op_x86_vpshrdv_vec:
+        insn = vpshrdv_insn[vece];
+        a1 = a2;
+        a2 = args[3];
+        goto gen_simd;
 #if TCG_TARGET_REG_BITS == 32
     case INDEX_op_dup2_vec:
         /* First merge the two 32-bit inputs to a single 64-bit element. */
@@ -2967,7 +2995,12 @@ static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc,
         insn = OPC_VPERM2I128;
         sub = args[3];
         goto gen_simd_imm8;
+    case INDEX_op_x86_vpshldi_vec:
+        insn = vpshldi_insn[vece];
+        sub = args[3];
+        goto gen_simd_imm8;
     gen_simd_imm8:
+        tcg_debug_assert(insn != OPC_UD2);
         if (type == TCG_TYPE_V256) {
             insn |= P_VEXL;
         }
@@ -3211,6 +3244,7 @@ static TCGConstraintSetIndex tcg_target_op_def(TCGOpcode op)
     case INDEX_op_x86_vperm2i128_vec:
     case INDEX_op_x86_punpckl_vec:
     case INDEX_op_x86_punpckh_vec:
+    case INDEX_op_x86_vpshldi_vec:
 #if TCG_TARGET_REG_BITS == 32
     case INDEX_op_dup2_vec:
 #endif
@@ -3225,6 +3259,10 @@ static TCGConstraintSetIndex tcg_target_op_def(TCGOpcode op)
     case INDEX_op_x86_psrldq_vec:
         return C_O1_I1(x, x);
 
+    case INDEX_op_x86_vpshldv_vec:
+    case INDEX_op_x86_vpshrdv_vec:
+        return C_O1_I3(x, 0, x, x);
+
     case INDEX_op_x86_vpblendvb_vec:
         return C_O1_I3(x, x, x, x);
 
-- 
2.25.1



^ permalink raw reply related	[flat|nested] 51+ messages in thread

* [PATCH 14/20] tcg/i386: Expand vector word rotate as avx512vbmi2 shift-double
  2021-12-18 19:42 [PATCH 00/20] tcg: vector improvements Richard Henderson
                   ` (12 preceding siblings ...)
  2021-12-18 19:42 ` [PATCH 13/20] tcg/i386: Support avx512vbmi2 vector shift-double instructions Richard Henderson
@ 2021-12-18 19:42 ` Richard Henderson
  2022-02-03 10:32   ` Alex Bennée
  2021-12-18 19:42 ` [PATCH 15/20] tcg/i386: Remove rotls_vec from tcg_target_op_def Richard Henderson
                   ` (7 subsequent siblings)
  21 siblings, 1 reply; 51+ messages in thread
From: Richard Henderson @ 2021-12-18 19:42 UTC (permalink / raw)
  To: qemu-devel

While there are no specific 16-bit rotate instructions, there are
shift-double (concatenate and shift) instructions, which perform the
same operation when both inputs are the same vector.

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/i386/tcg-target.c.inc | 18 +++++++++++++++++-
 1 file changed, 17 insertions(+), 1 deletion(-)

diff --git a/tcg/i386/tcg-target.c.inc b/tcg/i386/tcg-target.c.inc
index 30b9afc1d3..54fb8321a9 100644
--- a/tcg/i386/tcg-target.c.inc
+++ b/tcg/i386/tcg-target.c.inc
@@ -3350,6 +3350,8 @@ int tcg_can_emit_vec_op(TCGOpcode opc, TCGType type, unsigned vece)
     case INDEX_op_rotlv_vec:
     case INDEX_op_rotrv_vec:
         switch (vece) {
+        case MO_16:
+            return have_avx512vbmi2 ? -1 : 0;
         case MO_32:
         case MO_64:
             return have_avx512vl ? 1 : have_avx2 ? -1 : 0;
@@ -3494,6 +3496,12 @@ static void expand_vec_rotli(TCGType type, unsigned vece,
         return;
     }
 
+    if (have_avx512vbmi2) {
+        vec_gen_4(INDEX_op_x86_vpshldi_vec, type, vece,
+                  tcgv_vec_arg(v0), tcgv_vec_arg(v1), tcgv_vec_arg(v1), imm);
+        return;
+    }
+
     t = tcg_temp_new_vec(type);
     tcg_gen_shli_vec(vece, t, v1, imm);
     tcg_gen_shri_vec(vece, v0, v1, (8 << vece) - imm);
@@ -3524,8 +3532,16 @@ static void expand_vec_rotls(TCGType type, unsigned vece,
 static void expand_vec_rotv(TCGType type, unsigned vece, TCGv_vec v0,
                             TCGv_vec v1, TCGv_vec sh, bool right)
 {
-    TCGv_vec t = tcg_temp_new_vec(type);
+    TCGv_vec t;
 
+    if (have_avx512vbmi2) {
+        vec_gen_4(right ? INDEX_op_x86_vpshrdv_vec : INDEX_op_x86_vpshldv_vec,
+                  type, vece, tcgv_vec_arg(v0), tcgv_vec_arg(v1),
+                  tcgv_vec_arg(v1), tcgv_vec_arg(sh));
+        return;
+    }
+
+    t = tcg_temp_new_vec(type);
     tcg_gen_dupi_vec(vece, t, 8 << vece);
     tcg_gen_sub_vec(vece, t, t, sh);
     if (right) {
-- 
2.25.1



^ permalink raw reply related	[flat|nested] 51+ messages in thread

* [PATCH 15/20] tcg/i386: Remove rotls_vec from tcg_target_op_def
  2021-12-18 19:42 [PATCH 00/20] tcg: vector improvements Richard Henderson
                   ` (13 preceding siblings ...)
  2021-12-18 19:42 ` [PATCH 14/20] tcg/i386: Expand vector word rotate as avx512vbmi2 shift-double Richard Henderson
@ 2021-12-18 19:42 ` Richard Henderson
  2022-02-03 10:34   ` Alex Bennée
  2021-12-18 19:42 ` [PATCH 16/20] tcg/i386: Expand scalar rotate with avx512 insns Richard Henderson
                   ` (6 subsequent siblings)
  21 siblings, 1 reply; 51+ messages in thread
From: Richard Henderson @ 2021-12-18 19:42 UTC (permalink / raw)
  To: qemu-devel

There is no such instruction on x86, so we should
not be pretending it has arguments.

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/i386/tcg-target.c.inc | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tcg/i386/tcg-target.c.inc b/tcg/i386/tcg-target.c.inc
index 54fb8321a9..99ec31977a 100644
--- a/tcg/i386/tcg-target.c.inc
+++ b/tcg/i386/tcg-target.c.inc
@@ -3235,7 +3235,6 @@ static TCGConstraintSetIndex tcg_target_op_def(TCGOpcode op)
     case INDEX_op_shls_vec:
     case INDEX_op_shrs_vec:
     case INDEX_op_sars_vec:
-    case INDEX_op_rotls_vec:
     case INDEX_op_cmp_vec:
     case INDEX_op_x86_shufps_vec:
     case INDEX_op_x86_blend_vec:
-- 
2.25.1



^ permalink raw reply related	[flat|nested] 51+ messages in thread

* [PATCH 16/20] tcg/i386: Expand scalar rotate with avx512 insns
  2021-12-18 19:42 [PATCH 00/20] tcg: vector improvements Richard Henderson
                   ` (14 preceding siblings ...)
  2021-12-18 19:42 ` [PATCH 15/20] tcg/i386: Remove rotls_vec from tcg_target_op_def Richard Henderson
@ 2021-12-18 19:42 ` Richard Henderson
  2022-02-03 10:38   ` Alex Bennée
  2021-12-18 19:42 ` [PATCH 17/20] tcg/i386: Implement avx512 min/max/abs Richard Henderson
                   ` (5 subsequent siblings)
  21 siblings, 1 reply; 51+ messages in thread
From: Richard Henderson @ 2021-12-18 19:42 UTC (permalink / raw)
  To: qemu-devel

Expand 32-bit and 64-bit scalar rotate with VPRO[LR]V;
expand 16-bit scalar rotate with VPSHLDV.

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/i386/tcg-target.c.inc | 49 +++++++++++++++++++++++----------------
 1 file changed, 29 insertions(+), 20 deletions(-)

diff --git a/tcg/i386/tcg-target.c.inc b/tcg/i386/tcg-target.c.inc
index 99ec31977a..447aab7438 100644
--- a/tcg/i386/tcg-target.c.inc
+++ b/tcg/i386/tcg-target.c.inc
@@ -3508,26 +3508,6 @@ static void expand_vec_rotli(TCGType type, unsigned vece,
     tcg_temp_free_vec(t);
 }
 
-static void expand_vec_rotls(TCGType type, unsigned vece,
-                             TCGv_vec v0, TCGv_vec v1, TCGv_i32 lsh)
-{
-    TCGv_i32 rsh;
-    TCGv_vec t;
-
-    tcg_debug_assert(vece != MO_8);
-
-    t = tcg_temp_new_vec(type);
-    rsh = tcg_temp_new_i32();
-
-    tcg_gen_neg_i32(rsh, lsh);
-    tcg_gen_andi_i32(rsh, rsh, (8 << vece) - 1);
-    tcg_gen_shls_vec(vece, t, v1, lsh);
-    tcg_gen_shrs_vec(vece, v0, v1, rsh);
-    tcg_gen_or_vec(vece, v0, v0, t);
-    tcg_temp_free_vec(t);
-    tcg_temp_free_i32(rsh);
-}
-
 static void expand_vec_rotv(TCGType type, unsigned vece, TCGv_vec v0,
                             TCGv_vec v1, TCGv_vec sh, bool right)
 {
@@ -3554,6 +3534,35 @@ static void expand_vec_rotv(TCGType type, unsigned vece, TCGv_vec v0,
     tcg_temp_free_vec(t);
 }
 
+static void expand_vec_rotls(TCGType type, unsigned vece,
+                             TCGv_vec v0, TCGv_vec v1, TCGv_i32 lsh)
+{
+    TCGv_vec t = tcg_temp_new_vec(type);
+
+    tcg_debug_assert(vece != MO_8);
+
+    if (vece >= MO_32 ? have_avx512vl : have_avx512vbmi2) {
+        tcg_gen_dup_i32_vec(vece, t, lsh);
+        if (vece >= MO_32) {
+            tcg_gen_rotlv_vec(vece, v0, v1, t);
+        } else {
+            expand_vec_rotv(type, vece, v0, v1, t, false);
+        }
+    } else {
+        TCGv_i32 rsh = tcg_temp_new_i32();
+
+        tcg_gen_neg_i32(rsh, lsh);
+        tcg_gen_andi_i32(rsh, rsh, (8 << vece) - 1);
+        tcg_gen_shls_vec(vece, t, v1, lsh);
+        tcg_gen_shrs_vec(vece, v0, v1, rsh);
+        tcg_gen_or_vec(vece, v0, v0, t);
+
+        tcg_temp_free_i32(rsh);
+    }
+
+    tcg_temp_free_vec(t);
+}
+
 static void expand_vec_mul(TCGType type, unsigned vece,
                            TCGv_vec v0, TCGv_vec v1, TCGv_vec v2)
 {
-- 
2.25.1



^ permalink raw reply related	[flat|nested] 51+ messages in thread

* [PATCH 17/20] tcg/i386: Implement avx512 min/max/abs
  2021-12-18 19:42 [PATCH 00/20] tcg: vector improvements Richard Henderson
                   ` (15 preceding siblings ...)
  2021-12-18 19:42 ` [PATCH 16/20] tcg/i386: Expand scalar rotate with avx512 insns Richard Henderson
@ 2021-12-18 19:42 ` Richard Henderson
  2022-02-03 10:44   ` Alex Bennée
  2021-12-18 19:42 ` [PATCH 18/20] tcg/i386: Implement avx512 multiply Richard Henderson
                   ` (4 subsequent siblings)
  21 siblings, 1 reply; 51+ messages in thread
From: Richard Henderson @ 2021-12-18 19:42 UTC (permalink / raw)
  To: qemu-devel

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/i386/tcg-target.c.inc | 18 +++++++++++-------
 1 file changed, 11 insertions(+), 7 deletions(-)

diff --git a/tcg/i386/tcg-target.c.inc b/tcg/i386/tcg-target.c.inc
index 447aab7438..22eaa53cb1 100644
--- a/tcg/i386/tcg-target.c.inc
+++ b/tcg/i386/tcg-target.c.inc
@@ -312,6 +312,7 @@ static bool tcg_target_const_match(int64_t val, TCGType type, int ct)
 #define OPC_PABSB       (0x1c | P_EXT38 | P_DATA16)
 #define OPC_PABSW       (0x1d | P_EXT38 | P_DATA16)
 #define OPC_PABSD       (0x1e | P_EXT38 | P_DATA16)
+#define OPC_VPABSQ      (0x1f | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
 #define OPC_PACKSSDW    (0x6b | P_EXT | P_DATA16)
 #define OPC_PACKSSWB    (0x63 | P_EXT | P_DATA16)
 #define OPC_PACKUSDW    (0x2b | P_EXT38 | P_DATA16)
@@ -338,15 +339,19 @@ static bool tcg_target_const_match(int64_t val, TCGType type, int ct)
 #define OPC_PMAXSB      (0x3c | P_EXT38 | P_DATA16)
 #define OPC_PMAXSW      (0xee | P_EXT | P_DATA16)
 #define OPC_PMAXSD      (0x3d | P_EXT38 | P_DATA16)
+#define OPC_VPMAXSQ     (0x3d | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
 #define OPC_PMAXUB      (0xde | P_EXT | P_DATA16)
 #define OPC_PMAXUW      (0x3e | P_EXT38 | P_DATA16)
 #define OPC_PMAXUD      (0x3f | P_EXT38 | P_DATA16)
+#define OPC_VPMAXUQ     (0x3f | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
 #define OPC_PMINSB      (0x38 | P_EXT38 | P_DATA16)
 #define OPC_PMINSW      (0xea | P_EXT | P_DATA16)
 #define OPC_PMINSD      (0x39 | P_EXT38 | P_DATA16)
+#define OPC_VPMINSQ     (0x39 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
 #define OPC_PMINUB      (0xda | P_EXT | P_DATA16)
 #define OPC_PMINUW      (0x3a | P_EXT38 | P_DATA16)
 #define OPC_PMINUD      (0x3b | P_EXT38 | P_DATA16)
+#define OPC_VPMINUQ     (0x3b | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
 #define OPC_PMOVSXBW    (0x20 | P_EXT38 | P_DATA16)
 #define OPC_PMOVSXWD    (0x23 | P_EXT38 | P_DATA16)
 #define OPC_PMOVSXDQ    (0x25 | P_EXT38 | P_DATA16)
@@ -2748,16 +2753,16 @@ static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc,
         OPC_PACKUSWB, OPC_PACKUSDW, OPC_UD2, OPC_UD2
     };
     static int const smin_insn[4] = {
-        OPC_PMINSB, OPC_PMINSW, OPC_PMINSD, OPC_UD2
+        OPC_PMINSB, OPC_PMINSW, OPC_PMINSD, OPC_VPMINSQ
     };
     static int const smax_insn[4] = {
-        OPC_PMAXSB, OPC_PMAXSW, OPC_PMAXSD, OPC_UD2
+        OPC_PMAXSB, OPC_PMAXSW, OPC_PMAXSD, OPC_VPMAXSQ
     };
     static int const umin_insn[4] = {
-        OPC_PMINUB, OPC_PMINUW, OPC_PMINUD, OPC_UD2
+        OPC_PMINUB, OPC_PMINUW, OPC_PMINUD, OPC_VPMINUQ
     };
     static int const umax_insn[4] = {
-        OPC_PMAXUB, OPC_PMAXUW, OPC_PMAXUD, OPC_UD2
+        OPC_PMAXUB, OPC_PMAXUW, OPC_PMAXUD, OPC_VPMAXUQ
     };
     static int const rotlv_insn[4] = {
         OPC_UD2, OPC_UD2, OPC_VPROLVD, OPC_VPROLVQ
@@ -2793,8 +2798,7 @@ static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc,
         OPC_UD2, OPC_VPSHRDVW, OPC_VPSHRDVD, OPC_VPSHRDVQ
     };
     static int const abs_insn[4] = {
-        /* TODO: AVX512 adds support for MO_64.  */
-        OPC_PABSB, OPC_PABSW, OPC_PABSD, OPC_UD2
+        OPC_PABSB, OPC_PABSW, OPC_PABSD, OPC_VPABSQ
     };
 
     TCGType type = vecl + TCG_TYPE_V64;
@@ -3377,7 +3381,7 @@ int tcg_can_emit_vec_op(TCGOpcode opc, TCGType type, unsigned vece)
     case INDEX_op_umin_vec:
     case INDEX_op_umax_vec:
     case INDEX_op_abs_vec:
-        return vece <= MO_32;
+        return vece <= MO_32 || have_avx512vl;
 
     default:
         return 0;
-- 
2.25.1



^ permalink raw reply related	[flat|nested] 51+ messages in thread

* [PATCH 18/20] tcg/i386: Implement avx512 multiply
  2021-12-18 19:42 [PATCH 00/20] tcg: vector improvements Richard Henderson
                   ` (16 preceding siblings ...)
  2021-12-18 19:42 ` [PATCH 17/20] tcg/i386: Implement avx512 min/max/abs Richard Henderson
@ 2021-12-18 19:42 ` Richard Henderson
  2022-02-03 10:45   ` Alex Bennée
  2021-12-18 19:42 ` [PATCH 19/20] tcg/i386: Implement more logical operations for avx512 Richard Henderson
                   ` (3 subsequent siblings)
  21 siblings, 1 reply; 51+ messages in thread
From: Richard Henderson @ 2021-12-18 19:42 UTC (permalink / raw)
  To: qemu-devel

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/i386/tcg-target.c.inc | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/tcg/i386/tcg-target.c.inc b/tcg/i386/tcg-target.c.inc
index 22eaa53cb1..f982b6e615 100644
--- a/tcg/i386/tcg-target.c.inc
+++ b/tcg/i386/tcg-target.c.inc
@@ -360,6 +360,7 @@ static bool tcg_target_const_match(int64_t val, TCGType type, int ct)
 #define OPC_PMOVZXDQ    (0x35 | P_EXT38 | P_DATA16)
 #define OPC_PMULLW      (0xd5 | P_EXT | P_DATA16)
 #define OPC_PMULLD      (0x40 | P_EXT38 | P_DATA16)
+#define OPC_VPMULLQ     (0x40 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
 #define OPC_POR         (0xeb | P_EXT | P_DATA16)
 #define OPC_PSHUFB      (0x00 | P_EXT38 | P_DATA16)
 #define OPC_PSHUFD      (0x70 | P_EXT | P_DATA16)
@@ -2729,7 +2730,7 @@ static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc,
         OPC_PSUBUB, OPC_PSUBUW, OPC_UD2, OPC_UD2
     };
     static int const mul_insn[4] = {
-        OPC_UD2, OPC_PMULLW, OPC_PMULLD, OPC_UD2
+        OPC_UD2, OPC_PMULLW, OPC_PMULLD, OPC_VPMULLQ
     };
     static int const shift_imm_insn[4] = {
         OPC_UD2, OPC_PSHIFTW_Ib, OPC_PSHIFTD_Ib, OPC_PSHIFTQ_Ib
@@ -3362,12 +3363,11 @@ int tcg_can_emit_vec_op(TCGOpcode opc, TCGType type, unsigned vece)
         return 0;
 
     case INDEX_op_mul_vec:
-        if (vece == MO_8) {
-            /* We can expand the operation for MO_8.  */
+        switch (vece) {
+        case MO_8:
             return -1;
-        }
-        if (vece == MO_64) {
-            return 0;
+        case MO_64:
+            return have_avx512dq;
         }
         return 1;
 
-- 
2.25.1



^ permalink raw reply related	[flat|nested] 51+ messages in thread

* [PATCH 19/20] tcg/i386: Implement more logical operations for avx512
  2021-12-18 19:42 [PATCH 00/20] tcg: vector improvements Richard Henderson
                   ` (17 preceding siblings ...)
  2021-12-18 19:42 ` [PATCH 18/20] tcg/i386: Implement avx512 multiply Richard Henderson
@ 2021-12-18 19:42 ` Richard Henderson
  2022-02-03 10:46   ` Alex Bennée
  2021-12-18 19:42 ` [PATCH 20/20] tcg/i386: Implement bitsel " Richard Henderson
                   ` (2 subsequent siblings)
  21 siblings, 1 reply; 51+ messages in thread
From: Richard Henderson @ 2021-12-18 19:42 UTC (permalink / raw)
  To: qemu-devel

The general ternary logic operation can implement
NOT, ORC, NAND, NOR, EQV.

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/i386/tcg-target.h     | 10 +++++-----
 tcg/i386/tcg-target.c.inc | 34 ++++++++++++++++++++++++++++++++++
 2 files changed, 39 insertions(+), 5 deletions(-)

diff --git a/tcg/i386/tcg-target.h b/tcg/i386/tcg-target.h
index 841b1febab..433a632d6a 100644
--- a/tcg/i386/tcg-target.h
+++ b/tcg/i386/tcg-target.h
@@ -188,11 +188,11 @@ extern bool have_movbe;
 #define TCG_TARGET_HAS_v256             have_avx2
 
 #define TCG_TARGET_HAS_andc_vec         1
-#define TCG_TARGET_HAS_orc_vec          0
-#define TCG_TARGET_HAS_nand_vec         0
-#define TCG_TARGET_HAS_nor_vec          0
-#define TCG_TARGET_HAS_eqv_vec          0
-#define TCG_TARGET_HAS_not_vec          0
+#define TCG_TARGET_HAS_orc_vec          have_avx512vl
+#define TCG_TARGET_HAS_nand_vec         have_avx512vl
+#define TCG_TARGET_HAS_nor_vec          have_avx512vl
+#define TCG_TARGET_HAS_eqv_vec          have_avx512vl
+#define TCG_TARGET_HAS_not_vec          have_avx512vl
 #define TCG_TARGET_HAS_neg_vec          0
 #define TCG_TARGET_HAS_abs_vec          1
 #define TCG_TARGET_HAS_roti_vec         have_avx512vl
diff --git a/tcg/i386/tcg-target.c.inc b/tcg/i386/tcg-target.c.inc
index f982b6e615..86c66f9522 100644
--- a/tcg/i386/tcg-target.c.inc
+++ b/tcg/i386/tcg-target.c.inc
@@ -447,6 +447,7 @@ static bool tcg_target_const_match(int64_t val, TCGType type, int ct)
 #define OPC_VPSRLVW     (0x10 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
 #define OPC_VPSRLVD     (0x45 | P_EXT38 | P_DATA16)
 #define OPC_VPSRLVQ     (0x45 | P_EXT38 | P_DATA16 | P_VEXW)
+#define OPC_VPTERNLOGQ  (0x25 | P_EXT3A | P_DATA16 | P_VEXW | P_EVEX)
 #define OPC_VZEROUPPER  (0x77 | P_EXT)
 #define OPC_XCHG_ax_r32	(0x90)
 
@@ -3004,6 +3005,29 @@ static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc,
         insn = vpshldi_insn[vece];
         sub = args[3];
         goto gen_simd_imm8;
+
+    case INDEX_op_not_vec:
+        insn = OPC_VPTERNLOGQ;
+        a2 = a1;
+        sub = 0x33; /* !B */
+        goto gen_simd_imm8;
+    case INDEX_op_nor_vec:
+        insn = OPC_VPTERNLOGQ;
+        sub = 0x11; /* norCB */
+        goto gen_simd_imm8;
+    case INDEX_op_nand_vec:
+        insn = OPC_VPTERNLOGQ;
+        sub = 0x77; /* nandCB */
+        goto gen_simd_imm8;
+    case INDEX_op_eqv_vec:
+        insn = OPC_VPTERNLOGQ;
+        sub = 0x99; /* xnorCB */
+        goto gen_simd_imm8;
+    case INDEX_op_orc_vec:
+        insn = OPC_VPTERNLOGQ;
+        sub = 0xdd; /* orB!C */
+        goto gen_simd_imm8;
+
     gen_simd_imm8:
         tcg_debug_assert(insn != OPC_UD2);
         if (type == TCG_TYPE_V256) {
@@ -3224,6 +3248,10 @@ static TCGConstraintSetIndex tcg_target_op_def(TCGOpcode op)
     case INDEX_op_or_vec:
     case INDEX_op_xor_vec:
     case INDEX_op_andc_vec:
+    case INDEX_op_orc_vec:
+    case INDEX_op_nand_vec:
+    case INDEX_op_nor_vec:
+    case INDEX_op_eqv_vec:
     case INDEX_op_ssadd_vec:
     case INDEX_op_usadd_vec:
     case INDEX_op_sssub_vec:
@@ -3256,6 +3284,7 @@ static TCGConstraintSetIndex tcg_target_op_def(TCGOpcode op)
 
     case INDEX_op_abs_vec:
     case INDEX_op_dup_vec:
+    case INDEX_op_not_vec:
     case INDEX_op_shli_vec:
     case INDEX_op_shri_vec:
     case INDEX_op_sari_vec:
@@ -3284,6 +3313,11 @@ int tcg_can_emit_vec_op(TCGOpcode opc, TCGType type, unsigned vece)
     case INDEX_op_or_vec:
     case INDEX_op_xor_vec:
     case INDEX_op_andc_vec:
+    case INDEX_op_orc_vec:
+    case INDEX_op_nand_vec:
+    case INDEX_op_nor_vec:
+    case INDEX_op_eqv_vec:
+    case INDEX_op_not_vec:
         return 1;
     case INDEX_op_cmp_vec:
     case INDEX_op_cmpsel_vec:
-- 
2.25.1



^ permalink raw reply related	[flat|nested] 51+ messages in thread

* [PATCH 20/20] tcg/i386: Implement bitsel for avx512
  2021-12-18 19:42 [PATCH 00/20] tcg: vector improvements Richard Henderson
                   ` (18 preceding siblings ...)
  2021-12-18 19:42 ` [PATCH 19/20] tcg/i386: Implement more logical operations for avx512 Richard Henderson
@ 2021-12-18 19:42 ` Richard Henderson
  2022-02-03 10:51   ` Alex Bennée
  2022-01-29  9:28 ` [PATCH 00/20] tcg: vector improvements Richard Henderson
  2022-02-03 10:25 ` Alex Bennée
  21 siblings, 1 reply; 51+ messages in thread
From: Richard Henderson @ 2021-12-18 19:42 UTC (permalink / raw)
  To: qemu-devel

The general ternary logic operation can implement BITSEL.
Funnel the 4-operand operation into three variants of the
3-operand instruction, depending on input operand overlap.

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/i386/tcg-target.h     |  2 +-
 tcg/i386/tcg-target.c.inc | 20 +++++++++++++++++++-
 2 files changed, 20 insertions(+), 2 deletions(-)

diff --git a/tcg/i386/tcg-target.h b/tcg/i386/tcg-target.h
index 433a632d6a..ae3612d745 100644
--- a/tcg/i386/tcg-target.h
+++ b/tcg/i386/tcg-target.h
@@ -204,7 +204,7 @@ extern bool have_movbe;
 #define TCG_TARGET_HAS_mul_vec          1
 #define TCG_TARGET_HAS_sat_vec          1
 #define TCG_TARGET_HAS_minmax_vec       1
-#define TCG_TARGET_HAS_bitsel_vec       0
+#define TCG_TARGET_HAS_bitsel_vec       have_avx512vl
 #define TCG_TARGET_HAS_cmpsel_vec       -1
 
 #define TCG_TARGET_deposit_i32_valid(ofs, len) \
diff --git a/tcg/i386/tcg-target.c.inc b/tcg/i386/tcg-target.c.inc
index 86c66f9522..be88ccae4d 100644
--- a/tcg/i386/tcg-target.c.inc
+++ b/tcg/i386/tcg-target.c.inc
@@ -2805,7 +2805,7 @@ static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc,
 
     TCGType type = vecl + TCG_TYPE_V64;
     int insn, sub;
-    TCGArg a0, a1, a2;
+    TCGArg a0, a1, a2, a3;
 
     a0 = args[0];
     a1 = args[1];
@@ -3028,6 +3028,22 @@ static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc,
         sub = 0xdd; /* orB!C */
         goto gen_simd_imm8;
 
+    case INDEX_op_bitsel_vec:
+        insn = OPC_VPTERNLOGQ;
+        a3 = args[3];
+        if (a0 == a1) {
+            a1 = a2;
+            a2 = a3;
+            sub = 0xca; /* A?B:C */
+        } else if (a0 == a2) {
+            a2 = a3;
+            sub = 0xe2; /* B?A:C */
+        } else {
+            tcg_out_mov(s, type, a0, a3);
+            sub = 0xb8; /* B?C:A */
+        }
+        goto gen_simd_imm8;
+
     gen_simd_imm8:
         tcg_debug_assert(insn != OPC_UD2);
         if (type == TCG_TYPE_V256) {
@@ -3296,6 +3312,7 @@ static TCGConstraintSetIndex tcg_target_op_def(TCGOpcode op)
     case INDEX_op_x86_vpshrdv_vec:
         return C_O1_I3(x, 0, x, x);
 
+    case INDEX_op_bitsel_vec:
     case INDEX_op_x86_vpblendvb_vec:
         return C_O1_I3(x, x, x, x);
 
@@ -3318,6 +3335,7 @@ int tcg_can_emit_vec_op(TCGOpcode opc, TCGType type, unsigned vece)
     case INDEX_op_nor_vec:
     case INDEX_op_eqv_vec:
     case INDEX_op_not_vec:
+    case INDEX_op_bitsel_vec:
         return 1;
     case INDEX_op_cmp_vec:
     case INDEX_op_cmpsel_vec:
-- 
2.25.1



^ permalink raw reply related	[flat|nested] 51+ messages in thread

* Re: [PATCH 03/20] tcg/ppc: Implement vector NAND, NOR, EQV
  2021-12-18 19:42 ` [PATCH 03/20] tcg/ppc: Implement vector NAND, NOR, EQV Richard Henderson
@ 2021-12-19  0:15   ` Philippe Mathieu-Daudé
  2022-02-01 18:29   ` Alex Bennée
  1 sibling, 0 replies; 51+ messages in thread
From: Philippe Mathieu-Daudé @ 2021-12-19  0:15 UTC (permalink / raw)
  To: Richard Henderson, qemu-devel

On 12/18/21 20:42, Richard Henderson wrote:
> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
> ---
>  tcg/ppc/tcg-target.h     |  6 +++---
>  tcg/ppc/tcg-target.c.inc | 15 +++++++++++++++
>  2 files changed, 18 insertions(+), 3 deletions(-)

Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>


^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH 04/20] tcg/s390x: Implement vector NAND, NOR, EQV
  2021-12-18 19:42 ` [PATCH 04/20] tcg/s390x: " Richard Henderson
@ 2021-12-19  0:17   ` Philippe Mathieu-Daudé
  2022-02-01 18:29   ` Alex Bennée
                     ` (2 subsequent siblings)
  3 siblings, 0 replies; 51+ messages in thread
From: Philippe Mathieu-Daudé @ 2021-12-19  0:17 UTC (permalink / raw)
  To: Richard Henderson, qemu-devel

On 12/18/21 20:42, Richard Henderson wrote:
> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
> ---
>  tcg/s390x/tcg-target.h     |  6 +++---
>  tcg/s390x/tcg-target.c.inc | 17 +++++++++++++++++
>  2 files changed, 20 insertions(+), 3 deletions(-)

Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>


^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH 02/20] tcg: Add opcodes for vector nand, nor, eqv
  2021-12-18 19:42 ` [PATCH 02/20] tcg: Add opcodes for vector nand, nor, eqv Richard Henderson
@ 2021-12-19 11:28   ` Philippe Mathieu-Daudé
  2022-02-01 18:28   ` Alex Bennée
  1 sibling, 0 replies; 51+ messages in thread
From: Philippe Mathieu-Daudé @ 2021-12-19 11:28 UTC (permalink / raw)
  To: Richard Henderson, qemu-devel

On 12/18/21 20:42, Richard Henderson wrote:
> We've had placeholders for these opcodes for a while,
> and should have support on ppc, s390x and avx512 hosts.
> 
> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
> ---
>  include/tcg/tcg-opc.h    |  3 +++
>  include/tcg/tcg.h        |  3 +++
>  tcg/aarch64/tcg-target.h |  3 +++
>  tcg/arm/tcg-target.h     |  3 +++
>  tcg/i386/tcg-target.h    |  3 +++
>  tcg/ppc/tcg-target.h     |  3 +++
>  tcg/s390x/tcg-target.h   |  3 +++
>  tcg/optimize.c           | 12 ++++++------
>  tcg/tcg-op-vec.c         | 27 ++++++++++++++++++---------
>  tcg/tcg.c                |  6 ++++++
>  10 files changed, 51 insertions(+), 15 deletions(-)

Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>


^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH 01/20] tcg/optimize: Fix folding of vector ops
  2021-12-18 19:42 ` [PATCH 01/20] tcg/optimize: Fix folding of vector ops Richard Henderson
@ 2021-12-19 11:37   ` Philippe Mathieu-Daudé
  0 siblings, 0 replies; 51+ messages in thread
From: Philippe Mathieu-Daudé @ 2021-12-19 11:37 UTC (permalink / raw)
  To: Richard Henderson, qemu-devel

On 12/18/21 20:42, Richard Henderson wrote:
> Bitwise operations are easy to fold, because the operation is
> identical regardess of element size.  But add and sub need

Typo "regardless".

> extra element size info that is not currently propagated.
> 
> Fixes: 2f9f08ba43d
> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
> ---
>  tcg/optimize.c | 49 ++++++++++++++++++++++++++++++++++++++-----------
>  1 file changed, 38 insertions(+), 11 deletions(-)

Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>


^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH 00/20] tcg: vector improvements
  2021-12-18 19:42 [PATCH 00/20] tcg: vector improvements Richard Henderson
                   ` (19 preceding siblings ...)
  2021-12-18 19:42 ` [PATCH 20/20] tcg/i386: Implement bitsel " Richard Henderson
@ 2022-01-29  9:28 ` Richard Henderson
  2022-02-03 10:25 ` Alex Bennée
  21 siblings, 0 replies; 51+ messages in thread
From: Richard Henderson @ 2022-01-29  9:28 UTC (permalink / raw)
  To: qemu-devel

Ping?

Patch 1 is now upstream, but only patches 2-4 have reviews.
It applies cleanly to master...


r~

On 12/19/21 06:42, Richard Henderson wrote:
> Add some opcodes for compound logic operations that were so
> far marked as TODO.  Implement those for PPC and S390X.
> 
> We do not want to implement 512-bit width operations, because
> those trigger a cluster clock slowdown on the current set of
> Intel cpus.  But there are new operations in avx512 that apply
> to 128 and 256-bit vectors, which do not trigger the slowdown,
> and those are very interesting.
> 
> 
> r~
> 
> 
> Richard Henderson (20):
>    tcg/optimize: Fix folding of vector ops
>    tcg: Add opcodes for vector nand, nor, eqv
>    tcg/ppc: Implement vector NAND, NOR, EQV
>    tcg/s390x: Implement vector NAND, NOR, EQV
>    tcg/i386: Detect AVX512
>    tcg/i386: Add tcg_out_evex_opc
>    tcg/i386: Use tcg_can_emit_vec_op in expand_vec_cmp_noinv
>    tcg/i386: Implement avx512 variable shifts
>    tcg/i386: Implement avx512 scalar shift
>    tcg/i386: Implement avx512 immediate sari shift
>    tcg/i386: Implement avx512 immediate rotate
>    tcg/i386: Implement avx512 variable rotate
>    tcg/i386: Support avx512vbmi2 vector shift-double instructions
>    tcg/i386: Expand vector word rotate as avx512vbmi2 shift-double
>    tcg/i386: Remove rotls_vec from tcg_target_op_def
>    tcg/i386: Expand scalar rotate with avx512 insns
>    tcg/i386: Implement avx512 min/max/abs
>    tcg/i386: Implement avx512 multiply
>    tcg/i386: Implement more logical operations for avx512
>    tcg/i386: Implement bitsel for avx512
> 
>   include/qemu/cpuid.h          |  20 +-
>   include/tcg/tcg-opc.h         |   3 +
>   include/tcg/tcg.h             |   3 +
>   tcg/aarch64/tcg-target.h      |   3 +
>   tcg/arm/tcg-target.h          |   3 +
>   tcg/i386/tcg-target-con-set.h |   1 +
>   tcg/i386/tcg-target.h         |  17 +-
>   tcg/i386/tcg-target.opc.h     |   3 +
>   tcg/ppc/tcg-target.h          |   3 +
>   tcg/s390x/tcg-target.h        |   3 +
>   tcg/optimize.c                |  61 ++++--
>   tcg/tcg-op-vec.c              |  27 ++-
>   tcg/tcg.c                     |   6 +
>   tcg/i386/tcg-target.c.inc     | 386 ++++++++++++++++++++++++++++------
>   tcg/ppc/tcg-target.c.inc      |  15 ++
>   tcg/s390x/tcg-target.c.inc    |  17 ++
>   16 files changed, 472 insertions(+), 99 deletions(-)
> 



^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH 02/20] tcg: Add opcodes for vector nand, nor, eqv
  2021-12-18 19:42 ` [PATCH 02/20] tcg: Add opcodes for vector nand, nor, eqv Richard Henderson
  2021-12-19 11:28   ` Philippe Mathieu-Daudé
@ 2022-02-01 18:28   ` Alex Bennée
  1 sibling, 0 replies; 51+ messages in thread
From: Alex Bennée @ 2022-02-01 18:28 UTC (permalink / raw)
  To: Richard Henderson; +Cc: qemu-devel


Richard Henderson <richard.henderson@linaro.org> writes:

> We've had placeholders for these opcodes for a while,
> and should have support on ppc, s390x and avx512 hosts.
>
> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>

-- 
Alex Bennée


^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH 03/20] tcg/ppc: Implement vector NAND, NOR, EQV
  2021-12-18 19:42 ` [PATCH 03/20] tcg/ppc: Implement vector NAND, NOR, EQV Richard Henderson
  2021-12-19  0:15   ` Philippe Mathieu-Daudé
@ 2022-02-01 18:29   ` Alex Bennée
  1 sibling, 0 replies; 51+ messages in thread
From: Alex Bennée @ 2022-02-01 18:29 UTC (permalink / raw)
  To: Richard Henderson; +Cc: qemu-devel


Richard Henderson <richard.henderson@linaro.org> writes:

> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>

-- 
Alex Bennée


^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH 04/20] tcg/s390x: Implement vector NAND, NOR, EQV
  2021-12-18 19:42 ` [PATCH 04/20] tcg/s390x: " Richard Henderson
  2021-12-19  0:17   ` Philippe Mathieu-Daudé
@ 2022-02-01 18:29   ` Alex Bennée
  2022-02-01 18:31   ` Alex Bennée
  2024-01-03 13:21   ` Philippe Mathieu-Daudé
  3 siblings, 0 replies; 51+ messages in thread
From: Alex Bennée @ 2022-02-01 18:29 UTC (permalink / raw)
  To: Richard Henderson; +Cc: qemu-devel


Richard Henderson <richard.henderson@linaro.org> writes:

> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>

-- 
Alex Bennée


^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH 04/20] tcg/s390x: Implement vector NAND, NOR, EQV
  2021-12-18 19:42 ` [PATCH 04/20] tcg/s390x: " Richard Henderson
  2021-12-19  0:17   ` Philippe Mathieu-Daudé
  2022-02-01 18:29   ` Alex Bennée
@ 2022-02-01 18:31   ` Alex Bennée
  2024-01-03 13:21   ` Philippe Mathieu-Daudé
  3 siblings, 0 replies; 51+ messages in thread
From: Alex Bennée @ 2022-02-01 18:31 UTC (permalink / raw)
  To: Richard Henderson; +Cc: qemu-devel


Richard Henderson <richard.henderson@linaro.org> writes:

> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>

-- 
Alex Bennée


^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH 05/20] tcg/i386: Detect AVX512
  2021-12-18 19:42 ` [PATCH 05/20] tcg/i386: Detect AVX512 Richard Henderson
@ 2022-02-01 18:41   ` Alex Bennée
  0 siblings, 0 replies; 51+ messages in thread
From: Alex Bennée @ 2022-02-01 18:41 UTC (permalink / raw)
  To: Richard Henderson; +Cc: qemu-devel


Richard Henderson <richard.henderson@linaro.org> writes:

> There are some operation sizes in some subsets of AVX512 that
> are missing from previous iterations of AVX.  Detect them.
>
> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>

Hard for me to test but the code looks sane

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>

-- 
Alex Bennée


^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH 06/20] tcg/i386: Add tcg_out_evex_opc
  2021-12-18 19:42 ` [PATCH 06/20] tcg/i386: Add tcg_out_evex_opc Richard Henderson
@ 2022-02-01 19:20   ` Alex Bennée
  0 siblings, 0 replies; 51+ messages in thread
From: Alex Bennée @ 2022-02-01 19:20 UTC (permalink / raw)
  To: Richard Henderson; +Cc: qemu-devel


Richard Henderson <richard.henderson@linaro.org> writes:

> The evex encoding is added here, for use in a subsequent patch.
>
> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>

-- 
Alex Bennée


^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH 07/20] tcg/i386: Use tcg_can_emit_vec_op in expand_vec_cmp_noinv
  2021-12-18 19:42 ` [PATCH 07/20] tcg/i386: Use tcg_can_emit_vec_op in expand_vec_cmp_noinv Richard Henderson
@ 2022-02-01 19:21   ` Alex Bennée
  0 siblings, 0 replies; 51+ messages in thread
From: Alex Bennée @ 2022-02-01 19:21 UTC (permalink / raw)
  To: Richard Henderson; +Cc: qemu-devel


Richard Henderson <richard.henderson@linaro.org> writes:

> The condition for UMIN/UMAX availability is about to change;
> use the canonical version.
>
> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>

-- 
Alex Bennée


^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH 08/20] tcg/i386: Implement avx512 variable shifts
  2021-12-18 19:42 ` [PATCH 08/20] tcg/i386: Implement avx512 variable shifts Richard Henderson
@ 2022-02-01 20:33   ` Alex Bennée
  0 siblings, 0 replies; 51+ messages in thread
From: Alex Bennée @ 2022-02-01 20:33 UTC (permalink / raw)
  To: Richard Henderson; +Cc: qemu-devel


Richard Henderson <richard.henderson@linaro.org> writes:

> AVX512VL has VPSRAVQ, and
> AVX512BW has VPSLLVW, VPSRAVW, VPSRLVW.
>
> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>

> ---
>  tcg/i386/tcg-target.c.inc | 32 ++++++++++++++++++++++++--------
>  1 file changed, 24 insertions(+), 8 deletions(-)
>
> diff --git a/tcg/i386/tcg-target.c.inc b/tcg/i386/tcg-target.c.inc
> index 316e550b38..7b9302fcc2 100644
> --- a/tcg/i386/tcg-target.c.inc
> +++ b/tcg/i386/tcg-target.c.inc
> @@ -418,9 +418,13 @@ static bool tcg_target_const_match(int64_t val, TCGType type, int ct)
>  #define OPC_VPBROADCASTQ (0x59 | P_EXT38 | P_DATA16)
>  #define OPC_VPERMQ      (0x00 | P_EXT3A | P_DATA16 | P_VEXW)
>  #define OPC_VPERM2I128  (0x46 | P_EXT3A | P_DATA16 | P_VEXL)
> +#define OPC_VPSLLVW     (0x12 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
>  #define OPC_VPSLLVD     (0x47 | P_EXT38 | P_DATA16)
>  #define OPC_VPSLLVQ     (0x47 | P_EXT38 | P_DATA16 | P_VEXW)
> +#define OPC_VPSRAVW     (0x11 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
>  #define OPC_VPSRAVD     (0x46 | P_EXT38 | P_DATA16)
> +#define OPC_VPSRAVQ     (0x46 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
> +#define OPC_VPSRLVW     (0x10 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
>  #define OPC_VPSRLVD     (0x45 | P_EXT38 | P_DATA16)
>  #define OPC_VPSRLVQ     (0x45 | P_EXT38 | P_DATA16 | P_VEXW)
>  #define OPC_VZEROUPPER  (0x77 | P_EXT)
> @@ -2742,16 +2746,13 @@ static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc,
>          OPC_PMAXUB, OPC_PMAXUW, OPC_PMAXUD, OPC_UD2
>      };
>      static int const shlv_insn[4] = {
> -        /* TODO: AVX512 adds support for MO_16.  */
> -        OPC_UD2, OPC_UD2, OPC_VPSLLVD, OPC_VPSLLVQ
> +        OPC_UD2, OPC_VPSLLVW, OPC_VPSLLVD, OPC_VPSLLVQ
>      };
>      static int const shrv_insn[4] = {
> -        /* TODO: AVX512 adds support for MO_16.  */
> -        OPC_UD2, OPC_UD2, OPC_VPSRLVD, OPC_VPSRLVQ
> +        OPC_UD2, OPC_VPSRLVW, OPC_VPSRLVD, OPC_VPSRLVQ
>      };
>      static int const sarv_insn[4] = {
> -        /* TODO: AVX512 adds support for MO_16, MO_64.  */
> -        OPC_UD2, OPC_UD2, OPC_VPSRAVD, OPC_UD2
> +        OPC_UD2, OPC_VPSRAVW, OPC_VPSRAVD, OPC_VPSRAVQ
>      };
>      static int const shls_insn[4] = {
>          OPC_UD2, OPC_PSLLW, OPC_PSLLD, OPC_PSLLQ
> @@ -3242,9 +3243,24 @@ int tcg_can_emit_vec_op(TCGOpcode opc, TCGType type, unsigned vece)
>  
>      case INDEX_op_shlv_vec:
>      case INDEX_op_shrv_vec:
> -        return have_avx2 && vece >= MO_32;
> +        switch (vece) {
> +        case MO_16:
> +            return have_avx512bw;
> +        case MO_32:
> +        case MO_64:
> +            return have_avx2;
> +        }
> +        return 0;
>      case INDEX_op_sarv_vec:
> -        return have_avx2 && vece == MO_32;
> +        switch (vece) {
> +        case MO_16:
> +            return have_avx512bw;
> +        case MO_32:
> +            return have_avx2;
> +        case MO_64:
> +            return have_avx512vl;
> +        }
> +        return 0;
>      case INDEX_op_rotlv_vec:
>      case INDEX_op_rotrv_vec:
>          return have_avx2 && vece >= MO_32 ? -1 : 0;


-- 
Alex Bennée


^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH 09/20] tcg/i386: Implement avx512 scalar shift
  2021-12-18 19:42 ` [PATCH 09/20] tcg/i386: Implement avx512 scalar shift Richard Henderson
@ 2022-02-02 13:48   ` Alex Bennée
  0 siblings, 0 replies; 51+ messages in thread
From: Alex Bennée @ 2022-02-02 13:48 UTC (permalink / raw)
  To: Richard Henderson; +Cc: qemu-devel


Richard Henderson <richard.henderson@linaro.org> writes:

> AVX512VL has VPSRAQ.
>
> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>

-- 
Alex Bennée


^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH 10/20] tcg/i386: Implement avx512 immediate sari shift
  2021-12-18 19:42 ` [PATCH 10/20] tcg/i386: Implement avx512 immediate sari shift Richard Henderson
@ 2022-02-02 14:02   ` Alex Bennée
  0 siblings, 0 replies; 51+ messages in thread
From: Alex Bennée @ 2022-02-02 14:02 UTC (permalink / raw)
  To: Richard Henderson; +Cc: qemu-devel


Richard Henderson <richard.henderson@linaro.org> writes:

> AVX512 has VPSRAQ with immediate operand, in the same form as
> with AVX, but requires EVEX encoding and W1.
>
> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>

-- 
Alex Bennée


^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH 11/20] tcg/i386: Implement avx512 immediate rotate
  2021-12-18 19:42 ` [PATCH 11/20] tcg/i386: Implement avx512 immediate rotate Richard Henderson
@ 2022-02-02 14:05   ` Alex Bennée
  2022-02-03  1:26     ` Richard Henderson
  0 siblings, 1 reply; 51+ messages in thread
From: Alex Bennée @ 2022-02-02 14:05 UTC (permalink / raw)
  To: Richard Henderson; +Cc: qemu-devel


Richard Henderson <richard.henderson@linaro.org> writes:

> AVX512VL has VPROLD and VPROLQ, layered onto the same
> opcode as PSHIFTD, but requires EVEX encoding and W.
>
> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
> ---
>  tcg/i386/tcg-target.h     |  2 +-
>  tcg/i386/tcg-target.c.inc | 15 +++++++++++++--
>  2 files changed, 14 insertions(+), 3 deletions(-)
>
> diff --git a/tcg/i386/tcg-target.h b/tcg/i386/tcg-target.h
> index 12d098ad6c..38c09fd66c 100644
> --- a/tcg/i386/tcg-target.h
> +++ b/tcg/i386/tcg-target.h
> @@ -195,7 +195,7 @@ extern bool have_movbe;
>  #define TCG_TARGET_HAS_not_vec          0
>  #define TCG_TARGET_HAS_neg_vec          0
>  #define TCG_TARGET_HAS_abs_vec          1
> -#define TCG_TARGET_HAS_roti_vec         0
> +#define TCG_TARGET_HAS_roti_vec         have_avx512vl
>  #define TCG_TARGET_HAS_rots_vec         0
>  #define TCG_TARGET_HAS_rotv_vec         0
>  #define TCG_TARGET_HAS_shi_vec          1
> diff --git a/tcg/i386/tcg-target.c.inc b/tcg/i386/tcg-target.c.inc
> index c4e6f2e5ea..5ab7c4c0fa 100644
> --- a/tcg/i386/tcg-target.c.inc
> +++ b/tcg/i386/tcg-target.c.inc
> @@ -361,7 +361,7 @@ static bool tcg_target_const_match(int64_t val, TCGType type, int ct)
>  #define OPC_PSHUFLW     (0x70 | P_EXT | P_SIMDF2)
>  #define OPC_PSHUFHW     (0x70 | P_EXT | P_SIMDF3)
>  #define OPC_PSHIFTW_Ib  (0x71 | P_EXT | P_DATA16) /* /2 /6 /4 */
> -#define OPC_PSHIFTD_Ib  (0x72 | P_EXT | P_DATA16) /* /2 /6 /4 */
> +#define OPC_PSHIFTD_Ib  (0x72 | P_EXT | P_DATA16) /* /1 /2 /6 /4 */
>  #define OPC_PSHIFTQ_Ib  (0x73 | P_EXT | P_DATA16) /* /2 /6 /4 */
>  #define OPC_PSLLW       (0xf1 | P_EXT | P_DATA16)
>  #define OPC_PSLLD       (0xf2 | P_EXT | P_DATA16)
> @@ -2906,6 +2906,14 @@ static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc,
>              insn |= P_VEXW | P_EVEX;
>          }
>          sub = 4;
> +        goto gen_shift;
> +    case INDEX_op_rotli_vec:
> +        insn = OPC_PSHIFTD_Ib | P_EVEX;  /* VPROL[DQ] */
> +        if (vece == MO_64) {
> +            insn |= P_VEXW;
> +        }
> +        sub = 1;
> +        goto gen_shift;

This could just be a /* fall-through */ although given the large amount
of gotos the switch statement is gathering I'm not sure it makes too
much difference.

Is there any reason why gen_shift couldn't be pushed into a helper
function so we just had:

    static void tcg_out_vec_shift(s, vece, insn, sub, a0, a1, a2) {
        tcg_debug_assert(vece != MO_8);
        if (type == TCG_TYPE_V256) {
            insn |= P_VEXL;
        }
        tcg_out_vex_modrm(s, insn, sub, a0, a1);
        tcg_out8(s, a2);
    }

    ...

    case INDEX_op_rotli_vec:
        insn = OPC_PSHIFTD_Ib | P_EVEX;  /* VPROL[DQ] */
        if (vece == MO_64) {
            insn |= P_VEXW;
        }
        tcg_out_vec_shift(s, vece, insn, 1, a0, a1, a2);
        break;

Surely the compiler would inline if needed (and even if it didn't, is the
code generation that critical that we care about a few cycles)?


-- 
Alex Bennée


^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH 12/20] tcg/i386: Implement avx512 variable rotate
  2021-12-18 19:42 ` [PATCH 12/20] tcg/i386: Implement avx512 variable rotate Richard Henderson
@ 2022-02-02 14:14   ` Alex Bennée
  0 siblings, 0 replies; 51+ messages in thread
From: Alex Bennée @ 2022-02-02 14:14 UTC (permalink / raw)
  To: Richard Henderson; +Cc: qemu-devel


Richard Henderson <richard.henderson@linaro.org> writes:

> AVX512VL has VPROLVQ and VPRORVQ.
>
> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>

I could make the same comment from the previous patch about the goto
gen_simd stuff. Anyway:

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>

-- 
Alex Bennée


^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH 13/20] tcg/i386: Support avx512vbmi2 vector shift-double instructions
  2021-12-18 19:42 ` [PATCH 13/20] tcg/i386: Support avx512vbmi2 vector shift-double instructions Richard Henderson
@ 2022-02-02 14:28   ` Alex Bennée
  0 siblings, 0 replies; 51+ messages in thread
From: Alex Bennée @ 2022-02-02 14:28 UTC (permalink / raw)
  To: Richard Henderson; +Cc: qemu-devel


Richard Henderson <richard.henderson@linaro.org> writes:

> We will use VPSHLD, VPSHLDV and VPSHRDV for 16-bit rotates.
>
> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>

-- 
Alex Bennée


^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH 11/20] tcg/i386: Implement avx512 immediate rotate
  2022-02-02 14:05   ` Alex Bennée
@ 2022-02-03  1:26     ` Richard Henderson
  0 siblings, 0 replies; 51+ messages in thread
From: Richard Henderson @ 2022-02-03  1:26 UTC (permalink / raw)
  To: Alex Bennée; +Cc: qemu-devel

On 2/3/22 01:05, Alex Bennée wrote:
> Is there any reason why gen_shift couldn't be pushed into a helper
> function so we just had:
> 
>      static void tcg_out_vec_shift(s, vece, insn, sub, a0, a1, a2) {
>          tcg_debug_assert(vece != MO_8);
>          if (type == TCG_TYPE_V256) {
>              insn |= P_VEXL;
>          }
>          tcg_out_vex_modrm(s, insn, sub, a0, a1);
>          tcg_out8(s, a2);
>      }
> 
>      ...
> 
>      case INDEX_op_rotli_vec:
>          insn = OPC_PSHIFTD_Ib | P_EVEX;  /* VPROL[DQ] */
>          if (vece == MO_64) {
>              insn |= P_VEXW;
>          }
>          tcg_out_vec_shift(s, vece, insn, 1, a0, a1, a2);
>          break;
> 
> Surely the compiler would inline if needed (and even if it didn't, is the
> code generation that critical that we care about a few cycles)?

Yes, I suppose I could pull out a helper or two.
Just one of those things where something used to be cleaner, and then the code grew.

r~



^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH 00/20] tcg: vector improvements
  2021-12-18 19:42 [PATCH 00/20] tcg: vector improvements Richard Henderson
                   ` (20 preceding siblings ...)
  2022-01-29  9:28 ` [PATCH 00/20] tcg: vector improvements Richard Henderson
@ 2022-02-03 10:25 ` Alex Bennée
  21 siblings, 0 replies; 51+ messages in thread
From: Alex Bennée @ 2022-02-03 10:25 UTC (permalink / raw)
  To: Richard Henderson; +Cc: qemu-devel


Richard Henderson <richard.henderson@linaro.org> writes:

> Add some opcodes for compound logic operations that were so
> far marked as TODO.  Implement those for PPC and S390X.
>
> We do not want to implement 512-bit width operations, because
> those trigger a cluster clock slowdown on the current set of
> Intel cpus.  But there are new operations in avx512 that apply
> to 128 and 256-bit vectors, which do not trigger the slowdown,
> and those are very interesting.

So with a tweak to the vector tests patches I sent yesterday and running
on hackbox (which has AVX on it) I got coverage in tcg/i386 from:

    	   Hit 	Total 	Coverage
Lines: 	   839 	1768 	47.5 %
Functions: 56 	81 	69.1 %
Branches: 336 	864 	38.9 %

to:

           Hit 	Total 	Coverage
Lines: 	   1077 1668 	64.6 %
Functions: 68 	77 	88.3 %
Branches: 504 	852 	59.2 %

which I think warrants a:

Tested-by: Alex Bennée <alex.bennee@linaro.org>

for the series.

-- 
Alex Bennée


^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH 14/20] tcg/i386: Expand vector word rotate as avx512vbmi2 shift-double
  2021-12-18 19:42 ` [PATCH 14/20] tcg/i386: Expand vector word rotate as avx512vbmi2 shift-double Richard Henderson
@ 2022-02-03 10:32   ` Alex Bennée
  0 siblings, 0 replies; 51+ messages in thread
From: Alex Bennée @ 2022-02-03 10:32 UTC (permalink / raw)
  To: Richard Henderson; +Cc: qemu-devel


Richard Henderson <richard.henderson@linaro.org> writes:

> While there are no specific 16-bit rotate instructions, there
> are double-word shifts, which can perform the same operation.
>
> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>

Even hackbox can't utilise these - maybe it's time to request an upgrade
for my dev box ;-)

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>

-- 
Alex Bennée


^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH 15/20] tcg/i386: Remove rotls_vec from tcg_target_op_def
  2021-12-18 19:42 ` [PATCH 15/20] tcg/i386: Remove rotls_vec from tcg_target_op_def Richard Henderson
@ 2022-02-03 10:34   ` Alex Bennée
  0 siblings, 0 replies; 51+ messages in thread
From: Alex Bennée @ 2022-02-03 10:34 UTC (permalink / raw)
  To: Richard Henderson; +Cc: qemu-devel


Richard Henderson <richard.henderson@linaro.org> writes:

> There is no such instruction on x86, so we should
> not be pretending it has arguments.
>
> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>

-- 
Alex Bennée


^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH 16/20] tcg/i386: Expand scalar rotate with avx512 insns
  2021-12-18 19:42 ` [PATCH 16/20] tcg/i386: Expand scalar rotate with avx512 insns Richard Henderson
@ 2022-02-03 10:38   ` Alex Bennée
  0 siblings, 0 replies; 51+ messages in thread
From: Alex Bennée @ 2022-02-03 10:38 UTC (permalink / raw)
  To: Richard Henderson; +Cc: qemu-devel


Richard Henderson <richard.henderson@linaro.org> writes:

> Expand 32-bit and 64-bit scalar rotate with VPRO[LR]V;
> expand 16-bit scalar rotate with VPSHLDV.
>
> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>

Again couldn't test but looks sane:

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>

-- 
Alex Bennée


^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH 17/20] tcg/i386: Implement avx512 min/max/abs
  2021-12-18 19:42 ` [PATCH 17/20] tcg/i386: Implement avx512 min/max/abs Richard Henderson
@ 2022-02-03 10:44   ` Alex Bennée
  0 siblings, 0 replies; 51+ messages in thread
From: Alex Bennée @ 2022-02-03 10:44 UTC (permalink / raw)
  To: Richard Henderson; +Cc: qemu-devel


Richard Henderson <richard.henderson@linaro.org> writes:

> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>

-- 
Alex Bennée


^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH 18/20] tcg/i386: Implement avx512 multiply
  2021-12-18 19:42 ` [PATCH 18/20] tcg/i386: Implement avx512 multiply Richard Henderson
@ 2022-02-03 10:45   ` Alex Bennée
  0 siblings, 0 replies; 51+ messages in thread
From: Alex Bennée @ 2022-02-03 10:45 UTC (permalink / raw)
  To: Richard Henderson; +Cc: qemu-devel


Richard Henderson <richard.henderson@linaro.org> writes:

> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>

-- 
Alex Bennée


^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH 19/20] tcg/i386: Implement more logical operations for avx512
  2021-12-18 19:42 ` [PATCH 19/20] tcg/i386: Implement more logical operations for avx512 Richard Henderson
@ 2022-02-03 10:46   ` Alex Bennée
  2022-02-03 21:54     ` Richard Henderson
  0 siblings, 1 reply; 51+ messages in thread
From: Alex Bennée @ 2022-02-03 10:46 UTC (permalink / raw)
  To: Richard Henderson; +Cc: qemu-devel


Richard Henderson <richard.henderson@linaro.org> writes:

> The general ternary logic operation can implement
> NOT, ORC, NAND, NOR, EQV.

Can we not fall back to expanding to use SSE if we don't have AVX512
available or is that done by the existing fallback expansion?

Anyway:

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>

-- 
Alex Bennée


^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH 20/20] tcg/i386: Implement bitsel for avx512
  2021-12-18 19:42 ` [PATCH 20/20] tcg/i386: Implement bitsel " Richard Henderson
@ 2022-02-03 10:51   ` Alex Bennée
  0 siblings, 0 replies; 51+ messages in thread
From: Alex Bennée @ 2022-02-03 10:51 UTC (permalink / raw)
  To: Richard Henderson; +Cc: qemu-devel


Richard Henderson <richard.henderson@linaro.org> writes:

> The general ternary logic operation can implement BITSEL.
> Funnel the 4-operand operation into three variants of the
> 3-operand instruction, depending on input operand overlap.
>
> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>

-- 
Alex Bennée


^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH 19/20] tcg/i386: Implement more logical operations for avx512
  2022-02-03 10:46   ` Alex Bennée
@ 2022-02-03 21:54     ` Richard Henderson
  0 siblings, 0 replies; 51+ messages in thread
From: Richard Henderson @ 2022-02-03 21:54 UTC (permalink / raw)
  To: Alex Bennée; +Cc: qemu-devel

On 2/3/22 21:46, Alex Bennée wrote:
> 
> Richard Henderson <richard.henderson@linaro.org> writes:
> 
>> The general ternary logic operation can implement
>> NOT, ORC, NAND, NOR, EQV.
> 
> Can we not fall back to expanding to use SSE if we don't have AVX512
> available or is that done by the existing fallback expansion?

Generic code does that for us.


r~


^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH 04/20] tcg/s390x: Implement vector NAND, NOR, EQV
  2021-12-18 19:42 ` [PATCH 04/20] tcg/s390x: " Richard Henderson
                     ` (2 preceding siblings ...)
  2022-02-01 18:31   ` Alex Bennée
@ 2024-01-03 13:21   ` Philippe Mathieu-Daudé
  2024-01-03 21:58     ` Richard Henderson
  3 siblings, 1 reply; 51+ messages in thread
From: Philippe Mathieu-Daudé @ 2024-01-03 13:21 UTC (permalink / raw)
  To: Richard Henderson, qemu-devel; +Cc: Alex Bennée, qemu-s390x

Hi Richard,

(revisiting this old patch which is now commit 21eab5bfae)

On 18/12/21 20:42, Richard Henderson wrote:
> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
> ---
>   tcg/s390x/tcg-target.h     |  6 +++---
>   tcg/s390x/tcg-target.c.inc | 17 +++++++++++++++++
>   2 files changed, 20 insertions(+), 3 deletions(-)
> 
> diff --git a/tcg/s390x/tcg-target.h b/tcg/s390x/tcg-target.h
> index ad29e62b16..fef227b0fe 100644
> --- a/tcg/s390x/tcg-target.h
> +++ b/tcg/s390x/tcg-target.h
> @@ -145,9 +145,9 @@ extern uint64_t s390_facilities[3];
>   
>   #define TCG_TARGET_HAS_andc_vec       1
>   #define TCG_TARGET_HAS_orc_vec        HAVE_FACILITY(VECTOR_ENH1)
> -#define TCG_TARGET_HAS_nand_vec       0
> -#define TCG_TARGET_HAS_nor_vec        0
> -#define TCG_TARGET_HAS_eqv_vec        0
> +#define TCG_TARGET_HAS_nand_vec       HAVE_FACILITY(VECTOR_ENH1)
> +#define TCG_TARGET_HAS_nor_vec        1
> +#define TCG_TARGET_HAS_eqv_vec        HAVE_FACILITY(VECTOR_ENH1)

Here some opcodes are conditional, ...

>   #define TCG_TARGET_HAS_not_vec        1
>   #define TCG_TARGET_HAS_neg_vec        1
>   #define TCG_TARGET_HAS_abs_vec        1
> diff --git a/tcg/s390x/tcg-target.c.inc b/tcg/s390x/tcg-target.c.inc
> index 57e803e339..5a90b892cb 100644
> --- a/tcg/s390x/tcg-target.c.inc
> +++ b/tcg/s390x/tcg-target.c.inc
> @@ -288,7 +288,9 @@ typedef enum S390Opcode {
>       VRRc_VMXL   = 0xe7fd,
>       VRRc_VN     = 0xe768,
>       VRRc_VNC    = 0xe769,
> +    VRRc_VNN    = 0xe76e,
>       VRRc_VNO    = 0xe76b,
> +    VRRc_VNX    = 0xe76c,
>       VRRc_VO     = 0xe76a,
>       VRRc_VOC    = 0xe76f,
>       VRRc_VPKS   = 0xe797,   /* we leave the m5 cs field 0 */
> @@ -2750,6 +2752,15 @@ static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc,
>       case INDEX_op_xor_vec:
>           tcg_out_insn(s, VRRc, VX, a0, a1, a2, 0);
>           break;
> +    case INDEX_op_nand_vec:
> +        tcg_out_insn(s, VRRc, VNN, a0, a1, a2, 0);
> +        break;
> +    case INDEX_op_nor_vec:
> +        tcg_out_insn(s, VRRc, VNO, a0, a1, a2, 0);
> +        break;
> +    case INDEX_op_eqv_vec:
> +        tcg_out_insn(s, VRRc, VNX, a0, a1, a2, 0);
> +        break;
>   
>       case INDEX_op_shli_vec:
>           tcg_out_insn(s, VRSa, VESL, a0, a2, TCG_REG_NONE, a1, vece);
> @@ -2846,7 +2857,10 @@ int tcg_can_emit_vec_op(TCGOpcode opc, TCGType type, unsigned vece)
>       case INDEX_op_and_vec:
>       case INDEX_op_andc_vec:
>       case INDEX_op_bitsel_vec:
> +    case INDEX_op_eqv_vec:
> +    case INDEX_op_nand_vec:

... but here we unconditionally return 1 for them.

Shouldn't we return TCG_TARGET_HAS_opcode instead?

>       case INDEX_op_neg_vec:
> +    case INDEX_op_nor_vec:
>       case INDEX_op_not_vec:
>       case INDEX_op_or_vec:
>       case INDEX_op_orc_vec:

(expanding context)

             return 1;

> @@ -3191,6 +3205,9 @@ static TCGConstraintSetIndex tcg_target_op_def(TCGOpcode op)
>       case INDEX_op_or_vec:
>       case INDEX_op_orc_vec:
>       case INDEX_op_xor_vec:
> +    case INDEX_op_nand_vec:
> +    case INDEX_op_nor_vec:
> +    case INDEX_op_eqv_vec:
>       case INDEX_op_cmp_vec:
>       case INDEX_op_mul_vec:
>       case INDEX_op_rotlv_vec:



^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH 04/20] tcg/s390x: Implement vector NAND, NOR, EQV
  2024-01-03 13:21   ` Philippe Mathieu-Daudé
@ 2024-01-03 21:58     ` Richard Henderson
  0 siblings, 0 replies; 51+ messages in thread
From: Richard Henderson @ 2024-01-03 21:58 UTC (permalink / raw)
  To: Philippe Mathieu-Daudé, qemu-devel; +Cc: Alex Bennée, qemu-s390x

On 1/4/24 00:21, Philippe Mathieu-Daudé wrote:
> Hi Richard,
> 
> (revisiting this old patch which is now commit 21eab5bfae)
> 
> On 18/12/21 20:42, Richard Henderson wrote:
>> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
>> ---
>>   tcg/s390x/tcg-target.h     |  6 +++---
>>   tcg/s390x/tcg-target.c.inc | 17 +++++++++++++++++
>>   2 files changed, 20 insertions(+), 3 deletions(-)
>>
>> diff --git a/tcg/s390x/tcg-target.h b/tcg/s390x/tcg-target.h
>> index ad29e62b16..fef227b0fe 100644
>> --- a/tcg/s390x/tcg-target.h
>> +++ b/tcg/s390x/tcg-target.h
>> @@ -145,9 +145,9 @@ extern uint64_t s390_facilities[3];
>>   #define TCG_TARGET_HAS_andc_vec       1
>>   #define TCG_TARGET_HAS_orc_vec        HAVE_FACILITY(VECTOR_ENH1)
>> -#define TCG_TARGET_HAS_nand_vec       0
>> -#define TCG_TARGET_HAS_nor_vec        0
>> -#define TCG_TARGET_HAS_eqv_vec        0
>> +#define TCG_TARGET_HAS_nand_vec       HAVE_FACILITY(VECTOR_ENH1)
>> +#define TCG_TARGET_HAS_nor_vec        1
>> +#define TCG_TARGET_HAS_eqv_vec        HAVE_FACILITY(VECTOR_ENH1)
> 
> Here some opcodes are conditional, ...
> 
>>   #define TCG_TARGET_HAS_not_vec        1
>>   #define TCG_TARGET_HAS_neg_vec        1
>>   #define TCG_TARGET_HAS_abs_vec        1
>> diff --git a/tcg/s390x/tcg-target.c.inc b/tcg/s390x/tcg-target.c.inc
>> index 57e803e339..5a90b892cb 100644
>> --- a/tcg/s390x/tcg-target.c.inc
>> +++ b/tcg/s390x/tcg-target.c.inc
>> @@ -288,7 +288,9 @@ typedef enum S390Opcode {
>>       VRRc_VMXL   = 0xe7fd,
>>       VRRc_VN     = 0xe768,
>>       VRRc_VNC    = 0xe769,
>> +    VRRc_VNN    = 0xe76e,
>>       VRRc_VNO    = 0xe76b,
>> +    VRRc_VNX    = 0xe76c,
>>       VRRc_VO     = 0xe76a,
>>       VRRc_VOC    = 0xe76f,
>>       VRRc_VPKS   = 0xe797,   /* we leave the m5 cs field 0 */
>> @@ -2750,6 +2752,15 @@ static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc,
>>       case INDEX_op_xor_vec:
>>           tcg_out_insn(s, VRRc, VX, a0, a1, a2, 0);
>>           break;
>> +    case INDEX_op_nand_vec:
>> +        tcg_out_insn(s, VRRc, VNN, a0, a1, a2, 0);
>> +        break;
>> +    case INDEX_op_nor_vec:
>> +        tcg_out_insn(s, VRRc, VNO, a0, a1, a2, 0);
>> +        break;
>> +    case INDEX_op_eqv_vec:
>> +        tcg_out_insn(s, VRRc, VNX, a0, a1, a2, 0);
>> +        break;
>>       case INDEX_op_shli_vec:
>>           tcg_out_insn(s, VRSa, VESL, a0, a2, TCG_REG_NONE, a1, vece);
>> @@ -2846,7 +2857,10 @@ int tcg_can_emit_vec_op(TCGOpcode opc, TCGType type, unsigned vece)
>>       case INDEX_op_and_vec:
>>       case INDEX_op_andc_vec:
>>       case INDEX_op_bitsel_vec:
>> +    case INDEX_op_eqv_vec:
>> +    case INDEX_op_nand_vec:
> 
> ... but here we unconditionally return 1 for them.
> 
> Shouldn't we return TCG_TARGET_HAS_opcode instead?

Yes, you're right.  There's some logical overlap between tcg_can_emit_vec_op and
tcg_op_supported, and I think I confused myself a bit there.


r~


^ permalink raw reply	[flat|nested] 51+ messages in thread

end of thread, other threads:[~2024-01-03 21:59 UTC | newest]

Thread overview: 51+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-12-18 19:42 [PATCH 00/20] tcg: vector improvements Richard Henderson
2021-12-18 19:42 ` [PATCH 01/20] tcg/optimize: Fix folding of vector ops Richard Henderson
2021-12-19 11:37   ` Philippe Mathieu-Daudé
2021-12-18 19:42 ` [PATCH 02/20] tcg: Add opcodes for vector nand, nor, eqv Richard Henderson
2021-12-19 11:28   ` Philippe Mathieu-Daudé
2022-02-01 18:28   ` Alex Bennée
2021-12-18 19:42 ` [PATCH 03/20] tcg/ppc: Implement vector NAND, NOR, EQV Richard Henderson
2021-12-19  0:15   ` Philippe Mathieu-Daudé
2022-02-01 18:29   ` Alex Bennée
2021-12-18 19:42 ` [PATCH 04/20] tcg/s390x: " Richard Henderson
2021-12-19  0:17   ` Philippe Mathieu-Daudé
2022-02-01 18:29   ` Alex Bennée
2022-02-01 18:31   ` Alex Bennée
2024-01-03 13:21   ` Philippe Mathieu-Daudé
2024-01-03 21:58     ` Richard Henderson
2021-12-18 19:42 ` [PATCH 05/20] tcg/i386: Detect AVX512 Richard Henderson
2022-02-01 18:41   ` Alex Bennée
2021-12-18 19:42 ` [PATCH 06/20] tcg/i386: Add tcg_out_evex_opc Richard Henderson
2022-02-01 19:20   ` Alex Bennée
2021-12-18 19:42 ` [PATCH 07/20] tcg/i386: Use tcg_can_emit_vec_op in expand_vec_cmp_noinv Richard Henderson
2022-02-01 19:21   ` Alex Bennée
2021-12-18 19:42 ` [PATCH 08/20] tcg/i386: Implement avx512 variable shifts Richard Henderson
2022-02-01 20:33   ` Alex Bennée
2021-12-18 19:42 ` [PATCH 09/20] tcg/i386: Implement avx512 scalar shift Richard Henderson
2022-02-02 13:48   ` Alex Bennée
2021-12-18 19:42 ` [PATCH 10/20] tcg/i386: Implement avx512 immediate sari shift Richard Henderson
2022-02-02 14:02   ` Alex Bennée
2021-12-18 19:42 ` [PATCH 11/20] tcg/i386: Implement avx512 immediate rotate Richard Henderson
2022-02-02 14:05   ` Alex Bennée
2022-02-03  1:26     ` Richard Henderson
2021-12-18 19:42 ` [PATCH 12/20] tcg/i386: Implement avx512 variable rotate Richard Henderson
2022-02-02 14:14   ` Alex Bennée
2021-12-18 19:42 ` [PATCH 13/20] tcg/i386: Support avx512vbmi2 vector shift-double instructions Richard Henderson
2022-02-02 14:28   ` Alex Bennée
2021-12-18 19:42 ` [PATCH 14/20] tcg/i386: Expand vector word rotate as avx512vbmi2 shift-double Richard Henderson
2022-02-03 10:32   ` Alex Bennée
2021-12-18 19:42 ` [PATCH 15/20] tcg/i386: Remove rotls_vec from tcg_target_op_def Richard Henderson
2022-02-03 10:34   ` Alex Bennée
2021-12-18 19:42 ` [PATCH 16/20] tcg/i386: Expand scalar rotate with avx512 insns Richard Henderson
2022-02-03 10:38   ` Alex Bennée
2021-12-18 19:42 ` [PATCH 17/20] tcg/i386: Implement avx512 min/max/abs Richard Henderson
2022-02-03 10:44   ` Alex Bennée
2021-12-18 19:42 ` [PATCH 18/20] tcg/i386: Implement avx512 multiply Richard Henderson
2022-02-03 10:45   ` Alex Bennée
2021-12-18 19:42 ` [PATCH 19/20] tcg/i386: Implement more logical operations for avx512 Richard Henderson
2022-02-03 10:46   ` Alex Bennée
2022-02-03 21:54     ` Richard Henderson
2021-12-18 19:42 ` [PATCH 20/20] tcg/i386: Implement bitsel " Richard Henderson
2022-02-03 10:51   ` Alex Bennée
2022-01-29  9:28 ` [PATCH 00/20] tcg: vector improvements Richard Henderson
2022-02-03 10:25 ` Alex Bennée

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.