[PATCH v3 00/11] s390x/tcg: Implement Vector-Enhancements Facility 2

All of lore.kernel.org
 help / color / mirror / Atom feed

* [PATCH v3 00/11] s390x/tcg: Implement Vector-Enhancements Facility 2
@ 2022-03-08  1:53 Richard Henderson
  2022-03-08  1:53 ` [PATCH v3 01/11] tcg: Implement tcg_gen_{h,w}swap_{i32,i64} Richard Henderson
                   ` (11 more replies)
  0 siblings, 12 replies; 29+ messages in thread
From: Richard Henderson @ 2022-03-08  1:53 UTC (permalink / raw)
  To: qemu-devel; +Cc: qemu-s390x, dmiller423

Hi David,

I've split up the patches a bit, made some improvements to
the shifts and reversals, and fixed a few bugs.

Please especially review vector string search, as that
has had major changes.


r~


David Miller (9):
  target/s390x: vxeh2: vector convert short/32b
  target/s390x: vxeh2: vector string search
  target/s390x: vxeh2: Update for changes to vector shifts
  target/s390x: vxeh2: vector shift double by bit
  target/s390x: vxeh2: vector {load, store} elements reversed
  target/s390x: vxeh2: vector {load, store} byte reversed elements
  target/s390x: vxeh2: vector {load, store} byte reversed element
  target/s390x: add S390_FEAT_VECTOR_ENH2 to cpu max
  tests/tcg/s390x: Tests for Vector Enhancements Facility 2

Richard Henderson (2):
  tcg: Implement tcg_gen_{h,w}swap_{i32,i64}
  target/s390x: Fix writeback to v1 in helper_vstl

 include/tcg/tcg-op.h                 |   6 +
 target/s390x/helper.h                |  13 +
 target/s390x/gen-features.c          |   2 +
 target/s390x/tcg/translate.c         |   3 +-
 target/s390x/tcg/vec_fpu_helper.c    |  31 ++
 target/s390x/tcg/vec_helper.c        |   2 -
 target/s390x/tcg/vec_int_helper.c    |  58 ++++
 target/s390x/tcg/vec_string_helper.c | 101 ++++++
 tcg/tcg-op.c                         |  30 ++
 tests/tcg/s390x/vxeh2_vcvt.c         |  97 ++++++
 tests/tcg/s390x/vxeh2_vlstr.c        | 146 +++++++++
 tests/tcg/s390x/vxeh2_vs.c           |  91 ++++++
 target/s390x/tcg/translate_vx.c.inc  | 442 ++++++++++++++++++++++++---
 target/s390x/tcg/insn-data.def       |  40 ++-
 tests/tcg/s390x/Makefile.target      |   8 +
 15 files changed, 1018 insertions(+), 52 deletions(-)
 create mode 100644 tests/tcg/s390x/vxeh2_vcvt.c
 create mode 100644 tests/tcg/s390x/vxeh2_vlstr.c
 create mode 100644 tests/tcg/s390x/vxeh2_vs.c

-- 
2.25.1



^ permalink raw reply	[flat|nested] 29+ messages in thread

* [PATCH v3 01/11] tcg: Implement tcg_gen_{h,w}swap_{i32,i64}
  2022-03-08  1:53 [PATCH v3 00/11] s390x/tcg: Implement Vector-Enhancements Facility 2 Richard Henderson
@ 2022-03-08  1:53 ` Richard Henderson
  2022-03-21  9:32   ` David Hildenbrand
  2022-03-08  1:53 ` [PATCH v3 02/11] target/s390x: vxeh2: vector convert short/32b Richard Henderson
                   ` (10 subsequent siblings)
  11 siblings, 1 reply; 29+ messages in thread
From: Richard Henderson @ 2022-03-08  1:53 UTC (permalink / raw)
  To: qemu-devel; +Cc: qemu-s390x, dmiller423

Swap half-words (16-bit) and words (32-bit) within a larger value.
Mirrors functions of the same names within include/qemu/bitops.h.

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 include/tcg/tcg-op.h |  6 ++++++
 tcg/tcg-op.c         | 30 ++++++++++++++++++++++++++++++
 2 files changed, 36 insertions(+)

diff --git a/include/tcg/tcg-op.h b/include/tcg/tcg-op.h
index caa0a63612..b09b8b4a05 100644
--- a/include/tcg/tcg-op.h
+++ b/include/tcg/tcg-op.h
@@ -332,6 +332,7 @@ void tcg_gen_ext8u_i32(TCGv_i32 ret, TCGv_i32 arg);
 void tcg_gen_ext16u_i32(TCGv_i32 ret, TCGv_i32 arg);
 void tcg_gen_bswap16_i32(TCGv_i32 ret, TCGv_i32 arg, int flags);
 void tcg_gen_bswap32_i32(TCGv_i32 ret, TCGv_i32 arg);
+void tcg_gen_hswap_i32(TCGv_i32 ret, TCGv_i32 arg);
 void tcg_gen_smin_i32(TCGv_i32, TCGv_i32 arg1, TCGv_i32 arg2);
 void tcg_gen_smax_i32(TCGv_i32, TCGv_i32 arg1, TCGv_i32 arg2);
 void tcg_gen_umin_i32(TCGv_i32, TCGv_i32 arg1, TCGv_i32 arg2);
@@ -531,6 +532,8 @@ void tcg_gen_ext32u_i64(TCGv_i64 ret, TCGv_i64 arg);
 void tcg_gen_bswap16_i64(TCGv_i64 ret, TCGv_i64 arg, int flags);
 void tcg_gen_bswap32_i64(TCGv_i64 ret, TCGv_i64 arg, int flags);
 void tcg_gen_bswap64_i64(TCGv_i64 ret, TCGv_i64 arg);
+void tcg_gen_hswap_i64(TCGv_i64 ret, TCGv_i64 arg);
+void tcg_gen_wswap_i64(TCGv_i64 ret, TCGv_i64 arg);
 void tcg_gen_smin_i64(TCGv_i64, TCGv_i64 arg1, TCGv_i64 arg2);
 void tcg_gen_smax_i64(TCGv_i64, TCGv_i64 arg1, TCGv_i64 arg2);
 void tcg_gen_umin_i64(TCGv_i64, TCGv_i64 arg1, TCGv_i64 arg2);
@@ -1077,6 +1080,8 @@ void tcg_gen_stl_vec(TCGv_vec r, TCGv_ptr base, TCGArg offset, TCGType t);
 #define tcg_gen_bswap32_tl tcg_gen_bswap32_i64
 #define tcg_gen_bswap64_tl tcg_gen_bswap64_i64
 #define tcg_gen_bswap_tl tcg_gen_bswap64_i64
+#define tcg_gen_hswap_tl tcg_gen_hswap_i64
+#define tcg_gen_wswap_tl tcg_gen_wswap_i64
 #define tcg_gen_concat_tl_i64 tcg_gen_concat32_i64
 #define tcg_gen_extr_i64_tl tcg_gen_extr32_i64
 #define tcg_gen_andc_tl tcg_gen_andc_i64
@@ -1192,6 +1197,7 @@ void tcg_gen_stl_vec(TCGv_vec r, TCGv_ptr base, TCGArg offset, TCGType t);
 #define tcg_gen_bswap16_tl tcg_gen_bswap16_i32
 #define tcg_gen_bswap32_tl(D, S, F) tcg_gen_bswap32_i32(D, S)
 #define tcg_gen_bswap_tl tcg_gen_bswap32_i32
+#define tcg_gen_hswap_tl tcg_gen_hswap_i32
 #define tcg_gen_concat_tl_i64 tcg_gen_concat_i32_i64
 #define tcg_gen_extr_i64_tl tcg_gen_extr_i64_i32
 #define tcg_gen_andc_tl tcg_gen_andc_i32
diff --git a/tcg/tcg-op.c b/tcg/tcg-op.c
index 65e1c94c2d..379adb4b9f 100644
--- a/tcg/tcg-op.c
+++ b/tcg/tcg-op.c
@@ -1056,6 +1056,12 @@ void tcg_gen_bswap32_i32(TCGv_i32 ret, TCGv_i32 arg)
     }
 }
 
+void tcg_gen_hswap_i32(TCGv_i32 ret, TCGv_i32 arg)
+{
+    /* Swapping 2 16-bit elements is a rotate. */
+    tcg_gen_rotli_i32(ret, arg, 16);
+}
+
 void tcg_gen_smin_i32(TCGv_i32 ret, TCGv_i32 a, TCGv_i32 b)
 {
     tcg_gen_movcond_i32(TCG_COND_LT, ret, a, b, a, b);
@@ -1792,6 +1798,30 @@ void tcg_gen_bswap64_i64(TCGv_i64 ret, TCGv_i64 arg)
     }
 }
 
+void tcg_gen_hswap_i64(TCGv_i64 ret, TCGv_i64 arg)
+{
+    uint64_t m = 0x0000ffff0000ffffull;
+    TCGv_i64 t0 = tcg_temp_new_i64();
+    TCGv_i64 t1 = tcg_temp_new_i64();
+
+    /* See include/qemu/bitops.h, hswap64. */
+    tcg_gen_rotli_i64(t1, arg, 32);
+    tcg_gen_andi_i64(t0, t1, m);
+    tcg_gen_shri_i64(t1, t1, 16);
+    tcg_gen_shli_i64(t0, t0, 16);
+    tcg_gen_andi_i64(t1, t1, m);
+    tcg_gen_or_i64(ret, t0, t1);
+
+    tcg_temp_free_i64(t0);
+    tcg_temp_free_i64(t1);
+}
+
+void tcg_gen_wswap_i64(TCGv_i64 ret, TCGv_i64 arg)
+{
+    /* Swapping 2 32-bit elements is a rotate. */
+    tcg_gen_rotli_i64(ret, arg, 32);
+}
+
 void tcg_gen_not_i64(TCGv_i64 ret, TCGv_i64 arg)
 {
     if (TCG_TARGET_REG_BITS == 32) {
-- 
2.25.1



^ permalink raw reply related	[flat|nested] 29+ messages in thread

* [PATCH v3 02/11] target/s390x: vxeh2: vector convert short/32b
  2022-03-08  1:53 [PATCH v3 00/11] s390x/tcg: Implement Vector-Enhancements Facility 2 Richard Henderson
  2022-03-08  1:53 ` [PATCH v3 01/11] tcg: Implement tcg_gen_{h,w}swap_{i32,i64} Richard Henderson
@ 2022-03-08  1:53 ` Richard Henderson
  2022-03-21  9:33   ` David Hildenbrand
  2022-03-08  1:53 ` [PATCH v3 03/11] target/s390x: vxeh2: vector string search Richard Henderson
                   ` (9 subsequent siblings)
  11 siblings, 1 reply; 29+ messages in thread
From: Richard Henderson @ 2022-03-08  1:53 UTC (permalink / raw)
  To: qemu-devel; +Cc: qemu-s390x, dmiller423

From: David Miller <dmiller423@gmail.com>

Signed-off-by: David Miller <dmiller423@gmail.com>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Message-Id: <20220307020327.3003-2-dmiller423@gmail.com>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 target/s390x/helper.h               |  4 +++
 target/s390x/tcg/vec_fpu_helper.c   | 31 ++++++++++++++++++++
 target/s390x/tcg/translate_vx.c.inc | 44 ++++++++++++++++++++++++++---
 3 files changed, 75 insertions(+), 4 deletions(-)

diff --git a/target/s390x/helper.h b/target/s390x/helper.h
index 69f69cf718..7cbcbd7f0b 100644
--- a/target/s390x/helper.h
+++ b/target/s390x/helper.h
@@ -275,6 +275,10 @@ DEF_HELPER_FLAGS_5(gvec_vfche64, TCG_CALL_NO_WG, void, ptr, cptr, cptr, env, i32
 DEF_HELPER_5(gvec_vfche64_cc, void, ptr, cptr, cptr, env, i32)
 DEF_HELPER_FLAGS_5(gvec_vfche128, TCG_CALL_NO_WG, void, ptr, cptr, cptr, env, i32)
 DEF_HELPER_5(gvec_vfche128_cc, void, ptr, cptr, cptr, env, i32)
+DEF_HELPER_FLAGS_4(gvec_vcdg32, TCG_CALL_NO_WG, void, ptr, cptr, env, i32)
+DEF_HELPER_FLAGS_4(gvec_vcdlg32, TCG_CALL_NO_WG, void, ptr, cptr, env, i32)
+DEF_HELPER_FLAGS_4(gvec_vcgd32, TCG_CALL_NO_WG, void, ptr, cptr, env, i32)
+DEF_HELPER_FLAGS_4(gvec_vclgd32, TCG_CALL_NO_WG, void, ptr, cptr, env, i32)
 DEF_HELPER_FLAGS_4(gvec_vcdg64, TCG_CALL_NO_WG, void, ptr, cptr, env, i32)
 DEF_HELPER_FLAGS_4(gvec_vcdlg64, TCG_CALL_NO_WG, void, ptr, cptr, env, i32)
 DEF_HELPER_FLAGS_4(gvec_vcgd64, TCG_CALL_NO_WG, void, ptr, cptr, env, i32)
diff --git a/target/s390x/tcg/vec_fpu_helper.c b/target/s390x/tcg/vec_fpu_helper.c
index 1a77993471..6834dbc540 100644
--- a/target/s390x/tcg/vec_fpu_helper.c
+++ b/target/s390x/tcg/vec_fpu_helper.c
@@ -176,6 +176,30 @@ static void vop128_2(S390Vector *v1, const S390Vector *v2, CPUS390XState *env,
     *v1 = tmp;
 }
 
+static float32 vcdg32(float32 a, float_status *s)
+{
+    return int32_to_float32(a, s);
+}
+
+static float32 vcdlg32(float32 a, float_status *s)
+{
+    return uint32_to_float32(a, s);
+}
+
+static float32 vcgd32(float32 a, float_status *s)
+{
+    const float32 tmp = float32_to_int32(a, s);
+
+    return float32_is_any_nan(a) ? INT32_MIN : tmp;
+}
+
+static float32 vclgd32(float32 a, float_status *s)
+{
+    const float32 tmp = float32_to_uint32(a, s);
+
+    return float32_is_any_nan(a) ? 0 : tmp;
+}
+
 static float64 vcdg64(float64 a, float_status *s)
 {
     return int64_to_float64(a, s);
@@ -211,6 +235,9 @@ void HELPER(gvec_##NAME##BITS)(void *v1, const void *v2, CPUS390XState *env,   \
     vop##BITS##_2(v1, v2, env, se, XxC, erm, FN, GETPC());                     \
 }
 
+#define DEF_GVEC_VOP2_32(NAME)                                                 \
+DEF_GVEC_VOP2_FN(NAME, NAME##32, 32)
+
 #define DEF_GVEC_VOP2_64(NAME)                                                 \
 DEF_GVEC_VOP2_FN(NAME, NAME##64, 64)
 
@@ -219,6 +246,10 @@ DEF_GVEC_VOP2_FN(NAME, float32_##OP, 32)                                       \
 DEF_GVEC_VOP2_FN(NAME, float64_##OP, 64)                                       \
 DEF_GVEC_VOP2_FN(NAME, float128_##OP, 128)
 
+DEF_GVEC_VOP2_32(vcdg)
+DEF_GVEC_VOP2_32(vcdlg)
+DEF_GVEC_VOP2_32(vcgd)
+DEF_GVEC_VOP2_32(vclgd)
 DEF_GVEC_VOP2_64(vcdg)
 DEF_GVEC_VOP2_64(vcdlg)
 DEF_GVEC_VOP2_64(vcgd)
diff --git a/target/s390x/tcg/translate_vx.c.inc b/target/s390x/tcg/translate_vx.c.inc
index 98eb7710a4..ea28e40d4f 100644
--- a/target/s390x/tcg/translate_vx.c.inc
+++ b/target/s390x/tcg/translate_vx.c.inc
@@ -2720,23 +2720,59 @@ static DisasJumpType op_vcdg(DisasContext *s, DisasOps *o)
 
     switch (s->fields.op2) {
     case 0xc3:
-        if (fpf == FPF_LONG) {
+        switch (fpf) {
+        case FPF_LONG:
             fn = gen_helper_gvec_vcdg64;
+            break;
+        case FPF_SHORT:
+            if (s390_has_feat(S390_FEAT_VECTOR_ENH2)) {
+                fn = gen_helper_gvec_vcdg32;
+            }
+            break;
+        default:
+            break;
         }
         break;
     case 0xc1:
-        if (fpf == FPF_LONG) {
+        switch (fpf) {
+        case FPF_LONG:
             fn = gen_helper_gvec_vcdlg64;
+            break;
+        case FPF_SHORT:
+            if (s390_has_feat(S390_FEAT_VECTOR_ENH2)) {
+                fn = gen_helper_gvec_vcdlg32;
+            }
+            break;
+        default:
+            break;
         }
         break;
     case 0xc2:
-        if (fpf == FPF_LONG) {
+        switch (fpf) {
+        case FPF_LONG:
             fn = gen_helper_gvec_vcgd64;
+            break;
+        case FPF_SHORT:
+            if (s390_has_feat(S390_FEAT_VECTOR_ENH2)) {
+                fn = gen_helper_gvec_vcgd32;
+            }
+            break;
+        default:
+            break;
         }
         break;
     case 0xc0:
-        if (fpf == FPF_LONG) {
+        switch (fpf) {
+        case FPF_LONG:
             fn = gen_helper_gvec_vclgd64;
+            break;
+        case FPF_SHORT:
+            if (s390_has_feat(S390_FEAT_VECTOR_ENH2)) {
+                fn = gen_helper_gvec_vclgd32;
+            }
+            break;
+        default:
+            break;
         }
         break;
     case 0xc7:
-- 
2.25.1



^ permalink raw reply related	[flat|nested] 29+ messages in thread

* [PATCH v3 03/11] target/s390x: vxeh2: vector string search
  2022-03-08  1:53 [PATCH v3 00/11] s390x/tcg: Implement Vector-Enhancements Facility 2 Richard Henderson
  2022-03-08  1:53 ` [PATCH v3 01/11] tcg: Implement tcg_gen_{h,w}swap_{i32,i64} Richard Henderson
  2022-03-08  1:53 ` [PATCH v3 02/11] target/s390x: vxeh2: vector convert short/32b Richard Henderson
@ 2022-03-08  1:53 ` Richard Henderson
  2022-03-21 10:31   ` David Hildenbrand
  2022-03-08  1:53 ` [PATCH v3 04/11] target/s390x: vxeh2: Update for changes to vector shifts Richard Henderson
                   ` (8 subsequent siblings)
  11 siblings, 1 reply; 29+ messages in thread
From: Richard Henderson @ 2022-03-08  1:53 UTC (permalink / raw)
  To: qemu-devel; +Cc: qemu-s390x, dmiller423

From: David Miller <dmiller423@gmail.com>

Signed-off-by: David Miller <dmiller423@gmail.com>
Message-Id: <20220307020327.3003-3-dmiller423@gmail.com>
[rth: Rewrite helpers; fix validation of m6.]
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---

The substring search was incorrect, in that it didn't properly
restart the search when a match failed.  Split the helper into
multiple helpers, so that the memory accesses can be optimized.
---
 target/s390x/helper.h                |   6 ++
 target/s390x/tcg/translate.c         |   3 +-
 target/s390x/tcg/vec_string_helper.c | 101 +++++++++++++++++++++++++++
 target/s390x/tcg/translate_vx.c.inc  |  26 +++++++
 target/s390x/tcg/insn-data.def       |   2 +
 5 files changed, 137 insertions(+), 1 deletion(-)

diff --git a/target/s390x/helper.h b/target/s390x/helper.h
index 7cbcbd7f0b..7412130883 100644
--- a/target/s390x/helper.h
+++ b/target/s390x/helper.h
@@ -246,6 +246,12 @@ DEF_HELPER_6(gvec_vstrc_cc32, void, ptr, cptr, cptr, cptr, env, i32)
 DEF_HELPER_6(gvec_vstrc_cc_rt8, void, ptr, cptr, cptr, cptr, env, i32)
 DEF_HELPER_6(gvec_vstrc_cc_rt16, void, ptr, cptr, cptr, cptr, env, i32)
 DEF_HELPER_6(gvec_vstrc_cc_rt32, void, ptr, cptr, cptr, cptr, env, i32)
+DEF_HELPER_6(gvec_vstrs_8, void, ptr, cptr, cptr, cptr, env, i32)
+DEF_HELPER_6(gvec_vstrs_16, void, ptr, cptr, cptr, cptr, env, i32)
+DEF_HELPER_6(gvec_vstrs_32, void, ptr, cptr, cptr, cptr, env, i32)
+DEF_HELPER_6(gvec_vstrs_zs8, void, ptr, cptr, cptr, cptr, env, i32)
+DEF_HELPER_6(gvec_vstrs_zs16, void, ptr, cptr, cptr, cptr, env, i32)
+DEF_HELPER_6(gvec_vstrs_zs32, void, ptr, cptr, cptr, cptr, env, i32)
 
 /* === Vector Floating-Point Instructions */
 DEF_HELPER_FLAGS_5(gvec_vfa32, TCG_CALL_NO_WG, void, ptr, cptr, cptr, env, i32)
diff --git a/target/s390x/tcg/translate.c b/target/s390x/tcg/translate.c
index 904b51542f..d9ac29573d 100644
--- a/target/s390x/tcg/translate.c
+++ b/target/s390x/tcg/translate.c
@@ -6222,7 +6222,8 @@ enum DisasInsnEnum {
 #define FAC_PCI         S390_FEAT_ZPCI /* z/PCI facility */
 #define FAC_AIS         S390_FEAT_ADAPTER_INT_SUPPRESSION
 #define FAC_V           S390_FEAT_VECTOR /* vector facility */
-#define FAC_VE          S390_FEAT_VECTOR_ENH /* vector enhancements facility 1 */
+#define FAC_VE          S390_FEAT_VECTOR_ENH  /* vector enhancements facility 1 */
+#define FAC_VE2         S390_FEAT_VECTOR_ENH2 /* vector enhancements facility 2 */
 #define FAC_MIE2        S390_FEAT_MISC_INSTRUCTION_EXT2 /* miscellaneous-instruction-extensions facility 2 */
 #define FAC_MIE3        S390_FEAT_MISC_INSTRUCTION_EXT3 /* miscellaneous-instruction-extensions facility 3 */
 
diff --git a/target/s390x/tcg/vec_string_helper.c b/target/s390x/tcg/vec_string_helper.c
index ac315eb095..6c0476ecc1 100644
--- a/target/s390x/tcg/vec_string_helper.c
+++ b/target/s390x/tcg/vec_string_helper.c
@@ -471,3 +471,104 @@ void HELPER(gvec_vstrc_cc_rt##BITS)(void *v1, const void *v2, const void *v3,  \
 DEF_VSTRC_CC_RT_HELPER(8)
 DEF_VSTRC_CC_RT_HELPER(16)
 DEF_VSTRC_CC_RT_HELPER(32)
+
+static int vstrs(S390Vector *v1, const S390Vector *v2, const S390Vector *v3,
+                 const S390Vector *v4, uint8_t es, bool zs)
+{
+    int substr_elen, substr_0, str_elen, i, j, k, cc;
+    int nelem = 16 >> es;
+    bool eos = false;
+
+    substr_elen = s390_vec_read_element8(v4, 7) >> es;
+
+    /* If ZS, bound substr length by min(nelem, strlen(v3)). */
+    if (zs) {
+        int i;
+        for (i = 0; i < nelem; i++) {
+            if (s390_vec_read_element(v3, i, es) == 0) {
+                break;
+            }
+        }
+        if (i < substr_elen) {
+            substr_elen = i;
+        }
+    }
+
+    if (substr_elen == 0) {
+        cc = 2; /* full match for degenerate case of empty substr */
+        k = 0;
+        goto done;
+    }
+
+    /* If ZS, look for eos in the searched string. */
+    if (zs) {
+        for (k = 0; k < nelem; k++) {
+            if (s390_vec_read_element(v2, k, es) == 0) {
+                eos = true;
+                break;
+            }
+        }
+        str_elen = k;
+    } else {
+        str_elen = nelem;
+    }
+
+    substr_0 = s390_vec_read_element(v3, 0, es);
+
+    for (k = 0; ; k++) {
+        for (; k < str_elen; k++) {
+            if (s390_vec_read_element(v2, k, es) == substr_0) {
+                break;
+            }
+        }
+
+        /* If we reached the end of the string, no match. */
+        if (k == str_elen) {
+            cc = eos; /* no match (with or without zero char) */
+            goto done;
+        }
+
+        /* If the substring is only one char, match. */
+        if (substr_elen == 1) {
+            cc = 2; /* full match */
+            goto done;
+        }
+
+        /* If the match begins at the last char, we have a partial match. */
+        if (k == str_elen - 1) {
+            cc = 3; /* partial match */
+            goto done;
+        }
+
+        i = MIN(nelem, k + substr_elen);
+        for (j = k + 1; j < i; j++) {
+            uint32_t e2 = s390_vec_read_element(v2, j, es);
+            uint32_t e3 = s390_vec_read_element(v3, j - k, es);
+            if (e2 != e3) {
+                break;
+            }
+        }
+        if (j == i) {
+            /* Matched up until "end". */
+            cc = i - k == substr_elen ? 2 : 3; /* full or partial match */
+            goto done;
+        }
+    }
+
+ done:
+    s390_vec_write_element64(v1, 0, k << es);
+    s390_vec_write_element64(v1, 1, 0);
+    return cc;
+}
+
+#define DEF_VSTRS_HELPER(BITS)                                             \
+void QEMU_FLATTEN HELPER(gvec_vstrs_##BITS)(void *v1, const void *v2,      \
+    const void *v3, const void *v4, CPUS390XState *env, uint32_t desc)     \
+    { env->cc_op = vstrs(v1, v2, v3, v4, MO_##BITS, false); }              \
+void QEMU_FLATTEN HELPER(gvec_vstrs_zs##BITS)(void *v1, const void *v2,    \
+    const void *v3, const void *v4, CPUS390XState *env, uint32_t desc)     \
+    { env->cc_op = vstrs(v1, v2, v3, v4, MO_##BITS, true); }
+
+DEF_VSTRS_HELPER(8)
+DEF_VSTRS_HELPER(16)
+DEF_VSTRS_HELPER(32)
diff --git a/target/s390x/tcg/translate_vx.c.inc b/target/s390x/tcg/translate_vx.c.inc
index ea28e40d4f..d514e8b218 100644
--- a/target/s390x/tcg/translate_vx.c.inc
+++ b/target/s390x/tcg/translate_vx.c.inc
@@ -2497,6 +2497,32 @@ static DisasJumpType op_vstrc(DisasContext *s, DisasOps *o)
     return DISAS_NEXT;
 }
 
+static DisasJumpType op_vstrs(DisasContext *s, DisasOps *o)
+{
+    typedef void (*helper_vstrs)(TCGv_ptr, TCGv_ptr, TCGv_ptr,
+                                 TCGv_ptr, TCGv_ptr, TCGv_i32);
+    static const helper_vstrs fns[3][2] = {
+        { gen_helper_gvec_vstrs_8, gen_helper_gvec_vstrs_zs8 },
+        { gen_helper_gvec_vstrs_16, gen_helper_gvec_vstrs_zs16 },
+        { gen_helper_gvec_vstrs_32, gen_helper_gvec_vstrs_zs32 },
+    };
+
+    const uint8_t m5 = get_field(s, m5);
+    const uint8_t m6 = get_field(s, m6);
+    bool zs = m6 & 2;
+
+    if (m5 > ES_32 || m6 & ~2) {
+        gen_program_exception(s, PGM_SPECIFICATION);
+        return DISAS_NORETURN;
+    }
+
+    gen_gvec_4_ptr(get_field(s, v1), get_field(s, v2),
+                   get_field(s, v3), get_field(s, v4),
+                   cpu_env, 0, fns[m5][zs]);
+    set_cc_static(s);
+    return DISAS_NEXT;
+}
+
 static DisasJumpType op_vfa(DisasContext *s, DisasOps *o)
 {
     const uint8_t fpf = get_field(s, m4);
diff --git a/target/s390x/tcg/insn-data.def b/target/s390x/tcg/insn-data.def
index 6c8a8b229f..46add91a0e 100644
--- a/target/s390x/tcg/insn-data.def
+++ b/target/s390x/tcg/insn-data.def
@@ -1246,6 +1246,8 @@
     F(0xe75c, VISTR,   VRR_a, V,   0, 0, 0, 0, vistr, 0, IF_VEC)
 /* VECTOR STRING RANGE COMPARE */
     F(0xe78a, VSTRC,   VRR_d, V,   0, 0, 0, 0, vstrc, 0, IF_VEC)
+/* VECTOR STRING SEARCH */
+    F(0xe78b, VSTRS,   VRR_d, VE2, 0, 0, 0, 0, vstrs, 0, IF_VEC)
 
 /* === Vector Floating-Point Instructions */
 
-- 
2.25.1



^ permalink raw reply related	[flat|nested] 29+ messages in thread

* [PATCH v3 04/11] target/s390x: vxeh2: Update for changes to vector shifts
  2022-03-08  1:53 [PATCH v3 00/11] s390x/tcg: Implement Vector-Enhancements Facility 2 Richard Henderson
                   ` (2 preceding siblings ...)
  2022-03-08  1:53 ` [PATCH v3 03/11] target/s390x: vxeh2: vector string search Richard Henderson
@ 2022-03-08  1:53 ` Richard Henderson
  2022-03-21 11:15   ` David Hildenbrand
  2022-03-08  1:53 ` [PATCH v3 05/11] target/s390x: vxeh2: vector shift double by bit Richard Henderson
                   ` (7 subsequent siblings)
  11 siblings, 1 reply; 29+ messages in thread
From: Richard Henderson @ 2022-03-08  1:53 UTC (permalink / raw)
  To: qemu-devel; +Cc: qemu-s390x, dmiller423

From: David Miller <dmiller423@gmail.com>

Prior to vector enhancements 2, the shift count was supposed to be equal
for each byte lest the result be unpredictable, which allowed us to assume
that the shift count was the same, and optimize accordingly.

With vector enhancements 2, the shift count is allowed to be different
for each byte, and we must cope with that.

Signed-off-by: David Miller <dmiller423@gmail.com>
Message-Id: <20220307020327.3003-4-dmiller423@gmail.com>
[rth: Split out of larger patch; simplify shift/merge code.]
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 target/s390x/helper.h               |  3 ++
 target/s390x/tcg/vec_int_helper.c   | 58 ++++++++++++++++++++++
 target/s390x/tcg/translate_vx.c.inc | 77 ++++++++++++-----------------
 target/s390x/tcg/insn-data.def      | 12 ++---
 4 files changed, 99 insertions(+), 51 deletions(-)

diff --git a/target/s390x/helper.h b/target/s390x/helper.h
index 7412130883..bf33d86f74 100644
--- a/target/s390x/helper.h
+++ b/target/s390x/helper.h
@@ -203,8 +203,11 @@ DEF_HELPER_FLAGS_3(gvec_vpopct16, TCG_CALL_NO_RWG, void, ptr, cptr, i32)
 DEF_HELPER_FLAGS_4(gvec_verim8, TCG_CALL_NO_RWG, void, ptr, cptr, cptr, i32)
 DEF_HELPER_FLAGS_4(gvec_verim16, TCG_CALL_NO_RWG, void, ptr, cptr, cptr, i32)
 DEF_HELPER_FLAGS_4(gvec_vsl, TCG_CALL_NO_RWG, void, ptr, cptr, i64, i32)
+DEF_HELPER_FLAGS_4(gvec_vsl_ve2, TCG_CALL_NO_RWG, void, ptr, cptr, cptr, i32)
 DEF_HELPER_FLAGS_4(gvec_vsra, TCG_CALL_NO_RWG, void, ptr, cptr, i64, i32)
+DEF_HELPER_FLAGS_4(gvec_vsra_ve2, TCG_CALL_NO_RWG, void, ptr, cptr, cptr, i32)
 DEF_HELPER_FLAGS_4(gvec_vsrl, TCG_CALL_NO_RWG, void, ptr, cptr, i64, i32)
+DEF_HELPER_FLAGS_4(gvec_vsrl_ve2, TCG_CALL_NO_RWG, void, ptr, cptr, cptr, i32)
 DEF_HELPER_FLAGS_4(gvec_vscbi8, TCG_CALL_NO_RWG, void, ptr, cptr, cptr, i32)
 DEF_HELPER_FLAGS_4(gvec_vscbi16, TCG_CALL_NO_RWG, void, ptr, cptr, cptr, i32)
 DEF_HELPER_4(gvec_vtm, void, ptr, cptr, env, i32)
diff --git a/target/s390x/tcg/vec_int_helper.c b/target/s390x/tcg/vec_int_helper.c
index 5561b3ed90..a881d5d267 100644
--- a/target/s390x/tcg/vec_int_helper.c
+++ b/target/s390x/tcg/vec_int_helper.c
@@ -540,18 +540,76 @@ void HELPER(gvec_vsl)(void *v1, const void *v2, uint64_t count,
     s390_vec_shl(v1, v2, count);
 }
 
+void HELPER(gvec_vsl_ve2)(void *v1, const void *v2, const void *v3,
+                          uint32_t desc)
+{
+    S390Vector tmp;
+    uint32_t sh, e0, e1 = 0;
+
+    for (int i = 15; i >= 0; --i, e1 = e0 << 24) {
+        e0 = s390_vec_read_element8(v2, i);
+        sh = s390_vec_read_element8(v3, i) & 7;
+
+        s390_vec_write_element8(&tmp, i, rol32(e0 | e1, sh));
+    }
+
+    *(S390Vector *)v1 = tmp;
+}
+
 void HELPER(gvec_vsra)(void *v1, const void *v2, uint64_t count,
                        uint32_t desc)
 {
     s390_vec_sar(v1, v2, count);
 }
 
+void HELPER(gvec_vsra_ve2)(void *v1, const void *v2, const void *v3,
+                           uint32_t desc)
+{
+    S390Vector tmp;
+    uint32_t sh, e0, e1;
+    int i = 0;
+
+    e0 = s390_vec_read_element8(v2, 0);
+    e1 = -(e0 >> 7) << 8;
+
+    for (;;) {
+        sh = s390_vec_read_element8(v3, i) & 7;
+
+        s390_vec_write_element8(&tmp, i, (e0 | e1) >> sh);
+
+        if (++i >= 16) {
+            break;
+        }
+
+        e1 = e0 << 8;
+        e0 = s390_vec_read_element8(v2, i);
+    }
+
+    *(S390Vector *)v1 = tmp;
+}
+
 void HELPER(gvec_vsrl)(void *v1, const void *v2, uint64_t count,
                        uint32_t desc)
 {
     s390_vec_shr(v1, v2, count);
 }
 
+void HELPER(gvec_vsrl_ve2)(void *v1, const void *v2, const void *v3,
+                           uint32_t desc)
+{
+    S390Vector tmp;
+    uint32_t sh, e0, e1 = 0;
+
+    for (int i = 0; i < 16; ++i, e1 = e0 << 8) {
+        e0 = s390_vec_read_element8(v2, i);
+        sh = s390_vec_read_element8(v3, i) & 7;
+
+        s390_vec_write_element8(&tmp, i, (e0 | e1) >> sh);
+    }
+
+    *(S390Vector *)v1 = tmp;
+}
+
 #define DEF_VSCBI(BITS)                                                        \
 void HELPER(gvec_vscbi##BITS)(void *v1, const void *v2, const void *v3,        \
                               uint32_t desc)                                   \
diff --git a/target/s390x/tcg/translate_vx.c.inc b/target/s390x/tcg/translate_vx.c.inc
index d514e8b218..967f6213d8 100644
--- a/target/s390x/tcg/translate_vx.c.inc
+++ b/target/s390x/tcg/translate_vx.c.inc
@@ -2018,21 +2018,42 @@ static DisasJumpType op_ves(DisasContext *s, DisasOps *o)
     return DISAS_NEXT;
 }
 
+static DisasJumpType gen_vsh_bit_byte(DisasContext *s, DisasOps *o,
+                                      gen_helper_gvec_2i *gen,
+                                      gen_helper_gvec_3 *gen_ve2)
+{
+    bool byte = s->insn->data;
+
+    if (!byte && s390_has_feat(S390_FEAT_VECTOR_ENH2)) {
+        gen_gvec_3_ool(get_field(s, v1), get_field(s, v2),
+                       get_field(s, v3), 0, gen_ve2);
+    } else {
+        TCGv_i64 shift = tcg_temp_new_i64();
+
+        read_vec_element_i64(shift, get_field(s, v3), 7, ES_8);
+        tcg_gen_andi_i64(shift, shift, byte ? 0x78 : 7);
+        gen_gvec_2i_ool(get_field(s, v1), get_field(s, v2), shift, 0, gen);
+        tcg_temp_free_i64(shift);
+    }
+    return DISAS_NEXT;
+}
+
 static DisasJumpType op_vsl(DisasContext *s, DisasOps *o)
 {
-    TCGv_i64 shift = tcg_temp_new_i64();
+    return gen_vsh_bit_byte(s, o, gen_helper_gvec_vsl,
+                            gen_helper_gvec_vsl_ve2);
+}
 
-    read_vec_element_i64(shift, get_field(s, v3), 7, ES_8);
-    if (s->fields.op2 == 0x74) {
-        tcg_gen_andi_i64(shift, shift, 0x7);
-    } else {
-        tcg_gen_andi_i64(shift, shift, 0x78);
-    }
+static DisasJumpType op_vsra(DisasContext *s, DisasOps *o)
+{
+    return gen_vsh_bit_byte(s, o, gen_helper_gvec_vsra,
+                            gen_helper_gvec_vsra_ve2);
+}
 
-    gen_gvec_2i_ool(get_field(s, v1), get_field(s, v2),
-                    shift, 0, gen_helper_gvec_vsl);
-    tcg_temp_free_i64(shift);
-    return DISAS_NEXT;
+static DisasJumpType op_vsrl(DisasContext *s, DisasOps *o)
+{
+    return gen_vsh_bit_byte(s, o, gen_helper_gvec_vsrl,
+                            gen_helper_gvec_vsrl_ve2);
 }
 
 static DisasJumpType op_vsldb(DisasContext *s, DisasOps *o)
@@ -2064,40 +2085,6 @@ static DisasJumpType op_vsldb(DisasContext *s, DisasOps *o)
     return DISAS_NEXT;
 }
 
-static DisasJumpType op_vsra(DisasContext *s, DisasOps *o)
-{
-    TCGv_i64 shift = tcg_temp_new_i64();
-
-    read_vec_element_i64(shift, get_field(s, v3), 7, ES_8);
-    if (s->fields.op2 == 0x7e) {
-        tcg_gen_andi_i64(shift, shift, 0x7);
-    } else {
-        tcg_gen_andi_i64(shift, shift, 0x78);
-    }
-
-    gen_gvec_2i_ool(get_field(s, v1), get_field(s, v2),
-                    shift, 0, gen_helper_gvec_vsra);
-    tcg_temp_free_i64(shift);
-    return DISAS_NEXT;
-}
-
-static DisasJumpType op_vsrl(DisasContext *s, DisasOps *o)
-{
-    TCGv_i64 shift = tcg_temp_new_i64();
-
-    read_vec_element_i64(shift, get_field(s, v3), 7, ES_8);
-    if (s->fields.op2 == 0x7c) {
-        tcg_gen_andi_i64(shift, shift, 0x7);
-    } else {
-        tcg_gen_andi_i64(shift, shift, 0x78);
-    }
-
-    gen_gvec_2i_ool(get_field(s, v1), get_field(s, v2),
-                    shift, 0, gen_helper_gvec_vsrl);
-    tcg_temp_free_i64(shift);
-    return DISAS_NEXT;
-}
-
 static DisasJumpType op_vs(DisasContext *s, DisasOps *o)
 {
     const uint8_t es = get_field(s, m4);
diff --git a/target/s390x/tcg/insn-data.def b/target/s390x/tcg/insn-data.def
index 46add91a0e..f487a64abf 100644
--- a/target/s390x/tcg/insn-data.def
+++ b/target/s390x/tcg/insn-data.def
@@ -1204,19 +1204,19 @@
     F(0xe778, VESRLV,  VRR_c, V,   0, 0, 0, 0, vesv, 0, IF_VEC)
     F(0xe738, VESRL,   VRS_a, V,   la2, 0, 0, 0, ves, 0, IF_VEC)
 /* VECTOR SHIFT LEFT */
-    F(0xe774, VSL,     VRR_c, V,   0, 0, 0, 0, vsl, 0, IF_VEC)
+    E(0xe774, VSL,     VRR_c, V,   0, 0, 0, 0, vsl, 0, 0, IF_VEC)
 /* VECTOR SHIFT LEFT BY BYTE */
-    F(0xe775, VSLB,    VRR_c, V,   0, 0, 0, 0, vsl, 0, IF_VEC)
+    E(0xe775, VSLB,    VRR_c, V,   0, 0, 0, 0, vsl, 0, 1, IF_VEC)
 /* VECTOR SHIFT LEFT DOUBLE BY BYTE */
     F(0xe777, VSLDB,   VRI_d, V,   0, 0, 0, 0, vsldb, 0, IF_VEC)
 /* VECTOR SHIFT RIGHT ARITHMETIC */
-    F(0xe77e, VSRA,    VRR_c, V,   0, 0, 0, 0, vsra, 0, IF_VEC)
+    E(0xe77e, VSRA,    VRR_c, V,   0, 0, 0, 0, vsra, 0, 0, IF_VEC)
 /* VECTOR SHIFT RIGHT ARITHMETIC BY BYTE */
-    F(0xe77f, VSRAB,   VRR_c, V,   0, 0, 0, 0, vsra, 0, IF_VEC)
+    E(0xe77f, VSRAB,   VRR_c, V,   0, 0, 0, 0, vsra, 0, 1, IF_VEC)
 /* VECTOR SHIFT RIGHT LOGICAL */
-    F(0xe77c, VSRL,    VRR_c, V,   0, 0, 0, 0, vsrl, 0, IF_VEC)
+    E(0xe77c, VSRL,    VRR_c, V,   0, 0, 0, 0, vsrl, 0, 0, IF_VEC)
 /* VECTOR SHIFT RIGHT LOGICAL BY BYTE */
-    F(0xe77d, VSRLB,   VRR_c, V,   0, 0, 0, 0, vsrl, 0, IF_VEC)
+    E(0xe77d, VSRLB,   VRR_c, V,   0, 0, 0, 0, vsrl, 0, 1, IF_VEC)
 /* VECTOR SUBTRACT */
     F(0xe7f7, VS,      VRR_c, V,   0, 0, 0, 0, vs, 0, IF_VEC)
 /* VECTOR SUBTRACT COMPUTE BORROW INDICATION */
-- 
2.25.1



^ permalink raw reply related	[flat|nested] 29+ messages in thread

* [PATCH v3 05/11] target/s390x: vxeh2: vector shift double by bit
  2022-03-08  1:53 [PATCH v3 00/11] s390x/tcg: Implement Vector-Enhancements Facility 2 Richard Henderson
                   ` (3 preceding siblings ...)
  2022-03-08  1:53 ` [PATCH v3 04/11] target/s390x: vxeh2: Update for changes to vector shifts Richard Henderson
@ 2022-03-08  1:53 ` Richard Henderson
  2022-03-21 11:23   ` David Hildenbrand
  2022-03-08  1:53 ` [PATCH v3 06/11] target/s390x: vxeh2: vector {load, store} elements reversed Richard Henderson
                   ` (6 subsequent siblings)
  11 siblings, 1 reply; 29+ messages in thread
From: Richard Henderson @ 2022-03-08  1:53 UTC (permalink / raw)
  To: qemu-devel; +Cc: qemu-s390x, dmiller423

From: David Miller <dmiller423@gmail.com>

Signed-off-by: David Miller <dmiller423@gmail.com>
Message-Id: <20220307020327.3003-4-dmiller423@gmail.com>
[rth: Split out of larger patch.]
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 target/s390x/tcg/translate_vx.c.inc | 47 ++++++++++++++++++++++++++---
 target/s390x/tcg/insn-data.def      |  6 +++-
 2 files changed, 48 insertions(+), 5 deletions(-)

diff --git a/target/s390x/tcg/translate_vx.c.inc b/target/s390x/tcg/translate_vx.c.inc
index 967f6213d8..a5283ef2f8 100644
--- a/target/s390x/tcg/translate_vx.c.inc
+++ b/target/s390x/tcg/translate_vx.c.inc
@@ -2056,11 +2056,19 @@ static DisasJumpType op_vsrl(DisasContext *s, DisasOps *o)
                             gen_helper_gvec_vsrl_ve2);
 }
 
-static DisasJumpType op_vsldb(DisasContext *s, DisasOps *o)
+static DisasJumpType op_vsld(DisasContext *s, DisasOps *o)
 {
-    const uint8_t i4 = get_field(s, i4) & 0xf;
-    const int left_shift = (i4 & 7) * 8;
-    const int right_shift = 64 - left_shift;
+    const bool byte = s->insn->data;
+    const uint8_t mask = byte ? 15 : 7;
+    const uint8_t mul  = byte ?  8 : 1;
+    const uint8_t i4   = get_field(s, i4);
+    const int right_shift = 64 - (i4 & 7) * mul;
+
+    if (i4 & ~mask) {
+        gen_program_exception(s, PGM_SPECIFICATION);
+        return DISAS_NORETURN;
+    }
+
     TCGv_i64 t0 = tcg_temp_new_i64();
     TCGv_i64 t1 = tcg_temp_new_i64();
     TCGv_i64 t2 = tcg_temp_new_i64();
@@ -2074,8 +2082,39 @@ static DisasJumpType op_vsldb(DisasContext *s, DisasOps *o)
         read_vec_element_i64(t1, get_field(s, v3), 0, ES_64);
         read_vec_element_i64(t2, get_field(s, v3), 1, ES_64);
     }
+
     tcg_gen_extract2_i64(t0, t1, t0, right_shift);
     tcg_gen_extract2_i64(t1, t2, t1, right_shift);
+
+    write_vec_element_i64(t0, get_field(s, v1), 0, ES_64);
+    write_vec_element_i64(t1, get_field(s, v1), 1, ES_64);
+
+    tcg_temp_free(t0);
+    tcg_temp_free(t1);
+    tcg_temp_free(t2);
+    return DISAS_NEXT;
+}
+
+static DisasJumpType op_vsrd(DisasContext *s, DisasOps *o)
+{
+    const uint8_t i4 = get_field(s, i4);
+
+    if (i4 & ~7) {
+        gen_program_exception(s, PGM_SPECIFICATION);
+        return DISAS_NORETURN;
+    }
+
+    TCGv_i64 t0 = tcg_temp_new_i64();
+    TCGv_i64 t1 = tcg_temp_new_i64();
+    TCGv_i64 t2 = tcg_temp_new_i64();
+
+    read_vec_element_i64(t0, get_field(s, v2), 1, ES_64);
+    read_vec_element_i64(t1, get_field(s, v3), 0, ES_64);
+    read_vec_element_i64(t2, get_field(s, v3), 1, ES_64);
+
+    tcg_gen_extract2_i64(t0, t1, t0, i4);
+    tcg_gen_extract2_i64(t1, t2, t1, i4);
+
     write_vec_element_i64(t0, get_field(s, v1), 0, ES_64);
     write_vec_element_i64(t1, get_field(s, v1), 1, ES_64);
 
diff --git a/target/s390x/tcg/insn-data.def b/target/s390x/tcg/insn-data.def
index f487a64abf..98a31a557d 100644
--- a/target/s390x/tcg/insn-data.def
+++ b/target/s390x/tcg/insn-data.def
@@ -1207,12 +1207,16 @@
     E(0xe774, VSL,     VRR_c, V,   0, 0, 0, 0, vsl, 0, 0, IF_VEC)
 /* VECTOR SHIFT LEFT BY BYTE */
     E(0xe775, VSLB,    VRR_c, V,   0, 0, 0, 0, vsl, 0, 1, IF_VEC)
+/* VECTOR SHIFT LEFT DOUBLE BY BIT */
+    E(0xe786, VSLD,    VRI_d, VE2, 0, 0, 0, 0, vsld, 0, 0, IF_VEC)
 /* VECTOR SHIFT LEFT DOUBLE BY BYTE */
-    F(0xe777, VSLDB,   VRI_d, V,   0, 0, 0, 0, vsldb, 0, IF_VEC)
+    E(0xe777, VSLDB,   VRI_d, V,   0, 0, 0, 0, vsld, 0, 1, IF_VEC)
 /* VECTOR SHIFT RIGHT ARITHMETIC */
     E(0xe77e, VSRA,    VRR_c, V,   0, 0, 0, 0, vsra, 0, 0, IF_VEC)
 /* VECTOR SHIFT RIGHT ARITHMETIC BY BYTE */
     E(0xe77f, VSRAB,   VRR_c, V,   0, 0, 0, 0, vsra, 0, 1, IF_VEC)
+/* VECTOR SHIFT RIGHT DOUBLE BY BIT */
+    F(0xe787, VSRD,    VRI_d, VE2, 0, 0, 0, 0, vsrd, 0, IF_VEC)
 /* VECTOR SHIFT RIGHT LOGICAL */
     E(0xe77c, VSRL,    VRR_c, V,   0, 0, 0, 0, vsrl, 0, 0, IF_VEC)
 /* VECTOR SHIFT RIGHT LOGICAL BY BYTE */
-- 
2.25.1



^ permalink raw reply related	[flat|nested] 29+ messages in thread

* [PATCH v3 06/11] target/s390x: vxeh2: vector {load, store} elements reversed
  2022-03-08  1:53 [PATCH v3 00/11] s390x/tcg: Implement Vector-Enhancements Facility 2 Richard Henderson
                   ` (4 preceding siblings ...)
  2022-03-08  1:53 ` [PATCH v3 05/11] target/s390x: vxeh2: vector shift double by bit Richard Henderson
@ 2022-03-08  1:53 ` Richard Henderson
  2022-03-21 11:35   ` David Hildenbrand
  2022-03-08  1:53 ` [PATCH v3 07/11] target/s390x: vxeh2: vector {load, store} byte reversed elements Richard Henderson
                   ` (5 subsequent siblings)
  11 siblings, 1 reply; 29+ messages in thread
From: Richard Henderson @ 2022-03-08  1:53 UTC (permalink / raw)
  To: qemu-devel; +Cc: qemu-s390x, dmiller423

From: David Miller <dmiller423@gmail.com>

Signed-off-by: David Miller <dmiller423@gmail.com>
Message-Id: <20220307020327.3003-5-dmiller423@gmail.com>
[rth: Use new hswap and wswap tcg expanders.]
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 target/s390x/tcg/translate_vx.c.inc | 84 +++++++++++++++++++++++++++++
 target/s390x/tcg/insn-data.def      |  4 ++
 2 files changed, 88 insertions(+)

diff --git a/target/s390x/tcg/translate_vx.c.inc b/target/s390x/tcg/translate_vx.c.inc
index a5283ef2f8..ac807122a3 100644
--- a/target/s390x/tcg/translate_vx.c.inc
+++ b/target/s390x/tcg/translate_vx.c.inc
@@ -492,6 +492,46 @@ static DisasJumpType op_vlei(DisasContext *s, DisasOps *o)
     return DISAS_NEXT;
 }
 
+static DisasJumpType op_vler(DisasContext *s, DisasOps *o)
+{
+    const uint8_t es = get_field(s, m3);
+
+    if (es < ES_16 || es > ES_64) {
+        gen_program_exception(s, PGM_SPECIFICATION);
+        return DISAS_NORETURN;
+    }
+
+    TCGv_i64 t0 = tcg_temp_new_i64();
+    TCGv_i64 t1 = tcg_temp_new_i64();
+
+    /* Begin with the two doublewords swapped... */
+    tcg_gen_qemu_ld_i64(t1, o->addr1, get_mem_index(s), MO_TEUQ);
+    gen_addi_and_wrap_i64(s, o->addr1, o->addr1, 8);
+    tcg_gen_qemu_ld_i64(t0, o->addr1, get_mem_index(s), MO_TEUQ);
+
+    /* ... then swap smaller elements within the doublewords as required. */
+    switch (es) {
+    case MO_16:
+        tcg_gen_hswap_i64(t1, t1);
+        tcg_gen_hswap_i64(t0, t0);
+        break;
+    case MO_32:
+        tcg_gen_wswap_i64(t1, t1);
+        tcg_gen_wswap_i64(t0, t0);
+        break;
+    case MO_64:
+        break;
+    default:
+        g_assert_not_reached();
+    }
+
+    write_vec_element_i64(t0, get_field(s, v1), 0, ES_64);
+    write_vec_element_i64(t1, get_field(s, v1), 1, ES_64);
+    tcg_temp_free(t0);
+    tcg_temp_free(t1);
+    return DISAS_NEXT;
+}
+
 static DisasJumpType op_vlgv(DisasContext *s, DisasOps *o)
 {
     const uint8_t es = get_field(s, m4);
@@ -976,6 +1016,50 @@ static DisasJumpType op_vste(DisasContext *s, DisasOps *o)
     return DISAS_NEXT;
 }
 
+static DisasJumpType op_vster(DisasContext *s, DisasOps *o)
+{
+    const uint8_t es = get_field(s, m3);
+    TCGv_i64 t0, t1;
+
+    if (es < ES_16 || es > ES_64) {
+        gen_program_exception(s, PGM_SPECIFICATION);
+        return DISAS_NORETURN;
+    }
+
+    /* Probe write access before actually modifying memory */
+    gen_helper_probe_write_access(cpu_env, o->addr1, tcg_constant_i64(16));
+
+    /* Begin with the two doublewords swapped... */
+    t0 = tcg_temp_new_i64();
+    t1 = tcg_temp_new_i64();
+    read_vec_element_i64(t1,  get_field(s, v1), 0, ES_64);
+    read_vec_element_i64(t0,  get_field(s, v1), 1, ES_64);
+
+    /* ... then swap smaller elements within the doublewords as required. */
+    switch (es) {
+    case MO_16:
+        tcg_gen_hswap_i64(t1, t1);
+        tcg_gen_hswap_i64(t0, t0);
+        break;
+    case MO_32:
+        tcg_gen_wswap_i64(t1, t1);
+        tcg_gen_wswap_i64(t0, t0);
+        break;
+    case MO_64:
+        break;
+    default:
+        g_assert_not_reached();
+    }
+
+    tcg_gen_qemu_st_i64(t0, o->addr1, get_mem_index(s), MO_TEUQ);
+    gen_addi_and_wrap_i64(s, o->addr1, o->addr1, 8);
+    tcg_gen_qemu_st_i64(t1, o->addr1, get_mem_index(s), MO_TEUQ);
+
+    tcg_temp_free(t0);
+    tcg_temp_free(t1);
+    return DISAS_NEXT;
+}
+
 static DisasJumpType op_vstm(DisasContext *s, DisasOps *o)
 {
     const uint8_t v3 = get_field(s, v3);
diff --git a/target/s390x/tcg/insn-data.def b/target/s390x/tcg/insn-data.def
index 98a31a557d..b524541a7d 100644
--- a/target/s390x/tcg/insn-data.def
+++ b/target/s390x/tcg/insn-data.def
@@ -1037,6 +1037,8 @@
     E(0xe741, VLEIH,   VRI_a, V,   0, 0, 0, 0, vlei, 0, ES_16, IF_VEC)
     E(0xe743, VLEIF,   VRI_a, V,   0, 0, 0, 0, vlei, 0, ES_32, IF_VEC)
     E(0xe742, VLEIG,   VRI_a, V,   0, 0, 0, 0, vlei, 0, ES_64, IF_VEC)
+/* VECTOR LOAD ELEMENTS REVERSED */
+    F(0xe607, VLER,    VRX,   VE2, la2, 0, 0, 0, vler, 0, IF_VEC)
 /* VECTOR LOAD GR FROM VR ELEMENT */
     F(0xe721, VLGV,    VRS_c, V,   la2, 0, r1, 0, vlgv, 0, IF_VEC)
 /* VECTOR LOAD LOGICAL ELEMENT AND ZERO */
@@ -1082,6 +1084,8 @@
     E(0xe709, VSTEH,   VRX,   V,   la2, 0, 0, 0, vste, 0, ES_16, IF_VEC)
     E(0xe70b, VSTEF,   VRX,   V,   la2, 0, 0, 0, vste, 0, ES_32, IF_VEC)
     E(0xe70a, VSTEG,   VRX,   V,   la2, 0, 0, 0, vste, 0, ES_64, IF_VEC)
+/* VECTOR STORE ELEMENTS REVERSED */
+    F(0xe60f, VSTER,   VRX,   VE2, la2, 0, 0, 0, vster, 0, IF_VEC)
 /* VECTOR STORE MULTIPLE */
     F(0xe73e, VSTM,    VRS_a, V,   la2, 0, 0, 0, vstm, 0, IF_VEC)
 /* VECTOR STORE WITH LENGTH */
-- 
2.25.1



^ permalink raw reply related	[flat|nested] 29+ messages in thread

* [PATCH v3 07/11] target/s390x: vxeh2: vector {load, store} byte reversed elements
  2022-03-08  1:53 [PATCH v3 00/11] s390x/tcg: Implement Vector-Enhancements Facility 2 Richard Henderson
                   ` (5 preceding siblings ...)
  2022-03-08  1:53 ` [PATCH v3 06/11] target/s390x: vxeh2: vector {load, store} elements reversed Richard Henderson
@ 2022-03-08  1:53 ` Richard Henderson
  2022-03-21 11:45   ` David Hildenbrand
  2022-03-08  1:53 ` [PATCH v3 08/11] target/s390x: vxeh2: vector {load, store} byte reversed element Richard Henderson
                   ` (4 subsequent siblings)
  11 siblings, 1 reply; 29+ messages in thread
From: Richard Henderson @ 2022-03-08  1:53 UTC (permalink / raw)
  To: qemu-devel; +Cc: qemu-s390x, Richard Henderson, dmiller423

From: David Miller <dmiller423@gmail.com>

Signed-off-by: David Miller <dmiller423@gmail.com>
Message-Id: <20220307020327.3003-6-dmiller423@gmail.com>
[rth: Split out elements (plural) from element (scalar)
      Use tcg little-endian memory ops, plus hswap and wswap.]
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 target/s390x/tcg/translate_vx.c.inc | 101 ++++++++++++++++++++++++++++
 target/s390x/tcg/insn-data.def      |   4 ++
 2 files changed, 105 insertions(+)

diff --git a/target/s390x/tcg/translate_vx.c.inc b/target/s390x/tcg/translate_vx.c.inc
index ac807122a3..9a82401d71 100644
--- a/target/s390x/tcg/translate_vx.c.inc
+++ b/target/s390x/tcg/translate_vx.c.inc
@@ -457,6 +457,56 @@ static DisasJumpType op_vlrep(DisasContext *s, DisasOps *o)
     return DISAS_NEXT;
 }
 
+static DisasJumpType op_vlbr(DisasContext *s, DisasOps *o)
+{
+    const uint8_t es = get_field(s, m3);
+    TCGv_i64 t0, t1, tt;
+
+    if (es < ES_16 || es > ES_128) {
+        gen_program_exception(s, PGM_SPECIFICATION);
+        return DISAS_NORETURN;
+    }
+
+    t0 = tcg_temp_new_i64();
+    t1 = tcg_temp_new_i64();
+
+    /* Begin with byte reversed doublewords... */
+    tcg_gen_qemu_ld_i64(t0, o->addr1, get_mem_index(s), MO_LEUQ);
+    gen_addi_and_wrap_i64(s, o->addr1, o->addr1, 8);
+    tcg_gen_qemu_ld_i64(t1, o->addr1, get_mem_index(s), MO_LEUQ);
+
+    /*
+     * For 16 and 32-bit elements, the doubleword bswap also reversed
+     * the order of the elements.  Perform a larger order swap to put
+     * them back into place.  For the 128-bit "element", finish the
+     * bswap by swapping the doublewords.
+     */
+    switch (es) {
+    case ES_16:
+        tcg_gen_hswap_i64(t0, t0);
+        tcg_gen_hswap_i64(t1, t1);
+        break;
+    case ES_32:
+        tcg_gen_wswap_i64(t0, t0);
+        tcg_gen_wswap_i64(t1, t1);
+        break;
+    case ES_64:
+        break;
+    case ES_128:
+        tt = t0, t0 = t1, t1 = tt;
+        break;
+    default:
+        g_assert_not_reached();
+    }
+
+    write_vec_element_i64(t0, get_field(s, v1), 0, ES_64);
+    write_vec_element_i64(t1, get_field(s, v1), 1, ES_64);
+
+    tcg_temp_free(t0);
+    tcg_temp_free(t1);
+    return DISAS_NEXT;
+}
+
 static DisasJumpType op_vle(DisasContext *s, DisasOps *o)
 {
     const uint8_t es = s->insn->data;
@@ -998,6 +1048,57 @@ static DisasJumpType op_vst(DisasContext *s, DisasOps *o)
     return DISAS_NEXT;
 }
 
+static DisasJumpType op_vstbr(DisasContext *s, DisasOps *o)
+{
+    const uint8_t es = get_field(s, m3);
+    TCGv_i64 t0, t1, tt;
+
+    if (es < ES_16 || es > ES_128) {
+        gen_program_exception(s, PGM_SPECIFICATION);
+        return DISAS_NORETURN;
+    }
+
+    /* Probe write access before actually modifying memory */
+    gen_helper_probe_write_access(cpu_env, o->addr1, tcg_constant_i64(16));
+
+    t0 = tcg_temp_new_i64();
+    t1 = tcg_temp_new_i64();
+    read_vec_element_i64(t0, get_field(s, v1), 0, ES_64);
+    read_vec_element_i64(t1, get_field(s, v1), 1, ES_64);
+
+    /*
+     * For 16 and 32-bit elements, the doubleword bswap below will
+     * reverse the order of the elements.  Perform a larger order
+     * swap to put them back into place.  For the 128-bit "element",
+     * finish the bswap by swapping the doublewords.
+     */
+    switch (es) {
+    case MO_16:
+        tcg_gen_hswap_i64(t0, t0);
+        tcg_gen_hswap_i64(t1, t1);
+        break;
+    case MO_32:
+        tcg_gen_wswap_i64(t0, t0);
+        tcg_gen_wswap_i64(t1, t1);
+        break;
+    case MO_64:
+        break;
+    case MO_128:
+        tt = t0, t0 = t1, t1 = tt;
+        break;
+    default:
+        g_assert_not_reached();
+    }
+
+    tcg_gen_qemu_st_i64(t0, o->addr1, get_mem_index(s), MO_LEUQ);
+    gen_addi_and_wrap_i64(s, o->addr1, o->addr1, 8);
+    tcg_gen_qemu_st_i64(t1, o->addr1, get_mem_index(s), MO_LEUQ);
+
+    tcg_temp_free(t0);
+    tcg_temp_free(t1);
+    return DISAS_NEXT;
+}
+
 static DisasJumpType op_vste(DisasContext *s, DisasOps *o)
 {
     const uint8_t es = s->insn->data;
diff --git a/target/s390x/tcg/insn-data.def b/target/s390x/tcg/insn-data.def
index b524541a7d..ee6e1dc9e5 100644
--- a/target/s390x/tcg/insn-data.def
+++ b/target/s390x/tcg/insn-data.def
@@ -1027,6 +1027,8 @@
     F(0xe756, VLR,     VRR_a, V,   0, 0, 0, 0, vlr, 0, IF_VEC)
 /* VECTOR LOAD AND REPLICATE */
     F(0xe705, VLREP,   VRX,   V,   la2, 0, 0, 0, vlrep, 0, IF_VEC)
+/* VECTOR LOAD BYTE REVERSED ELEMENTS */
+    F(0xe606, VLBR,    VRX,   VE2, la2, 0, 0, 0, vlbr, 0, IF_VEC)
 /* VECTOR LOAD ELEMENT */
     E(0xe700, VLEB,    VRX,   V,   la2, 0, 0, 0, vle, 0, ES_8, IF_VEC)
     E(0xe701, VLEH,    VRX,   V,   la2, 0, 0, 0, vle, 0, ES_16, IF_VEC)
@@ -1079,6 +1081,8 @@
     F(0xe75f, VSEG,    VRR_a, V,   0, 0, 0, 0, vseg, 0, IF_VEC)
 /* VECTOR STORE */
     F(0xe70e, VST,     VRX,   V,   la2, 0, 0, 0, vst, 0, IF_VEC)
+/* VECTOR STORE BYTE REVERSED ELEMENTS */
+    F(0xe60e, VSTBR,    VRX,   VE2, la2, 0, 0, 0, vstbr, 0, IF_VEC)
 /* VECTOR STORE ELEMENT */
     E(0xe708, VSTEB,   VRX,   V,   la2, 0, 0, 0, vste, 0, ES_8, IF_VEC)
     E(0xe709, VSTEH,   VRX,   V,   la2, 0, 0, 0, vste, 0, ES_16, IF_VEC)
-- 
2.25.1



^ permalink raw reply related	[flat|nested] 29+ messages in thread

* [PATCH v3 08/11] target/s390x: vxeh2: vector {load, store} byte reversed element
  2022-03-08  1:53 [PATCH v3 00/11] s390x/tcg: Implement Vector-Enhancements Facility 2 Richard Henderson
                   ` (6 preceding siblings ...)
  2022-03-08  1:53 ` [PATCH v3 07/11] target/s390x: vxeh2: vector {load, store} byte reversed elements Richard Henderson
@ 2022-03-08  1:53 ` Richard Henderson
  2022-03-21 12:33   ` David Hildenbrand
  2022-03-08  1:53 ` [PATCH v3 09/11] target/s390x: add S390_FEAT_VECTOR_ENH2 to cpu max Richard Henderson
                   ` (3 subsequent siblings)
  11 siblings, 1 reply; 29+ messages in thread
From: Richard Henderson @ 2022-03-08  1:53 UTC (permalink / raw)
  To: qemu-devel; +Cc: qemu-s390x, dmiller423

From: David Miller <dmiller423@gmail.com>

This includes VLEBR* and VSTEBR* (single element);
VLBRREP (load single element and replicate); and
VLLEBRZ (load single element and zero).

Signed-off-by: David Miller <dmiller423@gmail.com>
Message-Id: <20220307020327.3003-6-dmiller423@gmail.com>
[rth: Split out elements (plural) from element (scalar),
      Use tcg little-endian memory operations.]
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 target/s390x/tcg/translate_vx.c.inc | 85 +++++++++++++++++++++++++++++
 target/s390x/tcg/insn-data.def      | 12 ++++
 2 files changed, 97 insertions(+)

diff --git a/target/s390x/tcg/translate_vx.c.inc b/target/s390x/tcg/translate_vx.c.inc
index 9a82401d71..ce77578325 100644
--- a/target/s390x/tcg/translate_vx.c.inc
+++ b/target/s390x/tcg/translate_vx.c.inc
@@ -457,6 +457,73 @@ static DisasJumpType op_vlrep(DisasContext *s, DisasOps *o)
     return DISAS_NEXT;
 }
 
+static DisasJumpType op_vlebr(DisasContext *s, DisasOps *o)
+{
+    const uint8_t es = s->insn->data;
+    const uint8_t enr = get_field(s, m3);
+    TCGv_i64 tmp;
+
+    if (!valid_vec_element(enr, es)) {
+        gen_program_exception(s, PGM_SPECIFICATION);
+        return DISAS_NORETURN;
+    }
+
+    tmp = tcg_temp_new_i64();
+    tcg_gen_qemu_ld_i64(tmp, o->addr1, get_mem_index(s), MO_LE | es);
+    write_vec_element_i64(tmp, get_field(s, v1), enr, es);
+    tcg_temp_free_i64(tmp);
+    return DISAS_NEXT;
+}
+
+static DisasJumpType op_vlbrrep(DisasContext *s, DisasOps *o)
+{
+    const uint8_t es = get_field(s, m3);
+    TCGv_i64 tmp;
+
+    if (es < ES_16 || es > ES_64) {
+        gen_program_exception(s, PGM_SPECIFICATION);
+        return DISAS_NORETURN;
+    }
+
+    tmp = tcg_temp_new_i64();
+    tcg_gen_qemu_ld_i64(tmp, o->addr1, get_mem_index(s), MO_LE | es);
+    gen_gvec_dup_i64(es, get_field(s, v1), tmp);
+    tcg_temp_free_i64(tmp);
+    return DISAS_NEXT;
+}
+
+static DisasJumpType op_vllebrz(DisasContext *s, DisasOps *o)
+{
+    const uint8_t m3 = get_field(s, m3);
+    TCGv_i64 tmp;
+    int es, lshift;
+
+    switch (m3) {
+    case ES_16:
+    case ES_32:
+    case ES_64:
+        es = m3;
+        lshift = 0;
+        break;
+    case 6:
+        es = ES_32;
+        lshift = 32;
+        break;
+    default:
+        gen_program_exception(s, PGM_SPECIFICATION);
+        return DISAS_NORETURN;
+    }
+
+    tmp = tcg_temp_new_i64();
+    tcg_gen_qemu_ld_i64(tmp, o->addr1, get_mem_index(s), MO_LE | es);
+    tcg_gen_shli_i64(tmp, tmp, lshift);
+
+    write_vec_element_i64(tmp, get_field(s, v1), 0, ES_64);
+    write_vec_element_i64(tcg_constant_i64(0), get_field(s, v1), 1, ES_64);
+    tcg_temp_free_i64(tmp);
+    return DISAS_NEXT;
+}
+
 static DisasJumpType op_vlbr(DisasContext *s, DisasOps *o)
 {
     const uint8_t es = get_field(s, m3);
@@ -1048,6 +1115,24 @@ static DisasJumpType op_vst(DisasContext *s, DisasOps *o)
     return DISAS_NEXT;
 }
 
+static DisasJumpType op_vstebr(DisasContext *s, DisasOps *o)
+{
+    const uint8_t es = s->insn->data;
+    const uint8_t enr = get_field(s, m3);
+    TCGv_i64 tmp;
+
+    if (!valid_vec_element(enr, es)) {
+        gen_program_exception(s, PGM_SPECIFICATION);
+        return DISAS_NORETURN;
+    }
+
+    tmp = tcg_temp_new_i64();
+    read_vec_element_i64(tmp, get_field(s, v1), enr, es);
+    tcg_gen_qemu_st_i64(tmp, o->addr1, get_mem_index(s), MO_LE | es);
+    tcg_temp_free_i64(tmp);
+    return DISAS_NEXT;
+}
+
 static DisasJumpType op_vstbr(DisasContext *s, DisasOps *o)
 {
     const uint8_t es = get_field(s, m3);
diff --git a/target/s390x/tcg/insn-data.def b/target/s390x/tcg/insn-data.def
index ee6e1dc9e5..b80f989002 100644
--- a/target/s390x/tcg/insn-data.def
+++ b/target/s390x/tcg/insn-data.def
@@ -1027,6 +1027,14 @@
     F(0xe756, VLR,     VRR_a, V,   0, 0, 0, 0, vlr, 0, IF_VEC)
 /* VECTOR LOAD AND REPLICATE */
     F(0xe705, VLREP,   VRX,   V,   la2, 0, 0, 0, vlrep, 0, IF_VEC)
+/* VECTOR LOAD BYTE REVERSED ELEMENT */
+    E(0xe601, VLEBRH,  VRX,   VE2, la2, 0, 0, 0, vlebr, 0, ES_16, IF_VEC)
+    E(0xe603, VLEBRF,  VRX,   VE2, la2, 0, 0, 0, vlebr, 0, ES_32, IF_VEC)
+    E(0xe602, VLEBRG,  VRX,   VE2, la2, 0, 0, 0, vlebr, 0, ES_64, IF_VEC)
+/* VECTOR LOAD BYTE REVERSED ELEMENT AND REPLICATE */
+    F(0xe605, VLBRREP, VRX,   VE2, la2, 0, 0, 0, vlbrrep, 0, IF_VEC)
+/* VECTOR LOAD BYTE REVERSED ELEMENT AND ZERO */
+    F(0xe604, VLLEBRZ, VRX,   VE2, la2, 0, 0, 0, vllebrz, 0, IF_VEC)
 /* VECTOR LOAD BYTE REVERSED ELEMENTS */
     F(0xe606, VLBR,    VRX,   VE2, la2, 0, 0, 0, vlbr, 0, IF_VEC)
 /* VECTOR LOAD ELEMENT */
@@ -1081,6 +1089,10 @@
     F(0xe75f, VSEG,    VRR_a, V,   0, 0, 0, 0, vseg, 0, IF_VEC)
 /* VECTOR STORE */
     F(0xe70e, VST,     VRX,   V,   la2, 0, 0, 0, vst, 0, IF_VEC)
+/* VECTOR STORE BYTE REVERSED ELEMENT */
+    E(0xe609, VSTEBRH,  VRX,   VE2, la2, 0, 0, 0, vstebr, 0, ES_16, IF_VEC)
+    E(0xe60b, VSTEBRF,  VRX,   VE2, la2, 0, 0, 0, vstebr, 0, ES_32, IF_VEC)
+    E(0xe60a, VSTEBRG,  VRX,   VE2, la2, 0, 0, 0, vstebr, 0, ES_64, IF_VEC)
 /* VECTOR STORE BYTE REVERSED ELEMENTS */
     F(0xe60e, VSTBR,    VRX,   VE2, la2, 0, 0, 0, vstbr, 0, IF_VEC)
 /* VECTOR STORE ELEMENT */
-- 
2.25.1



^ permalink raw reply related	[flat|nested] 29+ messages in thread

* [PATCH v3 09/11] target/s390x: add S390_FEAT_VECTOR_ENH2 to cpu max
  2022-03-08  1:53 [PATCH v3 00/11] s390x/tcg: Implement Vector-Enhancements Facility 2 Richard Henderson
                   ` (7 preceding siblings ...)
  2022-03-08  1:53 ` [PATCH v3 08/11] target/s390x: vxeh2: vector {load, store} byte reversed element Richard Henderson
@ 2022-03-08  1:53 ` Richard Henderson
  2022-03-21 11:28   ` David Hildenbrand
  2022-03-08  1:53 ` [PATCH v3 10/11] tests/tcg/s390x: Tests for Vector Enhancements Facility 2 Richard Henderson
                   ` (2 subsequent siblings)
  11 siblings, 1 reply; 29+ messages in thread
From: Richard Henderson @ 2022-03-08  1:53 UTC (permalink / raw)
  To: qemu-devel; +Cc: qemu-s390x, dmiller423

From: David Miller <dmiller423@gmail.com>

Signed-off-by: David Miller <dmiller423@gmail.com>
Message-Id: <20220307020327.3003-7-dmiller423@gmail.com>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 target/s390x/gen-features.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/target/s390x/gen-features.c b/target/s390x/gen-features.c
index 22846121c4..499a3b10a8 100644
--- a/target/s390x/gen-features.c
+++ b/target/s390x/gen-features.c
@@ -740,7 +740,9 @@ static uint16_t qemu_V6_2[] = {
 
 static uint16_t qemu_LATEST[] = {
     S390_FEAT_MISC_INSTRUCTION_EXT3,
+    S390_FEAT_VECTOR_ENH2,
 };
+
 /* add all new definitions before this point */
 static uint16_t qemu_MAX[] = {
     /* generates a dependency warning, leave it out for now */
-- 
2.25.1



^ permalink raw reply related	[flat|nested] 29+ messages in thread

* [PATCH v3 10/11] tests/tcg/s390x: Tests for Vector Enhancements Facility 2
  2022-03-08  1:53 [PATCH v3 00/11] s390x/tcg: Implement Vector-Enhancements Facility 2 Richard Henderson
                   ` (8 preceding siblings ...)
  2022-03-08  1:53 ` [PATCH v3 09/11] target/s390x: add S390_FEAT_VECTOR_ENH2 to cpu max Richard Henderson
@ 2022-03-08  1:53 ` Richard Henderson
  2022-03-08  1:53 ` [PATCH v3 11/11] target/s390x: Fix writeback to v1 in helper_vstl Richard Henderson
  2022-03-08  4:09 ` [PATCH v3 00/11] s390x/tcg: Implement Vector-Enhancements Facility 2 David Miller
  11 siblings, 0 replies; 29+ messages in thread
From: Richard Henderson @ 2022-03-08  1:53 UTC (permalink / raw)
  To: qemu-devel; +Cc: qemu-s390x, dmiller423

From: David Miller <dmiller423@gmail.com>

* tests/tcg/s390x/vxeh2_vcvt.c  : vector convert
* tests/tcg/s390x/vxeh2_vs.c    : vector shift
* tests/tcg/s390x/vxeh2_vlstr.c : vector load/store reversed

Signed-off-by: David Miller <dmiller423@gmail.com>
Message-Id: <20220307020327.3003-8-dmiller423@gmail.com>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tests/tcg/s390x/vxeh2_vcvt.c    |  97 +++++++++++++++++++++
 tests/tcg/s390x/vxeh2_vlstr.c   | 146 ++++++++++++++++++++++++++++++++
 tests/tcg/s390x/vxeh2_vs.c      |  91 ++++++++++++++++++++
 tests/tcg/s390x/Makefile.target |   8 ++
 4 files changed, 342 insertions(+)
 create mode 100644 tests/tcg/s390x/vxeh2_vcvt.c
 create mode 100644 tests/tcg/s390x/vxeh2_vlstr.c
 create mode 100644 tests/tcg/s390x/vxeh2_vs.c

diff --git a/tests/tcg/s390x/vxeh2_vcvt.c b/tests/tcg/s390x/vxeh2_vcvt.c
new file mode 100644
index 0000000000..71ecbd77b0
--- /dev/null
+++ b/tests/tcg/s390x/vxeh2_vcvt.c
@@ -0,0 +1,97 @@
+/*
+ * vxeh2_vcvt: vector-enhancements facility 2 vector convert
+ */
+#include <stdint.h>
+
+typedef union S390Vector {
+    uint64_t d[2];  /* doubleword */
+    uint32_t w[4];  /* word */
+    uint16_t h[8];  /* halfword */
+    uint8_t  b[16]; /* byte */
+    float    f[4];
+    double   fd[2];
+    __uint128_t v;
+} S390Vector;
+
+#define M_S 8
+#define M4_XxC 4
+#define M4_def M4_XxC
+
+static inline void vcfps(S390Vector *v1, S390Vector *v2,
+    const uint8_t m3,  const uint8_t m4,  const uint8_t m5)
+{
+    asm volatile("vcfps %[v1], %[v2], %[m3], %[m4], %[m5]\n"
+                : [v1] "=v" (v1->v)
+                : [v2]  "v" (v2->v)
+                , [m3]  "i" (m3)
+                , [m4]  "i" (m4)
+                , [m5]  "i" (m5));
+}
+
+static inline void vcfpl(S390Vector *v1, S390Vector *v2,
+    const uint8_t m3,  const uint8_t m4,  const uint8_t m5)
+{
+    asm volatile("vcfpl %[v1], %[v2], %[m3], %[m4], %[m5]\n"
+                : [v1] "=v" (v1->v)
+                : [v2]  "v" (v2->v)
+                , [m3]  "i" (m3)
+                , [m4]  "i" (m4)
+                , [m5]  "i" (m5));
+}
+
+static inline void vcsfp(S390Vector *v1, S390Vector *v2,
+    const uint8_t m3,  const uint8_t m4,  const uint8_t m5)
+{
+    asm volatile("vcsfp %[v1], %[v2], %[m3], %[m4], %[m5]\n"
+                : [v1] "=v" (v1->v)
+                : [v2]  "v" (v2->v)
+                , [m3]  "i" (m3)
+                , [m4]  "i" (m4)
+                , [m5]  "i" (m5));
+}
+
+static inline void vclfp(S390Vector *v1, S390Vector *v2,
+    const uint8_t m3,  const uint8_t m4,  const uint8_t m5)
+{
+    asm volatile("vclfp %[v1], %[v2], %[m3], %[m4], %[m5]\n"
+                : [v1] "=v" (v1->v)
+                : [v2]  "v" (v2->v)
+                , [m3]  "i" (m3)
+                , [m4]  "i" (m4)
+                , [m5]  "i" (m5));
+}
+
+int main(int argc, char *argv[])
+{
+    S390Vector vd;
+    S390Vector vs_i32 = { .w[0] = 1, .w[1] = 64, .w[2] = 1024, .w[3] = -10 };
+    S390Vector vs_u32 = { .w[0] = 2, .w[1] = 32, .w[2] = 4096, .w[3] = 8888 };
+    S390Vector vs_f32 = { .f[0] = 3.987, .f[1] = 5.123,
+                          .f[2] = 4.499, .f[3] = 0.512 };
+
+    vd.d[0] = vd.d[1] = 0;
+    vcfps(&vd, &vs_i32, 2, M4_def, 0);
+    if (1 != vd.f[0] || 1024 != vd.f[2] || 64 != vd.f[1] || -10 != vd.f[3]) {
+        return 1;
+    }
+
+    vd.d[0] = vd.d[1] = 0;
+    vcfpl(&vd, &vs_u32, 2, M4_def, 0);
+    if (2 != vd.f[0] || 4096 != vd.f[2] || 32 != vd.f[1] || 8888 != vd.f[3]) {
+        return 1;
+    }
+
+    vd.d[0] = vd.d[1] = 0;
+    vcsfp(&vd, &vs_f32, 2, M4_def, 0);
+    if (4 != vd.w[0] || 4 != vd.w[2] || 5 != vd.w[1] || 1 != vd.w[3]) {
+        return 1;
+    }
+
+    vd.d[0] = vd.d[1] = 0;
+    vclfp(&vd, &vs_f32, 2, M4_def, 0);
+    if (4 != vd.w[0] || 4 != vd.w[2] || 5 != vd.w[1] || 1 != vd.w[3]) {
+        return 1;
+    }
+
+    return 0;
+}
diff --git a/tests/tcg/s390x/vxeh2_vlstr.c b/tests/tcg/s390x/vxeh2_vlstr.c
new file mode 100644
index 0000000000..bf2954e86d
--- /dev/null
+++ b/tests/tcg/s390x/vxeh2_vlstr.c
@@ -0,0 +1,146 @@
+/*
+ * vxeh2_vlstr: vector-enhancements facility 2 vector load/store reversed
+ */
+#include <stdint.h>
+
+typedef union S390Vector {
+    uint64_t d[2];  /* doubleword */
+    uint32_t w[4];  /* word */
+    uint16_t h[8];  /* halfword */
+    uint8_t  b[16]; /* byte */
+    __uint128_t v;
+} S390Vector;
+
+#define ES8  0
+#define ES16 1
+#define ES32 2
+#define ES64 3
+
+#define vtst(v1, v2) \
+    if (v1.d[0] != v2.d[0] || v1.d[1] != v2.d[1]) { \
+        return 1;     \
+    }
+
+static inline void vler(S390Vector *v1, const void *va, uint8_t m3)
+{
+    asm volatile("vler %[v1], 0(%[va]), %[m3]\n"
+                : [v1] "+v" (v1->v)
+                : [va]  "d" (va)
+                , [m3]  "i" (m3)
+                : "memory");
+}
+
+static inline void vster(S390Vector *v1, const void *va, uint8_t m3)
+{
+    asm volatile("vster %[v1], 0(%[va]), %[m3]\n"
+                : [va] "+d" (va)
+                : [v1]  "v" (v1->v)
+                , [m3]  "i" (m3)
+                : "memory");
+}
+
+static inline void vlbr(S390Vector *v1, void *va, const uint8_t m3)
+{
+    asm volatile("vlbr %[v1], 0(%[va]), %[m3]\n"
+                : [v1] "+v" (v1->v)
+                : [va]  "d" (va)
+                , [m3]  "i" (m3)
+                : "memory");
+}
+
+static inline void vstbr(S390Vector *v1, void *va, const uint8_t m3)
+{
+    asm volatile("vstbr %[v1], 0(%[va]), %[m3]\n"
+                : [va] "+d" (va)
+                : [v1]  "v" (v1->v)
+                , [m3]  "i" (m3)
+                : "memory");
+}
+
+
+static inline void vlebrh(S390Vector *v1, void *va, const uint8_t m3)
+{
+    asm volatile("vlebrh %[v1], 0(%[va]), %[m3]\n"
+                : [v1] "+v" (v1->v)
+                : [va]  "d" (va)
+                , [m3]  "i" (m3)
+                : "memory");
+}
+
+static inline void vstebrh(S390Vector *v1, void *va, const uint8_t m3)
+{
+    asm volatile("vstebrh %[v1], 0(%[va]), %[m3]\n"
+                : [va] "+d" (va)
+                : [v1]  "v" (v1->v)
+                , [m3]  "i" (m3)
+                : "memory");
+}
+
+static inline void vllebrz(S390Vector *v1, void *va, const uint8_t m3)
+{
+    asm volatile("vllebrz %[v1], 0(%[va]), %[m3]\n"
+                : [v1] "+v" (v1->v)
+                : [va]  "d" (va)
+                , [m3]  "i" (m3)
+                : "memory");
+}
+
+static inline void vlbrrep(S390Vector *v1, void *va, const uint8_t m3)
+{
+    asm volatile("vlbrrep %[v1], 0(%[va]), %[m3]\n"
+                : [v1] "+v" (v1->v)
+                : [va]  "d" (va)
+                , [m3]  "i" (m3)
+                : "memory");
+}
+
+
+int main(int argc, char *argv[])
+{
+    S390Vector vd = { .d[0] = 0, .d[1] = 0 };
+    S390Vector vs = { .d[0] = 0x8FEEDDCCBBAA9988ull,
+                      .d[1] = 0x7766554433221107ull };
+
+    const S390Vector vt_v_er16 = {
+        .h[0] = 0x1107, .h[1] = 0x3322, .h[2] = 0x5544, .h[3] = 0x7766,
+        .h[4] = 0x9988, .h[5] = 0xBBAA, .h[6] = 0xDDCC, .h[7] = 0x8FEE };
+
+    const S390Vector vt_v_br16 = {
+        .h[0] = 0xEE8F, .h[1] = 0xCCDD, .h[2] = 0xAABB, .h[3] = 0x8899,
+        .h[4] = 0x6677, .h[5] = 0x4455, .h[6] = 0x2233, .h[7] = 0x0711 };
+
+    int ix;
+    uint64_t ss64 = 0xFEEDFACE0BADBEEFull, sd64 = 0;
+
+    vler (&vd, &vs, ES16);  vtst(vd, vt_v_er16);
+    vster(&vs, &vd, ES16);  vtst(vd, vt_v_er16);
+
+    vlbr (&vd, &vs, ES16);  vtst(vd, vt_v_br16);
+    vstbr(&vs, &vd, ES16);  vtst(vd, vt_v_br16);
+
+    vlebrh(&vd, &ss64, 5);
+    if (0xEDFE != vd.h[5]) {
+        return 1;
+    }
+
+    vstebrh(&vs, (uint8_t *)&sd64 + 4, 7);
+    if (0x0000000007110000ull != sd64) {
+        return 1;
+    }
+
+    vllebrz(&vd, (uint8_t *)&ss64 + 3, 2);
+    for (ix = 0; ix < 4; ix++) {
+        if (vd.w[ix] != (ix != 1 ? 0 : 0xBEAD0BCE)) {
+            return 1;
+        }
+    }
+
+    vlbrrep(&vd, (uint8_t *)&ss64 + 4, 1);
+    for (ix = 0; ix < 8; ix++) {
+        if (0xAD0B != vd.h[ix]) {
+            return 1;
+        }
+    }
+
+    return 0;
+}
diff --git a/tests/tcg/s390x/vxeh2_vs.c b/tests/tcg/s390x/vxeh2_vs.c
new file mode 100644
index 0000000000..04a3d4d7bb
--- /dev/null
+++ b/tests/tcg/s390x/vxeh2_vs.c
@@ -0,0 +1,91 @@
+/*
+ * vxeh2_vs: vector-enhancements facility 2 vector shift
+ */
+#include <stdint.h>
+
+typedef union S390Vector {
+    uint64_t d[2];  /* doubleword */
+    uint32_t w[4];  /* word */
+    uint16_t h[8];  /* halfword */
+    uint8_t  b[16]; /* byte */
+    __uint128_t v;
+} S390Vector;
+
+#define vtst(v1, v2) \
+    if (v1.d[0] != v2.d[0] || v1.d[1] != v2.d[1]) { \
+        return 1;     \
+    }
+
+static inline void vsl(S390Vector *v1, S390Vector *v2, S390Vector *v3)
+{
+    asm volatile("vsl %[v1], %[v2], %[v3]\n"
+                : [v1] "=v" (v1->v)
+                : [v2]  "v" (v2->v)
+                , [v3]  "v" (v3->v));
+}
+
+static inline void vsra(S390Vector *v1, S390Vector *v2, S390Vector *v3)
+{
+    asm volatile("vsra %[v1], %[v2], %[v3]\n"
+                : [v1] "=v" (v1->v)
+                : [v2]  "v" (v2->v)
+                , [v3]  "v" (v3->v));
+}
+
+static inline void vsrl(S390Vector *v1, S390Vector *v2, S390Vector *v3)
+{
+    asm volatile("vsrl %[v1], %[v2], %[v3]\n"
+                : [v1] "=v" (v1->v)
+                : [v2]  "v" (v2->v)
+                , [v3]  "v" (v3->v));
+}
+
+static inline void vsld(S390Vector *v1, S390Vector *v2,
+    S390Vector *v3, const uint8_t I)
+{
+    asm volatile("vsld %[v1], %[v2], %[v3], %[I]\n"
+                : [v1] "=v" (v1->v)
+                : [v2]  "v" (v2->v)
+                , [v3]  "v" (v3->v)
+                , [I]   "i" (I & 7));
+}
+
+static inline void vsrd(S390Vector *v1, S390Vector *v2,
+    S390Vector *v3, const uint8_t I)
+{
+    asm volatile("vsrd %[v1], %[v2], %[v3], %[I]\n"
+                : [v1] "=v" (v1->v)
+                : [v2]  "v" (v2->v)
+                , [v3]  "v" (v3->v)
+                , [I]   "i" (I & 7));
+}
+
+int main(int argc, char *argv[])
+{
+    const S390Vector vt_vsl  = { .d[0] = 0x7FEDBB32D5AA311Dull,
+                                 .d[1] = 0xBB65AA10912220C0ull };
+    const S390Vector vt_vsra = { .d[0] = 0xF1FE6E7399AA5466ull,
+                                 .d[1] = 0x0E762A5188221044ull };
+    const S390Vector vt_vsrl = { .d[0] = 0x11FE6E7399AA5466ull,
+                                 .d[1] = 0x0E762A5188221044ull };
+    const S390Vector vt_vsld = { .d[0] = 0x7F76EE65DD54CC43ull,
+                                 .d[1] = 0xBB32AA2199108838ull };
+    const S390Vector vt_vsrd = { .d[0] = 0x0E060802040E000Aull,
+                                 .d[1] = 0x0C060802040E000Aull };
+    S390Vector vs  = { .d[0] = 0x8FEEDDCCBBAA9988ull,
+                       .d[1] = 0x7766554433221107ull };
+    S390Vector  vd = { .d[0] = 0, .d[1] = 0 };
+    S390Vector vsi = { .d[0] = 0, .d[1] = 0 };
+
+    for (int ix = 0; ix < 16; ix++) {
+        vsi.b[ix] = (1 + (5 ^ ~ix)) & 7;
+    }
+
+    vsl (&vd, &vs, &vsi);       vtst(vd, vt_vsl);
+    vsra(&vd, &vs, &vsi);       vtst(vd, vt_vsra);
+    vsrl(&vd, &vs, &vsi);       vtst(vd, vt_vsrl);
+    vsld(&vd, &vs, &vsi, 3);  vtst(vd, vt_vsld);
+    vsrd(&vd, &vs, &vsi, 15); vtst(vd, vt_vsrd);
+
+    return 0;
+}
diff --git a/tests/tcg/s390x/Makefile.target b/tests/tcg/s390x/Makefile.target
index 257c568c58..badb7b16fe 100644
--- a/tests/tcg/s390x/Makefile.target
+++ b/tests/tcg/s390x/Makefile.target
@@ -16,6 +16,14 @@ TESTS+=shift
 TESTS+=trap
 TESTS+=signals-s390x
 
+VECTOR_TESTS=vxeh2_vs
+VECTOR_TESTS+=vxeh2_vcvt
+VECTOR_TESTS+=vxeh2_vlstr
+
+TESTS+=$(VECTOR_TESTS)
+
+$(VECTOR_TESTS): CFLAGS+=-march=z15 -O2
+
 ifneq ($(HAVE_GDB_BIN),)
 GDB_SCRIPT=$(SRC_PATH)/tests/guest-debug/run-test.py
 
-- 
2.25.1



^ permalink raw reply related	[flat|nested] 29+ messages in thread

* [PATCH v3 11/11] target/s390x: Fix writeback to v1 in helper_vstl
  2022-03-08  1:53 [PATCH v3 00/11] s390x/tcg: Implement Vector-Enhancements Facility 2 Richard Henderson
                   ` (9 preceding siblings ...)
  2022-03-08  1:53 ` [PATCH v3 10/11] tests/tcg/s390x: Tests for Vector Enhancements Facility 2 Richard Henderson
@ 2022-03-08  1:53 ` Richard Henderson
  2022-03-21 11:26   ` David Hildenbrand
  2022-03-08  4:09 ` [PATCH v3 00/11] s390x/tcg: Implement Vector-Enhancements Facility 2 David Miller
  11 siblings, 1 reply; 29+ messages in thread
From: Richard Henderson @ 2022-03-08  1:53 UTC (permalink / raw)
  To: qemu-devel; +Cc: qemu-s390x, dmiller423

Copy-paste error from vector load length -- do not write
zeros back to v1 after storing from v1.

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 target/s390x/tcg/vec_helper.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/target/s390x/tcg/vec_helper.c b/target/s390x/tcg/vec_helper.c
index ededf13cf0..48d86722b2 100644
--- a/target/s390x/tcg/vec_helper.c
+++ b/target/s390x/tcg/vec_helper.c
@@ -200,7 +200,6 @@ void HELPER(vstl)(CPUS390XState *env, const void *v1, uint64_t addr,
         addr = wrap_address(env, addr + 8);
         cpu_stq_data_ra(env, addr, s390_vec_read_element64(v1, 1), GETPC());
     } else {
-        S390Vector tmp = {};
         int i;
 
         for (i = 0; i < bytes; i++) {
@@ -209,6 +208,5 @@ void HELPER(vstl)(CPUS390XState *env, const void *v1, uint64_t addr,
             cpu_stb_data_ra(env, addr, byte, GETPC());
             addr = wrap_address(env, addr + 1);
         }
-        *(S390Vector *)v1 = tmp;
     }
 }
-- 
2.25.1



^ permalink raw reply related	[flat|nested] 29+ messages in thread

* Re: [PATCH v3 00/11] s390x/tcg: Implement Vector-Enhancements Facility 2
  2022-03-08  1:53 [PATCH v3 00/11] s390x/tcg: Implement Vector-Enhancements Facility 2 Richard Henderson
                   ` (10 preceding siblings ...)
  2022-03-08  1:53 ` [PATCH v3 11/11] target/s390x: Fix writeback to v1 in helper_vstl Richard Henderson
@ 2022-03-08  4:09 ` David Miller
  2022-03-20  1:14   ` David Miller
  11 siblings, 1 reply; 29+ messages in thread
From: David Miller @ 2022-03-08  4:09 UTC (permalink / raw)
  To: Richard Henderson; +Cc: qemu-s390x, qemu-devel

[-- Attachment #1: Type: text/plain, Size: 2151 bytes --]

I've reviewed all changes,  looks good.
Ran all of my own tests including vstrs, all passed.

Thank you for all reviews and changes here.

- David Miller

On Mon, Mar 7, 2022 at 8:54 PM Richard Henderson <
richard.henderson@linaro.org> wrote:

> Hi David,
>
> I've split up the patches a bit, made some improvements to
> the shifts and reversals, and fixed a few bugs.
>
> Please especially review vector string search, as that is
> has had major changes.
>
>
> r~
>
>
> David Miller (9):
>   target/s390x: vxeh2: vector convert short/32b
>   target/s390x: vxeh2: vector string search
>   target/s390x: vxeh2: Update for changes to vector shifts
>   target/s390x: vxeh2: vector shift double by bit
>   target/s390x: vxeh2: vector {load, store} elements reversed
>   target/s390x: vxeh2: vector {load, store} byte reversed elements
>   target/s390x: vxeh2: vector {load, store} byte reversed element
>   target/s390x: add S390_FEAT_VECTOR_ENH2 to cpu max
>   tests/tcg/s390x: Tests for Vector Enhancements Facility 2
>
> Richard Henderson (2):
>   tcg: Implement tcg_gen_{h,w}swap_{i32,i64}
>   target/s390x: Fix writeback to v1 in helper_vstl
>
>  include/tcg/tcg-op.h                 |   6 +
>  target/s390x/helper.h                |  13 +
>  target/s390x/gen-features.c          |   2 +
>  target/s390x/tcg/translate.c         |   3 +-
>  target/s390x/tcg/vec_fpu_helper.c    |  31 ++
>  target/s390x/tcg/vec_helper.c        |   2 -
>  target/s390x/tcg/vec_int_helper.c    |  58 ++++
>  target/s390x/tcg/vec_string_helper.c | 101 ++++++
>  tcg/tcg-op.c                         |  30 ++
>  tests/tcg/s390x/vxeh2_vcvt.c         |  97 ++++++
>  tests/tcg/s390x/vxeh2_vlstr.c        | 146 +++++++++
>  tests/tcg/s390x/vxeh2_vs.c           |  91 ++++++
>  target/s390x/tcg/translate_vx.c.inc  | 442 ++++++++++++++++++++++++---
>  target/s390x/tcg/insn-data.def       |  40 ++-
>  tests/tcg/s390x/Makefile.target      |   8 +
>  15 files changed, 1018 insertions(+), 52 deletions(-)
>  create mode 100644 tests/tcg/s390x/vxeh2_vcvt.c
>  create mode 100644 tests/tcg/s390x/vxeh2_vlstr.c
>  create mode 100644 tests/tcg/s390x/vxeh2_vs.c
>
> --
> 2.25.1
>
>

[-- Attachment #2: Type: text/html, Size: 3414 bytes --]

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH v3 00/11] s390x/tcg: Implement Vector-Enhancements Facility 2
  2022-03-08  4:09 ` [PATCH v3 00/11] s390x/tcg: Implement Vector-Enhancements Facility 2 David Miller
@ 2022-03-20  1:14   ` David Miller
  2022-03-21  9:19     ` David Hildenbrand
  0 siblings, 1 reply; 29+ messages in thread
From: David Miller @ 2022-03-20  1:14 UTC (permalink / raw)
  To: Richard Henderson
  Cc: qemu-s390x, Christian Borntraeger, qemu-devel, David Hildenbrand

Is this waiting on me for anything?
I wanted to ensure this is wrapped up before starting a new project.

Thanks,
-  David Miller

On Mon, Mar 7, 2022 at 11:09 PM David Miller <dmiller423@gmail.com> wrote:
>
>
> I've reviewed all changes,  looks good.
> Ran all of my own tests including vstrs, all passed.
>
> Thank you for all reviews and changes here.
>
> - David Miller
>
> On Mon, Mar 7, 2022 at 8:54 PM Richard Henderson <richard.henderson@linaro.org> wrote:
>>
>> Hi David,
>>
>> I've split up the patches a bit, made some improvements to
>> the shifts and reversals, and fixed a few bugs.
>>
>> Please especially review vector string search, as that is
>> has had major changes.
>>
>>
>> r~
>>
>>
>> David Miller (9):
>>   target/s390x: vxeh2: vector convert short/32b
>>   target/s390x: vxeh2: vector string search
>>   target/s390x: vxeh2: Update for changes to vector shifts
>>   target/s390x: vxeh2: vector shift double by bit
>>   target/s390x: vxeh2: vector {load, store} elements reversed
>>   target/s390x: vxeh2: vector {load, store} byte reversed elements
>>   target/s390x: vxeh2: vector {load, store} byte reversed element
>>   target/s390x: add S390_FEAT_VECTOR_ENH2 to cpu max
>>   tests/tcg/s390x: Tests for Vector Enhancements Facility 2
>>
>> Richard Henderson (2):
>>   tcg: Implement tcg_gen_{h,w}swap_{i32,i64}
>>   target/s390x: Fix writeback to v1 in helper_vstl
>>
>>  include/tcg/tcg-op.h                 |   6 +
>>  target/s390x/helper.h                |  13 +
>>  target/s390x/gen-features.c          |   2 +
>>  target/s390x/tcg/translate.c         |   3 +-
>>  target/s390x/tcg/vec_fpu_helper.c    |  31 ++
>>  target/s390x/tcg/vec_helper.c        |   2 -
>>  target/s390x/tcg/vec_int_helper.c    |  58 ++++
>>  target/s390x/tcg/vec_string_helper.c | 101 ++++++
>>  tcg/tcg-op.c                         |  30 ++
>>  tests/tcg/s390x/vxeh2_vcvt.c         |  97 ++++++
>>  tests/tcg/s390x/vxeh2_vlstr.c        | 146 +++++++++
>>  tests/tcg/s390x/vxeh2_vs.c           |  91 ++++++
>>  target/s390x/tcg/translate_vx.c.inc  | 442 ++++++++++++++++++++++++---
>>  target/s390x/tcg/insn-data.def       |  40 ++-
>>  tests/tcg/s390x/Makefile.target      |   8 +
>>  15 files changed, 1018 insertions(+), 52 deletions(-)
>>  create mode 100644 tests/tcg/s390x/vxeh2_vcvt.c
>>  create mode 100644 tests/tcg/s390x/vxeh2_vlstr.c
>>  create mode 100644 tests/tcg/s390x/vxeh2_vs.c
>>
>> --
>> 2.25.1
>>


^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH v3 00/11] s390x/tcg: Implement Vector-Enhancements Facility 2
  2022-03-20  1:14   ` David Miller
@ 2022-03-21  9:19     ` David Hildenbrand
  0 siblings, 0 replies; 29+ messages in thread
From: David Hildenbrand @ 2022-03-21  9:19 UTC (permalink / raw)
  To: David Miller, Richard Henderson
  Cc: qemu-s390x, Christian Borntraeger, qemu-devel

On 20.03.22 02:14, David Miller wrote:
> Is this waiting on me for anything?
> I wanted to ensure this is wrapped up before starting a new project.

This fell through the cracks because I wasn't cc-ed by Richard on this
series. I'll try reviewing this week.


-- 
Thanks,

David / dhildenb



^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH v3 01/11] tcg: Implement tcg_gen_{h,w}swap_{i32,i64}
  2022-03-08  1:53 ` [PATCH v3 01/11] tcg: Implement tcg_gen_{h,w}swap_{i32,i64} Richard Henderson
@ 2022-03-21  9:32   ` David Hildenbrand
  0 siblings, 0 replies; 29+ messages in thread
From: David Hildenbrand @ 2022-03-21  9:32 UTC (permalink / raw)
  To: Richard Henderson, qemu-devel; +Cc: qemu-s390x, dmiller423

On 08.03.22 02:53, Richard Henderson wrote:
> Swap half-words (16-bit) and words (32-bit) within a larger value.
> Mirrors functions of the same names within include/qemu/bitops.h.
> 
> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
> ---
>  include/tcg/tcg-op.h |  6 ++++++
>  tcg/tcg-op.c         | 30 ++++++++++++++++++++++++++++++
>  2 files changed, 36 insertions(+)
> 
> diff --git a/include/tcg/tcg-op.h b/include/tcg/tcg-op.h
> index caa0a63612..b09b8b4a05 100644
> --- a/include/tcg/tcg-op.h
> +++ b/include/tcg/tcg-op.h
> @@ -332,6 +332,7 @@ void tcg_gen_ext8u_i32(TCGv_i32 ret, TCGv_i32 arg);
>  void tcg_gen_ext16u_i32(TCGv_i32 ret, TCGv_i32 arg);
>  void tcg_gen_bswap16_i32(TCGv_i32 ret, TCGv_i32 arg, int flags);
>  void tcg_gen_bswap32_i32(TCGv_i32 ret, TCGv_i32 arg);
> +void tcg_gen_hswap_i32(TCGv_i32 ret, TCGv_i32 arg);
>  void tcg_gen_smin_i32(TCGv_i32, TCGv_i32 arg1, TCGv_i32 arg2);
>  void tcg_gen_smax_i32(TCGv_i32, TCGv_i32 arg1, TCGv_i32 arg2);
>  void tcg_gen_umin_i32(TCGv_i32, TCGv_i32 arg1, TCGv_i32 arg2);
> @@ -531,6 +532,8 @@ void tcg_gen_ext32u_i64(TCGv_i64 ret, TCGv_i64 arg);
>  void tcg_gen_bswap16_i64(TCGv_i64 ret, TCGv_i64 arg, int flags);
>  void tcg_gen_bswap32_i64(TCGv_i64 ret, TCGv_i64 arg, int flags);
>  void tcg_gen_bswap64_i64(TCGv_i64 ret, TCGv_i64 arg);
> +void tcg_gen_hswap_i64(TCGv_i64 ret, TCGv_i64 arg);
> +void tcg_gen_wswap_i64(TCGv_i64 ret, TCGv_i64 arg);
>  void tcg_gen_smin_i64(TCGv_i64, TCGv_i64 arg1, TCGv_i64 arg2);
>  void tcg_gen_smax_i64(TCGv_i64, TCGv_i64 arg1, TCGv_i64 arg2);
>  void tcg_gen_umin_i64(TCGv_i64, TCGv_i64 arg1, TCGv_i64 arg2);
> @@ -1077,6 +1080,8 @@ void tcg_gen_stl_vec(TCGv_vec r, TCGv_ptr base, TCGArg offset, TCGType t);
>  #define tcg_gen_bswap32_tl tcg_gen_bswap32_i64
>  #define tcg_gen_bswap64_tl tcg_gen_bswap64_i64
>  #define tcg_gen_bswap_tl tcg_gen_bswap64_i64
> +#define tcg_gen_hswap_tl tcg_gen_hswap_i64
> +#define tcg_gen_wswap_tl tcg_gen_wswap_i64
>  #define tcg_gen_concat_tl_i64 tcg_gen_concat32_i64
>  #define tcg_gen_extr_i64_tl tcg_gen_extr32_i64
>  #define tcg_gen_andc_tl tcg_gen_andc_i64
> @@ -1192,6 +1197,7 @@ void tcg_gen_stl_vec(TCGv_vec r, TCGv_ptr base, TCGArg offset, TCGType t);
>  #define tcg_gen_bswap16_tl tcg_gen_bswap16_i32
>  #define tcg_gen_bswap32_tl(D, S, F) tcg_gen_bswap32_i32(D, S)
>  #define tcg_gen_bswap_tl tcg_gen_bswap32_i32
> +#define tcg_gen_hswap_tl tcg_gen_hswap_i32
>  #define tcg_gen_concat_tl_i64 tcg_gen_concat_i32_i64
>  #define tcg_gen_extr_i64_tl tcg_gen_extr_i64_i32
>  #define tcg_gen_andc_tl tcg_gen_andc_i32
> diff --git a/tcg/tcg-op.c b/tcg/tcg-op.c
> index 65e1c94c2d..379adb4b9f 100644
> --- a/tcg/tcg-op.c
> +++ b/tcg/tcg-op.c
> @@ -1056,6 +1056,12 @@ void tcg_gen_bswap32_i32(TCGv_i32 ret, TCGv_i32 arg)
>      }
>  }
>  
> +void tcg_gen_hswap_i32(TCGv_i32 ret, TCGv_i32 arg)
> +{
> +    /* Swapping 2 16-bit elements is a rotate. */
> +    tcg_gen_rotli_i32(ret, arg, 16);
> +}
> +
>  void tcg_gen_smin_i32(TCGv_i32 ret, TCGv_i32 a, TCGv_i32 b)
>  {
>      tcg_gen_movcond_i32(TCG_COND_LT, ret, a, b, a, b);
> @@ -1792,6 +1798,30 @@ void tcg_gen_bswap64_i64(TCGv_i64 ret, TCGv_i64 arg)
>      }
>  }
>  
> +void tcg_gen_hswap_i64(TCGv_i64 ret, TCGv_i64 arg)
> +{
> +    uint64_t m = 0x0000ffff0000ffffull;
> +    TCGv_i64 t0 = tcg_temp_new_i64();
> +    TCGv_i64 t1 = tcg_temp_new_i64();
> +
> +    /* See include/qemu/bitops.h, hswap64. */
> +    tcg_gen_rotli_i64(t1, arg, 32);
> +    tcg_gen_andi_i64(t0, t1, m);
> +    tcg_gen_shri_i64(t1, t1, 16);
> +    tcg_gen_shli_i64(t0, t0, 16);

I'd flip these two lines into

tcg_gen_shli_i64(t0, t0, 16);
tcg_gen_shri_i64(t1, t1, 16);

To make it easier to map to hswap64().


Reviewed-by: David Hildenbrand <david@redhat.com>

-- 
Thanks,

David / dhildenb



^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH v3 02/11] target/s390x: vxeh2: vector convert short/32b
  2022-03-08  1:53 ` [PATCH v3 02/11] target/s390x: vxeh2: vector convert short/32b Richard Henderson
@ 2022-03-21  9:33   ` David Hildenbrand
  0 siblings, 0 replies; 29+ messages in thread
From: David Hildenbrand @ 2022-03-21  9:33 UTC (permalink / raw)
  To: Richard Henderson, qemu-devel; +Cc: qemu-s390x, dmiller423

On 08.03.22 02:53, Richard Henderson wrote:
> From: David Miller <dmiller423@gmail.com>
> 
> Signed-off-by: David Miller <dmiller423@gmail.com>
> Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
> Message-Id: <20220307020327.3003-2-dmiller423@gmail.com>
> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>

Reviewed-by: David Hildenbrand <david@redhat.com>


-- 
Thanks,

David / dhildenb



^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH v3 03/11] target/s390x: vxeh2: vector string search
  2022-03-08  1:53 ` [PATCH v3 03/11] target/s390x: vxeh2: vector string search Richard Henderson
@ 2022-03-21 10:31   ` David Hildenbrand
  2022-03-22 14:42     ` Richard Henderson
  0 siblings, 1 reply; 29+ messages in thread
From: David Hildenbrand @ 2022-03-21 10:31 UTC (permalink / raw)
  To: Richard Henderson, qemu-devel; +Cc: qemu-s390x, dmiller423

On 08.03.22 02:53, Richard Henderson wrote:
> From: David Miller <dmiller423@gmail.com>
> 
> Signed-off-by: David Miller <dmiller423@gmail.com>
> Message-Id: <20220307020327.3003-3-dmiller423@gmail.com>
> [rth: Rewrite helpers; fix validation of m6.]
> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
> ---
> 
> The substring search was incorrect, in that it didn't properly
> restart the search when a match failed.  Split the helper into
> multiple, so that the memory accesses can be optimized.
> ---
>  target/s390x/helper.h                |   6 ++
>  target/s390x/tcg/translate.c         |   3 +-
>  target/s390x/tcg/vec_string_helper.c | 101 +++++++++++++++++++++++++++
>  target/s390x/tcg/translate_vx.c.inc  |  26 +++++++
>  target/s390x/tcg/insn-data.def       |   2 +
>  5 files changed, 137 insertions(+), 1 deletion(-)
> 
> diff --git a/target/s390x/helper.h b/target/s390x/helper.h
> index 7cbcbd7f0b..7412130883 100644
> --- a/target/s390x/helper.h
> +++ b/target/s390x/helper.h
> @@ -246,6 +246,12 @@ DEF_HELPER_6(gvec_vstrc_cc32, void, ptr, cptr, cptr, cptr, env, i32)
>  DEF_HELPER_6(gvec_vstrc_cc_rt8, void, ptr, cptr, cptr, cptr, env, i32)
>  DEF_HELPER_6(gvec_vstrc_cc_rt16, void, ptr, cptr, cptr, cptr, env, i32)
>  DEF_HELPER_6(gvec_vstrc_cc_rt32, void, ptr, cptr, cptr, cptr, env, i32)
> +DEF_HELPER_6(gvec_vstrs_8, void, ptr, cptr, cptr, cptr, env, i32)
> +DEF_HELPER_6(gvec_vstrs_16, void, ptr, cptr, cptr, cptr, env, i32)
> +DEF_HELPER_6(gvec_vstrs_32, void, ptr, cptr, cptr, cptr, env, i32)
> +DEF_HELPER_6(gvec_vstrs_zs8, void, ptr, cptr, cptr, cptr, env, i32)
> +DEF_HELPER_6(gvec_vstrs_zs16, void, ptr, cptr, cptr, cptr, env, i32)
> +DEF_HELPER_6(gvec_vstrs_zs32, void, ptr, cptr, cptr, cptr, env, i32)
>  
>  /* === Vector Floating-Point Instructions */
>  DEF_HELPER_FLAGS_5(gvec_vfa32, TCG_CALL_NO_WG, void, ptr, cptr, cptr, env, i32)
> diff --git a/target/s390x/tcg/translate.c b/target/s390x/tcg/translate.c
> index 904b51542f..d9ac29573d 100644
> --- a/target/s390x/tcg/translate.c
> +++ b/target/s390x/tcg/translate.c
> @@ -6222,7 +6222,8 @@ enum DisasInsnEnum {
>  #define FAC_PCI         S390_FEAT_ZPCI /* z/PCI facility */
>  #define FAC_AIS         S390_FEAT_ADAPTER_INT_SUPPRESSION
>  #define FAC_V           S390_FEAT_VECTOR /* vector facility */
> -#define FAC_VE          S390_FEAT_VECTOR_ENH /* vector enhancements facility 1 */
> +#define FAC_VE          S390_FEAT_VECTOR_ENH  /* vector enhancements facility 1 */
> +#define FAC_VE2         S390_FEAT_VECTOR_ENH2 /* vector enhancements facility 2 */
>  #define FAC_MIE2        S390_FEAT_MISC_INSTRUCTION_EXT2 /* miscellaneous-instruction-extensions facility 2 */
>  #define FAC_MIE3        S390_FEAT_MISC_INSTRUCTION_EXT3 /* miscellaneous-instruction-extensions facility 3 */
>  
> diff --git a/target/s390x/tcg/vec_string_helper.c b/target/s390x/tcg/vec_string_helper.c
> index ac315eb095..6c0476ecc1 100644
> --- a/target/s390x/tcg/vec_string_helper.c
> +++ b/target/s390x/tcg/vec_string_helper.c
> @@ -471,3 +471,104 @@ void HELPER(gvec_vstrc_cc_rt##BITS)(void *v1, const void *v2, const void *v3,  \
>  DEF_VSTRC_CC_RT_HELPER(8)
>  DEF_VSTRC_CC_RT_HELPER(16)
>  DEF_VSTRC_CC_RT_HELPER(32)
> +
> +static int vstrs(S390Vector *v1, const S390Vector *v2, const S390Vector *v3,
> +                 const S390Vector *v4, uint8_t es, bool zs)
> +{
> +    int substr_elen, substr_0, str_elen, i, j, k, cc;
> +    int nelem = 16 >> es;
> +    bool eos = false;
> +
> +    substr_elen = s390_vec_read_element8(v4, 7) >> es;
> +
> +    /* If ZS, bound substr length by min(nelem, strlen(v3)). */
> +    if (zs) {
> +        int i;

You can drop this "int i;"

> +        for (i = 0; i < nelem; i++) {
> +            if (s390_vec_read_element(v3, i, es) == 0) {
> +                break;
> +            }
> +        }
> +        if (i < substr_elen) {
> +            substr_elen = i;
> +        }

Maybe combine both, I guess there is no need to search beyond substr_elen.

substr_elen = MIN(substr_elen, nelem);
for (i = 0; i < substr_elen; i++) {
    if (s390_vec_read_element(v3, i, es) == 0) {
        substr_elen = i;
        break;
    }
}


We should probably apply MIN(substr_elen, nelem) right when reading
substr_elen from v4.

> +    }
> +
> +    if (substr_elen == 0) {
> +        cc = 2; /* full match for degenerate case of empty substr */
> +        k = 0;
> +        goto done;
> +    }
> +
> +    /* If ZS, look for eos in the searched string. */
> +    if (zs) {
> +        for (k = 0; k < nelem; k++) {
> +            if (s390_vec_read_element(v2, k, es) == 0) {
> +                eos = true;
> +                break;
> +            }
> +        }

I guess we could move that into the main search loop and avoid parsing
the string twice. Not sure what's better.

> +        str_elen = k;
> +    } else {
> +        str_elen = nelem;
> +    }
> +
> +    substr_0 = s390_vec_read_element(v3, 0, es);
> +
> +    for (k = 0; ; k++) {
> +        for (; k < str_elen; k++) {
> +            if (s390_vec_read_element(v2, k, es) == substr_0) {
> +                break;
> +            }
> +        }
> +
> +        /* If we reached the end of the string, no match. */
> +        if (k == str_elen) {
> +            cc = eos; /* no match (with or without zero char) */
> +            goto done;
> +        }
> +
> +        /* If the substring is only one char, match. */
> +        if (substr_elen == 1) {
> +            cc = 2; /* full match */
> +            goto done;
> +        }
> +
> +        /* If the match begins at the last char, we have a partial match. */
> +        if (k == str_elen - 1) {
> +            cc = 3; /* partial match */
> +            goto done;
> +        }
> +
> +        i = MIN(nelem, k + substr_elen);
> +        for (j = k + 1; j < i; j++) {
> +            uint32_t e2 = s390_vec_read_element(v2, j, es);
> +            uint32_t e3 = s390_vec_read_element(v3, j - k, es);
> +            if (e2 != e3) {
> +                break;
> +            }
> +        }
> +        if (j == i) {
> +            /* Matched up until "end". */
> +            cc = i - k == substr_elen ? 2 : 3; /* full or partial match */
> +            goto done;
> +        }
> +    }
> +
> + done:
> +    s390_vec_write_element64(v1, 0, k << es);
> +    s390_vec_write_element64(v1, 1, 0);
> +    return cc;
> +}
> +
> +#define DEF_VSTRS_HELPER(BITS)                                             \
> +void QEMU_FLATTEN HELPER(gvec_vstrs_##BITS)(void *v1, const void *v2,      \
> +    const void *v3, const void *v4, CPUS390XState *env, uint32_t desc)     \
> +    { env->cc_op = vstrs(v1, v2, v3, v4, MO_##BITS, false); }              \
> +void QEMU_FLATTEN HELPER(gvec_vstrs_zs##BITS)(void *v1, const void *v2,    \
> +    const void *v3, const void *v4, CPUS390XState *env, uint32_t desc)     \
> +    { env->cc_op = vstrs(v1, v2, v3, v4, MO_##BITS, true); }
> +
> +DEF_VSTRS_HELPER(8)
> +DEF_VSTRS_HELPER(16)
> +DEF_VSTRS_HELPER(32)
> diff --git a/target/s390x/tcg/translate_vx.c.inc b/target/s390x/tcg/translate_vx.c.inc
> index ea28e40d4f..d514e8b218 100644
> --- a/target/s390x/tcg/translate_vx.c.inc
> +++ b/target/s390x/tcg/translate_vx.c.inc
> @@ -2497,6 +2497,32 @@ static DisasJumpType op_vstrc(DisasContext *s, DisasOps *o)
>      return DISAS_NEXT;
>  }
>  
> +static DisasJumpType op_vstrs(DisasContext *s, DisasOps *o)
> +{
> +    typedef void (*helper_vstrs)(TCGv_ptr, TCGv_ptr, TCGv_ptr,
> +                                 TCGv_ptr, TCGv_ptr, TCGv_i32);
> +    static const helper_vstrs fns[3][2] = {
> +        { gen_helper_gvec_vstrs_8, gen_helper_gvec_vstrs_zs8 },
> +        { gen_helper_gvec_vstrs_16, gen_helper_gvec_vstrs_zs16 },
> +        { gen_helper_gvec_vstrs_32, gen_helper_gvec_vstrs_zs32 },
> +    };
> +

Superfluous empty line.

> +    const uint8_t m5 = get_field(s, m5);

Could do a s/m5/es/, as we do in other handlers.

> +    const uint8_t m6 = get_field(s, m6);
> +    bool zs = m6 & 2;

I remember we wanted to use extract32() for such bit-tests, at least we
do it in most of the other handlers :)

const bool zs = extract32(m6, 1, 1);

?

> +
> +    if (m5 > ES_32 || m6 & ~2) {
> +        gen_program_exception(s, PGM_SPECIFICATION);
> +        return DISAS_NORETURN;
> +    }
> +



-- 
Thanks,

David / dhildenb



^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH v3 04/11] target/s390x: vxeh2: Update for changes to vector shifts
  2022-03-08  1:53 ` [PATCH v3 04/11] target/s390x: vxeh2: Update for changes to vector shifts Richard Henderson
@ 2022-03-21 11:15   ` David Hildenbrand
  0 siblings, 0 replies; 29+ messages in thread
From: David Hildenbrand @ 2022-03-21 11:15 UTC (permalink / raw)
  To: Richard Henderson, qemu-devel; +Cc: qemu-s390x, dmiller423

On 08.03.22 02:53, Richard Henderson wrote:
> From: David Miller <dmiller423@gmail.com>
> 
> Prior to vector enhancements 2, the shift count was supposed to be equal
> for each byte lest the result be unpredictable, which allowed us to assume
> that the shift count was the same, and optimize accordingly.
> 
> With vector enhancements 2, the shift count is allowed to be different
> for each byte, and we must cope with that.
> 
> Signed-off-by: David Miller <dmiller423@gmail.com>
> Message-Id: <20220307020327.3003-4-dmiller423@gmail.com>
> [rth: Split out of larger patch; simplify shift/merge code.]
> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
> ---
>  target/s390x/helper.h               |  3 ++
>  target/s390x/tcg/vec_int_helper.c   | 58 ++++++++++++++++++++++
>  target/s390x/tcg/translate_vx.c.inc | 77 ++++++++++++-----------------
>  target/s390x/tcg/insn-data.def      | 12 ++---
>  4 files changed, 99 insertions(+), 51 deletions(-)
> 
> diff --git a/target/s390x/helper.h b/target/s390x/helper.h
> index 7412130883..bf33d86f74 100644
> --- a/target/s390x/helper.h
> +++ b/target/s390x/helper.h
> @@ -203,8 +203,11 @@ DEF_HELPER_FLAGS_3(gvec_vpopct16, TCG_CALL_NO_RWG, void, ptr, cptr, i32)
>  DEF_HELPER_FLAGS_4(gvec_verim8, TCG_CALL_NO_RWG, void, ptr, cptr, cptr, i32)
>  DEF_HELPER_FLAGS_4(gvec_verim16, TCG_CALL_NO_RWG, void, ptr, cptr, cptr, i32)
>  DEF_HELPER_FLAGS_4(gvec_vsl, TCG_CALL_NO_RWG, void, ptr, cptr, i64, i32)
> +DEF_HELPER_FLAGS_4(gvec_vsl_ve2, TCG_CALL_NO_RWG, void, ptr, cptr, cptr, i32)
>  DEF_HELPER_FLAGS_4(gvec_vsra, TCG_CALL_NO_RWG, void, ptr, cptr, i64, i32)
> +DEF_HELPER_FLAGS_4(gvec_vsra_ve2, TCG_CALL_NO_RWG, void, ptr, cptr, cptr, i32)
>  DEF_HELPER_FLAGS_4(gvec_vsrl, TCG_CALL_NO_RWG, void, ptr, cptr, i64, i32)
> +DEF_HELPER_FLAGS_4(gvec_vsrl_ve2, TCG_CALL_NO_RWG, void, ptr, cptr, cptr, i32)
>  DEF_HELPER_FLAGS_4(gvec_vscbi8, TCG_CALL_NO_RWG, void, ptr, cptr, cptr, i32)
>  DEF_HELPER_FLAGS_4(gvec_vscbi16, TCG_CALL_NO_RWG, void, ptr, cptr, cptr, i32)
>  DEF_HELPER_4(gvec_vtm, void, ptr, cptr, env, i32)
> diff --git a/target/s390x/tcg/vec_int_helper.c b/target/s390x/tcg/vec_int_helper.c
> index 5561b3ed90..a881d5d267 100644
> --- a/target/s390x/tcg/vec_int_helper.c
> +++ b/target/s390x/tcg/vec_int_helper.c
> @@ -540,18 +540,76 @@ void HELPER(gvec_vsl)(void *v1, const void *v2, uint64_t count,
>      s390_vec_shl(v1, v2, count);
>  }
>  
> +void HELPER(gvec_vsl_ve2)(void *v1, const void *v2, const void *v3,
> +                          uint32_t desc)
> +{
> +    S390Vector tmp;
> +    uint32_t sh, e0, e1 = 0;

int i;

> +
> +    for (int i = 15; i >= 0; --i, e1 = e0 << 24) {

I'd only do "e1 = e0" here and do the shift for the rol32 ...

> +        e0 = s390_vec_read_element8(v2, i);
> +        sh = s390_vec_read_element8(v3, i) & 7;
> +
> +        s390_vec_write_element8(&tmp, i, rol32(e0 | e1, sh));

... here

s390_vec_write_element8(&tmp, i, rol32(e0 | e1 << 24, sh));

> +    }
> +
> +    *(S390Vector *)v1 = tmp;
> +}
> +
>  void HELPER(gvec_vsra)(void *v1, const void *v2, uint64_t count,
>                         uint32_t desc)
>  {
>      s390_vec_sar(v1, v2, count);
>  }
>  
> +void HELPER(gvec_vsra_ve2)(void *v1, const void *v2, const void *v3,
> +                           uint32_t desc)
> +{
> +    S390Vector tmp;
> +    uint32_t sh, e0, e1;
> +    int i = 0;
> +
> +    e0 = s390_vec_read_element8(v2, 0);
> +    e1 = -(e0 >> 7) << 8;
> +
> +    for (;;) {
> +        sh = s390_vec_read_element8(v3, i) & 7;
> +
> +        s390_vec_write_element8(&tmp, i, (e0 | e1) >> sh);
> +
> +        if (++i >= 16) {
> +            break;
> +        }
> +
> +        e1 = e0 << 8;
> +        e0 = s390_vec_read_element8(v2, i);
> +    }

Can't we use the following that resembles the other helpers or am I
missing something?

S390Vector tmp;
uint32_t sh, e0, e1 = 0;

/* Only byte 0 is special. */
e0 = (int32_t)(int8_t)s390_vec_read_element8(v2, i);
sh = s390_vec_read_element8(v3, i) & 7;
s390_vec_write_element8(&tmp, i, e0 >> sh);

e1 = e0;
for (int i = 1; i < 16; ++i, e1 = e0) {
	e0 = s390_vec_read_element8(v2, i);
	sh = s390_vec_read_element8(v3, i) & 7;
	s390_vec_write_element8(&tmp, i, (e0 | e1 << 8) >> sh);
}

*(S390Vector *)v1 = tmp;


> +
> +    *(S390Vector *)v1 = tmp;
> +}
> +
>  void HELPER(gvec_vsrl)(void *v1, const void *v2, uint64_t count,
>                         uint32_t desc)
>  {
>      s390_vec_shr(v1, v2, count);
>  }
>  
> +void HELPER(gvec_vsrl_ve2)(void *v1, const void *v2, const void *v3,
> +                           uint32_t desc)
> +{
> +    S390Vector tmp;
> +    uint32_t sh, e0, e1 = 0;
> +
> +    for (int i = 0; i < 16; ++i, e1 = e0 << 8) {

Ditto, I'd do the shift below ...

> +        e0 = s390_vec_read_element8(v2, i);
> +        sh = s390_vec_read_element8(v3, i) & 7;
> +
> +        s390_vec_write_element8(&tmp, i, (e0 | e1) >> sh);

s390_vec_write_element8(&tmp, i, (e0 | e1 << 8) >> sh);

> +    }
> +
> +    *(S390Vector *)v1 = tmp;
> +}
> +
>  #define DEF_VSCBI(BITS)                                                        \
>  void HELPER(gvec_vscbi##BITS)(void *v1, const void *v2, const void *v3,        \
>                                uint32_t desc)                                   \
> diff --git a/target/s390x/tcg/translate_vx.c.inc b/target/s390x/tcg/translate_vx.c.inc
> index d514e8b218..967f6213d8 100644
> --- a/target/s390x/tcg/translate_vx.c.inc
> +++ b/target/s390x/tcg/translate_vx.c.inc
> @@ -2018,21 +2018,42 @@ static DisasJumpType op_ves(DisasContext *s, DisasOps *o)
>      return DISAS_NEXT;
>  }
>  
> +static DisasJumpType gen_vsh_bit_byte(DisasContext *s, DisasOps *o,
> +                                      gen_helper_gvec_2i *gen,
> +                                      gen_helper_gvec_3 *gen_ve2)
> +{
> +    bool byte = s->insn->data;

Nit: I'd have called this "by_byte".


-- 
Thanks,

David / dhildenb



^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH v3 05/11] target/s390x: vxeh2: vector shift double by bit
  2022-03-08  1:53 ` [PATCH v3 05/11] target/s390x: vxeh2: vector shift double by bit Richard Henderson
@ 2022-03-21 11:23   ` David Hildenbrand
  0 siblings, 0 replies; 29+ messages in thread
From: David Hildenbrand @ 2022-03-21 11:23 UTC (permalink / raw)
  To: Richard Henderson, qemu-devel; +Cc: qemu-s390x, dmiller423

On 08.03.22 02:53, Richard Henderson wrote:
> From: David Miller <dmiller423@gmail.com>
> 
> Signed-off-by: David Miller <dmiller423@gmail.com>
> Message-Id: <20220307020327.3003-4-dmiller423@gmail.com>
> [rth: Split out of larger patch.]
> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
> ---
>  target/s390x/tcg/translate_vx.c.inc | 47 ++++++++++++++++++++++++++---
>  target/s390x/tcg/insn-data.def      |  6 +++-
>  2 files changed, 48 insertions(+), 5 deletions(-)
> 
> diff --git a/target/s390x/tcg/translate_vx.c.inc b/target/s390x/tcg/translate_vx.c.inc
> index 967f6213d8..a5283ef2f8 100644
> --- a/target/s390x/tcg/translate_vx.c.inc
> +++ b/target/s390x/tcg/translate_vx.c.inc
> @@ -2056,11 +2056,19 @@ static DisasJumpType op_vsrl(DisasContext *s, DisasOps *o)
>                              gen_helper_gvec_vsrl_ve2);
>  }
>  
> -static DisasJumpType op_vsldb(DisasContext *s, DisasOps *o)
> +static DisasJumpType op_vsld(DisasContext *s, DisasOps *o)
>  {
> -    const uint8_t i4 = get_field(s, i4) & 0xf;
> -    const int left_shift = (i4 & 7) * 8;
> -    const int right_shift = 64 - left_shift;
> +    const bool byte = s->insn->data;
> +    const uint8_t mask = byte ? 15 : 7;
> +    const uint8_t mul  = byte ?  8 : 1;
> +    const uint8_t i4   = get_field(s, i4);
> +    const int right_shift = 64 - (i4 & 7) * mul;
> +
> +    if (i4 & ~mask) {
> +        gen_program_exception(s, PGM_SPECIFICATION);
> +        return DISAS_NORETURN;
> +    }
> +
>      TCGv_i64 t0 = tcg_temp_new_i64();
>      TCGv_i64 t1 = tcg_temp_new_i64();
>      TCGv_i64 t2 = tcg_temp_new_i64();

TCGv_i64 t0, t1, t2;

if (i4 & ~mask) {
...
}

t0 = tcg_temp_new_i64();
t1 = tcg_temp_new_i64();
t2 = tcg_temp_new_i64();

> @@ -2074,8 +2082,39 @@ static DisasJumpType op_vsldb(DisasContext *s, DisasOps *o)
>          read_vec_element_i64(t1, get_field(s, v3), 0, ES_64);
>          read_vec_element_i64(t2, get_field(s, v3), 1, ES_64);
>      }
> +
>      tcg_gen_extract2_i64(t0, t1, t0, right_shift);
>      tcg_gen_extract2_i64(t1, t2, t1, right_shift);
> +
> +    write_vec_element_i64(t0, get_field(s, v1), 0, ES_64);
> +    write_vec_element_i64(t1, get_field(s, v1), 1, ES_64);
> +
> +    tcg_temp_free(t0);
> +    tcg_temp_free(t1);
> +    tcg_temp_free(t2);
> +    return DISAS_NEXT;
> +}
> +
> +static DisasJumpType op_vsrd(DisasContext *s, DisasOps *o)
> +{
> +    const uint8_t i4 = get_field(s, i4);
> +
> +    if (i4 & ~7) {
> +        gen_program_exception(s, PGM_SPECIFICATION);
> +        return DISAS_NORETURN;
> +    }
> +
> +    TCGv_i64 t0 = tcg_temp_new_i64();
> +    TCGv_i64 t1 = tcg_temp_new_i64();
> +    TCGv_i64 t2 = tcg_temp_new_i64();

TCGv_i64 t0, t1, t2;

if (i4 & ~7) {
    gen_program_exception(s, PGM_SPECIFICATION);
    return DISAS_NORETURN;
}

t0 = tcg_temp_new_i64();
t1 = tcg_temp_new_i64();
t2 = tcg_temp_new_i64();


> +
> +    read_vec_element_i64(t0, get_field(s, v2), 1, ES_64);
> +    read_vec_element_i64(t1, get_field(s, v3), 0, ES_64);
> +    read_vec_element_i64(t2, get_field(s, v3), 1, ES_64);
> +
> +    tcg_gen_extract2_i64(t0, t1, t0, i4);
> +    tcg_gen_extract2_i64(t1, t2, t1, i4);
> +
>      write_vec_element_i64(t0, get_field(s, v1), 0, ES_64);
>      write_vec_element_i64(t1, get_field(s, v1), 1, ES_64);
>  
> diff --git a/target/s390x/tcg/insn-data.def b/target/s390x/tcg/insn-data.def
> index f487a64abf..98a31a557d 100644
> --- a/target/s390x/tcg/insn-data.def
> +++ b/target/s390x/tcg/insn-data.def
> @@ -1207,12 +1207,16 @@
>      E(0xe774, VSL,     VRR_c, V,   0, 0, 0, 0, vsl, 0, 0, IF_VEC)
>  /* VECTOR SHIFT LEFT BY BYTE */
>      E(0xe775, VSLB,    VRR_c, V,   0, 0, 0, 0, vsl, 0, 1, IF_VEC)
> +/* VECTOR SHIFT LEFT DOUBLE BY BIT */
> +    E(0xe786, VSLD,    VRI_d, VE2, 0, 0, 0, 0, vsld, 0, 0, IF_VEC)
>  /* VECTOR SHIFT LEFT DOUBLE BY BYTE */
> -    F(0xe777, VSLDB,   VRI_d, V,   0, 0, 0, 0, vsldb, 0, IF_VEC)
> +    E(0xe777, VSLDB,   VRI_d, V,   0, 0, 0, 0, vsld, 0, 1, IF_VEC)
>  /* VECTOR SHIFT RIGHT ARITHMETIC */
>      E(0xe77e, VSRA,    VRR_c, V,   0, 0, 0, 0, vsra, 0, 0, IF_VEC)
>  /* VECTOR SHIFT RIGHT ARITHMETIC BY BYTE */
>      E(0xe77f, VSRAB,   VRR_c, V,   0, 0, 0, 0, vsra, 0, 1, IF_VEC)
> +/* VECTOR SHIFT RIGHT DOUBLE BY BIT */
> +    F(0xe787, VSRD,    VRI_d, VE2, 0, 0, 0, 0, vsrd, 0, IF_VEC)
>  /* VECTOR SHIFT RIGHT LOGICAL */
>      E(0xe77c, VSRL,    VRR_c, V,   0, 0, 0, 0, vsrl, 0, 0, IF_VEC)
>  /* VECTOR SHIFT RIGHT LOGICAL BY BYTE */


-- 
Thanks,

David / dhildenb



^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH v3 11/11] target/s390x: Fix writeback to v1 in helper_vstl
  2022-03-08  1:53 ` [PATCH v3 11/11] target/s390x: Fix writeback to v1 in helper_vstl Richard Henderson
@ 2022-03-21 11:26   ` David Hildenbrand
  0 siblings, 0 replies; 29+ messages in thread
From: David Hildenbrand @ 2022-03-21 11:26 UTC (permalink / raw)
  To: Richard Henderson, qemu-devel; +Cc: qemu-s390x, dmiller423

On 08.03.22 02:53, Richard Henderson wrote:
> Copy-paste error from vector load length -- do not write
> zeros back to v1 after storing from v1.

Fixes: 0e0a5b49ad58 ("s390x/tcg: Implement VECTOR STORE WITH LENGTH")
Reviewed-by: David Hildenbrand <david@redhat.com>

> 
> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
> ---
>  target/s390x/tcg/vec_helper.c | 2 --
>  1 file changed, 2 deletions(-)
> 
> diff --git a/target/s390x/tcg/vec_helper.c b/target/s390x/tcg/vec_helper.c
> index ededf13cf0..48d86722b2 100644
> --- a/target/s390x/tcg/vec_helper.c
> +++ b/target/s390x/tcg/vec_helper.c
> @@ -200,7 +200,6 @@ void HELPER(vstl)(CPUS390XState *env, const void *v1, uint64_t addr,
>          addr = wrap_address(env, addr + 8);
>          cpu_stq_data_ra(env, addr, s390_vec_read_element64(v1, 1), GETPC());
>      } else {
> -        S390Vector tmp = {};
>          int i;
>  
>          for (i = 0; i < bytes; i++) {
> @@ -209,6 +208,5 @@ void HELPER(vstl)(CPUS390XState *env, const void *v1, uint64_t addr,
>              cpu_stb_data_ra(env, addr, byte, GETPC());
>              addr = wrap_address(env, addr + 1);
>          }
> -        *(S390Vector *)v1 = tmp;
>      }
>  }


-- 
Thanks,

David / dhildenb



^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH v3 09/11] target/s390x: add S390_FEAT_VECTOR_ENH2 to cpu max
  2022-03-08  1:53 ` [PATCH v3 09/11] target/s390x: add S390_FEAT_VECTOR_ENH2 to cpu max Richard Henderson
@ 2022-03-21 11:28   ` David Hildenbrand
  0 siblings, 0 replies; 29+ messages in thread
From: David Hildenbrand @ 2022-03-21 11:28 UTC (permalink / raw)
  To: Richard Henderson, qemu-devel; +Cc: qemu-s390x, dmiller423

On 08.03.22 02:53, Richard Henderson wrote:
> From: David Miller <dmiller423@gmail.com>

QEMU is in soft freeze now. We'll have to perform that change for the
new 7.1 machine only, so we have to fixup the qemu model for the 7.0
machine.

Subject should be "... to qemu CPU model"

> 
> Signed-off-by: David Miller <dmiller423@gmail.com>
> Message-Id: <20220307020327.3003-7-dmiller423@gmail.com>
> Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
> ---
>  target/s390x/gen-features.c | 2 ++
>  1 file changed, 2 insertions(+)
> 
> diff --git a/target/s390x/gen-features.c b/target/s390x/gen-features.c
> index 22846121c4..499a3b10a8 100644
> --- a/target/s390x/gen-features.c
> +++ b/target/s390x/gen-features.c
> @@ -740,7 +740,9 @@ static uint16_t qemu_V6_2[] = {
>  
>  static uint16_t qemu_LATEST[] = {
>      S390_FEAT_MISC_INSTRUCTION_EXT3,
> +    S390_FEAT_VECTOR_ENH2,
>  };
> +
>  /* add all new definitions before this point */
>  static uint16_t qemu_MAX[] = {
>      /* generates a dependency warning, leave it out for now */


-- 
Thanks,

David / dhildenb



^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH v3 06/11] target/s390x: vxeh2: vector {load, store} elements reversed
  2022-03-08  1:53 ` [PATCH v3 06/11] target/s390x: vxeh2: vector {load, store} elements reversed Richard Henderson
@ 2022-03-21 11:35   ` David Hildenbrand
  2022-03-21 15:35     ` Richard Henderson
  0 siblings, 1 reply; 29+ messages in thread
From: David Hildenbrand @ 2022-03-21 11:35 UTC (permalink / raw)
  To: Richard Henderson, qemu-devel; +Cc: qemu-s390x, dmiller423

On 08.03.22 02:53, Richard Henderson wrote:
> From: David Miller <dmiller423@gmail.com>
> 
> Signed-off-by: David Miller <dmiller423@gmail.com>
> Message-Id: <20220307020327.3003-5-dmiller423@gmail.com>
> [rth: Use new hswap and wswap tcg expanders.]
> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
> ---
>  target/s390x/tcg/translate_vx.c.inc | 84 +++++++++++++++++++++++++++++
>  target/s390x/tcg/insn-data.def      |  4 ++
>  2 files changed, 88 insertions(+)
> 
> diff --git a/target/s390x/tcg/translate_vx.c.inc b/target/s390x/tcg/translate_vx.c.inc
> index a5283ef2f8..ac807122a3 100644
> --- a/target/s390x/tcg/translate_vx.c.inc
> +++ b/target/s390x/tcg/translate_vx.c.inc
> @@ -492,6 +492,46 @@ static DisasJumpType op_vlei(DisasContext *s, DisasOps *o)
>      return DISAS_NEXT;
>  }
>  
> +static DisasJumpType op_vler(DisasContext *s, DisasOps *o)
> +{
> +    const uint8_t es = get_field(s, m3);

TCGv_i64 t0, t1;

> +
> +    if (es < ES_16 || es > ES_64) {
> +        gen_program_exception(s, PGM_SPECIFICATION);
> +        return DISAS_NORETURN;
> +    }
> +
> +    TCGv_i64 t0 = tcg_temp_new_i64();
> +    TCGv_i64 t1 = tcg_temp_new_i64();
> +
> +    /* Begin with the two doublewords swapped... */
> +    tcg_gen_qemu_ld_i64(t1, o->addr1, get_mem_index(s), MO_TEUQ);
> +    gen_addi_and_wrap_i64(s, o->addr1, o->addr1, 8);
> +    tcg_gen_qemu_ld_i64(t0, o->addr1, get_mem_index(s), MO_TEUQ);
> +
> +    /* ... then swap smaller elements within the doublewords as required. */
> +    switch (es) {
> +    case MO_16:
> +        tcg_gen_hswap_i64(t1, t1);
> +        tcg_gen_hswap_i64(t0, t0);
> +        break;
> +    case MO_32:
> +        tcg_gen_wswap_i64(t1, t1);
> +        tcg_gen_wswap_i64(t0, t0);
> +        break;
> +    case MO_64:
> +        break;
> +    default:
> +        g_assert_not_reached();
> +    }
> +
> +    write_vec_element_i64(t0, get_field(s, v1), 0, ES_64);
> +    write_vec_element_i64(t1, get_field(s, v1), 1, ES_64);
> +    tcg_temp_free(t0);
> +    tcg_temp_free(t1);
> +    return DISAS_NEXT;
> +}
> +
>  static DisasJumpType op_vlgv(DisasContext *s, DisasOps *o)
>  {
>      const uint8_t es = get_field(s, m4);
> @@ -976,6 +1016,50 @@ static DisasJumpType op_vste(DisasContext *s, DisasOps *o)
>      return DISAS_NEXT;
>  }
>  
> +static DisasJumpType op_vster(DisasContext *s, DisasOps *o)
> +{
> +    const uint8_t es = get_field(s, m3);
> +    TCGv_i64 t0, t1;
> +
> +    if (es < ES_16 || es > ES_64) {
> +        gen_program_exception(s, PGM_SPECIFICATION);
> +        return DISAS_NORETURN;
> +    }
> +
> +    /* Probe write access before actually modifying memory */
> +    gen_helper_probe_write_access(cpu_env, o->addr1, tcg_constant_i64(16));

We have to free the tcg_constant_i64() IIRC.

> +
> +    /* Begin with the two doublewords swapped... */
> +    t0 = tcg_temp_new_i64();
> +    t1 = tcg_temp_new_i64();
> +    read_vec_element_i64(t1,  get_field(s, v1), 0, ES_64);
> +    read_vec_element_i64(t0,  get_field(s, v1), 1, ES_64);
> +

apart from that LGTM

-- 
Thanks,

David / dhildenb



^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH v3 07/11] target/s390x: vxeh2: vector {load, store} byte reversed elements
  2022-03-08  1:53 ` [PATCH v3 07/11] target/s390x: vxeh2: vector {load, store} byte reversed elements Richard Henderson
@ 2022-03-21 11:45   ` David Hildenbrand
  0 siblings, 0 replies; 29+ messages in thread
From: David Hildenbrand @ 2022-03-21 11:45 UTC (permalink / raw)
  To: Richard Henderson, qemu-devel; +Cc: qemu-s390x, Richard Henderson, dmiller423

On 08.03.22 02:53, Richard Henderson wrote:
> From: David Miller <dmiller423@gmail.com>
> 
> Signed-off-by: David Miller <dmiller423@gmail.com>
> Message-Id: <20220307020327.3003-6-dmiller423@gmail.com>
> [rth: Split out elements (plural) from element (scalar)
>       Use tcg little-endian memory ops, plus hswap and wswap.]
> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
> ---
>  target/s390x/tcg/translate_vx.c.inc | 101 ++++++++++++++++++++++++++++
>  target/s390x/tcg/insn-data.def      |   4 ++
>  2 files changed, 105 insertions(+)
> 
> diff --git a/target/s390x/tcg/translate_vx.c.inc b/target/s390x/tcg/translate_vx.c.inc
> index ac807122a3..9a82401d71 100644
> --- a/target/s390x/tcg/translate_vx.c.inc
> +++ b/target/s390x/tcg/translate_vx.c.inc
> @@ -457,6 +457,56 @@ static DisasJumpType op_vlrep(DisasContext *s, DisasOps *o)
>      return DISAS_NEXT;
>  }
>  
> +static DisasJumpType op_vlbr(DisasContext *s, DisasOps *o)
> +{
> +    const uint8_t es = get_field(s, m3);
> +    TCGv_i64 t0, t1, tt;
> +
> +    if (es < ES_16 || es > ES_128) {
> +        gen_program_exception(s, PGM_SPECIFICATION);
> +        return DISAS_NORETURN;
> +    }
> +
> +    t0 = tcg_temp_new_i64();
> +    t1 = tcg_temp_new_i64();
> +
> +    /* Begin with byte reversed doublewords... */
> +    tcg_gen_qemu_ld_i64(t0, o->addr1, get_mem_index(s), MO_LEUQ);
> +    gen_addi_and_wrap_i64(s, o->addr1, o->addr1, 8);
> +    tcg_gen_qemu_ld_i64(t1, o->addr1, get_mem_index(s), MO_LEUQ);
> +

Would it make sense to just special-case ES_128, by loading them into
the proper t0/t1 right away?

if (es == ES_128) {
    tcg_gen_qemu_ld_i64(t1, o->addr1, get_mem_index(s), MO_LEUQ);
    gen_addi_and_wrap_i64(s, o->addr1, o->addr1, 8);
    tcg_gen_qemu_ld_i64(t0, o->addr1, get_mem_index(s), MO_LEUQ);
    goto write;
}

/* Begin with byte reversed doublewords... */
tcg_gen_qemu_ld_i64(t0, o->addr1, get_mem_index(s), MO_LEUQ);
gen_addi_and_wrap_i64(s, o->addr1, o->addr1, 8);
tcg_gen_qemu_ld_i64(t1, o->addr1, get_mem_index(s), MO_LEUQ);

/*
 * For 16 and 32-bit elements, the doubleword bswap also reversed
 * the order of the elements.  Perform a larger order swap to put
 * them back into place.
 */
switch (es) {
...
}

write:
write_vec_element_i64(t0, get_field(s, v1), 0, ES_64);
write_vec_element_i64(t1, get_field(s, v1), 1, ES_64);

> +    /*
> +     * For 16 and 32-bit elements, the doubleword bswap also reversed
> +     * the order of the elements.  Perform a larger order swap to put
> +     * them back into place.  For the 128-bit "element", finish the
> +     * bswap by swapping the doublewords.
> +     */
> +    switch (es) {
> +    case ES_16:
> +        tcg_gen_hswap_i64(t0, t0);
> +        tcg_gen_hswap_i64(t1, t1);
> +        break;
> +    case ES_32:
> +        tcg_gen_wswap_i64(t0, t0);
> +        tcg_gen_wswap_i64(t1, t1);
> +        break;
> +    case ES_64:
> +        break;
> +    case ES_128:
> +        tt = t0, t0 = t1, t1 = tt;
> +        break;
> +    default:
> +        g_assert_not_reached();
> +    }
> +
> +    write_vec_element_i64(t0, get_field(s, v1), 0, ES_64);
> +    write_vec_element_i64(t1, get_field(s, v1), 1, ES_64);
> +
> +    tcg_temp_free(t0);
> +    tcg_temp_free(t1);
> +    return DISAS_NEXT;
> +}
> +
>  static DisasJumpType op_vle(DisasContext *s, DisasOps *o)
>  {
>      const uint8_t es = s->insn->data;
> @@ -998,6 +1048,57 @@ static DisasJumpType op_vst(DisasContext *s, DisasOps *o)
>      return DISAS_NEXT;
>  }
>  
> +static DisasJumpType op_vstbr(DisasContext *s, DisasOps *o)
> +{
> +    const uint8_t es = get_field(s, m3);
> +    TCGv_i64 t0, t1, tt;
> +
> +    if (es < ES_16 || es > ES_128) {
> +        gen_program_exception(s, PGM_SPECIFICATION);
> +        return DISAS_NORETURN;
> +    }
> +
> +    /* Probe write access before actually modifying memory */
> +    gen_helper_probe_write_access(cpu_env, o->addr1, tcg_constant_i64(16));
> +
> +    t0 = tcg_temp_new_i64();
> +    t1 = tcg_temp_new_i64();
> +    read_vec_element_i64(t0, get_field(s, v1), 0, ES_64);
> +    read_vec_element_i64(t1, get_field(s, v1), 1, ES_64);


Ditto, maybe just special-case MO_128 directly.

> +
> +    /*
> +     * For 16 and 32-bit elements, the doubleword bswap below will
> +     * reverse the order of the elements.  Perform a larger order
> +     * swap to put them back into place.  For the 128-bit "element",
> +     * finish the bswap by swapping the doublewords.
> +     */
> +    switch (es) {
> +    case MO_16:
> +        tcg_gen_hswap_i64(t0, t0);
> +        tcg_gen_hswap_i64(t1, t1);
> +        break;
> +    case MO_32:
> +        tcg_gen_wswap_i64(t0, t0);
> +        tcg_gen_wswap_i64(t1, t1);
> +        break;
> +    case MO_64:
> +        break;
> +    case MO_128:
> +        tt = t0, t0 = t1, t1 = tt;
> +        break;
> +    default:
> +        g_assert_not_reached();
> +    }
> +
> +    tcg_gen_qemu_st_i64(t0, o->addr1, get_mem_index(s), MO_LEUQ);
> +    gen_addi_and_wrap_i64(s, o->addr1, o->addr1, 8);
> +    tcg_gen_qemu_st_i64(t1, o->addr1, get_mem_index(s), MO_LEUQ);
> +
> +    tcg_temp_free(t0);
> +    tcg_temp_free(t1);
> +    return DISAS_NEXT;
> +}
> +
>  static DisasJumpType op_vste(DisasContext *s, DisasOps *o)
>  {
>      const uint8_t es = s->insn->data;
> diff --git a/target/s390x/tcg/insn-data.def b/target/s390x/tcg/insn-data.def
> index b524541a7d..ee6e1dc9e5 100644
> --- a/target/s390x/tcg/insn-data.def
> +++ b/target/s390x/tcg/insn-data.def
> @@ -1027,6 +1027,8 @@
>      F(0xe756, VLR,     VRR_a, V,   0, 0, 0, 0, vlr, 0, IF_VEC)
>  /* VECTOR LOAD AND REPLICATE */
>      F(0xe705, VLREP,   VRX,   V,   la2, 0, 0, 0, vlrep, 0, IF_VEC)
> +/* VECTOR LOAD BYTE REVERSED ELEMENTS */
> +    F(0xe606, VLBR,    VRX,   VE2, la2, 0, 0, 0, vlbr, 0, IF_VEC)
>  /* VECTOR LOAD ELEMENT */
>      E(0xe700, VLEB,    VRX,   V,   la2, 0, 0, 0, vle, 0, ES_8, IF_VEC)
>      E(0xe701, VLEH,    VRX,   V,   la2, 0, 0, 0, vle, 0, ES_16, IF_VEC)
> @@ -1079,6 +1081,8 @@
>      F(0xe75f, VSEG,    VRR_a, V,   0, 0, 0, 0, vseg, 0, IF_VEC)
>  /* VECTOR STORE */
>      F(0xe70e, VST,     VRX,   V,   la2, 0, 0, 0, vst, 0, IF_VEC)
> +/* VECTOR STORE BYTE REVERSED ELEMENTS */
> +    F(0xe60e, VSTBR,    VRX,   VE2, la2, 0, 0, 0, vstbr, 0, IF_VEC)
>  /* VECTOR STORE ELEMENT */
>      E(0xe708, VSTEB,   VRX,   V,   la2, 0, 0, 0, vste, 0, ES_8, IF_VEC)
>      E(0xe709, VSTEH,   VRX,   V,   la2, 0, 0, 0, vste, 0, ES_16, IF_VEC)


-- 
Thanks,

David / dhildenb



^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH v3 08/11] target/s390x: vxeh2: vector {load, store} byte reversed element
  2022-03-08  1:53 ` [PATCH v3 08/11] target/s390x: vxeh2: vector {load, store} byte reversed element Richard Henderson
@ 2022-03-21 12:33   ` David Hildenbrand
  0 siblings, 0 replies; 29+ messages in thread
From: David Hildenbrand @ 2022-03-21 12:33 UTC (permalink / raw)
  To: Richard Henderson, qemu-devel; +Cc: qemu-s390x, dmiller423

On 08.03.22 02:53, Richard Henderson wrote:
> From: David Miller <dmiller423@gmail.com>
> 
> This includes VLEBR* and VSTEBR* (single element);
> VLBRREP (load single element and replicate); and
> VLLEBRZ (load single element and zero).

"load byte reversed element and ..."

> 
> Signed-off-by: David Miller <dmiller423@gmail.com>
> Message-Id: <20220307020327.3003-6-dmiller423@gmail.com>
> [rth: Split out elements (plural) from element (scalar),
>       Use tcg little-endian memory operations.]
> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>

[...]

> diff --git a/target/s390x/tcg/insn-data.def b/target/s390x/tcg/insn-data.def
> index ee6e1dc9e5..b80f989002 100644
> --- a/target/s390x/tcg/insn-data.def
> +++ b/target/s390x/tcg/insn-data.def
> @@ -1027,6 +1027,14 @@
>      F(0xe756, VLR,     VRR_a, V,   0, 0, 0, 0, vlr, 0, IF_VEC)
>  /* VECTOR LOAD AND REPLICATE */
>      F(0xe705, VLREP,   VRX,   V,   la2, 0, 0, 0, vlrep, 0, IF_VEC)
> +/* VECTOR LOAD BYTE REVERSED ELEMENT */
> +    E(0xe601, VLEBRH,  VRX,   VE2, la2, 0, 0, 0, vlebr, 0, ES_16, IF_VEC)
> +    E(0xe603, VLEBRF,  VRX,   VE2, la2, 0, 0, 0, vlebr, 0, ES_32, IF_VEC)
> +    E(0xe602, VLEBRG,  VRX,   VE2, la2, 0, 0, 0, vlebr, 0, ES_64, IF_VEC)
> +/* VECTOR LOAD BYTE REVERSED ELEMENT AND REPLOCATE */

s/REPLOCATE/REPLICATE/

> +    F(0xe605, VLBRREP, VRX,   VE2, la2, 0, 0, 0, vlbrrep, 0, IF_VEC)
> +/* VECTOR LOAD BYTE REVERSED ELEMENT AND ZERO */
> +    F(0xe604, VLLEBRZ, VRX,   VE2, la2, 0, 0, 0, vllebrz, 0, IF_VEC)
>  /* VECTOR LOAD BYTE REVERSED ELEMENTS */
>      F(0xe606, VLBR,    VRX,   VE2, la2, 0, 0, 0, vlbr, 0, IF_VEC)
>  /* VECTOR LOAD ELEMENT */
> @@ -1081,6 +1089,10 @@
>      F(0xe75f, VSEG,    VRR_a, V,   0, 0, 0, 0, vseg, 0, IF_VEC)
>  /* VECTOR STORE */
>      F(0xe70e, VST,     VRX,   V,   la2, 0, 0, 0, vst, 0, IF_VEC)
> +/* VECTOR STORE BYTE REVERSED ELEMENT */
> +    E(0xe609, VSTEBRH,  VRX,   VE2, la2, 0, 0, 0, vstebr, 0, ES_16, IF_VEC)
> +    E(0xe60b, VSTEBRF,  VRX,   VE2, la2, 0, 0, 0, vstebr, 0, ES_32, IF_VEC)
> +    E(0xe60a, VSTEBRG,  VRX,   VE2, la2, 0, 0, 0, vstebr, 0, ES_64, IF_VEC)
>  /* VECTOR STORE BYTE REVERSED ELEMENTS */
>      F(0xe60e, VSTBR,    VRX,   VE2, la2, 0, 0, 0, vstbr, 0, IF_VEC)
>  /* VECTOR STORE ELEMENT */

Reviewed-by: David Hildenbrand <david@redhat.com>

-- 
Thanks,

David / dhildenb



^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH v3 06/11] target/s390x: vxeh2: vector {load, store} elements reversed
  2022-03-21 11:35   ` David Hildenbrand
@ 2022-03-21 15:35     ` Richard Henderson
  2022-03-21 15:40       ` David Hildenbrand
  0 siblings, 1 reply; 29+ messages in thread
From: Richard Henderson @ 2022-03-21 15:35 UTC (permalink / raw)
  To: David Hildenbrand, qemu-devel; +Cc: qemu-s390x, dmiller423

On 3/21/22 04:35, David Hildenbrand wrote:
>> +    /* Probe write access before actually modifying memory */
>> +    gen_helper_probe_write_access(cpu_env, o->addr1, tcg_constant_i64(16));
> 
> We have to free the tcg_constant_i64() IIRC.

We do not.


r~


^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH v3 06/11] target/s390x: vxeh2: vector {load, store} elements reversed
  2022-03-21 15:35     ` Richard Henderson
@ 2022-03-21 15:40       ` David Hildenbrand
  0 siblings, 0 replies; 29+ messages in thread
From: David Hildenbrand @ 2022-03-21 15:40 UTC (permalink / raw)
  To: Richard Henderson, qemu-devel; +Cc: qemu-s390x, dmiller423

On 21.03.22 16:35, Richard Henderson wrote:
> On 3/21/22 04:35, David Hildenbrand wrote:
>>> +    /* Probe write access before actually modifying memory */
>>> +    gen_helper_probe_write_access(cpu_env, o->addr1, tcg_constant_i64(16));
>>
>> We have to free the tcg_constant_i64() IIRC.
> 
> We do not.

Ah, then my memory is playing tricks on me :)

-- 
Thanks,

David / dhildenb



^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH v3 03/11] target/s390x: vxeh2: vector string search
  2022-03-21 10:31   ` David Hildenbrand
@ 2022-03-22 14:42     ` Richard Henderson
  2022-03-22 15:06       ` David Miller
  0 siblings, 1 reply; 29+ messages in thread
From: Richard Henderson @ 2022-03-22 14:42 UTC (permalink / raw)
  To: David Hildenbrand, qemu-devel; +Cc: qemu-s390x, dmiller423

On 3/21/22 03:31, David Hildenbrand wrote:
>> +        for (i = 0; i < nelem; i++) {
>> +            if (s390_vec_read_element(v3, i, es) == 0) {
>> +                break;
>> +            }
>> +        }
>> +        if (i < substr_elen) {
>> +            substr_elen = i;
>> +        }
> 
> Maybe combine both, I guess there is no need to search beyond substr_elen.
> 
> substr_elen = MIN(substr_elen, nelem);
> for (i = 0; i < substr_elen; i++) {
>      if (s390_vec_read_element(v3, i, es) == 0) {
>          substr_elen = i;
>          break;
>      }
> }

Yep.

> We should do the MIN(substr_elen, nelem) maybe right when reading it
> from v4.

No, v4 does not get bounded until zs is set.

>> +    /* If ZS, look for eos in the searched string. */
>> +    if (zs) {
>> +        for (k = 0; k < nelem; k++) {
>> +            if (s390_vec_read_element(v2, k, es) == 0) {
>> +                eos = true;
>> +                break;
>> +            }
>> +        }
> 
> I guess we could move that into the main search loop and avoid parsing
> the string twice. Not sure what's better.

I'd leave it here, so that we only do the strlen once.  There's no obvious place within
the search loop that wouldn't wind up doing the strlen more than once.


r~


^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH v3 03/11] target/s390x: vxeh2: vector string search
  2022-03-22 14:42     ` Richard Henderson
@ 2022-03-22 15:06       ` David Miller
  0 siblings, 0 replies; 29+ messages in thread
From: David Miller @ 2022-03-22 15:06 UTC (permalink / raw)
  To: Richard Henderson; +Cc: qemu-s390x, qemu-devel, David Hildenbrand

I came to much the same conclusion

On Tue, Mar 22, 2022 at 10:42 AM Richard Henderson
<richard.henderson@linaro.org> wrote:
>
> On 3/21/22 03:31, David Hildenbrand wrote:
> >> +        for (i = 0; i < nelem; i++) {
> >> +            if (s390_vec_read_element(v3, i, es) == 0) {
> >> +                break;
> >> +            }
> >> +        }
> >> +        if (i < substr_elen) {
> >> +            substr_elen = i;
> >> +        }
> >
> > Maybe combine both, I guess there is no need to search beyond substr_elen.
> >
> > substr_elen = MIN(substr_elen, nelem);
> > for (i = 0; i < substr_elen; i++) {
> >      if (s390_vec_read_element(v3, i, es) == 0) {
> >          substr_elen = i;
> >          break;
> >      }
> > }
>
> Yep.
>
> > We should do the MIN(substr_elen, nelem) maybe right when reading it
> > from v4.
>
> No, v4 does not get bounded until zs is set.
>
> >> +    /* If ZS, look for eos in the searched string. */
> >> +    if (zs) {
> >> +        for (k = 0; k < nelem; k++) {
> >> +            if (s390_vec_read_element(v2, k, es) == 0) {
> >> +                eos = true;
> >> +                break;
> >> +            }
> >> +        }
> >
> > I guess we could move that into the main search loop and avoid parsing
> > the string twice. Not sure what's better.
>
> I'd leave it here, so that we only do the strlen once.  There's no obvious place within
> the search loop that wouldn't wind up doing the strlen more than once.
>
>
> r~


^ permalink raw reply	[flat|nested] 29+ messages in thread

end of thread, other threads:[~2022-03-22 15:14 UTC | newest]

Thread overview: 29+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-03-08  1:53 [PATCH v3 00/11] s390x/tcg: Implement Vector-Enhancements Facility 2 Richard Henderson
2022-03-08  1:53 ` [PATCH v3 01/11] tcg: Implement tcg_gen_{h,w}swap_{i32,i64} Richard Henderson
2022-03-21  9:32   ` David Hildenbrand
2022-03-08  1:53 ` [PATCH v3 02/11] target/s390x: vxeh2: vector convert short/32b Richard Henderson
2022-03-21  9:33   ` David Hildenbrand
2022-03-08  1:53 ` [PATCH v3 03/11] target/s390x: vxeh2: vector string search Richard Henderson
2022-03-21 10:31   ` David Hildenbrand
2022-03-22 14:42     ` Richard Henderson
2022-03-22 15:06       ` David Miller
2022-03-08  1:53 ` [PATCH v3 04/11] target/s390x: vxeh2: Update for changes to vector shifts Richard Henderson
2022-03-21 11:15   ` David Hildenbrand
2022-03-08  1:53 ` [PATCH v3 05/11] target/s390x: vxeh2: vector shift double by bit Richard Henderson
2022-03-21 11:23   ` David Hildenbrand
2022-03-08  1:53 ` [PATCH v3 06/11] target/s390x: vxeh2: vector {load, store} elements reversed Richard Henderson
2022-03-21 11:35   ` David Hildenbrand
2022-03-21 15:35     ` Richard Henderson
2022-03-21 15:40       ` David Hildenbrand
2022-03-08  1:53 ` [PATCH v3 07/11] target/s390x: vxeh2: vector {load, store} byte reversed elements Richard Henderson
2022-03-21 11:45   ` David Hildenbrand
2022-03-08  1:53 ` [PATCH v3 08/11] target/s390x: vxeh2: vector {load, store} byte reversed element Richard Henderson
2022-03-21 12:33   ` David Hildenbrand
2022-03-08  1:53 ` [PATCH v3 09/11] target/s390x: add S390_FEAT_VECTOR_ENH2 to cpu max Richard Henderson
2022-03-21 11:28   ` David Hildenbrand
2022-03-08  1:53 ` [PATCH v3 10/11] tests/tcg/s390x: Tests for Vector Enhancements Facility 2 Richard Henderson
2022-03-08  1:53 ` [PATCH v3 11/11] target/s390x: Fix writeback to v1 in helper_vstl Richard Henderson
2022-03-21 11:26   ` David Hildenbrand
2022-03-08  4:09 ` [PATCH v3 00/11] s390x/tcg: Implement Vector-Enhancements Facility 2 David Miller
2022-03-20  1:14   ` David Miller
2022-03-21  9:19     ` David Hildenbrand

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.