Re: [PATCH v3 04/11] target/s390x: vxeh2: Update for changes to vector shifts

From: David Hildenbrand <david@redhat.com>
To: Richard Henderson <richard.henderson@linaro.org>, qemu-devel@nongnu.org
Cc: qemu-s390x@nongnu.org, dmiller423@gmail.com
Subject: Re: [PATCH v3 04/11] target/s390x: vxeh2: Update for changes to vector shifts
Date: Mon, 21 Mar 2022 12:15:53 +0100	[thread overview]
Message-ID: <93302366-470f-2684-c788-d5eb9e0f861e@redhat.com> (raw)
In-Reply-To: <20220308015358.188499-5-richard.henderson@linaro.org>

On 08.03.22 02:53, Richard Henderson wrote:
> From: David Miller <dmiller423@gmail.com>
> 
> Prior to vector enhancements 2, the shift count was supposed to be equal
> for each byte lest the result be unpredictable, which allowed us to assume
> that the shift count was the same, and optimize accordingly.
> 
> With vector enhancements 2, the shift count is allowed to be different
> for each byte, and we must cope with that.
> 
> Signed-off-by: David Miller <dmiller423@gmail.com>
> Message-Id: <20220307020327.3003-4-dmiller423@gmail.com>
> [rth: Split out of larger patch; simplify shift/merge code.]
> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
> ---
>  target/s390x/helper.h               |  3 ++
>  target/s390x/tcg/vec_int_helper.c   | 58 ++++++++++++++++++++++
>  target/s390x/tcg/translate_vx.c.inc | 77 ++++++++++++-----------------
>  target/s390x/tcg/insn-data.def      | 12 ++---
>  4 files changed, 99 insertions(+), 51 deletions(-)
> 
> diff --git a/target/s390x/helper.h b/target/s390x/helper.h
> index 7412130883..bf33d86f74 100644
> --- a/target/s390x/helper.h
> +++ b/target/s390x/helper.h
> @@ -203,8 +203,11 @@ DEF_HELPER_FLAGS_3(gvec_vpopct16, TCG_CALL_NO_RWG, void, ptr, cptr, i32)
>  DEF_HELPER_FLAGS_4(gvec_verim8, TCG_CALL_NO_RWG, void, ptr, cptr, cptr, i32)
>  DEF_HELPER_FLAGS_4(gvec_verim16, TCG_CALL_NO_RWG, void, ptr, cptr, cptr, i32)
>  DEF_HELPER_FLAGS_4(gvec_vsl, TCG_CALL_NO_RWG, void, ptr, cptr, i64, i32)
> +DEF_HELPER_FLAGS_4(gvec_vsl_ve2, TCG_CALL_NO_RWG, void, ptr, cptr, cptr, i32)
>  DEF_HELPER_FLAGS_4(gvec_vsra, TCG_CALL_NO_RWG, void, ptr, cptr, i64, i32)
> +DEF_HELPER_FLAGS_4(gvec_vsra_ve2, TCG_CALL_NO_RWG, void, ptr, cptr, cptr, i32)
>  DEF_HELPER_FLAGS_4(gvec_vsrl, TCG_CALL_NO_RWG, void, ptr, cptr, i64, i32)
> +DEF_HELPER_FLAGS_4(gvec_vsrl_ve2, TCG_CALL_NO_RWG, void, ptr, cptr, cptr, i32)
>  DEF_HELPER_FLAGS_4(gvec_vscbi8, TCG_CALL_NO_RWG, void, ptr, cptr, cptr, i32)
>  DEF_HELPER_FLAGS_4(gvec_vscbi16, TCG_CALL_NO_RWG, void, ptr, cptr, cptr, i32)
>  DEF_HELPER_4(gvec_vtm, void, ptr, cptr, env, i32)
> diff --git a/target/s390x/tcg/vec_int_helper.c b/target/s390x/tcg/vec_int_helper.c
> index 5561b3ed90..a881d5d267 100644
> --- a/target/s390x/tcg/vec_int_helper.c
> +++ b/target/s390x/tcg/vec_int_helper.c
> @@ -540,18 +540,76 @@ void HELPER(gvec_vsl)(void *v1, const void *v2, uint64_t count,
>      s390_vec_shl(v1, v2, count);
>  }
>  
> +void HELPER(gvec_vsl_ve2)(void *v1, const void *v2, const void *v3,
> +                          uint32_t desc)
> +{
> +    S390Vector tmp;
> +    uint32_t sh, e0, e1 = 0;

int i;

> +
> +    for (int i = 15; i >= 0; --i, e1 = e0 << 24) {

I'd only do "e1 = e0" here and do the shift for the rol32 ...

> +        e0 = s390_vec_read_element8(v2, i);
> +        sh = s390_vec_read_element8(v3, i) & 7;
> +
> +        s390_vec_write_element8(&tmp, i, rol32(e0 | e1, sh));

... here

s390_vec_write_element8(&tmp, i, rol32(e0 | e1 << 24, sh));

> +    }
> +
> +    *(S390Vector *)v1 = tmp;
> +}
> +
>  void HELPER(gvec_vsra)(void *v1, const void *v2, uint64_t count,
>                         uint32_t desc)
>  {
>      s390_vec_sar(v1, v2, count);
>  }
>  
> +void HELPER(gvec_vsra_ve2)(void *v1, const void *v2, const void *v3,
> +                           uint32_t desc)
> +{
> +    S390Vector tmp;
> +    uint32_t sh, e0, e1;
> +    int i = 0;
> +
> +    e0 = s390_vec_read_element8(v2, 0);
> +    e1 = -(e0 >> 7) << 8;
> +
> +    for (;;) {
> +        sh = s390_vec_read_element8(v3, i) & 7;
> +
> +        s390_vec_write_element8(&tmp, i, (e0 | e1) >> sh);
> +
> +        if (++i >= 16) {
> +            break;
> +        }
> +
> +        e1 = e0 << 8;
> +        e0 = s390_vec_read_element8(v2, i);
> +    }

Can't we use the following that resembles the other helpers or am I
missing something?

S390Vector tmp;
uint32_t sh, e0, e1 = 0;

/* Byte 0 is special only. */
e0 = (int32_t)(int8_t)s390_vec_read_element8(v2, i);
sh = s390_vec_read_element8(v3, i) & 7;
s390_vec_write_element8(&tmp, i, e0 >> sh);

e1 = e0;
for (int i = 1; i < 16; ++i, e1 = e0) {
	e0 = s390_vec_read_element8(v2, i);
	sh = s390_vec_read_element8(v3, i) & 7;
	s390_vec_write_element8(&tmp, i, (e0 | e1 << 8) >> sh);
}

*(S390Vector *)v1 = tmp;

> +
> +    *(S390Vector *)v1 = tmp;
> +}
> +
>  void HELPER(gvec_vsrl)(void *v1, const void *v2, uint64_t count,
>                         uint32_t desc)
>  {
>      s390_vec_shr(v1, v2, count);
>  }
>  
> +void HELPER(gvec_vsrl_ve2)(void *v1, const void *v2, const void *v3,
> +                           uint32_t desc)
> +{
> +    S390Vector tmp;
> +    uint32_t sh, e0, e1 = 0;
> +
> +    for (int i = 0; i < 16; ++i, e1 = e0 << 8) {

Dito, I'd do the shift below ...

> +        e0 = s390_vec_read_element8(v2, i);
> +        sh = s390_vec_read_element8(v3, i) & 7;
> +
> +        s390_vec_write_element8(&tmp, i, (e0 | e1) >> sh);

s390_vec_write_element8(&tmp, i, (e0 | e1 << 8) >> sh);

> +    }
> +
> +    *(S390Vector *)v1 = tmp;
> +}
> +
>  #define DEF_VSCBI(BITS)                                                        \
>  void HELPER(gvec_vscbi##BITS)(void *v1, const void *v2, const void *v3,        \
>                                uint32_t desc)                                   \
> diff --git a/target/s390x/tcg/translate_vx.c.inc b/target/s390x/tcg/translate_vx.c.inc
> index d514e8b218..967f6213d8 100644
> --- a/target/s390x/tcg/translate_vx.c.inc
> +++ b/target/s390x/tcg/translate_vx.c.inc
> @@ -2018,21 +2018,42 @@ static DisasJumpType op_ves(DisasContext *s, DisasOps *o)
>      return DISAS_NEXT;
>  }
>  
> +static DisasJumpType gen_vsh_bit_byte(DisasContext *s, DisasOps *o,
> +                                      gen_helper_gvec_2i *gen,
> +                                      gen_helper_gvec_3 *gen_ve2)
> +{
> +    bool byte = s->insn->data;

Nit: I'd have called this "by_byte".

-- 
Thanks,

David / dhildenb