Re: [PATCH v2 04/10] target/hexagon: introduce new helper functions

From: Richard Henderson <richard.henderson@linaro.org>
To: Alessandro Di Federico <ale.qemu@rev.ng>, qemu-devel@nongnu.org
Cc: Alessandro Di Federico <ale@rev.ng>,
	bcain@quicinc.com, babush@rev.ng, tsimpson@quicinc.com,
	nizzo@rev.ng, philmd@redhat.com
Subject: Re: [PATCH v2 04/10] target/hexagon: introduce new helper functions
Date: Thu, 25 Feb 2021 12:45:02 -0800	[thread overview]
Message-ID: <9b040b9f-cba6-5a2e-a1af-ea4d9445b453@linaro.org> (raw)
In-Reply-To: <20210225151856.3284701-5-ale.qemu@rev.ng>

On 2/25/21 7:18 AM, Alessandro Di Federico wrote:
> From: Niccolò Izzo <nizzo@rev.ng>
> 
> These helpers will be employed by the idef-parser generated code.
> 
> Signed-off-by: Alessandro Di Federico <ale@rev.ng>
> Signed-off-by: Niccolò Izzo <nizzo@rev.ng>
> ---
>  target/hexagon/genptr.c | 227 +++++++++++++++++++++++++++++++++++++++-
>  target/hexagon/genptr.h |  19 ++++
>  target/hexagon/macros.h |   2 +-
>  3 files changed, 245 insertions(+), 3 deletions(-)
> 
> diff --git a/target/hexagon/genptr.c b/target/hexagon/genptr.c
> index 97de669f38..78cda032db 100644
> --- a/target/hexagon/genptr.c
> +++ b/target/hexagon/genptr.c
> @@ -40,7 +40,8 @@ TCGv gen_read_preg(TCGv pred, uint8_t num)
>      return pred;
>  }
>  
> -static inline void gen_log_predicated_reg_write(int rnum, TCGv val, int slot)
> +static inline void gen_log_predicated_reg_write(int rnum, TCGv val,
> +                                                unsigned slot)

This change is unrelated to adding new helper functions.
It requires a separate patch and justification.

> +void gen_fbrev(TCGv result, TCGv src)
> +{
> +    TCGv lo = tcg_temp_new();
> +    TCGv tmp1 = tcg_temp_new();
> +    TCGv tmp2 = tcg_temp_new();
> +
> +    /* Bit reversal of low 16 bits */
> +    tcg_gen_extract_tl(lo, src, 0, 16);
> +    tcg_gen_andi_tl(tmp1, lo, 0xaaaa);
> +    tcg_gen_shri_tl(tmp1, tmp1, 1);
> +    tcg_gen_andi_tl(tmp2, lo, 0x5555);
> +    tcg_gen_shli_tl(tmp2, tmp2, 1);
> +    tcg_gen_or_tl(lo, tmp1, tmp2);
> +    tcg_gen_andi_tl(tmp1, lo, 0xcccc);
> +    tcg_gen_shri_tl(tmp1, tmp1, 2);
> +    tcg_gen_andi_tl(tmp2, lo, 0x3333);
> +    tcg_gen_shli_tl(tmp2, tmp2, 2);
> +    tcg_gen_or_tl(lo, tmp1, tmp2);
> +    tcg_gen_andi_tl(tmp1, lo, 0xf0f0);
> +    tcg_gen_shri_tl(tmp1, tmp1, 4);
> +    tcg_gen_andi_tl(tmp2, lo, 0x0f0f);
> +    tcg_gen_shli_tl(tmp2, tmp2, 4);
> +    tcg_gen_or_tl(lo, tmp1, tmp2);
> +    tcg_gen_bswap16_tl(lo, lo);

So far we've kept operations like this as external helper functions, where you
can then use revbit16().  General rule of thumb for a cutoff is about 10-15
ops, and this is right on the edge.

Any particular reason you wanted this inlined?

> +    /* Final tweaks */
> +    tcg_gen_extract_tl(result, src, 16, 16);
> +    tcg_gen_or_tl(result, result, lo);

This is wrong.  You've clobbered your carefully reversed results with the high
16-bits of src.

I'm certain you wanted

   tcg_gen_deposit_tl(result, src, lo, 0, 16);

to replace the low 16 bits of the input with your computation.

> +TCGv gen_set_bit(tcg_target_long i, TCGv result, TCGv src)
> +{
> +    TCGv mask = tcg_const_tl(~(1 << i));
> +    TCGv bit = tcg_temp_new();
> +    tcg_gen_shli_tl(bit, src, i);
> +    tcg_gen_and_tl(result, result, mask);
> +    tcg_gen_or_tl(result, result, bit);
> +    tcg_temp_free(mask);
> +    tcg_temp_free(bit);

  tcg_gen_deposit_tl(result, result, src, i, 1);

> +void gen_cancel(tcg_target_ulong slot)
> +{
> +    TCGv one = tcg_const_tl(1);
> +    tcg_gen_deposit_tl(hex_slot_cancelled, hex_slot_cancelled, one, slot, 1);
> +    tcg_temp_free(one);

  tcg_gen_ori_tl(hex_slot_cancelled, hex_slot_cancelled,
                 1 << slot);

> +void gen_sat_i32(TCGv dest, TCGv source, int width, bool set_overflow)
> +{
> +    TCGv max_val = tcg_const_i32((1 << (width - 1)) - 1);
> +    TCGv min_val = tcg_const_i32(-(1 << (width - 1)));
> +    tcg_gen_movcond_i32(TCG_COND_GT, dest, source, max_val, max_val, source);
> +    tcg_gen_movcond_i32(TCG_COND_LT, dest, source, min_val, min_val, dest);

  tcg_gen_smin_tl(dest, source, max_val);
  tcg_gen_smax_tl(dest, dest, min_val);

> +    /* Set Overflow Bit */
> +    if (set_overflow) {
> +        TCGv ovf = tcg_temp_new();
> +        TCGv one = tcg_const_i32(1);
> +        GET_USR_FIELD(USR_OVF, ovf);
> +        tcg_gen_movcond_i32(TCG_COND_GT, ovf, source, max_val, one, ovf);
> +        tcg_gen_movcond_i32(TCG_COND_LT, ovf, source, min_val, one, ovf);
> +        SET_USR_FIELD(USR_OVF, ovf);

This seems like a complicated way to set overflow.
How about

  tcg_gen_setcond_tl(TCG_COND_NE, ovf, source, dest);
  tcg_gen_shli_tl(ovf, ovf, USR_OVF_SHIFT);
  tcg_gen_or_tl(hex_reg[usr], hex_reg[usr], ovf);

> +        tcg_temp_free_i32(ovf);
> +        tcg_temp_free_i32(one);
> +    }
> +    tcg_temp_free_i32(max_val);
> +    tcg_temp_free_i32(min_val);
> +}
> +
> +void gen_satu_i32(TCGv dest, TCGv source, int width, bool set_overflow)
> +{
> +    TCGv max_val = tcg_const_i32((1 << width) - 1);
> +    tcg_gen_movcond_i32(TCG_COND_GTU, dest, source, max_val, max_val, source);
> +    TCGv_i32 zero = tcg_const_i32(0);
> +    tcg_gen_movcond_i32(TCG_COND_LT, dest, source, zero, zero, dest);

Is this a signed input being saturated to unsigned bounds or not?  Because one
of these two comparisons is wrong.

If it's an unsigned input being saturated to unsigned bounds, then you don't
need the second test at all, and should be using

  tcg_gen_umin_i32(dest, src, max_val);

> +void gen_sat_i64(TCGv_i64 dest, TCGv_i64 source, int width, bool set_overflow)
> +void gen_satu_i64(TCGv_i64 dest, TCGv_i64 source, int width, bool set_overflow)

Similarly.

> diff --git a/target/hexagon/macros.h b/target/hexagon/macros.h
> index 78c4efb5cb..7b6556b07b 100644
> --- a/target/hexagon/macros.h
> +++ b/target/hexagon/macros.h
> @@ -154,7 +154,7 @@
>  #define LOAD_CANCEL(EA) do { CANCEL; } while (0)
>  
>  #ifdef QEMU_GENERATE
> -static inline void gen_pred_cancel(TCGv pred, int slot_num)
> +static inline void gen_pred_cancel(TCGv pred, tcg_target_ulong slot_num)

Why in the world would a slot number need to be 64-bits?

r~