* [PATCH 0/3] tcg: Improve vector tail clearing
From: Richard Henderson @ 2020-04-18 15:56 UTC
  To: qemu-devel; +Cc: peter.maydell

Something I noticed while looking at AdvSIMD dumps during
testing of changes shared with SVE2.

If we're going to load a zero into a vector register to
clear the high bits of the SVE register, we might as well
use that zero to store the 8 bytes at the top of the
AdvSIMD register as well.

Output assembly goes from e.g.

  00:   48 c7 85 08 10 00 00 00   movq   $0x0,0x1008(%rbp)
        00 00 00
  0b:   c5 f9 ef c0               vpxor  %xmm0,%xmm0,%xmm0
  0f:   c5 fe 7f 85 10 10 00 00   vmovdqu %ymm0,0x1010(%rbp)
  17:   c5 fa 7f 85 30 10 00 00   vmovdqu %xmm0,0x1030(%rbp)

to

  00:   c5 f9 ef c0               vpxor  %xmm0,%xmm0,%xmm0
  04:   c5 f9 d6 85 08 10 00 00   vmovq  %xmm0,0x1008(%rbp)
  0c:   c5 fe 7f 85 10 10 00 00   vmovdqu %ymm0,0x1010(%rbp)
  14:   c5 fa 7f 85 30 10 00 00   vmovdqu %xmm0,0x1030(%rbp)

This saves a few bytes now, and will save more once we can
do a better job of loading constants into registers and
share the vpxor between instructions.
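
For reference, the guest-visible effect being produced is nothing
more than zeroing the bytes above the written result (a sketch in
plain C, not the actual QEMU code; clear_tail is a made-up name):

  #include <stddef.h>
  #include <string.h>

  /*
   * Zero bytes [oprsz, maxsz) of a guest vector register slot.
   * The series is about emitting this with fewer, wider host
   * stores, reusing a single zeroed xmm register for all of them.
   */
  static void clear_tail(void *reg, size_t oprsz, size_t maxsz)
  {
      memset((char *)reg + oprsz, 0, maxsz - oprsz);
  }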

The target/arm patches are aided by the tcg patch, but are
not dependent on it.


r~


Richard Henderson (3):
  tcg: Improve vector tail clearing
  target/arm: Use tcg_gen_gvec_mov for clear_vec_high
  target/arm: Use clear_vec_high more effectively

 target/arm/translate-a64.c | 69 ++++++++++++++++++--------------
 tcg/tcg-op-gvec.c          | 82 +++++++++++++++++++++++++++++---------
 2 files changed, 101 insertions(+), 50 deletions(-)

-- 
2.20.1




* [PATCH 1/3] tcg: Improve vector tail clearing
From: Richard Henderson @ 2020-04-18 15:56 UTC
  To: qemu-devel; +Cc: peter.maydell

Better handling of non-power-of-2 tails as seen with Arm 8-byte
vector operations.

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/tcg-op-gvec.c | 82 ++++++++++++++++++++++++++++++++++++-----------
 1 file changed, 63 insertions(+), 19 deletions(-)

diff --git a/tcg/tcg-op-gvec.c b/tcg/tcg-op-gvec.c
index 5a6cc19812..43cac1a0bf 100644
--- a/tcg/tcg-op-gvec.c
+++ b/tcg/tcg-op-gvec.c
@@ -326,11 +326,34 @@ void tcg_gen_gvec_5_ptr(uint32_t dofs, uint32_t aofs, uint32_t bofs,
    in units of LNSZ.  This limits the expansion of inline code.  */
 static inline bool check_size_impl(uint32_t oprsz, uint32_t lnsz)
 {
-    if (oprsz % lnsz == 0) {
-        uint32_t lnct = oprsz / lnsz;
-        return lnct >= 1 && lnct <= MAX_UNROLL;
+    uint32_t q, r;
+
+    if (oprsz < lnsz) {
+        return false;
     }
-    return false;
+
+    q = oprsz / lnsz;
+    r = oprsz % lnsz;
+    tcg_debug_assert((r & 7) == 0);
+
+    if (lnsz < 16) {
+        /* For sizes below 16, accept no remainder. */
+        if (r != 0) {
+            return false;
+        }
+    } else {
+        /*
+         * Recall that ARM SVE allows vector sizes that are not a
+         * power of 2, but always a multiple of 16.  The intent is
+         * that e.g. size == 80 would be expanded with 2x32 + 1x16.
+         * In addition, expand_clr needs to handle a multiple of 8.
+         * Thus we can handle the tail with one more operation per
+         * diminishing power of 2.
+         */
+        q += ctpop32(r);
+    }
+
+    return q <= MAX_UNROLL;
 }
 
 static void expand_clr(uint32_t dofs, uint32_t maxsz);
@@ -402,22 +425,31 @@ static void gen_dup_i64(unsigned vece, TCGv_i64 out, TCGv_i64 in)
 static TCGType choose_vector_type(const TCGOpcode *list, unsigned vece,
                                   uint32_t size, bool prefer_i64)
 {
-    if (TCG_TARGET_HAS_v256 && check_size_impl(size, 32)) {
-        /*
-         * Recall that ARM SVE allows vector sizes that are not a
-         * power of 2, but always a multiple of 16.  The intent is
-         * that e.g. size == 80 would be expanded with 2x32 + 1x16.
-         * It is hard to imagine a case in which v256 is supported
-         * but v128 is not, but check anyway.
-         */
-        if (tcg_can_emit_vecop_list(list, TCG_TYPE_V256, vece)
-            && (size % 32 == 0
-                || tcg_can_emit_vecop_list(list, TCG_TYPE_V128, vece))) {
-            return TCG_TYPE_V256;
-        }
+    /*
+     * Recall that ARM SVE allows vector sizes that are not a
+     * power of 2, but always a multiple of 16.  The intent is
+     * that e.g. size == 80 would be expanded with 2x32 + 1x16.
+     * It is hard to imagine a case in which v256 is supported
+     * but v128 is not, but check anyway.
+     * In addition, expand_clr needs to handle a multiple of 8.
+     */
+    if (TCG_TARGET_HAS_v256 &&
+        check_size_impl(size, 32) &&
+        tcg_can_emit_vecop_list(list, TCG_TYPE_V256, vece) &&
+        (!(size & 16) ||
+         (TCG_TARGET_HAS_v128 &&
+          tcg_can_emit_vecop_list(list, TCG_TYPE_V128, vece))) &&
+        (!(size & 8) ||
+         (TCG_TARGET_HAS_v64 &&
+          tcg_can_emit_vecop_list(list, TCG_TYPE_V64, vece)))) {
+        return TCG_TYPE_V256;
     }
-    if (TCG_TARGET_HAS_v128 && check_size_impl(size, 16)
-        && tcg_can_emit_vecop_list(list, TCG_TYPE_V128, vece)) {
+    if (TCG_TARGET_HAS_v128 &&
+        check_size_impl(size, 16) &&
+        tcg_can_emit_vecop_list(list, TCG_TYPE_V128, vece) &&
+        (!(size & 8) ||
+         (TCG_TARGET_HAS_v64 &&
+          tcg_can_emit_vecop_list(list, TCG_TYPE_V64, vece)))) {
         return TCG_TYPE_V128;
     }
     if (TCG_TARGET_HAS_v64 && !prefer_i64 && check_size_impl(size, 8)
@@ -432,6 +464,18 @@ static void do_dup_store(TCGType type, uint32_t dofs, uint32_t oprsz,
 {
     uint32_t i = 0;
 
+    tcg_debug_assert(oprsz >= 8);
+
+    /*
+     * This may be expand_clr for the tail of an operation, e.g.
+     * oprsz == 8 && maxsz == 64.  The first 8 bytes of this store
+     * are misaligned wrt the maximum vector size, so do that first.
+     */
+    if (dofs & 8) {
+        tcg_gen_stl_vec(t_vec, cpu_env, dofs + i, TCG_TYPE_V64);
+        i += 8;
+    }
+
     switch (type) {
     case TCG_TYPE_V256:
         /*
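
To make the new accounting concrete, here is a standalone sketch of
the check_size_impl logic (not part of the patch; MAX_UNROLL is
assumed to be 4, matching tcg/tcg-op-gvec.c, and ctpop32 is replaced
by the GCC builtin):

  #include <stdbool.h>
  #include <stdint.h>
  #include <stdio.h>

  #define MAX_UNROLL 4   /* assumed; the value in tcg/tcg-op-gvec.c */

  /* Same accounting as the patched check_size_impl, minus asserts. */
  static bool check_size_sketch(uint32_t oprsz, uint32_t lnsz)
  {
      uint32_t q, r;

      if (oprsz < lnsz) {
          return false;
      }
      q = oprsz / lnsz;
      r = oprsz % lnsz;
      if (lnsz < 16) {
          /* For sizes below 16, accept no remainder. */
          return r == 0 && q <= MAX_UNROLL;
      }
      /* One extra operation per diminishing power of 2 in the tail. */
      return q + __builtin_popcount(r) <= MAX_UNROLL;
  }

  int main(void)
  {
      printf("%d\n", check_size_sketch(80, 32)); /* 1: 2x32 + 1x16 = 3 ops */
      printf("%d\n", check_size_sketch(24, 32)); /* 0: oprsz < lnsz */
      printf("%d\n", check_size_sketch(24, 16)); /* 1: 1x16 + 1x8 = 2 ops */
      return 0;
  }
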
-- 
2.20.1




* [PATCH 2/3] target/arm: Use tcg_gen_gvec_mov for clear_vec_high
From: Richard Henderson @ 2020-04-18 15:56 UTC
  To: qemu-devel; +Cc: peter.maydell

The 8-byte store at the end of a !is_q operation can be
merged with the other stores.  Use a no-op vector move
to trigger the expand_clr portion of tcg_gen_gvec_mov.

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 target/arm/translate-a64.c | 10 ++--------
 1 file changed, 2 insertions(+), 8 deletions(-)

diff --git a/target/arm/translate-a64.c b/target/arm/translate-a64.c
index 095638e09a..d57aa54d6a 100644
--- a/target/arm/translate-a64.c
+++ b/target/arm/translate-a64.c
@@ -513,14 +513,8 @@ static void clear_vec_high(DisasContext *s, bool is_q, int rd)
     unsigned ofs = fp_reg_offset(s, rd, MO_64);
     unsigned vsz = vec_full_reg_size(s);
 
-    if (!is_q) {
-        TCGv_i64 tcg_zero = tcg_const_i64(0);
-        tcg_gen_st_i64(tcg_zero, cpu_env, ofs + 8);
-        tcg_temp_free_i64(tcg_zero);
-    }
-    if (vsz > 16) {
-        tcg_gen_gvec_dup_imm(MO_64, ofs + 16, vsz - 16, vsz - 16, 0);
-    }
+    /* Nop move, with side effect of clearing the tail. */
+    tcg_gen_gvec_mov(MO_64, ofs, ofs, is_q ? 16 : 8, vsz);
 }
 
 void write_fp_dreg(DisasContext *s, int reg, TCGv_i64 v)
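
The trick relies on the gvec contract that an expansion writes oprsz
bytes and then zeroes bytes [oprsz, maxsz); with dofs == aofs the
copy vanishes and only the clear remains.  A plain-C model of that
contract (a sketch, not the TCG expansion itself):

  #include <stdint.h>
  #include <string.h>

  /* Model of a gvec mov over the guest register file: copy oprsz
   * bytes, then clear up to maxsz (the expand_clr step). */
  static void gvec_mov_model(uint8_t *d, const uint8_t *a,
                             size_t oprsz, size_t maxsz)
  {
      memmove(d, a, oprsz);                 /* no-op when d == a */
      memset(d + oprsz, 0, maxsz - oprsz);  /* expand_clr */
  }

With is_q false, oprsz is 8, so the explicit zero store to bytes
[8, 16) folds into the same tail clear that already handles bytes
[16, vsz).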
-- 
2.20.1




* [PATCH 3/3] target/arm: Use clear_vec_high more effectively
From: Richard Henderson @ 2020-04-18 15:56 UTC
  To: qemu-devel; +Cc: peter.maydell

Do not explicitly store zero to the NEON high part
when we can pass !is_q to clear_vec_high.

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 target/arm/translate-a64.c | 59 +++++++++++++++++++++++---------------
 1 file changed, 36 insertions(+), 23 deletions(-)

diff --git a/target/arm/translate-a64.c b/target/arm/translate-a64.c
index d57aa54d6a..bf82a2e115 100644
--- a/target/arm/translate-a64.c
+++ b/target/arm/translate-a64.c
@@ -948,11 +948,10 @@ static void do_fp_ld(DisasContext *s, int destidx, TCGv_i64 tcg_addr, int size)
 {
     /* This always zero-extends and writes to a full 128 bit wide vector */
     TCGv_i64 tmplo = tcg_temp_new_i64();
-    TCGv_i64 tmphi;
+    TCGv_i64 tmphi = NULL;
 
     if (size < 4) {
         MemOp memop = s->be_data + size;
-        tmphi = tcg_const_i64(0);
         tcg_gen_qemu_ld_i64(tmplo, tcg_addr, get_mem_index(s), memop);
     } else {
         bool be = s->be_data == MO_BE;
@@ -970,12 +969,13 @@ static void do_fp_ld(DisasContext *s, int destidx, TCGv_i64 tcg_addr, int size)
     }
 
     tcg_gen_st_i64(tmplo, cpu_env, fp_reg_offset(s, destidx, MO_64));
-    tcg_gen_st_i64(tmphi, cpu_env, fp_reg_hi_offset(s, destidx));
-
     tcg_temp_free_i64(tmplo);
-    tcg_temp_free_i64(tmphi);
 
-    clear_vec_high(s, true, destidx);
+    if (tmphi) {
+        tcg_gen_st_i64(tmphi, cpu_env, fp_reg_hi_offset(s, destidx));
+        tcg_temp_free_i64(tmphi);
+    }
+    clear_vec_high(s, tmphi != NULL, destidx);
 }
 
 /*
@@ -6969,8 +6969,8 @@ static void disas_simd_ext(DisasContext *s, uint32_t insn)
         return;
     }
 
-    tcg_resh = tcg_temp_new_i64();
     tcg_resl = tcg_temp_new_i64();
+    tcg_resh = NULL;
 
     /* Vd gets bits starting at pos bits into Vm:Vn. This is
      * either extracting 128 bits from a 128:128 concatenation, or
@@ -6982,7 +6982,6 @@ static void disas_simd_ext(DisasContext *s, uint32_t insn)
             read_vec_element(s, tcg_resh, rm, 0, MO_64);
             do_ext64(s, tcg_resh, tcg_resl, pos);
         }
-        tcg_gen_movi_i64(tcg_resh, 0);
     } else {
         TCGv_i64 tcg_hh;
         typedef struct {
@@ -6997,6 +6996,7 @@ static void disas_simd_ext(DisasContext *s, uint32_t insn)
             pos -= 64;
         }
 
+        tcg_resh = tcg_temp_new_i64();
         read_vec_element(s, tcg_resl, elt->reg, elt->elt, MO_64);
         elt++;
         read_vec_element(s, tcg_resh, elt->reg, elt->elt, MO_64);
@@ -7012,9 +7012,12 @@ static void disas_simd_ext(DisasContext *s, uint32_t insn)
 
     write_vec_element(s, tcg_resl, rd, 0, MO_64);
     tcg_temp_free_i64(tcg_resl);
-    write_vec_element(s, tcg_resh, rd, 1, MO_64);
-    tcg_temp_free_i64(tcg_resh);
-    clear_vec_high(s, true, rd);
+
+    if (is_q) {
+        write_vec_element(s, tcg_resh, rd, 1, MO_64);
+        tcg_temp_free_i64(tcg_resh);
+    }
+    clear_vec_high(s, is_q, rd);
 }
 
 /* TBL/TBX
@@ -7051,17 +7054,21 @@ static void disas_simd_tb(DisasContext *s, uint32_t insn)
      * the input.
      */
     tcg_resl = tcg_temp_new_i64();
-    tcg_resh = tcg_temp_new_i64();
+    tcg_resh = NULL;
 
     if (is_tblx) {
         read_vec_element(s, tcg_resl, rd, 0, MO_64);
     } else {
         tcg_gen_movi_i64(tcg_resl, 0);
     }
-    if (is_tblx && is_q) {
-        read_vec_element(s, tcg_resh, rd, 1, MO_64);
-    } else {
-        tcg_gen_movi_i64(tcg_resh, 0);
+
+    if (is_q) {
+        tcg_resh = tcg_temp_new_i64();
+        if (is_tblx) {
+            read_vec_element(s, tcg_resh, rd, 1, MO_64);
+        } else {
+            tcg_gen_movi_i64(tcg_resh, 0);
+        }
     }
 
     tcg_idx = tcg_temp_new_i64();
@@ -7081,9 +7088,12 @@ static void disas_simd_tb(DisasContext *s, uint32_t insn)
 
     write_vec_element(s, tcg_resl, rd, 0, MO_64);
     tcg_temp_free_i64(tcg_resl);
-    write_vec_element(s, tcg_resh, rd, 1, MO_64);
-    tcg_temp_free_i64(tcg_resh);
-    clear_vec_high(s, true, rd);
+
+    if (is_q) {
+        write_vec_element(s, tcg_resh, rd, 1, MO_64);
+        tcg_temp_free_i64(tcg_resh);
+    }
+    clear_vec_high(s, is_q, rd);
 }
 
 /* ZIP/UZP/TRN
@@ -7120,7 +7130,7 @@ static void disas_simd_zip_trn(DisasContext *s, uint32_t insn)
     }
 
     tcg_resl = tcg_const_i64(0);
-    tcg_resh = tcg_const_i64(0);
+    tcg_resh = is_q ? tcg_const_i64(0) : NULL;
     tcg_res = tcg_temp_new_i64();
 
     for (i = 0; i < elements; i++) {
@@ -7171,9 +7181,12 @@ static void disas_simd_zip_trn(DisasContext *s, uint32_t insn)
 
     write_vec_element(s, tcg_resl, rd, 0, MO_64);
     tcg_temp_free_i64(tcg_resl);
-    write_vec_element(s, tcg_resh, rd, 1, MO_64);
-    tcg_temp_free_i64(tcg_resh);
-    clear_vec_high(s, true, rd);
+
+    if (is_q) {
+        write_vec_element(s, tcg_resh, rd, 1, MO_64);
+        tcg_temp_free_i64(tcg_resh);
+    }
+    clear_vec_high(s, is_q, rd);
 }
 
 /*
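
The shape is the same at every converted site: the high half is only
materialized when is_q, and clear_vec_high covers everything above
the bytes actually written.  A plain-C model of the !is_q path (a
sketch; the real code goes through write_vec_element, which handles
element ordering):

  #include <stdint.h>
  #include <string.h>

  /* !is_q: write the 64-bit result, then zero bytes [8, vsz).  The
   * removed explicit store of zero to bytes [8, 16) is subsumed. */
  static void write_lo_and_clear(uint8_t *vreg, uint64_t lo, size_t vsz)
  {
      memcpy(vreg, &lo, 8);          /* write_vec_element(..., 0, MO_64) */
      memset(vreg + 8, 0, vsz - 8);  /* clear_vec_high(s, false, rd) */
  }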
-- 
2.20.1




* Re: [PATCH 1/3] tcg: Improve vector tail clearing
From: Alex Bennée @ 2020-04-20 15:25 UTC
  To: Richard Henderson; +Cc: peter.maydell, qemu-devel


Richard Henderson <richard.henderson@linaro.org> writes:

> Better handling of non-power-of-2 tails as seen with Arm 8-byte
> vector operations.
>
> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>

> [diff snipped]


-- 
Alex Bennée



* Re: [PATCH 2/3] target/arm: Use tcg_gen_gvec_mov for clear_vec_high
From: Alex Bennée @ 2020-04-20 15:29 UTC
  To: Richard Henderson; +Cc: peter.maydell, qemu-devel


Richard Henderson <richard.henderson@linaro.org> writes:

> The 8-byte store at the end of a !is_q operation can be
> merged with the other stores.  Use a no-op vector move
> to trigger the expand_clr portion of tcg_gen_gvec_mov.
>
> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>

> [diff snipped]


-- 
Alex Bennée



* Re: [PATCH 3/3] target/arm: Use clear_vec_high more effectively
From: Alex Bennée @ 2020-04-20 15:32 UTC
  To: Richard Henderson; +Cc: peter.maydell, qemu-devel


Richard Henderson <richard.henderson@linaro.org> writes:

> Do not explicitly store zero to the NEON high part
> when we can pass !is_q to clear_vec_high.
>
> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>

> [diff snipped]


-- 
Alex Bennée


