* [PATCH 0/3] tcg: Improve vector tail clearing
From: Richard Henderson @ 2020-04-18 15:56 UTC
To: qemu-devel; +Cc: peter.maydell

Something I noticed while looking at AdvSIMD dumps while testing
changes common with SVE2.

If we're going to load a zero into a vector register for clearing
the high bits of the SVE register, we might as well use that zero
to store the 8 bytes at the top of the AdvSIMD register as well.

Output assembly goes from e.g.

  00: 48 c7 85 08 10 00 00 00  movq    $0x0,0x1008(%rbp)
      00 00 00
  0b: c5 f9 ef c0              vpxor   %xmm0,%xmm0,%xmm0
  0f: c5 fe 7f 85 10 10 00 00  vmovdqu %ymm0,0x1010(%rbp)
  17: c5 fa 7f 85 30 10 00 00  vmovdqu %xmm0,0x1030(%rbp)

to

  00: c5 f9 ef c0              vpxor   %xmm0,%xmm0,%xmm0
  04: c5 f9 d6 85 08 10 00 00  vmovq   %xmm0,0x1008(%rbp)
  0c: c5 fe 7f 85 10 10 00 00  vmovdqu %ymm0,0x1010(%rbp)
  14: c5 fa 7f 85 30 10 00 00  vmovdqu %xmm0,0x1030(%rbp)

This saves a few bytes now, and will save more once we do a better
job of loading constants into registers, at which point the vpxor
can be shared between instructions.

The target/arm patches are not aided by the tcg patch, but neither
do they depend on it.

r~

Richard Henderson (3):
  tcg: Improve vector tail clearing
  target/arm: Use tcg_gen_gvec_mov for clear_vec_high
  target/arm: Use clear_vec_high more effectively

 target/arm/translate-a64.c | 69 ++++++++++++++++++--------------
 tcg/tcg-op-gvec.c          | 82 +++++++++++++++++++++++++++++---------
 2 files changed, 101 insertions(+), 50 deletions(-)

-- 
2.20.1
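To make the "after" sequence concrete outside of TCG, here is a minimal
C sketch of the same store pattern using x86 intrinsics. This is
illustrative only: the function name, the env pointer, and the 0x1000
register offset are invented for the example, and it assumes compilation
with AVX enabled (e.g. -mavx).

    #include <immintrin.h>
    #include <stdint.h>

    /* Zero bytes [8, 64) of a 64-byte vector register stored at
     * env + 0x1000, reusing one zeroed register for all three stores,
     * as in the dump above: one vpxor, then vmovq (8 bytes),
     * vmovdqu %ymm (32 bytes), and vmovdqu %xmm (16 bytes). */
    static void clear_tail_example(uint8_t *env)
    {
        __m256i z    = _mm256_setzero_si256();     /* vpxor          */
        __m128i z128 = _mm256_castsi256_si128(z);  /* same zero, xmm */

        _mm_storel_epi64((__m128i *)(env + 0x1008), z128);  /* vmovq */
        _mm256_storeu_si256((__m256i *)(env + 0x1010), z);  /* 32B   */
        _mm_storeu_si128((__m128i *)(env + 0x1030), z128);  /* 16B   */
    }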
* [PATCH 1/3] tcg: Improve vector tail clearing
From: Richard Henderson @ 2020-04-18 15:56 UTC
To: qemu-devel; +Cc: peter.maydell

Better handling of non-power-of-2 tails as seen with Arm 8-byte
vector operations.

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/tcg-op-gvec.c | 82 ++++++++++++++++++++++++++++++++++++-----------
 1 file changed, 63 insertions(+), 19 deletions(-)

diff --git a/tcg/tcg-op-gvec.c b/tcg/tcg-op-gvec.c
index 5a6cc19812..43cac1a0bf 100644
--- a/tcg/tcg-op-gvec.c
+++ b/tcg/tcg-op-gvec.c
@@ -326,11 +326,34 @@ void tcg_gen_gvec_5_ptr(uint32_t dofs, uint32_t aofs, uint32_t bofs,
    in units of LNSZ.  This limits the expansion of inline code.  */
 static inline bool check_size_impl(uint32_t oprsz, uint32_t lnsz)
 {
-    if (oprsz % lnsz == 0) {
-        uint32_t lnct = oprsz / lnsz;
-        return lnct >= 1 && lnct <= MAX_UNROLL;
+    uint32_t q, r;
+
+    if (oprsz < lnsz) {
+        return false;
     }
-    return false;
+
+    q = oprsz / lnsz;
+    r = oprsz % lnsz;
+    tcg_debug_assert((r & 7) == 0);
+
+    if (lnsz < 16) {
+        /* For sizes below 16, accept no remainder. */
+        if (r != 0) {
+            return false;
+        }
+    } else {
+        /*
+         * Recall that ARM SVE allows vector sizes that are not a
+         * power of 2, but always a multiple of 16.  The intent is
+         * that e.g. size == 80 would be expanded with 2x32 + 1x16.
+         * In addition, expand_clr needs to handle a multiple of 8.
+         * Thus we can handle the tail with one more operation per
+         * diminishing power of 2.
+         */
+        q += ctpop32(r);
+    }
+
+    return q <= MAX_UNROLL;
 }
 
 static void expand_clr(uint32_t dofs, uint32_t maxsz);
@@ -402,22 +425,31 @@ static void gen_dup_i64(unsigned vece, TCGv_i64 out, TCGv_i64 in)
 static TCGType choose_vector_type(const TCGOpcode *list, unsigned vece,
                                   uint32_t size, bool prefer_i64)
 {
-    if (TCG_TARGET_HAS_v256 && check_size_impl(size, 32)) {
-        /*
-         * Recall that ARM SVE allows vector sizes that are not a
-         * power of 2, but always a multiple of 16.  The intent is
-         * that e.g. size == 80 would be expanded with 2x32 + 1x16.
-         * It is hard to imagine a case in which v256 is supported
-         * but v128 is not, but check anyway.
-         */
-        if (tcg_can_emit_vecop_list(list, TCG_TYPE_V256, vece)
-            && (size % 32 == 0
-                || tcg_can_emit_vecop_list(list, TCG_TYPE_V128, vece))) {
-            return TCG_TYPE_V256;
-        }
+    /*
+     * Recall that ARM SVE allows vector sizes that are not a
+     * power of 2, but always a multiple of 16.  The intent is
+     * that e.g. size == 80 would be expanded with 2x32 + 1x16.
+     * It is hard to imagine a case in which v256 is supported
+     * but v128 is not, but check anyway.
+     * In addition, expand_clr needs to handle a multiple of 8.
+     */
+    if (TCG_TARGET_HAS_v256 &&
+        check_size_impl(size, 32) &&
+        tcg_can_emit_vecop_list(list, TCG_TYPE_V256, vece) &&
+        (!(size & 16) ||
+         (TCG_TARGET_HAS_v128 &&
+          tcg_can_emit_vecop_list(list, TCG_TYPE_V128, vece))) &&
+        (!(size & 8) ||
+         (TCG_TARGET_HAS_v64 &&
+          tcg_can_emit_vecop_list(list, TCG_TYPE_V64, vece)))) {
+        return TCG_TYPE_V256;
     }
-    if (TCG_TARGET_HAS_v128 && check_size_impl(size, 16)
-        && tcg_can_emit_vecop_list(list, TCG_TYPE_V128, vece)) {
+    if (TCG_TARGET_HAS_v128 &&
+        check_size_impl(size, 16) &&
+        tcg_can_emit_vecop_list(list, TCG_TYPE_V128, vece) &&
+        (!(size & 8) ||
+         (TCG_TARGET_HAS_v64 &&
+          tcg_can_emit_vecop_list(list, TCG_TYPE_V64, vece)))) {
         return TCG_TYPE_V128;
     }
     if (TCG_TARGET_HAS_v64 && !prefer_i64 && check_size_impl(size, 8)
@@ -432,6 +464,18 @@ static void do_dup_store(TCGType type, uint32_t dofs, uint32_t oprsz,
 {
     uint32_t i = 0;
 
+    tcg_debug_assert(oprsz >= 8);
+
+    /*
+     * This may be expand_clr for the tail of an operation, e.g.
+     * oprsz == 8 && maxsz == 64.  The first 8 bytes of this store
+     * are misaligned wrt the maximum vector size, so do that first.
+     */
+    if (dofs & 8) {
+        tcg_gen_stl_vec(t_vec, cpu_env, dofs + i, TCG_TYPE_V64);
+        i += 8;
+    }
+
     switch (type) {
     case TCG_TYPE_V256:
         /*
-- 
2.20.1
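To see what the new accounting accepts, here is a small standalone model
of check_size_impl. This is my own sketch, not QEMU code:
__builtin_popcount stands in for ctpop32, assert for tcg_debug_assert,
and MAX_UNROLL is assumed to be 4 as in tcg/tcg-op-gvec.c.

    #include <stdio.h>
    #include <stdint.h>
    #include <assert.h>

    #define MAX_UNROLL 4  /* assumed value from tcg/tcg-op-gvec.c */

    /* Return the number of stores needed for oprsz bytes with lnsz-byte
     * vectors, or -1 if the size is rejected for inline expansion. */
    static int ops_needed(uint32_t oprsz, uint32_t lnsz)
    {
        uint32_t q, r;

        if (oprsz < lnsz) {
            return -1;
        }
        q = oprsz / lnsz;
        r = oprsz % lnsz;
        assert((r & 7) == 0);           /* tails are multiples of 8 */

        if (lnsz < 16) {
            if (r != 0) {
                return -1;              /* below 16 bytes: no remainder */
            }
        } else {
            /* One extra op per diminishing power of 2 in the tail. */
            q += __builtin_popcount(r);
        }
        return q <= MAX_UNROLL ? (int)q : -1;
    }

    int main(void)
    {
        printf("%d\n", ops_needed(80, 32)); /* SVE 80: 2x32 + 1x16 -> 3 */
        printf("%d\n", ops_needed(56, 32)); /* 64B clear minus 8B head:
                                               1x32 + 1x16 + 1x8 -> 3 */
        printf("%d\n", ops_needed(24, 16)); /* 1x16 + 1x8 -> 2 */
        return 0;
    }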
* Re: [PATCH 1/3] tcg: Improve vector tail clearing
From: Alex Bennée @ 2020-04-20 15:25 UTC
To: Richard Henderson; +Cc: peter.maydell, qemu-devel

Richard Henderson <richard.henderson@linaro.org> writes:

> Better handling of non-power-of-2 tails as seen with Arm 8-byte
> vector operations.
>
> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>

-- 
Alex Bennée
* [PATCH 2/3] target/arm: Use tcg_gen_gvec_mov for clear_vec_high
From: Richard Henderson @ 2020-04-18 15:56 UTC
To: qemu-devel; +Cc: peter.maydell

The 8-byte store for the end of a !is_q operation can be
merged with the other stores.  Use a no-op vector move
to trigger the expand_clr portion of tcg_gen_gvec_mov.

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 target/arm/translate-a64.c | 10 ++--------
 1 file changed, 2 insertions(+), 8 deletions(-)

diff --git a/target/arm/translate-a64.c b/target/arm/translate-a64.c
index 095638e09a..d57aa54d6a 100644
--- a/target/arm/translate-a64.c
+++ b/target/arm/translate-a64.c
@@ -513,14 +513,8 @@ static void clear_vec_high(DisasContext *s, bool is_q, int rd)
     unsigned ofs = fp_reg_offset(s, rd, MO_64);
     unsigned vsz = vec_full_reg_size(s);
 
-    if (!is_q) {
-        TCGv_i64 tcg_zero = tcg_const_i64(0);
-        tcg_gen_st_i64(tcg_zero, cpu_env, ofs + 8);
-        tcg_temp_free_i64(tcg_zero);
-    }
-    if (vsz > 16) {
-        tcg_gen_gvec_dup_imm(MO_64, ofs + 16, vsz - 16, vsz - 16, 0);
-    }
+    /* Nop move, with side effect of clearing the tail. */
+    tcg_gen_gvec_mov(MO_64, ofs, ofs, is_q ? 16 : 8, vsz);
 }
 
 void write_fp_dreg(DisasContext *s, int reg, TCGv_i64 v)
-- 
2.20.1
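The trick here leans on the gvec convention that an expansion writes
bytes [0, oprsz) of the destination and zeroes the tail [oprsz, maxsz).
Below is a toy model of the byte-level effect of the self-move; it is
my sketch of the contract, not the actual TCG expansion.

    #include <stdint.h>
    #include <string.h>

    /* Byte-level effect of tcg_gen_gvec_mov(MO_64, dofs, aofs, oprsz,
     * maxsz): copy oprsz bytes, then zero the tail.  With dofs == aofs
     * the copy is a no-op and only the tail clear remains. */
    static void gvec_mov_model(uint8_t *dst, const uint8_t *src,
                               uint32_t oprsz, uint32_t maxsz)
    {
        if (dst != src) {
            memmove(dst, src, oprsz);
        }
        memset(dst + oprsz, 0, maxsz - oprsz);  /* the expand_clr part */
    }

So for !is_q this amounts to a single clear of bytes [8, vsz), which is
exactly the dofs & 8 case that patch 1 teaches do_dup_store to begin
with an 8-byte store.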
* Re: [PATCH 2/3] target/arm: Use tcg_gen_gvec_mov for clear_vec_high
From: Alex Bennée @ 2020-04-20 15:29 UTC
To: Richard Henderson; +Cc: peter.maydell, qemu-devel

Richard Henderson <richard.henderson@linaro.org> writes:

> The 8-byte store for the end of a !is_q operation can be
> merged with the other stores.  Use a no-op vector move
> to trigger the expand_clr portion of tcg_gen_gvec_mov.
>
> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>

-- 
Alex Bennée
* [PATCH 3/3] target/arm: Use clear_vec_high more effectively
From: Richard Henderson @ 2020-04-18 15:56 UTC
To: qemu-devel; +Cc: peter.maydell

Do not explicitly store zero to the NEON high part
when we can pass !is_q to clear_vec_high.

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 target/arm/translate-a64.c | 59 +++++++++++++++++++++++---------------
 1 file changed, 36 insertions(+), 23 deletions(-)

diff --git a/target/arm/translate-a64.c b/target/arm/translate-a64.c
index d57aa54d6a..bf82a2e115 100644
--- a/target/arm/translate-a64.c
+++ b/target/arm/translate-a64.c
@@ -948,11 +948,10 @@ static void do_fp_ld(DisasContext *s, int destidx, TCGv_i64 tcg_addr, int size)
 {
     /* This always zero-extends and writes to a full 128 bit wide vector */
     TCGv_i64 tmplo = tcg_temp_new_i64();
-    TCGv_i64 tmphi;
+    TCGv_i64 tmphi = NULL;
 
     if (size < 4) {
         MemOp memop = s->be_data + size;
-        tmphi = tcg_const_i64(0);
         tcg_gen_qemu_ld_i64(tmplo, tcg_addr, get_mem_index(s), memop);
     } else {
         bool be = s->be_data == MO_BE;
@@ -970,12 +969,13 @@ static void do_fp_ld(DisasContext *s, int destidx, TCGv_i64 tcg_addr, int size)
     }
 
     tcg_gen_st_i64(tmplo, cpu_env, fp_reg_offset(s, destidx, MO_64));
-    tcg_gen_st_i64(tmphi, cpu_env, fp_reg_hi_offset(s, destidx));
-
     tcg_temp_free_i64(tmplo);
-    tcg_temp_free_i64(tmphi);
 
-    clear_vec_high(s, true, destidx);
+    if (tmphi) {
+        tcg_gen_st_i64(tmphi, cpu_env, fp_reg_hi_offset(s, destidx));
+        tcg_temp_free_i64(tmphi);
+    }
+    clear_vec_high(s, tmphi != NULL, destidx);
 }
 
 /*
@@ -6969,8 +6969,8 @@ static void disas_simd_ext(DisasContext *s, uint32_t insn)
         return;
     }
 
-    tcg_resh = tcg_temp_new_i64();
     tcg_resl = tcg_temp_new_i64();
+    tcg_resh = NULL;
 
     /* Vd gets bits starting at pos bits into Vm:Vn.  This is
      * either extracting 128 bits from a 128:128 concatenation, or
@@ -6982,7 +6982,6 @@ static void disas_simd_ext(DisasContext *s, uint32_t insn)
             read_vec_element(s, tcg_resh, rm, 0, MO_64);
             do_ext64(s, tcg_resh, tcg_resl, pos);
         }
-        tcg_gen_movi_i64(tcg_resh, 0);
     } else {
         TCGv_i64 tcg_hh;
         typedef struct {
@@ -6997,6 +6996,7 @@ static void disas_simd_ext(DisasContext *s, uint32_t insn)
             pos -= 64;
         }
 
+        tcg_resh = tcg_temp_new_i64();
         read_vec_element(s, tcg_resl, elt->reg, elt->elt, MO_64);
         elt++;
         read_vec_element(s, tcg_resh, elt->reg, elt->elt, MO_64);
@@ -7012,9 +7012,12 @@ static void disas_simd_ext(DisasContext *s, uint32_t insn)
 
     write_vec_element(s, tcg_resl, rd, 0, MO_64);
     tcg_temp_free_i64(tcg_resl);
-    write_vec_element(s, tcg_resh, rd, 1, MO_64);
-    tcg_temp_free_i64(tcg_resh);
-    clear_vec_high(s, true, rd);
+
+    if (is_q) {
+        write_vec_element(s, tcg_resh, rd, 1, MO_64);
+        tcg_temp_free_i64(tcg_resh);
+    }
+    clear_vec_high(s, is_q, rd);
 }
 
 /* TBL/TBX
@@ -7051,17 +7054,21 @@ static void disas_simd_tb(DisasContext *s, uint32_t insn)
      * the input.
      */
     tcg_resl = tcg_temp_new_i64();
-    tcg_resh = tcg_temp_new_i64();
+    tcg_resh = NULL;
 
     if (is_tblx) {
         read_vec_element(s, tcg_resl, rd, 0, MO_64);
     } else {
         tcg_gen_movi_i64(tcg_resl, 0);
     }
-    if (is_tblx && is_q) {
-        read_vec_element(s, tcg_resh, rd, 1, MO_64);
-    } else {
-        tcg_gen_movi_i64(tcg_resh, 0);
+
+    if (is_q) {
+        tcg_resh = tcg_temp_new_i64();
+        if (is_tblx) {
+            read_vec_element(s, tcg_resh, rd, 1, MO_64);
+        } else {
+            tcg_gen_movi_i64(tcg_resh, 0);
+        }
     }
 
     tcg_idx = tcg_temp_new_i64();
@@ -7081,9 +7088,12 @@ static void disas_simd_tb(DisasContext *s, uint32_t insn)
 
     write_vec_element(s, tcg_resl, rd, 0, MO_64);
     tcg_temp_free_i64(tcg_resl);
-    write_vec_element(s, tcg_resh, rd, 1, MO_64);
-    tcg_temp_free_i64(tcg_resh);
-    clear_vec_high(s, true, rd);
+
+    if (is_q) {
+        write_vec_element(s, tcg_resh, rd, 1, MO_64);
+        tcg_temp_free_i64(tcg_resh);
+    }
+    clear_vec_high(s, is_q, rd);
 }
 
 /* ZIP/UZP/TRN
@@ -7120,7 +7130,7 @@ static void disas_simd_zip_trn(DisasContext *s, uint32_t insn)
     }
 
     tcg_resl = tcg_const_i64(0);
-    tcg_resh = tcg_const_i64(0);
+    tcg_resh = is_q ? tcg_const_i64(0) : NULL;
     tcg_res = tcg_temp_new_i64();
 
     for (i = 0; i < elements; i++) {
@@ -7171,9 +7181,12 @@ static void disas_simd_zip_trn(DisasContext *s, uint32_t insn)
 
     write_vec_element(s, tcg_resl, rd, 0, MO_64);
     tcg_temp_free_i64(tcg_resl);
-    write_vec_element(s, tcg_resh, rd, 1, MO_64);
-    tcg_temp_free_i64(tcg_resh);
-    clear_vec_high(s, true, rd);
+
+    if (is_q) {
+        write_vec_element(s, tcg_resh, rd, 1, MO_64);
+        tcg_temp_free_i64(tcg_resh);
+    }
+    clear_vec_high(s, is_q, rd);
 }
 
 /*
-- 
2.20.1
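For the !is_q paths, the net effect on the register file can be modelled
as below. This is a sketch with invented names: reg is assumed to point
at the vsz-byte backing store of one full vector register.

    #include <stdint.h>
    #include <string.h>

    /* Before: write the low half, explicitly zero bytes [8, 16),
     * then clear the SVE tail [16, vsz) as a separate operation. */
    static void write_d_old(uint8_t *reg, uint64_t lo, unsigned vsz)
    {
        memcpy(reg, &lo, 8);
        memset(reg + 8, 0, 8);
        memset(reg + 16, 0, vsz - 16);
    }

    /* After: write the low half and let clear_vec_high(s, false, rd)
     * clear everything from byte 8 up in one gvec expansion. */
    static void write_d_new(uint8_t *reg, uint64_t lo, unsigned vsz)
    {
        memcpy(reg, &lo, 8);
        memset(reg + 8, 0, vsz - 8);
    }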
* Re: [PATCH 3/3] target/arm: Use clear_vec_high more effectively
From: Alex Bennée @ 2020-04-20 15:32 UTC
To: Richard Henderson; +Cc: peter.maydell, qemu-devel

Richard Henderson <richard.henderson@linaro.org> writes:

> Do not explicitly store zero to the NEON high part
> when we can pass !is_q to clear_vec_high.
>
> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>

-- 
Alex Bennée
Thread overview: 7 messages
  2020-04-18 15:56 [PATCH 0/3] tcg: Improve vector tail clearing Richard Henderson
  2020-04-18 15:56 ` [PATCH 1/3] " Richard Henderson
  2020-04-20 15:25   ` Alex Bennée
  2020-04-18 15:56 ` [PATCH 2/3] target/arm: Use tcg_gen_gvec_mov for clear_vec_high Richard Henderson
  2020-04-20 15:29   ` Alex Bennée
  2020-04-18 15:56 ` [PATCH 3/3] target/arm: Use clear_vec_high more effectively Richard Henderson
  2020-04-20 15:32   ` Alex Bennée