* [PATCH 0/3] tcg: Improve vector tail clearing
From: Richard Henderson @ 2020-04-18 15:56 UTC
To: qemu-devel; +Cc: peter.maydell
Something I noticed while looking at AdvSIMD dumps during
testing of changes shared with SVE2.
If we're going to load a zero into a vector register for
clearing the high bits of the SVE register, we might as
well use that zero to store the 8 bytes at the top of the
AdvSIMD register as well.
Output assembly goes from e.g.
00: 48 c7 85 08 10 00 00 00 movq $0x0,0x1008(%rbp)
00 00 00
0b: c5 f9 ef c0 vpxor %xmm0,%xmm0,%xmm0
0f: c5 fe 7f 85 10 10 00 00 vmovdqu %ymm0,0x1010(%rbp)
17: c5 fa 7f 85 30 10 00 00 vmovdqu %xmm0,0x1030(%rbp)
to
00: c5 f9 ef c0 vpxor %xmm0,%xmm0,%xmm0
04: c5 f9 d6 85 08 10 00 00 vmovq %xmm0,0x1008(%rbp)
0c: c5 fe 7f 85 10 10 00 00 vmovdqu %ymm0,0x1010(%rbp)
14: c5 fa 7f 85 30 10 00 00 vmovdqu %xmm0,0x1030(%rbp)
This saves a few bytes now, and will save more once we do a
better job of loading constants into registers, at which point
the vpxor can be shared between operations.
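For concreteness, here is a rough sketch of the expansion behind
the "after" dump, written against the real tcg vec helpers but
not the verbatim do_dup_store code; dofs and the 0x1008 layout
are taken from the dump above:

    TCGv_vec t_vec = tcg_temp_new_vec(TCG_TYPE_V256);

    tcg_gen_dupi_vec(MO_64, t_vec, 0);            /* vpxor %xmm0,%xmm0,%xmm0 */

    /* dofs & 8: peel off the misaligned leading 8 bytes first. */
    tcg_gen_stl_vec(t_vec, cpu_env, dofs, TCG_TYPE_V64);       /* vmovq,   0x1008 */
    tcg_gen_st_vec(t_vec, cpu_env, dofs + 8);                  /* vmovdqu %ymm0, 0x1010 */
    tcg_gen_stl_vec(t_vec, cpu_env, dofs + 40, TCG_TYPE_V128); /* vmovdqu %xmm0, 0x1030 */

    tcg_temp_free_vec(t_vec);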
The target/arm patches are aided by the tcg patch, but
are not dependent on it.
r~
Richard Henderson (3):
tcg: Improve vector tail clearing
target/arm: Use tcg_gen_gvec_mov for clear_vec_high
target/arm: Use clear_vec_high more effectively
target/arm/translate-a64.c | 69 ++++++++++++++++++--------------
tcg/tcg-op-gvec.c | 82 +++++++++++++++++++++++++++++---------
2 files changed, 101 insertions(+), 50 deletions(-)
--
2.20.1
* [PATCH 1/3] tcg: Improve vector tail clearing
From: Richard Henderson @ 2020-04-18 15:56 UTC
To: qemu-devel; +Cc: peter.maydell
Better handling of non-power-of-2 tails as seen with Arm 8-byte
vector operations.
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
tcg/tcg-op-gvec.c | 82 ++++++++++++++++++++++++++++++++++++-----------
1 file changed, 63 insertions(+), 19 deletions(-)
diff --git a/tcg/tcg-op-gvec.c b/tcg/tcg-op-gvec.c
index 5a6cc19812..43cac1a0bf 100644
--- a/tcg/tcg-op-gvec.c
+++ b/tcg/tcg-op-gvec.c
@@ -326,11 +326,34 @@ void tcg_gen_gvec_5_ptr(uint32_t dofs, uint32_t aofs, uint32_t bofs,
in units of LNSZ. This limits the expansion of inline code. */
static inline bool check_size_impl(uint32_t oprsz, uint32_t lnsz)
{
- if (oprsz % lnsz == 0) {
- uint32_t lnct = oprsz / lnsz;
- return lnct >= 1 && lnct <= MAX_UNROLL;
+ uint32_t q, r;
+
+ if (oprsz < lnsz) {
+ return false;
}
- return false;
+
+ q = oprsz / lnsz;
+ r = oprsz % lnsz;
+ tcg_debug_assert((r & 7) == 0);
+
+ if (lnsz < 16) {
+ /* For sizes below 16, accept no remainder. */
+ if (r != 0) {
+ return false;
+ }
+ } else {
+ /*
+ * Recall that ARM SVE allows vector sizes that are not a
+ * power of 2, but always a multiple of 16. The intent is
+ * that e.g. size == 80 would be expanded with 2x32 + 1x16.
+ * In addition, expand_clr needs to handle a multiple of 8.
+ * Thus we can handle the tail with one more operation per
+ * diminishing power of 2.
+ */
+ q += ctpop32(r);
+ }
+
+ return q <= MAX_UNROLL;
}
static void expand_clr(uint32_t dofs, uint32_t maxsz);
@@ -402,22 +425,31 @@ static void gen_dup_i64(unsigned vece, TCGv_i64 out, TCGv_i64 in)
static TCGType choose_vector_type(const TCGOpcode *list, unsigned vece,
uint32_t size, bool prefer_i64)
{
- if (TCG_TARGET_HAS_v256 && check_size_impl(size, 32)) {
- /*
- * Recall that ARM SVE allows vector sizes that are not a
- * power of 2, but always a multiple of 16. The intent is
- * that e.g. size == 80 would be expanded with 2x32 + 1x16.
- * It is hard to imagine a case in which v256 is supported
- * but v128 is not, but check anyway.
- */
- if (tcg_can_emit_vecop_list(list, TCG_TYPE_V256, vece)
- && (size % 32 == 0
- || tcg_can_emit_vecop_list(list, TCG_TYPE_V128, vece))) {
- return TCG_TYPE_V256;
- }
+ /*
+ * Recall that ARM SVE allows vector sizes that are not a
+ * power of 2, but always a multiple of 16. The intent is
+ * that e.g. size == 80 would be expanded with 2x32 + 1x16.
+ * It is hard to imagine a case in which v256 is supported
+ * but v128 is not, but check anyway.
+ * In addition, expand_clr needs to handle a multiple of 8.
+ */
+ if (TCG_TARGET_HAS_v256 &&
+ check_size_impl(size, 32) &&
+ tcg_can_emit_vecop_list(list, TCG_TYPE_V256, vece) &&
+ (!(size & 16) ||
+ (TCG_TARGET_HAS_v128 &&
+ tcg_can_emit_vecop_list(list, TCG_TYPE_V128, vece))) &&
+ (!(size & 8) ||
+ (TCG_TARGET_HAS_v64 &&
+ tcg_can_emit_vecop_list(list, TCG_TYPE_V64, vece)))) {
+ return TCG_TYPE_V256;
}
- if (TCG_TARGET_HAS_v128 && check_size_impl(size, 16)
- && tcg_can_emit_vecop_list(list, TCG_TYPE_V128, vece)) {
+ if (TCG_TARGET_HAS_v128 &&
+ check_size_impl(size, 16) &&
+ tcg_can_emit_vecop_list(list, TCG_TYPE_V128, vece) &&
+ (!(size & 8) ||
+ (TCG_TARGET_HAS_v64 &&
+ tcg_can_emit_vecop_list(list, TCG_TYPE_V64, vece)))) {
return TCG_TYPE_V128;
}
if (TCG_TARGET_HAS_v64 && !prefer_i64 && check_size_impl(size, 8)
@@ -432,6 +464,18 @@ static void do_dup_store(TCGType type, uint32_t dofs, uint32_t oprsz,
{
uint32_t i = 0;
+ tcg_debug_assert(oprsz >= 8);
+
+ /*
+ * This may be expand_clr for the tail of an operation, e.g.
+ * oprsz == 8 && maxsz == 64. The first 8 bytes of this store
+ * are misaligned wrt the maximum vector size, so do that first.
+ */
+ if (dofs & 8) {
+ tcg_gen_stl_vec(t_vec, cpu_env, dofs + i, TCG_TYPE_V64);
+ i += 8;
+ }
+
switch (type) {
case TCG_TYPE_V256:
/*
--
2.20.1
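As an aside, the operation count that the new check_size_impl
permits can be modelled in a few lines of standalone C (a sketch,
not QEMU code; __builtin_popcount stands in for ctpop32):

    #include <stdio.h>

    /* The aligned body takes oprsz / lnsz stores; each set bit of
       the remainder costs one more store at a diminishing power
       of 2.  Both results are then compared against MAX_UNROLL. */
    static unsigned ops_needed(unsigned oprsz, unsigned lnsz)
    {
        return oprsz / lnsz + __builtin_popcount(oprsz % lnsz);
    }

    int main(void)
    {
        printf("%u\n", ops_needed(80, 32));   /* 2x32 + 1x16       -> 3 */
        printf("%u\n", ops_needed(88, 32));   /* 2x32 + 1x16 + 1x8 -> 4 */
        return 0;
    }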
* [PATCH 2/3] target/arm: Use tcg_gen_gvec_mov for clear_vec_high
From: Richard Henderson @ 2020-04-18 15:56 UTC
To: qemu-devel; +Cc: peter.maydell
The 8-byte store for the end of a !is_q operation can be
merged with the other stores. Use a no-op vector move
to trigger the expand_clr portion of tcg_gen_gvec_mov.
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
target/arm/translate-a64.c | 10 ++--------
1 file changed, 2 insertions(+), 8 deletions(-)
diff --git a/target/arm/translate-a64.c b/target/arm/translate-a64.c
index 095638e09a..d57aa54d6a 100644
--- a/target/arm/translate-a64.c
+++ b/target/arm/translate-a64.c
@@ -513,14 +513,8 @@ static void clear_vec_high(DisasContext *s, bool is_q, int rd)
unsigned ofs = fp_reg_offset(s, rd, MO_64);
unsigned vsz = vec_full_reg_size(s);
- if (!is_q) {
- TCGv_i64 tcg_zero = tcg_const_i64(0);
- tcg_gen_st_i64(tcg_zero, cpu_env, ofs + 8);
- tcg_temp_free_i64(tcg_zero);
- }
- if (vsz > 16) {
- tcg_gen_gvec_dup_imm(MO_64, ofs + 16, vsz - 16, vsz - 16, 0);
- }
+ /* Nop move, with side effect of clearing the tail. */
+ tcg_gen_gvec_mov(MO_64, ofs, ofs, is_q ? 16 : 8, vsz);
}
void write_fp_dreg(DisasContext *s, int reg, TCGv_i64 v)
--
2.20.1
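The effect of the single call can be modelled at the byte level
of one register's backing store (plain C, not QEMU code; reg is a
hypothetical buffer of maxsz bytes):

    #include <stdint.h>
    #include <string.h>

    /* Model of tcg_gen_gvec_mov(MO_64, ofs, ofs, oprsz, maxsz)
       applied to one vector register: the self-move of bytes
       [0, oprsz) is a no-op, and expand_clr zeros [oprsz, maxsz). */
    static void model_clear_vec_high(uint8_t *reg, unsigned oprsz,
                                     unsigned maxsz)
    {
        memmove(reg, reg, oprsz);               /* the no-op move      */
        memset(reg + oprsz, 0, maxsz - oprsz);  /* the expand_clr tail */
    }

    /* is_q result:    model_clear_vec_high(reg, 16, vsz);
       64-bit result:  model_clear_vec_high(reg,  8, vsz); */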
* [PATCH 3/3] target/arm: Use clear_vec_high more effectively
From: Richard Henderson @ 2020-04-18 15:56 UTC
To: qemu-devel; +Cc: peter.maydell
Do not explicitly store zero to the NEON high part
when we can pass !is_q to clear_vec_high.
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
target/arm/translate-a64.c | 59 +++++++++++++++++++++++---------------
1 file changed, 36 insertions(+), 23 deletions(-)
diff --git a/target/arm/translate-a64.c b/target/arm/translate-a64.c
index d57aa54d6a..bf82a2e115 100644
--- a/target/arm/translate-a64.c
+++ b/target/arm/translate-a64.c
@@ -948,11 +948,10 @@ static void do_fp_ld(DisasContext *s, int destidx, TCGv_i64 tcg_addr, int size)
{
/* This always zero-extends and writes to a full 128 bit wide vector */
TCGv_i64 tmplo = tcg_temp_new_i64();
- TCGv_i64 tmphi;
+ TCGv_i64 tmphi = NULL;
if (size < 4) {
MemOp memop = s->be_data + size;
- tmphi = tcg_const_i64(0);
tcg_gen_qemu_ld_i64(tmplo, tcg_addr, get_mem_index(s), memop);
} else {
bool be = s->be_data == MO_BE;
@@ -970,12 +969,13 @@ static void do_fp_ld(DisasContext *s, int destidx, TCGv_i64 tcg_addr, int size)
}
tcg_gen_st_i64(tmplo, cpu_env, fp_reg_offset(s, destidx, MO_64));
- tcg_gen_st_i64(tmphi, cpu_env, fp_reg_hi_offset(s, destidx));
-
tcg_temp_free_i64(tmplo);
- tcg_temp_free_i64(tmphi);
- clear_vec_high(s, true, destidx);
+ if (tmphi) {
+ tcg_gen_st_i64(tmphi, cpu_env, fp_reg_hi_offset(s, destidx));
+ tcg_temp_free_i64(tmphi);
+ }
+ clear_vec_high(s, tmphi != NULL, destidx);
}
/*
@@ -6969,8 +6969,8 @@ static void disas_simd_ext(DisasContext *s, uint32_t insn)
return;
}
- tcg_resh = tcg_temp_new_i64();
tcg_resl = tcg_temp_new_i64();
+ tcg_resh = NULL;
/* Vd gets bits starting at pos bits into Vm:Vn. This is
* either extracting 128 bits from a 128:128 concatenation, or
@@ -6982,7 +6982,6 @@ static void disas_simd_ext(DisasContext *s, uint32_t insn)
read_vec_element(s, tcg_resh, rm, 0, MO_64);
do_ext64(s, tcg_resh, tcg_resl, pos);
}
- tcg_gen_movi_i64(tcg_resh, 0);
} else {
TCGv_i64 tcg_hh;
typedef struct {
@@ -6997,6 +6996,7 @@ static void disas_simd_ext(DisasContext *s, uint32_t insn)
pos -= 64;
}
+ tcg_resh = tcg_temp_new_i64();
read_vec_element(s, tcg_resl, elt->reg, elt->elt, MO_64);
elt++;
read_vec_element(s, tcg_resh, elt->reg, elt->elt, MO_64);
@@ -7012,9 +7012,12 @@ static void disas_simd_ext(DisasContext *s, uint32_t insn)
write_vec_element(s, tcg_resl, rd, 0, MO_64);
tcg_temp_free_i64(tcg_resl);
- write_vec_element(s, tcg_resh, rd, 1, MO_64);
- tcg_temp_free_i64(tcg_resh);
- clear_vec_high(s, true, rd);
+
+ if (is_q) {
+ write_vec_element(s, tcg_resh, rd, 1, MO_64);
+ tcg_temp_free_i64(tcg_resh);
+ }
+ clear_vec_high(s, is_q, rd);
}
/* TBL/TBX
@@ -7051,17 +7054,21 @@ static void disas_simd_tb(DisasContext *s, uint32_t insn)
* the input.
*/
tcg_resl = tcg_temp_new_i64();
- tcg_resh = tcg_temp_new_i64();
+ tcg_resh = NULL;
if (is_tblx) {
read_vec_element(s, tcg_resl, rd, 0, MO_64);
} else {
tcg_gen_movi_i64(tcg_resl, 0);
}
- if (is_tblx && is_q) {
- read_vec_element(s, tcg_resh, rd, 1, MO_64);
- } else {
- tcg_gen_movi_i64(tcg_resh, 0);
+
+ if (is_q) {
+ tcg_resh = tcg_temp_new_i64();
+ if (is_tblx) {
+ read_vec_element(s, tcg_resh, rd, 1, MO_64);
+ } else {
+ tcg_gen_movi_i64(tcg_resh, 0);
+ }
}
tcg_idx = tcg_temp_new_i64();
@@ -7081,9 +7088,12 @@ static void disas_simd_tb(DisasContext *s, uint32_t insn)
write_vec_element(s, tcg_resl, rd, 0, MO_64);
tcg_temp_free_i64(tcg_resl);
- write_vec_element(s, tcg_resh, rd, 1, MO_64);
- tcg_temp_free_i64(tcg_resh);
- clear_vec_high(s, true, rd);
+
+ if (is_q) {
+ write_vec_element(s, tcg_resh, rd, 1, MO_64);
+ tcg_temp_free_i64(tcg_resh);
+ }
+ clear_vec_high(s, is_q, rd);
}
/* ZIP/UZP/TRN
@@ -7120,7 +7130,7 @@ static void disas_simd_zip_trn(DisasContext *s, uint32_t insn)
}
tcg_resl = tcg_const_i64(0);
- tcg_resh = tcg_const_i64(0);
+ tcg_resh = is_q ? tcg_const_i64(0) : NULL;
tcg_res = tcg_temp_new_i64();
for (i = 0; i < elements; i++) {
@@ -7171,9 +7181,12 @@ static void disas_simd_zip_trn(DisasContext *s, uint32_t insn)
write_vec_element(s, tcg_resl, rd, 0, MO_64);
tcg_temp_free_i64(tcg_resl);
- write_vec_element(s, tcg_resh, rd, 1, MO_64);
- tcg_temp_free_i64(tcg_resh);
- clear_vec_high(s, true, rd);
+
+ if (is_q) {
+ write_vec_element(s, tcg_resh, rd, 1, MO_64);
+ tcg_temp_free_i64(tcg_resh);
+ }
+ clear_vec_high(s, is_q, rd);
}
/*
--
2.20.1
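The recurring shape of this patch, abstracted into a sketch
(hypothetical surrounding context; s, rd and is_q are assumed in
scope): the high-half temp exists only when the instruction
actually produces a high half, and clear_vec_high zeros
everything above the bytes written.

    TCGv_i64 tcg_resh = NULL;

    if (is_q) {
        tcg_resh = tcg_temp_new_i64();
        /* ... compute the high 64 bits into tcg_resh ... */
    }
    /* ... compute and write the low 64 bits ... */

    if (is_q) {
        write_vec_element(s, tcg_resh, rd, 1, MO_64);
        tcg_temp_free_i64(tcg_resh);
    }
    clear_vec_high(s, is_q, rd);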
* Re: [PATCH 1/3] tcg: Improve vector tail clearing
From: Alex Bennée @ 2020-04-20 15:25 UTC
To: Richard Henderson; +Cc: peter.maydell, qemu-devel
Richard Henderson <richard.henderson@linaro.org> writes:
> Better handling of non-power-of-2 tails as seen with Arm 8-byte
> vector operations.
>
> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
> ---
> tcg/tcg-op-gvec.c | 82 ++++++++++++++++++++++++++++++++++++-----------
> 1 file changed, 63 insertions(+), 19 deletions(-)
>
> diff --git a/tcg/tcg-op-gvec.c b/tcg/tcg-op-gvec.c
> index 5a6cc19812..43cac1a0bf 100644
> --- a/tcg/tcg-op-gvec.c
> +++ b/tcg/tcg-op-gvec.c
> @@ -326,11 +326,34 @@ void tcg_gen_gvec_5_ptr(uint32_t dofs, uint32_t aofs, uint32_t bofs,
> in units of LNSZ. This limits the expansion of inline code. */
> static inline bool check_size_impl(uint32_t oprsz, uint32_t lnsz)
> {
> - if (oprsz % lnsz == 0) {
> - uint32_t lnct = oprsz / lnsz;
> - return lnct >= 1 && lnct <= MAX_UNROLL;
> + uint32_t q, r;
> +
> + if (oprsz < lnsz) {
> + return false;
> }
> - return false;
> +
> + q = oprsz / lnsz;
> + r = oprsz % lnsz;
> + tcg_debug_assert((r & 7) == 0);
> +
> + if (lnsz < 16) {
> + /* For sizes below 16, accept no remainder. */
> + if (r != 0) {
> + return false;
> + }
> + } else {
> + /*
> + * Recall that ARM SVE allows vector sizes that are not a
> + * power of 2, but always a multiple of 16. The intent is
> + * that e.g. size == 80 would be expanded with 2x32 + 1x16.
> + * In addition, expand_clr needs to handle a multiple of 8.
> + * Thus we can handle the tail with one more operation per
> + * diminishing power of 2.
> + */
> + q += ctpop32(r);
> + }
> +
> + return q <= MAX_UNROLL;
> }
>
> static void expand_clr(uint32_t dofs, uint32_t maxsz);
> @@ -402,22 +425,31 @@ static void gen_dup_i64(unsigned vece, TCGv_i64 out, TCGv_i64 in)
> static TCGType choose_vector_type(const TCGOpcode *list, unsigned vece,
> uint32_t size, bool prefer_i64)
> {
> - if (TCG_TARGET_HAS_v256 && check_size_impl(size, 32)) {
> - /*
> - * Recall that ARM SVE allows vector sizes that are not a
> - * power of 2, but always a multiple of 16. The intent is
> - * that e.g. size == 80 would be expanded with 2x32 + 1x16.
> - * It is hard to imagine a case in which v256 is supported
> - * but v128 is not, but check anyway.
> - */
> - if (tcg_can_emit_vecop_list(list, TCG_TYPE_V256, vece)
> - && (size % 32 == 0
> - || tcg_can_emit_vecop_list(list, TCG_TYPE_V128, vece))) {
> - return TCG_TYPE_V256;
> - }
> + /*
> + * Recall that ARM SVE allows vector sizes that are not a
> + * power of 2, but always a multiple of 16. The intent is
> + * that e.g. size == 80 would be expanded with 2x32 + 1x16.
> + * It is hard to imagine a case in which v256 is supported
> + * but v128 is not, but check anyway.
> + * In addition, expand_clr needs to handle a multiple of 8.
> + */
> + if (TCG_TARGET_HAS_v256 &&
> + check_size_impl(size, 32) &&
> + tcg_can_emit_vecop_list(list, TCG_TYPE_V256, vece) &&
> + (!(size & 16) ||
> + (TCG_TARGET_HAS_v128 &&
> + tcg_can_emit_vecop_list(list, TCG_TYPE_V128, vece))) &&
> + (!(size & 8) ||
> + (TCG_TARGET_HAS_v64 &&
> + tcg_can_emit_vecop_list(list, TCG_TYPE_V64, vece)))) {
> + return TCG_TYPE_V256;
> }
> - if (TCG_TARGET_HAS_v128 && check_size_impl(size, 16)
> - && tcg_can_emit_vecop_list(list, TCG_TYPE_V128, vece)) {
> + if (TCG_TARGET_HAS_v128 &&
> + check_size_impl(size, 16) &&
> + tcg_can_emit_vecop_list(list, TCG_TYPE_V128, vece) &&
> + (!(size & 8) ||
> + (TCG_TARGET_HAS_v64 &&
> + tcg_can_emit_vecop_list(list, TCG_TYPE_V64, vece)))) {
> return TCG_TYPE_V128;
> }
> if (TCG_TARGET_HAS_v64 && !prefer_i64 && check_size_impl(size, 8)
> @@ -432,6 +464,18 @@ static void do_dup_store(TCGType type, uint32_t dofs, uint32_t oprsz,
> {
> uint32_t i = 0;
>
> + tcg_debug_assert(oprsz >= 8);
> +
> + /*
> + * This may be expand_clr for the tail of an operation, e.g.
> + * oprsz == 8 && maxsz == 64. The first 8 bytes of this store
> + * are misaligned wrt the maximum vector size, so do that first.
> + */
> + if (dofs & 8) {
> + tcg_gen_stl_vec(t_vec, cpu_env, dofs + i, TCG_TYPE_V64);
> + i += 8;
> + }
> +
> switch (type) {
> case TCG_TYPE_V256:
> /*
--
Alex Bennée
* Re: [PATCH 2/3] target/arm: Use tcg_gen_gvec_mov for clear_vec_high
From: Alex Bennée @ 2020-04-20 15:29 UTC
To: Richard Henderson; +Cc: peter.maydell, qemu-devel
Richard Henderson <richard.henderson@linaro.org> writes:
> The 8-byte store for the end of a !is_q operation can be
> merged with the other stores. Use a no-op vector move
> to trigger the expand_clr portion of tcg_gen_gvec_mov.
>
> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
> ---
> target/arm/translate-a64.c | 10 ++--------
> 1 file changed, 2 insertions(+), 8 deletions(-)
>
> diff --git a/target/arm/translate-a64.c b/target/arm/translate-a64.c
> index 095638e09a..d57aa54d6a 100644
> --- a/target/arm/translate-a64.c
> +++ b/target/arm/translate-a64.c
> @@ -513,14 +513,8 @@ static void clear_vec_high(DisasContext *s, bool is_q, int rd)
> unsigned ofs = fp_reg_offset(s, rd, MO_64);
> unsigned vsz = vec_full_reg_size(s);
>
> - if (!is_q) {
> - TCGv_i64 tcg_zero = tcg_const_i64(0);
> - tcg_gen_st_i64(tcg_zero, cpu_env, ofs + 8);
> - tcg_temp_free_i64(tcg_zero);
> - }
> - if (vsz > 16) {
> - tcg_gen_gvec_dup_imm(MO_64, ofs + 16, vsz - 16, vsz - 16, 0);
> - }
> + /* Nop move, with side effect of clearing the tail. */
> + tcg_gen_gvec_mov(MO_64, ofs, ofs, is_q ? 16 : 8, vsz);
> }
>
> void write_fp_dreg(DisasContext *s, int reg, TCGv_i64 v)
--
Alex Bennée
* Re: [PATCH 3/3] target/arm: Use clear_vec_high more effectively
From: Alex Bennée @ 2020-04-20 15:32 UTC
To: Richard Henderson; +Cc: peter.maydell, qemu-devel
Richard Henderson <richard.henderson@linaro.org> writes:
> Do not explicitly store zero to the NEON high part
> when we can pass !is_q to clear_vec_high.
>
> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
> ---
> target/arm/translate-a64.c | 59 +++++++++++++++++++++++---------------
> 1 file changed, 36 insertions(+), 23 deletions(-)
>
> diff --git a/target/arm/translate-a64.c b/target/arm/translate-a64.c
> index d57aa54d6a..bf82a2e115 100644
> --- a/target/arm/translate-a64.c
> +++ b/target/arm/translate-a64.c
> @@ -948,11 +948,10 @@ static void do_fp_ld(DisasContext *s, int destidx, TCGv_i64 tcg_addr, int size)
> {
> /* This always zero-extends and writes to a full 128 bit wide vector */
> TCGv_i64 tmplo = tcg_temp_new_i64();
> - TCGv_i64 tmphi;
> + TCGv_i64 tmphi = NULL;
>
> if (size < 4) {
> MemOp memop = s->be_data + size;
> - tmphi = tcg_const_i64(0);
> tcg_gen_qemu_ld_i64(tmplo, tcg_addr, get_mem_index(s), memop);
> } else {
> bool be = s->be_data == MO_BE;
> @@ -970,12 +969,13 @@ static void do_fp_ld(DisasContext *s, int destidx, TCGv_i64 tcg_addr, int size)
> }
>
> tcg_gen_st_i64(tmplo, cpu_env, fp_reg_offset(s, destidx, MO_64));
> - tcg_gen_st_i64(tmphi, cpu_env, fp_reg_hi_offset(s, destidx));
> -
> tcg_temp_free_i64(tmplo);
> - tcg_temp_free_i64(tmphi);
>
> - clear_vec_high(s, true, destidx);
> + if (tmphi) {
> + tcg_gen_st_i64(tmphi, cpu_env, fp_reg_hi_offset(s, destidx));
> + tcg_temp_free_i64(tmphi);
> + }
> + clear_vec_high(s, tmphi != NULL, destidx);
> }
>
> /*
> @@ -6969,8 +6969,8 @@ static void disas_simd_ext(DisasContext *s, uint32_t insn)
> return;
> }
>
> - tcg_resh = tcg_temp_new_i64();
> tcg_resl = tcg_temp_new_i64();
> + tcg_resh = NULL;
>
> /* Vd gets bits starting at pos bits into Vm:Vn. This is
> * either extracting 128 bits from a 128:128 concatenation, or
> @@ -6982,7 +6982,6 @@ static void disas_simd_ext(DisasContext *s, uint32_t insn)
> read_vec_element(s, tcg_resh, rm, 0, MO_64);
> do_ext64(s, tcg_resh, tcg_resl, pos);
> }
> - tcg_gen_movi_i64(tcg_resh, 0);
> } else {
> TCGv_i64 tcg_hh;
> typedef struct {
> @@ -6997,6 +6996,7 @@ static void disas_simd_ext(DisasContext *s, uint32_t insn)
> pos -= 64;
> }
>
> + tcg_resh = tcg_temp_new_i64();
> read_vec_element(s, tcg_resl, elt->reg, elt->elt, MO_64);
> elt++;
> read_vec_element(s, tcg_resh, elt->reg, elt->elt, MO_64);
> @@ -7012,9 +7012,12 @@ static void disas_simd_ext(DisasContext *s, uint32_t insn)
>
> write_vec_element(s, tcg_resl, rd, 0, MO_64);
> tcg_temp_free_i64(tcg_resl);
> - write_vec_element(s, tcg_resh, rd, 1, MO_64);
> - tcg_temp_free_i64(tcg_resh);
> - clear_vec_high(s, true, rd);
> +
> + if (is_q) {
> + write_vec_element(s, tcg_resh, rd, 1, MO_64);
> + tcg_temp_free_i64(tcg_resh);
> + }
> + clear_vec_high(s, is_q, rd);
> }
>
> /* TBL/TBX
> @@ -7051,17 +7054,21 @@ static void disas_simd_tb(DisasContext *s, uint32_t insn)
> * the input.
> */
> tcg_resl = tcg_temp_new_i64();
> - tcg_resh = tcg_temp_new_i64();
> + tcg_resh = NULL;
>
> if (is_tblx) {
> read_vec_element(s, tcg_resl, rd, 0, MO_64);
> } else {
> tcg_gen_movi_i64(tcg_resl, 0);
> }
> - if (is_tblx && is_q) {
> - read_vec_element(s, tcg_resh, rd, 1, MO_64);
> - } else {
> - tcg_gen_movi_i64(tcg_resh, 0);
> +
> + if (is_q) {
> + tcg_resh = tcg_temp_new_i64();
> + if (is_tblx) {
> + read_vec_element(s, tcg_resh, rd, 1, MO_64);
> + } else {
> + tcg_gen_movi_i64(tcg_resh, 0);
> + }
> }
>
> tcg_idx = tcg_temp_new_i64();
> @@ -7081,9 +7088,12 @@ static void disas_simd_tb(DisasContext *s, uint32_t insn)
>
> write_vec_element(s, tcg_resl, rd, 0, MO_64);
> tcg_temp_free_i64(tcg_resl);
> - write_vec_element(s, tcg_resh, rd, 1, MO_64);
> - tcg_temp_free_i64(tcg_resh);
> - clear_vec_high(s, true, rd);
> +
> + if (is_q) {
> + write_vec_element(s, tcg_resh, rd, 1, MO_64);
> + tcg_temp_free_i64(tcg_resh);
> + }
> + clear_vec_high(s, is_q, rd);
> }
>
> /* ZIP/UZP/TRN
> @@ -7120,7 +7130,7 @@ static void disas_simd_zip_trn(DisasContext *s, uint32_t insn)
> }
>
> tcg_resl = tcg_const_i64(0);
> - tcg_resh = tcg_const_i64(0);
> + tcg_resh = is_q ? tcg_const_i64(0) : NULL;
> tcg_res = tcg_temp_new_i64();
>
> for (i = 0; i < elements; i++) {
> @@ -7171,9 +7181,12 @@ static void disas_simd_zip_trn(DisasContext *s, uint32_t insn)
>
> write_vec_element(s, tcg_resl, rd, 0, MO_64);
> tcg_temp_free_i64(tcg_resl);
> - write_vec_element(s, tcg_resh, rd, 1, MO_64);
> - tcg_temp_free_i64(tcg_resh);
> - clear_vec_high(s, true, rd);
> +
> + if (is_q) {
> + write_vec_element(s, tcg_resh, rd, 1, MO_64);
> + tcg_temp_free_i64(tcg_resh);
> + }
> + clear_vec_high(s, is_q, rd);
> }
>
> /*
--
Alex Bennée