* [Qemu-devel] [PATCH v2 01/10] tcg: Add logical simplifications during gvec expand
From: Richard Henderson @ 2019-01-04 22:31 UTC
To: qemu-devel
We already handle many of these logical simplifications during integer expansion, and the rest of them during integer optimization.
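For reference, the cases caught here are the usual logical identities when both inputs alias (aofs == bofs). A minimal caller-side sketch of the resulting behaviour, using only the gvec entry points touched by this patch (illustration, not part of the change):

/* Illustration only: with aofs == bofs, the expanders below now emit a
 * vector move or a constant fill instead of a three-operand expansion.
 */
static void gen_example_aliased(unsigned vece, uint32_t dofs, uint32_t aofs,
                                uint32_t oprsz, uint32_t maxsz)
{
    /* d = a & a  and  d = a | a  reduce to a move.  */
    tcg_gen_gvec_and(vece, dofs, aofs, aofs, oprsz, maxsz);
    tcg_gen_gvec_or(vece, dofs, aofs, aofs, oprsz, maxsz);

    /* d = a ^ a  and  d = a & ~a  reduce to a fill with 0.  */
    tcg_gen_gvec_xor(vece, dofs, aofs, aofs, oprsz, maxsz);
    tcg_gen_gvec_andc(vece, dofs, aofs, aofs, oprsz, maxsz);

    /* d = a | ~a  reduces to a fill with -1.  */
    tcg_gen_gvec_orc(vece, dofs, aofs, aofs, oprsz, maxsz);
}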
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
tcg/tcg-op-gvec.c | 35 ++++++++++++++++++++++++++++++-----
1 file changed, 30 insertions(+), 5 deletions(-)
diff --git a/tcg/tcg-op-gvec.c b/tcg/tcg-op-gvec.c
index 61c25f5784..ec231b78fb 100644
--- a/tcg/tcg-op-gvec.c
+++ b/tcg/tcg-op-gvec.c
@@ -1840,7 +1840,12 @@ void tcg_gen_gvec_and(unsigned vece, uint32_t dofs, uint32_t aofs,
.opc = INDEX_op_and_vec,
.prefer_i64 = TCG_TARGET_REG_BITS == 64,
};
- tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
+
+ if (aofs == bofs) {
+ tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz);
+ } else {
+ tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
+ }
}
void tcg_gen_gvec_or(unsigned vece, uint32_t dofs, uint32_t aofs,
@@ -1853,7 +1858,12 @@ void tcg_gen_gvec_or(unsigned vece, uint32_t dofs, uint32_t aofs,
.opc = INDEX_op_or_vec,
.prefer_i64 = TCG_TARGET_REG_BITS == 64,
};
- tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
+
+ if (aofs == bofs) {
+ tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz);
+ } else {
+ tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
+ }
}
void tcg_gen_gvec_xor(unsigned vece, uint32_t dofs, uint32_t aofs,
@@ -1866,7 +1876,12 @@ void tcg_gen_gvec_xor(unsigned vece, uint32_t dofs, uint32_t aofs,
.opc = INDEX_op_xor_vec,
.prefer_i64 = TCG_TARGET_REG_BITS == 64,
};
- tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
+
+ if (aofs == bofs) {
+ tcg_gen_gvec_dup8i(dofs, oprsz, maxsz, 0);
+ } else {
+ tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
+ }
}
void tcg_gen_gvec_andc(unsigned vece, uint32_t dofs, uint32_t aofs,
@@ -1879,7 +1894,12 @@ void tcg_gen_gvec_andc(unsigned vece, uint32_t dofs, uint32_t aofs,
.opc = INDEX_op_andc_vec,
.prefer_i64 = TCG_TARGET_REG_BITS == 64,
};
- tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
+
+ if (aofs == bofs) {
+ tcg_gen_gvec_dup8i(dofs, oprsz, maxsz, 0);
+ } else {
+ tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
+ }
}
void tcg_gen_gvec_orc(unsigned vece, uint32_t dofs, uint32_t aofs,
@@ -1892,7 +1912,12 @@ void tcg_gen_gvec_orc(unsigned vece, uint32_t dofs, uint32_t aofs,
.opc = INDEX_op_orc_vec,
.prefer_i64 = TCG_TARGET_REG_BITS == 64,
};
- tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
+
+ if (aofs == bofs) {
+ tcg_gen_gvec_dup8i(dofs, oprsz, maxsz, -1);
+ } else {
+ tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
+ }
}
static const GVecGen2s gop_ands = {
--
2.17.2
* [Qemu-devel] [PATCH v2 02/10] tcg: Add gvec expanders for nand, nor, eqv
From: Richard Henderson @ 2019-01-04 22:31 UTC
To: qemu-devel
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
accel/tcg/tcg-runtime.h | 3 +++
tcg/tcg-op-gvec.h | 6 +++++
tcg/tcg-op.h | 3 +++
accel/tcg/tcg-runtime-gvec.c | 33 +++++++++++++++++++++++
tcg/tcg-op-gvec.c | 51 ++++++++++++++++++++++++++++++++++++
tcg/tcg-op-vec.c | 21 +++++++++++++++
6 files changed, 117 insertions(+)
diff --git a/accel/tcg/tcg-runtime.h b/accel/tcg/tcg-runtime.h
index 1bd39d136d..835ddfebb2 100644
--- a/accel/tcg/tcg-runtime.h
+++ b/accel/tcg/tcg-runtime.h
@@ -211,6 +211,9 @@ DEF_HELPER_FLAGS_4(gvec_or, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
DEF_HELPER_FLAGS_4(gvec_xor, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
DEF_HELPER_FLAGS_4(gvec_andc, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
DEF_HELPER_FLAGS_4(gvec_orc, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(gvec_nand, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(gvec_nor, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(gvec_eqv, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
DEF_HELPER_FLAGS_4(gvec_ands, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32)
DEF_HELPER_FLAGS_4(gvec_xors, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32)
diff --git a/tcg/tcg-op-gvec.h b/tcg/tcg-op-gvec.h
index ff43a29a0b..d65b9d9d4c 100644
--- a/tcg/tcg-op-gvec.h
+++ b/tcg/tcg-op-gvec.h
@@ -242,6 +242,12 @@ void tcg_gen_gvec_andc(unsigned vece, uint32_t dofs, uint32_t aofs,
uint32_t bofs, uint32_t oprsz, uint32_t maxsz);
void tcg_gen_gvec_orc(unsigned vece, uint32_t dofs, uint32_t aofs,
uint32_t bofs, uint32_t oprsz, uint32_t maxsz);
+void tcg_gen_gvec_nand(unsigned vece, uint32_t dofs, uint32_t aofs,
+ uint32_t bofs, uint32_t oprsz, uint32_t maxsz);
+void tcg_gen_gvec_nor(unsigned vece, uint32_t dofs, uint32_t aofs,
+ uint32_t bofs, uint32_t oprsz, uint32_t maxsz);
+void tcg_gen_gvec_eqv(unsigned vece, uint32_t dofs, uint32_t aofs,
+ uint32_t bofs, uint32_t oprsz, uint32_t maxsz);
void tcg_gen_gvec_andi(unsigned vece, uint32_t dofs, uint32_t aofs,
int64_t c, uint32_t oprsz, uint32_t maxsz);
diff --git a/tcg/tcg-op.h b/tcg/tcg-op.h
index 7007ec0d4d..f6ef1cd690 100644
--- a/tcg/tcg-op.h
+++ b/tcg/tcg-op.h
@@ -962,6 +962,9 @@ void tcg_gen_or_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b);
void tcg_gen_xor_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b);
void tcg_gen_andc_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b);
void tcg_gen_orc_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b);
+void tcg_gen_nand_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b);
+void tcg_gen_nor_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b);
+void tcg_gen_eqv_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b);
void tcg_gen_not_vec(unsigned vece, TCGv_vec r, TCGv_vec a);
void tcg_gen_neg_vec(unsigned vece, TCGv_vec r, TCGv_vec a);
diff --git a/accel/tcg/tcg-runtime-gvec.c b/accel/tcg/tcg-runtime-gvec.c
index 90340e56e0..d1802467d5 100644
--- a/accel/tcg/tcg-runtime-gvec.c
+++ b/accel/tcg/tcg-runtime-gvec.c
@@ -512,6 +512,39 @@ void HELPER(gvec_orc)(void *d, void *a, void *b, uint32_t desc)
clear_high(d, oprsz, desc);
}
+void HELPER(gvec_nand)(void *d, void *a, void *b, uint32_t desc)
+{
+ intptr_t oprsz = simd_oprsz(desc);
+ intptr_t i;
+
+ for (i = 0; i < oprsz; i += sizeof(vec64)) {
+ *(vec64 *)(d + i) = ~(*(vec64 *)(a + i) & *(vec64 *)(b + i));
+ }
+ clear_high(d, oprsz, desc);
+}
+
+void HELPER(gvec_nor)(void *d, void *a, void *b, uint32_t desc)
+{
+ intptr_t oprsz = simd_oprsz(desc);
+ intptr_t i;
+
+ for (i = 0; i < oprsz; i += sizeof(vec64)) {
+ *(vec64 *)(d + i) = ~(*(vec64 *)(a + i) | *(vec64 *)(b + i));
+ }
+ clear_high(d, oprsz, desc);
+}
+
+void HELPER(gvec_eqv)(void *d, void *a, void *b, uint32_t desc)
+{
+ intptr_t oprsz = simd_oprsz(desc);
+ intptr_t i;
+
+ for (i = 0; i < oprsz; i += sizeof(vec64)) {
+ *(vec64 *)(d + i) = ~(*(vec64 *)(a + i) ^ *(vec64 *)(b + i));
+ }
+ clear_high(d, oprsz, desc);
+}
+
void HELPER(gvec_ands)(void *d, void *a, uint64_t b, uint32_t desc)
{
intptr_t oprsz = simd_oprsz(desc);
diff --git a/tcg/tcg-op-gvec.c b/tcg/tcg-op-gvec.c
index ec231b78fb..81689d02f7 100644
--- a/tcg/tcg-op-gvec.c
+++ b/tcg/tcg-op-gvec.c
@@ -1920,6 +1920,57 @@ void tcg_gen_gvec_orc(unsigned vece, uint32_t dofs, uint32_t aofs,
}
}
+void tcg_gen_gvec_nand(unsigned vece, uint32_t dofs, uint32_t aofs,
+ uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
+{
+ static const GVecGen3 g = {
+ .fni8 = tcg_gen_nand_i64,
+ .fniv = tcg_gen_nand_vec,
+ .fno = gen_helper_gvec_nand,
+ .prefer_i64 = TCG_TARGET_REG_BITS == 64,
+ };
+
+ if (aofs == bofs) {
+ tcg_gen_gvec_not(vece, dofs, aofs, oprsz, maxsz);
+ } else {
+ tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
+ }
+}
+
+void tcg_gen_gvec_nor(unsigned vece, uint32_t dofs, uint32_t aofs,
+ uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
+{
+ static const GVecGen3 g = {
+ .fni8 = tcg_gen_nor_i64,
+ .fniv = tcg_gen_nor_vec,
+ .fno = gen_helper_gvec_nor,
+ .prefer_i64 = TCG_TARGET_REG_BITS == 64,
+ };
+
+ if (aofs == bofs) {
+ tcg_gen_gvec_not(vece, dofs, aofs, oprsz, maxsz);
+ } else {
+ tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
+ }
+}
+
+void tcg_gen_gvec_eqv(unsigned vece, uint32_t dofs, uint32_t aofs,
+ uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
+{
+ static const GVecGen3 g = {
+ .fni8 = tcg_gen_eqv_i64,
+ .fniv = tcg_gen_eqv_vec,
+ .fno = gen_helper_gvec_eqv,
+ .prefer_i64 = TCG_TARGET_REG_BITS == 64,
+ };
+
+ if (aofs == bofs) {
+ tcg_gen_gvec_dup8i(dofs, oprsz, maxsz, -1);
+ } else {
+ tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
+ }
+}
+
static const GVecGen2s gop_ands = {
.fni8 = tcg_gen_and_i64,
.fniv = tcg_gen_and_vec,
diff --git a/tcg/tcg-op-vec.c b/tcg/tcg-op-vec.c
index cefba3d185..d77fdf7c1d 100644
--- a/tcg/tcg-op-vec.c
+++ b/tcg/tcg-op-vec.c
@@ -275,6 +275,27 @@ void tcg_gen_orc_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b)
}
}
+void tcg_gen_nand_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b)
+{
+ /* TODO: Add TCG_TARGET_HAS_nand_vec when adding a backend that supports it. */
+ tcg_gen_and_vec(0, r, a, b);
+ tcg_gen_not_vec(0, r, r);
+}
+
+void tcg_gen_nor_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b)
+{
+ /* TODO: Add TCG_TARGET_HAS_nor_vec when adding a backend that supports it. */
+ tcg_gen_or_vec(0, r, a, b);
+ tcg_gen_not_vec(0, r, r);
+}
+
+void tcg_gen_eqv_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b)
+{
+ /* TODO: Add TCG_TARGET_HAS_eqv_vec when adding a backend that supports it. */
+ tcg_gen_xor_vec(0, r, a, b);
+ tcg_gen_not_vec(0, r, r);
+}
+
void tcg_gen_not_vec(unsigned vece, TCGv_vec r, TCGv_vec a)
{
if (TCG_TARGET_HAS_not_vec) {
--
2.17.2
* [Qemu-devel] [PATCH v2 03/10] tcg: Add write_aofs to GVecGen4
From: Richard Henderson @ 2019-01-04 22:31 UTC
To: qemu-devel
This allows writing operations with two outputs and three inputs.
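A hypothetical use of the new flag (not taken from this series; the names below are made up for illustration): an element operation that produces a result in the destination and also accumulates a flag into the first source operand, which tcg_gen_gvec_4 now stores back because .write_aofs is set.

/* Hypothetical example: d = b + c, with the unsigned carry-out OR-ed into a.
 * With .write_aofs = true, the expander stores the updated 'a' back to aofs
 * for every element, in addition to storing 'd' to dofs.
 */
static void gen_add_carry_acc_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b, TCGv_i64 c)
{
    TCGv_i64 t = tcg_temp_new_i64();

    tcg_gen_add_i64(d, b, c);
    tcg_gen_setcond_i64(TCG_COND_LTU, t, d, b);   /* carry out of b + c */
    tcg_gen_or_i64(a, a, t);                      /* accumulate into a */
    tcg_temp_free_i64(t);
}

static const GVecGen4 g_add_carry_acc = {
    .fni8 = gen_add_carry_acc_i64,
    .fno = gen_helper_add_carry_acc,   /* hypothetical out-of-line helper */
    .vece = MO_64,
    .write_aofs = true,
};

Such an operation would be expanded with tcg_gen_gvec_4(dofs, aofs, bofs, cofs, oprsz, maxsz, &g_add_carry_acc).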
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
tcg/tcg-op-gvec.h | 2 ++
tcg/tcg-op-gvec.c | 27 +++++++++++++++++++--------
2 files changed, 21 insertions(+), 8 deletions(-)
diff --git a/tcg/tcg-op-gvec.h b/tcg/tcg-op-gvec.h
index d65b9d9d4c..2cb447112e 100644
--- a/tcg/tcg-op-gvec.h
+++ b/tcg/tcg-op-gvec.h
@@ -181,6 +181,8 @@ typedef struct {
uint8_t vece;
/* Prefer i64 to v64. */
bool prefer_i64;
+ /* Write aofs as a 2nd dest operand. */
+ bool write_aofs;
} GVecGen4;
void tcg_gen_gvec_2(uint32_t dofs, uint32_t aofs,
diff --git a/tcg/tcg-op-gvec.c b/tcg/tcg-op-gvec.c
index 81689d02f7..c10d3d7b26 100644
--- a/tcg/tcg-op-gvec.c
+++ b/tcg/tcg-op-gvec.c
@@ -665,7 +665,7 @@ static void expand_3_i32(uint32_t dofs, uint32_t aofs,
/* Expand OPSZ bytes worth of three-operand operations using i32 elements. */
static void expand_4_i32(uint32_t dofs, uint32_t aofs, uint32_t bofs,
- uint32_t cofs, uint32_t oprsz,
+ uint32_t cofs, uint32_t oprsz, bool write_aofs,
void (*fni)(TCGv_i32, TCGv_i32, TCGv_i32, TCGv_i32))
{
TCGv_i32 t0 = tcg_temp_new_i32();
@@ -680,6 +680,9 @@ static void expand_4_i32(uint32_t dofs, uint32_t aofs, uint32_t bofs,
tcg_gen_ld_i32(t3, cpu_env, cofs + i);
fni(t0, t1, t2, t3);
tcg_gen_st_i32(t0, cpu_env, dofs + i);
+ if (write_aofs) {
+ tcg_gen_st_i32(t1, cpu_env, aofs + i);
+ }
}
tcg_temp_free_i32(t3);
tcg_temp_free_i32(t2);
@@ -769,7 +772,7 @@ static void expand_3_i64(uint32_t dofs, uint32_t aofs,
/* Expand OPSZ bytes worth of three-operand operations using i64 elements. */
static void expand_4_i64(uint32_t dofs, uint32_t aofs, uint32_t bofs,
- uint32_t cofs, uint32_t oprsz,
+ uint32_t cofs, uint32_t oprsz, bool write_aofs,
void (*fni)(TCGv_i64, TCGv_i64, TCGv_i64, TCGv_i64))
{
TCGv_i64 t0 = tcg_temp_new_i64();
@@ -784,6 +787,9 @@ static void expand_4_i64(uint32_t dofs, uint32_t aofs, uint32_t bofs,
tcg_gen_ld_i64(t3, cpu_env, cofs + i);
fni(t0, t1, t2, t3);
tcg_gen_st_i64(t0, cpu_env, dofs + i);
+ if (write_aofs) {
+ tcg_gen_st_i64(t1, cpu_env, aofs + i);
+ }
}
tcg_temp_free_i64(t3);
tcg_temp_free_i64(t2);
@@ -880,7 +886,7 @@ static void expand_3_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
/* Expand OPSZ bytes worth of four-operand operations using host vectors. */
static void expand_4_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
uint32_t bofs, uint32_t cofs, uint32_t oprsz,
- uint32_t tysz, TCGType type,
+ uint32_t tysz, TCGType type, bool write_aofs,
void (*fni)(unsigned, TCGv_vec, TCGv_vec,
TCGv_vec, TCGv_vec))
{
@@ -896,6 +902,9 @@ static void expand_4_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
tcg_gen_ld_vec(t3, cpu_env, cofs + i);
fni(vece, t0, t1, t2, t3);
tcg_gen_st_vec(t0, cpu_env, dofs + i);
+ if (write_aofs) {
+ tcg_gen_st_vec(t1, cpu_env, aofs + i);
+ }
}
tcg_temp_free_vec(t3);
tcg_temp_free_vec(t2);
@@ -1187,7 +1196,7 @@ void tcg_gen_gvec_4(uint32_t dofs, uint32_t aofs, uint32_t bofs, uint32_t cofs,
*/
some = QEMU_ALIGN_DOWN(oprsz, 32);
expand_4_vec(g->vece, dofs, aofs, bofs, cofs, some,
- 32, TCG_TYPE_V256, g->fniv);
+ 32, TCG_TYPE_V256, g->write_aofs, g->fniv);
if (some == oprsz) {
break;
}
@@ -1200,18 +1209,20 @@ void tcg_gen_gvec_4(uint32_t dofs, uint32_t aofs, uint32_t bofs, uint32_t cofs,
/* fallthru */
case TCG_TYPE_V128:
expand_4_vec(g->vece, dofs, aofs, bofs, cofs, oprsz,
- 16, TCG_TYPE_V128, g->fniv);
+ 16, TCG_TYPE_V128, g->write_aofs, g->fniv);
break;
case TCG_TYPE_V64:
expand_4_vec(g->vece, dofs, aofs, bofs, cofs, oprsz,
- 8, TCG_TYPE_V64, g->fniv);
+ 8, TCG_TYPE_V64, g->write_aofs, g->fniv);
break;
case 0:
if (g->fni8 && check_size_impl(oprsz, 8)) {
- expand_4_i64(dofs, aofs, bofs, cofs, oprsz, g->fni8);
+ expand_4_i64(dofs, aofs, bofs, cofs, oprsz,
+ g->write_aofs, g->fni8);
} else if (g->fni4 && check_size_impl(oprsz, 4)) {
- expand_4_i32(dofs, aofs, bofs, cofs, oprsz, g->fni4);
+ expand_4_i32(dofs, aofs, bofs, cofs, oprsz,
+ g->write_aofs, g->fni4);
} else {
assert(g->fno != NULL);
tcg_gen_gvec_4_ool(dofs, aofs, bofs, cofs,
--
2.17.2
* [Qemu-devel] [PATCH v2 04/10] tcg: Add opcodes for vector saturated arithmetic
From: Richard Henderson @ 2019-01-04 22:31 UTC
To: qemu-devel
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
tcg/aarch64/tcg-target.h | 1 +
tcg/i386/tcg-target.h | 1 +
tcg/tcg-op.h | 4 ++
tcg/tcg-opc.h | 4 ++
tcg/tcg.h | 1 +
tcg/tcg-op-gvec.c | 84 ++++++++++++++++++++++++++++++----------
tcg/tcg-op-vec.c | 34 ++++++++++++++--
tcg/tcg.c | 5 +++
8 files changed, 110 insertions(+), 24 deletions(-)
diff --git a/tcg/aarch64/tcg-target.h b/tcg/aarch64/tcg-target.h
index f966a4fcb3..98556bcf22 100644
--- a/tcg/aarch64/tcg-target.h
+++ b/tcg/aarch64/tcg-target.h
@@ -135,6 +135,7 @@ typedef enum {
#define TCG_TARGET_HAS_shv_vec 0
#define TCG_TARGET_HAS_cmp_vec 1
#define TCG_TARGET_HAS_mul_vec 1
+#define TCG_TARGET_HAS_sat_vec 0
#define TCG_TARGET_DEFAULT_MO (0)
#define TCG_TARGET_HAS_MEMORY_BSWAP 1
diff --git a/tcg/i386/tcg-target.h b/tcg/i386/tcg-target.h
index f378d29568..44381062e6 100644
--- a/tcg/i386/tcg-target.h
+++ b/tcg/i386/tcg-target.h
@@ -185,6 +185,7 @@ extern bool have_avx2;
#define TCG_TARGET_HAS_shv_vec 0
#define TCG_TARGET_HAS_cmp_vec 1
#define TCG_TARGET_HAS_mul_vec 1
+#define TCG_TARGET_HAS_sat_vec 0
#define TCG_TARGET_deposit_i32_valid(ofs, len) \
(((ofs) == 0 && (len) == 8) || ((ofs) == 8 && (len) == 8) || \
diff --git a/tcg/tcg-op.h b/tcg/tcg-op.h
index f6ef1cd690..4a93d730e8 100644
--- a/tcg/tcg-op.h
+++ b/tcg/tcg-op.h
@@ -967,6 +967,10 @@ void tcg_gen_nor_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b);
void tcg_gen_eqv_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b);
void tcg_gen_not_vec(unsigned vece, TCGv_vec r, TCGv_vec a);
void tcg_gen_neg_vec(unsigned vece, TCGv_vec r, TCGv_vec a);
+void tcg_gen_ssadd_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b);
+void tcg_gen_usadd_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b);
+void tcg_gen_sssub_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b);
+void tcg_gen_ussub_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b);
void tcg_gen_shli_vec(unsigned vece, TCGv_vec r, TCGv_vec a, int64_t i);
void tcg_gen_shri_vec(unsigned vece, TCGv_vec r, TCGv_vec a, int64_t i);
diff --git a/tcg/tcg-opc.h b/tcg/tcg-opc.h
index 7a8a3edb5b..94b2ed80af 100644
--- a/tcg/tcg-opc.h
+++ b/tcg/tcg-opc.h
@@ -222,6 +222,10 @@ DEF(add_vec, 1, 2, 0, IMPLVEC)
DEF(sub_vec, 1, 2, 0, IMPLVEC)
DEF(mul_vec, 1, 2, 0, IMPLVEC | IMPL(TCG_TARGET_HAS_mul_vec))
DEF(neg_vec, 1, 1, 0, IMPLVEC | IMPL(TCG_TARGET_HAS_neg_vec))
+DEF(ssadd_vec, 1, 2, 0, IMPLVEC | IMPL(TCG_TARGET_HAS_sat_vec))
+DEF(usadd_vec, 1, 2, 0, IMPLVEC | IMPL(TCG_TARGET_HAS_sat_vec))
+DEF(sssub_vec, 1, 2, 0, IMPLVEC | IMPL(TCG_TARGET_HAS_sat_vec))
+DEF(ussub_vec, 1, 2, 0, IMPLVEC | IMPL(TCG_TARGET_HAS_sat_vec))
DEF(and_vec, 1, 2, 0, IMPLVEC)
DEF(or_vec, 1, 2, 0, IMPLVEC)
diff --git a/tcg/tcg.h b/tcg/tcg.h
index 3a629991ca..df24afa425 100644
--- a/tcg/tcg.h
+++ b/tcg/tcg.h
@@ -183,6 +183,7 @@ typedef uint64_t TCGRegSet;
#define TCG_TARGET_HAS_shs_vec 0
#define TCG_TARGET_HAS_shv_vec 0
#define TCG_TARGET_HAS_mul_vec 0
+#define TCG_TARGET_HAS_sat_vec 0
#else
#define TCG_TARGET_MAYBE_vec 1
#endif
diff --git a/tcg/tcg-op-gvec.c b/tcg/tcg-op-gvec.c
index c10d3d7b26..0a33f51065 100644
--- a/tcg/tcg-op-gvec.c
+++ b/tcg/tcg-op-gvec.c
@@ -1678,10 +1678,22 @@ void tcg_gen_gvec_ssadd(unsigned vece, uint32_t dofs, uint32_t aofs,
uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
{
static const GVecGen3 g[4] = {
- { .fno = gen_helper_gvec_ssadd8, .vece = MO_8 },
- { .fno = gen_helper_gvec_ssadd16, .vece = MO_16 },
- { .fno = gen_helper_gvec_ssadd32, .vece = MO_32 },
- { .fno = gen_helper_gvec_ssadd64, .vece = MO_64 }
+ { .fniv = tcg_gen_ssadd_vec,
+ .fno = gen_helper_gvec_ssadd8,
+ .opc = INDEX_op_ssadd_vec,
+ .vece = MO_8 },
+ { .fniv = tcg_gen_ssadd_vec,
+ .fno = gen_helper_gvec_ssadd16,
+ .opc = INDEX_op_ssadd_vec,
+ .vece = MO_16 },
+ { .fniv = tcg_gen_ssadd_vec,
+ .fno = gen_helper_gvec_ssadd32,
+ .opc = INDEX_op_ssadd_vec,
+ .vece = MO_32 },
+ { .fniv = tcg_gen_ssadd_vec,
+ .fno = gen_helper_gvec_ssadd64,
+ .opc = INDEX_op_ssadd_vec,
+ .vece = MO_64 },
};
tcg_debug_assert(vece <= MO_64);
tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
@@ -1691,16 +1703,28 @@ void tcg_gen_gvec_sssub(unsigned vece, uint32_t dofs, uint32_t aofs,
uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
{
static const GVecGen3 g[4] = {
- { .fno = gen_helper_gvec_sssub8, .vece = MO_8 },
- { .fno = gen_helper_gvec_sssub16, .vece = MO_16 },
- { .fno = gen_helper_gvec_sssub32, .vece = MO_32 },
- { .fno = gen_helper_gvec_sssub64, .vece = MO_64 }
+ { .fniv = tcg_gen_sssub_vec,
+ .fno = gen_helper_gvec_sssub8,
+ .opc = INDEX_op_sssub_vec,
+ .vece = MO_8 },
+ { .fniv = tcg_gen_sssub_vec,
+ .fno = gen_helper_gvec_sssub16,
+ .opc = INDEX_op_sssub_vec,
+ .vece = MO_16 },
+ { .fniv = tcg_gen_sssub_vec,
+ .fno = gen_helper_gvec_sssub32,
+ .opc = INDEX_op_sssub_vec,
+ .vece = MO_32 },
+ { .fniv = tcg_gen_sssub_vec,
+ .fno = gen_helper_gvec_sssub64,
+ .opc = INDEX_op_sssub_vec,
+ .vece = MO_64 },
};
tcg_debug_assert(vece <= MO_64);
tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
}
-static void tcg_gen_vec_usadd32_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
+static void tcg_gen_usadd_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
{
TCGv_i32 max = tcg_const_i32(-1);
tcg_gen_add_i32(d, a, b);
@@ -1708,7 +1732,7 @@ static void tcg_gen_vec_usadd32_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
tcg_temp_free_i32(max);
}
-static void tcg_gen_vec_usadd32_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
+static void tcg_gen_usadd_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
TCGv_i64 max = tcg_const_i64(-1);
tcg_gen_add_i64(d, a, b);
@@ -1720,20 +1744,30 @@ void tcg_gen_gvec_usadd(unsigned vece, uint32_t dofs, uint32_t aofs,
uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
{
static const GVecGen3 g[4] = {
- { .fno = gen_helper_gvec_usadd8, .vece = MO_8 },
- { .fno = gen_helper_gvec_usadd16, .vece = MO_16 },
- { .fni4 = tcg_gen_vec_usadd32_i32,
+ { .fniv = tcg_gen_usadd_vec,
+ .fno = gen_helper_gvec_usadd8,
+ .opc = INDEX_op_usadd_vec,
+ .vece = MO_8 },
+ { .fniv = tcg_gen_usadd_vec,
+ .fno = gen_helper_gvec_usadd16,
+ .opc = INDEX_op_usadd_vec,
+ .vece = MO_16 },
+ { .fni4 = tcg_gen_usadd_i32,
+ .fniv = tcg_gen_usadd_vec,
.fno = gen_helper_gvec_usadd32,
+ .opc = INDEX_op_usadd_vec,
.vece = MO_32 },
- { .fni8 = tcg_gen_vec_usadd32_i64,
+ { .fni8 = tcg_gen_usadd_i64,
+ .fniv = tcg_gen_usadd_vec,
.fno = gen_helper_gvec_usadd64,
+ .opc = INDEX_op_usadd_vec,
.vece = MO_64 }
};
tcg_debug_assert(vece <= MO_64);
tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
}
-static void tcg_gen_vec_ussub32_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
+static void tcg_gen_ussub_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
{
TCGv_i32 min = tcg_const_i32(0);
tcg_gen_sub_i32(d, a, b);
@@ -1741,7 +1775,7 @@ static void tcg_gen_vec_ussub32_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
tcg_temp_free_i32(min);
}
-static void tcg_gen_vec_ussub32_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
+static void tcg_gen_ussub_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
TCGv_i64 min = tcg_const_i64(0);
tcg_gen_sub_i64(d, a, b);
@@ -1753,13 +1787,23 @@ void tcg_gen_gvec_ussub(unsigned vece, uint32_t dofs, uint32_t aofs,
uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
{
static const GVecGen3 g[4] = {
- { .fno = gen_helper_gvec_ussub8, .vece = MO_8 },
- { .fno = gen_helper_gvec_ussub16, .vece = MO_16 },
- { .fni4 = tcg_gen_vec_ussub32_i32,
+ { .fniv = tcg_gen_ussub_vec,
+ .fno = gen_helper_gvec_ussub8,
+ .opc = INDEX_op_ussub_vec,
+ .vece = MO_8 },
+ { .fniv = tcg_gen_ussub_vec,
+ .fno = gen_helper_gvec_ussub16,
+ .opc = INDEX_op_ussub_vec,
+ .vece = MO_16 },
+ { .fni4 = tcg_gen_ussub_i32,
+ .fniv = tcg_gen_ussub_vec,
.fno = gen_helper_gvec_ussub32,
+ .opc = INDEX_op_ussub_vec,
.vece = MO_32 },
- { .fni8 = tcg_gen_vec_ussub32_i64,
+ { .fni8 = tcg_gen_ussub_i64,
+ .fniv = tcg_gen_ussub_vec,
.fno = gen_helper_gvec_ussub64,
+ .opc = INDEX_op_ussub_vec,
.vece = MO_64 }
};
tcg_debug_assert(vece <= MO_64);
diff --git a/tcg/tcg-op-vec.c b/tcg/tcg-op-vec.c
index d77fdf7c1d..675aa09258 100644
--- a/tcg/tcg-op-vec.c
+++ b/tcg/tcg-op-vec.c
@@ -386,7 +386,8 @@ void tcg_gen_cmp_vec(TCGCond cond, unsigned vece,
}
}
-void tcg_gen_mul_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b)
+static void do_op3(unsigned vece, TCGv_vec r, TCGv_vec a,
+ TCGv_vec b, TCGOpcode opc)
{
TCGTemp *rt = tcgv_vec_temp(r);
TCGTemp *at = tcgv_vec_temp(a);
@@ -399,11 +400,36 @@ void tcg_gen_mul_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b)
tcg_debug_assert(at->base_type >= type);
tcg_debug_assert(bt->base_type >= type);
- can = tcg_can_emit_vec_op(INDEX_op_mul_vec, type, vece);
+ can = tcg_can_emit_vec_op(opc, type, vece);
if (can > 0) {
- vec_gen_3(INDEX_op_mul_vec, type, vece, ri, ai, bi);
+ vec_gen_3(opc, type, vece, ri, ai, bi);
} else {
tcg_debug_assert(can < 0);
- tcg_expand_vec_op(INDEX_op_mul_vec, type, vece, ri, ai, bi);
+ tcg_expand_vec_op(opc, type, vece, ri, ai, bi);
}
}
+
+void tcg_gen_mul_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b)
+{
+ do_op3(vece, r, a, b, INDEX_op_mul_vec);
+}
+
+void tcg_gen_ssadd_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b)
+{
+ do_op3(vece, r, a, b, INDEX_op_ssadd_vec);
+}
+
+void tcg_gen_usadd_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b)
+{
+ do_op3(vece, r, a, b, INDEX_op_usadd_vec);
+}
+
+void tcg_gen_sssub_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b)
+{
+ do_op3(vece, r, a, b, INDEX_op_sssub_vec);
+}
+
+void tcg_gen_ussub_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b)
+{
+ do_op3(vece, r, a, b, INDEX_op_ussub_vec);
+}
diff --git a/tcg/tcg.c b/tcg/tcg.c
index c54b119020..15ed5af007 100644
--- a/tcg/tcg.c
+++ b/tcg/tcg.c
@@ -1607,6 +1607,11 @@ bool tcg_op_supported(TCGOpcode op)
case INDEX_op_shrv_vec:
case INDEX_op_sarv_vec:
return have_vec && TCG_TARGET_HAS_shv_vec;
+ case INDEX_op_ssadd_vec:
+ case INDEX_op_usadd_vec:
+ case INDEX_op_sssub_vec:
+ case INDEX_op_ussub_vec:
+ return have_vec && TCG_TARGET_HAS_sat_vec;
default:
tcg_debug_assert(op > INDEX_op_last_generic && op < NB_OPS);
--
2.17.2
* [Qemu-devel] [PATCH v2 05/10] tcg: Add opcodes for vector minmax arithmetic
From: Richard Henderson @ 2019-01-04 22:31 UTC
To: qemu-devel
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
accel/tcg/tcg-runtime.h | 20 ++++
tcg/aarch64/tcg-target.h | 1 +
tcg/i386/tcg-target.h | 1 +
tcg/tcg-op-gvec.h | 10 ++
tcg/tcg-op.h | 4 +
tcg/tcg-opc.h | 4 +
tcg/tcg.h | 1 +
accel/tcg/tcg-runtime-gvec.c | 224 +++++++++++++++++++++++++++++++++++
tcg/tcg-op-gvec.c | 108 +++++++++++++++++
tcg/tcg-op-vec.c | 20 ++++
tcg/tcg.c | 5 +
11 files changed, 398 insertions(+)
diff --git a/accel/tcg/tcg-runtime.h b/accel/tcg/tcg-runtime.h
index 835ddfebb2..dfe325625c 100644
--- a/accel/tcg/tcg-runtime.h
+++ b/accel/tcg/tcg-runtime.h
@@ -200,6 +200,26 @@ DEF_HELPER_FLAGS_4(gvec_ussub16, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
DEF_HELPER_FLAGS_4(gvec_ussub32, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
DEF_HELPER_FLAGS_4(gvec_ussub64, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(gvec_smin8, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(gvec_smin16, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(gvec_smin32, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(gvec_smin64, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_4(gvec_smax8, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(gvec_smax16, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(gvec_smax32, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(gvec_smax64, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_4(gvec_umin8, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(gvec_umin16, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(gvec_umin32, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(gvec_umin64, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_4(gvec_umax8, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(gvec_umax16, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(gvec_umax32, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(gvec_umax64, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+
DEF_HELPER_FLAGS_3(gvec_neg8, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
DEF_HELPER_FLAGS_3(gvec_neg16, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
DEF_HELPER_FLAGS_3(gvec_neg32, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
diff --git a/tcg/aarch64/tcg-target.h b/tcg/aarch64/tcg-target.h
index 98556bcf22..545a6eec75 100644
--- a/tcg/aarch64/tcg-target.h
+++ b/tcg/aarch64/tcg-target.h
@@ -136,6 +136,7 @@ typedef enum {
#define TCG_TARGET_HAS_cmp_vec 1
#define TCG_TARGET_HAS_mul_vec 1
#define TCG_TARGET_HAS_sat_vec 0
+#define TCG_TARGET_HAS_minmax_vec 0
#define TCG_TARGET_DEFAULT_MO (0)
#define TCG_TARGET_HAS_MEMORY_BSWAP 1
diff --git a/tcg/i386/tcg-target.h b/tcg/i386/tcg-target.h
index 44381062e6..7bd7eae672 100644
--- a/tcg/i386/tcg-target.h
+++ b/tcg/i386/tcg-target.h
@@ -186,6 +186,7 @@ extern bool have_avx2;
#define TCG_TARGET_HAS_cmp_vec 1
#define TCG_TARGET_HAS_mul_vec 1
#define TCG_TARGET_HAS_sat_vec 0
+#define TCG_TARGET_HAS_minmax_vec 0
#define TCG_TARGET_deposit_i32_valid(ofs, len) \
(((ofs) == 0 && (len) == 8) || ((ofs) == 8 && (len) == 8) || \
diff --git a/tcg/tcg-op-gvec.h b/tcg/tcg-op-gvec.h
index 2cb447112e..4734eef7de 100644
--- a/tcg/tcg-op-gvec.h
+++ b/tcg/tcg-op-gvec.h
@@ -234,6 +234,16 @@ void tcg_gen_gvec_usadd(unsigned vece, uint32_t dofs, uint32_t aofs,
void tcg_gen_gvec_ussub(unsigned vece, uint32_t dofs, uint32_t aofs,
uint32_t bofs, uint32_t oprsz, uint32_t maxsz);
+/* Min/max. */
+void tcg_gen_gvec_smin(unsigned vece, uint32_t dofs, uint32_t aofs,
+ uint32_t bofs, uint32_t oprsz, uint32_t maxsz);
+void tcg_gen_gvec_umin(unsigned vece, uint32_t dofs, uint32_t aofs,
+ uint32_t bofs, uint32_t oprsz, uint32_t maxsz);
+void tcg_gen_gvec_smax(unsigned vece, uint32_t dofs, uint32_t aofs,
+ uint32_t bofs, uint32_t oprsz, uint32_t maxsz);
+void tcg_gen_gvec_umax(unsigned vece, uint32_t dofs, uint32_t aofs,
+ uint32_t bofs, uint32_t oprsz, uint32_t maxsz);
+
void tcg_gen_gvec_and(unsigned vece, uint32_t dofs, uint32_t aofs,
uint32_t bofs, uint32_t oprsz, uint32_t maxsz);
void tcg_gen_gvec_or(unsigned vece, uint32_t dofs, uint32_t aofs,
diff --git a/tcg/tcg-op.h b/tcg/tcg-op.h
index 4a93d730e8..2d98868d8f 100644
--- a/tcg/tcg-op.h
+++ b/tcg/tcg-op.h
@@ -971,6 +971,10 @@ void tcg_gen_ssadd_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b);
void tcg_gen_usadd_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b);
void tcg_gen_sssub_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b);
void tcg_gen_ussub_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b);
+void tcg_gen_smin_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b);
+void tcg_gen_umin_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b);
+void tcg_gen_smax_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b);
+void tcg_gen_umax_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b);
void tcg_gen_shli_vec(unsigned vece, TCGv_vec r, TCGv_vec a, int64_t i);
void tcg_gen_shri_vec(unsigned vece, TCGv_vec r, TCGv_vec a, int64_t i);
diff --git a/tcg/tcg-opc.h b/tcg/tcg-opc.h
index 94b2ed80af..4e0238ad1a 100644
--- a/tcg/tcg-opc.h
+++ b/tcg/tcg-opc.h
@@ -226,6 +226,10 @@ DEF(ssadd_vec, 1, 2, 0, IMPLVEC | IMPL(TCG_TARGET_HAS_sat_vec))
DEF(usadd_vec, 1, 2, 0, IMPLVEC | IMPL(TCG_TARGET_HAS_sat_vec))
DEF(sssub_vec, 1, 2, 0, IMPLVEC | IMPL(TCG_TARGET_HAS_sat_vec))
DEF(ussub_vec, 1, 2, 0, IMPLVEC | IMPL(TCG_TARGET_HAS_sat_vec))
+DEF(smin_vec, 1, 2, 0, IMPLVEC | IMPL(TCG_TARGET_HAS_minmax_vec))
+DEF(umin_vec, 1, 2, 0, IMPLVEC | IMPL(TCG_TARGET_HAS_minmax_vec))
+DEF(smax_vec, 1, 2, 0, IMPLVEC | IMPL(TCG_TARGET_HAS_minmax_vec))
+DEF(umax_vec, 1, 2, 0, IMPLVEC | IMPL(TCG_TARGET_HAS_minmax_vec))
DEF(and_vec, 1, 2, 0, IMPLVEC)
DEF(or_vec, 1, 2, 0, IMPLVEC)
diff --git a/tcg/tcg.h b/tcg/tcg.h
index df24afa425..1c3579077d 100644
--- a/tcg/tcg.h
+++ b/tcg/tcg.h
@@ -184,6 +184,7 @@ typedef uint64_t TCGRegSet;
#define TCG_TARGET_HAS_shv_vec 0
#define TCG_TARGET_HAS_mul_vec 0
#define TCG_TARGET_HAS_sat_vec 0
+#define TCG_TARGET_HAS_minmax_vec 0
#else
#define TCG_TARGET_MAYBE_vec 1
#endif
diff --git a/accel/tcg/tcg-runtime-gvec.c b/accel/tcg/tcg-runtime-gvec.c
index d1802467d5..9358749741 100644
--- a/accel/tcg/tcg-runtime-gvec.c
+++ b/accel/tcg/tcg-runtime-gvec.c
@@ -1028,3 +1028,227 @@ void HELPER(gvec_ussub64)(void *d, void *a, void *b, uint32_t desc)
}
clear_high(d, oprsz, desc);
}
+
+void HELPER(gvec_smin8)(void *d, void *a, void *b, uint32_t desc)
+{
+ intptr_t oprsz = simd_oprsz(desc);
+ intptr_t i;
+
+ for (i = 0; i < oprsz; i += sizeof(int8_t)) {
+ int8_t aa = *(int8_t *)(a + i);
+ int8_t bb = *(int8_t *)(b + i);
+ int8_t dd = aa < bb ? aa : bb;
+ *(int8_t *)(d + i) = dd;
+ }
+ clear_high(d, oprsz, desc);
+}
+
+void HELPER(gvec_smin16)(void *d, void *a, void *b, uint32_t desc)
+{
+ intptr_t oprsz = simd_oprsz(desc);
+ intptr_t i;
+
+ for (i = 0; i < oprsz; i += sizeof(int16_t)) {
+ int16_t aa = *(int16_t *)(a + i);
+ int16_t bb = *(int16_t *)(b + i);
+ int16_t dd = aa < bb ? aa : bb;
+ *(int16_t *)(d + i) = dd;
+ }
+ clear_high(d, oprsz, desc);
+}
+
+void HELPER(gvec_smin32)(void *d, void *a, void *b, uint32_t desc)
+{
+ intptr_t oprsz = simd_oprsz(desc);
+ intptr_t i;
+
+ for (i = 0; i < oprsz; i += sizeof(int32_t)) {
+ int32_t aa = *(int32_t *)(a + i);
+ int32_t bb = *(int32_t *)(b + i);
+ int32_t dd = aa < bb ? aa : bb;
+ *(int32_t *)(d + i) = dd;
+ }
+ clear_high(d, oprsz, desc);
+}
+
+void HELPER(gvec_smin64)(void *d, void *a, void *b, uint32_t desc)
+{
+ intptr_t oprsz = simd_oprsz(desc);
+ intptr_t i;
+
+ for (i = 0; i < oprsz; i += sizeof(int64_t)) {
+ int64_t aa = *(int64_t *)(a + i);
+ int64_t bb = *(int64_t *)(b + i);
+ int64_t dd = aa < bb ? aa : bb;
+ *(int64_t *)(d + i) = dd;
+ }
+ clear_high(d, oprsz, desc);
+}
+
+void HELPER(gvec_smax8)(void *d, void *a, void *b, uint32_t desc)
+{
+ intptr_t oprsz = simd_oprsz(desc);
+ intptr_t i;
+
+ for (i = 0; i < oprsz; i += sizeof(int8_t)) {
+ int8_t aa = *(int8_t *)(a + i);
+ int8_t bb = *(int8_t *)(b + i);
+ int8_t dd = aa > bb ? aa : bb;
+ *(int8_t *)(d + i) = dd;
+ }
+ clear_high(d, oprsz, desc);
+}
+
+void HELPER(gvec_smax16)(void *d, void *a, void *b, uint32_t desc)
+{
+ intptr_t oprsz = simd_oprsz(desc);
+ intptr_t i;
+
+ for (i = 0; i < oprsz; i += sizeof(int16_t)) {
+ int16_t aa = *(int16_t *)(a + i);
+ int16_t bb = *(int16_t *)(b + i);
+ int16_t dd = aa > bb ? aa : bb;
+ *(int16_t *)(d + i) = dd;
+ }
+ clear_high(d, oprsz, desc);
+}
+
+void HELPER(gvec_smax32)(void *d, void *a, void *b, uint32_t desc)
+{
+ intptr_t oprsz = simd_oprsz(desc);
+ intptr_t i;
+
+ for (i = 0; i < oprsz; i += sizeof(int32_t)) {
+ int32_t aa = *(int32_t *)(a + i);
+ int32_t bb = *(int32_t *)(b + i);
+ int32_t dd = aa > bb ? aa : bb;
+ *(int32_t *)(d + i) = dd;
+ }
+ clear_high(d, oprsz, desc);
+}
+
+void HELPER(gvec_smax64)(void *d, void *a, void *b, uint32_t desc)
+{
+ intptr_t oprsz = simd_oprsz(desc);
+ intptr_t i;
+
+ for (i = 0; i < oprsz; i += sizeof(int64_t)) {
+ int64_t aa = *(int64_t *)(a + i);
+ int64_t bb = *(int64_t *)(b + i);
+ int64_t dd = aa > bb ? aa : bb;
+ *(int64_t *)(d + i) = dd;
+ }
+ clear_high(d, oprsz, desc);
+}
+
+void HELPER(gvec_umin8)(void *d, void *a, void *b, uint32_t desc)
+{
+ intptr_t oprsz = simd_oprsz(desc);
+ intptr_t i;
+
+ for (i = 0; i < oprsz; i += sizeof(uint8_t)) {
+ uint8_t aa = *(uint8_t *)(a + i);
+ uint8_t bb = *(uint8_t *)(b + i);
+ uint8_t dd = aa < bb ? aa : bb;
+ *(uint8_t *)(d + i) = dd;
+ }
+ clear_high(d, oprsz, desc);
+}
+
+void HELPER(gvec_umin16)(void *d, void *a, void *b, uint32_t desc)
+{
+ intptr_t oprsz = simd_oprsz(desc);
+ intptr_t i;
+
+ for (i = 0; i < oprsz; i += sizeof(uint16_t)) {
+ uint16_t aa = *(uint16_t *)(a + i);
+ uint16_t bb = *(uint16_t *)(b + i);
+ uint16_t dd = aa < bb ? aa : bb;
+ *(uint16_t *)(d + i) = dd;
+ }
+ clear_high(d, oprsz, desc);
+}
+
+void HELPER(gvec_umin32)(void *d, void *a, void *b, uint32_t desc)
+{
+ intptr_t oprsz = simd_oprsz(desc);
+ intptr_t i;
+
+ for (i = 0; i < oprsz; i += sizeof(uint32_t)) {
+ uint32_t aa = *(uint32_t *)(a + i);
+ uint32_t bb = *(uint32_t *)(b + i);
+ uint32_t dd = aa < bb ? aa : bb;
+ *(uint32_t *)(d + i) = dd;
+ }
+ clear_high(d, oprsz, desc);
+}
+
+void HELPER(gvec_umin64)(void *d, void *a, void *b, uint32_t desc)
+{
+ intptr_t oprsz = simd_oprsz(desc);
+ intptr_t i;
+
+ for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
+ uint64_t aa = *(uint64_t *)(a + i);
+ uint64_t bb = *(uint64_t *)(b + i);
+ uint64_t dd = aa < bb ? aa : bb;
+ *(uint64_t *)(d + i) = dd;
+ }
+ clear_high(d, oprsz, desc);
+}
+
+void HELPER(gvec_umax8)(void *d, void *a, void *b, uint32_t desc)
+{
+ intptr_t oprsz = simd_oprsz(desc);
+ intptr_t i;
+
+ for (i = 0; i < oprsz; i += sizeof(uint8_t)) {
+ uint8_t aa = *(uint8_t *)(a + i);
+ uint8_t bb = *(uint8_t *)(b + i);
+ uint8_t dd = aa > bb ? aa : bb;
+ *(uint8_t *)(d + i) = dd;
+ }
+ clear_high(d, oprsz, desc);
+}
+
+void HELPER(gvec_umax16)(void *d, void *a, void *b, uint32_t desc)
+{
+ intptr_t oprsz = simd_oprsz(desc);
+ intptr_t i;
+
+ for (i = 0; i < oprsz; i += sizeof(uint16_t)) {
+ uint16_t aa = *(uint16_t *)(a + i);
+ uint16_t bb = *(uint16_t *)(b + i);
+ uint16_t dd = aa > bb ? aa : bb;
+ *(uint16_t *)(d + i) = dd;
+ }
+ clear_high(d, oprsz, desc);
+}
+
+void HELPER(gvec_umax32)(void *d, void *a, void *b, uint32_t desc)
+{
+ intptr_t oprsz = simd_oprsz(desc);
+ intptr_t i;
+
+ for (i = 0; i < oprsz; i += sizeof(uint32_t)) {
+ uint32_t aa = *(uint32_t *)(a + i);
+ uint32_t bb = *(uint32_t *)(b + i);
+ uint32_t dd = aa > bb ? aa : bb;
+ *(uint32_t *)(d + i) = dd;
+ }
+ clear_high(d, oprsz, desc);
+}
+
+void HELPER(gvec_umax64)(void *d, void *a, void *b, uint32_t desc)
+{
+ intptr_t oprsz = simd_oprsz(desc);
+ intptr_t i;
+
+ for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
+ uint64_t aa = *(uint64_t *)(a + i);
+ uint64_t bb = *(uint64_t *)(b + i);
+ uint64_t dd = aa > bb ? aa : bb;
+ *(uint64_t *)(d + i) = dd;
+ }
+ clear_high(d, oprsz, desc);
+}
diff --git a/tcg/tcg-op-gvec.c b/tcg/tcg-op-gvec.c
index 0a33f51065..3ee44fcb75 100644
--- a/tcg/tcg-op-gvec.c
+++ b/tcg/tcg-op-gvec.c
@@ -1810,6 +1810,114 @@ void tcg_gen_gvec_ussub(unsigned vece, uint32_t dofs, uint32_t aofs,
tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
}
+void tcg_gen_gvec_smin(unsigned vece, uint32_t dofs, uint32_t aofs,
+ uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
+{
+ static const GVecGen3 g[4] = {
+ { .fniv = tcg_gen_smin_vec,
+ .fno = gen_helper_gvec_smin8,
+ .opc = INDEX_op_smin_vec,
+ .vece = MO_8 },
+ { .fniv = tcg_gen_smin_vec,
+ .fno = gen_helper_gvec_smin16,
+ .opc = INDEX_op_smin_vec,
+ .vece = MO_16 },
+ { .fni4 = tcg_gen_smin_i32,
+ .fniv = tcg_gen_smin_vec,
+ .fno = gen_helper_gvec_smin32,
+ .opc = INDEX_op_smin_vec,
+ .vece = MO_32 },
+ { .fni8 = tcg_gen_smin_i64,
+ .fniv = tcg_gen_smin_vec,
+ .fno = gen_helper_gvec_smin64,
+ .opc = INDEX_op_smin_vec,
+ .vece = MO_64 }
+ };
+ tcg_debug_assert(vece <= MO_64);
+ tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
+}
+
+void tcg_gen_gvec_umin(unsigned vece, uint32_t dofs, uint32_t aofs,
+ uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
+{
+ static const GVecGen3 g[4] = {
+ { .fniv = tcg_gen_umin_vec,
+ .fno = gen_helper_gvec_umin8,
+ .opc = INDEX_op_umin_vec,
+ .vece = MO_8 },
+ { .fniv = tcg_gen_umin_vec,
+ .fno = gen_helper_gvec_umin16,
+ .opc = INDEX_op_umin_vec,
+ .vece = MO_16 },
+ { .fni4 = tcg_gen_umin_i32,
+ .fniv = tcg_gen_umin_vec,
+ .fno = gen_helper_gvec_umin32,
+ .opc = INDEX_op_umin_vec,
+ .vece = MO_32 },
+ { .fni8 = tcg_gen_umin_i64,
+ .fniv = tcg_gen_umin_vec,
+ .fno = gen_helper_gvec_umin64,
+ .opc = INDEX_op_umin_vec,
+ .vece = MO_64 }
+ };
+ tcg_debug_assert(vece <= MO_64);
+ tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
+}
+
+void tcg_gen_gvec_smax(unsigned vece, uint32_t dofs, uint32_t aofs,
+ uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
+{
+ static const GVecGen3 g[4] = {
+ { .fniv = tcg_gen_smax_vec,
+ .fno = gen_helper_gvec_smax8,
+ .opc = INDEX_op_smax_vec,
+ .vece = MO_8 },
+ { .fniv = tcg_gen_smax_vec,
+ .fno = gen_helper_gvec_smax16,
+ .opc = INDEX_op_smax_vec,
+ .vece = MO_16 },
+ { .fni4 = tcg_gen_smax_i32,
+ .fniv = tcg_gen_smax_vec,
+ .fno = gen_helper_gvec_smax32,
+ .opc = INDEX_op_smax_vec,
+ .vece = MO_32 },
+ { .fni8 = tcg_gen_smax_i64,
+ .fniv = tcg_gen_smax_vec,
+ .fno = gen_helper_gvec_smax64,
+ .opc = INDEX_op_smax_vec,
+ .vece = MO_64 }
+ };
+ tcg_debug_assert(vece <= MO_64);
+ tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
+}
+
+void tcg_gen_gvec_umax(unsigned vece, uint32_t dofs, uint32_t aofs,
+ uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
+{
+ static const GVecGen3 g[4] = {
+ { .fniv = tcg_gen_umax_vec,
+ .fno = gen_helper_gvec_umax8,
+ .opc = INDEX_op_umax_vec,
+ .vece = MO_8 },
+ { .fniv = tcg_gen_umax_vec,
+ .fno = gen_helper_gvec_umax16,
+ .opc = INDEX_op_umax_vec,
+ .vece = MO_16 },
+ { .fni4 = tcg_gen_umax_i32,
+ .fniv = tcg_gen_umax_vec,
+ .fno = gen_helper_gvec_umax32,
+ .opc = INDEX_op_umax_vec,
+ .vece = MO_32 },
+ { .fni8 = tcg_gen_umax_i64,
+ .fniv = tcg_gen_umax_vec,
+ .fno = gen_helper_gvec_umax64,
+ .opc = INDEX_op_umax_vec,
+ .vece = MO_64 }
+ };
+ tcg_debug_assert(vece <= MO_64);
+ tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
+}
+
/* Perform a vector negation using normal negation and a mask.
Compare gen_subv_mask above. */
static void gen_negv_mask(TCGv_i64 d, TCGv_i64 b, TCGv_i64 m)
diff --git a/tcg/tcg-op-vec.c b/tcg/tcg-op-vec.c
index 675aa09258..36f35022ac 100644
--- a/tcg/tcg-op-vec.c
+++ b/tcg/tcg-op-vec.c
@@ -433,3 +433,23 @@ void tcg_gen_ussub_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b)
{
do_op3(vece, r, a, b, INDEX_op_ussub_vec);
}
+
+void tcg_gen_smin_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b)
+{
+ do_op3(vece, r, a, b, INDEX_op_smin_vec);
+}
+
+void tcg_gen_umin_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b)
+{
+ do_op3(vece, r, a, b, INDEX_op_umin_vec);
+}
+
+void tcg_gen_smax_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b)
+{
+ do_op3(vece, r, a, b, INDEX_op_smax_vec);
+}
+
+void tcg_gen_umax_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b)
+{
+ do_op3(vece, r, a, b, INDEX_op_umax_vec);
+}
diff --git a/tcg/tcg.c b/tcg/tcg.c
index 15ed5af007..1ae1e788f6 100644
--- a/tcg/tcg.c
+++ b/tcg/tcg.c
@@ -1612,6 +1612,11 @@ bool tcg_op_supported(TCGOpcode op)
case INDEX_op_sssub_vec:
case INDEX_op_ussub_vec:
return have_vec && TCG_TARGET_HAS_sat_vec;
+ case INDEX_op_smin_vec:
+ case INDEX_op_umin_vec:
+ case INDEX_op_smax_vec:
+ case INDEX_op_umax_vec:
+ return have_vec && TCG_TARGET_HAS_minmax_vec;
default:
tcg_debug_assert(op > INDEX_op_last_generic && op < NB_OPS);
--
2.17.2
* [Qemu-devel] [PATCH v2 06/10] tcg/i386: Split subroutines out of tcg_expand_vec_op
From: Richard Henderson @ 2019-01-04 22:31 UTC
To: qemu-devel
This routine was becoming too large.
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
tcg/i386/tcg-target.inc.c | 459 +++++++++++++++++++-------------------
1 file changed, 232 insertions(+), 227 deletions(-)
diff --git a/tcg/i386/tcg-target.inc.c b/tcg/i386/tcg-target.inc.c
index c21c3272f2..ad97386d06 100644
--- a/tcg/i386/tcg-target.inc.c
+++ b/tcg/i386/tcg-target.inc.c
@@ -3079,253 +3079,258 @@ int tcg_can_emit_vec_op(TCGOpcode opc, TCGType type, unsigned vece)
}
}
+static void expand_vec_shi(TCGType type, unsigned vece, bool shr,
+ TCGv_vec v0, TCGv_vec v1, TCGArg imm)
+{
+ TCGv_vec t1, t2;
+
+ tcg_debug_assert(vece == MO_8);
+
+ t1 = tcg_temp_new_vec(type);
+ t2 = tcg_temp_new_vec(type);
+
+ /* Unpack to W, shift, and repack. Tricky bits:
+ (1) Use punpck*bw x,x to produce DDCCBBAA,
+ i.e. duplicate in other half of the 16-bit lane.
+ (2) For right-shift, add 8 so that the high half of
+ the lane becomes zero. For left-shift, we must
+ shift up and down again.
+ (3) Step 2 leaves high half zero such that PACKUSWB
+ (pack with unsigned saturation) does not modify
+ the quantity. */
+ vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
+ tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(v1));
+ vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
+ tcgv_vec_arg(t2), tcgv_vec_arg(v1), tcgv_vec_arg(v1));
+
+ if (shr) {
+ tcg_gen_shri_vec(MO_16, t1, t1, imm + 8);
+ tcg_gen_shri_vec(MO_16, t2, t2, imm + 8);
+ } else {
+ tcg_gen_shli_vec(MO_16, t1, t1, imm + 8);
+ tcg_gen_shli_vec(MO_16, t2, t2, imm + 8);
+ tcg_gen_shri_vec(MO_16, t1, t1, 8);
+ tcg_gen_shri_vec(MO_16, t2, t2, 8);
+ }
+
+ vec_gen_3(INDEX_op_x86_packus_vec, type, MO_8,
+ tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t2));
+ tcg_temp_free_vec(t1);
+ tcg_temp_free_vec(t2);
+}
+
+static void expand_vec_sari(TCGType type, unsigned vece,
+ TCGv_vec v0, TCGv_vec v1, TCGArg imm)
+{
+ TCGv_vec t1, t2;
+
+ switch (vece) {
+ case MO_8:
+ /* Unpack to W, shift, and repack, as in expand_vec_shi. */
+ t1 = tcg_temp_new_vec(type);
+ t2 = tcg_temp_new_vec(type);
+ vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
+ tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(v1));
+ vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
+ tcgv_vec_arg(t2), tcgv_vec_arg(v1), tcgv_vec_arg(v1));
+ tcg_gen_sari_vec(MO_16, t1, t1, imm + 8);
+ tcg_gen_sari_vec(MO_16, t2, t2, imm + 8);
+ vec_gen_3(INDEX_op_x86_packss_vec, type, MO_8,
+ tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t2));
+ tcg_temp_free_vec(t1);
+ tcg_temp_free_vec(t2);
+ break;
+
+ case MO_64:
+ if (imm <= 32) {
+ /* We can emulate a small sign extend by performing an arithmetic
+ * 32-bit shift and overwriting the high half of a 64-bit logical
+ * shift (note that the ISA says shift of 32 is valid).
+ */
+ t1 = tcg_temp_new_vec(type);
+ tcg_gen_sari_vec(MO_32, t1, v1, imm);
+ tcg_gen_shri_vec(MO_64, v0, v1, imm);
+ vec_gen_4(INDEX_op_x86_blend_vec, type, MO_32,
+ tcgv_vec_arg(v0), tcgv_vec_arg(v0),
+ tcgv_vec_arg(t1), 0xaa);
+ tcg_temp_free_vec(t1);
+ } else {
+ /* Otherwise we will need to use a compare vs 0 to produce
+ * the sign-extend, shift and merge.
+ */
+ t1 = tcg_const_zeros_vec(type);
+ tcg_gen_cmp_vec(TCG_COND_GT, MO_64, t1, t1, v1);
+ tcg_gen_shri_vec(MO_64, v0, v1, imm);
+ tcg_gen_shli_vec(MO_64, t1, t1, 64 - imm);
+ tcg_gen_or_vec(MO_64, v0, v0, t1);
+ tcg_temp_free_vec(t1);
+ }
+ break;
+
+ default:
+ g_assert_not_reached();
+ }
+}
+
+static void expand_vec_mul(TCGType type, unsigned vece,
+ TCGv_vec v0, TCGv_vec v1, TCGv_vec v2)
+{
+ TCGv_vec t1, t2, t3, t4;
+
+ tcg_debug_assert(vece == MO_8);
+
+ /*
+ * Unpack v1 bytes to words, 0 | x.
+ * Unpack v2 bytes to words, y | 0.
+ * This leaves the 8-bit result, x * y, with 8 bits of right padding.
+ * Shift logical right by 8 bits to clear the high 8 bits of each word before
+ * using an unsigned saturated pack.
+ *
+ * The difference between the V64, V128 and V256 cases is merely how
+ * we distribute the expansion between temporaries.
+ */
+ switch (type) {
+ case TCG_TYPE_V64:
+ t1 = tcg_temp_new_vec(TCG_TYPE_V128);
+ t2 = tcg_temp_new_vec(TCG_TYPE_V128);
+ tcg_gen_dup16i_vec(t2, 0);
+ vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V128, MO_8,
+ tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(t2));
+ vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V128, MO_8,
+ tcgv_vec_arg(t2), tcgv_vec_arg(t2), tcgv_vec_arg(v2));
+ tcg_gen_mul_vec(MO_16, t1, t1, t2);
+ tcg_gen_shri_vec(MO_16, t1, t1, 8);
+ vec_gen_3(INDEX_op_x86_packus_vec, TCG_TYPE_V128, MO_8,
+ tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t1));
+ tcg_temp_free_vec(t1);
+ tcg_temp_free_vec(t2);
+ break;
+
+ case TCG_TYPE_V128:
+ case TCG_TYPE_V256:
+ t1 = tcg_temp_new_vec(type);
+ t2 = tcg_temp_new_vec(type);
+ t3 = tcg_temp_new_vec(type);
+ t4 = tcg_temp_new_vec(type);
+ tcg_gen_dup16i_vec(t4, 0);
+ vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
+ tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(t4));
+ vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
+ tcgv_vec_arg(t2), tcgv_vec_arg(t4), tcgv_vec_arg(v2));
+ vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
+ tcgv_vec_arg(t3), tcgv_vec_arg(v1), tcgv_vec_arg(t4));
+ vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
+ tcgv_vec_arg(t4), tcgv_vec_arg(t4), tcgv_vec_arg(v2));
+ tcg_gen_mul_vec(MO_16, t1, t1, t2);
+ tcg_gen_mul_vec(MO_16, t3, t3, t4);
+ tcg_gen_shri_vec(MO_16, t1, t1, 8);
+ tcg_gen_shri_vec(MO_16, t3, t3, 8);
+ vec_gen_3(INDEX_op_x86_packus_vec, type, MO_8,
+ tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t3));
+ tcg_temp_free_vec(t1);
+ tcg_temp_free_vec(t2);
+ tcg_temp_free_vec(t3);
+ tcg_temp_free_vec(t4);
+ break;
+
+ default:
+ g_assert_not_reached();
+ }
+}
+
+static void expand_vec_cmp(TCGType type, unsigned vece, TCGv_vec v0,
+ TCGv_vec v1, TCGv_vec v2, TCGCond cond)
+{
+ enum {
+ NEED_SWAP = 1,
+ NEED_INV = 2,
+ NEED_BIAS = 4
+ };
+ static const uint8_t fixups[16] = {
+ [0 ... 15] = -1,
+ [TCG_COND_EQ] = 0,
+ [TCG_COND_NE] = NEED_INV,
+ [TCG_COND_GT] = 0,
+ [TCG_COND_LT] = NEED_SWAP,
+ [TCG_COND_LE] = NEED_INV,
+ [TCG_COND_GE] = NEED_SWAP | NEED_INV,
+ [TCG_COND_GTU] = NEED_BIAS,
+ [TCG_COND_LTU] = NEED_BIAS | NEED_SWAP,
+ [TCG_COND_LEU] = NEED_BIAS | NEED_INV,
+ [TCG_COND_GEU] = NEED_BIAS | NEED_SWAP | NEED_INV,
+ };
+ TCGv_vec t1, t2;
+ uint8_t fixup;
+
+ fixup = fixups[cond & 15];
+ tcg_debug_assert(fixup != 0xff);
+
+ if (fixup & NEED_INV) {
+ cond = tcg_invert_cond(cond);
+ }
+ if (fixup & NEED_SWAP) {
+ t1 = v1, v1 = v2, v2 = t1;
+ cond = tcg_swap_cond(cond);
+ }
+
+ t1 = t2 = NULL;
+ if (fixup & NEED_BIAS) {
+ t1 = tcg_temp_new_vec(type);
+ t2 = tcg_temp_new_vec(type);
+ tcg_gen_dupi_vec(vece, t2, 1ull << ((8 << vece) - 1));
+ tcg_gen_sub_vec(vece, t1, v1, t2);
+ tcg_gen_sub_vec(vece, t2, v2, t2);
+ v1 = t1;
+ v2 = t2;
+ cond = tcg_signed_cond(cond);
+ }
+
+ tcg_debug_assert(cond == TCG_COND_EQ || cond == TCG_COND_GT);
+ /* Expand directly; do not recurse. */
+ vec_gen_4(INDEX_op_cmp_vec, type, vece,
+ tcgv_vec_arg(v0), tcgv_vec_arg(v1), tcgv_vec_arg(v2), cond);
+
+ if (t1) {
+ tcg_temp_free_vec(t1);
+ if (t2) {
+ tcg_temp_free_vec(t2);
+ }
+ }
+ if (fixup & NEED_INV) {
+ tcg_gen_not_vec(vece, v0, v0);
+ }
+}
+
void tcg_expand_vec_op(TCGOpcode opc, TCGType type, unsigned vece,
TCGArg a0, ...)
{
va_list va;
- TCGArg a1, a2;
- TCGv_vec v0, t1, t2, t3, t4;
+ TCGArg a2;
+ TCGv_vec v0, v1, v2;
va_start(va, a0);
v0 = temp_tcgv_vec(arg_temp(a0));
+ v1 = temp_tcgv_vec(arg_temp(va_arg(va, TCGArg)));
+ a2 = va_arg(va, TCGArg);
switch (opc) {
case INDEX_op_shli_vec:
case INDEX_op_shri_vec:
- tcg_debug_assert(vece == MO_8);
- a1 = va_arg(va, TCGArg);
- a2 = va_arg(va, TCGArg);
- /* Unpack to W, shift, and repack. Tricky bits:
- (1) Use punpck*bw x,x to produce DDCCBBAA,
- i.e. duplicate in other half of the 16-bit lane.
- (2) For right-shift, add 8 so that the high half of
- the lane becomes zero. For left-shift, we must
- shift up and down again.
- (3) Step 2 leaves high half zero such that PACKUSWB
- (pack with unsigned saturation) does not modify
- the quantity. */
- t1 = tcg_temp_new_vec(type);
- t2 = tcg_temp_new_vec(type);
- vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
- tcgv_vec_arg(t1), a1, a1);
- vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
- tcgv_vec_arg(t2), a1, a1);
- if (opc == INDEX_op_shri_vec) {
- vec_gen_3(INDEX_op_shri_vec, type, MO_16,
- tcgv_vec_arg(t1), tcgv_vec_arg(t1), a2 + 8);
- vec_gen_3(INDEX_op_shri_vec, type, MO_16,
- tcgv_vec_arg(t2), tcgv_vec_arg(t2), a2 + 8);
- } else {
- vec_gen_3(INDEX_op_shli_vec, type, MO_16,
- tcgv_vec_arg(t1), tcgv_vec_arg(t1), a2 + 8);
- vec_gen_3(INDEX_op_shli_vec, type, MO_16,
- tcgv_vec_arg(t2), tcgv_vec_arg(t2), a2 + 8);
- vec_gen_3(INDEX_op_shri_vec, type, MO_16,
- tcgv_vec_arg(t1), tcgv_vec_arg(t1), 8);
- vec_gen_3(INDEX_op_shri_vec, type, MO_16,
- tcgv_vec_arg(t2), tcgv_vec_arg(t2), 8);
- }
- vec_gen_3(INDEX_op_x86_packus_vec, type, MO_8,
- a0, tcgv_vec_arg(t1), tcgv_vec_arg(t2));
- tcg_temp_free_vec(t1);
- tcg_temp_free_vec(t2);
+ expand_vec_shi(type, vece, opc == INDEX_op_shri_vec, v0, v1, a2);
break;
case INDEX_op_sari_vec:
- a1 = va_arg(va, TCGArg);
- a2 = va_arg(va, TCGArg);
- if (vece == MO_8) {
- /* Unpack to W, shift, and repack, as above. */
- t1 = tcg_temp_new_vec(type);
- t2 = tcg_temp_new_vec(type);
- vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
- tcgv_vec_arg(t1), a1, a1);
- vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
- tcgv_vec_arg(t2), a1, a1);
- vec_gen_3(INDEX_op_sari_vec, type, MO_16,
- tcgv_vec_arg(t1), tcgv_vec_arg(t1), a2 + 8);
- vec_gen_3(INDEX_op_sari_vec, type, MO_16,
- tcgv_vec_arg(t2), tcgv_vec_arg(t2), a2 + 8);
- vec_gen_3(INDEX_op_x86_packss_vec, type, MO_8,
- a0, tcgv_vec_arg(t1), tcgv_vec_arg(t2));
- tcg_temp_free_vec(t1);
- tcg_temp_free_vec(t2);
- break;
- }
- tcg_debug_assert(vece == MO_64);
- /* MO_64: If the shift is <= 32, we can emulate the sign extend by
- performing an arithmetic 32-bit shift and overwriting the high
- half of the result (note that the ISA says shift of 32 is valid). */
- if (a2 <= 32) {
- t1 = tcg_temp_new_vec(type);
- vec_gen_3(INDEX_op_sari_vec, type, MO_32, tcgv_vec_arg(t1), a1, a2);
- vec_gen_3(INDEX_op_shri_vec, type, MO_64, a0, a1, a2);
- vec_gen_4(INDEX_op_x86_blend_vec, type, MO_32,
- a0, a0, tcgv_vec_arg(t1), 0xaa);
- tcg_temp_free_vec(t1);
- break;
- }
- /* Otherwise we will need to use a compare vs 0 to produce the
- sign-extend, shift and merge. */
- t1 = tcg_temp_new_vec(type);
- t2 = tcg_const_zeros_vec(type);
- vec_gen_4(INDEX_op_cmp_vec, type, MO_64,
- tcgv_vec_arg(t1), tcgv_vec_arg(t2), a1, TCG_COND_GT);
- tcg_temp_free_vec(t2);
- vec_gen_3(INDEX_op_shri_vec, type, MO_64, a0, a1, a2);
- vec_gen_3(INDEX_op_shli_vec, type, MO_64,
- tcgv_vec_arg(t1), tcgv_vec_arg(t1), 64 - a2);
- vec_gen_3(INDEX_op_or_vec, type, MO_64, a0, a0, tcgv_vec_arg(t1));
- tcg_temp_free_vec(t1);
+ expand_vec_sari(type, vece, v0, v1, a2);
break;
case INDEX_op_mul_vec:
- tcg_debug_assert(vece == MO_8);
- a1 = va_arg(va, TCGArg);
- a2 = va_arg(va, TCGArg);
- switch (type) {
- case TCG_TYPE_V64:
- t1 = tcg_temp_new_vec(TCG_TYPE_V128);
- t2 = tcg_temp_new_vec(TCG_TYPE_V128);
- tcg_gen_dup16i_vec(t2, 0);
- vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V128, MO_8,
- tcgv_vec_arg(t1), a1, tcgv_vec_arg(t2));
- vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V128, MO_8,
- tcgv_vec_arg(t2), tcgv_vec_arg(t2), a2);
- tcg_gen_mul_vec(MO_16, t1, t1, t2);
- tcg_gen_shri_vec(MO_16, t1, t1, 8);
- vec_gen_3(INDEX_op_x86_packus_vec, TCG_TYPE_V128, MO_8,
- a0, tcgv_vec_arg(t1), tcgv_vec_arg(t1));
- tcg_temp_free_vec(t1);
- tcg_temp_free_vec(t2);
- break;
-
- case TCG_TYPE_V128:
- t1 = tcg_temp_new_vec(TCG_TYPE_V128);
- t2 = tcg_temp_new_vec(TCG_TYPE_V128);
- t3 = tcg_temp_new_vec(TCG_TYPE_V128);
- t4 = tcg_temp_new_vec(TCG_TYPE_V128);
- tcg_gen_dup16i_vec(t4, 0);
- vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V128, MO_8,
- tcgv_vec_arg(t1), a1, tcgv_vec_arg(t4));
- vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V128, MO_8,
- tcgv_vec_arg(t2), tcgv_vec_arg(t4), a2);
- vec_gen_3(INDEX_op_x86_punpckh_vec, TCG_TYPE_V128, MO_8,
- tcgv_vec_arg(t3), a1, tcgv_vec_arg(t4));
- vec_gen_3(INDEX_op_x86_punpckh_vec, TCG_TYPE_V128, MO_8,
- tcgv_vec_arg(t4), tcgv_vec_arg(t4), a2);
- tcg_gen_mul_vec(MO_16, t1, t1, t2);
- tcg_gen_mul_vec(MO_16, t3, t3, t4);
- tcg_gen_shri_vec(MO_16, t1, t1, 8);
- tcg_gen_shri_vec(MO_16, t3, t3, 8);
- vec_gen_3(INDEX_op_x86_packus_vec, TCG_TYPE_V128, MO_8,
- a0, tcgv_vec_arg(t1), tcgv_vec_arg(t3));
- tcg_temp_free_vec(t1);
- tcg_temp_free_vec(t2);
- tcg_temp_free_vec(t3);
- tcg_temp_free_vec(t4);
- break;
-
- case TCG_TYPE_V256:
- t1 = tcg_temp_new_vec(TCG_TYPE_V256);
- t2 = tcg_temp_new_vec(TCG_TYPE_V256);
- t3 = tcg_temp_new_vec(TCG_TYPE_V256);
- t4 = tcg_temp_new_vec(TCG_TYPE_V256);
- tcg_gen_dup16i_vec(t4, 0);
- /* a1: A[0-7] ... D[0-7]; a2: W[0-7] ... Z[0-7]
- t1: extends of B[0-7], D[0-7]
- t2: extends of X[0-7], Z[0-7]
- t3: extends of A[0-7], C[0-7]
- t4: extends of W[0-7], Y[0-7]. */
- vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V256, MO_8,
- tcgv_vec_arg(t1), a1, tcgv_vec_arg(t4));
- vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V256, MO_8,
- tcgv_vec_arg(t2), tcgv_vec_arg(t4), a2);
- vec_gen_3(INDEX_op_x86_punpckh_vec, TCG_TYPE_V256, MO_8,
- tcgv_vec_arg(t3), a1, tcgv_vec_arg(t4));
- vec_gen_3(INDEX_op_x86_punpckh_vec, TCG_TYPE_V256, MO_8,
- tcgv_vec_arg(t4), tcgv_vec_arg(t4), a2);
- /* t1: BX DZ; t2: AW CY. */
- tcg_gen_mul_vec(MO_16, t1, t1, t2);
- tcg_gen_mul_vec(MO_16, t3, t3, t4);
- tcg_gen_shri_vec(MO_16, t1, t1, 8);
- tcg_gen_shri_vec(MO_16, t3, t3, 8);
- /* a0: AW BX CY DZ. */
- vec_gen_3(INDEX_op_x86_packus_vec, TCG_TYPE_V256, MO_8,
- a0, tcgv_vec_arg(t1), tcgv_vec_arg(t3));
- tcg_temp_free_vec(t1);
- tcg_temp_free_vec(t2);
- tcg_temp_free_vec(t3);
- tcg_temp_free_vec(t4);
- break;
-
- default:
- g_assert_not_reached();
- }
+ v2 = temp_tcgv_vec(arg_temp(a2));
+ expand_vec_mul(type, vece, v0, v1, v2);
break;
case INDEX_op_cmp_vec:
- {
- enum {
- NEED_SWAP = 1,
- NEED_INV = 2,
- NEED_BIAS = 4
- };
- static const uint8_t fixups[16] = {
- [0 ... 15] = -1,
- [TCG_COND_EQ] = 0,
- [TCG_COND_NE] = NEED_INV,
- [TCG_COND_GT] = 0,
- [TCG_COND_LT] = NEED_SWAP,
- [TCG_COND_LE] = NEED_INV,
- [TCG_COND_GE] = NEED_SWAP | NEED_INV,
- [TCG_COND_GTU] = NEED_BIAS,
- [TCG_COND_LTU] = NEED_BIAS | NEED_SWAP,
- [TCG_COND_LEU] = NEED_BIAS | NEED_INV,
- [TCG_COND_GEU] = NEED_BIAS | NEED_SWAP | NEED_INV,
- };
-
- TCGCond cond;
- uint8_t fixup;
-
- a1 = va_arg(va, TCGArg);
- a2 = va_arg(va, TCGArg);
- cond = va_arg(va, TCGArg);
- fixup = fixups[cond & 15];
- tcg_debug_assert(fixup != 0xff);
-
- if (fixup & NEED_INV) {
- cond = tcg_invert_cond(cond);
- }
- if (fixup & NEED_SWAP) {
- TCGArg t;
- t = a1, a1 = a2, a2 = t;
- cond = tcg_swap_cond(cond);
- }
-
- t1 = t2 = NULL;
- if (fixup & NEED_BIAS) {
- t1 = tcg_temp_new_vec(type);
- t2 = tcg_temp_new_vec(type);
- tcg_gen_dupi_vec(vece, t2, 1ull << ((8 << vece) - 1));
- tcg_gen_sub_vec(vece, t1, temp_tcgv_vec(arg_temp(a1)), t2);
- tcg_gen_sub_vec(vece, t2, temp_tcgv_vec(arg_temp(a2)), t2);
- a1 = tcgv_vec_arg(t1);
- a2 = tcgv_vec_arg(t2);
- cond = tcg_signed_cond(cond);
- }
-
- tcg_debug_assert(cond == TCG_COND_EQ || cond == TCG_COND_GT);
- vec_gen_4(INDEX_op_cmp_vec, type, vece, a0, a1, a2, cond);
-
- if (fixup & NEED_BIAS) {
- tcg_temp_free_vec(t1);
- tcg_temp_free_vec(t2);
- }
- if (fixup & NEED_INV) {
- tcg_gen_not_vec(vece, v0, v0);
- }
- }
+ v2 = temp_tcgv_vec(arg_temp(a2));
+ expand_vec_cmp(type, vece, v0, v1, v2, va_arg(va, TCGArg));
break;
default:
--
2.17.2
^ permalink raw reply related [flat|nested] 18+ messages in thread
* [Qemu-devel] [PATCH v2 07/10] tcg/i386: Implement vector saturating arithmetic
2019-01-04 22:31 [Qemu-devel] [PATCH v2 00/10] tcg vector improvements Richard Henderson
` (5 preceding siblings ...)
2019-01-04 22:31 ` [Qemu-devel] [PATCH v2 06/10] tcg/i386: Split subroutines out of tcg_expand_vec_op Richard Henderson
@ 2019-01-04 22:31 ` Richard Henderson
2019-01-04 22:31 ` [Qemu-devel] [PATCH v2 08/10] tcg/i386: Implement vector minmax arithmetic Richard Henderson
` (3 subsequent siblings)
10 siblings, 0 replies; 18+ messages in thread
From: Richard Henderson @ 2019-01-04 22:31 UTC (permalink / raw)
To: qemu-devel
Only MO_8 and MO_16 are implemented, since that's all the
instruction set provides.
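For reference, the per-element behaviour these instructions provide corresponds to
a scalar sketch along the following lines (illustrative only; the helper names are
made up here and are not part of the patch):

#include <stdint.h>

/* Byte-wise saturating adds, modelling one lane of PADDUSB / PADDSB. */
static inline uint8_t usadd8(uint8_t a, uint8_t b)
{
    unsigned r = a + b;
    return r > UINT8_MAX ? UINT8_MAX : r;    /* clamp high, like PADDUSB */
}

static inline int8_t ssadd8(int8_t a, int8_t b)
{
    int r = a + b;
    if (r > INT8_MAX) {
        r = INT8_MAX;                        /* clamp both ways, like PADDSB */
    } else if (r < INT8_MIN) {
        r = INT8_MIN;
    }
    return (int8_t)r;
}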
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
tcg/i386/tcg-target.h | 2 +-
tcg/i386/tcg-target.inc.c | 42 +++++++++++++++++++++++++++++++++++++++
2 files changed, 43 insertions(+), 1 deletion(-)
diff --git a/tcg/i386/tcg-target.h b/tcg/i386/tcg-target.h
index 7bd7eae672..efbd5a6fc9 100644
--- a/tcg/i386/tcg-target.h
+++ b/tcg/i386/tcg-target.h
@@ -185,7 +185,7 @@ extern bool have_avx2;
#define TCG_TARGET_HAS_shv_vec 0
#define TCG_TARGET_HAS_cmp_vec 1
#define TCG_TARGET_HAS_mul_vec 1
-#define TCG_TARGET_HAS_sat_vec 0
+#define TCG_TARGET_HAS_sat_vec 1
#define TCG_TARGET_HAS_minmax_vec 0
#define TCG_TARGET_deposit_i32_valid(ofs, len) \
diff --git a/tcg/i386/tcg-target.inc.c b/tcg/i386/tcg-target.inc.c
index ad97386d06..feec40a412 100644
--- a/tcg/i386/tcg-target.inc.c
+++ b/tcg/i386/tcg-target.inc.c
@@ -377,6 +377,10 @@ static inline int tcg_target_const_match(tcg_target_long val, TCGType type,
#define OPC_PADDW (0xfd | P_EXT | P_DATA16)
#define OPC_PADDD (0xfe | P_EXT | P_DATA16)
#define OPC_PADDQ (0xd4 | P_EXT | P_DATA16)
+#define OPC_PADDSB (0xec | P_EXT | P_DATA16)
+#define OPC_PADDSW (0xed | P_EXT | P_DATA16)
+#define OPC_PADDUB (0xdc | P_EXT | P_DATA16)
+#define OPC_PADDUW (0xdd | P_EXT | P_DATA16)
#define OPC_PAND (0xdb | P_EXT | P_DATA16)
#define OPC_PANDN (0xdf | P_EXT | P_DATA16)
#define OPC_PBLENDW (0x0e | P_EXT3A | P_DATA16)
@@ -408,6 +412,10 @@ static inline int tcg_target_const_match(tcg_target_long val, TCGType type,
#define OPC_PSUBW (0xf9 | P_EXT | P_DATA16)
#define OPC_PSUBD (0xfa | P_EXT | P_DATA16)
#define OPC_PSUBQ (0xfb | P_EXT | P_DATA16)
+#define OPC_PSUBSB (0xe8 | P_EXT | P_DATA16)
+#define OPC_PSUBSW (0xe9 | P_EXT | P_DATA16)
+#define OPC_PSUBUB (0xd8 | P_EXT | P_DATA16)
+#define OPC_PSUBUW (0xd9 | P_EXT | P_DATA16)
#define OPC_PUNPCKLBW (0x60 | P_EXT | P_DATA16)
#define OPC_PUNPCKLWD (0x61 | P_EXT | P_DATA16)
#define OPC_PUNPCKLDQ (0x62 | P_EXT | P_DATA16)
@@ -2591,9 +2599,21 @@ static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc,
static int const add_insn[4] = {
OPC_PADDB, OPC_PADDW, OPC_PADDD, OPC_PADDQ
};
+ static int const ssadd_insn[4] = {
+ OPC_PADDSB, OPC_PADDSW, OPC_UD2, OPC_UD2
+ };
+ static int const usadd_insn[4] = {
+ OPC_PADDUB, OPC_PADDUW, OPC_UD2, OPC_UD2
+ };
static int const sub_insn[4] = {
OPC_PSUBB, OPC_PSUBW, OPC_PSUBD, OPC_PSUBQ
};
+ static int const sssub_insn[4] = {
+ OPC_PSUBSB, OPC_PSUBSW, OPC_UD2, OPC_UD2
+ };
+ static int const ussub_insn[4] = {
+ OPC_PSUBUB, OPC_PSUBUW, OPC_UD2, OPC_UD2
+ };
static int const mul_insn[4] = {
OPC_UD2, OPC_PMULLW, OPC_PMULLD, OPC_UD2
};
@@ -2631,9 +2651,21 @@ static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc,
case INDEX_op_add_vec:
insn = add_insn[vece];
goto gen_simd;
+ case INDEX_op_ssadd_vec:
+ insn = ssadd_insn[vece];
+ goto gen_simd;
+ case INDEX_op_usadd_vec:
+ insn = usadd_insn[vece];
+ goto gen_simd;
case INDEX_op_sub_vec:
insn = sub_insn[vece];
goto gen_simd;
+ case INDEX_op_sssub_vec:
+ insn = sssub_insn[vece];
+ goto gen_simd;
+ case INDEX_op_ussub_vec:
+ insn = ussub_insn[vece];
+ goto gen_simd;
case INDEX_op_mul_vec:
insn = mul_insn[vece];
goto gen_simd;
@@ -3007,6 +3039,10 @@ static const TCGTargetOpDef *tcg_target_op_def(TCGOpcode op)
case INDEX_op_or_vec:
case INDEX_op_xor_vec:
case INDEX_op_andc_vec:
+ case INDEX_op_ssadd_vec:
+ case INDEX_op_usadd_vec:
+ case INDEX_op_sssub_vec:
+ case INDEX_op_ussub_vec:
case INDEX_op_cmp_vec:
case INDEX_op_x86_shufps_vec:
case INDEX_op_x86_blend_vec:
@@ -3074,6 +3110,12 @@ int tcg_can_emit_vec_op(TCGOpcode opc, TCGType type, unsigned vece)
}
return 1;
+ case INDEX_op_ssadd_vec:
+ case INDEX_op_usadd_vec:
+ case INDEX_op_sssub_vec:
+ case INDEX_op_ussub_vec:
+ return vece <= MO_16;
+
default:
return 0;
}
--
2.17.2
^ permalink raw reply related [flat|nested] 18+ messages in thread
* [Qemu-devel] [PATCH v2 08/10] tcg/i386: Implement vector minmax arithmetic
2019-01-04 22:31 [Qemu-devel] [PATCH v2 00/10] tcg vector improvements Richard Henderson
` (6 preceding siblings ...)
2019-01-04 22:31 ` [Qemu-devel] [PATCH v2 07/10] tcg/i386: Implement vector saturating arithmetic Richard Henderson
@ 2019-01-04 22:31 ` Richard Henderson
2019-01-04 22:31 ` [Qemu-devel] [PATCH v2 09/10] tcg/aarch64: Implement vector saturating arithmetic Richard Henderson
` (2 subsequent siblings)
10 siblings, 0 replies; 18+ messages in thread
From: Richard Henderson @ 2019-01-04 22:31 UTC (permalink / raw)
To: qemu-devel
The avx instruction set does not directly provide MO_64.
We can still implement 64-bit with comparison and vpblendvb.
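As a rough scalar model of the compare-and-blend expansion added below (an
illustrative sketch only; the helper name is invented here):

#include <stdint.h>

/* One 64-bit lane of the 'min' case: the compare produces an all-ones mask
   where v1 > v2 (as VPCMPGTQ does per lane), and the blend then selects v2
   in those lanes and v1 elsewhere (as VPBLENDVB does per byte). */
static inline int64_t smin64_sketch(int64_t v1, int64_t v2)
{
    uint64_t mask = v1 > v2 ? ~UINT64_C(0) : 0;
    return (int64_t)(((uint64_t)v2 & mask) | ((uint64_t)v1 & ~mask));
}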
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
tcg/i386/tcg-target.h | 2 +-
tcg/i386/tcg-target.inc.c | 81 +++++++++++++++++++++++++++++++++++++++
2 files changed, 82 insertions(+), 1 deletion(-)
diff --git a/tcg/i386/tcg-target.h b/tcg/i386/tcg-target.h
index efbd5a6fc9..7995fe3eab 100644
--- a/tcg/i386/tcg-target.h
+++ b/tcg/i386/tcg-target.h
@@ -186,7 +186,7 @@ extern bool have_avx2;
#define TCG_TARGET_HAS_cmp_vec 1
#define TCG_TARGET_HAS_mul_vec 1
#define TCG_TARGET_HAS_sat_vec 1
-#define TCG_TARGET_HAS_minmax_vec 0
+#define TCG_TARGET_HAS_minmax_vec 1
#define TCG_TARGET_deposit_i32_valid(ofs, len) \
(((ofs) == 0 && (len) == 8) || ((ofs) == 8 && (len) == 8) || \
diff --git a/tcg/i386/tcg-target.inc.c b/tcg/i386/tcg-target.inc.c
index feec40a412..94007c7aa5 100644
--- a/tcg/i386/tcg-target.inc.c
+++ b/tcg/i386/tcg-target.inc.c
@@ -392,6 +392,18 @@ static inline int tcg_target_const_match(tcg_target_long val, TCGType type,
#define OPC_PCMPGTW (0x65 | P_EXT | P_DATA16)
#define OPC_PCMPGTD (0x66 | P_EXT | P_DATA16)
#define OPC_PCMPGTQ (0x37 | P_EXT38 | P_DATA16)
+#define OPC_PMAXSB (0x3c | P_EXT38 | P_DATA16)
+#define OPC_PMAXSW (0xee | P_EXT | P_DATA16)
+#define OPC_PMAXSD (0x3d | P_EXT38 | P_DATA16)
+#define OPC_PMAXUB (0xde | P_EXT | P_DATA16)
+#define OPC_PMAXUW (0x3e | P_EXT38 | P_DATA16)
+#define OPC_PMAXUD (0x3f | P_EXT38 | P_DATA16)
+#define OPC_PMINSB (0x38 | P_EXT38 | P_DATA16)
+#define OPC_PMINSW (0xea | P_EXT | P_DATA16)
+#define OPC_PMINSD (0x39 | P_EXT38 | P_DATA16)
+#define OPC_PMINUB (0xda | P_EXT | P_DATA16)
+#define OPC_PMINUW (0x3a | P_EXT38 | P_DATA16)
+#define OPC_PMINUD (0x3b | P_EXT38 | P_DATA16)
#define OPC_PMOVSXBW (0x20 | P_EXT38 | P_DATA16)
#define OPC_PMOVSXWD (0x23 | P_EXT38 | P_DATA16)
#define OPC_PMOVSXDQ (0x25 | P_EXT38 | P_DATA16)
@@ -2638,6 +2650,18 @@ static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc,
static int const packus_insn[4] = {
OPC_PACKUSWB, OPC_PACKUSDW, OPC_UD2, OPC_UD2
};
+ static int const smin_insn[4] = {
+ OPC_PMINSB, OPC_PMINSW, OPC_PMINSD, OPC_UD2
+ };
+ static int const smax_insn[4] = {
+ OPC_PMAXSB, OPC_PMAXSW, OPC_PMAXSD, OPC_UD2
+ };
+ static int const umin_insn[4] = {
+ OPC_PMINUB, OPC_PMINUW, OPC_PMINUD, OPC_UD2
+ };
+ static int const umax_insn[4] = {
+ OPC_PMAXUB, OPC_PMAXUW, OPC_PMAXUD, OPC_UD2
+ };
TCGType type = vecl + TCG_TYPE_V64;
int insn, sub;
@@ -2678,6 +2702,18 @@ static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc,
case INDEX_op_xor_vec:
insn = OPC_PXOR;
goto gen_simd;
+ case INDEX_op_smin_vec:
+ insn = smin_insn[vece];
+ goto gen_simd;
+ case INDEX_op_umin_vec:
+ insn = umin_insn[vece];
+ goto gen_simd;
+ case INDEX_op_smax_vec:
+ insn = smax_insn[vece];
+ goto gen_simd;
+ case INDEX_op_umax_vec:
+ insn = umax_insn[vece];
+ goto gen_simd;
case INDEX_op_x86_punpckl_vec:
insn = punpckl_insn[vece];
goto gen_simd;
@@ -3043,6 +3079,10 @@ static const TCGTargetOpDef *tcg_target_op_def(TCGOpcode op)
case INDEX_op_usadd_vec:
case INDEX_op_sssub_vec:
case INDEX_op_ussub_vec:
+ case INDEX_op_smin_vec:
+ case INDEX_op_umin_vec:
+ case INDEX_op_smax_vec:
+ case INDEX_op_umax_vec:
case INDEX_op_cmp_vec:
case INDEX_op_x86_shufps_vec:
case INDEX_op_x86_blend_vec:
@@ -3115,6 +3155,11 @@ int tcg_can_emit_vec_op(TCGOpcode opc, TCGType type, unsigned vece)
case INDEX_op_sssub_vec:
case INDEX_op_ussub_vec:
return vece <= MO_16;
+ case INDEX_op_smin_vec:
+ case INDEX_op_smax_vec:
+ case INDEX_op_umin_vec:
+ case INDEX_op_umax_vec:
+ return vece <= MO_32 ? 1 : -1;
default:
return 0;
@@ -3343,6 +3388,25 @@ static void expand_vec_cmp(TCGType type, unsigned vece, TCGv_vec v0,
}
}
+static void expand_vec_minmax(TCGType type, unsigned vece,
+ TCGCond cond, bool min,
+ TCGv_vec v0, TCGv_vec v1, TCGv_vec v2)
+{
+ TCGv_vec t1 = tcg_temp_new_vec(type);
+
+ tcg_debug_assert(vece == MO_64);
+
+ tcg_gen_cmp_vec(cond, vece, t1, v1, v2);
+ if (min) {
+ TCGv_vec t2;
+ t2 = v1, v1 = v2, v2 = t2;
+ }
+ vec_gen_4(INDEX_op_x86_vpblendvb_vec, type, vece,
+ tcgv_vec_arg(v0), tcgv_vec_arg(v1),
+ tcgv_vec_arg(v2), tcgv_vec_arg(t1));
+ tcg_temp_free_vec(t1);
+}
+
void tcg_expand_vec_op(TCGOpcode opc, TCGType type, unsigned vece,
TCGArg a0, ...)
{
@@ -3375,6 +3439,23 @@ void tcg_expand_vec_op(TCGOpcode opc, TCGType type, unsigned vece,
expand_vec_cmp(type, vece, v0, v1, v2, va_arg(va, TCGArg));
break;
+ case INDEX_op_smin_vec:
+ v2 = temp_tcgv_vec(arg_temp(a2));
+ expand_vec_minmax(type, vece, TCG_COND_GT, true, v0, v1, v2);
+ break;
+ case INDEX_op_smax_vec:
+ v2 = temp_tcgv_vec(arg_temp(a2));
+ expand_vec_minmax(type, vece, TCG_COND_GT, false, v0, v1, v2);
+ break;
+ case INDEX_op_umin_vec:
+ v2 = temp_tcgv_vec(arg_temp(a2));
+ expand_vec_minmax(type, vece, TCG_COND_GTU, true, v0, v1, v2);
+ break;
+ case INDEX_op_umax_vec:
+ v2 = temp_tcgv_vec(arg_temp(a2));
+ expand_vec_minmax(type, vece, TCG_COND_GTU, false, v0, v1, v2);
+ break;
+
default:
break;
}
--
2.17.2
^ permalink raw reply related [flat|nested] 18+ messages in thread
* [Qemu-devel] [PATCH v2 09/10] tcg/aarch64: Implement vector saturating arithmetic
2019-01-04 22:31 [Qemu-devel] [PATCH v2 00/10] tcg vector improvements Richard Henderson
` (7 preceding siblings ...)
2019-01-04 22:31 ` [Qemu-devel] [PATCH v2 08/10] tcg/i386: Implement vector minmax arithmetic Richard Henderson
@ 2019-01-04 22:31 ` Richard Henderson
2019-01-04 22:31 ` [Qemu-devel] [PATCH v2 10/10] tcg/aarch64: Implement vector minmax arithmetic Richard Henderson
2019-01-07 13:11 ` [Qemu-devel] [PATCH v2 00/10] tcg vector improvements Mark Cave-Ayland
10 siblings, 0 replies; 18+ messages in thread
From: Richard Henderson @ 2019-01-04 22:31 UTC (permalink / raw)
To: qemu-devel
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
tcg/aarch64/tcg-target.h | 2 +-
tcg/aarch64/tcg-target.inc.c | 24 ++++++++++++++++++++++++
2 files changed, 25 insertions(+), 1 deletion(-)
diff --git a/tcg/aarch64/tcg-target.h b/tcg/aarch64/tcg-target.h
index 545a6eec75..a1884543d0 100644
--- a/tcg/aarch64/tcg-target.h
+++ b/tcg/aarch64/tcg-target.h
@@ -135,7 +135,7 @@ typedef enum {
#define TCG_TARGET_HAS_shv_vec 0
#define TCG_TARGET_HAS_cmp_vec 1
#define TCG_TARGET_HAS_mul_vec 1
-#define TCG_TARGET_HAS_sat_vec 0
+#define TCG_TARGET_HAS_sat_vec 1
#define TCG_TARGET_HAS_minmax_vec 0
#define TCG_TARGET_DEFAULT_MO (0)
diff --git a/tcg/aarch64/tcg-target.inc.c b/tcg/aarch64/tcg-target.inc.c
index 0562e0aa40..b2b011f130 100644
--- a/tcg/aarch64/tcg-target.inc.c
+++ b/tcg/aarch64/tcg-target.inc.c
@@ -528,6 +528,10 @@ typedef enum {
I3616_CMHI = 0x2e203400,
I3616_CMHS = 0x2e203c00,
I3616_CMEQ = 0x2e208c00,
+ I3616_SQADD = 0x0e200c00,
+ I3616_SQSUB = 0x0e202c00,
+ I3616_UQADD = 0x2e200c00,
+ I3616_UQSUB = 0x2e202c00,
/* AdvSIMD two-reg misc. */
I3617_CMGT0 = 0x0e208800,
@@ -2137,6 +2141,18 @@ static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc,
case INDEX_op_orc_vec:
tcg_out_insn(s, 3616, ORN, is_q, 0, a0, a1, a2);
break;
+ case INDEX_op_ssadd_vec:
+ tcg_out_insn(s, 3616, SQADD, is_q, vece, a0, a1, a2);
+ break;
+ case INDEX_op_sssub_vec:
+ tcg_out_insn(s, 3616, SQSUB, is_q, vece, a0, a1, a2);
+ break;
+ case INDEX_op_usadd_vec:
+ tcg_out_insn(s, 3616, UQADD, is_q, vece, a0, a1, a2);
+ break;
+ case INDEX_op_ussub_vec:
+ tcg_out_insn(s, 3616, UQSUB, is_q, vece, a0, a1, a2);
+ break;
case INDEX_op_not_vec:
tcg_out_insn(s, 3617, NOT, is_q, 0, a0, a1);
break;
@@ -2207,6 +2223,10 @@ int tcg_can_emit_vec_op(TCGOpcode opc, TCGType type, unsigned vece)
case INDEX_op_shli_vec:
case INDEX_op_shri_vec:
case INDEX_op_sari_vec:
+ case INDEX_op_ssadd_vec:
+ case INDEX_op_sssub_vec:
+ case INDEX_op_usadd_vec:
+ case INDEX_op_ussub_vec:
return 1;
case INDEX_op_mul_vec:
return vece < MO_64;
@@ -2386,6 +2406,10 @@ static const TCGTargetOpDef *tcg_target_op_def(TCGOpcode op)
case INDEX_op_xor_vec:
case INDEX_op_andc_vec:
case INDEX_op_orc_vec:
+ case INDEX_op_ssadd_vec:
+ case INDEX_op_sssub_vec:
+ case INDEX_op_usadd_vec:
+ case INDEX_op_ussub_vec:
return &w_w_w;
case INDEX_op_not_vec:
case INDEX_op_neg_vec:
--
2.17.2
^ permalink raw reply related [flat|nested] 18+ messages in thread
* [Qemu-devel] [PATCH v2 10/10] tcg/aarch64: Implement vector minmax arithmetic
2019-01-04 22:31 [Qemu-devel] [PATCH v2 00/10] tcg vector improvements Richard Henderson
` (8 preceding siblings ...)
2019-01-04 22:31 ` [Qemu-devel] [PATCH v2 09/10] tcg/aarch64: Implement vector saturating arithmetic Richard Henderson
@ 2019-01-04 22:31 ` Richard Henderson
2019-01-07 13:11 ` [Qemu-devel] [PATCH v2 00/10] tcg vector improvements Mark Cave-Ayland
10 siblings, 0 replies; 18+ messages in thread
From: Richard Henderson @ 2019-01-04 22:31 UTC (permalink / raw)
To: qemu-devel
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
tcg/aarch64/tcg-target.h | 2 +-
tcg/aarch64/tcg-target.inc.c | 24 ++++++++++++++++++++++++
2 files changed, 25 insertions(+), 1 deletion(-)
diff --git a/tcg/aarch64/tcg-target.h b/tcg/aarch64/tcg-target.h
index a1884543d0..2d93cf404e 100644
--- a/tcg/aarch64/tcg-target.h
+++ b/tcg/aarch64/tcg-target.h
@@ -136,7 +136,7 @@ typedef enum {
#define TCG_TARGET_HAS_cmp_vec 1
#define TCG_TARGET_HAS_mul_vec 1
#define TCG_TARGET_HAS_sat_vec 1
-#define TCG_TARGET_HAS_minmax_vec 0
+#define TCG_TARGET_HAS_minmax_vec 1
#define TCG_TARGET_DEFAULT_MO (0)
#define TCG_TARGET_HAS_MEMORY_BSWAP 1
diff --git a/tcg/aarch64/tcg-target.inc.c b/tcg/aarch64/tcg-target.inc.c
index b2b011f130..ee0d5819af 100644
--- a/tcg/aarch64/tcg-target.inc.c
+++ b/tcg/aarch64/tcg-target.inc.c
@@ -528,8 +528,12 @@ typedef enum {
I3616_CMHI = 0x2e203400,
I3616_CMHS = 0x2e203c00,
I3616_CMEQ = 0x2e208c00,
+ I3616_SMAX = 0x0e206400,
+ I3616_SMIN = 0x0e206c00,
I3616_SQADD = 0x0e200c00,
I3616_SQSUB = 0x0e202c00,
+ I3616_UMAX = 0x2e206400,
+ I3616_UMIN = 0x2e206c00,
I3616_UQADD = 0x2e200c00,
I3616_UQSUB = 0x2e202c00,
@@ -2153,6 +2157,18 @@ static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc,
case INDEX_op_ussub_vec:
tcg_out_insn(s, 3616, UQSUB, is_q, vece, a0, a1, a2);
break;
+ case INDEX_op_smax_vec:
+ tcg_out_insn(s, 3616, SMAX, is_q, vece, a0, a1, a2);
+ break;
+ case INDEX_op_smin_vec:
+ tcg_out_insn(s, 3616, SMIN, is_q, vece, a0, a1, a2);
+ break;
+ case INDEX_op_umax_vec:
+ tcg_out_insn(s, 3616, UMAX, is_q, vece, a0, a1, a2);
+ break;
+ case INDEX_op_umin_vec:
+ tcg_out_insn(s, 3616, UMIN, is_q, vece, a0, a1, a2);
+ break;
case INDEX_op_not_vec:
tcg_out_insn(s, 3617, NOT, is_q, 0, a0, a1);
break;
@@ -2227,6 +2243,10 @@ int tcg_can_emit_vec_op(TCGOpcode opc, TCGType type, unsigned vece)
case INDEX_op_sssub_vec:
case INDEX_op_usadd_vec:
case INDEX_op_ussub_vec:
+ case INDEX_op_smax_vec:
+ case INDEX_op_smin_vec:
+ case INDEX_op_umax_vec:
+ case INDEX_op_umin_vec:
return 1;
case INDEX_op_mul_vec:
return vece < MO_64;
@@ -2410,6 +2430,10 @@ static const TCGTargetOpDef *tcg_target_op_def(TCGOpcode op)
case INDEX_op_sssub_vec:
case INDEX_op_usadd_vec:
case INDEX_op_ussub_vec:
+ case INDEX_op_smax_vec:
+ case INDEX_op_smin_vec:
+ case INDEX_op_umax_vec:
+ case INDEX_op_umin_vec:
return &w_w_w;
case INDEX_op_not_vec:
case INDEX_op_neg_vec:
--
2.17.2
^ permalink raw reply related [flat|nested] 18+ messages in thread
* Re: [Qemu-devel] [PATCH v2 00/10] tcg vector improvements
2019-01-04 22:31 [Qemu-devel] [PATCH v2 00/10] tcg vector improvements Richard Henderson
` (9 preceding siblings ...)
2019-01-04 22:31 ` [Qemu-devel] [PATCH v2 10/10] tcg/aarch64: Implement vector minmax arithmetic Richard Henderson
@ 2019-01-07 13:11 ` Mark Cave-Ayland
2019-01-23 5:09 ` Richard Henderson
10 siblings, 1 reply; 18+ messages in thread
From: Mark Cave-Ayland @ 2019-01-07 13:11 UTC (permalink / raw)
To: Richard Henderson, qemu-devel; +Cc: Howard Spoelstra
On 04/01/2019 22:31, Richard Henderson wrote:
> I've split this out from the target/ppc patch set in which
> it was developed.
>
>
> r~
>
>
> Richard Henderson (10):
> tcg: Add logical simplifications during gvec expand
> tcg: Add gvec expanders for nand, nor, eqv
> tcg: Add write_aofs to GVecGen4
> tcg: Add opcodes for vector saturated arithmetic
> tcg: Add opcodes for vector minmax arithmetic
> tcg/i386: Split subroutines out of tcg_expand_vec_op
> tcg/i386: Implement vector saturating arithmetic
> tcg/i386: Implement vector minmax arithmetic
> tcg/aarch64: Implement vector saturating arithmetic
> tcg/aarch64: Implement vector minmax arithmetic
>
> accel/tcg/tcg-runtime.h | 23 ++
> tcg/aarch64/tcg-target.h | 2 +
> tcg/i386/tcg-target.h | 2 +
> tcg/tcg-op-gvec.h | 18 ++
> tcg/tcg-op.h | 11 +
> tcg/tcg-opc.h | 8 +
> tcg/tcg.h | 2 +
> accel/tcg/tcg-runtime-gvec.c | 257 ++++++++++++++++
> tcg/aarch64/tcg-target.inc.c | 48 +++
> tcg/i386/tcg-target.inc.c | 580 +++++++++++++++++++++--------------
> tcg/tcg-op-gvec.c | 305 ++++++++++++++++--
> tcg/tcg-op-vec.c | 75 ++++-
> tcg/tcg.c | 10 +
> 13 files changed, 1078 insertions(+), 263 deletions(-)
Not sure that I'm particularly qualified to give this an R-B; however, should there be
a corresponding update to tcg/README for the new instructions?
One other thing is that Howard sent me off-list a backtrace from trying my combined
branch at https://github.com/mcayland/qemu/tree/ppc-altivec-v5.5-rth booting MacOS
10.5 in the guest and ended up hitting an assert in an --enable-debug build:
Thread 5 (Thread 0x7fffe3fff700 (LWP 10627)):
#0 0x00007ffff698d53f in raise () at /lib64/libc.so.6
#1 0x00007ffff6977895 in abort () at /lib64/libc.so.6
#2 0x00007ffff6977769 in _nl_load_domain.cold.0 () at /lib64/libc.so.6
#3 0x00007ffff69859f6 in .annobin_assert.c_end () at /lib64/libc.so.6
#4 0x000055555584bb67 in do_op3 (vece=2, r=0x1848, a=0x18b8,
b=0x18f0, opc=INDEX_op_ssadd_vec) at
/home/hsp/src/qemu-altivec-55/tcg/tcg-op-vec.c:407
rt = 0x7fffdc002368
at = 0x7fffdc0023d8
bt = 0x7fffdc002410
ri = 140736884384616
ai = 140736884384728
bi = 140736884384784
type = TCG_TYPE_V128
can = 0
__PRETTY_FUNCTION__ = "do_op3"
#5 0x000055555584bbfa in tcg_gen_ssadd_vec (vece=2, r=0x1848,
a=0x18b8, b=0x18f0) at
/home/hsp/src/qemu-altivec-55/tcg/tcg-op-vec.c:419
#6 0x0000555555991887 in gen_vaddsws_vec (vece=2, t=0x1848,
sat=0x1880, a=0x18b8, b=0x18f0) at
/home/hsp/src/qemu-altivec-55/target/ppc/translate/vmx-impl.inc.c:597
x = 0x1928
#7 0x0000555555852e53 in expand_4_vec (vece=2, dofs=197872,
aofs=198288, bofs=197776, cofs=197792, oprsz=16, tysz=16,
type=TCG_TYPE_V128, write_aofs=true, fni=0x55555599182a
<gen_vaddsws_vec>) at
/home/hsp/src/qemu-altivec-55/tcg/tcg-op-gvec.c:903
t0 = 0x1848
t1 = 0x1880
t2 = 0x18b8
t3 = 0x18f0
i = 0
#8 0x0000555555853cc4 in tcg_gen_gvec_4 (dofs=197872, aofs=198288,
bofs=197776, cofs=197792, oprsz=16, maxsz=16, g=0x5555562d33c0 <g>) at
/home/hsp/src/qemu-altivec-55/tcg/tcg-op-gvec.c:1211
type = TCG_TYPE_V128
some = 21845
__PRETTY_FUNCTION__ = "tcg_gen_gvec_4"
__func__ = "tcg_gen_gvec_4"
#9 0x0000555555991987 in gen_vaddsws (ctx=0x7fffe3ffe5f0) at
/home/hsp/src/qemu-altivec-55/target/ppc/translate/vmx-impl.inc.c:597
g = {fni8 = 0x0, fni4 = 0x0, fniv = 0x55555599182a
<gen_vaddsws_vec>, fno = 0x5555559601a1 <gen_helper_vaddsws>, opc =
INDEX_op_add_vec, data = 0, vece = 2 '\002', prefer_i64 = false,
write_aofs = true}
Certainly according to patch 7 of the series only 8-bit and 16-bit accesses are
supported on i386 hosts, but shouldn't we be falling back to the previous
implementations rather than hitting an assert()?
ATB,
Mark.
^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [Qemu-devel] [PATCH v2 00/10] tcg vector improvements
2019-01-07 13:11 ` [Qemu-devel] [PATCH v2 00/10] tcg vector improvements Mark Cave-Ayland
@ 2019-01-23 5:09 ` Richard Henderson
2019-02-05 21:29 ` Mark Cave-Ayland
0 siblings, 1 reply; 18+ messages in thread
From: Richard Henderson @ 2019-01-23 5:09 UTC (permalink / raw)
To: Mark Cave-Ayland, qemu-devel; +Cc: Howard Spoelstra
On 1/7/19 5:11 AM, Mark Cave-Ayland wrote:
> #7 0x0000555555852e53 in expand_4_vec (vece=2, dofs=197872,
> aofs=198288, bofs=197776, cofs=197792, oprsz=16, tysz=16,
> type=TCG_TYPE_V128, write_aofs=true, fni=0x55555599182a
> <gen_vaddsws_vec>) at
> /home/hsp/src/qemu-altivec-55/tcg/tcg-op-gvec.c:903
> t0 = 0x1848
> t1 = 0x1880
> t2 = 0x18b8
> t3 = 0x18f0
> i = 0
> #8 0x0000555555853cc4 in tcg_gen_gvec_4 (dofs=197872, aofs=198288,
> bofs=197776, cofs=197792, oprsz=16, maxsz=16, g=0x5555562d33c0 <g>) at
> /home/hsp/src/qemu-altivec-55/tcg/tcg-op-gvec.c:1211
> type = TCG_TYPE_V128
> some = 21845
> __PRETTY_FUNCTION__ = "tcg_gen_gvec_4"
> __func__ = "tcg_gen_gvec_4"
> #9 0x0000555555991987 in gen_vaddsws (ctx=0x7fffe3ffe5f0) at
> /home/hsp/src/qemu-altivec-55/target/ppc/translate/vmx-impl.inc.c:597
> g = {fni8 = 0x0, fni4 = 0x0, fniv = 0x55555599182a
> <gen_vaddsws_vec>, fno = 0x5555559601a1 <gen_helper_vaddsws>, opc =
> INDEX_op_add_vec, data = 0, vece = 2 '\002', prefer_i64 = false,
> write_aofs = true}
>
>
> Certainly according to patch 7 of the series only 8-bit and 16-bit accesses are
> supported on i386 hosts, but shouldn't we be falling back to the previous
> implementations rather than hitting an assert()?
In here:
#define GEN_VXFORM_SAT(NAME, VECE, NORM, SAT, OPC2, OPC3) \
static void glue(glue(gen_, NAME), _vec)(unsigned vece, TCGv_vec t, \
TCGv_vec sat, TCGv_vec a, \
TCGv_vec b) \
{ \
TCGv_vec x = tcg_temp_new_vec_matching(t); \
glue(glue(tcg_gen_, NORM), _vec)(VECE, x, a, b); \
glue(glue(tcg_gen_, SAT), _vec)(VECE, t, a, b); \
tcg_gen_cmp_vec(TCG_COND_NE, VECE, x, x, t); \
tcg_gen_or_vec(VECE, sat, sat, x); \
tcg_temp_free_vec(x); \
} \
static void glue(gen_, NAME)(DisasContext *ctx) \
{ \
static const GVecGen4 g = { \
.fniv = glue(glue(gen_, NAME), _vec), \
.fno = glue(gen_helper_, NAME), \
.opc = glue(glue(INDEX_op_, NORM), _vec), \
s/NORM/SAT/, so that we query whether the saturated opcode is supported. The
normal arithmetic, cmp, and or opcodes are mandatory; we don't need to do
anything with those.
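Concretely, the suggested one-line change would read roughly as follows (a sketch
only, with the rest of the macro as quoted above):

        .opc = glue(glue(INDEX_op_, SAT), _vec),                        \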
r~
^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [Qemu-devel] [PATCH v2 00/10] tcg vector improvements
2019-01-23 5:09 ` Richard Henderson
@ 2019-02-05 21:29 ` Mark Cave-Ayland
2019-02-06 3:37 ` Richard Henderson
2019-02-07 22:31 ` Mark Cave-Ayland
0 siblings, 2 replies; 18+ messages in thread
From: Mark Cave-Ayland @ 2019-02-05 21:29 UTC (permalink / raw)
To: Richard Henderson, qemu-devel; +Cc: Howard Spoelstra
On 23/01/2019 05:09, Richard Henderson wrote:
> On 1/7/19 5:11 AM, Mark Cave-Ayland wrote:
>> #7 0x0000555555852e53 in expand_4_vec (vece=2, dofs=197872,
>> aofs=198288, bofs=197776, cofs=197792, oprsz=16, tysz=16,
>> type=TCG_TYPE_V128, write_aofs=true, fni=0x55555599182a
>> <gen_vaddsws_vec>) at
>> /home/hsp/src/qemu-altivec-55/tcg/tcg-op-gvec.c:903
>> t0 = 0x1848
>> t1 = 0x1880
>> t2 = 0x18b8
>> t3 = 0x18f0
>> i = 0
>> #8 0x0000555555853cc4 in tcg_gen_gvec_4 (dofs=197872, aofs=198288,
>> bofs=197776, cofs=197792, oprsz=16, maxsz=16, g=0x5555562d33c0 <g>) at
>> /home/hsp/src/qemu-altivec-55/tcg/tcg-op-gvec.c:1211
>> type = TCG_TYPE_V128
>> some = 21845
>> __PRETTY_FUNCTION__ = "tcg_gen_gvec_4"
>> __func__ = "tcg_gen_gvec_4"
>> #9 0x0000555555991987 in gen_vaddsws (ctx=0x7fffe3ffe5f0) at
>> /home/hsp/src/qemu-altivec-55/target/ppc/translate/vmx-impl.inc.c:597
>> g = {fni8 = 0x0, fni4 = 0x0, fniv = 0x55555599182a
>> <gen_vaddsws_vec>, fno = 0x5555559601a1 <gen_helper_vaddsws>, opc =
>> INDEX_op_add_vec, data = 0, vece = 2 '\002', prefer_i64 = false,
>> write_aofs = true}
>>
>>
>> Certainly according to patch 7 of the series only 8-bit and 16-bit accesses are
>> supported on i386 hosts, but shouldn't we be falling back to the previous
>> implementations rather than hitting an assert()?
>
> In here:
>
> #define GEN_VXFORM_SAT(NAME, VECE, NORM, SAT, OPC2, OPC3) \
> static void glue(glue(gen_, NAME), _vec)(unsigned vece, TCGv_vec t, \
> TCGv_vec sat, TCGv_vec a, \
> TCGv_vec b) \
> { \
> TCGv_vec x = tcg_temp_new_vec_matching(t); \
> glue(glue(tcg_gen_, NORM), _vec)(VECE, x, a, b); \
> glue(glue(tcg_gen_, SAT), _vec)(VECE, t, a, b); \
> tcg_gen_cmp_vec(TCG_COND_NE, VECE, x, x, t); \
> tcg_gen_or_vec(VECE, sat, sat, x); \
> tcg_temp_free_vec(x); \
> } \
> static void glue(gen_, NAME)(DisasContext *ctx) \
> { \
> static const GVecGen4 g = { \
> .fniv = glue(glue(gen_, NAME), _vec), \
> .fno = glue(gen_helper_, NAME), \
> .opc = glue(glue(INDEX_op_, NORM), _vec), \
>
> s/NORM/SAT/, so that we query whether the saturated opcode is supported. The
> normal arithmetic, cmp, and or opcodes are mandatory; we don't need to do
> anything with those.
Now that this and the other pre-requisite patches have been merged into master, I've
rebased the outstanding PPC parts of your "tcg, target/ppc vector improvements" on
master including the above fix and pushed the result to
https://github.com/mcayland/qemu/commits/ppc-altivec-v6.
The good news is that the graphics corruption I originally noticed, caused by the
patch introducing the saturating add/sub vector ops, has now gone. With my
little-endian vsplt fix included, both OS X and MacOS 9 appear to run without any
obvious issues on an x86 host, and certainly feel smoother than before.
The only minor question I had with the patchset in its current form is whether to use
the new VsrD() macro for vscr_sat, or whether we don't really care enough?
ATB,
Mark.
^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [Qemu-devel] [PATCH v2 00/10] tcg vector improvements
2019-02-05 21:29 ` Mark Cave-Ayland
@ 2019-02-06 3:37 ` Richard Henderson
2019-02-06 7:15 ` Mark Cave-Ayland
2019-02-07 22:31 ` Mark Cave-Ayland
1 sibling, 1 reply; 18+ messages in thread
From: Richard Henderson @ 2019-02-06 3:37 UTC (permalink / raw)
To: Mark Cave-Ayland, qemu-devel; +Cc: Howard Spoelstra
On 2/5/19 9:29 PM, Mark Cave-Ayland wrote:
> The only minor question I had with the patchset in its current form is whether to use
> the new VsrD() macro for vscr_sat, or whether we don't really care enough?
Given the comment
/* Which bit we set is completely arbitrary, but clear the rest. */
I don't think VsrD is helpful.
In "target/ppc: Split out VSCR_SAT to a vector field":
ppc_vsr_t vsr[64] QEMU_ALIGNED(16);
+ /* Non-zero if and only if VSCR_SAT should be set. */
+ ppc_vsr_t vscr_sat;
Better to add the QEMU_ALIGNED(16) to vscr_sat as well. Yes, it will already
happen to be aligned by placement, but this is also a bit of documentation.
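That is, something along these lines (a sketch; the surrounding field is as in
the quoted hunk):

    ppc_vsr_t vsr[64] QEMU_ALIGNED(16);
    /* Non-zero if and only if VSCR_SAT should be set. */
    ppc_vsr_t vscr_sat QEMU_ALIGNED(16);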
In "target/ppc: convert vadd*s and vsub*s to vector operations":
if (sat) { \
- set_vscr_sat(env); \
+ vscr_sat->u32[0] = 1; \
} \
Changed in error?
r~
^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [Qemu-devel] [PATCH v2 00/10] tcg vector improvements
2019-02-06 3:37 ` Richard Henderson
@ 2019-02-06 7:15 ` Mark Cave-Ayland
2019-02-06 15:44 ` BALATON Zoltan
0 siblings, 1 reply; 18+ messages in thread
From: Mark Cave-Ayland @ 2019-02-06 7:15 UTC (permalink / raw)
To: Richard Henderson, qemu-devel; +Cc: Howard Spoelstra
On 06/02/2019 03:37, Richard Henderson wrote:
> On 2/5/19 9:29 PM, Mark Cave-Ayland wrote:
>> The only minor question I had with the patchset in its current form is whether to use
>> the new VsrD() macro for vscr_sat, or whether we don't really care enough?
>
> Given the comment
>
> /* Which bit we set is completely arbitrary, but clear the rest. */
>
> I don't think VsrD is helpful.
Okay, I can leave that for now.
> In "target/ppc: Split out VSCR_SAT to a vector field":
>
> ppc_vsr_t vsr[64] QEMU_ALIGNED(16);
> + /* Non-zero if and only if VSCR_SAT should be set. */
> + ppc_vsr_t vscr_sat;
>
> Better to add the QEMU_ALIGNED(16) to vscr_sat as well. Yes, it will already
> happen to be aligned by placement, but this is also a bit documentation.
I've added this to my latest branch.
> In "target/ppc: convert vadd*s and vsub*s to vector operations":
>
> if (sat) { \
> - set_vscr_sat(env); \
> + vscr_sat->u32[0] = 1; \
> } \
>
> Changed in error?
It looks like this was in the original patch, presumably because GEN_VXFORM_SAT
doesn't include the env parameter which was present in GEN_VXFORM_ENV. Should the env
parameter be added to GEN_VXFORM_SAT?
Howard has also pointed out that he is still seeing some corruption in his tests, so
I will do a bit more investigation and report back.
ATB,
Mark.
^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [Qemu-devel] [PATCH v2 00/10] tcg vector improvements
2019-02-05 21:29 ` Mark Cave-Ayland
2019-02-06 3:37 ` Richard Henderson
@ 2019-02-07 22:31 ` Mark Cave-Ayland
1 sibling, 0 replies; 18+ messages in thread
From: Mark Cave-Ayland @ 2019-02-07 22:31 UTC (permalink / raw)
To: Richard Henderson, qemu-devel; +Cc: Howard Spoelstra
On 05/02/2019 21:29, Mark Cave-Ayland wrote:
> Now that this and the other pre-requisite patches have been merged into master, I've
> rebased the outstanding PPC parts of your "tcg, target/ppc vector improvements" on
> master including the above fix and pushed the result to
> https://github.com/mcayland/qemu/commits/ppc-altivec-v6.
>
> The good news is that the graphics corruption I originally noticed, caused by the
> patch introducing the saturating add/sub vector ops, has now gone. With my
> little-endian vsplt fix included, both OS X and MacOS 9 appear to run without any
> obvious issues on an x86 host, and certainly feel smoother than before.
I started to follow up Howard's report that he could still see graphical corruption
with these patches, and I was surprised to notice that it had reappeared locally once
again :/
After a few hours of digging I was able to figure it out: the relevant instructions
weren't being generated for all Mac machines. Patch to follow shortly.
ATB,
Mark.
^ permalink raw reply [flat|nested] 18+ messages in thread