* [Qemu-devel] [PATCH v3 0/6] TCG vectorization and example conversion
@ 2017-09-16 2:34 Richard Henderson
2017-09-16 2:34 ` [Qemu-devel] [PATCH v3 1/6] tcg: Add types and operations for host vectors Richard Henderson
` (7 more replies)
0 siblings, 8 replies; 14+ messages in thread
From: Richard Henderson @ 2017-09-16 2:34 UTC (permalink / raw)
To: qemu-devel; +Cc: alex.bennee, f4bug
Now addressing the complex vector op issue. I now expose TCGv_vec
to target front-ends, but opaque wrt the vector size. One can thus
compose vector operations, as demonstrated in target/arm/.
The actual host vector length now becomes an argument to the *_vec
opcodes. It's a little awkward, but does prevent an explosion of
opcode values.
All R-b dropped because all patches rewritten or heavily modified.
Whacha think?
r~
Richard Henderson (6):
tcg: Add types and operations for host vectors
tcg: Add vector expanders
target/arm: Align vector registers
target/arm: Use vector infrastructure for aa64 add/sub/logic
tcg/i386: Add vector operations
tcg/aarch64: Add vector operations
Makefile.target | 2 +-
accel/tcg/tcg-runtime.h | 24 ++
target/arm/cpu.h | 2 +-
tcg/aarch64/tcg-target.h | 20 +-
tcg/i386/tcg-target.h | 36 +-
tcg/tcg-gvec-desc.h | 49 +++
tcg/tcg-op-gvec.h | 143 ++++++++
tcg/tcg-op.h | 26 ++
tcg/tcg-opc.h | 37 ++
tcg/tcg.h | 34 ++
accel/tcg/tcg-runtime-gvec.c | 255 +++++++++++++
target/arm/translate-a64.c | 216 +++++++----
tcg/aarch64/tcg-target.inc.c | 340 ++++++++++++++---
tcg/i386/tcg-target.inc.c | 423 ++++++++++++++++++---
tcg/tcg-op-gvec.c | 853 +++++++++++++++++++++++++++++++++++++++++++
tcg/tcg-op.c | 234 ++++++++++++
tcg/tcg.c | 77 +++-
accel/tcg/Makefile.objs | 2 +-
tcg/README | 46 +++
19 files changed, 2651 insertions(+), 168 deletions(-)
create mode 100644 tcg/tcg-gvec-desc.h
create mode 100644 tcg/tcg-op-gvec.h
create mode 100644 accel/tcg/tcg-runtime-gvec.c
create mode 100644 tcg/tcg-op-gvec.c
--
2.13.5
^ permalink raw reply [flat|nested] 14+ messages in thread
* [Qemu-devel] [PATCH v3 1/6] tcg: Add types and operations for host vectors
2017-09-16 2:34 [Qemu-devel] [PATCH v3 0/6] TCG vectorization and example conversion Richard Henderson
@ 2017-09-16 2:34 ` Richard Henderson
2017-09-26 19:28 ` Alex Bennée
2017-09-16 2:34 ` [Qemu-devel] [PATCH v3 2/6] tcg: Add vector expanders Richard Henderson
` (6 subsequent siblings)
7 siblings, 1 reply; 14+ messages in thread
From: Richard Henderson @ 2017-09-16 2:34 UTC (permalink / raw)
To: qemu-devel; +Cc: alex.bennee, f4bug
Nothing uses or enables them yet.
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
tcg/tcg-op.h | 26 +++++++
tcg/tcg-opc.h | 37 ++++++++++
tcg/tcg.h | 34 +++++++++
tcg/tcg-op.c | 234 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
tcg/tcg.c | 77 ++++++++++++++++++-
tcg/README | 46 ++++++++++++
6 files changed, 453 insertions(+), 1 deletion(-)
diff --git a/tcg/tcg-op.h b/tcg/tcg-op.h
index 5d3278f243..b9b0b9f46f 100644
--- a/tcg/tcg-op.h
+++ b/tcg/tcg-op.h
@@ -915,6 +915,32 @@ void tcg_gen_atomic_or_fetch_i64(TCGv_i64, TCGv, TCGv_i64, TCGArg, TCGMemOp);
void tcg_gen_atomic_xor_fetch_i32(TCGv_i32, TCGv, TCGv_i32, TCGArg, TCGMemOp);
void tcg_gen_atomic_xor_fetch_i64(TCGv_i64, TCGv, TCGv_i64, TCGArg, TCGMemOp);
+void tcg_gen_mov_vec(TCGv_vec, TCGv_vec);
+void tcg_gen_movi_vec(TCGv_vec, tcg_target_long);
+void tcg_gen_add8_vec(TCGv_vec r, TCGv_vec a, TCGv_vec b);
+void tcg_gen_add16_vec(TCGv_vec r, TCGv_vec a, TCGv_vec b);
+void tcg_gen_add32_vec(TCGv_vec r, TCGv_vec a, TCGv_vec b);
+void tcg_gen_add64_vec(TCGv_vec r, TCGv_vec a, TCGv_vec b);
+void tcg_gen_sub8_vec(TCGv_vec r, TCGv_vec a, TCGv_vec b);
+void tcg_gen_sub16_vec(TCGv_vec r, TCGv_vec a, TCGv_vec b);
+void tcg_gen_sub32_vec(TCGv_vec r, TCGv_vec a, TCGv_vec b);
+void tcg_gen_sub64_vec(TCGv_vec r, TCGv_vec a, TCGv_vec b);
+void tcg_gen_and_vec(TCGv_vec r, TCGv_vec a, TCGv_vec b);
+void tcg_gen_or_vec(TCGv_vec r, TCGv_vec a, TCGv_vec b);
+void tcg_gen_xor_vec(TCGv_vec r, TCGv_vec a, TCGv_vec b);
+void tcg_gen_andc_vec(TCGv_vec r, TCGv_vec a, TCGv_vec b);
+void tcg_gen_orc_vec(TCGv_vec r, TCGv_vec a, TCGv_vec b);
+void tcg_gen_not_vec(TCGv_vec r, TCGv_vec a);
+void tcg_gen_neg8_vec(TCGv_vec r, TCGv_vec a);
+void tcg_gen_neg16_vec(TCGv_vec r, TCGv_vec a);
+void tcg_gen_neg32_vec(TCGv_vec r, TCGv_vec a);
+void tcg_gen_neg64_vec(TCGv_vec r, TCGv_vec a);
+
+void tcg_gen_ld_vec(TCGv_vec r, TCGv_ptr base, TCGArg offset);
+void tcg_gen_st_vec(TCGv_vec r, TCGv_ptr base, TCGArg offset);
+void tcg_gen_ldz_vec(TCGv_vec r, TCGv_ptr base, TCGArg offset, TCGType sz);
+void tcg_gen_stl_vec(TCGv_vec r, TCGv_ptr base, TCGArg offset, TCGType sz);
+
#if TARGET_LONG_BITS == 64
#define tcg_gen_movi_tl tcg_gen_movi_i64
#define tcg_gen_mov_tl tcg_gen_mov_i64
diff --git a/tcg/tcg-opc.h b/tcg/tcg-opc.h
index 956fb1e9f3..8200184fa9 100644
--- a/tcg/tcg-opc.h
+++ b/tcg/tcg-opc.h
@@ -204,8 +204,45 @@ DEF(qemu_ld_i64, DATA64_ARGS, TLADDR_ARGS, 1,
DEF(qemu_st_i64, 0, TLADDR_ARGS + DATA64_ARGS, 1,
TCG_OPF_CALL_CLOBBER | TCG_OPF_SIDE_EFFECTS | TCG_OPF_64BIT)
+/* Host vector support. */
+
+#define IMPLVEC \
+ IMPL(TCG_TARGET_HAS_v64 | TCG_TARGET_HAS_v128 | TCG_TARGET_HAS_v256)
+
+DEF(mov_vec, 1, 1, 1, TCG_OPF_NOT_PRESENT)
+
+/* ??? Simple, but perhaps dupiN would be more descriptive. */
+DEF(movi_vec, 1, 0, 2, TCG_OPF_NOT_PRESENT)
+
+DEF(ld_vec, 1, 1, 2, IMPLVEC)
+DEF(ldz_vec, 1, 1, 3, IMPLVEC)
+DEF(st_vec, 0, 2, 2, IMPLVEC)
+
+DEF(add8_vec, 1, 2, 1, IMPLVEC)
+DEF(add16_vec, 1, 2, 1, IMPLVEC)
+DEF(add32_vec, 1, 2, 1, IMPLVEC)
+DEF(add64_vec, 1, 2, 1, IMPLVEC)
+
+DEF(sub8_vec, 1, 2, 1, IMPLVEC)
+DEF(sub16_vec, 1, 2, 1, IMPLVEC)
+DEF(sub32_vec, 1, 2, 1, IMPLVEC)
+DEF(sub64_vec, 1, 2, 1, IMPLVEC)
+
+DEF(neg8_vec, 1, 1, 1, IMPLVEC | IMPL(TCG_TARGET_HAS_neg_vec))
+DEF(neg16_vec, 1, 1, 1, IMPLVEC | IMPL(TCG_TARGET_HAS_neg_vec))
+DEF(neg32_vec, 1, 1, 1, IMPLVEC | IMPL(TCG_TARGET_HAS_neg_vec))
+DEF(neg64_vec, 1, 1, 1, IMPLVEC | IMPL(TCG_TARGET_HAS_neg_vec))
+
+DEF(and_vec, 1, 2, 1, IMPLVEC)
+DEF(or_vec, 1, 2, 1, IMPLVEC)
+DEF(xor_vec, 1, 2, 1, IMPLVEC)
+DEF(andc_vec, 1, 2, 1, IMPLVEC | IMPL(TCG_TARGET_HAS_andc_vec))
+DEF(orc_vec, 1, 2, 1, IMPLVEC | IMPL(TCG_TARGET_HAS_orc_vec))
+DEF(not_vec, 1, 1, 1, IMPLVEC | IMPL(TCG_TARGET_HAS_not_vec))
+
#undef TLADDR_ARGS
#undef DATA64_ARGS
#undef IMPL
#undef IMPL64
+#undef IMPLVEC
#undef DEF
diff --git a/tcg/tcg.h b/tcg/tcg.h
index 25662c36d4..7cd356e87f 100644
--- a/tcg/tcg.h
+++ b/tcg/tcg.h
@@ -173,6 +173,16 @@ typedef uint64_t TCGRegSet;
# error "Missing unsigned widening multiply"
#endif
+#ifndef TCG_TARGET_HAS_v64
+#define TCG_TARGET_HAS_v64 0
+#define TCG_TARGET_HAS_v128 0
+#define TCG_TARGET_HAS_v256 0
+#define TCG_TARGET_HAS_neg_vec 0
+#define TCG_TARGET_HAS_not_vec 0
+#define TCG_TARGET_HAS_andc_vec 0
+#define TCG_TARGET_HAS_orc_vec 0
+#endif
+
#ifndef TARGET_INSN_START_EXTRA_WORDS
# define TARGET_INSN_START_WORDS 1
#else
@@ -249,6 +259,11 @@ typedef struct TCGPool {
typedef enum TCGType {
TCG_TYPE_I32,
TCG_TYPE_I64,
+
+ TCG_TYPE_V64,
+ TCG_TYPE_V128,
+ TCG_TYPE_V256,
+
TCG_TYPE_COUNT, /* number of different types */
/* An alias for the size of the host register. */
@@ -399,6 +414,8 @@ typedef tcg_target_ulong TCGArg;
* TCGv_i32 : 32 bit integer type
* TCGv_i64 : 64 bit integer type
* TCGv_ptr : a host pointer type
+ * TCGv_vec : a host vector type; the exact size is not exposed
+ to the CPU front-end code.
* TCGv : an integer type the same size as target_ulong
(an alias for either TCGv_i32 or TCGv_i64)
The compiler's type checking will complain if you mix them
@@ -424,6 +441,7 @@ typedef tcg_target_ulong TCGArg;
typedef struct TCGv_i32_d *TCGv_i32;
typedef struct TCGv_i64_d *TCGv_i64;
typedef struct TCGv_ptr_d *TCGv_ptr;
+typedef struct TCGv_vec_d *TCGv_vec;
typedef TCGv_ptr TCGv_env;
#if TARGET_LONG_BITS == 32
#define TCGv TCGv_i32
@@ -448,6 +466,11 @@ static inline TCGv_ptr QEMU_ARTIFICIAL MAKE_TCGV_PTR(intptr_t i)
return (TCGv_ptr)i;
}
+static inline TCGv_vec QEMU_ARTIFICIAL MAKE_TCGV_VEC(intptr_t i)
+{
+ return (TCGv_vec)i;
+}
+
static inline intptr_t QEMU_ARTIFICIAL GET_TCGV_I32(TCGv_i32 t)
{
return (intptr_t)t;
@@ -463,6 +486,11 @@ static inline intptr_t QEMU_ARTIFICIAL GET_TCGV_PTR(TCGv_ptr t)
return (intptr_t)t;
}
+static inline intptr_t QEMU_ARTIFICIAL GET_TCGV_VEC(TCGv_vec t)
+{
+ return (intptr_t)t;
+}
+
#if TCG_TARGET_REG_BITS == 32
#define TCGV_LOW(t) MAKE_TCGV_I32(GET_TCGV_I64(t))
#define TCGV_HIGH(t) MAKE_TCGV_I32(GET_TCGV_I64(t) + 1)
@@ -471,15 +499,18 @@ static inline intptr_t QEMU_ARTIFICIAL GET_TCGV_PTR(TCGv_ptr t)
#define TCGV_EQUAL_I32(a, b) (GET_TCGV_I32(a) == GET_TCGV_I32(b))
#define TCGV_EQUAL_I64(a, b) (GET_TCGV_I64(a) == GET_TCGV_I64(b))
#define TCGV_EQUAL_PTR(a, b) (GET_TCGV_PTR(a) == GET_TCGV_PTR(b))
+#define TCGV_EQUAL_VEC(a, b) (GET_TCGV_VEC(a) == GET_TCGV_VEC(b))
/* Dummy definition to avoid compiler warnings. */
#define TCGV_UNUSED_I32(x) x = MAKE_TCGV_I32(-1)
#define TCGV_UNUSED_I64(x) x = MAKE_TCGV_I64(-1)
#define TCGV_UNUSED_PTR(x) x = MAKE_TCGV_PTR(-1)
+#define TCGV_UNUSED_VEC(x) x = MAKE_TCGV_VEC(-1)
#define TCGV_IS_UNUSED_I32(x) (GET_TCGV_I32(x) == -1)
#define TCGV_IS_UNUSED_I64(x) (GET_TCGV_I64(x) == -1)
#define TCGV_IS_UNUSED_PTR(x) (GET_TCGV_PTR(x) == -1)
+#define TCGV_IS_UNUSED_VEC(x) (GET_TCGV_VEC(x) == -1)
/* call flags */
/* Helper does not read globals (either directly or through an exception). It
@@ -790,9 +821,12 @@ TCGv_i64 tcg_global_reg_new_i64(TCGReg reg, const char *name);
TCGv_i32 tcg_temp_new_internal_i32(int temp_local);
TCGv_i64 tcg_temp_new_internal_i64(int temp_local);
+TCGv_vec tcg_temp_new_vec(TCGType type);
+TCGv_vec tcg_temp_new_vec_matching(TCGv_vec match);
void tcg_temp_free_i32(TCGv_i32 arg);
void tcg_temp_free_i64(TCGv_i64 arg);
+void tcg_temp_free_vec(TCGv_vec arg);
static inline TCGv_i32 tcg_global_mem_new_i32(TCGv_ptr reg, intptr_t offset,
const char *name)
diff --git a/tcg/tcg-op.c b/tcg/tcg-op.c
index 688d91755b..50b3177e5f 100644
--- a/tcg/tcg-op.c
+++ b/tcg/tcg-op.c
@@ -3072,3 +3072,237 @@ static void tcg_gen_mov2_i64(TCGv_i64 r, TCGv_i64 a, TCGv_i64 b)
GEN_ATOMIC_HELPER(xchg, mov2, 0)
#undef GEN_ATOMIC_HELPER
+
+static void tcg_gen_op2_vec(TCGOpcode opc, TCGv_vec r, TCGv_vec a)
+{
+ TCGArg ri = GET_TCGV_VEC(r);
+ TCGArg ai = GET_TCGV_VEC(a);
+ TCGTemp *rt = &tcg_ctx.temps[ri];
+ TCGTemp *at = &tcg_ctx.temps[ai];
+ TCGType type = rt->base_type;
+
+ tcg_debug_assert(at->base_type == type);
+ tcg_gen_op3(&tcg_ctx, opc, ri, ai, type - TCG_TYPE_V64);
+}
+
+static void tcg_gen_op3_vec(TCGOpcode opc, TCGv_vec r, TCGv_vec a, TCGv_vec b)
+{
+ TCGArg ri = GET_TCGV_VEC(r);
+ TCGArg ai = GET_TCGV_VEC(a);
+ TCGArg bi = GET_TCGV_VEC(b);
+ TCGTemp *rt = &tcg_ctx.temps[ri];
+ TCGTemp *at = &tcg_ctx.temps[ai];
+ TCGTemp *bt = &tcg_ctx.temps[bi];
+ TCGType type = rt->base_type;
+
+ tcg_debug_assert(at->base_type == type);
+ tcg_debug_assert(bt->base_type == type);
+ tcg_gen_op4(&tcg_ctx, opc, ri, ai, bi, type - TCG_TYPE_V64);
+}
+
+void tcg_gen_mov_vec(TCGv_vec r, TCGv_vec a)
+{
+ if (!TCGV_EQUAL_VEC(r, a)) {
+ tcg_gen_op2_vec(INDEX_op_mov_vec, r, a);
+ }
+}
+
+void tcg_gen_movi_vec(TCGv_vec r, tcg_target_long a)
+{
+ TCGArg ri = GET_TCGV_VEC(r);
+ TCGTemp *rt = &tcg_ctx.temps[ri];
+ TCGType type = rt->base_type;
+
+ tcg_debug_assert(a == 0 || a == -1);
+ tcg_gen_op3(&tcg_ctx, INDEX_op_movi_vec, ri, a, type - TCG_TYPE_V64);
+}
+
+void tcg_gen_ld_vec(TCGv_vec r, TCGv_ptr b, TCGArg o)
+{
+ TCGArg ri = GET_TCGV_VEC(r);
+ TCGArg bi = GET_TCGV_PTR(b);
+ TCGTemp *rt = &tcg_ctx.temps[ri];
+ TCGType type = rt->base_type;
+
+ tcg_gen_op4(&tcg_ctx, INDEX_op_ld_vec, ri, bi, o, type - TCG_TYPE_V64);
+}
+
+void tcg_gen_st_vec(TCGv_vec r, TCGv_ptr b, TCGArg o)
+{
+ TCGArg ri = GET_TCGV_VEC(r);
+ TCGArg bi = GET_TCGV_PTR(b);
+ TCGTemp *rt = &tcg_ctx.temps[ri];
+ TCGType type = rt->base_type;
+
+ tcg_gen_op4(&tcg_ctx, INDEX_op_st_vec, ri, bi, o, type - TCG_TYPE_V64);
+}
+
+/* Load data into a vector R from B+O using TYPE. If R is wider than TYPE,
+ fill the high bits with zeros. */
+void tcg_gen_ldz_vec(TCGv_vec r, TCGv_ptr b, TCGArg o, TCGType type)
+{
+ TCGArg ri = GET_TCGV_VEC(r);
+ TCGArg bi = GET_TCGV_PTR(b);
+ TCGTemp *rt = &tcg_ctx.temps[ri];
+ TCGType btype = rt->base_type;
+
+ if (type < btype) {
+ tcg_gen_op5(&tcg_ctx, INDEX_op_ldz_vec, ri, bi, o,
+ type - TCG_TYPE_V64, btype - TCG_TYPE_V64);
+ } else {
+ tcg_debug_assert(type == btype);
+ tcg_gen_op4(&tcg_ctx, INDEX_op_ld_vec, ri, bi, o, type - TCG_TYPE_V64);
+ }
+}
+
+/* Store data from vector R into B+O using TYPE. If R is wider than TYPE,
+ store only the low bits. */
+void tcg_gen_stl_vec(TCGv_vec r, TCGv_ptr b, TCGArg o, TCGType type)
+{
+ TCGArg ri = GET_TCGV_VEC(r);
+ TCGArg bi = GET_TCGV_PTR(b);
+ TCGTemp *rt = &tcg_ctx.temps[ri];
+ TCGType btype = rt->base_type;
+
+ tcg_debug_assert(type <= btype);
+ tcg_gen_op4(&tcg_ctx, INDEX_op_st_vec, ri, bi, o, type - TCG_TYPE_V64);
+}
+
+void tcg_gen_add8_vec(TCGv_vec r, TCGv_vec a, TCGv_vec b)
+{
+ tcg_gen_op3_vec(INDEX_op_add8_vec, r, a, b);
+}
+
+void tcg_gen_add16_vec(TCGv_vec r, TCGv_vec a, TCGv_vec b)
+{
+ tcg_gen_op3_vec(INDEX_op_add16_vec, r, a, b);
+}
+
+void tcg_gen_add32_vec(TCGv_vec r, TCGv_vec a, TCGv_vec b)
+{
+ tcg_gen_op3_vec(INDEX_op_add32_vec, r, a, b);
+}
+
+void tcg_gen_add64_vec(TCGv_vec r, TCGv_vec a, TCGv_vec b)
+{
+ tcg_gen_op3_vec(INDEX_op_add64_vec, r, a, b);
+}
+
+void tcg_gen_sub8_vec(TCGv_vec r, TCGv_vec a, TCGv_vec b)
+{
+ tcg_gen_op3_vec(INDEX_op_sub8_vec, r, a, b);
+}
+
+void tcg_gen_sub16_vec(TCGv_vec r, TCGv_vec a, TCGv_vec b)
+{
+ tcg_gen_op3_vec(INDEX_op_sub16_vec, r, a, b);
+}
+
+void tcg_gen_sub32_vec(TCGv_vec r, TCGv_vec a, TCGv_vec b)
+{
+ tcg_gen_op3_vec(INDEX_op_sub32_vec, r, a, b);
+}
+
+void tcg_gen_sub64_vec(TCGv_vec r, TCGv_vec a, TCGv_vec b)
+{
+ tcg_gen_op3_vec(INDEX_op_sub64_vec, r, a, b);
+}
+
+void tcg_gen_and_vec(TCGv_vec r, TCGv_vec a, TCGv_vec b)
+{
+ tcg_gen_op3_vec(INDEX_op_and_vec, r, a, b);
+}
+
+void tcg_gen_or_vec(TCGv_vec r, TCGv_vec a, TCGv_vec b)
+{
+ tcg_gen_op3_vec(INDEX_op_or_vec, r, a, b);
+}
+
+void tcg_gen_xor_vec(TCGv_vec r, TCGv_vec a, TCGv_vec b)
+{
+ tcg_gen_op3_vec(INDEX_op_xor_vec, r, a, b);
+}
+
+void tcg_gen_andc_vec(TCGv_vec r, TCGv_vec a, TCGv_vec b)
+{
+ if (TCG_TARGET_HAS_andc_vec) {
+ tcg_gen_op3_vec(INDEX_op_andc_vec, r, a, b);
+ } else {
+ TCGv_vec t = tcg_temp_new_vec_matching(r);
+ tcg_gen_not_vec(t, b);
+ tcg_gen_and_vec(r, a, t);
+ tcg_temp_free_vec(t);
+ }
+}
+
+void tcg_gen_orc_vec(TCGv_vec r, TCGv_vec a, TCGv_vec b)
+{
+ if (TCG_TARGET_HAS_orc_vec) {
+ tcg_gen_op3_vec(INDEX_op_orc_vec, r, a, b);
+ } else {
+ TCGv_vec t = tcg_temp_new_vec_matching(r);
+ tcg_gen_not_vec(t, b);
+ tcg_gen_or_vec(r, a, t);
+ tcg_temp_free_vec(t);
+ }
+}
+
+/* Bitwise NOT: r = ~a. Use the target's native not_vec when available;
+   otherwise fall back to r = a ^ -1 via a matching temporary. */
+void tcg_gen_not_vec(TCGv_vec r, TCGv_vec a)
+{
+ if (TCG_TARGET_HAS_not_vec) {
+ /* Was INDEX_op_orc_vec -- copy-paste error from tcg_gen_orc_vec;
+ orc is a 3-operand op and would be emitted with only 2 args. */
+ tcg_gen_op2_vec(INDEX_op_not_vec, r, a);
+ } else {
+ TCGv_vec t = tcg_temp_new_vec_matching(r);
+ tcg_gen_movi_vec(t, -1);
+ tcg_gen_xor_vec(r, a, t);
+ tcg_temp_free_vec(t);
+ }
+}
+
+void tcg_gen_neg8_vec(TCGv_vec r, TCGv_vec a)
+{
+ if (TCG_TARGET_HAS_neg_vec) {
+ tcg_gen_op2_vec(INDEX_op_neg8_vec, r, a);
+ } else {
+ TCGv_vec t = tcg_temp_new_vec_matching(r);
+ tcg_gen_movi_vec(t, 0);
+ tcg_gen_sub8_vec(r, t, a);
+ tcg_temp_free_vec(t);
+ }
+}
+
+void tcg_gen_neg16_vec(TCGv_vec r, TCGv_vec a)
+{
+ if (TCG_TARGET_HAS_neg_vec) {
+ tcg_gen_op2_vec(INDEX_op_neg16_vec, r, a);
+ } else {
+ TCGv_vec t = tcg_temp_new_vec_matching(r);
+ tcg_gen_movi_vec(t, 0);
+ tcg_gen_sub16_vec(r, t, a);
+ tcg_temp_free_vec(t);
+ }
+}
+
+void tcg_gen_neg32_vec(TCGv_vec r, TCGv_vec a)
+{
+ if (TCG_TARGET_HAS_neg_vec) {
+ tcg_gen_op2_vec(INDEX_op_neg32_vec, r, a);
+ } else {
+ TCGv_vec t = tcg_temp_new_vec_matching(r);
+ tcg_gen_movi_vec(t, 0);
+ tcg_gen_sub32_vec(r, t, a);
+ tcg_temp_free_vec(t);
+ }
+}
+
+void tcg_gen_neg64_vec(TCGv_vec r, TCGv_vec a)
+{
+ if (TCG_TARGET_HAS_neg_vec) {
+ tcg_gen_op2_vec(INDEX_op_neg64_vec, r, a);
+ } else {
+ TCGv_vec t = tcg_temp_new_vec_matching(r);
+ tcg_gen_movi_vec(t, 0);
+ tcg_gen_sub64_vec(r, t, a);
+ tcg_temp_free_vec(t);
+ }
+}
diff --git a/tcg/tcg.c b/tcg/tcg.c
index dff9999bc6..a4d55efdf0 100644
--- a/tcg/tcg.c
+++ b/tcg/tcg.c
@@ -116,7 +116,7 @@ static int tcg_target_const_match(tcg_target_long val, TCGType type,
static bool tcg_out_ldst_finalize(TCGContext *s);
#endif
-static TCGRegSet tcg_target_available_regs[2];
+static TCGRegSet tcg_target_available_regs[TCG_TYPE_COUNT];
static TCGRegSet tcg_target_call_clobber_regs;
#if TCG_TARGET_INSN_UNIT_SIZE == 1
@@ -664,6 +664,44 @@ TCGv_i64 tcg_temp_new_internal_i64(int temp_local)
return MAKE_TCGV_I64(idx);
}
+TCGv_vec tcg_temp_new_vec(TCGType type)
+{
+ int idx;
+
+#ifdef CONFIG_DEBUG_TCG
+ switch (type) {
+ case TCG_TYPE_V64:
+ assert(TCG_TARGET_HAS_v64);
+ break;
+ case TCG_TYPE_V128:
+ assert(TCG_TARGET_HAS_v128);
+ break;
+ case TCG_TYPE_V256:
+ assert(TCG_TARGET_HAS_v256);
+ break;
+ default:
+ g_assert_not_reached();
+ }
+#endif
+
+ idx = tcg_temp_new_internal(type, 0);
+ return MAKE_TCGV_VEC(idx);
+}
+
+TCGv_vec tcg_temp_new_vec_matching(TCGv_vec match)
+{
+ TCGContext *s = &tcg_ctx;
+ int idx = GET_TCGV_VEC(match);
+ TCGTemp *ts;
+
+ tcg_debug_assert(idx >= s->nb_globals && idx < s->nb_temps);
+ ts = &s->temps[idx];
+ tcg_debug_assert(ts->temp_allocated != 0);
+
+ idx = tcg_temp_new_internal(ts->base_type, 0);
+ return MAKE_TCGV_VEC(idx);
+}
+
static void tcg_temp_free_internal(int idx)
{
TCGContext *s = &tcg_ctx;
@@ -696,6 +734,11 @@ void tcg_temp_free_i64(TCGv_i64 arg)
tcg_temp_free_internal(GET_TCGV_I64(arg));
}
+void tcg_temp_free_vec(TCGv_vec arg)
+{
+ tcg_temp_free_internal(GET_TCGV_VEC(arg));
+}
+
TCGv_i32 tcg_const_i32(int32_t val)
{
TCGv_i32 t0;
@@ -753,6 +796,9 @@ int tcg_check_temp_count(void)
Test the runtime variable that controls each opcode. */
bool tcg_op_supported(TCGOpcode op)
{
+ const bool have_vec
+ = TCG_TARGET_HAS_v64 | TCG_TARGET_HAS_v128 | TCG_TARGET_HAS_v256;
+
switch (op) {
case INDEX_op_discard:
case INDEX_op_set_label:
@@ -966,6 +1012,35 @@ bool tcg_op_supported(TCGOpcode op)
case INDEX_op_mulsh_i64:
return TCG_TARGET_HAS_mulsh_i64;
+ case INDEX_op_mov_vec:
+ case INDEX_op_movi_vec:
+ case INDEX_op_ld_vec:
+ case INDEX_op_ldz_vec:
+ case INDEX_op_st_vec:
+ case INDEX_op_add8_vec:
+ case INDEX_op_add16_vec:
+ case INDEX_op_add32_vec:
+ case INDEX_op_add64_vec:
+ case INDEX_op_sub8_vec:
+ case INDEX_op_sub16_vec:
+ case INDEX_op_sub32_vec:
+ case INDEX_op_sub64_vec:
+ case INDEX_op_and_vec:
+ case INDEX_op_or_vec:
+ case INDEX_op_xor_vec:
+ return have_vec;
+ case INDEX_op_not_vec:
+ return have_vec && TCG_TARGET_HAS_not_vec;
+ case INDEX_op_neg8_vec:
+ case INDEX_op_neg16_vec:
+ case INDEX_op_neg32_vec:
+ case INDEX_op_neg64_vec:
+ return have_vec && TCG_TARGET_HAS_neg_vec;
+ case INDEX_op_andc_vec:
+ return have_vec && TCG_TARGET_HAS_andc_vec;
+ case INDEX_op_orc_vec:
+ return have_vec && TCG_TARGET_HAS_orc_vec;
+
case NB_OPS:
break;
}
diff --git a/tcg/README b/tcg/README
index 03bfb6acd4..3bf3af67db 100644
--- a/tcg/README
+++ b/tcg/README
@@ -503,6 +503,52 @@ of the memory access.
For a 32-bit host, qemu_ld/st_i64 is guaranteed to only be used with a
64-bit memory access specified in flags.
+********* Host vector operations
+
+All of the vector ops have a final constant argument that specifies the
+length of the vector operation LEN as 64 << LEN bits.
+
+* mov_vec v0, v1, len
+* ld_vec v0, t1, len
+* st_vec v0, t1, len
+
+ Move, load and store.
+
+* movi_vec v0, c, len
+
+ Copy C across the entire vector.
+ At present the only supported values for C are 0 and -1.
+
+* add8_vec v0, v1, v2, len
+* add16_vec v0, v1, v2, len
+* add32_vec v0, v1, v2, len
+* add64_vec v0, v1, v2, len
+
+ v0 = v1 + v2, in elements of 8/16/32/64 bits, across len.
+
+* sub8_vec v0, v1, v2, len
+* sub16_vec v0, v1, v2, len
+* sub32_vec v0, v1, v2, len
+* sub64_vec v0, v1, v2, len
+
+ Similarly, v0 = v1 - v2.
+
+* neg8_vec v0, v1, len
+* neg16_vec v0, v1, len
+* neg32_vec v0, v1, len
+* neg64_vec v0, v1, len
+
+ Similarly, v0 = -v1.
+
+* and_vec v0, v1, v2, len
+* or_vec v0, v1, v2, len
+* xor_vec v0, v1, v2, len
+* andc_vec v0, v1, v2, len
+* orc_vec v0, v1, v2, len
+* not_vec v0, v1, len
+
+ Similarly, logical operations.
+
*********
Note 1: Some shortcuts are defined when the last operand is known to be
--
2.13.5
^ permalink raw reply related [flat|nested] 14+ messages in thread
* [Qemu-devel] [PATCH v3 2/6] tcg: Add vector expanders
2017-09-16 2:34 [Qemu-devel] [PATCH v3 0/6] TCG vectorization and example conversion Richard Henderson
2017-09-16 2:34 ` [Qemu-devel] [PATCH v3 1/6] tcg: Add types and operations for host vectors Richard Henderson
@ 2017-09-16 2:34 ` Richard Henderson
2017-09-26 22:31 ` Alex Bennée
2017-09-16 2:34 ` [Qemu-devel] [PATCH v3 3/6] target/arm: Align vector registers Richard Henderson
` (5 subsequent siblings)
7 siblings, 1 reply; 14+ messages in thread
From: Richard Henderson @ 2017-09-16 2:34 UTC (permalink / raw)
To: qemu-devel; +Cc: alex.bennee, f4bug
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
Makefile.target | 2 +-
accel/tcg/tcg-runtime.h | 24 ++
tcg/tcg-gvec-desc.h | 49 +++
tcg/tcg-op-gvec.h | 143 ++++++++
accel/tcg/tcg-runtime-gvec.c | 255 +++++++++++++
tcg/tcg-op-gvec.c | 853 +++++++++++++++++++++++++++++++++++++++++++
accel/tcg/Makefile.objs | 2 +-
7 files changed, 1326 insertions(+), 2 deletions(-)
create mode 100644 tcg/tcg-gvec-desc.h
create mode 100644 tcg/tcg-op-gvec.h
create mode 100644 accel/tcg/tcg-runtime-gvec.c
create mode 100644 tcg/tcg-op-gvec.c
diff --git a/Makefile.target b/Makefile.target
index 6361f957fb..f9967feef5 100644
--- a/Makefile.target
+++ b/Makefile.target
@@ -94,7 +94,7 @@ all: $(PROGS) stap
obj-y += exec.o
obj-y += accel/
obj-$(CONFIG_TCG) += tcg/tcg.o tcg/tcg-op.o tcg/optimize.o
-obj-$(CONFIG_TCG) += tcg/tcg-common.o
+obj-$(CONFIG_TCG) += tcg/tcg-common.o tcg/tcg-op-gvec.o
obj-$(CONFIG_TCG_INTERPRETER) += tcg/tci.o
obj-$(CONFIG_TCG_INTERPRETER) += disas/tci.o
obj-y += fpu/softfloat.o
diff --git a/accel/tcg/tcg-runtime.h b/accel/tcg/tcg-runtime.h
index c41d38a557..61c0ce39d3 100644
--- a/accel/tcg/tcg-runtime.h
+++ b/accel/tcg/tcg-runtime.h
@@ -134,3 +134,27 @@ GEN_ATOMIC_HELPERS(xor_fetch)
GEN_ATOMIC_HELPERS(xchg)
#undef GEN_ATOMIC_HELPERS
+
+DEF_HELPER_FLAGS_3(gvec_mov, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_4(gvec_add8, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(gvec_add16, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(gvec_add32, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(gvec_add64, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_4(gvec_sub8, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(gvec_sub16, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(gvec_sub32, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(gvec_sub64, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_3(gvec_neg8, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+DEF_HELPER_FLAGS_3(gvec_neg16, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+DEF_HELPER_FLAGS_3(gvec_neg32, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+DEF_HELPER_FLAGS_3(gvec_neg64, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_3(gvec_not, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(gvec_and, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(gvec_or, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(gvec_xor, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(gvec_andc, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(gvec_orc, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
diff --git a/tcg/tcg-gvec-desc.h b/tcg/tcg-gvec-desc.h
new file mode 100644
index 0000000000..8ba9a8168d
--- /dev/null
+++ b/tcg/tcg-gvec-desc.h
@@ -0,0 +1,49 @@
+/*
+ * Generic vector operation descriptor
+ *
+ * Copyright (c) 2017 Linaro
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+/* ??? These bit widths are set for ARM SVE, maxing out at 256 byte vectors. */
+#define SIMD_OPRSZ_SHIFT 0
+#define SIMD_OPRSZ_BITS 5
+
+#define SIMD_MAXSZ_SHIFT (SIMD_OPRSZ_SHIFT + SIMD_OPRSZ_BITS)
+#define SIMD_MAXSZ_BITS 5
+
+#define SIMD_DATA_SHIFT (SIMD_MAXSZ_SHIFT + SIMD_MAXSZ_BITS)
+#define SIMD_DATA_BITS (32 - SIMD_DATA_SHIFT)
+
+/* Create a descriptor from components. */
+uint32_t simd_desc(uint32_t oprsz, uint32_t maxsz, int32_t data);
+
+/* Extract the operation size from a descriptor. */
+static inline intptr_t simd_oprsz(uint32_t desc)
+{
+ return (extract32(desc, SIMD_OPRSZ_SHIFT, SIMD_OPRSZ_BITS) + 1) * 8;
+}
+
+/* Extract the max vector size from a descriptor. */
+static inline intptr_t simd_maxsz(uint32_t desc)
+{
+ return (extract32(desc, SIMD_MAXSZ_SHIFT, SIMD_MAXSZ_BITS) + 1) * 8;
+}
+
+/* Extract the operation-specific data from a descriptor. */
+static inline int32_t simd_data(uint32_t desc)
+{
+ return sextract32(desc, SIMD_DATA_SHIFT, SIMD_DATA_BITS);
+}
diff --git a/tcg/tcg-op-gvec.h b/tcg/tcg-op-gvec.h
new file mode 100644
index 0000000000..28bd77f1dc
--- /dev/null
+++ b/tcg/tcg-op-gvec.h
@@ -0,0 +1,143 @@
+/*
+ * Generic vector operation expansion
+ *
+ * Copyright (c) 2017 Linaro
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+/*
+ * "Generic" vectors. All operands are given as offsets from ENV,
+ * and therefore cannot also be allocated via tcg_global_mem_new_*.
+ * OPRSZ is the byte size of the vector upon which the operation is performed.
+ * MAXSZ is the byte size of the full vector; bytes beyond OPRSZ are cleared.
+ *
+ * All sizes must be 8 or any multiple of 16.
+ * When OPRSZ is 8, the alignment may be 8, otherwise must be 16.
+ * Operands may completely, but not partially, overlap.
+ */
+
+/* Expand a call to a gvec-style helper, with pointers to two vector
+ operands, and a descriptor (see tcg-gvec-desc.h). */
+typedef void (gen_helper_gvec_2)(TCGv_ptr, TCGv_ptr, TCGv_i32);
+void tcg_gen_gvec_2_ool(uint32_t dofs, uint32_t aofs,
+ uint32_t oprsz, uint32_t maxsz, int32_t data,
+ gen_helper_gvec_2 *fn);
+
+/* Similarly, passing an extra pointer (e.g. env or float_status). */
+typedef void (gen_helper_gvec_2_ptr)(TCGv_ptr, TCGv_ptr, TCGv_ptr, TCGv_i32);
+void tcg_gen_gvec_2_ptr(uint32_t dofs, uint32_t aofs,
+ TCGv_ptr ptr, uint32_t oprsz, uint32_t maxsz,
+ int32_t data, gen_helper_gvec_2_ptr *fn);
+
+/* Similarly, with three vector operands. */
+typedef void (gen_helper_gvec_3)(TCGv_ptr, TCGv_ptr, TCGv_ptr, TCGv_i32);
+void tcg_gen_gvec_3_ool(uint32_t dofs, uint32_t aofs, uint32_t bofs,
+ uint32_t oprsz, uint32_t maxsz, int32_t data,
+ gen_helper_gvec_3 *fn);
+
+typedef void (gen_helper_gvec_3_ptr)(TCGv_ptr, TCGv_ptr, TCGv_ptr,
+ TCGv_ptr, TCGv_i32);
+void tcg_gen_gvec_3_ptr(uint32_t dofs, uint32_t aofs, uint32_t bofs,
+ TCGv_ptr ptr, uint32_t oprsz, uint32_t maxsz,
+ int32_t data, gen_helper_gvec_3_ptr *fn);
+
+/* Expand a gvec operation. Either inline or out-of-line depending on
+ the actual vector size and the operations supported by the host. */
+typedef struct {
+ /* Expand inline as a 64-bit or 32-bit integer.
+ Only one of these will be non-NULL. */
+ void (*fni8)(TCGv_i64, TCGv_i64);
+ void (*fni4)(TCGv_i32, TCGv_i32);
+ /* Expand inline with a host vector type. */
+ void (*fniv)(TCGv_vec, TCGv_vec);
+ /* Expand out-of-line helper w/descriptor. */
+ gen_helper_gvec_2 *fno;
+ /* Prefer i64 to v64. */
+ bool prefer_i64;
+} GVecGen2;
+
+typedef struct {
+ /* Expand inline as a 64-bit or 32-bit integer.
+ Only one of these will be non-NULL. */
+ void (*fni8)(TCGv_i64, TCGv_i64, TCGv_i64);
+ void (*fni4)(TCGv_i32, TCGv_i32, TCGv_i32);
+ /* Expand inline with a host vector type. */
+ void (*fniv)(TCGv_vec, TCGv_vec, TCGv_vec);
+ /* Expand out-of-line helper w/descriptor. */
+ gen_helper_gvec_3 *fno;
+ /* Prefer i64 to v64. */
+ bool prefer_i64;
+ /* Load dest as a 3rd source operand. */
+ bool load_dest;
+} GVecGen3;
+
+void tcg_gen_gvec_2(uint32_t dofs, uint32_t aofs,
+ uint32_t opsz, uint32_t clsz, const GVecGen2 *);
+void tcg_gen_gvec_3(uint32_t dofs, uint32_t aofs, uint32_t bofs,
+ uint32_t opsz, uint32_t clsz, const GVecGen3 *);
+
+/* Expand a specific vector operation. */
+
+#define DEF(X) \
+ void tcg_gen_gvec_##X(uint32_t dofs, uint32_t aofs, \
+ uint32_t opsz, uint32_t clsz)
+
+DEF(mov);
+DEF(not);
+DEF(neg8);
+DEF(neg16);
+DEF(neg32);
+DEF(neg64);
+
+#undef DEF
+#define DEF(X) \
+ void tcg_gen_gvec_##X(uint32_t dofs, uint32_t aofs, uint32_t bofs, \
+ uint32_t opsz, uint32_t clsz)
+
+DEF(add8);
+DEF(add16);
+DEF(add32);
+DEF(add64);
+
+DEF(sub8);
+DEF(sub16);
+DEF(sub32);
+DEF(sub64);
+
+DEF(and);
+DEF(or);
+DEF(xor);
+DEF(andc);
+DEF(orc);
+
+#undef DEF
+
+/*
+ * 64-bit vector operations. Use these when the register has been allocated
+ * with tcg_global_mem_new_i64, and so we cannot also address it via pointer.
+ * OPRSZ = MAXSZ = 8.
+ */
+
+void tcg_gen_vec_neg8_i64(TCGv_i64 d, TCGv_i64 a);
+void tcg_gen_vec_neg16_i64(TCGv_i64 d, TCGv_i64 a);
+void tcg_gen_vec_neg32_i64(TCGv_i64 d, TCGv_i64 a);
+
+void tcg_gen_vec_add8_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b);
+void tcg_gen_vec_add16_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b);
+void tcg_gen_vec_add32_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b);
+
+void tcg_gen_vec_sub8_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b);
+void tcg_gen_vec_sub16_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b);
+void tcg_gen_vec_sub32_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b);
diff --git a/accel/tcg/tcg-runtime-gvec.c b/accel/tcg/tcg-runtime-gvec.c
new file mode 100644
index 0000000000..c75e76367c
--- /dev/null
+++ b/accel/tcg/tcg-runtime-gvec.c
@@ -0,0 +1,255 @@
+/*
+ * Generic vectorized operation runtime
+ *
+ * Copyright (c) 2017 Linaro
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "qemu/osdep.h"
+#include "qemu/host-utils.h"
+#include "cpu.h"
+#include "exec/helper-proto.h"
+#include "tcg-gvec-desc.h"
+
+
+/* Virtually all hosts support 16-byte vectors. Those that don't can emulate
+ them via GCC's generic vector extension. This turns out to be simpler and
+ more reliable than getting the compiler to autovectorize.
+
+ In tcg-op-gvec.c, we asserted that both the size and alignment
+ of the data are multiples of 16. */
+
+typedef uint8_t vec8 __attribute__((vector_size(16)));
+typedef uint16_t vec16 __attribute__((vector_size(16)));
+typedef uint32_t vec32 __attribute__((vector_size(16)));
+typedef uint64_t vec64 __attribute__((vector_size(16)));
+
+/* Zero the tail of D between OPRSZ and MAXSZ, as encoded in DESC.
+   Per the file-head comment, both sizes are multiples of 16, so the
+   16-byte vec64 stores below cannot overrun MAXSZ.  */
+static inline void clear_high(void *d, intptr_t oprsz, uint32_t desc)
+{
+ intptr_t maxsz = simd_maxsz(desc);
+ intptr_t i;
+
+ if (unlikely(maxsz > oprsz)) {
+ for (i = oprsz; i < maxsz; i += sizeof(vec64)) {
+ *(vec64 *)(d + i) = (vec64){ 0 };
+ }
+ }
+}
+
+/* D = A + B, lane-wise on 8-bit lanes, for OPRSZ bytes; then zero the
+   tail up to MAXSZ via clear_high.  */
+void HELPER(gvec_add8)(void *d, void *a, void *b, uint32_t desc)
+{
+ intptr_t oprsz = simd_oprsz(desc);
+ intptr_t i;
+
+ for (i = 0; i < oprsz; i += sizeof(vec8)) {
+ *(vec8 *)(d + i) = *(vec8 *)(a + i) + *(vec8 *)(b + i);
+ }
+ clear_high(d, oprsz, desc);
+}
+
+void HELPER(gvec_add16)(void *d, void *a, void *b, uint32_t desc)
+{
+ intptr_t oprsz = simd_oprsz(desc);
+ intptr_t i;
+
+ for (i = 0; i < oprsz; i += sizeof(vec16)) {
+ *(vec16 *)(d + i) = *(vec16 *)(a + i) + *(vec16 *)(b + i);
+ }
+ clear_high(d, oprsz, desc);
+}
+
+void HELPER(gvec_add32)(void *d, void *a, void *b, uint32_t desc)
+{
+ intptr_t oprsz = simd_oprsz(desc);
+ intptr_t i;
+
+ for (i = 0; i < oprsz; i += sizeof(vec32)) {
+ *(vec32 *)(d + i) = *(vec32 *)(a + i) + *(vec32 *)(b + i);
+ }
+ clear_high(d, oprsz, desc);
+}
+
+void HELPER(gvec_add64)(void *d, void *a, void *b, uint32_t desc)
+{
+ intptr_t oprsz = simd_oprsz(desc);
+ intptr_t i;
+
+ for (i = 0; i < oprsz; i += sizeof(vec64)) {
+ *(vec64 *)(d + i) = *(vec64 *)(a + i) + *(vec64 *)(b + i);
+ }
+ clear_high(d, oprsz, desc);
+}
+
+void HELPER(gvec_sub8)(void *d, void *a, void *b, uint32_t desc)
+{
+ intptr_t oprsz = simd_oprsz(desc);
+ intptr_t i;
+
+ for (i = 0; i < oprsz; i += sizeof(vec8)) {
+ *(vec8 *)(d + i) = *(vec8 *)(a + i) - *(vec8 *)(b + i);
+ }
+ clear_high(d, oprsz, desc);
+}
+
+void HELPER(gvec_sub16)(void *d, void *a, void *b, uint32_t desc)
+{
+ intptr_t oprsz = simd_oprsz(desc);
+ intptr_t i;
+
+ for (i = 0; i < oprsz; i += sizeof(vec16)) {
+ *(vec16 *)(d + i) = *(vec16 *)(a + i) - *(vec16 *)(b + i);
+ }
+ clear_high(d, oprsz, desc);
+}
+
+void HELPER(gvec_sub32)(void *d, void *a, void *b, uint32_t desc)
+{
+ intptr_t oprsz = simd_oprsz(desc);
+ intptr_t i;
+
+ for (i = 0; i < oprsz; i += sizeof(vec32)) {
+ *(vec32 *)(d + i) = *(vec32 *)(a + i) - *(vec32 *)(b + i);
+ }
+ clear_high(d, oprsz, desc);
+}
+
+void HELPER(gvec_sub64)(void *d, void *a, void *b, uint32_t desc)
+{
+ intptr_t oprsz = simd_oprsz(desc);
+ intptr_t i;
+
+ for (i = 0; i < oprsz; i += sizeof(vec64)) {
+ *(vec64 *)(d + i) = *(vec64 *)(a + i) - *(vec64 *)(b + i);
+ }
+ clear_high(d, oprsz, desc);
+}
+
+void HELPER(gvec_neg8)(void *d, void *a, uint32_t desc)
+{
+ intptr_t oprsz = simd_oprsz(desc);
+ intptr_t i;
+
+ for (i = 0; i < oprsz; i += sizeof(vec8)) {
+ *(vec8 *)(d + i) = -*(vec8 *)(a + i);
+ }
+ clear_high(d, oprsz, desc);
+}
+
+void HELPER(gvec_neg16)(void *d, void *a, uint32_t desc)
+{
+ intptr_t oprsz = simd_oprsz(desc);
+ intptr_t i;
+
+ for (i = 0; i < oprsz; i += sizeof(vec16)) {
+ *(vec16 *)(d + i) = -*(vec16 *)(a + i);
+ }
+ clear_high(d, oprsz, desc);
+}
+
+void HELPER(gvec_neg32)(void *d, void *a, uint32_t desc)
+{
+ intptr_t oprsz = simd_oprsz(desc);
+ intptr_t i;
+
+ for (i = 0; i < oprsz; i += sizeof(vec32)) {
+ *(vec32 *)(d + i) = -*(vec32 *)(a + i);
+ }
+ clear_high(d, oprsz, desc);
+}
+
+void HELPER(gvec_neg64)(void *d, void *a, uint32_t desc)
+{
+ intptr_t oprsz = simd_oprsz(desc);
+ intptr_t i;
+
+ for (i = 0; i < oprsz; i += sizeof(vec64)) {
+ *(vec64 *)(d + i) = -*(vec64 *)(a + i);
+ }
+ clear_high(d, oprsz, desc);
+}
+
+void HELPER(gvec_mov)(void *d, void *a, uint32_t desc)
+{
+ intptr_t oprsz = simd_oprsz(desc);
+
+ memcpy(d, a, oprsz);
+ clear_high(d, oprsz, desc);
+}
+
+void HELPER(gvec_not)(void *d, void *a, uint32_t desc)
+{
+ intptr_t oprsz = simd_oprsz(desc);
+ intptr_t i;
+
+ for (i = 0; i < oprsz; i += sizeof(vec64)) {
+ *(vec64 *)(d + i) = ~*(vec64 *)(a + i);
+ }
+ clear_high(d, oprsz, desc);
+}
+
+void HELPER(gvec_and)(void *d, void *a, void *b, uint32_t desc)
+{
+ intptr_t oprsz = simd_oprsz(desc);
+ intptr_t i;
+
+ for (i = 0; i < oprsz; i += sizeof(vec64)) {
+ *(vec64 *)(d + i) = *(vec64 *)(a + i) & *(vec64 *)(b + i);
+ }
+ clear_high(d, oprsz, desc);
+}
+
+void HELPER(gvec_or)(void *d, void *a, void *b, uint32_t desc)
+{
+ intptr_t oprsz = simd_oprsz(desc);
+ intptr_t i;
+
+ for (i = 0; i < oprsz; i += sizeof(vec64)) {
+ *(vec64 *)(d + i) = *(vec64 *)(a + i) | *(vec64 *)(b + i);
+ }
+ clear_high(d, oprsz, desc);
+}
+
+void HELPER(gvec_xor)(void *d, void *a, void *b, uint32_t desc)
+{
+ intptr_t oprsz = simd_oprsz(desc);
+ intptr_t i;
+
+ for (i = 0; i < oprsz; i += sizeof(vec64)) {
+ *(vec64 *)(d + i) = *(vec64 *)(a + i) ^ *(vec64 *)(b + i);
+ }
+ clear_high(d, oprsz, desc);
+}
+
+void HELPER(gvec_andc)(void *d, void *a, void *b, uint32_t desc)
+{
+ intptr_t oprsz = simd_oprsz(desc);
+ intptr_t i;
+
+ for (i = 0; i < oprsz; i += sizeof(vec64)) {
+ *(vec64 *)(d + i) = *(vec64 *)(a + i) &~ *(vec64 *)(b + i);
+ }
+ clear_high(d, oprsz, desc);
+}
+
+void HELPER(gvec_orc)(void *d, void *a, void *b, uint32_t desc)
+{
+ intptr_t oprsz = simd_oprsz(desc);
+ intptr_t i;
+
+ for (i = 0; i < oprsz; i += sizeof(vec64)) {
+ *(vec64 *)(d + i) = *(vec64 *)(a + i) |~ *(vec64 *)(b + i);
+ }
+ clear_high(d, oprsz, desc);
+}
diff --git a/tcg/tcg-op-gvec.c b/tcg/tcg-op-gvec.c
new file mode 100644
index 0000000000..7464321eba
--- /dev/null
+++ b/tcg/tcg-op-gvec.c
@@ -0,0 +1,853 @@
+/*
+ * Generic vector operation expansion
+ *
+ * Copyright (c) 2017 Linaro
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "qemu/osdep.h"
+#include "qemu-common.h"
+#include "tcg.h"
+#include "tcg-op.h"
+#include "tcg-op-gvec.h"
+#include "tcg-gvec-desc.h"
+
+#define REP8(x) ((x) * 0x0101010101010101ull)
+#define REP16(x) ((x) * 0x0001000100010001ull)
+
+#define MAX_UNROLL 4
+
+/* Verify vector size and alignment rules. OFS should be the OR of all
+ of the operand offsets so that we can check them all at once. */
+static void check_size_align(uint32_t oprsz, uint32_t maxsz, uint32_t ofs)
+{
+ uint32_t align = maxsz > 16 || oprsz >= 16 ? 15 : 7;
+ tcg_debug_assert(oprsz > 0);
+ tcg_debug_assert(oprsz <= maxsz);
+ tcg_debug_assert((oprsz & align) == 0);
+ tcg_debug_assert((maxsz & align) == 0);
+ tcg_debug_assert((ofs & align) == 0);
+}
+
+/* Verify vector overlap rules for two operands. */
+static void check_overlap_2(uint32_t d, uint32_t a, uint32_t s)
+{
+ tcg_debug_assert(d == a || d + s <= a || a + s <= d);
+}
+
+/* Verify vector overlap rules for three operands. */
+static void check_overlap_3(uint32_t d, uint32_t a, uint32_t b, uint32_t s)
+{
+ check_overlap_2(d, a, s);
+ check_overlap_2(d, b, s);
+ check_overlap_2(a, b, s);
+}
+
+/* Create a descriptor from components.  OPRSZ and MAXSZ are byte
+   sizes, each a non-zero multiple of 8; they are stored biased as
+   size/8 - 1 so the full 8..(8 << BITS) range fits in the bitfield.
+   DATA is a signed operation-specific immediate that must fit in
+   SIMD_DATA_BITS.  */
+uint32_t simd_desc(uint32_t oprsz, uint32_t maxsz, int32_t data)
+{
+ uint32_t desc = 0;
+
+ assert(oprsz % 8 == 0 && oprsz <= (8 << SIMD_OPRSZ_BITS));
+ assert(maxsz % 8 == 0 && maxsz <= (8 << SIMD_MAXSZ_BITS));
+ assert(data == sextract32(data, 0, SIMD_DATA_BITS));
+
+ oprsz = (oprsz / 8) - 1;
+ maxsz = (maxsz / 8) - 1;
+ desc = deposit32(desc, SIMD_OPRSZ_SHIFT, SIMD_OPRSZ_BITS, oprsz);
+ desc = deposit32(desc, SIMD_MAXSZ_SHIFT, SIMD_MAXSZ_BITS, maxsz);
+ desc = deposit32(desc, SIMD_DATA_SHIFT, SIMD_DATA_BITS, data);
+
+ return desc;
+}
+
+/* Generate a call to a gvec-style helper with two vector operands. */
+void tcg_gen_gvec_2_ool(uint32_t dofs, uint32_t aofs,
+ uint32_t oprsz, uint32_t maxsz, int32_t data,
+ gen_helper_gvec_2 *fn)
+{
+ TCGv_ptr a0, a1;
+ TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));
+
+ a0 = tcg_temp_new_ptr();
+ a1 = tcg_temp_new_ptr();
+
+ tcg_gen_addi_ptr(a0, tcg_ctx.tcg_env, dofs);
+ tcg_gen_addi_ptr(a1, tcg_ctx.tcg_env, aofs);
+
+ fn(a0, a1, desc);
+
+ tcg_temp_free_ptr(a0);
+ tcg_temp_free_ptr(a1);
+ tcg_temp_free_i32(desc);
+}
+
+/* Generate a call to a gvec-style helper with three vector operands. */
+void tcg_gen_gvec_3_ool(uint32_t dofs, uint32_t aofs, uint32_t bofs,
+ uint32_t oprsz, uint32_t maxsz, int32_t data,
+ gen_helper_gvec_3 *fn)
+{
+ TCGv_ptr a0, a1, a2;
+ TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));
+
+ a0 = tcg_temp_new_ptr();
+ a1 = tcg_temp_new_ptr();
+ a2 = tcg_temp_new_ptr();
+
+ tcg_gen_addi_ptr(a0, tcg_ctx.tcg_env, dofs);
+ tcg_gen_addi_ptr(a1, tcg_ctx.tcg_env, aofs);
+ tcg_gen_addi_ptr(a2, tcg_ctx.tcg_env, bofs);
+
+ fn(a0, a1, a2, desc);
+
+ tcg_temp_free_ptr(a0);
+ tcg_temp_free_ptr(a1);
+ tcg_temp_free_ptr(a2);
+ tcg_temp_free_i32(desc);
+}
+
+/* Generate a call to a gvec-style helper with two vector operands
+   and an extra pointer operand.  */
+void tcg_gen_gvec_2_ptr(uint32_t dofs, uint32_t aofs,
+                        TCGv_ptr ptr, uint32_t oprsz, uint32_t maxsz,
+                        int32_t data, gen_helper_gvec_2_ptr *fn)
+{
+ TCGv_ptr a0, a1;
+ TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));
+
+ a0 = tcg_temp_new_ptr();
+ a1 = tcg_temp_new_ptr();
+
+ tcg_gen_addi_ptr(a0, tcg_ctx.tcg_env, dofs);
+ tcg_gen_addi_ptr(a1, tcg_ctx.tcg_env, aofs);
+
+ fn(a0, a1, ptr, desc);
+
+ tcg_temp_free_ptr(a0);
+ tcg_temp_free_ptr(a1);
+ tcg_temp_free_i32(desc);
+}
+
+/* Generate a call to a gvec-style helper with three vector operands
+ and an extra pointer operand. */
+void tcg_gen_gvec_3_ptr(uint32_t dofs, uint32_t aofs, uint32_t bofs,
+ TCGv_ptr ptr, uint32_t oprsz, uint32_t maxsz,
+ int32_t data, gen_helper_gvec_3_ptr *fn)
+{
+ TCGv_ptr a0, a1, a2;
+ TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));
+
+ a0 = tcg_temp_new_ptr();
+ a1 = tcg_temp_new_ptr();
+ a2 = tcg_temp_new_ptr();
+
+ tcg_gen_addi_ptr(a0, tcg_ctx.tcg_env, dofs);
+ tcg_gen_addi_ptr(a1, tcg_ctx.tcg_env, aofs);
+ tcg_gen_addi_ptr(a2, tcg_ctx.tcg_env, bofs);
+
+ fn(a0, a1, a2, ptr, desc);
+
+ tcg_temp_free_ptr(a0);
+ tcg_temp_free_ptr(a1);
+ tcg_temp_free_ptr(a2);
+ tcg_temp_free_i32(desc);
+}
+
+/* Return true if we want to implement something of OPRSZ bytes
+ in units of LNSZ. This limits the expansion of inline code. */
+static inline bool check_size_impl(uint32_t oprsz, uint32_t lnsz)
+{
+ uint32_t lnct = oprsz / lnsz;
+ return lnct >= 1 && lnct <= MAX_UNROLL;
+}
+
+/* Clear MAXSZ bytes at env offset DOFS, using the widest stores the
+   host supports.  check_size_align guarantees that any size large
+   enough to take the v128/v256 path is a multiple of 16, so each
+   branch below consumes MAXSZ completely with no mixed-width tail.  */
+static void expand_clr(uint32_t dofs, uint32_t maxsz)
+{
+    if (maxsz >= 16 && TCG_TARGET_HAS_v128) {
+        TCGv_vec zero;
+
+        if (maxsz >= 32 && TCG_TARGET_HAS_v256) {
+            zero = tcg_temp_new_vec(TCG_TYPE_V256);
+            tcg_gen_movi_vec(zero, 0);
+
+            for (; maxsz >= 32; dofs += 32, maxsz -= 32) {
+                tcg_gen_stl_vec(zero, tcg_ctx.tcg_env, dofs, TCG_TYPE_V256);
+            }
+        } else {
+            zero = tcg_temp_new_vec(TCG_TYPE_V128);
+            tcg_gen_movi_vec(zero, 0);
+        }
+        for (; maxsz >= 16; dofs += 16, maxsz -= 16) {
+            tcg_gen_stl_vec(zero, tcg_ctx.tcg_env, dofs, TCG_TYPE_V128);
+        }
+
+        tcg_temp_free_vec(zero);
+    } else if (TCG_TARGET_REG_BITS == 64) {
+        /* Fixed a typo: "} if (...)" was missing "else", so every
+           vector-path caller fell through and allocated a dead zero
+           temp here even though its loop could never execute.  */
+        TCGv_i64 zero = tcg_const_i64(0);
+
+        for (; maxsz >= 8; dofs += 8, maxsz -= 8) {
+            tcg_gen_st_i64(zero, tcg_ctx.tcg_env, dofs);
+        }
+
+        tcg_temp_free_i64(zero);
+    } else if (TCG_TARGET_HAS_v64) {
+        TCGv_vec zero = tcg_temp_new_vec(TCG_TYPE_V64);
+
+        tcg_gen_movi_vec(zero, 0);
+        for (; maxsz >= 8; dofs += 8, maxsz -= 8) {
+            tcg_gen_st_vec(zero, tcg_ctx.tcg_env, dofs);
+        }
+
+        tcg_temp_free_vec(zero);
+    } else {
+        TCGv_i32 zero = tcg_const_i32(0);
+
+        for (; maxsz >= 4; dofs += 4, maxsz -= 4) {
+            tcg_gen_st_i32(zero, tcg_ctx.tcg_env, dofs);
+        }
+
+        tcg_temp_free_i32(zero);
+    }
+}
+
+/* Expand OPSZ bytes worth of two-operand operations using i32 elements. */
+static void expand_2_i32(uint32_t dofs, uint32_t aofs, uint32_t opsz,
+ void (*fni)(TCGv_i32, TCGv_i32))
+{
+ TCGv_i32 t0 = tcg_temp_new_i32();
+ uint32_t i;
+
+ for (i = 0; i < opsz; i += 4) {
+ tcg_gen_ld_i32(t0, tcg_ctx.tcg_env, aofs + i);
+ fni(t0, t0);
+ tcg_gen_st_i32(t0, tcg_ctx.tcg_env, dofs + i);
+ }
+ tcg_temp_free_i32(t0);
+}
+
+/* Expand OPSZ bytes worth of three-operand operations using i32 elements. */
+static void expand_3_i32(uint32_t dofs, uint32_t aofs,
+ uint32_t bofs, uint32_t opsz, bool load_dest,
+ void (*fni)(TCGv_i32, TCGv_i32, TCGv_i32))
+{
+ TCGv_i32 t0 = tcg_temp_new_i32();
+ TCGv_i32 t1 = tcg_temp_new_i32();
+ TCGv_i32 t2 = tcg_temp_new_i32();
+ uint32_t i;
+
+ for (i = 0; i < opsz; i += 4) {
+ tcg_gen_ld_i32(t0, tcg_ctx.tcg_env, aofs + i);
+ tcg_gen_ld_i32(t1, tcg_ctx.tcg_env, bofs + i);
+ if (load_dest) {
+ tcg_gen_ld_i32(t2, tcg_ctx.tcg_env, dofs + i);
+ }
+ fni(t2, t0, t1);
+ tcg_gen_st_i32(t2, tcg_ctx.tcg_env, dofs + i);
+ }
+ tcg_temp_free_i32(t2);
+ tcg_temp_free_i32(t1);
+ tcg_temp_free_i32(t0);
+}
+
+/* Expand OPSZ bytes worth of two-operand operations using i64 elements. */
+static void expand_2_i64(uint32_t dofs, uint32_t aofs, uint32_t opsz,
+ void (*fni)(TCGv_i64, TCGv_i64))
+{
+ TCGv_i64 t0 = tcg_temp_new_i64();
+ uint32_t i;
+
+ for (i = 0; i < opsz; i += 8) {
+ tcg_gen_ld_i64(t0, tcg_ctx.tcg_env, aofs + i);
+ fni(t0, t0);
+ tcg_gen_st_i64(t0, tcg_ctx.tcg_env, dofs + i);
+ }
+ tcg_temp_free_i64(t0);
+}
+
+/* Expand OPSZ bytes worth of three-operand operations using i64 elements. */
+static void expand_3_i64(uint32_t dofs, uint32_t aofs,
+ uint32_t bofs, uint32_t opsz, bool load_dest,
+ void (*fni)(TCGv_i64, TCGv_i64, TCGv_i64))
+{
+ TCGv_i64 t0 = tcg_temp_new_i64();
+ TCGv_i64 t1 = tcg_temp_new_i64();
+ TCGv_i64 t2 = tcg_temp_new_i64();
+ uint32_t i;
+
+ for (i = 0; i < opsz; i += 8) {
+ tcg_gen_ld_i64(t0, tcg_ctx.tcg_env, aofs + i);
+ tcg_gen_ld_i64(t1, tcg_ctx.tcg_env, bofs + i);
+ if (load_dest) {
+ tcg_gen_ld_i64(t2, tcg_ctx.tcg_env, dofs + i);
+ }
+ fni(t2, t0, t1);
+ tcg_gen_st_i64(t2, tcg_ctx.tcg_env, dofs + i);
+ }
+ tcg_temp_free_i64(t2);
+ tcg_temp_free_i64(t1);
+ tcg_temp_free_i64(t0);
+}
+
+/* Expand OPSZ bytes worth of two-operand operations using host vectors. */
+static void expand_2_vec(uint32_t dofs, uint32_t aofs,
+ uint32_t opsz, uint32_t tysz, TCGType type,
+ void (*fni)(TCGv_vec, TCGv_vec))
+{
+ TCGv_vec t0 = tcg_temp_new_vec(type);
+ uint32_t i;
+
+ for (i = 0; i < opsz; i += tysz) {
+ tcg_gen_ld_vec(t0, tcg_ctx.tcg_env, aofs + i);
+ fni(t0, t0);
+ tcg_gen_st_vec(t0, tcg_ctx.tcg_env, dofs + i);
+ }
+ tcg_temp_free_vec(t0);
+}
+
+/* Expand OPSZ bytes worth of three-operand operations using host vectors. */
+static void expand_3_vec(uint32_t dofs, uint32_t aofs,
+ uint32_t bofs, uint32_t opsz,
+ uint32_t tysz, TCGType type, bool load_dest,
+ void (*fni)(TCGv_vec, TCGv_vec, TCGv_vec))
+{
+ TCGv_vec t0 = tcg_temp_new_vec(type);
+ TCGv_vec t1 = tcg_temp_new_vec(type);
+ TCGv_vec t2 = tcg_temp_new_vec(type);
+ uint32_t i;
+
+ for (i = 0; i < opsz; i += tysz) {
+ tcg_gen_ld_vec(t0, tcg_ctx.tcg_env, aofs + i);
+ tcg_gen_ld_vec(t1, tcg_ctx.tcg_env, bofs + i);
+ if (load_dest) {
+ tcg_gen_ld_vec(t2, tcg_ctx.tcg_env, dofs + i);
+ }
+ fni(t2, t0, t1);
+ tcg_gen_st_vec(t2, tcg_ctx.tcg_env, dofs + i);
+ }
+ tcg_temp_free_vec(t2);
+ tcg_temp_free_vec(t1);
+ tcg_temp_free_vec(t0);
+}
+
+/* Expand a vector two-operand operation.  Expand inline with
+   successively smaller host vector sizes (ARM SVE allows non-power-
+   of-2 sizes, e.g. oprsz == 80 -> 2x32 + 1x16); any remainder that
+   no inline expander can handle goes to the out-of-line helper.  */
+void tcg_gen_gvec_2(uint32_t dofs, uint32_t aofs,
+                    uint32_t oprsz, uint32_t maxsz, const GVecGen2 *g)
+{
+    check_size_align(oprsz, maxsz, dofs | aofs);
+    check_overlap_2(dofs, aofs, maxsz);
+
+    /* Quick check for sizes we won't support inline.  */
+    if (oprsz > MAX_UNROLL * 32 || maxsz > MAX_UNROLL * 32) {
+        goto do_ool;
+    }
+
+    /* ??? For maxsz > oprsz, the host may be able to use an op-sized
+       operation, zeroing the balance of the register.  We can then
+       use a cl-sized store to implement the clearing without an extra
+       store operation.  This is true for aarch64 and x86_64 hosts.  */
+
+    if (TCG_TARGET_HAS_v256 && check_size_impl(oprsz, 32)) {
+        uint32_t done = QEMU_ALIGN_DOWN(oprsz, 32);
+        expand_2_vec(dofs, aofs, done, 32, TCG_TYPE_V256, g->fniv);
+        dofs += done;
+        aofs += done;
+        oprsz -= done;
+        maxsz -= done;
+    }
+
+    if (TCG_TARGET_HAS_v128 && check_size_impl(oprsz, 16)) {
+        uint32_t done = QEMU_ALIGN_DOWN(oprsz, 16);
+        expand_2_vec(dofs, aofs, done, 16, TCG_TYPE_V128, g->fniv);
+        dofs += done;
+        aofs += done;
+        oprsz -= done;
+        maxsz -= done;
+    }
+
+    if (check_size_impl(oprsz, 8)) {
+        uint32_t done = QEMU_ALIGN_DOWN(oprsz, 8);
+        if (TCG_TARGET_HAS_v64 && !g->prefer_i64) {
+            expand_2_vec(dofs, aofs, done, 8, TCG_TYPE_V64, g->fniv);
+        } else if (g->fni8) {
+            expand_2_i64(dofs, aofs, done, g->fni8);
+        } else {
+            done = 0;
+        }
+        dofs += done;
+        aofs += done;
+        oprsz -= done;
+        maxsz -= done;
+    }
+
+    /* Not every op supplies a 32-bit expander (e.g. the mov/not
+       tables set only fni8/fniv); without this guard we would call
+       a NULL function pointer.  Route such remainders to do_ool.  */
+    if (g->fni4 && check_size_impl(oprsz, 4)) {
+        uint32_t done = QEMU_ALIGN_DOWN(oprsz, 4);
+        expand_2_i32(dofs, aofs, done, g->fni4);
+        dofs += done;
+        aofs += done;
+        oprsz -= done;
+        maxsz -= done;
+    }
+
+    if (oprsz == 0) {
+        if (maxsz != 0) {
+            expand_clr(dofs, maxsz);
+        }
+        return;
+    }
+
+ do_ool:
+    tcg_gen_gvec_2_ool(dofs, aofs, oprsz, maxsz, 0, g->fno);
+}
+
+/* Expand a vector three-operand operation.  Expand inline with
+   successively smaller host vector sizes (ARM SVE allows non-power-
+   of-2 sizes, e.g. oprsz == 80 -> 2x32 + 1x16); any remainder that
+   no inline expander can handle goes to the out-of-line helper.  */
+void tcg_gen_gvec_3(uint32_t dofs, uint32_t aofs, uint32_t bofs,
+                    uint32_t oprsz, uint32_t maxsz, const GVecGen3 *g)
+{
+    check_size_align(oprsz, maxsz, dofs | aofs | bofs);
+    check_overlap_3(dofs, aofs, bofs, maxsz);
+
+    /* Quick check for sizes we won't support inline.  */
+    if (oprsz > MAX_UNROLL * 32 || maxsz > MAX_UNROLL * 32) {
+        goto do_ool;
+    }
+
+    /* ??? For maxsz > oprsz, the host may be able to use an op-sized
+       operation, zeroing the balance of the register.  We can then
+       use a cl-sized store to implement the clearing without an extra
+       store operation.  This is true for aarch64 and x86_64 hosts.  */
+
+    if (TCG_TARGET_HAS_v256 && check_size_impl(oprsz, 32)) {
+        uint32_t done = QEMU_ALIGN_DOWN(oprsz, 32);
+        expand_3_vec(dofs, aofs, bofs, done, 32, TCG_TYPE_V256,
+                     g->load_dest, g->fniv);
+        dofs += done;
+        aofs += done;
+        bofs += done;
+        oprsz -= done;
+        maxsz -= done;
+    }
+
+    if (TCG_TARGET_HAS_v128 && check_size_impl(oprsz, 16)) {
+        uint32_t done = QEMU_ALIGN_DOWN(oprsz, 16);
+        expand_3_vec(dofs, aofs, bofs, done, 16, TCG_TYPE_V128,
+                     g->load_dest, g->fniv);
+        dofs += done;
+        aofs += done;
+        bofs += done;
+        oprsz -= done;
+        maxsz -= done;
+    }
+
+    if (check_size_impl(oprsz, 8)) {
+        uint32_t done = QEMU_ALIGN_DOWN(oprsz, 8);
+        if (TCG_TARGET_HAS_v64 && !g->prefer_i64) {
+            expand_3_vec(dofs, aofs, bofs, done, 8, TCG_TYPE_V64,
+                         g->load_dest, g->fniv);
+        } else if (g->fni8) {
+            expand_3_i64(dofs, aofs, bofs, done, g->load_dest, g->fni8);
+        } else {
+            done = 0;
+        }
+        dofs += done;
+        aofs += done;
+        bofs += done;
+        oprsz -= done;
+        maxsz -= done;
+    }
+
+    /* Not every op supplies a 32-bit expander; without this guard we
+       would call a NULL g->fni4.  Route such remainders to do_ool.  */
+    if (g->fni4 && check_size_impl(oprsz, 4)) {
+        uint32_t done = QEMU_ALIGN_DOWN(oprsz, 4);
+        expand_3_i32(dofs, aofs, bofs, done, g->load_dest, g->fni4);
+        dofs += done;
+        aofs += done;
+        bofs += done;
+        oprsz -= done;
+        maxsz -= done;
+    }
+
+    if (oprsz == 0) {
+        if (maxsz != 0) {
+            expand_clr(dofs, maxsz);
+        }
+        return;
+    }
+
+ do_ool:
+    tcg_gen_gvec_3_ool(dofs, aofs, bofs, oprsz, maxsz, 0, g->fno);
+}
+
+/*
+ * Expand specific vector operations.
+ */
+
+void tcg_gen_gvec_mov(uint32_t dofs, uint32_t aofs,
+ uint32_t opsz, uint32_t clsz)
+{
+ static const GVecGen2 g = {
+ .fni8 = tcg_gen_mov_i64,
+ .fniv = tcg_gen_mov_vec,
+ .fno = gen_helper_gvec_mov,
+ .prefer_i64 = TCG_TARGET_REG_BITS == 64,
+ };
+ tcg_gen_gvec_2(dofs, aofs, opsz, clsz, &g);
+}
+
+void tcg_gen_gvec_not(uint32_t dofs, uint32_t aofs,
+ uint32_t opsz, uint32_t clsz)
+{
+ static const GVecGen2 g = {
+ .fni8 = tcg_gen_not_i64,
+ .fniv = tcg_gen_not_vec,
+ .fno = gen_helper_gvec_not,
+ .prefer_i64 = TCG_TARGET_REG_BITS == 64,
+ };
+ tcg_gen_gvec_2(dofs, aofs, opsz, clsz, &g);
+}
+
+/* SWAR add: D = A + B treated as independent limbs packed in an i64,
+   where M has only the most-significant bit of each limb set (e.g.
+   REP8(0x80) for byte limbs).  Masking the MSBs out of both inputs
+   guarantees no carry can propagate across a limb boundary; the true
+   MSB of each limb sum is then reconstructed with xor.  */
+static void gen_addv_mask(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b, TCGv_i64 m)
+{
+ TCGv_i64 t1 = tcg_temp_new_i64();
+ TCGv_i64 t2 = tcg_temp_new_i64();
+ TCGv_i64 t3 = tcg_temp_new_i64();
+
+ tcg_gen_andc_i64(t1, a, m);
+ tcg_gen_andc_i64(t2, b, m);
+ tcg_gen_xor_i64(t3, a, b);
+ tcg_gen_add_i64(d, t1, t2);
+ tcg_gen_and_i64(t3, t3, m);
+ tcg_gen_xor_i64(d, d, t3);
+
+ tcg_temp_free_i64(t1);
+ tcg_temp_free_i64(t2);
+ tcg_temp_free_i64(t3);
+}
+
+void tcg_gen_vec_add8_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
+{
+ TCGv_i64 m = tcg_const_i64(REP8(0x80));
+ gen_addv_mask(d, a, b, m);
+ tcg_temp_free_i64(m);
+}
+
+void tcg_gen_vec_add16_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
+{
+ TCGv_i64 m = tcg_const_i64(REP16(0x8000));
+ gen_addv_mask(d, a, b, m);
+ tcg_temp_free_i64(m);
+}
+
+/* Two independent 32-bit adds inside one i64.  With A's low half
+   cleared, t1 = hi(a) + b cannot receive a carry from below, so its
+   high half is exactly hi(a) + hi(b); the full-width sum t2 supplies
+   the correct low half, deposited back into bits [31:0].  */
+void tcg_gen_vec_add32_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
+{
+ TCGv_i64 t1 = tcg_temp_new_i64();
+ TCGv_i64 t2 = tcg_temp_new_i64();
+
+ tcg_gen_andi_i64(t1, a, ~0xffffffffull);
+ tcg_gen_add_i64(t2, a, b);
+ tcg_gen_add_i64(t1, t1, b);
+ tcg_gen_deposit_i64(d, t1, t2, 0, 32);
+
+ tcg_temp_free_i64(t1);
+ tcg_temp_free_i64(t2);
+}
+
+void tcg_gen_gvec_add8(uint32_t dofs, uint32_t aofs, uint32_t bofs,
+ uint32_t opsz, uint32_t clsz)
+{
+ static const GVecGen3 g = {
+ .fni8 = tcg_gen_vec_add8_i64,
+ .fniv = tcg_gen_add8_vec,
+ .fno = gen_helper_gvec_add8,
+ };
+ tcg_gen_gvec_3(dofs, aofs, bofs, opsz, clsz, &g);
+}
+
+void tcg_gen_gvec_add16(uint32_t dofs, uint32_t aofs, uint32_t bofs,
+ uint32_t opsz, uint32_t clsz)
+{
+ static const GVecGen3 g = {
+ .fni8 = tcg_gen_vec_add16_i64,
+ .fniv = tcg_gen_add16_vec,
+ .fno = gen_helper_gvec_add16,
+ };
+ tcg_gen_gvec_3(dofs, aofs, bofs, opsz, clsz, &g);
+}
+
+void tcg_gen_gvec_add32(uint32_t dofs, uint32_t aofs, uint32_t bofs,
+ uint32_t opsz, uint32_t clsz)
+{
+ static const GVecGen3 g = {
+ .fni4 = tcg_gen_add_i32,
+ .fniv = tcg_gen_add32_vec,
+ .fno = gen_helper_gvec_add32,
+ };
+ tcg_gen_gvec_3(dofs, aofs, bofs, opsz, clsz, &g);
+}
+
+void tcg_gen_gvec_add64(uint32_t dofs, uint32_t aofs, uint32_t bofs,
+ uint32_t opsz, uint32_t clsz)
+{
+ static const GVecGen3 g = {
+ .fni8 = tcg_gen_add_i64,
+ .fniv = tcg_gen_add64_vec,
+ .fno = gen_helper_gvec_add64,
+ .prefer_i64 = TCG_TARGET_REG_BITS == 64,
+ };
+ tcg_gen_gvec_3(dofs, aofs, bofs, opsz, clsz, &g);
+}
+
+/* SWAR subtract: D = A - B treated as independent limbs packed in an
+   i64, where M has only the most-significant bit of each limb set.
+   Forcing the MSB of each A limb on and clearing it in each B limb
+   ensures no borrow can cross a limb boundary; the true MSB of each
+   limb difference is then reconstructed with eqv/xor.  */
+static void gen_subv_mask(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b, TCGv_i64 m)
+{
+ TCGv_i64 t1 = tcg_temp_new_i64();
+ TCGv_i64 t2 = tcg_temp_new_i64();
+ TCGv_i64 t3 = tcg_temp_new_i64();
+
+ tcg_gen_or_i64(t1, a, m);
+ tcg_gen_andc_i64(t2, b, m);
+ tcg_gen_eqv_i64(t3, a, b);
+ tcg_gen_sub_i64(d, t1, t2);
+ tcg_gen_and_i64(t3, t3, m);
+ tcg_gen_xor_i64(d, d, t3);
+
+ tcg_temp_free_i64(t1);
+ tcg_temp_free_i64(t2);
+ tcg_temp_free_i64(t3);
+}
+
+void tcg_gen_vec_sub8_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
+{
+ TCGv_i64 m = tcg_const_i64(REP8(0x80));
+ gen_subv_mask(d, a, b, m);
+ tcg_temp_free_i64(m);
+}
+
+void tcg_gen_vec_sub16_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
+{
+ TCGv_i64 m = tcg_const_i64(REP16(0x8000));
+ gen_subv_mask(d, a, b, m);
+ tcg_temp_free_i64(m);
+}
+
+/* Two independent 32-bit subtracts inside one i64.  t1 = a - hi(b)
+   cannot borrow across bit 32 because the subtrahend's low half is
+   zero, so its high half is exactly hi(a) - hi(b); the full-width
+   difference t2 supplies the correct low half via deposit.  */
+void tcg_gen_vec_sub32_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
+{
+ TCGv_i64 t1 = tcg_temp_new_i64();
+ TCGv_i64 t2 = tcg_temp_new_i64();
+
+ tcg_gen_andi_i64(t1, b, ~0xffffffffull);
+ tcg_gen_sub_i64(t2, a, b);
+ tcg_gen_sub_i64(t1, a, t1);
+ tcg_gen_deposit_i64(d, t1, t2, 0, 32);
+
+ tcg_temp_free_i64(t1);
+ tcg_temp_free_i64(t2);
+}
+
+void tcg_gen_gvec_sub8(uint32_t dofs, uint32_t aofs, uint32_t bofs,
+ uint32_t opsz, uint32_t clsz)
+{
+ static const GVecGen3 g = {
+ .fni8 = tcg_gen_vec_sub8_i64,
+ .fniv = tcg_gen_sub8_vec,
+ .fno = gen_helper_gvec_sub8,
+ };
+ tcg_gen_gvec_3(dofs, aofs, bofs, opsz, clsz, &g);
+}
+
+void tcg_gen_gvec_sub16(uint32_t dofs, uint32_t aofs, uint32_t bofs,
+ uint32_t opsz, uint32_t clsz)
+{
+ static const GVecGen3 g = {
+ .fni8 = tcg_gen_vec_sub16_i64,
+ .fniv = tcg_gen_sub16_vec,
+ .fno = gen_helper_gvec_sub16,
+ };
+ tcg_gen_gvec_3(dofs, aofs, bofs, opsz, clsz, &g);
+}
+
+void tcg_gen_gvec_sub32(uint32_t dofs, uint32_t aofs, uint32_t bofs,
+ uint32_t opsz, uint32_t clsz)
+{
+ static const GVecGen3 g = {
+ .fni4 = tcg_gen_sub_i32,
+ .fniv = tcg_gen_sub32_vec,
+ .fno = gen_helper_gvec_sub32,
+ };
+ tcg_gen_gvec_3(dofs, aofs, bofs, opsz, clsz, &g);
+}
+
+void tcg_gen_gvec_sub64(uint32_t dofs, uint32_t aofs, uint32_t bofs,
+ uint32_t opsz, uint32_t clsz)
+{
+ static const GVecGen3 g = {
+ .fni8 = tcg_gen_sub_i64,
+ .fniv = tcg_gen_sub64_vec,
+ .fno = gen_helper_gvec_sub64,
+ .prefer_i64 = TCG_TARGET_REG_BITS == 64,
+ };
+ tcg_gen_gvec_3(dofs, aofs, bofs, opsz, clsz, &g);
+}
+
+/* SWAR negate: D = -B treated as independent limbs packed in an i64,
+   where M has only the most-significant bit of each limb set.  Built
+   on the same borrow-suppression scheme as gen_subv_mask with A = 0:
+   subtract the MSB-cleared limbs from M itself (whose set MSBs absorb
+   any borrow), then fix up each limb's MSB with xor.  */
+static void gen_negv_mask(TCGv_i64 d, TCGv_i64 b, TCGv_i64 m)
+{
+ TCGv_i64 t2 = tcg_temp_new_i64();
+ TCGv_i64 t3 = tcg_temp_new_i64();
+
+ tcg_gen_andc_i64(t3, m, b);
+ tcg_gen_andc_i64(t2, b, m);
+ tcg_gen_sub_i64(d, m, t2);
+ tcg_gen_xor_i64(d, d, t3);
+
+ tcg_temp_free_i64(t2);
+ tcg_temp_free_i64(t3);
+}
+
+void tcg_gen_vec_neg8_i64(TCGv_i64 d, TCGv_i64 b)
+{
+ TCGv_i64 m = tcg_const_i64(REP8(0x80));
+ gen_negv_mask(d, b, m);
+ tcg_temp_free_i64(m);
+}
+
+void tcg_gen_vec_neg16_i64(TCGv_i64 d, TCGv_i64 b)
+{
+ TCGv_i64 m = tcg_const_i64(REP16(0x8000));
+ gen_negv_mask(d, b, m);
+ tcg_temp_free_i64(m);
+}
+
+void tcg_gen_vec_neg32_i64(TCGv_i64 d, TCGv_i64 b)
+{
+ TCGv_i64 t1 = tcg_temp_new_i64();
+ TCGv_i64 t2 = tcg_temp_new_i64();
+
+ tcg_gen_andi_i64(t1, b, ~0xffffffffull);
+ tcg_gen_neg_i64(t2, b);
+ tcg_gen_neg_i64(t1, t1);
+ tcg_gen_deposit_i64(d, t1, t2, 0, 32);
+
+ tcg_temp_free_i64(t1);
+ tcg_temp_free_i64(t2);
+}
+
+void tcg_gen_gvec_neg8(uint32_t dofs, uint32_t aofs,
+ uint32_t opsz, uint32_t clsz)
+{
+ static const GVecGen2 g = {
+ .fni8 = tcg_gen_vec_neg8_i64,
+ .fniv = tcg_gen_neg8_vec,
+ .fno = gen_helper_gvec_neg8,
+ };
+ tcg_gen_gvec_2(dofs, aofs, opsz, clsz, &g);
+}
+
+void tcg_gen_gvec_neg16(uint32_t dofs, uint32_t aofs,
+ uint32_t opsz, uint32_t clsz)
+{
+ static const GVecGen2 g = {
+ .fni8 = tcg_gen_vec_neg16_i64,
+ .fniv = tcg_gen_neg16_vec,
+ .fno = gen_helper_gvec_neg16,
+ };
+ tcg_gen_gvec_2(dofs, aofs, opsz, clsz, &g);
+}
+
+void tcg_gen_gvec_neg32(uint32_t dofs, uint32_t aofs,
+ uint32_t opsz, uint32_t clsz)
+{
+ static const GVecGen2 g = {
+ .fni4 = tcg_gen_neg_i32,
+ .fniv = tcg_gen_neg32_vec,
+ .fno = gen_helper_gvec_neg32,
+ };
+ tcg_gen_gvec_2(dofs, aofs, opsz, clsz, &g);
+}
+
+void tcg_gen_gvec_neg64(uint32_t dofs, uint32_t aofs,
+ uint32_t opsz, uint32_t clsz)
+{
+ static const GVecGen2 g = {
+ .fni8 = tcg_gen_neg_i64,
+ .fniv = tcg_gen_neg64_vec,
+ .fno = gen_helper_gvec_neg64,
+ .prefer_i64 = TCG_TARGET_REG_BITS == 64,
+ };
+ tcg_gen_gvec_2(dofs, aofs, opsz, clsz, &g);
+}
+
+void tcg_gen_gvec_and(uint32_t dofs, uint32_t aofs, uint32_t bofs,
+ uint32_t opsz, uint32_t clsz)
+{
+ static const GVecGen3 g = {
+ .fni8 = tcg_gen_and_i64,
+ .fniv = tcg_gen_and_vec,
+ .fno = gen_helper_gvec_and,
+ .prefer_i64 = TCG_TARGET_REG_BITS == 64,
+ };
+ tcg_gen_gvec_3(dofs, aofs, bofs, opsz, clsz, &g);
+}
+
+void tcg_gen_gvec_or(uint32_t dofs, uint32_t aofs, uint32_t bofs,
+ uint32_t opsz, uint32_t clsz)
+{
+ static const GVecGen3 g = {
+ .fni8 = tcg_gen_or_i64,
+ .fniv = tcg_gen_or_vec,
+ .fno = gen_helper_gvec_or,
+ .prefer_i64 = TCG_TARGET_REG_BITS == 64,
+ };
+ tcg_gen_gvec_3(dofs, aofs, bofs, opsz, clsz, &g);
+}
+
+void tcg_gen_gvec_xor(uint32_t dofs, uint32_t aofs, uint32_t bofs,
+ uint32_t opsz, uint32_t clsz)
+{
+ static const GVecGen3 g = {
+ .fni8 = tcg_gen_xor_i64,
+ .fniv = tcg_gen_xor_vec,
+ .fno = gen_helper_gvec_xor,
+ .prefer_i64 = TCG_TARGET_REG_BITS == 64,
+ };
+ tcg_gen_gvec_3(dofs, aofs, bofs, opsz, clsz, &g);
+}
+
+void tcg_gen_gvec_andc(uint32_t dofs, uint32_t aofs, uint32_t bofs,
+ uint32_t opsz, uint32_t clsz)
+{
+ static const GVecGen3 g = {
+ .fni8 = tcg_gen_andc_i64,
+ .fniv = tcg_gen_andc_vec,
+ .fno = gen_helper_gvec_andc,
+ .prefer_i64 = TCG_TARGET_REG_BITS == 64,
+ };
+ tcg_gen_gvec_3(dofs, aofs, bofs, opsz, clsz, &g);
+}
+
+void tcg_gen_gvec_orc(uint32_t dofs, uint32_t aofs, uint32_t bofs,
+ uint32_t opsz, uint32_t clsz)
+{
+ static const GVecGen3 g = {
+ .fni8 = tcg_gen_orc_i64,
+ .fniv = tcg_gen_orc_vec,
+ .fno = gen_helper_gvec_orc,
+ .prefer_i64 = TCG_TARGET_REG_BITS == 64,
+ };
+ tcg_gen_gvec_3(dofs, aofs, bofs, opsz, clsz, &g);
+}
diff --git a/accel/tcg/Makefile.objs b/accel/tcg/Makefile.objs
index 228cd84fa4..d381a02f34 100644
--- a/accel/tcg/Makefile.objs
+++ b/accel/tcg/Makefile.objs
@@ -1,6 +1,6 @@
obj-$(CONFIG_SOFTMMU) += tcg-all.o
obj-$(CONFIG_SOFTMMU) += cputlb.o
-obj-y += tcg-runtime.o
+obj-y += tcg-runtime.o tcg-runtime-gvec.o
obj-y += cpu-exec.o cpu-exec-common.o translate-all.o
obj-y += translator.o
--
2.13.5
^ permalink raw reply related [flat|nested] 14+ messages in thread
* [Qemu-devel] [PATCH v3 3/6] target/arm: Align vector registers
2017-09-16 2:34 [Qemu-devel] [PATCH v3 0/6] TCG vectorization and example conversion Richard Henderson
2017-09-16 2:34 ` [Qemu-devel] [PATCH v3 1/6] tcg: Add types and operations for host vectors Richard Henderson
2017-09-16 2:34 ` [Qemu-devel] [PATCH v3 2/6] tcg: Add vector expanders Richard Henderson
@ 2017-09-16 2:34 ` Richard Henderson
2017-09-26 22:33 ` Alex Bennée
2017-09-16 2:34 ` [Qemu-devel] [PATCH v3 4/6] target/arm: Use vector infrastructure for aa64 add/sub/logic Richard Henderson
` (4 subsequent siblings)
7 siblings, 1 reply; 14+ messages in thread
From: Richard Henderson @ 2017-09-16 2:34 UTC (permalink / raw)
To: qemu-devel; +Cc: alex.bennee, f4bug
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
target/arm/cpu.h | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/target/arm/cpu.h b/target/arm/cpu.h
index 98b9b26fd3..c346bd148f 100644
--- a/target/arm/cpu.h
+++ b/target/arm/cpu.h
@@ -486,7 +486,7 @@ typedef struct CPUARMState {
* the two execution states, and means we do not need to explicitly
* map these registers when changing states.
*/
- float64 regs[64];
+ float64 regs[64] QEMU_ALIGNED(16);
uint32_t xregs[16];
/* We store these fpcsr fields separately for convenience. */
--
2.13.5
^ permalink raw reply related [flat|nested] 14+ messages in thread
* [Qemu-devel] [PATCH v3 4/6] target/arm: Use vector infrastructure for aa64 add/sub/logic
2017-09-16 2:34 [Qemu-devel] [PATCH v3 0/6] TCG vectorization and example conversion Richard Henderson
` (2 preceding siblings ...)
2017-09-16 2:34 ` [Qemu-devel] [PATCH v3 3/6] target/arm: Align vector registers Richard Henderson
@ 2017-09-16 2:34 ` Richard Henderson
2017-09-26 23:12 ` Alex Bennée
2017-09-16 2:34 ` [Qemu-devel] [PATCH v3 5/6] tcg/i386: Add vector operations Richard Henderson
` (3 subsequent siblings)
7 siblings, 1 reply; 14+ messages in thread
From: Richard Henderson @ 2017-09-16 2:34 UTC (permalink / raw)
To: qemu-devel; +Cc: alex.bennee, f4bug
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
target/arm/translate-a64.c | 216 ++++++++++++++++++++++++++++++---------------
1 file changed, 143 insertions(+), 73 deletions(-)
diff --git a/target/arm/translate-a64.c b/target/arm/translate-a64.c
index a3984c9a0d..4759cc9829 100644
--- a/target/arm/translate-a64.c
+++ b/target/arm/translate-a64.c
@@ -21,6 +21,7 @@
#include "cpu.h"
#include "exec/exec-all.h"
#include "tcg-op.h"
+#include "tcg-op-gvec.h"
#include "qemu/log.h"
#include "arm_ldst.h"
#include "translate.h"
@@ -82,6 +83,7 @@ typedef void NeonGenTwoDoubleOPFn(TCGv_i64, TCGv_i64, TCGv_i64, TCGv_ptr);
typedef void NeonGenOneOpFn(TCGv_i64, TCGv_i64);
typedef void CryptoTwoOpEnvFn(TCGv_ptr, TCGv_i32, TCGv_i32);
typedef void CryptoThreeOpEnvFn(TCGv_ptr, TCGv_i32, TCGv_i32, TCGv_i32);
+typedef void GVecGenTwoFn(uint32_t, uint32_t, uint32_t, uint32_t, uint32_t);
/* initialize TCG globals. */
void a64_translate_init(void)
@@ -537,6 +539,21 @@ static inline int vec_reg_offset(DisasContext *s, int regno,
return offs;
}
+/* Return the offset into CPUARMState of the "whole" vector register Qn. */
+static inline int vec_full_reg_offset(DisasContext *s, int regno)
+{
+ assert_fp_access_checked(s);
+ return offsetof(CPUARMState, vfp.regs[regno * 2]);
+}
+
+/* Return the byte size of the "whole" vector register, VL / 8. */
+static inline int vec_full_reg_size(DisasContext *s)
+{
+ /* FIXME SVE: We should put the composite ZCR_EL* value into tb->flags.
+ In the meantime this is just the AdvSIMD length of 128. */
+ return 128 / 8;
+}
+
/* Return the offset into CPUARMState of a slice (from
* the least significant end) of FP register Qn (ie
* Dn, Sn, Hn or Bn).
@@ -9036,85 +9053,125 @@ static void disas_simd_three_reg_diff(DisasContext *s, uint32_t insn)
}
}
+static void gen_bsl_i64(TCGv_i64 rd, TCGv_i64 rn, TCGv_i64 rm)
+{
+ tcg_gen_xor_i64(rn, rn, rm);
+ tcg_gen_and_i64(rn, rn, rd);
+ tcg_gen_xor_i64(rd, rm, rn);
+}
+
+static void gen_bit_i64(TCGv_i64 rd, TCGv_i64 rn, TCGv_i64 rm)
+{
+ tcg_gen_xor_i64(rn, rn, rd);
+ tcg_gen_and_i64(rn, rn, rm);
+ tcg_gen_xor_i64(rd, rd, rn);
+}
+
+static void gen_bif_i64(TCGv_i64 rd, TCGv_i64 rn, TCGv_i64 rm)
+{
+ tcg_gen_xor_i64(rn, rn, rd);
+ tcg_gen_andc_i64(rn, rn, rm);
+ tcg_gen_xor_i64(rd, rd, rn);
+}
+
+static void gen_bsl_vec(TCGv_vec rd, TCGv_vec rn, TCGv_vec rm)
+{
+ tcg_gen_xor_vec(rn, rn, rm);
+ tcg_gen_and_vec(rn, rn, rd);
+ tcg_gen_xor_vec(rd, rm, rn);
+}
+
+static void gen_bit_vec(TCGv_vec rd, TCGv_vec rn, TCGv_vec rm)
+{
+ tcg_gen_xor_vec(rn, rn, rd);
+ tcg_gen_and_vec(rn, rn, rm);
+ tcg_gen_xor_vec(rd, rd, rn);
+}
+
+static void gen_bif_vec(TCGv_vec rd, TCGv_vec rn, TCGv_vec rm)
+{
+ tcg_gen_xor_vec(rn, rn, rd);
+ tcg_gen_andc_vec(rn, rn, rm);
+ tcg_gen_xor_vec(rd, rd, rn);
+}
+
/* Logic op (opcode == 3) subgroup of C3.6.16. */
static void disas_simd_3same_logic(DisasContext *s, uint32_t insn)
{
+ static const GVecGen3 bsl_op = {
+ .fni8 = gen_bsl_i64,
+ .fniv = gen_bsl_vec,
+ .prefer_i64 = TCG_TARGET_REG_BITS == 64,
+ .load_dest = true
+ };
+ static const GVecGen3 bit_op = {
+ .fni8 = gen_bit_i64,
+ .fniv = gen_bit_vec,
+ .prefer_i64 = TCG_TARGET_REG_BITS == 64,
+ .load_dest = true
+ };
+ static const GVecGen3 bif_op = {
+ .fni8 = gen_bif_i64,
+ .fniv = gen_bif_vec,
+ .prefer_i64 = TCG_TARGET_REG_BITS == 64,
+ .load_dest = true
+ };
+
int rd = extract32(insn, 0, 5);
int rn = extract32(insn, 5, 5);
int rm = extract32(insn, 16, 5);
int size = extract32(insn, 22, 2);
bool is_u = extract32(insn, 29, 1);
bool is_q = extract32(insn, 30, 1);
- TCGv_i64 tcg_op1, tcg_op2, tcg_res[2];
- int pass;
+ GVecGenTwoFn *gvec_fn;
+ const GVecGen3 *gvec_op;
if (!fp_access_check(s)) {
return;
}
- tcg_op1 = tcg_temp_new_i64();
- tcg_op2 = tcg_temp_new_i64();
- tcg_res[0] = tcg_temp_new_i64();
- tcg_res[1] = tcg_temp_new_i64();
-
- for (pass = 0; pass < (is_q ? 2 : 1); pass++) {
- read_vec_element(s, tcg_op1, rn, pass, MO_64);
- read_vec_element(s, tcg_op2, rm, pass, MO_64);
-
- if (!is_u) {
- switch (size) {
- case 0: /* AND */
- tcg_gen_and_i64(tcg_res[pass], tcg_op1, tcg_op2);
- break;
- case 1: /* BIC */
- tcg_gen_andc_i64(tcg_res[pass], tcg_op1, tcg_op2);
- break;
- case 2: /* ORR */
- tcg_gen_or_i64(tcg_res[pass], tcg_op1, tcg_op2);
- break;
- case 3: /* ORN */
- tcg_gen_orc_i64(tcg_res[pass], tcg_op1, tcg_op2);
- break;
- }
- } else {
- if (size != 0) {
- /* B* ops need res loaded to operate on */
- read_vec_element(s, tcg_res[pass], rd, pass, MO_64);
- }
-
- switch (size) {
- case 0: /* EOR */
- tcg_gen_xor_i64(tcg_res[pass], tcg_op1, tcg_op2);
- break;
- case 1: /* BSL bitwise select */
- tcg_gen_xor_i64(tcg_op1, tcg_op1, tcg_op2);
- tcg_gen_and_i64(tcg_op1, tcg_op1, tcg_res[pass]);
- tcg_gen_xor_i64(tcg_res[pass], tcg_op2, tcg_op1);
- break;
- case 2: /* BIT, bitwise insert if true */
- tcg_gen_xor_i64(tcg_op1, tcg_op1, tcg_res[pass]);
- tcg_gen_and_i64(tcg_op1, tcg_op1, tcg_op2);
- tcg_gen_xor_i64(tcg_res[pass], tcg_res[pass], tcg_op1);
- break;
- case 3: /* BIF, bitwise insert if false */
- tcg_gen_xor_i64(tcg_op1, tcg_op1, tcg_res[pass]);
- tcg_gen_andc_i64(tcg_op1, tcg_op1, tcg_op2);
- tcg_gen_xor_i64(tcg_res[pass], tcg_res[pass], tcg_op1);
- break;
- }
- }
- }
+ switch (size + 4 * is_u) {
+ case 0: /* AND */
+ gvec_fn = tcg_gen_gvec_and;
+ goto do_fn;
+ case 1: /* BIC */
+ gvec_fn = tcg_gen_gvec_andc;
+ goto do_fn;
+ case 2: /* ORR */
+ gvec_fn = tcg_gen_gvec_or;
+ goto do_fn;
+ case 3: /* ORN */
+ gvec_fn = tcg_gen_gvec_orc;
+ goto do_fn;
+ case 4: /* EOR */
+ gvec_fn = tcg_gen_gvec_xor;
+ goto do_fn;
+ do_fn:
+ gvec_fn(vec_full_reg_offset(s, rd),
+ vec_full_reg_offset(s, rn),
+ vec_full_reg_offset(s, rm),
+ is_q ? 16 : 8, vec_full_reg_size(s));
+ return;
+
+ case 5: /* BSL bitwise select */
+ gvec_op = &bsl_op;
+ goto do_op;
+ case 6: /* BIT, bitwise insert if true */
+ gvec_op = &bit_op;
+ goto do_op;
+ case 7: /* BIF, bitwise insert if false */
+ gvec_op = &bif_op;
+ goto do_op;
+ do_op:
+ tcg_gen_gvec_3(vec_full_reg_offset(s, rd),
+ vec_full_reg_offset(s, rn),
+ vec_full_reg_offset(s, rm),
+ is_q ? 16 : 8, vec_full_reg_size(s), gvec_op);
+ return;
- write_vec_element(s, tcg_res[0], rd, 0, MO_64);
- if (!is_q) {
- tcg_gen_movi_i64(tcg_res[1], 0);
+ default:
+ g_assert_not_reached();
}
- write_vec_element(s, tcg_res[1], rd, 1, MO_64);
-
- tcg_temp_free_i64(tcg_op1);
- tcg_temp_free_i64(tcg_op2);
- tcg_temp_free_i64(tcg_res[0]);
- tcg_temp_free_i64(tcg_res[1]);
}
/* Helper functions for 32 bit comparisons */
@@ -9375,6 +9432,7 @@ static void disas_simd_3same_int(DisasContext *s, uint32_t insn)
int rn = extract32(insn, 5, 5);
int rd = extract32(insn, 0, 5);
int pass;
+ GVecGenTwoFn *gvec_op;
switch (opcode) {
case 0x13: /* MUL, PMUL */
@@ -9414,6 +9472,28 @@ static void disas_simd_3same_int(DisasContext *s, uint32_t insn)
return;
}
+ switch (opcode) {
+ case 0x10: /* ADD, SUB */
+ {
+ static GVecGenTwoFn * const fns[4][2] = {
+ { tcg_gen_gvec_add8, tcg_gen_gvec_sub8 },
+ { tcg_gen_gvec_add16, tcg_gen_gvec_sub16 },
+ { tcg_gen_gvec_add32, tcg_gen_gvec_sub32 },
+ { tcg_gen_gvec_add64, tcg_gen_gvec_sub64 },
+ };
+ gvec_op = fns[size][u];
+ goto do_gvec;
+ }
+ break;
+
+ do_gvec:
+ gvec_op(vec_full_reg_offset(s, rd),
+ vec_full_reg_offset(s, rn),
+ vec_full_reg_offset(s, rm),
+ is_q ? 16 : 8, vec_full_reg_size(s));
+ return;
+ }
+
if (size == 3) {
assert(is_q);
for (pass = 0; pass < 2; pass++) {
@@ -9586,16 +9666,6 @@ static void disas_simd_3same_int(DisasContext *s, uint32_t insn)
genfn = fns[size][u];
break;
}
- case 0x10: /* ADD, SUB */
- {
- static NeonGenTwoOpFn * const fns[3][2] = {
- { gen_helper_neon_add_u8, gen_helper_neon_sub_u8 },
- { gen_helper_neon_add_u16, gen_helper_neon_sub_u16 },
- { tcg_gen_add_i32, tcg_gen_sub_i32 },
- };
- genfn = fns[size][u];
- break;
- }
case 0x11: /* CMTST, CMEQ */
{
static NeonGenTwoOpFn * const fns[3][2] = {
--
2.13.5
^ permalink raw reply related [flat|nested] 14+ messages in thread
* [Qemu-devel] [PATCH v3 5/6] tcg/i386: Add vector operations
2017-09-16 2:34 [Qemu-devel] [PATCH v3 0/6] TCG vectorization and example conversion Richard Henderson
` (3 preceding siblings ...)
2017-09-16 2:34 ` [Qemu-devel] [PATCH v3 4/6] target/arm: Use vector infrastructure for aa64 add/sub/logic Richard Henderson
@ 2017-09-16 2:34 ` Richard Henderson
2017-09-16 2:34 ` [Qemu-devel] [PATCH v3 6/6] tcg/aarch64: " Richard Henderson
` (2 subsequent siblings)
7 siblings, 0 replies; 14+ messages in thread
From: Richard Henderson @ 2017-09-16 2:34 UTC (permalink / raw)
To: qemu-devel; +Cc: alex.bennee, f4bug
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
tcg/i386/tcg-target.h | 36 +++-
tcg/i386/tcg-target.inc.c | 423 +++++++++++++++++++++++++++++++++++++++++-----
2 files changed, 413 insertions(+), 46 deletions(-)
diff --git a/tcg/i386/tcg-target.h b/tcg/i386/tcg-target.h
index b89dababf4..df69f8db91 100644
--- a/tcg/i386/tcg-target.h
+++ b/tcg/i386/tcg-target.h
@@ -30,11 +30,10 @@
#ifdef __x86_64__
# define TCG_TARGET_REG_BITS 64
-# define TCG_TARGET_NB_REGS 16
#else
# define TCG_TARGET_REG_BITS 32
-# define TCG_TARGET_NB_REGS 8
#endif
+# define TCG_TARGET_NB_REGS 24
typedef enum {
TCG_REG_EAX = 0,
@@ -56,6 +55,19 @@ typedef enum {
TCG_REG_R13,
TCG_REG_R14,
TCG_REG_R15,
+
+ /* SSE registers; 64-bit has access to 8 more, but we won't
+ need more than a few and using only the first 8 minimizes
+ the need for a rex prefix on the sse instructions. */
+ TCG_REG_XMM0,
+ TCG_REG_XMM1,
+ TCG_REG_XMM2,
+ TCG_REG_XMM3,
+ TCG_REG_XMM4,
+ TCG_REG_XMM5,
+ TCG_REG_XMM6,
+ TCG_REG_XMM7,
+
TCG_REG_RAX = TCG_REG_EAX,
TCG_REG_RCX = TCG_REG_ECX,
TCG_REG_RDX = TCG_REG_EDX,
@@ -78,6 +90,17 @@ typedef enum {
extern bool have_bmi1;
extern bool have_popcnt;
+#ifdef __SSE2__
+#define have_sse2 true
+#else
+extern bool have_sse2;
+#endif
+#ifdef __AVX2__
+#define have_avx2 true
+#else
+extern bool have_avx2;
+#endif
+
/* optional instructions */
#define TCG_TARGET_HAS_div2_i32 1
#define TCG_TARGET_HAS_rot_i32 1
@@ -146,6 +169,15 @@ extern bool have_popcnt;
#define TCG_TARGET_HAS_mulsh_i64 0
#endif
+#define TCG_TARGET_HAS_v64 have_sse2
+#define TCG_TARGET_HAS_v128 have_sse2
+#define TCG_TARGET_HAS_v256 have_avx2
+
+#define TCG_TARGET_HAS_andc_vec 1
+#define TCG_TARGET_HAS_orc_vec 0
+#define TCG_TARGET_HAS_not_vec 0
+#define TCG_TARGET_HAS_neg_vec 0
+
#define TCG_TARGET_deposit_i32_valid(ofs, len) \
(((ofs) == 0 && (len) == 8) || ((ofs) == 8 && (len) == 8) || \
((ofs) == 0 && (len) == 16))
diff --git a/tcg/i386/tcg-target.inc.c b/tcg/i386/tcg-target.inc.c
index 69e49c9f58..df3be932d5 100644
--- a/tcg/i386/tcg-target.inc.c
+++ b/tcg/i386/tcg-target.inc.c
@@ -28,10 +28,11 @@
static const char * const tcg_target_reg_names[TCG_TARGET_NB_REGS] = {
#if TCG_TARGET_REG_BITS == 64
"%rax", "%rcx", "%rdx", "%rbx", "%rsp", "%rbp", "%rsi", "%rdi",
- "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "%r15",
#else
"%eax", "%ecx", "%edx", "%ebx", "%esp", "%ebp", "%esi", "%edi",
#endif
+ "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "%r15",
+ "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7",
};
#endif
@@ -61,6 +62,14 @@ static const int tcg_target_reg_alloc_order[] = {
TCG_REG_EDX,
TCG_REG_EAX,
#endif
+ TCG_REG_XMM0,
+ TCG_REG_XMM1,
+ TCG_REG_XMM2,
+ TCG_REG_XMM3,
+ TCG_REG_XMM4,
+ TCG_REG_XMM5,
+ TCG_REG_XMM6,
+ TCG_REG_XMM7,
};
static const int tcg_target_call_iarg_regs[] = {
@@ -94,7 +103,7 @@ static const int tcg_target_call_oarg_regs[] = {
#define TCG_CT_CONST_I32 0x400
#define TCG_CT_CONST_WSZ 0x800
-/* Registers used with L constraint, which are the first argument
+/* Registers used with L constraint, which are the first argument
registers on x86_64, and two random call clobbered registers on
i386. */
#if TCG_TARGET_REG_BITS == 64
@@ -126,6 +135,16 @@ static bool have_cmov;
bool have_bmi1;
bool have_popcnt;
+#ifndef have_sse2
+bool have_sse2;
+#endif
+#ifdef have_avx2
+#define have_avx1 have_avx2
+#else
+static bool have_avx1;
+bool have_avx2;
+#endif
+
#ifdef CONFIG_CPUID_H
static bool have_movbe;
static bool have_bmi2;
@@ -192,14 +211,17 @@ static const char *target_parse_constraint(TCGArgConstraint *ct,
tcg_regset_set_reg(ct->u.regs, TCG_REG_EDI);
break;
case 'q':
+ /* A register that can be used as a byte operand. */
ct->ct |= TCG_CT_REG;
ct->u.regs = TCG_TARGET_REG_BITS == 64 ? 0xffff : 0xf;
break;
case 'Q':
+ /* A register with an addressable second byte (e.g. %ah). */
ct->ct |= TCG_CT_REG;
ct->u.regs = 0xf;
break;
case 'r':
+ /* A general register. */
ct->ct |= TCG_CT_REG;
ct->u.regs = TCG_TARGET_REG_BITS == 64 ? 0xffff : 0xff;
break;
@@ -207,6 +229,11 @@ static const char *target_parse_constraint(TCGArgConstraint *ct,
/* With TZCNT/LZCNT, we can have operand-size as an input. */
ct->ct |= TCG_CT_CONST_WSZ;
break;
+ case 'x':
+ /* A vector register. */
+ ct->ct |= TCG_CT_REG;
+ ct->u.regs = 0xff0000;
+ break;
/* qemu_ld/st address constraint */
case 'L':
@@ -277,8 +304,9 @@ static inline int tcg_target_const_match(tcg_target_long val, TCGType type,
# define P_REXB_RM 0
# define P_GS 0
#endif
-#define P_SIMDF3 0x10000 /* 0xf3 opcode prefix */
-#define P_SIMDF2 0x20000 /* 0xf2 opcode prefix */
+#define P_SIMDF3 0x20000 /* 0xf3 opcode prefix */
+#define P_SIMDF2 0x40000 /* 0xf2 opcode prefix */
+#define P_VEXL 0x80000 /* Set VEX.L = 1 */
#define OPC_ARITH_EvIz (0x81)
#define OPC_ARITH_EvIb (0x83)
@@ -310,11 +338,30 @@ static inline int tcg_target_const_match(tcg_target_long val, TCGType type,
#define OPC_MOVL_Iv (0xb8)
#define OPC_MOVBE_GyMy (0xf0 | P_EXT38)
#define OPC_MOVBE_MyGy (0xf1 | P_EXT38)
+#define OPC_MOVDQA_GyMy (0x6f | P_EXT | P_DATA16)
+#define OPC_MOVDQA_MyGy (0x7f | P_EXT | P_DATA16)
+#define OPC_MOVDQU_GyMy (0x6f | P_EXT | P_SIMDF3)
+#define OPC_MOVDQU_MyGy (0x7f | P_EXT | P_SIMDF3)
+#define OPC_MOVQ_GyMy (0x7e | P_EXT | P_SIMDF3)
+#define OPC_MOVQ_MyGy (0xd6 | P_EXT | P_DATA16)
#define OPC_MOVSBL (0xbe | P_EXT)
#define OPC_MOVSWL (0xbf | P_EXT)
#define OPC_MOVSLQ (0x63 | P_REXW)
#define OPC_MOVZBL (0xb6 | P_EXT)
#define OPC_MOVZWL (0xb7 | P_EXT)
+#define OPC_PADDB (0xfc | P_EXT | P_DATA16)
+#define OPC_PADDW (0xfd | P_EXT | P_DATA16)
+#define OPC_PADDD (0xfe | P_EXT | P_DATA16)
+#define OPC_PADDQ (0xd4 | P_EXT | P_DATA16)
+#define OPC_PAND (0xdb | P_EXT | P_DATA16)
+#define OPC_PANDN (0xdf | P_EXT | P_DATA16)
+#define OPC_PCMPEQB (0x74 | P_EXT | P_DATA16)
+#define OPC_POR (0xeb | P_EXT | P_DATA16)
+#define OPC_PSUBB (0xf8 | P_EXT | P_DATA16)
+#define OPC_PSUBW (0xf9 | P_EXT | P_DATA16)
+#define OPC_PSUBD (0xfa | P_EXT | P_DATA16)
+#define OPC_PSUBQ (0xfb | P_EXT | P_DATA16)
+#define OPC_PXOR (0xef | P_EXT | P_DATA16)
#define OPC_POP_r32 (0x58)
#define OPC_POPCNT (0xb8 | P_EXT | P_SIMDF3)
#define OPC_PUSH_r32 (0x50)
@@ -330,6 +377,7 @@ static inline int tcg_target_const_match(tcg_target_long val, TCGType type,
#define OPC_SHRX (0xf7 | P_EXT38 | P_SIMDF2)
#define OPC_TESTL (0x85)
#define OPC_TZCNT (0xbc | P_EXT | P_SIMDF3)
+#define OPC_VZEROUPPER (0x77 | P_EXT)
#define OPC_XCHG_ax_r32 (0x90)
#define OPC_GRP3_Ev (0xf7)
@@ -479,11 +527,20 @@ static void tcg_out_modrm(TCGContext *s, int opc, int r, int rm)
tcg_out8(s, 0xc0 | (LOWREGMASK(r) << 3) | LOWREGMASK(rm));
}
-static void tcg_out_vex_modrm(TCGContext *s, int opc, int r, int v, int rm)
+static void tcg_out_vex_opc(TCGContext *s, int opc, int r, int v,
+ int rm, int index)
{
int tmp;
- if ((opc & (P_REXW | P_EXT | P_EXT38)) || (rm & 8)) {
+ /* Use the two byte form if possible, which cannot encode
+ VEX.W, VEX.B, VEX.X, or an m-mmmm field other than P_EXT. */
+ if ((opc & (P_EXT | P_EXT38 | P_REXW)) == P_EXT
+ && ((rm | index) & 8) == 0) {
+ /* Two byte VEX prefix. */
+ tcg_out8(s, 0xc5);
+
+ tmp = (r & 8 ? 0 : 0x80); /* VEX.R */
+ } else {
/* Three byte VEX prefix. */
tcg_out8(s, 0xc4);
@@ -493,20 +550,17 @@ static void tcg_out_vex_modrm(TCGContext *s, int opc, int r, int v, int rm)
} else if (opc & P_EXT) {
tmp = 1;
} else {
- tcg_abort();
+ g_assert_not_reached();
}
- tmp |= 0x40; /* VEX.X */
- tmp |= (r & 8 ? 0 : 0x80); /* VEX.R */
- tmp |= (rm & 8 ? 0 : 0x20); /* VEX.B */
+ tmp |= (r & 8 ? 0 : 0x80); /* VEX.R */
+ tmp |= (index & 8 ? 0 : 0x40); /* VEX.X */
+ tmp |= (rm & 8 ? 0 : 0x20); /* VEX.B */
tcg_out8(s, tmp);
- tmp = (opc & P_REXW ? 0x80 : 0); /* VEX.W */
- } else {
- /* Two byte VEX prefix. */
- tcg_out8(s, 0xc5);
-
- tmp = (r & 8 ? 0 : 0x80); /* VEX.R */
+ tmp = (opc & P_REXW ? 0x80 : 0); /* VEX.W */
}
+
+ tmp |= (opc & P_VEXL ? 0x04 : 0); /* VEX.L */
/* VEX.pp */
if (opc & P_DATA16) {
tmp |= 1; /* 0x66 */
@@ -518,6 +572,11 @@ static void tcg_out_vex_modrm(TCGContext *s, int opc, int r, int v, int rm)
tmp |= (~v & 15) << 3; /* VEX.vvvv */
tcg_out8(s, tmp);
tcg_out8(s, opc);
+}
+
+static void tcg_out_vex_modrm(TCGContext *s, int opc, int r, int v, int rm)
+{
+ tcg_out_vex_opc(s, opc, r, v, rm, 0);
tcg_out8(s, 0xc0 | (LOWREGMASK(r) << 3) | LOWREGMASK(rm));
}
@@ -526,8 +585,8 @@ static void tcg_out_vex_modrm(TCGContext *s, int opc, int r, int v, int rm)
mode for absolute addresses, ~RM is the size of the immediate operand
that will follow the instruction. */
-static void tcg_out_modrm_sib_offset(TCGContext *s, int opc, int r, int rm,
- int index, int shift, intptr_t offset)
+static void tcg_out_sib_offset(TCGContext *s, int r, int rm, int index,
+ int shift, intptr_t offset)
{
int mod, len;
@@ -538,7 +597,6 @@ static void tcg_out_modrm_sib_offset(TCGContext *s, int opc, int r, int rm,
intptr_t pc = (intptr_t)s->code_ptr + 5 + ~rm;
intptr_t disp = offset - pc;
if (disp == (int32_t)disp) {
- tcg_out_opc(s, opc, r, 0, 0);
tcg_out8(s, (LOWREGMASK(r) << 3) | 5);
tcg_out32(s, disp);
return;
@@ -548,7 +606,6 @@ static void tcg_out_modrm_sib_offset(TCGContext *s, int opc, int r, int rm,
use of the MODRM+SIB encoding and is therefore larger than
rip-relative addressing. */
if (offset == (int32_t)offset) {
- tcg_out_opc(s, opc, r, 0, 0);
tcg_out8(s, (LOWREGMASK(r) << 3) | 4);
tcg_out8(s, (4 << 3) | 5);
tcg_out32(s, offset);
@@ -556,10 +613,9 @@ static void tcg_out_modrm_sib_offset(TCGContext *s, int opc, int r, int rm,
}
/* ??? The memory isn't directly addressable. */
- tcg_abort();
+ g_assert_not_reached();
} else {
/* Absolute address. */
- tcg_out_opc(s, opc, r, 0, 0);
tcg_out8(s, (r << 3) | 5);
tcg_out32(s, offset);
return;
@@ -582,7 +638,6 @@ static void tcg_out_modrm_sib_offset(TCGContext *s, int opc, int r, int rm,
that would be used for %esp is the escape to the two byte form. */
if (index < 0 && LOWREGMASK(rm) != TCG_REG_ESP) {
/* Single byte MODRM format. */
- tcg_out_opc(s, opc, r, rm, 0);
tcg_out8(s, mod | (LOWREGMASK(r) << 3) | LOWREGMASK(rm));
} else {
/* Two byte MODRM+SIB format. */
@@ -596,7 +651,6 @@ static void tcg_out_modrm_sib_offset(TCGContext *s, int opc, int r, int rm,
tcg_debug_assert(index != TCG_REG_ESP);
}
- tcg_out_opc(s, opc, r, rm, index);
tcg_out8(s, mod | (LOWREGMASK(r) << 3) | 4);
tcg_out8(s, (shift << 6) | (LOWREGMASK(index) << 3) | LOWREGMASK(rm));
}
@@ -608,6 +662,21 @@ static void tcg_out_modrm_sib_offset(TCGContext *s, int opc, int r, int rm,
}
}
+static void tcg_out_modrm_sib_offset(TCGContext *s, int opc, int r, int rm,
+ int index, int shift, intptr_t offset)
+{
+ tcg_out_opc(s, opc, r, rm < 0 ? 0 : rm, index < 0 ? 0 : index);
+ tcg_out_sib_offset(s, r, rm, index, shift, offset);
+}
+
+static void tcg_out_vex_modrm_sib_offset(TCGContext *s, int opc, int r, int v,
+ int rm, int index, int shift,
+ intptr_t offset)
+{
+ tcg_out_vex_opc(s, opc, r, v, rm < 0 ? 0 : rm, index < 0 ? 0 : index);
+ tcg_out_sib_offset(s, r, rm, index, shift, offset);
+}
+
/* A simplification of the above with no index or shift. */
static inline void tcg_out_modrm_offset(TCGContext *s, int opc, int r,
int rm, intptr_t offset)
@@ -615,6 +684,31 @@ static inline void tcg_out_modrm_offset(TCGContext *s, int opc, int r,
tcg_out_modrm_sib_offset(s, opc, r, rm, -1, 0, offset);
}
+static inline void tcg_out_vex_modrm_offset(TCGContext *s, int opc, int r,
+ int v, int rm, intptr_t offset)
+{
+ tcg_out_vex_modrm_sib_offset(s, opc, r, v, rm, -1, 0, offset);
+}
+
+static void tcg_out_maybe_vex_modrm(TCGContext *s, int opc, int r, int rm)
+{
+ if (have_avx1) {
+ tcg_out_vex_modrm(s, opc, r, 0, rm);
+ } else {
+ tcg_out_modrm(s, opc, r, rm);
+ }
+}
+
+static void tcg_out_maybe_vex_modrm_offset(TCGContext *s, int opc, int r,
+ int rm, intptr_t offset)
+{
+ if (have_avx1) {
+ tcg_out_vex_modrm_offset(s, opc, r, 0, rm, offset);
+ } else {
+ tcg_out_modrm_offset(s, opc, r, rm, offset);
+ }
+}
+
/* Generate dest op= src. Uses the same ARITH_* codes as tgen_arithi. */
static inline void tgen_arithr(TCGContext *s, int subop, int dest, int src)
{
@@ -625,12 +719,34 @@ static inline void tgen_arithr(TCGContext *s, int subop, int dest, int src)
tcg_out_modrm(s, OPC_ARITH_GvEv + (subop << 3) + ext, dest, src);
}
-static inline void tcg_out_mov(TCGContext *s, TCGType type,
- TCGReg ret, TCGReg arg)
+static void tcg_out_mov(TCGContext *s, TCGType type, TCGReg ret, TCGReg arg)
{
- if (arg != ret) {
- int opc = OPC_MOVL_GvEv + (type == TCG_TYPE_I64 ? P_REXW : 0);
- tcg_out_modrm(s, opc, ret, arg);
+ if (arg == ret) {
+ return;
+ }
+ switch (type) {
+ case TCG_TYPE_I32:
+ tcg_debug_assert(ret < 16 && arg < 16);
+ tcg_out_modrm(s, OPC_MOVL_GvEv, ret, arg);
+ break;
+ case TCG_TYPE_I64:
+ tcg_debug_assert(ret < 16 && arg < 16);
+ tcg_out_modrm(s, OPC_MOVL_GvEv | P_REXW, ret, arg);
+ break;
+ case TCG_TYPE_V64:
+ tcg_debug_assert(ret >= 16 && arg >= 16);
+ tcg_out_maybe_vex_modrm(s, OPC_MOVQ_GyMy, ret, arg);
+ break;
+ case TCG_TYPE_V128:
+ tcg_debug_assert(ret >= 16 && arg >= 16);
+ tcg_out_maybe_vex_modrm(s, OPC_MOVDQA_GyMy, ret, arg);
+ break;
+ case TCG_TYPE_V256:
+ tcg_debug_assert(ret >= 16 && arg >= 16);
+ tcg_out_vex_modrm(s, OPC_MOVDQA_GyMy | P_VEXL, ret, 0, arg);
+ break;
+ default:
+ g_assert_not_reached();
}
}
@@ -638,6 +754,36 @@ static void tcg_out_movi(TCGContext *s, TCGType type,
TCGReg ret, tcg_target_long arg)
{
tcg_target_long diff;
+ int opc;
+
+ switch (type) {
+ case TCG_TYPE_I32:
+ case TCG_TYPE_I64:
+ tcg_debug_assert(ret < 16);
+ break;
+
+ case TCG_TYPE_V64:
+ case TCG_TYPE_V128:
+ case TCG_TYPE_V256:
+ tcg_debug_assert(ret >= 16);
+ /* ??? Revisit this as the implementation progresses. */
+ if (arg == 0) {
+ opc = OPC_PXOR;
+ } else if (arg == -1) {
+ opc = OPC_PCMPEQB;
+ } else {
+ g_assert_not_reached();
+ }
+ if (have_avx1) {
+ tcg_out_vex_modrm(s, opc, ret, ret, ret);
+ } else {
+ tcg_out_modrm(s, opc, ret, ret);
+ }
+ return;
+
+ default:
+ g_assert_not_reached();
+ }
if (arg == 0) {
tgen_arithr(s, ARITH_XOR, ret, ret);
@@ -702,18 +848,64 @@ static inline void tcg_out_pop(TCGContext *s, int reg)
tcg_out_opc(s, OPC_POP_r32 + LOWREGMASK(reg), 0, reg, 0);
}
-static inline void tcg_out_ld(TCGContext *s, TCGType type, TCGReg ret,
- TCGReg arg1, intptr_t arg2)
+static void tcg_out_ld(TCGContext *s, TCGType type, TCGReg ret,
+ TCGReg arg1, intptr_t arg2)
{
- int opc = OPC_MOVL_GvEv + (type == TCG_TYPE_I64 ? P_REXW : 0);
- tcg_out_modrm_offset(s, opc, ret, arg1, arg2);
+ switch (type) {
+ case TCG_TYPE_I64:
+ tcg_debug_assert(ret < 16);
+ tcg_out_modrm_offset(s, OPC_MOVL_GvEv | P_REXW, ret, arg1, arg2);
+ break;
+ case TCG_TYPE_I32:
+ tcg_debug_assert(ret < 16);
+ tcg_out_modrm_offset(s, OPC_MOVL_GvEv, ret, arg1, arg2);
+ break;
+ case TCG_TYPE_V64:
+ tcg_debug_assert(ret >= 16);
+ tcg_out_maybe_vex_modrm_offset(s, OPC_MOVQ_GyMy, ret, arg1, arg2);
+ break;
+ case TCG_TYPE_V128:
+ tcg_debug_assert(ret >= 16);
+ tcg_out_maybe_vex_modrm_offset(s, OPC_MOVDQU_GyMy, ret, arg1, arg2);
+ break;
+ case TCG_TYPE_V256:
+ tcg_debug_assert(ret >= 16);
+ tcg_out_vex_modrm_offset(s, OPC_MOVDQU_GyMy | P_VEXL,
+ ret, 0, arg1, arg2);
+ break;
+ default:
+ g_assert_not_reached();
+ }
}
-static inline void tcg_out_st(TCGContext *s, TCGType type, TCGReg arg,
- TCGReg arg1, intptr_t arg2)
+static void tcg_out_st(TCGContext *s, TCGType type, TCGReg arg,
+ TCGReg arg1, intptr_t arg2)
{
- int opc = OPC_MOVL_EvGv + (type == TCG_TYPE_I64 ? P_REXW : 0);
- tcg_out_modrm_offset(s, opc, arg, arg1, arg2);
+ switch (type) {
+ case TCG_TYPE_I64:
+ tcg_debug_assert(arg < 16);
+ tcg_out_modrm_offset(s, OPC_MOVL_EvGv | P_REXW, arg, arg1, arg2);
+ break;
+ case TCG_TYPE_I32:
+ tcg_debug_assert(arg < 16);
+ tcg_out_modrm_offset(s, OPC_MOVL_EvGv, arg, arg1, arg2);
+ break;
+ case TCG_TYPE_V64:
+ tcg_debug_assert(arg >= 16);
+ tcg_out_maybe_vex_modrm_offset(s, OPC_MOVQ_MyGy, arg, arg1, arg2);
+ break;
+ case TCG_TYPE_V128:
+ tcg_debug_assert(arg >= 16);
+ tcg_out_maybe_vex_modrm_offset(s, OPC_MOVDQU_MyGy, arg, arg1, arg2);
+ break;
+ case TCG_TYPE_V256:
+ tcg_debug_assert(arg >= 16);
+ tcg_out_vex_modrm_offset(s, OPC_MOVDQU_MyGy | P_VEXL,
+ arg, 0, arg1, arg2);
+ break;
+ default:
+ g_assert_not_reached();
+ }
}
static bool tcg_out_sti(TCGContext *s, TCGType type, TCGArg val,
@@ -725,6 +917,8 @@ static bool tcg_out_sti(TCGContext *s, TCGType type, TCGArg val,
return false;
}
rexw = P_REXW;
+ } else if (type != TCG_TYPE_I32) {
+ return false;
}
tcg_out_modrm_offset(s, OPC_MOVL_EvIz | rexw, 0, base, ofs);
tcg_out32(s, val);
@@ -2254,19 +2448,110 @@ static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
}
break;
+ case INDEX_op_add8_vec:
+ c = OPC_PADDB;
+ goto gen_simd;
+ case INDEX_op_add16_vec:
+ c = OPC_PADDW;
+ goto gen_simd;
+ case INDEX_op_add32_vec:
+ c = OPC_PADDD;
+ goto gen_simd;
+ case INDEX_op_add64_vec:
+ c = OPC_PADDQ;
+ goto gen_simd;
+ case INDEX_op_sub8_vec:
+ c = OPC_PSUBB;
+ goto gen_simd;
+ case INDEX_op_sub16_vec:
+ c = OPC_PSUBW;
+ goto gen_simd;
+ case INDEX_op_sub32_vec:
+ c = OPC_PSUBD;
+ goto gen_simd;
+ case INDEX_op_sub64_vec:
+ c = OPC_PSUBQ;
+ goto gen_simd;
+ case INDEX_op_and_vec:
+ c = OPC_PAND;
+ goto gen_simd;
+ case INDEX_op_or_vec:
+ c = OPC_POR;
+ goto gen_simd;
+ case INDEX_op_xor_vec:
+ c = OPC_PXOR;
+ gen_simd:
+ if (args[3] == 2) {
+ c |= P_VEXL;
+ }
+ if (have_avx1) {
+ tcg_out_vex_modrm(s, c, a0, a1, a2);
+ } else {
+ tcg_out_modrm(s, c, a0, a2);
+ }
+ break;
+ case INDEX_op_andc_vec:
+ c = OPC_PANDN;
+ if (args[3] == 2) {
+ c |= P_VEXL;
+ }
+ if (have_avx1) {
+ tcg_out_vex_modrm(s, c, a0, a2, a1);
+ } else {
+ tcg_out_modrm(s, c, a0, a1);
+ }
+ break;
+
+ case INDEX_op_ld_vec:
+ case INDEX_op_ldz_vec:
+ switch (args[3]) {
+ case 0:
+ tcg_out_ld(s, TCG_TYPE_V64, a0, a1, a2);
+ break;
+ case 1:
+ tcg_out_ld(s, TCG_TYPE_V128, a0, a1, a2);
+ break;
+ case 2:
+ tcg_out_ld(s, TCG_TYPE_V256, a0, a1, a2);
+ break;
+ default:
+ g_assert_not_reached();
+ }
+ break;
+
+ case INDEX_op_st_vec:
+ switch (args[3]) {
+ case 0:
+ tcg_out_st(s, TCG_TYPE_V64, a0, a1, a2);
+ break;
+ case 1:
+ tcg_out_st(s, TCG_TYPE_V128, a0, a1, a2);
+ break;
+ case 2:
+ tcg_out_st(s, TCG_TYPE_V256, a0, a1, a2);
+ break;
+ default:
+ g_assert_not_reached();
+ }
+ break;
+
case INDEX_op_mb:
tcg_out_mb(s, a0);
break;
case INDEX_op_mov_i32: /* Always emitted via tcg_out_mov. */
case INDEX_op_mov_i64:
+ case INDEX_op_mov_vec:
case INDEX_op_movi_i32: /* Always emitted via tcg_out_movi. */
case INDEX_op_movi_i64:
+ case INDEX_op_movi_vec:
case INDEX_op_call: /* Always emitted via tcg_out_call. */
default:
tcg_abort();
}
#undef OP_32_64
+#undef OP_128_256
+#undef OP_64_128_256
}
static const TCGTargetOpDef *tcg_target_op_def(TCGOpcode op)
@@ -2292,6 +2577,10 @@ static const TCGTargetOpDef *tcg_target_op_def(TCGOpcode op)
= { .args_ct_str = { "r", "r", "L", "L" } };
static const TCGTargetOpDef L_L_L_L
= { .args_ct_str = { "L", "L", "L", "L" } };
+ static const TCGTargetOpDef x_0_x = { .args_ct_str = { "x", "0", "x" } };
+ static const TCGTargetOpDef x_x_0 = { .args_ct_str = { "x", "x", "0" } };
+ static const TCGTargetOpDef x_x_x = { .args_ct_str = { "x", "x", "x" } };
+ static const TCGTargetOpDef x_r = { .args_ct_str = { "x", "r" } };
switch (op) {
case INDEX_op_goto_ptr:
@@ -2493,6 +2782,26 @@ static const TCGTargetOpDef *tcg_target_op_def(TCGOpcode op)
return &s2;
}
+ case INDEX_op_ld_vec:
+ case INDEX_op_ldz_vec:
+ case INDEX_op_st_vec:
+ return &x_r;
+
+ case INDEX_op_add8_vec:
+ case INDEX_op_add16_vec:
+ case INDEX_op_add32_vec:
+ case INDEX_op_add64_vec:
+ case INDEX_op_sub8_vec:
+ case INDEX_op_sub16_vec:
+ case INDEX_op_sub32_vec:
+ case INDEX_op_sub64_vec:
+ case INDEX_op_and_vec:
+ case INDEX_op_or_vec:
+ case INDEX_op_xor_vec:
+ return have_avx1 ? &x_x_x : &x_0_x;
+ case INDEX_op_andc_vec:
+ return have_avx1 ? &x_x_x : &x_x_0;
+
default:
break;
}
@@ -2577,6 +2886,9 @@ static void tcg_target_qemu_prologue(TCGContext *s)
tcg_out_addi(s, TCG_REG_CALL_STACK, stack_addend);
+ if (have_avx2) {
+ tcg_out_vex_opc(s, OPC_VZEROUPPER, 0, 0, 0, 0);
+ }
for (i = ARRAY_SIZE(tcg_target_callee_save_regs) - 1; i >= 0; i--) {
tcg_out_pop(s, tcg_target_callee_save_regs[i]);
}
@@ -2598,9 +2910,16 @@ static void tcg_out_nop_fill(tcg_insn_unit *p, int count)
static void tcg_target_init(TCGContext *s)
{
#ifdef CONFIG_CPUID_H
- unsigned a, b, c, d;
+ unsigned a, b, c, d, b7 = 0;
int max = __get_cpuid_max(0, 0);
+ if (max >= 7) {
+ /* BMI1 is available on AMD Piledriver and Intel Haswell CPUs. */
+ __cpuid_count(7, 0, a, b7, c, d);
+ have_bmi1 = (b7 & bit_BMI) != 0;
+ have_bmi2 = (b7 & bit_BMI2) != 0;
+ }
+
if (max >= 1) {
__cpuid(1, a, b, c, d);
#ifndef have_cmov
@@ -2609,17 +2928,26 @@ static void tcg_target_init(TCGContext *s)
available, we'll use a small forward branch. */
have_cmov = (d & bit_CMOV) != 0;
#endif
+#ifndef have_sse2
+ have_sse2 = (d & bit_SSE2) != 0;
+#endif
/* MOVBE is only available on Intel Atom and Haswell CPUs, so we
need to probe for it. */
have_movbe = (c & bit_MOVBE) != 0;
have_popcnt = (c & bit_POPCNT) != 0;
- }
- if (max >= 7) {
- /* BMI1 is available on AMD Piledriver and Intel Haswell CPUs. */
- __cpuid_count(7, 0, a, b, c, d);
- have_bmi1 = (b & bit_BMI) != 0;
- have_bmi2 = (b & bit_BMI2) != 0;
+#ifndef have_avx2
+ /* There are a number of things we must check before we can be
+ sure of not hitting invalid opcode. */
+ if (c & bit_OSXSAVE) {
+ unsigned xcrl, xcrh;
+ asm ("xgetbv" : "=a" (xcrl), "=d" (xcrh) : "c" (0));
+ if ((xcrl & 6) == 6) {
+ have_avx1 = (c & bit_AVX) != 0;
+ have_avx2 = (b7 & bit_AVX2) != 0;
+ }
+ }
+#endif
}
max = __get_cpuid_max(0x8000000, 0);
@@ -2636,6 +2964,13 @@ static void tcg_target_init(TCGContext *s)
} else {
tcg_target_available_regs[TCG_TYPE_I32] = 0xff;
}
+ if (have_sse2) {
+ tcg_target_available_regs[TCG_TYPE_V64] = 0xff0000;
+ tcg_target_available_regs[TCG_TYPE_V128] = 0xff0000;
+ }
+ if (have_avx2) {
+ tcg_target_available_regs[TCG_TYPE_V256] = 0xff0000;
+ }
tcg_target_call_clobber_regs = 0;
tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_EAX);
--
2.13.5
^ permalink raw reply related [flat|nested] 14+ messages in thread
* [Qemu-devel] [PATCH v3 6/6] tcg/aarch64: Add vector operations
2017-09-16 2:34 [Qemu-devel] [PATCH v3 0/6] TCG vectorization and example conversion Richard Henderson
` (4 preceding siblings ...)
2017-09-16 2:34 ` [Qemu-devel] [PATCH v3 5/6] tcg/i386: Add vector operations Richard Henderson
@ 2017-09-16 2:34 ` Richard Henderson
2017-09-16 2:35 ` [Qemu-devel] [PATCH v3 0/6] TCG vectorization and example conversion Richard Henderson
2017-09-26 22:58 ` no-reply
7 siblings, 0 replies; 14+ messages in thread
From: Richard Henderson @ 2017-09-16 2:34 UTC (permalink / raw)
To: qemu-devel; +Cc: alex.bennee, f4bug
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
tcg/aarch64/tcg-target.h | 20 ++-
tcg/aarch64/tcg-target.inc.c | 340 +++++++++++++++++++++++++++++++++++++------
2 files changed, 315 insertions(+), 45 deletions(-)
diff --git a/tcg/aarch64/tcg-target.h b/tcg/aarch64/tcg-target.h
index c2525066ab..c3e8c4480f 100644
--- a/tcg/aarch64/tcg-target.h
+++ b/tcg/aarch64/tcg-target.h
@@ -31,13 +31,22 @@ typedef enum {
TCG_REG_SP = 31,
TCG_REG_XZR = 31,
+ TCG_REG_V0 = 32, TCG_REG_V1, TCG_REG_V2, TCG_REG_V3,
+ TCG_REG_V4, TCG_REG_V5, TCG_REG_V6, TCG_REG_V7,
+ TCG_REG_V8, TCG_REG_V9, TCG_REG_V10, TCG_REG_V11,
+ TCG_REG_V12, TCG_REG_V13, TCG_REG_V14, TCG_REG_V15,
+ TCG_REG_V16, TCG_REG_V17, TCG_REG_V18, TCG_REG_V19,
+ TCG_REG_V20, TCG_REG_V21, TCG_REG_V22, TCG_REG_V23,
+ TCG_REG_V24, TCG_REG_V25, TCG_REG_V26, TCG_REG_V27,
+ TCG_REG_V28, TCG_REG_V29, TCG_REG_V30, TCG_REG_V31,
+
/* Aliases. */
TCG_REG_FP = TCG_REG_X29,
TCG_REG_LR = TCG_REG_X30,
TCG_AREG0 = TCG_REG_X19,
} TCGReg;
-#define TCG_TARGET_NB_REGS 32
+#define TCG_TARGET_NB_REGS 64
/* used for function call generation */
#define TCG_REG_CALL_STACK TCG_REG_SP
@@ -113,6 +122,15 @@ typedef enum {
#define TCG_TARGET_HAS_mulsh_i64 1
#define TCG_TARGET_HAS_direct_jump 1
+#define TCG_TARGET_HAS_v64 1
+#define TCG_TARGET_HAS_v128 1
+#define TCG_TARGET_HAS_v256 0
+
+#define TCG_TARGET_HAS_andc_vec 1
+#define TCG_TARGET_HAS_orc_vec 1
+#define TCG_TARGET_HAS_not_vec 1
+#define TCG_TARGET_HAS_neg_vec 1
+
#define TCG_TARGET_DEFAULT_MO (0)
static inline void flush_icache_range(uintptr_t start, uintptr_t stop)
diff --git a/tcg/aarch64/tcg-target.inc.c b/tcg/aarch64/tcg-target.inc.c
index 150530f30e..4b401cfe6c 100644
--- a/tcg/aarch64/tcg-target.inc.c
+++ b/tcg/aarch64/tcg-target.inc.c
@@ -20,10 +20,15 @@ QEMU_BUILD_BUG_ON(TCG_TYPE_I32 != 0 || TCG_TYPE_I64 != 1);
#ifdef CONFIG_DEBUG_TCG
static const char * const tcg_target_reg_names[TCG_TARGET_NB_REGS] = {
- "%x0", "%x1", "%x2", "%x3", "%x4", "%x5", "%x6", "%x7",
- "%x8", "%x9", "%x10", "%x11", "%x12", "%x13", "%x14", "%x15",
- "%x16", "%x17", "%x18", "%x19", "%x20", "%x21", "%x22", "%x23",
- "%x24", "%x25", "%x26", "%x27", "%x28", "%fp", "%x30", "%sp",
+ "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7",
+ "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15",
+ "x16", "x17", "x18", "x19", "x20", "x21", "x22", "x23",
+ "x24", "x25", "x26", "x27", "x28", "fp", "x30", "sp",
+
+ "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
+ "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15",
+ "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23",
+ "v24", "v25", "v26", "v27", "v28", "fp", "v30", "v31",
};
#endif /* CONFIG_DEBUG_TCG */
@@ -43,6 +48,14 @@ static const int tcg_target_reg_alloc_order[] = {
/* X19 reserved for AREG0 */
/* X29 reserved as fp */
/* X30 reserved as temporary */
+
+ TCG_REG_V0, TCG_REG_V1, TCG_REG_V2, TCG_REG_V3,
+ TCG_REG_V4, TCG_REG_V5, TCG_REG_V6, TCG_REG_V7,
+ /* V8 - V15 are call-saved, and skipped. */
+ TCG_REG_V16, TCG_REG_V17, TCG_REG_V18, TCG_REG_V19,
+ TCG_REG_V20, TCG_REG_V21, TCG_REG_V22, TCG_REG_V23,
+ TCG_REG_V24, TCG_REG_V25, TCG_REG_V26, TCG_REG_V27,
+ TCG_REG_V28, TCG_REG_V29, TCG_REG_V30, TCG_REG_V31,
};
static const int tcg_target_call_iarg_regs[8] = {
@@ -119,10 +132,14 @@ static const char *target_parse_constraint(TCGArgConstraint *ct,
const char *ct_str, TCGType type)
{
switch (*ct_str++) {
- case 'r':
+ case 'r': /* general registers */
ct->ct |= TCG_CT_REG;
ct->u.regs = 0xffffffffu;
break;
+ case 'w': /* advsimd registers */
+ ct->ct |= TCG_CT_REG;
+ ct->u.regs = 0xffffffff00000000ull;
+ break;
case 'l': /* qemu_ld / qemu_st address, data_reg */
ct->ct |= TCG_CT_REG;
ct->u.regs = 0xffffffffu;
@@ -290,6 +307,12 @@ typedef enum {
I3312_LDRSHX = 0x38000000 | LDST_LD_S_X << 22 | MO_16 << 30,
I3312_LDRSWX = 0x38000000 | LDST_LD_S_X << 22 | MO_32 << 30,
+ I3312_LDRVD = 0x3c000000 | LDST_LD << 22 | MO_64 << 30,
+ I3312_STRVD = 0x3c000000 | LDST_ST << 22 | MO_64 << 30,
+
+ I3312_LDRVQ = 0x3c000000 | 3 << 22 | 0 << 30,
+ I3312_STRVQ = 0x3c000000 | 2 << 22 | 0 << 30,
+
I3312_TO_I3310 = 0x00200800,
I3312_TO_I3313 = 0x01000000,
@@ -374,8 +397,33 @@ typedef enum {
I3510_EON = 0x4a200000,
I3510_ANDS = 0x6a000000,
- NOP = 0xd503201f,
+ /* AdvSIMD modified immediate */
+ I3606_MOVI = 0x0f000400,
+
+ /* AdvSIMD three same. */
+ I3616_ADD_B = 0x0e208400,
+ I3616_ADD_H = 0x0e608400,
+ I3616_ADD_S = 0x0ea08400,
+ I3616_ADD_D = 0x4ee08400,
+ I3616_AND = 0x0e201c00,
+ I3616_BIC = 0x0e601c00,
+ I3616_EOR = 0x2e201c00,
+ I3616_ORR = 0x0ea01c00,
+ I3616_ORN = 0x0ee01c00,
+ I3616_SUB_B = 0x2e208400,
+ I3616_SUB_H = 0x2e608400,
+ I3616_SUB_S = 0x2ea08400,
+ I3616_SUB_D = 0x6ee08400,
+
+ /* AdvSIMD two-reg misc. */
+ I3617_NOT = 0x2e205800,
+ I3617_NEG_B = 0x2e20b800,
+ I3617_NEG_H = 0x2e60b800,
+ I3617_NEG_S = 0x2ea0b800,
+ I3617_NEG_D = 0x6ee0b800,
+
/* System instructions. */
+ NOP = 0xd503201f,
DMB_ISH = 0xd50338bf,
DMB_LD = 0x00000100,
DMB_ST = 0x00000200,
@@ -520,26 +568,47 @@ static void tcg_out_insn_3509(TCGContext *s, AArch64Insn insn, TCGType ext,
tcg_out32(s, insn | ext << 31 | rm << 16 | ra << 10 | rn << 5 | rd);
}
+static void tcg_out_insn_3606(TCGContext *s, AArch64Insn insn, bool q,
+ TCGReg rd, bool op, int cmode, uint8_t imm8)
+{
+ tcg_out32(s, insn | q << 30 | op << 29 | cmode << 12 | (rd & 0x1f)
+ | (imm8 & 0xe0) << 16 | (imm8 & 0x1f) << 5);
+}
+
+static void tcg_out_insn_3616(TCGContext *s, AArch64Insn insn, bool q,
+ TCGReg rd, TCGReg rn, TCGReg rm)
+{
+ tcg_out32(s, insn | q << 30 | (rm & 0x1f) << 16
+ | (rn & 0x1f) << 5 | (rd & 0x1f));
+}
+
+static void tcg_out_insn_3617(TCGContext *s, AArch64Insn insn, bool q,
+ TCGReg rd, TCGReg rn)
+{
+ tcg_out32(s, insn | q << 30 | (rn & 0x1f) << 5 | (rd & 0x1f));
+}
+
static void tcg_out_insn_3310(TCGContext *s, AArch64Insn insn,
TCGReg rd, TCGReg base, TCGType ext,
TCGReg regoff)
{
/* Note the AArch64Insn constants above are for C3.3.12. Adjust. */
tcg_out32(s, insn | I3312_TO_I3310 | regoff << 16 |
- 0x4000 | ext << 13 | base << 5 | rd);
+ 0x4000 | ext << 13 | base << 5 | (rd & 0x1f));
}
static void tcg_out_insn_3312(TCGContext *s, AArch64Insn insn,
TCGReg rd, TCGReg rn, intptr_t offset)
{
- tcg_out32(s, insn | (offset & 0x1ff) << 12 | rn << 5 | rd);
+ tcg_out32(s, insn | (offset & 0x1ff) << 12 | rn << 5 | (rd & 0x1f));
}
static void tcg_out_insn_3313(TCGContext *s, AArch64Insn insn,
TCGReg rd, TCGReg rn, uintptr_t scaled_uimm)
{
/* Note the AArch64Insn constants above are for C3.3.12. Adjust. */
- tcg_out32(s, insn | I3312_TO_I3313 | scaled_uimm << 10 | rn << 5 | rd);
+ tcg_out32(s, insn | I3312_TO_I3313 | scaled_uimm << 10
+ | rn << 5 | (rd & 0x1f));
}
/* Register to register move using ORR (shifted register with no shift). */
@@ -594,6 +663,24 @@ static void tcg_out_movi(TCGContext *s, TCGType type, TCGReg rd,
int s0, s1;
AArch64Insn opc;
+ switch (type) {
+ case TCG_TYPE_I32:
+ case TCG_TYPE_I64:
+ tcg_debug_assert(rd < 32);
+ break;
+
+ case TCG_TYPE_V64:
+ case TCG_TYPE_V128:
+ tcg_debug_assert(rd >= 32);
+ /* ??? Revisit this as the implementation progresses. */
+ tcg_debug_assert(value == 0);
+ tcg_out_insn(s, 3606, MOVI, 0, rd, 0, 0, 0);
+ return;
+
+ default:
+ g_assert_not_reached();
+ }
+
/* For 32-bit values, discard potential garbage in value. For 64-bit
values within [2**31, 2**32-1], we can create smaller sequences by
interpreting this as a negative 32-bit number, while ensuring that
@@ -669,15 +756,13 @@ static void tcg_out_movi(TCGContext *s, TCGType type, TCGReg rd,
/* Define something more legible for general use. */
#define tcg_out_ldst_r tcg_out_insn_3310
-static void tcg_out_ldst(TCGContext *s, AArch64Insn insn,
- TCGReg rd, TCGReg rn, intptr_t offset)
+static void tcg_out_ldst(TCGContext *s, AArch64Insn insn, TCGReg rd,
+ TCGReg rn, intptr_t offset, int lgsize)
{
- TCGMemOp size = (uint32_t)insn >> 30;
-
/* If the offset is naturally aligned and in range, then we can
use the scaled uimm12 encoding */
- if (offset >= 0 && !(offset & ((1 << size) - 1))) {
- uintptr_t scaled_uimm = offset >> size;
+ if (offset >= 0 && !(offset & ((1 << lgsize) - 1))) {
+ uintptr_t scaled_uimm = offset >> lgsize;
if (scaled_uimm <= 0xfff) {
tcg_out_insn_3313(s, insn, rd, rn, scaled_uimm);
return;
@@ -695,32 +780,94 @@ static void tcg_out_ldst(TCGContext *s, AArch64Insn insn,
tcg_out_ldst_r(s, insn, rd, rn, TCG_TYPE_I64, TCG_REG_TMP);
}
-static inline void tcg_out_mov(TCGContext *s,
- TCGType type, TCGReg ret, TCGReg arg)
+static void tcg_out_mov(TCGContext *s, TCGType type, TCGReg ret, TCGReg arg)
{
- if (ret != arg) {
+ if (ret == arg) {
+ return;
+ }
+ switch (type) {
+ case TCG_TYPE_I32:
+ case TCG_TYPE_I64:
+ tcg_debug_assert(ret < 32 && arg < 32);
tcg_out_movr(s, type, ret, arg);
+ break;
+
+ case TCG_TYPE_V64:
+ tcg_debug_assert(ret >= 32 && arg >= 32);
+ tcg_out_insn(s, 3616, ORR, 0, ret, arg, arg);
+ break;
+ case TCG_TYPE_V128:
+ tcg_debug_assert(ret >= 32 && arg >= 32);
+ tcg_out_insn(s, 3616, ORR, 1, ret, arg, arg);
+ break;
+
+ default:
+ g_assert_not_reached();
}
}
-static inline void tcg_out_ld(TCGContext *s, TCGType type, TCGReg arg,
- TCGReg arg1, intptr_t arg2)
+static void tcg_out_ld(TCGContext *s, TCGType type, TCGReg arg,
+ TCGReg arg1, intptr_t arg2)
{
- tcg_out_ldst(s, type == TCG_TYPE_I32 ? I3312_LDRW : I3312_LDRX,
- arg, arg1, arg2);
+ AArch64Insn insn;
+ int lgsz;
+
+ switch (type) {
+ case TCG_TYPE_I32:
+ insn = I3312_LDRW;
+ lgsz = 2;
+ break;
+ case TCG_TYPE_I64:
+ insn = I3312_LDRX;
+ lgsz = 3;
+ break;
+ case TCG_TYPE_V64:
+ insn = I3312_LDRVD;
+ lgsz = 3;
+ break;
+ case TCG_TYPE_V128:
+ insn = I3312_LDRVQ;
+ lgsz = 4;
+ break;
+ default:
+ g_assert_not_reached();
+ }
+ tcg_out_ldst(s, insn, arg, arg1, arg2, lgsz);
}
-static inline void tcg_out_st(TCGContext *s, TCGType type, TCGReg arg,
- TCGReg arg1, intptr_t arg2)
+static void tcg_out_st(TCGContext *s, TCGType type, TCGReg arg,
+ TCGReg arg1, intptr_t arg2)
{
- tcg_out_ldst(s, type == TCG_TYPE_I32 ? I3312_STRW : I3312_STRX,
- arg, arg1, arg2);
+ AArch64Insn insn;
+ int lgsz;
+
+ switch (type) {
+ case TCG_TYPE_I32:
+ insn = I3312_STRW;
+ lgsz = 2;
+ break;
+ case TCG_TYPE_I64:
+ insn = I3312_STRX;
+ lgsz = 3;
+ break;
+ case TCG_TYPE_V64:
+ insn = I3312_STRVD;
+ lgsz = 3;
+ break;
+ case TCG_TYPE_V128:
+ insn = I3312_STRVQ;
+ lgsz = 4;
+ break;
+ default:
+ g_assert_not_reached();
+ }
+ tcg_out_ldst(s, insn, arg, arg1, arg2, lgsz);
}
static inline bool tcg_out_sti(TCGContext *s, TCGType type, TCGArg val,
TCGReg base, intptr_t ofs)
{
- if (val == 0) {
+ if (type <= TCG_TYPE_I64 && val == 0) {
tcg_out_st(s, type, TCG_REG_XZR, base, ofs);
return true;
}
@@ -1210,14 +1357,15 @@ static void tcg_out_tlb_read(TCGContext *s, TCGReg addr_reg, TCGMemOp opc,
/* Merge "low bits" from tlb offset, load the tlb comparator into X0.
X0 = load [X2 + (tlb_offset & 0x000fff)] */
tcg_out_ldst(s, TARGET_LONG_BITS == 32 ? I3312_LDRW : I3312_LDRX,
- TCG_REG_X0, TCG_REG_X2, tlb_offset & 0xfff);
+ TCG_REG_X0, TCG_REG_X2, tlb_offset & 0xfff,
+ TARGET_LONG_BITS == 32 ? 2 : 3);
/* Load the tlb addend. Do that early to avoid stalling.
X1 = load [X2 + (tlb_offset & 0xfff) + offsetof(addend)] */
tcg_out_ldst(s, I3312_LDRX, TCG_REG_X1, TCG_REG_X2,
(tlb_offset & 0xfff) + (offsetof(CPUTLBEntry, addend)) -
(is_read ? offsetof(CPUTLBEntry, addr_read)
- : offsetof(CPUTLBEntry, addr_write)));
+ : offsetof(CPUTLBEntry, addr_write)), 3);
/* Perform the address comparison. */
tcg_out_cmp(s, (TARGET_LONG_BITS == 64), TCG_REG_X0, TCG_REG_X3, 0);
@@ -1435,49 +1583,49 @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc,
case INDEX_op_ld8u_i32:
case INDEX_op_ld8u_i64:
- tcg_out_ldst(s, I3312_LDRB, a0, a1, a2);
+ tcg_out_ldst(s, I3312_LDRB, a0, a1, a2, 0);
break;
case INDEX_op_ld8s_i32:
- tcg_out_ldst(s, I3312_LDRSBW, a0, a1, a2);
+ tcg_out_ldst(s, I3312_LDRSBW, a0, a1, a2, 0);
break;
case INDEX_op_ld8s_i64:
- tcg_out_ldst(s, I3312_LDRSBX, a0, a1, a2);
+ tcg_out_ldst(s, I3312_LDRSBX, a0, a1, a2, 0);
break;
case INDEX_op_ld16u_i32:
case INDEX_op_ld16u_i64:
- tcg_out_ldst(s, I3312_LDRH, a0, a1, a2);
+ tcg_out_ldst(s, I3312_LDRH, a0, a1, a2, 1);
break;
case INDEX_op_ld16s_i32:
- tcg_out_ldst(s, I3312_LDRSHW, a0, a1, a2);
+ tcg_out_ldst(s, I3312_LDRSHW, a0, a1, a2, 1);
break;
case INDEX_op_ld16s_i64:
- tcg_out_ldst(s, I3312_LDRSHX, a0, a1, a2);
+ tcg_out_ldst(s, I3312_LDRSHX, a0, a1, a2, 1);
break;
case INDEX_op_ld_i32:
case INDEX_op_ld32u_i64:
- tcg_out_ldst(s, I3312_LDRW, a0, a1, a2);
+ tcg_out_ldst(s, I3312_LDRW, a0, a1, a2, 2);
break;
case INDEX_op_ld32s_i64:
- tcg_out_ldst(s, I3312_LDRSWX, a0, a1, a2);
+ tcg_out_ldst(s, I3312_LDRSWX, a0, a1, a2, 2);
break;
case INDEX_op_ld_i64:
- tcg_out_ldst(s, I3312_LDRX, a0, a1, a2);
+ tcg_out_ldst(s, I3312_LDRX, a0, a1, a2, 3);
break;
case INDEX_op_st8_i32:
case INDEX_op_st8_i64:
- tcg_out_ldst(s, I3312_STRB, REG0(0), a1, a2);
+ tcg_out_ldst(s, I3312_STRB, REG0(0), a1, a2, 0);
break;
case INDEX_op_st16_i32:
case INDEX_op_st16_i64:
- tcg_out_ldst(s, I3312_STRH, REG0(0), a1, a2);
+ tcg_out_ldst(s, I3312_STRH, REG0(0), a1, a2, 1);
break;
case INDEX_op_st_i32:
case INDEX_op_st32_i64:
- tcg_out_ldst(s, I3312_STRW, REG0(0), a1, a2);
+ tcg_out_ldst(s, I3312_STRW, REG0(0), a1, a2, 2);
break;
case INDEX_op_st_i64:
- tcg_out_ldst(s, I3312_STRX, REG0(0), a1, a2);
+ tcg_out_ldst(s, I3312_STRX, REG0(0), a1, a2, 3);
break;
case INDEX_op_add_i32:
@@ -1774,13 +1922,77 @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc,
tcg_out_mb(s, a0);
break;
+ case INDEX_op_ld_vec:
+ case INDEX_op_ldz_vec:
+ tcg_out_ld(s, TCG_TYPE_V64 + args[3], a0, a1, a2);
+ break;
+ case INDEX_op_st_vec:
+ tcg_out_st(s, TCG_TYPE_V64 + args[3], a0, a1, a2);
+ break;
+ case INDEX_op_add8_vec:
+ tcg_out_insn(s, 3616, ADD_B, args[3], a0, a1, a2);
+ break;
+ case INDEX_op_add16_vec:
+ tcg_out_insn(s, 3616, ADD_H, args[3], a0, a1, a2);
+ break;
+ case INDEX_op_add32_vec:
+ tcg_out_insn(s, 3616, ADD_S, args[3], a0, a1, a2);
+ break;
+ case INDEX_op_add64_vec:
+ tcg_out_insn(s, 3616, ADD_D, 1, a0, a1, a2);
+ break;
+ case INDEX_op_sub8_vec:
+ tcg_out_insn(s, 3616, SUB_B, args[3], a0, a1, a2);
+ break;
+ case INDEX_op_sub16_vec:
+ tcg_out_insn(s, 3616, SUB_H, args[3], a0, a1, a2);
+ break;
+ case INDEX_op_sub32_vec:
+ tcg_out_insn(s, 3616, SUB_S, args[3], a0, a1, a2);
+ break;
+ case INDEX_op_sub64_vec:
+ tcg_out_insn(s, 3616, SUB_D, 1, a0, a1, a2);
+ break;
+ case INDEX_op_neg8_vec:
+ tcg_out_insn(s, 3617, NEG_B, a2, a0, a1);
+ break;
+ case INDEX_op_neg16_vec:
+ tcg_out_insn(s, 3617, NEG_H, a2, a0, a1);
+ break;
+ case INDEX_op_neg32_vec:
+ tcg_out_insn(s, 3617, NEG_S, a2, a0, a1);
+ break;
+ case INDEX_op_neg64_vec:
+ tcg_out_insn(s, 3617, NEG_D, 1, a0, a1);
+ break;
+ case INDEX_op_and_vec:
+ tcg_out_insn(s, 3616, AND, args[3], a0, a1, a2);
+ break;
+ case INDEX_op_or_vec:
+ tcg_out_insn(s, 3616, ORR, args[3], a0, a1, a2);
+ break;
+ case INDEX_op_xor_vec:
+ tcg_out_insn(s, 3616, EOR, args[3], a0, a1, a2);
+ break;
+ case INDEX_op_andc_vec:
+ tcg_out_insn(s, 3616, BIC, args[3], a0, a1, a2);
+ break;
+ case INDEX_op_orc_vec:
+ tcg_out_insn(s, 3616, ORN, args[3], a0, a1, a2);
+ break;
+ case INDEX_op_not_vec:
+ tcg_out_insn(s, 3617, NOT, a2, a0, a1);
+ break;
+
case INDEX_op_mov_i32: /* Always emitted via tcg_out_mov. */
case INDEX_op_mov_i64:
+ case INDEX_op_mov_vec:
case INDEX_op_movi_i32: /* Always emitted via tcg_out_movi. */
case INDEX_op_movi_i64:
+ case INDEX_op_movi_vec:
case INDEX_op_call: /* Always emitted via tcg_out_call. */
default:
- tcg_abort();
+ g_assert_not_reached();
}
#undef REG0
@@ -1790,11 +2002,14 @@ static const TCGTargetOpDef *tcg_target_op_def(TCGOpcode op)
{
static const TCGTargetOpDef r = { .args_ct_str = { "r" } };
static const TCGTargetOpDef r_r = { .args_ct_str = { "r", "r" } };
+ static const TCGTargetOpDef w_w = { .args_ct_str = { "w", "w" } };
+ static const TCGTargetOpDef w_r = { .args_ct_str = { "w", "r" } };
static const TCGTargetOpDef r_l = { .args_ct_str = { "r", "l" } };
static const TCGTargetOpDef r_rA = { .args_ct_str = { "r", "rA" } };
static const TCGTargetOpDef rZ_r = { .args_ct_str = { "rZ", "r" } };
static const TCGTargetOpDef lZ_l = { .args_ct_str = { "lZ", "l" } };
static const TCGTargetOpDef r_r_r = { .args_ct_str = { "r", "r", "r" } };
+ static const TCGTargetOpDef w_w_w = { .args_ct_str = { "w", "w", "w" } };
static const TCGTargetOpDef r_r_ri = { .args_ct_str = { "r", "r", "ri" } };
static const TCGTargetOpDef r_r_rA = { .args_ct_str = { "r", "r", "rA" } };
static const TCGTargetOpDef r_r_rL = { .args_ct_str = { "r", "r", "rL" } };
@@ -1938,6 +2153,33 @@ static const TCGTargetOpDef *tcg_target_op_def(TCGOpcode op)
case INDEX_op_sub2_i64:
return &add2;
+ case INDEX_op_add8_vec:
+ case INDEX_op_add16_vec:
+ case INDEX_op_add32_vec:
+ case INDEX_op_add64_vec:
+ case INDEX_op_sub8_vec:
+ case INDEX_op_sub16_vec:
+ case INDEX_op_sub32_vec:
+ case INDEX_op_sub64_vec:
+ case INDEX_op_and_vec:
+ case INDEX_op_or_vec:
+ case INDEX_op_xor_vec:
+ case INDEX_op_andc_vec:
+ case INDEX_op_orc_vec:
+ return &w_w_w;
+
+ case INDEX_op_not_vec:
+ case INDEX_op_neg8_vec:
+ case INDEX_op_neg16_vec:
+ case INDEX_op_neg32_vec:
+ case INDEX_op_neg64_vec:
+ return &w_w;
+
+ case INDEX_op_ld_vec:
+ case INDEX_op_ldz_vec:
+ case INDEX_op_st_vec:
+ return &w_r;
+
default:
return NULL;
}
@@ -1947,8 +2189,10 @@ static void tcg_target_init(TCGContext *s)
{
tcg_target_available_regs[TCG_TYPE_I32] = 0xffffffffu;
tcg_target_available_regs[TCG_TYPE_I64] = 0xffffffffu;
+ tcg_target_available_regs[TCG_TYPE_V64] = 0xffffffff00000000ull;
+ tcg_target_available_regs[TCG_TYPE_V128] = 0xffffffff00000000ull;
- tcg_target_call_clobber_regs = 0xfffffffu;
+ tcg_target_call_clobber_regs = -1ull;
tcg_regset_reset_reg(tcg_target_call_clobber_regs, TCG_REG_X19);
tcg_regset_reset_reg(tcg_target_call_clobber_regs, TCG_REG_X20);
tcg_regset_reset_reg(tcg_target_call_clobber_regs, TCG_REG_X21);
@@ -1960,6 +2204,14 @@ static void tcg_target_init(TCGContext *s)
tcg_regset_reset_reg(tcg_target_call_clobber_regs, TCG_REG_X27);
tcg_regset_reset_reg(tcg_target_call_clobber_regs, TCG_REG_X28);
tcg_regset_reset_reg(tcg_target_call_clobber_regs, TCG_REG_X29);
+ tcg_regset_reset_reg(tcg_target_call_clobber_regs, TCG_REG_V8);
+ tcg_regset_reset_reg(tcg_target_call_clobber_regs, TCG_REG_V9);
+ tcg_regset_reset_reg(tcg_target_call_clobber_regs, TCG_REG_V10);
+ tcg_regset_reset_reg(tcg_target_call_clobber_regs, TCG_REG_V11);
+ tcg_regset_reset_reg(tcg_target_call_clobber_regs, TCG_REG_V12);
+ tcg_regset_reset_reg(tcg_target_call_clobber_regs, TCG_REG_V13);
+ tcg_regset_reset_reg(tcg_target_call_clobber_regs, TCG_REG_V14);
+ tcg_regset_reset_reg(tcg_target_call_clobber_regs, TCG_REG_V15);
s->reserved_regs = 0;
tcg_regset_set_reg(s->reserved_regs, TCG_REG_SP);
--
2.13.5
^ permalink raw reply related [flat|nested] 14+ messages in thread
* Re: [Qemu-devel] [PATCH v3 0/6] TCG vectorization and example conversion
2017-09-16 2:34 [Qemu-devel] [PATCH v3 0/6] TCG vectorization and example conversion Richard Henderson
` (5 preceding siblings ...)
2017-09-16 2:34 ` [Qemu-devel] [PATCH v3 6/6] tcg/aarch64: " Richard Henderson
@ 2017-09-16 2:35 ` Richard Henderson
2017-09-26 22:58 ` no-reply
7 siblings, 0 replies; 14+ messages in thread
From: Richard Henderson @ 2017-09-16 2:35 UTC (permalink / raw)
To: qemu-devel; +Cc: alex.bennee, f4bug
On 09/15/2017 07:34 PM, Richard Henderson wrote:
> Now addressing the complex vector op issue. I now expose TCGv_vec
> to target front-ends, but opaque wrt the vector size. One can thus
> compose vector operations, as demonstrated in target/arm/.
>
> The actual host vector length now becomes an argument to the *_vec
> opcodes. It's a little awkward, but does prevent an explosion of
> opcode values.
>
> All R-b dropped because all patches rewritten or heavily modified.
Bah. Forgot to mention that this depends on tcg-next. Full tree at
git://github.com/rth7680/qemu.git native-vector-registers-3
r~
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [Qemu-devel] [PATCH v3 1/6] tcg: Add types and operations for host vectors
2017-09-16 2:34 ` [Qemu-devel] [PATCH v3 1/6] tcg: Add types and operations for host vectors Richard Henderson
@ 2017-09-26 19:28 ` Alex Bennée
2017-09-27 16:18 ` Richard Henderson
0 siblings, 1 reply; 14+ messages in thread
From: Alex Bennée @ 2017-09-26 19:28 UTC (permalink / raw)
To: Richard Henderson; +Cc: qemu-devel, f4bug
Richard Henderson <richard.henderson@linaro.org> writes:
> Nothing uses or enables them yet.
>
> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
> ---
> tcg/tcg-op.h | 26 +++++++
> tcg/tcg-opc.h | 37 ++++++++++
> tcg/tcg.h | 34 +++++++++
> tcg/tcg-op.c | 234 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
> tcg/tcg.c | 77 ++++++++++++++++++-
> tcg/README | 46 ++++++++++++
> 6 files changed, 453 insertions(+), 1 deletion(-)
>
> diff --git a/tcg/tcg-op.h b/tcg/tcg-op.h
> index 5d3278f243..b9b0b9f46f 100644
> --- a/tcg/tcg-op.h
> +++ b/tcg/tcg-op.h
> @@ -915,6 +915,32 @@ void tcg_gen_atomic_or_fetch_i64(TCGv_i64, TCGv, TCGv_i64, TCGArg, TCGMemOp);
> void tcg_gen_atomic_xor_fetch_i32(TCGv_i32, TCGv, TCGv_i32, TCGArg, TCGMemOp);
> void tcg_gen_atomic_xor_fetch_i64(TCGv_i64, TCGv, TCGv_i64, TCGArg, TCGMemOp);
>
> +void tcg_gen_mov_vec(TCGv_vec, TCGv_vec);
> +void tcg_gen_movi_vec(TCGv_vec, tcg_target_long);
> +void tcg_gen_add8_vec(TCGv_vec r, TCGv_vec a, TCGv_vec b);
> +void tcg_gen_add16_vec(TCGv_vec r, TCGv_vec a, TCGv_vec b);
> +void tcg_gen_add32_vec(TCGv_vec r, TCGv_vec a, TCGv_vec b);
> +void tcg_gen_add64_vec(TCGv_vec r, TCGv_vec a, TCGv_vec b);
> +void tcg_gen_sub8_vec(TCGv_vec r, TCGv_vec a, TCGv_vec b);
> +void tcg_gen_sub16_vec(TCGv_vec r, TCGv_vec a, TCGv_vec b);
> +void tcg_gen_sub32_vec(TCGv_vec r, TCGv_vec a, TCGv_vec b);
> +void tcg_gen_sub64_vec(TCGv_vec r, TCGv_vec a, TCGv_vec b);
> +void tcg_gen_and_vec(TCGv_vec r, TCGv_vec a, TCGv_vec b);
> +void tcg_gen_or_vec(TCGv_vec r, TCGv_vec a, TCGv_vec b);
> +void tcg_gen_xor_vec(TCGv_vec r, TCGv_vec a, TCGv_vec b);
> +void tcg_gen_andc_vec(TCGv_vec r, TCGv_vec a, TCGv_vec b);
> +void tcg_gen_orc_vec(TCGv_vec r, TCGv_vec a, TCGv_vec b);
> +void tcg_gen_not_vec(TCGv_vec r, TCGv_vec a);
> +void tcg_gen_neg8_vec(TCGv_vec r, TCGv_vec a);
> +void tcg_gen_neg16_vec(TCGv_vec r, TCGv_vec a);
> +void tcg_gen_neg32_vec(TCGv_vec r, TCGv_vec a);
> +void tcg_gen_neg64_vec(TCGv_vec r, TCGv_vec a);
> +
> +void tcg_gen_ld_vec(TCGv_vec r, TCGv_ptr base, TCGArg offset);
> +void tcg_gen_st_vec(TCGv_vec r, TCGv_ptr base, TCGArg offset);
> +void tcg_gen_ldz_vec(TCGv_vec r, TCGv_ptr base, TCGArg offset, TCGType sz);
> +void tcg_gen_stl_vec(TCGv_vec r, TCGv_ptr base, TCGArg offset, TCGType sz);
> +
> #if TARGET_LONG_BITS == 64
> #define tcg_gen_movi_tl tcg_gen_movi_i64
> #define tcg_gen_mov_tl tcg_gen_mov_i64
> diff --git a/tcg/tcg-opc.h b/tcg/tcg-opc.h
> index 956fb1e9f3..8200184fa9 100644
> --- a/tcg/tcg-opc.h
> +++ b/tcg/tcg-opc.h
> @@ -204,8 +204,45 @@ DEF(qemu_ld_i64, DATA64_ARGS, TLADDR_ARGS, 1,
> DEF(qemu_st_i64, 0, TLADDR_ARGS + DATA64_ARGS, 1,
> TCG_OPF_CALL_CLOBBER | TCG_OPF_SIDE_EFFECTS | TCG_OPF_64BIT)
>
> +/* Host vector support. */
> +
> +#define IMPLVEC \
> + IMPL(TCG_TARGET_HAS_v64 | TCG_TARGET_HAS_v128 | TCG_TARGET_HAS_v256)
> +
> +DEF(mov_vec, 1, 1, 1, TCG_OPF_NOT_PRESENT)
> +
> +/* ??? Simple, but perhaps dupiN would be more descriptive. */
> +DEF(movi_vec, 1, 0, 2, TCG_OPF_NOT_PRESENT)
> +
> +DEF(ld_vec, 1, 1, 2, IMPLVEC)
> +DEF(ldz_vec, 1, 1, 3, IMPLVEC)
> +DEF(st_vec, 0, 2, 2, IMPLVEC)
> +
> +DEF(add8_vec, 1, 2, 1, IMPLVEC)
> +DEF(add16_vec, 1, 2, 1, IMPLVEC)
> +DEF(add32_vec, 1, 2, 1, IMPLVEC)
> +DEF(add64_vec, 1, 2, 1, IMPLVEC)
> +
> +DEF(sub8_vec, 1, 2, 1, IMPLVEC)
> +DEF(sub16_vec, 1, 2, 1, IMPLVEC)
> +DEF(sub32_vec, 1, 2, 1, IMPLVEC)
> +DEF(sub64_vec, 1, 2, 1, IMPLVEC)
> +
> +DEF(neg8_vec, 1, 1, 1, IMPLVEC | IMPL(TCG_TARGET_HAS_neg_vec))
> +DEF(neg16_vec, 1, 1, 1, IMPLVEC | IMPL(TCG_TARGET_HAS_neg_vec))
> +DEF(neg32_vec, 1, 1, 1, IMPLVEC | IMPL(TCG_TARGET_HAS_neg_vec))
> +DEF(neg64_vec, 1, 1, 1, IMPLVEC | IMPL(TCG_TARGET_HAS_neg_vec))
> +
> +DEF(and_vec, 1, 2, 1, IMPLVEC)
> +DEF(or_vec, 1, 2, 1, IMPLVEC)
> +DEF(xor_vec, 1, 2, 1, IMPLVEC)
> +DEF(andc_vec, 1, 2, 1, IMPLVEC | IMPL(TCG_TARGET_HAS_andc_vec))
> +DEF(orc_vec, 1, 2, 1, IMPLVEC | IMPL(TCG_TARGET_HAS_orc_vec))
> +DEF(not_vec, 1, 1, 1, IMPLVEC | IMPL(TCG_TARGET_HAS_not_vec))
> +
> #undef TLADDR_ARGS
> #undef DATA64_ARGS
> #undef IMPL
> #undef IMPL64
> +#undef IMPLVEC
> #undef DEF
> diff --git a/tcg/tcg.h b/tcg/tcg.h
> index 25662c36d4..7cd356e87f 100644
> --- a/tcg/tcg.h
> +++ b/tcg/tcg.h
> @@ -173,6 +173,16 @@ typedef uint64_t TCGRegSet;
> # error "Missing unsigned widening multiply"
> #endif
>
> +#ifndef TCG_TARGET_HAS_v64
> +#define TCG_TARGET_HAS_v64 0
> +#define TCG_TARGET_HAS_v128 0
> +#define TCG_TARGET_HAS_v256 0
> +#define TCG_TARGET_HAS_neg_vec 0
> +#define TCG_TARGET_HAS_not_vec 0
> +#define TCG_TARGET_HAS_andc_vec 0
> +#define TCG_TARGET_HAS_orc_vec 0
> +#endif
> +
> #ifndef TARGET_INSN_START_EXTRA_WORDS
> # define TARGET_INSN_START_WORDS 1
> #else
> @@ -249,6 +259,11 @@ typedef struct TCGPool {
> typedef enum TCGType {
> TCG_TYPE_I32,
> TCG_TYPE_I64,
> +
> + TCG_TYPE_V64,
> + TCG_TYPE_V128,
> + TCG_TYPE_V256,
> +
> TCG_TYPE_COUNT, /* number of different types */
>
> /* An alias for the size of the host register. */
> @@ -399,6 +414,8 @@ typedef tcg_target_ulong TCGArg;
> * TCGv_i32 : 32 bit integer type
> * TCGv_i64 : 64 bit integer type
> * TCGv_ptr : a host pointer type
> + * TCGv_vec : a host vector type; the exact size is not exposed
> + to the CPU front-end code.
Isn't this a guest vector type (which is pointed to by a host pointer)?
> * TCGv : an integer type the same size as target_ulong
> (an alias for either TCGv_i32 or TCGv_i64)
> The compiler's type checking will complain if you mix them
> @@ -424,6 +441,7 @@ typedef tcg_target_ulong TCGArg;
> typedef struct TCGv_i32_d *TCGv_i32;
> typedef struct TCGv_i64_d *TCGv_i64;
> typedef struct TCGv_ptr_d *TCGv_ptr;
> +typedef struct TCGv_vec_d *TCGv_vec;
> typedef TCGv_ptr TCGv_env;
> #if TARGET_LONG_BITS == 32
> #define TCGv TCGv_i32
> @@ -448,6 +466,11 @@ static inline TCGv_ptr QEMU_ARTIFICIAL MAKE_TCGV_PTR(intptr_t i)
> return (TCGv_ptr)i;
> }
>
> +static inline TCGv_vec QEMU_ARTIFICIAL MAKE_TCGV_VEC(intptr_t i)
> +{
> + return (TCGv_vec)i;
> +}
> +
> static inline intptr_t QEMU_ARTIFICIAL GET_TCGV_I32(TCGv_i32 t)
> {
> return (intptr_t)t;
> @@ -463,6 +486,11 @@ static inline intptr_t QEMU_ARTIFICIAL GET_TCGV_PTR(TCGv_ptr t)
> return (intptr_t)t;
> }
>
> +static inline intptr_t QEMU_ARTIFICIAL GET_TCGV_VEC(TCGv_vec t)
> +{
> + return (intptr_t)t;
> +}
> +
> #if TCG_TARGET_REG_BITS == 32
> #define TCGV_LOW(t) MAKE_TCGV_I32(GET_TCGV_I64(t))
> #define TCGV_HIGH(t) MAKE_TCGV_I32(GET_TCGV_I64(t) + 1)
> @@ -471,15 +499,18 @@ static inline intptr_t QEMU_ARTIFICIAL GET_TCGV_PTR(TCGv_ptr t)
> #define TCGV_EQUAL_I32(a, b) (GET_TCGV_I32(a) == GET_TCGV_I32(b))
> #define TCGV_EQUAL_I64(a, b) (GET_TCGV_I64(a) == GET_TCGV_I64(b))
> #define TCGV_EQUAL_PTR(a, b) (GET_TCGV_PTR(a) == GET_TCGV_PTR(b))
> +#define TCGV_EQUAL_VEC(a, b) (GET_TCGV_VEC(a) == GET_TCGV_VEC(b))
>
> /* Dummy definition to avoid compiler warnings. */
> #define TCGV_UNUSED_I32(x) x = MAKE_TCGV_I32(-1)
> #define TCGV_UNUSED_I64(x) x = MAKE_TCGV_I64(-1)
> #define TCGV_UNUSED_PTR(x) x = MAKE_TCGV_PTR(-1)
> +#define TCGV_UNUSED_VEC(x) x = MAKE_TCGV_VEC(-1)
>
> #define TCGV_IS_UNUSED_I32(x) (GET_TCGV_I32(x) == -1)
> #define TCGV_IS_UNUSED_I64(x) (GET_TCGV_I64(x) == -1)
> #define TCGV_IS_UNUSED_PTR(x) (GET_TCGV_PTR(x) == -1)
> +#define TCGV_IS_UNUSED_VEC(x) (GET_TCGV_VEC(x) == -1)
>
> /* call flags */
> /* Helper does not read globals (either directly or through an exception). It
> @@ -790,9 +821,12 @@ TCGv_i64 tcg_global_reg_new_i64(TCGReg reg, const char *name);
>
> TCGv_i32 tcg_temp_new_internal_i32(int temp_local);
> TCGv_i64 tcg_temp_new_internal_i64(int temp_local);
> +TCGv_vec tcg_temp_new_vec(TCGType type);
> +TCGv_vec tcg_temp_new_vec_matching(TCGv_vec match);
>
> void tcg_temp_free_i32(TCGv_i32 arg);
> void tcg_temp_free_i64(TCGv_i64 arg);
> +void tcg_temp_free_vec(TCGv_vec arg);
>
> static inline TCGv_i32 tcg_global_mem_new_i32(TCGv_ptr reg, intptr_t offset,
> const char *name)
> diff --git a/tcg/tcg-op.c b/tcg/tcg-op.c
> index 688d91755b..50b3177e5f 100644
> --- a/tcg/tcg-op.c
> +++ b/tcg/tcg-op.c
> @@ -3072,3 +3072,237 @@ static void tcg_gen_mov2_i64(TCGv_i64 r, TCGv_i64 a, TCGv_i64 b)
> GEN_ATOMIC_HELPER(xchg, mov2, 0)
>
> #undef GEN_ATOMIC_HELPER
> +
> +static void tcg_gen_op2_vec(TCGOpcode opc, TCGv_vec r, TCGv_vec a)
> +{
> + TCGArg ri = GET_TCGV_VEC(r);
> + TCGArg ai = GET_TCGV_VEC(a);
> + TCGTemp *rt = &tcg_ctx.temps[ri];
> + TCGTemp *at = &tcg_ctx.temps[ai];
> + TCGType type = rt->base_type;
> +
> + tcg_debug_assert(at->base_type == type);
> + tcg_gen_op3(&tcg_ctx, opc, ri, ai, type - TCG_TYPE_V64);
> +}
> +
> +static void tcg_gen_op3_vec(TCGOpcode opc, TCGv_vec r, TCGv_vec a, TCGv_vec b)
> +{
> + TCGArg ri = GET_TCGV_VEC(r);
> + TCGArg ai = GET_TCGV_VEC(a);
> + TCGArg bi = GET_TCGV_VEC(b);
> + TCGTemp *rt = &tcg_ctx.temps[ri];
> + TCGTemp *at = &tcg_ctx.temps[ai];
> + TCGTemp *bt = &tcg_ctx.temps[bi];
> + TCGType type = rt->base_type;
> +
> + tcg_debug_assert(at->base_type == type);
> + tcg_debug_assert(bt->base_type == type);
> + tcg_gen_op4(&tcg_ctx, opc, ri, ai, bi, type - TCG_TYPE_V64);
> +}
> +
> +void tcg_gen_mov_vec(TCGv_vec r, TCGv_vec a)
> +{
> + if (!TCGV_EQUAL_VEC(r, a)) {
> + tcg_gen_op2_vec(INDEX_op_mov_vec, r, a);
> + }
> +}
> +
> +void tcg_gen_movi_vec(TCGv_vec r, tcg_target_long a)
> +{
> + TCGArg ri = GET_TCGV_VEC(r);
> + TCGTemp *rt = &tcg_ctx.temps[ri];
> + TCGType type = rt->base_type;
> +
> + tcg_debug_assert(a == 0 || a == -1);
> + tcg_gen_op3(&tcg_ctx, INDEX_op_movi_vec, ri, a, type - TCG_TYPE_V64);
> +}
> +
> +void tcg_gen_ld_vec(TCGv_vec r, TCGv_ptr b, TCGArg o)
> +{
> + TCGArg ri = GET_TCGV_VEC(r);
> + TCGArg bi = GET_TCGV_PTR(b);
> + TCGTemp *rt = &tcg_ctx.temps[ri];
> + TCGType type = rt->base_type;
> +
> + tcg_gen_op4(&tcg_ctx, INDEX_op_ld_vec, ri, bi, o, type - TCG_TYPE_V64);
> +}
> +
> +void tcg_gen_st_vec(TCGv_vec r, TCGv_ptr b, TCGArg o)
> +{
> + TCGArg ri = GET_TCGV_VEC(r);
> + TCGArg bi = GET_TCGV_PTR(b);
> + TCGTemp *rt = &tcg_ctx.temps[ri];
> + TCGType type = rt->base_type;
> +
> + tcg_gen_op4(&tcg_ctx, INDEX_op_st_vec, ri, bi, o, type - TCG_TYPE_V64);
> +}
> +
> +/* Load data into a vector R from B+O using TYPE. If R is wider than TYPE,
> + fill the high bits with zeros. */
> +void tcg_gen_ldz_vec(TCGv_vec r, TCGv_ptr b, TCGArg o, TCGType type)
> +{
> + TCGArg ri = GET_TCGV_VEC(r);
> + TCGArg bi = GET_TCGV_PTR(b);
> + TCGTemp *rt = &tcg_ctx.temps[ri];
> + TCGType btype = rt->base_type;
> +
> + if (type < btype) {
> + tcg_gen_op5(&tcg_ctx, INDEX_op_ldz_vec, ri, bi, o,
> + type - TCG_TYPE_V64, btype - TCG_TYPE_V64);
> + } else {
> + tcg_debug_assert(type == btype);
> + tcg_gen_op4(&tcg_ctx, INDEX_op_ld_vec, ri, bi, o, type - TCG_TYPE_V64);
> + }
> +}
> +
> +/* Store data from vector R into B+O using TYPE. If R is wider than TYPE,
> + store only the low bits. */
> +void tcg_gen_stl_vec(TCGv_vec r, TCGv_ptr b, TCGArg o, TCGType type)
> +{
> + TCGArg ri = GET_TCGV_VEC(r);
> + TCGArg bi = GET_TCGV_PTR(b);
> + TCGTemp *rt = &tcg_ctx.temps[ri];
> + TCGType btype = rt->base_type;
> +
> + tcg_debug_assert(type <= btype);
> + tcg_gen_op4(&tcg_ctx, INDEX_op_st_vec, ri, bi, o, type - TCG_TYPE_V64);
> +}
> +
> +void tcg_gen_add8_vec(TCGv_vec r, TCGv_vec a, TCGv_vec b)
> +{
> + tcg_gen_op3_vec(INDEX_op_add8_vec, r, a, b);
> +}
> +
> +void tcg_gen_add16_vec(TCGv_vec r, TCGv_vec a, TCGv_vec b)
> +{
> + tcg_gen_op3_vec(INDEX_op_add16_vec, r, a, b);
> +}
> +
> +void tcg_gen_add32_vec(TCGv_vec r, TCGv_vec a, TCGv_vec b)
> +{
> + tcg_gen_op3_vec(INDEX_op_add32_vec, r, a, b);
> +}
> +
> +void tcg_gen_add64_vec(TCGv_vec r, TCGv_vec a, TCGv_vec b)
> +{
> + tcg_gen_op3_vec(INDEX_op_add64_vec, r, a, b);
> +}
> +
> +void tcg_gen_sub8_vec(TCGv_vec r, TCGv_vec a, TCGv_vec b)
> +{
> + tcg_gen_op3_vec(INDEX_op_sub8_vec, r, a, b);
> +}
> +
> +void tcg_gen_sub16_vec(TCGv_vec r, TCGv_vec a, TCGv_vec b)
> +{
> + tcg_gen_op3_vec(INDEX_op_sub16_vec, r, a, b);
> +}
> +
> +void tcg_gen_sub32_vec(TCGv_vec r, TCGv_vec a, TCGv_vec b)
> +{
> + tcg_gen_op3_vec(INDEX_op_sub32_vec, r, a, b);
> +}
> +
> +void tcg_gen_sub64_vec(TCGv_vec r, TCGv_vec a, TCGv_vec b)
> +{
> + tcg_gen_op3_vec(INDEX_op_sub64_vec, r, a, b);
> +}
> +
> +void tcg_gen_and_vec(TCGv_vec r, TCGv_vec a, TCGv_vec b)
> +{
> + tcg_gen_op3_vec(INDEX_op_and_vec, r, a, b);
> +}
> +
> +void tcg_gen_or_vec(TCGv_vec r, TCGv_vec a, TCGv_vec b)
> +{
> + tcg_gen_op3_vec(INDEX_op_or_vec, r, a, b);
> +}
> +
> +void tcg_gen_xor_vec(TCGv_vec r, TCGv_vec a, TCGv_vec b)
> +{
> + tcg_gen_op3_vec(INDEX_op_xor_vec, r, a, b);
> +}
> +
> +void tcg_gen_andc_vec(TCGv_vec r, TCGv_vec a, TCGv_vec b)
> +{
> + if (TCG_TARGET_HAS_andc_vec) {
> + tcg_gen_op3_vec(INDEX_op_andc_vec, r, a, b);
> + } else {
> + TCGv_vec t = tcg_temp_new_vec_matching(r);
> + tcg_gen_not_vec(t, b);
> + tcg_gen_and_vec(r, a, t);
> + tcg_temp_free_vec(t);
> + }
> +}
> +
> +void tcg_gen_orc_vec(TCGv_vec r, TCGv_vec a, TCGv_vec b)
> +{
> + if (TCG_TARGET_HAS_orc_vec) {
> + tcg_gen_op3_vec(INDEX_op_orc_vec, r, a, b);
> + } else {
> + TCGv_vec t = tcg_temp_new_vec_matching(r);
> + tcg_gen_not_vec(t, b);
> + tcg_gen_or_vec(r, a, t);
> + tcg_temp_free_vec(t);
> + }
> +}
> +
> +void tcg_gen_not_vec(TCGv_vec r, TCGv_vec a)
> +{
> + if (TCG_TARGET_HAS_not_vec) {
> + tcg_gen_op2_vec(INDEX_op_orc_vec, r, a);
> + } else {
> + TCGv_vec t = tcg_temp_new_vec_matching(r);
> + tcg_gen_movi_vec(t, -1);
> + tcg_gen_xor_vec(r, a, t);
> + tcg_temp_free_vec(t);
> + }
> +}
> +
> +void tcg_gen_neg8_vec(TCGv_vec r, TCGv_vec a)
> +{
> + if (TCG_TARGET_HAS_neg_vec) {
> + tcg_gen_op2_vec(INDEX_op_neg8_vec, r, a);
> + } else {
> + TCGv_vec t = tcg_temp_new_vec_matching(r);
> + tcg_gen_movi_vec(t, 0);
> + tcg_gen_sub8_vec(r, t, a);
> + tcg_temp_free_vec(t);
> + }
> +}
> +
> +void tcg_gen_neg16_vec(TCGv_vec r, TCGv_vec a)
> +{
> + if (TCG_TARGET_HAS_neg_vec) {
> + tcg_gen_op2_vec(INDEX_op_neg16_vec, r, a);
> + } else {
> + TCGv_vec t = tcg_temp_new_vec_matching(r);
> + tcg_gen_movi_vec(t, 0);
> + tcg_gen_sub16_vec(r, t, a);
> + tcg_temp_free_vec(t);
> + }
> +}
> +
> +void tcg_gen_neg32_vec(TCGv_vec r, TCGv_vec a)
> +{
> + if (TCG_TARGET_HAS_neg_vec) {
> + tcg_gen_op2_vec(INDEX_op_neg32_vec, r, a);
> + } else {
> + TCGv_vec t = tcg_temp_new_vec_matching(r);
> + tcg_gen_movi_vec(t, 0);
> + tcg_gen_sub32_vec(r, t, a);
> + tcg_temp_free_vec(t);
> + }
> +}
> +
> +void tcg_gen_neg64_vec(TCGv_vec r, TCGv_vec a)
> +{
> + if (TCG_TARGET_HAS_neg_vec) {
> + tcg_gen_op2_vec(INDEX_op_neg64_vec, r, a);
> + } else {
> + TCGv_vec t = tcg_temp_new_vec_matching(r);
> + tcg_gen_movi_vec(t, 0);
> + tcg_gen_sub64_vec(r, t, a);
> + tcg_temp_free_vec(t);
> + }
> +}
> diff --git a/tcg/tcg.c b/tcg/tcg.c
> index dff9999bc6..a4d55efdf0 100644
> --- a/tcg/tcg.c
> +++ b/tcg/tcg.c
> @@ -116,7 +116,7 @@ static int tcg_target_const_match(tcg_target_long val, TCGType type,
> static bool tcg_out_ldst_finalize(TCGContext *s);
> #endif
>
> -static TCGRegSet tcg_target_available_regs[2];
> +static TCGRegSet tcg_target_available_regs[TCG_TYPE_COUNT];
> static TCGRegSet tcg_target_call_clobber_regs;
>
> #if TCG_TARGET_INSN_UNIT_SIZE == 1
> @@ -664,6 +664,44 @@ TCGv_i64 tcg_temp_new_internal_i64(int temp_local)
> return MAKE_TCGV_I64(idx);
> }
>
> +TCGv_vec tcg_temp_new_vec(TCGType type)
> +{
> + int idx;
> +
> +#ifdef CONFIG_DEBUG_TCG
> + switch (type) {
> + case TCG_TYPE_V64:
> + assert(TCG_TARGET_HAS_v64);
> + break;
> + case TCG_TYPE_V128:
> + assert(TCG_TARGET_HAS_v128);
> + break;
> + case TCG_TYPE_V256:
> + assert(TCG_TARGET_HAS_v256);
> + break;
> + default:
> + g_assert_not_reached();
> + }
> +#endif
> +
> + idx = tcg_temp_new_internal(type, 0);
> + return MAKE_TCGV_VEC(idx);
> +}
> +
A one line comment wouldn't go amiss here. This looks like we are
allocating a new temp of the same type as an existing temp?
> +TCGv_vec tcg_temp_new_vec_matching(TCGv_vec match)
> +{
> + TCGContext *s = &tcg_ctx;
> + int idx = GET_TCGV_VEC(match);
> + TCGTemp *ts;
> +
> + tcg_debug_assert(idx >= s->nb_globals && idx < s->nb_temps);
> + ts = &s->temps[idx];
> + tcg_debug_assert(ts->temp_allocated != 0);
> +
> + idx = tcg_temp_new_internal(ts->base_type, 0);
> + return MAKE_TCGV_VEC(idx);
> +}
> +
> static void tcg_temp_free_internal(int idx)
> {
> TCGContext *s = &tcg_ctx;
> @@ -696,6 +734,11 @@ void tcg_temp_free_i64(TCGv_i64 arg)
> tcg_temp_free_internal(GET_TCGV_I64(arg));
> }
>
> +void tcg_temp_free_vec(TCGv_vec arg)
> +{
> + tcg_temp_free_internal(GET_TCGV_VEC(arg));
> +}
> +
> TCGv_i32 tcg_const_i32(int32_t val)
> {
> TCGv_i32 t0;
> @@ -753,6 +796,9 @@ int tcg_check_temp_count(void)
> Test the runtime variable that controls each opcode. */
> bool tcg_op_supported(TCGOpcode op)
> {
> + const bool have_vec
> + = TCG_TARGET_HAS_v64 | TCG_TARGET_HAS_v128 | TCG_TARGET_HAS_v256;
> +
> switch (op) {
> case INDEX_op_discard:
> case INDEX_op_set_label:
> @@ -966,6 +1012,35 @@ bool tcg_op_supported(TCGOpcode op)
> case INDEX_op_mulsh_i64:
> return TCG_TARGET_HAS_mulsh_i64;
>
> + case INDEX_op_mov_vec:
> + case INDEX_op_movi_vec:
> + case INDEX_op_ld_vec:
> + case INDEX_op_ldz_vec:
> + case INDEX_op_st_vec:
> + case INDEX_op_add8_vec:
> + case INDEX_op_add16_vec:
> + case INDEX_op_add32_vec:
> + case INDEX_op_add64_vec:
> + case INDEX_op_sub8_vec:
> + case INDEX_op_sub16_vec:
> + case INDEX_op_sub32_vec:
> + case INDEX_op_sub64_vec:
> + case INDEX_op_and_vec:
> + case INDEX_op_or_vec:
> + case INDEX_op_xor_vec:
> + return have_vec;
> + case INDEX_op_not_vec:
> + return have_vec && TCG_TARGET_HAS_not_vec;
> + case INDEX_op_neg8_vec:
> + case INDEX_op_neg16_vec:
> + case INDEX_op_neg32_vec:
> + case INDEX_op_neg64_vec:
> + return have_vec && TCG_TARGET_HAS_neg_vec;
> + case INDEX_op_andc_vec:
> + return have_vec && TCG_TARGET_HAS_andc_vec;
> + case INDEX_op_orc_vec:
> + return have_vec && TCG_TARGET_HAS_orc_vec;
> +
> case NB_OPS:
> break;
> }
> diff --git a/tcg/README b/tcg/README
> index 03bfb6acd4..3bf3af67db 100644
> --- a/tcg/README
> +++ b/tcg/README
> @@ -503,6 +503,52 @@ of the memory access.
> For a 32-bit host, qemu_ld/st_i64 is guaranteed to only be used with a
> 64-bit memory access specified in flags.
>
> +********* Host vector operations
> +
> +All of the vector ops have a final constant argument that specifies the
> +length of the vector operation LEN as 64 << LEN bits.
That doesn't scan well. So would a 4 lane operation be encoded as 64 <<
4? Is this because we are using the bottom bits for something?
> +
> +* mov_vec v0, v1, len
> +* ld_vec v0, t1, len
> +* st_vec v0, t1, len
> +
> + Move, load and store.
> +
> +* movi_vec v0, c, len
> +
> + Copy C across the entire vector.
> + At present the only supported values for C are 0 and -1.
I guess this is why the size is unimportant? This is for clearing or
setting the whole of the vector? What does len mean in this case?
> +
> +* add8_vec v0, v1, v2, len
> +* add16_vec v0, v1, v2, len
> +* add32_vec v0, v1, v2, len
> +* add64_vec v0, v1, v2, len
> +
> + v0 = v1 + v2, in elements of 8/16/32/64 bits, across len.
> +
> +* sub8_vec v0, v1, v2, len
> +* sub16_vec v0, v1, v2, len
> +* sub32_vec v0, v1, v2, len
> +* sub64_vec v0, v1, v2, len
> +
> + Similarly, v0 = v1 - v2.
> +
> +* neg8_vec v0, v1, len
> +* neg16_vec v0, v1, len
> +* neg32_vec v0, v1, len
> +* neg64_vec v0, v1, len
> +
> + Similarly, v0 = -v1.
> +
> +* and_vec v0, v1, v2, len
> +* or_vec v0, v1, v2, len
> +* xor_vec v0, v1, v2, len
> +* andc_vec v0, v1, v2, len
> +* orc_vec v0, v1, v2, len
> +* not_vec v0, v1, len
> +
> + Similarly, logical operations.
Similarly, logical operations with and without complement?
> +
> *********
>
> Note 1: Some shortcuts are defined when the last operand is known to be
--
Alex Bennée
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [Qemu-devel] [PATCH v3 2/6] tcg: Add vector expanders
2017-09-16 2:34 ` [Qemu-devel] [PATCH v3 2/6] tcg: Add vector expanders Richard Henderson
@ 2017-09-26 22:31 ` Alex Bennée
0 siblings, 0 replies; 14+ messages in thread
From: Alex Bennée @ 2017-09-26 22:31 UTC (permalink / raw)
To: Richard Henderson; +Cc: qemu-devel, f4bug
Richard Henderson <richard.henderson@linaro.org> writes:
> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
Other than live comments:
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
> ---
> Makefile.target | 2 +-
> accel/tcg/tcg-runtime.h | 24 ++
> tcg/tcg-gvec-desc.h | 49 +++
> tcg/tcg-op-gvec.h | 143 ++++++++
> accel/tcg/tcg-runtime-gvec.c | 255 +++++++++++++
> tcg/tcg-op-gvec.c | 853 +++++++++++++++++++++++++++++++++++++++++++
> accel/tcg/Makefile.objs | 2 +-
> 7 files changed, 1326 insertions(+), 2 deletions(-)
> create mode 100644 tcg/tcg-gvec-desc.h
> create mode 100644 tcg/tcg-op-gvec.h
> create mode 100644 accel/tcg/tcg-runtime-gvec.c
> create mode 100644 tcg/tcg-op-gvec.c
>
> diff --git a/Makefile.target b/Makefile.target
> index 6361f957fb..f9967feef5 100644
> --- a/Makefile.target
> +++ b/Makefile.target
> @@ -94,7 +94,7 @@ all: $(PROGS) stap
> obj-y += exec.o
> obj-y += accel/
> obj-$(CONFIG_TCG) += tcg/tcg.o tcg/tcg-op.o tcg/optimize.o
> -obj-$(CONFIG_TCG) += tcg/tcg-common.o
> +obj-$(CONFIG_TCG) += tcg/tcg-common.o tcg/tcg-op-gvec.o
> obj-$(CONFIG_TCG_INTERPRETER) += tcg/tci.o
> obj-$(CONFIG_TCG_INTERPRETER) += disas/tci.o
> obj-y += fpu/softfloat.o
> diff --git a/accel/tcg/tcg-runtime.h b/accel/tcg/tcg-runtime.h
> index c41d38a557..61c0ce39d3 100644
> --- a/accel/tcg/tcg-runtime.h
> +++ b/accel/tcg/tcg-runtime.h
> @@ -134,3 +134,27 @@ GEN_ATOMIC_HELPERS(xor_fetch)
> GEN_ATOMIC_HELPERS(xchg)
>
> #undef GEN_ATOMIC_HELPERS
> +
> +DEF_HELPER_FLAGS_3(gvec_mov, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
> +
> +DEF_HELPER_FLAGS_4(gvec_add8, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
> +DEF_HELPER_FLAGS_4(gvec_add16, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
> +DEF_HELPER_FLAGS_4(gvec_add32, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
> +DEF_HELPER_FLAGS_4(gvec_add64, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
> +
> +DEF_HELPER_FLAGS_4(gvec_sub8, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
> +DEF_HELPER_FLAGS_4(gvec_sub16, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
> +DEF_HELPER_FLAGS_4(gvec_sub32, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
> +DEF_HELPER_FLAGS_4(gvec_sub64, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
> +
> +DEF_HELPER_FLAGS_3(gvec_neg8, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
> +DEF_HELPER_FLAGS_3(gvec_neg16, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
> +DEF_HELPER_FLAGS_3(gvec_neg32, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
> +DEF_HELPER_FLAGS_3(gvec_neg64, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
> +
> +DEF_HELPER_FLAGS_3(gvec_not, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
> +DEF_HELPER_FLAGS_4(gvec_and, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
> +DEF_HELPER_FLAGS_4(gvec_or, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
> +DEF_HELPER_FLAGS_4(gvec_xor, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
> +DEF_HELPER_FLAGS_4(gvec_andc, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
> +DEF_HELPER_FLAGS_4(gvec_orc, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
> diff --git a/tcg/tcg-gvec-desc.h b/tcg/tcg-gvec-desc.h
> new file mode 100644
> index 0000000000..8ba9a8168d
> --- /dev/null
> +++ b/tcg/tcg-gvec-desc.h
> @@ -0,0 +1,49 @@
> +/*
> + * Generic vector operation descriptor
> + *
> + * Copyright (c) 2017 Linaro
> + *
> + * This library is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2 of the License, or (at your option) any later version.
> + *
> + * This library is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with this library; if not, see <http://www.gnu.org/licenses/>.
> + */
> +
> +/* ??? These bit widths are set for ARM SVE, maxing out at 256 byte vectors. */
> +#define SIMD_OPRSZ_SHIFT 0
> +#define SIMD_OPRSZ_BITS 5
> +
> +#define SIMD_MAXSZ_SHIFT (SIMD_OPRSZ_SHIFT + SIMD_OPRSZ_BITS)
> +#define SIMD_MAXSZ_BITS 5
> +
> +#define SIMD_DATA_SHIFT (SIMD_MAXSZ_SHIFT + SIMD_MAXSZ_BITS)
> +#define SIMD_DATA_BITS (32 - SIMD_DATA_SHIFT)
> +
> +/* Create a descriptor from components. */
> +uint32_t simd_desc(uint32_t oprsz, uint32_t maxsz, int32_t data);
> +
> +/* Extract the operation size from a descriptor. */
> +static inline intptr_t simd_oprsz(uint32_t desc)
> +{
> + return (extract32(desc, SIMD_OPRSZ_SHIFT, SIMD_OPRSZ_BITS) + 1) * 8;
> +}
> +
> +/* Extract the max vector size from a descriptor. */
> +static inline intptr_t simd_maxsz(uint32_t desc)
> +{
> + return (extract32(desc, SIMD_MAXSZ_SHIFT, SIMD_MAXSZ_BITS) + 1) * 8;
> +}
> +
> +/* Extract the operation-specific data from a descriptor. */
> +static inline int32_t simd_data(uint32_t desc)
> +{
> + return sextract32(desc, SIMD_DATA_SHIFT, SIMD_DATA_BITS);
> +}
> diff --git a/tcg/tcg-op-gvec.h b/tcg/tcg-op-gvec.h
> new file mode 100644
> index 0000000000..28bd77f1dc
> --- /dev/null
> +++ b/tcg/tcg-op-gvec.h
> @@ -0,0 +1,143 @@
> +/*
> + * Generic vector operation expansion
> + *
> + * Copyright (c) 2017 Linaro
> + *
> + * This library is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2 of the License, or (at your option) any later version.
> + *
> + * This library is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with this library; if not, see <http://www.gnu.org/licenses/>.
> + */
> +
> +/*
> + * "Generic" vectors. All operands are given as offsets from ENV,
> + * and therefore cannot also be allocated via tcg_global_mem_new_*.
> + * OPRSZ is the byte size of the vector upon which the operation is performed.
> + * MAXSZ is the byte size of the full vector; bytes beyond OPSZ are cleared.
> + *
> + * All sizes must be 8 or any multiple of 16.
> + * When OPRSZ is 8, the alignment may be 8, otherwise must be 16.
> + * Operands may completely, but not partially, overlap.
> + */
> +
> +/* Expand a call to a gvec-style helper, with pointers to two vector
> + operands, and a descriptor (see tcg-gvec-desc.h). */
> +typedef void (gen_helper_gvec_2)(TCGv_ptr, TCGv_ptr, TCGv_i32);
> +void tcg_gen_gvec_2_ool(uint32_t dofs, uint32_t aofs,
> + uint32_t oprsz, uint32_t maxsz, int32_t data,
> + gen_helper_gvec_2 *fn);
> +
> +/* Similarly, passing an extra pointer (e.g. env or float_status). */
> +typedef void (gen_helper_gvec_2_ptr)(TCGv_ptr, TCGv_ptr, TCGv_ptr, TCGv_i32);
> +void tcg_gen_gvec_2_ptr(uint32_t dofs, uint32_t aofs,
> + TCGv_ptr ptr, uint32_t oprsz, uint32_t maxsz,
> + int32_t data, gen_helper_gvec_2_ptr *fn);
> +
> +/* Similarly, with three vector operands. */
> +typedef void (gen_helper_gvec_3)(TCGv_ptr, TCGv_ptr, TCGv_ptr, TCGv_i32);
> +void tcg_gen_gvec_3_ool(uint32_t dofs, uint32_t aofs, uint32_t bofs,
> + uint32_t oprsz, uint32_t maxsz, int32_t data,
> + gen_helper_gvec_3 *fn);
> +
> +typedef void (gen_helper_gvec_3_ptr)(TCGv_ptr, TCGv_ptr, TCGv_ptr,
> + TCGv_ptr, TCGv_i32);
> +void tcg_gen_gvec_3_ptr(uint32_t dofs, uint32_t aofs, uint32_t bofs,
> + TCGv_ptr ptr, uint32_t oprsz, uint32_t maxsz,
> + int32_t data, gen_helper_gvec_3_ptr *fn);
> +
> +/* Expand a gvec operation. Either inline or out-of-line depending on
> + the actual vector size and the operations supported by the host. */
> +typedef struct {
> + /* Expand inline as a 64-bit or 32-bit integer.
> + Only one of these will be non-NULL. */
> + void (*fni8)(TCGv_i64, TCGv_i64);
> + void (*fni4)(TCGv_i32, TCGv_i32);
> + /* Expand inline with a host vector type. */
> + void (*fniv)(TCGv_vec, TCGv_vec);
> + /* Expand out-of-line helper w/descriptor. */
> + gen_helper_gvec_2 *fno;
> + /* Prefer i64 to v64. */
> + bool prefer_i64;
> +} GVecGen2;
> +
> +typedef struct {
> + /* Expand inline as a 64-bit or 32-bit integer.
> + Only one of these will be non-NULL. */
> + void (*fni8)(TCGv_i64, TCGv_i64, TCGv_i64);
> + void (*fni4)(TCGv_i32, TCGv_i32, TCGv_i32);
> + /* Expand inline with a host vector type. */
> + void (*fniv)(TCGv_vec, TCGv_vec, TCGv_vec);
> + /* Expand out-of-line helper w/descriptor. */
> + gen_helper_gvec_3 *fno;
> + /* Prefer i64 to v64. */
> + bool prefer_i64;
> + /* Load dest as a 3rd source operand. */
> + bool load_dest;
> +} GVecGen3;
> +
> +void tcg_gen_gvec_2(uint32_t dofs, uint32_t aofs,
> + uint32_t opsz, uint32_t clsz, const GVecGen2 *);
> +void tcg_gen_gvec_3(uint32_t dofs, uint32_t aofs, uint32_t bofs,
> + uint32_t opsz, uint32_t clsz, const GVecGen3 *);
> +
> +/* Expand a specific vector operation. */
> +
> +#define DEF(X) \
> + void tcg_gen_gvec_##X(uint32_t dofs, uint32_t aofs, \
> + uint32_t opsz, uint32_t clsz)
> +
> +DEF(mov);
> +DEF(not);
> +DEF(neg8);
> +DEF(neg16);
> +DEF(neg32);
> +DEF(neg64);
> +
> +#undef DEF
> +#define DEF(X) \
> + void tcg_gen_gvec_##X(uint32_t dofs, uint32_t aofs, uint32_t bofs, \
> + uint32_t opsz, uint32_t clsz)
> +
> +DEF(add8);
> +DEF(add16);
> +DEF(add32);
> +DEF(add64);
> +
> +DEF(sub8);
> +DEF(sub16);
> +DEF(sub32);
> +DEF(sub64);
> +
> +DEF(and);
> +DEF(or);
> +DEF(xor);
> +DEF(andc);
> +DEF(orc);
> +
> +#undef DEF
> +
> +/*
> + * 64-bit vector operations. Use these when the register has been allocated
> + * with tcg_global_mem_new_i64, and so we cannot also address it via pointer.
> + * OPRSZ = MAXSZ = 8.
> + */
> +
> +void tcg_gen_vec_neg8_i64(TCGv_i64 d, TCGv_i64 a);
> +void tcg_gen_vec_neg16_i64(TCGv_i64 d, TCGv_i64 a);
> +void tcg_gen_vec_neg32_i64(TCGv_i64 d, TCGv_i64 a);
> +
> +void tcg_gen_vec_add8_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b);
> +void tcg_gen_vec_add16_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b);
> +void tcg_gen_vec_add32_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b);
> +
> +void tcg_gen_vec_sub8_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b);
> +void tcg_gen_vec_sub16_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b);
> +void tcg_gen_vec_sub32_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b);
> diff --git a/accel/tcg/tcg-runtime-gvec.c b/accel/tcg/tcg-runtime-gvec.c
> new file mode 100644
> index 0000000000..c75e76367c
> --- /dev/null
> +++ b/accel/tcg/tcg-runtime-gvec.c
> @@ -0,0 +1,255 @@
> +/*
> + * Generic vectorized operation runtime
> + *
> + * Copyright (c) 2017 Linaro
> + *
> + * This library is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2 of the License, or (at your option) any later version.
> + *
> + * This library is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with this library; if not, see <http://www.gnu.org/licenses/>.
> + */
> +
> +#include "qemu/osdep.h"
> +#include "qemu/host-utils.h"
> +#include "cpu.h"
> +#include "exec/helper-proto.h"
> +#include "tcg-gvec-desc.h"
> +
> +
> +/* Virtually all hosts support 16-byte vectors. Those that don't can emulate
> + them via GCC's generic vector extension. This turns out to be simpler and
> + more reliable than getting the compiler to autovectorize.
> +
> + In tcg-op-gvec.c, we asserted that both the size and alignment
> + of the data are multiples of 16. */
> +
> +typedef uint8_t vec8 __attribute__((vector_size(16)));
> +typedef uint16_t vec16 __attribute__((vector_size(16)));
> +typedef uint32_t vec32 __attribute__((vector_size(16)));
> +typedef uint64_t vec64 __attribute__((vector_size(16)));
> +
> +static inline void clear_high(void *d, intptr_t oprsz, uint32_t desc)
> +{
> + intptr_t maxsz = simd_maxsz(desc);
> + intptr_t i;
> +
> + if (unlikely(maxsz > oprsz)) {
> + for (i = oprsz; i < maxsz; i += sizeof(vec64)) {
> + *(vec64 *)(d + i) = (vec64){ 0 };
> + }
> + }
> +}
> +
> +void HELPER(gvec_add8)(void *d, void *a, void *b, uint32_t desc)
> +{
> + intptr_t oprsz = simd_oprsz(desc);
> + intptr_t i;
> +
> + for (i = 0; i < oprsz; i += sizeof(vec8)) {
> + *(vec8 *)(d + i) = *(vec8 *)(a + i) + *(vec8 *)(b + i);
> + }
> + clear_high(d, oprsz, desc);
> +}
> +
> +void HELPER(gvec_add16)(void *d, void *a, void *b, uint32_t desc)
> +{
> + intptr_t oprsz = simd_oprsz(desc);
> + intptr_t i;
> +
> + for (i = 0; i < oprsz; i += sizeof(vec16)) {
> + *(vec16 *)(d + i) = *(vec16 *)(a + i) + *(vec16 *)(b + i);
> + }
> + clear_high(d, oprsz, desc);
> +}
> +
> +void HELPER(gvec_add32)(void *d, void *a, void *b, uint32_t desc)
> +{
> + intptr_t oprsz = simd_oprsz(desc);
> + intptr_t i;
> +
> + for (i = 0; i < oprsz; i += sizeof(vec32)) {
> + *(vec32 *)(d + i) = *(vec32 *)(a + i) + *(vec32 *)(b + i);
> + }
> + clear_high(d, oprsz, desc);
> +}
> +
> +void HELPER(gvec_add64)(void *d, void *a, void *b, uint32_t desc)
> +{
> + intptr_t oprsz = simd_oprsz(desc);
> + intptr_t i;
> +
> + for (i = 0; i < oprsz; i += sizeof(vec64)) {
> + *(vec64 *)(d + i) = *(vec64 *)(a + i) + *(vec64 *)(b + i);
> + }
> + clear_high(d, oprsz, desc);
> +}
> +
> +void HELPER(gvec_sub8)(void *d, void *a, void *b, uint32_t desc)
> +{
> + intptr_t oprsz = simd_oprsz(desc);
> + intptr_t i;
> +
> + for (i = 0; i < oprsz; i += sizeof(vec8)) {
> + *(vec8 *)(d + i) = *(vec8 *)(a + i) - *(vec8 *)(b + i);
> + }
> + clear_high(d, oprsz, desc);
> +}
> +
> +void HELPER(gvec_sub16)(void *d, void *a, void *b, uint32_t desc)
> +{
> + intptr_t oprsz = simd_oprsz(desc);
> + intptr_t i;
> +
> + for (i = 0; i < oprsz; i += sizeof(vec16)) {
> + *(vec16 *)(d + i) = *(vec16 *)(a + i) - *(vec16 *)(b + i);
> + }
> + clear_high(d, oprsz, desc);
> +}
> +
> +void HELPER(gvec_sub32)(void *d, void *a, void *b, uint32_t desc)
> +{
> + intptr_t oprsz = simd_oprsz(desc);
> + intptr_t i;
> +
> + for (i = 0; i < oprsz; i += sizeof(vec32)) {
> + *(vec32 *)(d + i) = *(vec32 *)(a + i) - *(vec32 *)(b + i);
> + }
> + clear_high(d, oprsz, desc);
> +}
> +
> +void HELPER(gvec_sub64)(void *d, void *a, void *b, uint32_t desc)
> +{
> + intptr_t oprsz = simd_oprsz(desc);
> + intptr_t i;
> +
> + for (i = 0; i < oprsz; i += sizeof(vec64)) {
> + *(vec64 *)(d + i) = *(vec64 *)(a + i) - *(vec64 *)(b + i);
> + }
> + clear_high(d, oprsz, desc);
> +}
> +
> +void HELPER(gvec_neg8)(void *d, void *a, uint32_t desc)
> +{
> + intptr_t oprsz = simd_oprsz(desc);
> + intptr_t i;
> +
> + for (i = 0; i < oprsz; i += sizeof(vec8)) {
> + *(vec8 *)(d + i) = -*(vec8 *)(a + i);
> + }
> + clear_high(d, oprsz, desc);
> +}
> +
> +void HELPER(gvec_neg16)(void *d, void *a, uint32_t desc)
> +{
> + intptr_t oprsz = simd_oprsz(desc);
> + intptr_t i;
> +
> + for (i = 0; i < oprsz; i += sizeof(vec16)) {
> + *(vec16 *)(d + i) = -*(vec16 *)(a + i);
> + }
> + clear_high(d, oprsz, desc);
> +}
> +
> +void HELPER(gvec_neg32)(void *d, void *a, uint32_t desc)
> +{
> + intptr_t oprsz = simd_oprsz(desc);
> + intptr_t i;
> +
> + for (i = 0; i < oprsz; i += sizeof(vec32)) {
> + *(vec32 *)(d + i) = -*(vec32 *)(a + i);
> + }
> + clear_high(d, oprsz, desc);
> +}
> +
> +void HELPER(gvec_neg64)(void *d, void *a, uint32_t desc)
> +{
> + intptr_t oprsz = simd_oprsz(desc);
> + intptr_t i;
> +
> + for (i = 0; i < oprsz; i += sizeof(vec64)) {
> + *(vec64 *)(d + i) = -*(vec64 *)(a + i);
> + }
> + clear_high(d, oprsz, desc);
> +}
> +
> +void HELPER(gvec_mov)(void *d, void *a, uint32_t desc)
> +{
> + intptr_t oprsz = simd_oprsz(desc);
> +
> + memcpy(d, a, oprsz);
> + clear_high(d, oprsz, desc);
> +}
> +
> +void HELPER(gvec_not)(void *d, void *a, uint32_t desc)
> +{
> + intptr_t oprsz = simd_oprsz(desc);
> + intptr_t i;
> +
> + for (i = 0; i < oprsz; i += sizeof(vec64)) {
> + *(vec64 *)(d + i) = ~*(vec64 *)(a + i);
> + }
> + clear_high(d, oprsz, desc);
> +}
> +
> +void HELPER(gvec_and)(void *d, void *a, void *b, uint32_t desc)
> +{
> + intptr_t oprsz = simd_oprsz(desc);
> + intptr_t i;
> +
> + for (i = 0; i < oprsz; i += sizeof(vec64)) {
> + *(vec64 *)(d + i) = *(vec64 *)(a + i) & *(vec64 *)(b + i);
> + }
> + clear_high(d, oprsz, desc);
> +}
> +
> +void HELPER(gvec_or)(void *d, void *a, void *b, uint32_t desc)
> +{
> + intptr_t oprsz = simd_oprsz(desc);
> + intptr_t i;
> +
> + for (i = 0; i < oprsz; i += sizeof(vec64)) {
> + *(vec64 *)(d + i) = *(vec64 *)(a + i) | *(vec64 *)(b + i);
> + }
> + clear_high(d, oprsz, desc);
> +}
> +
> +void HELPER(gvec_xor)(void *d, void *a, void *b, uint32_t desc)
> +{
> + intptr_t oprsz = simd_oprsz(desc);
> + intptr_t i;
> +
> + for (i = 0; i < oprsz; i += sizeof(vec64)) {
> + *(vec64 *)(d + i) = *(vec64 *)(a + i) ^ *(vec64 *)(b + i);
> + }
> + clear_high(d, oprsz, desc);
> +}
> +
> +void HELPER(gvec_andc)(void *d, void *a, void *b, uint32_t desc)
> +{
> + intptr_t oprsz = simd_oprsz(desc);
> + intptr_t i;
> +
> + for (i = 0; i < oprsz; i += sizeof(vec64)) {
> + *(vec64 *)(d + i) = *(vec64 *)(a + i) &~ *(vec64 *)(b + i);
> + }
> + clear_high(d, oprsz, desc);
> +}
> +
> +void HELPER(gvec_orc)(void *d, void *a, void *b, uint32_t desc)
> +{
> + intptr_t oprsz = simd_oprsz(desc);
> + intptr_t i;
> +
> + for (i = 0; i < oprsz; i += sizeof(vec64)) {
> + *(vec64 *)(d + i) = *(vec64 *)(a + i) |~ *(vec64 *)(b + i);
> + }
> + clear_high(d, oprsz, desc);
> +}
> diff --git a/tcg/tcg-op-gvec.c b/tcg/tcg-op-gvec.c
> new file mode 100644
> index 0000000000..7464321eba
> --- /dev/null
> +++ b/tcg/tcg-op-gvec.c
> @@ -0,0 +1,853 @@
> +/*
> + * Generic vector operation expansion
> + *
> + * Copyright (c) 2017 Linaro
> + *
> + * This library is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2 of the License, or (at your option) any later version.
> + *
> + * This library is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with this library; if not, see <http://www.gnu.org/licenses/>.
> + */
> +
> +#include "qemu/osdep.h"
> +#include "qemu-common.h"
> +#include "tcg.h"
> +#include "tcg-op.h"
> +#include "tcg-op-gvec.h"
> +#include "tcg-gvec-desc.h"
> +
> +#define REP8(x) ((x) * 0x0101010101010101ull)
> +#define REP16(x) ((x) * 0x0001000100010001ull)
> +
> +#define MAX_UNROLL 4
> +
> +/* Verify vector size and alignment rules. OFS should be the OR of all
> + of the operand offsets so that we can check them all at once. */
> +static void check_size_align(uint32_t oprsz, uint32_t maxsz, uint32_t ofs)
> +{
> + uint32_t align = maxsz > 16 || oprsz >= 16 ? 15 : 7;
> + tcg_debug_assert(oprsz > 0);
> + tcg_debug_assert(oprsz <= maxsz);
> + tcg_debug_assert((oprsz & align) == 0);
> + tcg_debug_assert((maxsz & align) == 0);
> + tcg_debug_assert((ofs & align) == 0);
> +}
> +
> +/* Verify vector overlap rules for two operands. */
> +static void check_overlap_2(uint32_t d, uint32_t a, uint32_t s)
> +{
> + tcg_debug_assert(d == a || d + s <= a || a + s <= d);
> +}
> +
> +/* Verify vector overlap rules for three operands. */
> +static void check_overlap_3(uint32_t d, uint32_t a, uint32_t b, uint32_t s)
> +{
> + check_overlap_2(d, a, s);
> + check_overlap_2(d, b, s);
> + check_overlap_2(a, b, s);
> +}
> +
> +/* Create a descriptor from components. */
> +uint32_t simd_desc(uint32_t oprsz, uint32_t maxsz, int32_t data)
> +{
> + uint32_t desc = 0;
> +
> + assert(oprsz % 8 == 0 && oprsz <= (8 << SIMD_OPRSZ_BITS));
> + assert(maxsz % 8 == 0 && maxsz <= (8 << SIMD_MAXSZ_BITS));
> + assert(data == sextract32(data, 0, SIMD_DATA_BITS));
> +
> + oprsz = (oprsz / 8) - 1;
> + maxsz = (maxsz / 8) - 1;
> + desc = deposit32(desc, SIMD_OPRSZ_SHIFT, SIMD_OPRSZ_BITS, oprsz);
> + desc = deposit32(desc, SIMD_MAXSZ_SHIFT, SIMD_MAXSZ_BITS, maxsz);
> + desc = deposit32(desc, SIMD_DATA_SHIFT, SIMD_DATA_BITS, data);
> +
> + return desc;
> +}
> +
> +/* Generate a call to a gvec-style helper with two vector operands. */
> +void tcg_gen_gvec_2_ool(uint32_t dofs, uint32_t aofs,
> + uint32_t oprsz, uint32_t maxsz, int32_t data,
> + gen_helper_gvec_2 *fn)
> +{
> + TCGv_ptr a0, a1;
> + TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));
> +
> + a0 = tcg_temp_new_ptr();
> + a1 = tcg_temp_new_ptr();
> +
> + tcg_gen_addi_ptr(a0, tcg_ctx.tcg_env, dofs);
> + tcg_gen_addi_ptr(a1, tcg_ctx.tcg_env, aofs);
> +
> + fn(a0, a1, desc);
> +
> + tcg_temp_free_ptr(a0);
> + tcg_temp_free_ptr(a1);
> + tcg_temp_free_i32(desc);
> +}
> +
> +/* Generate a call to a gvec-style helper with three vector operands. */
> +void tcg_gen_gvec_3_ool(uint32_t dofs, uint32_t aofs, uint32_t bofs,
> + uint32_t oprsz, uint32_t maxsz, int32_t data,
> + gen_helper_gvec_3 *fn)
> +{
> + TCGv_ptr a0, a1, a2;
> + TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));
> +
> + a0 = tcg_temp_new_ptr();
> + a1 = tcg_temp_new_ptr();
> + a2 = tcg_temp_new_ptr();
> +
> + tcg_gen_addi_ptr(a0, tcg_ctx.tcg_env, dofs);
> + tcg_gen_addi_ptr(a1, tcg_ctx.tcg_env, aofs);
> + tcg_gen_addi_ptr(a2, tcg_ctx.tcg_env, bofs);
> +
> + fn(a0, a1, a2, desc);
> +
> + tcg_temp_free_ptr(a0);
> + tcg_temp_free_ptr(a1);
> + tcg_temp_free_ptr(a2);
> + tcg_temp_free_i32(desc);
> +}
> +
> +/* Generate a call to a gvec-style helper with three vector operands
> + and an extra pointer operand. */
> +void tcg_gen_gvec_2_ptr(uint32_t dofs, uint32_t aofs,
> + TCGv_ptr ptr, uint32_t oprsz, uint32_t maxsz,
> + int32_t data, gen_helper_gvec_2_ptr *fn)
> +{
> + TCGv_ptr a0, a1;
> + TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));
> +
> + a0 = tcg_temp_new_ptr();
> + a1 = tcg_temp_new_ptr();
> +
> + tcg_gen_addi_ptr(a0, tcg_ctx.tcg_env, dofs);
> + tcg_gen_addi_ptr(a1, tcg_ctx.tcg_env, aofs);
> +
> + fn(a0, a1, ptr, desc);
> +
> + tcg_temp_free_ptr(a0);
> + tcg_temp_free_ptr(a1);
> + tcg_temp_free_i32(desc);
> +}
> +
> +/* Generate a call to a gvec-style helper with three vector operands
> + and an extra pointer operand. */
> +void tcg_gen_gvec_3_ptr(uint32_t dofs, uint32_t aofs, uint32_t bofs,
> + TCGv_ptr ptr, uint32_t oprsz, uint32_t maxsz,
> + int32_t data, gen_helper_gvec_3_ptr *fn)
> +{
> + TCGv_ptr a0, a1, a2;
> + TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));
> +
> + a0 = tcg_temp_new_ptr();
> + a1 = tcg_temp_new_ptr();
> + a2 = tcg_temp_new_ptr();
> +
> + tcg_gen_addi_ptr(a0, tcg_ctx.tcg_env, dofs);
> + tcg_gen_addi_ptr(a1, tcg_ctx.tcg_env, aofs);
> + tcg_gen_addi_ptr(a2, tcg_ctx.tcg_env, bofs);
> +
> + fn(a0, a1, a2, ptr, desc);
> +
> + tcg_temp_free_ptr(a0);
> + tcg_temp_free_ptr(a1);
> + tcg_temp_free_ptr(a2);
> + tcg_temp_free_i32(desc);
> +}
> +
> +/* Return true if we want to implement something of OPRSZ bytes
> + in units of LNSZ. This limits the expansion of inline code. */
> +static inline bool check_size_impl(uint32_t oprsz, uint32_t lnsz)
> +{
> + uint32_t lnct = oprsz / lnsz;
> + return lnct >= 1 && lnct <= MAX_UNROLL;
> +}
> +
> +/* Clear MAXSZ bytes at DOFS. */
> +static void expand_clr(uint32_t dofs, uint32_t maxsz)
> +{
> + if (maxsz >= 16 && TCG_TARGET_HAS_v128) {
> + TCGv_vec zero;
> +
> + if (maxsz >= 32 && TCG_TARGET_HAS_v256) {
> + zero = tcg_temp_new_vec(TCG_TYPE_V256);
> + tcg_gen_movi_vec(zero, 0);
> +
> + for (; maxsz >= 32; dofs += 32, maxsz -= 32) {
> + tcg_gen_stl_vec(zero, tcg_ctx.tcg_env, dofs, TCG_TYPE_V256);
> + }
> + } else {
> + zero = tcg_temp_new_vec(TCG_TYPE_V128);
> + tcg_gen_movi_vec(zero, 0);
> + }
> + for (; maxsz >= 16; dofs += 16, maxsz -= 16) {
> + tcg_gen_stl_vec(zero, tcg_ctx.tcg_env, dofs, TCG_TYPE_V128);
> + }
> +
> + tcg_temp_free_vec(zero);
> + } if (TCG_TARGET_REG_BITS == 64) {
> + TCGv_i64 zero = tcg_const_i64(0);
> +
> + for (; maxsz >= 8; dofs += 8, maxsz -= 8) {
> + tcg_gen_st_i64(zero, tcg_ctx.tcg_env, dofs);
> + }
> +
> + tcg_temp_free_i64(zero);
> + } else if (TCG_TARGET_HAS_v64) {
> + TCGv_vec zero = tcg_temp_new_vec(TCG_TYPE_V64);
> +
> + tcg_gen_movi_vec(zero, 0);
> + for (; maxsz >= 8; dofs += 8, maxsz -= 8) {
> + tcg_gen_st_vec(zero, tcg_ctx.tcg_env, dofs);
> + }
> +
> + tcg_temp_free_vec(zero);
> + } else {
> + TCGv_i32 zero = tcg_const_i32(0);
> +
> + for (; maxsz >= 4; dofs += 4, maxsz -= 4) {
> + tcg_gen_st_i32(zero, tcg_ctx.tcg_env, dofs);
> + }
> +
> + tcg_temp_free_i32(zero);
> + }
> +}
> +
> +/* Expand OPSZ bytes worth of two-operand operations using i32 elements. */
> +static void expand_2_i32(uint32_t dofs, uint32_t aofs, uint32_t opsz,
> + void (*fni)(TCGv_i32, TCGv_i32))
> +{
> + TCGv_i32 t0 = tcg_temp_new_i32();
> + uint32_t i;
> +
> + for (i = 0; i < opsz; i += 4) {
> + tcg_gen_ld_i32(t0, tcg_ctx.tcg_env, aofs + i);
> + fni(t0, t0);
> + tcg_gen_st_i32(t0, tcg_ctx.tcg_env, dofs + i);
> + }
> + tcg_temp_free_i32(t0);
> +}
> +
> +/* Expand OPSZ bytes worth of three-operand operations using i32 elements. */
> +static void expand_3_i32(uint32_t dofs, uint32_t aofs,
> + uint32_t bofs, uint32_t opsz, bool load_dest,
> + void (*fni)(TCGv_i32, TCGv_i32, TCGv_i32))
> +{
> + TCGv_i32 t0 = tcg_temp_new_i32();
> + TCGv_i32 t1 = tcg_temp_new_i32();
> + TCGv_i32 t2 = tcg_temp_new_i32();
> + uint32_t i;
> +
> + for (i = 0; i < opsz; i += 4) {
> + tcg_gen_ld_i32(t0, tcg_ctx.tcg_env, aofs + i);
> + tcg_gen_ld_i32(t1, tcg_ctx.tcg_env, bofs + i);
> + if (load_dest) {
> + tcg_gen_ld_i32(t2, tcg_ctx.tcg_env, dofs + i);
> + }
> + fni(t2, t0, t1);
> + tcg_gen_st_i32(t2, tcg_ctx.tcg_env, dofs + i);
> + }
> + tcg_temp_free_i32(t2);
> + tcg_temp_free_i32(t1);
> + tcg_temp_free_i32(t0);
> +}
> +
> +/* Expand OPSZ bytes worth of two-operand operations using i64 elements. */
> +static void expand_2_i64(uint32_t dofs, uint32_t aofs, uint32_t opsz,
> + void (*fni)(TCGv_i64, TCGv_i64))
> +{
> + TCGv_i64 t0 = tcg_temp_new_i64();
> + uint32_t i;
> +
> + for (i = 0; i < opsz; i += 8) {
> + tcg_gen_ld_i64(t0, tcg_ctx.tcg_env, aofs + i);
> + fni(t0, t0);
> + tcg_gen_st_i64(t0, tcg_ctx.tcg_env, dofs + i);
> + }
> + tcg_temp_free_i64(t0);
> +}
> +
> +/* Expand OPSZ bytes worth of three-operand operations using i64 elements. */
> +static void expand_3_i64(uint32_t dofs, uint32_t aofs,
> + uint32_t bofs, uint32_t opsz, bool load_dest,
> + void (*fni)(TCGv_i64, TCGv_i64, TCGv_i64))
> +{
> + TCGv_i64 t0 = tcg_temp_new_i64();
> + TCGv_i64 t1 = tcg_temp_new_i64();
> + TCGv_i64 t2 = tcg_temp_new_i64();
> + uint32_t i;
> +
> + for (i = 0; i < opsz; i += 8) {
> + tcg_gen_ld_i64(t0, tcg_ctx.tcg_env, aofs + i);
> + tcg_gen_ld_i64(t1, tcg_ctx.tcg_env, bofs + i);
> + if (load_dest) {
> + tcg_gen_ld_i64(t2, tcg_ctx.tcg_env, dofs + i);
> + }
> + fni(t2, t0, t1);
> + tcg_gen_st_i64(t2, tcg_ctx.tcg_env, dofs + i);
> + }
> + tcg_temp_free_i64(t2);
> + tcg_temp_free_i64(t1);
> + tcg_temp_free_i64(t0);
> +}
> +
> +/* Expand OPSZ bytes worth of two-operand operations using host vectors. */
> +static void expand_2_vec(uint32_t dofs, uint32_t aofs,
> + uint32_t opsz, uint32_t tysz, TCGType type,
> + void (*fni)(TCGv_vec, TCGv_vec))
> +{
> + TCGv_vec t0 = tcg_temp_new_vec(type);
> + uint32_t i;
> +
> + for (i = 0; i < opsz; i += tysz) {
> + tcg_gen_ld_vec(t0, tcg_ctx.tcg_env, aofs + i);
> + fni(t0, t0);
> + tcg_gen_st_vec(t0, tcg_ctx.tcg_env, dofs + i);
> + }
> + tcg_temp_free_vec(t0);
> +}
> +
> +/* Expand OPSZ bytes worth of three-operand operations using host vectors. */
> +static void expand_3_vec(uint32_t dofs, uint32_t aofs,
> + uint32_t bofs, uint32_t opsz,
> + uint32_t tysz, TCGType type, bool load_dest,
> + void (*fni)(TCGv_vec, TCGv_vec, TCGv_vec))
> +{
> + TCGv_vec t0 = tcg_temp_new_vec(type);
> + TCGv_vec t1 = tcg_temp_new_vec(type);
> + TCGv_vec t2 = tcg_temp_new_vec(type);
> + uint32_t i;
> +
> + for (i = 0; i < opsz; i += tysz) {
> + tcg_gen_ld_vec(t0, tcg_ctx.tcg_env, aofs + i);
> + tcg_gen_ld_vec(t1, tcg_ctx.tcg_env, bofs + i);
> + if (load_dest) {
> + tcg_gen_ld_vec(t2, tcg_ctx.tcg_env, dofs + i);
> + }
> + fni(t2, t0, t1);
> + tcg_gen_st_vec(t2, tcg_ctx.tcg_env, dofs + i);
> + }
> + tcg_temp_free_vec(t2);
> + tcg_temp_free_vec(t1);
> + tcg_temp_free_vec(t0);
> +}
> +
> +/* Expand a vector two-operand operation. */
> +void tcg_gen_gvec_2(uint32_t dofs, uint32_t aofs,
> + uint32_t oprsz, uint32_t maxsz, const GVecGen2 *g)
> +{
> + check_size_align(oprsz, maxsz, dofs | aofs);
> + check_overlap_2(dofs, aofs, maxsz);
> +
> + /* Quick check for sizes we won't support inline. */
> + if (oprsz > MAX_UNROLL * 32 || maxsz > MAX_UNROLL * 32) {
> + goto do_ool;
> + }
> +
> + /* Recall that ARM SVE allows vector sizes that are not a power of 2.
> + Expand with successively smaller host vector sizes. The intent is
> + that e.g. oprsz == 80 would be expanded with 2x32 + 1x16. */
> + /* ??? For maxsz > oprsz, the host may be able to use an op-sized
> + operation, zeroing the balance of the register. We can then
> + use a cl-sized store to implement the clearing without an extra
> + store operation. This is true for aarch64 and x86_64 hosts. */
> +
> + if (TCG_TARGET_HAS_v256 && check_size_impl(oprsz, 32)) {
> + uint32_t done = QEMU_ALIGN_DOWN(oprsz, 32);
> + expand_2_vec(dofs, aofs, done, 32, TCG_TYPE_V256, g->fniv);
> + dofs += done;
> + aofs += done;
> + oprsz -= done;
> + maxsz -= done;
> + }
> +
> + if (TCG_TARGET_HAS_v128 && check_size_impl(oprsz, 16)) {
> + uint32_t done = QEMU_ALIGN_DOWN(oprsz, 16);
> + expand_2_vec(dofs, aofs, done, 16, TCG_TYPE_V128, g->fniv);
> + dofs += done;
> + aofs += done;
> + oprsz -= done;
> + maxsz -= done;
> + }
> +
> + if (check_size_impl(oprsz, 8)) {
> + uint32_t done = QEMU_ALIGN_DOWN(oprsz, 8);
> + if (TCG_TARGET_HAS_v64 && !g->prefer_i64) {
> + expand_2_vec(dofs, aofs, done, 8, TCG_TYPE_V64, g->fniv);
> + } else if (g->fni8) {
> + expand_2_i64(dofs, aofs, done, g->fni8);
> + } else {
> + done = 0;
> + }
> + dofs += done;
> + aofs += done;
> + oprsz -= done;
> + maxsz -= done;
> + }
> +
> + if (check_size_impl(oprsz, 4)) {
> + uint32_t done = QEMU_ALIGN_DOWN(oprsz, 4);
> + expand_2_i32(dofs, aofs, done, g->fni4);
> + dofs += done;
> + aofs += done;
> + oprsz -= done;
> + maxsz -= done;
> + }
> +
> + if (oprsz == 0) {
> + if (maxsz != 0) {
> + expand_clr(dofs, maxsz);
> + }
> + return;
> + }
> +
> + do_ool:
> + tcg_gen_gvec_2_ool(dofs, aofs, oprsz, maxsz, 0, g->fno);
> +}
> +
> +/* Expand a vector three-operand operation. */
> +void tcg_gen_gvec_3(uint32_t dofs, uint32_t aofs, uint32_t bofs,
> + uint32_t oprsz, uint32_t maxsz, const GVecGen3 *g)
> +{
> + check_size_align(oprsz, maxsz, dofs | aofs | bofs);
> + check_overlap_3(dofs, aofs, bofs, maxsz);
> +
> + /* Quick check for sizes we won't support inline. */
> + if (oprsz > MAX_UNROLL * 32 || maxsz > MAX_UNROLL * 32) {
> + goto do_ool;
> + }
> +
> + /* Recall that ARM SVE allows vector sizes that are not a power of 2.
> + Expand with successively smaller host vector sizes. The intent is
> + that e.g. oprsz == 80 would be expanded with 2x32 + 1x16. */
> + /* ??? For maxsz > oprsz, the host may be able to use an op-sized
> + operation, zeroing the balance of the register. We can then
> + use a cl-sized store to implement the clearing without an extra
> + store operation. This is true for aarch64 and x86_64 hosts. */
> +
> + if (TCG_TARGET_HAS_v256 && check_size_impl(oprsz, 32)) {
> + uint32_t done = QEMU_ALIGN_DOWN(oprsz, 32);
> + expand_3_vec(dofs, aofs, bofs, done, 32, TCG_TYPE_V256,
> + g->load_dest, g->fniv);
> + dofs += done;
> + aofs += done;
> + bofs += done;
> + oprsz -= done;
> + maxsz -= done;
> + }
> +
> + if (TCG_TARGET_HAS_v128 && check_size_impl(oprsz, 16)) {
> + uint32_t done = QEMU_ALIGN_DOWN(oprsz, 16);
> + expand_3_vec(dofs, aofs, bofs, done, 16, TCG_TYPE_V128,
> + g->load_dest, g->fniv);
> + dofs += done;
> + aofs += done;
> + bofs += done;
> + oprsz -= done;
> + maxsz -= done;
> + }
> +
> + if (check_size_impl(oprsz, 8)) {
> + uint32_t done = QEMU_ALIGN_DOWN(oprsz, 8);
> + if (TCG_TARGET_HAS_v64 && !g->prefer_i64) {
> + expand_3_vec(dofs, aofs, bofs, done, 8, TCG_TYPE_V64,
> + g->load_dest, g->fniv);
> + } else if (g->fni8) {
> + expand_3_i64(dofs, aofs, bofs, done, g->load_dest, g->fni8);
> + } else {
> + done = 0;
> + }
> + dofs += done;
> + aofs += done;
> + bofs += done;
> + oprsz -= done;
> + maxsz -= done;
> + }
> +
> + if (check_size_impl(oprsz, 4)) {
> + uint32_t done = QEMU_ALIGN_DOWN(oprsz, 4);
> + expand_3_i32(dofs, aofs, bofs, done, g->load_dest, g->fni4);
> + dofs += done;
> + aofs += done;
> + bofs += done;
> + oprsz -= done;
> + maxsz -= done;
> + }
> +
> + if (oprsz == 0) {
> + if (maxsz != 0) {
> + expand_clr(dofs, maxsz);
> + }
> + return;
> + }
> +
> + do_ool:
> + tcg_gen_gvec_3_ool(dofs, aofs, bofs, oprsz, maxsz, 0, g->fno);
> +}
> +
> +/*
> + * Expand specific vector operations.
> + */
> +
> +void tcg_gen_gvec_mov(uint32_t dofs, uint32_t aofs,
> + uint32_t opsz, uint32_t clsz)
> +{
> + static const GVecGen2 g = {
> + .fni8 = tcg_gen_mov_i64,
> + .fniv = tcg_gen_mov_vec,
> + .fno = gen_helper_gvec_mov,
> + .prefer_i64 = TCG_TARGET_REG_BITS == 64,
> + };
> + tcg_gen_gvec_2(dofs, aofs, opsz, clsz, &g);
> +}
> +
> +void tcg_gen_gvec_not(uint32_t dofs, uint32_t aofs,
> + uint32_t opsz, uint32_t clsz)
> +{
> + static const GVecGen2 g = {
> + .fni8 = tcg_gen_not_i64,
> + .fniv = tcg_gen_not_vec,
> + .fno = gen_helper_gvec_not,
> + .prefer_i64 = TCG_TARGET_REG_BITS == 64,
> + };
> + tcg_gen_gvec_2(dofs, aofs, opsz, clsz, &g);
> +}
> +
> +static void gen_addv_mask(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b, TCGv_i64 m)
> +{
> + TCGv_i64 t1 = tcg_temp_new_i64();
> + TCGv_i64 t2 = tcg_temp_new_i64();
> + TCGv_i64 t3 = tcg_temp_new_i64();
> +
> + tcg_gen_andc_i64(t1, a, m);
> + tcg_gen_andc_i64(t2, b, m);
> + tcg_gen_xor_i64(t3, a, b);
> + tcg_gen_add_i64(d, t1, t2);
> + tcg_gen_and_i64(t3, t3, m);
> + tcg_gen_xor_i64(d, d, t3);
> +
> + tcg_temp_free_i64(t1);
> + tcg_temp_free_i64(t2);
> + tcg_temp_free_i64(t3);
> +}
> +
> +void tcg_gen_vec_add8_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
> +{
> + TCGv_i64 m = tcg_const_i64(REP8(0x80));
> + gen_addv_mask(d, a, b, m);
> + tcg_temp_free_i64(m);
> +}
> +
> +void tcg_gen_vec_add16_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
> +{
> + TCGv_i64 m = tcg_const_i64(REP16(0x8000));
> + gen_addv_mask(d, a, b, m);
> + tcg_temp_free_i64(m);
> +}
> +
> +void tcg_gen_vec_add32_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
> +{
> + TCGv_i64 t1 = tcg_temp_new_i64();
> + TCGv_i64 t2 = tcg_temp_new_i64();
> +
> + tcg_gen_andi_i64(t1, a, ~0xffffffffull);
> + tcg_gen_add_i64(t2, a, b);
> + tcg_gen_add_i64(t1, t1, b);
> + tcg_gen_deposit_i64(d, t1, t2, 0, 32);
> +
> + tcg_temp_free_i64(t1);
> + tcg_temp_free_i64(t2);
> +}
> +
> +void tcg_gen_gvec_add8(uint32_t dofs, uint32_t aofs, uint32_t bofs,
> + uint32_t opsz, uint32_t clsz)
> +{
> + static const GVecGen3 g = {
> + .fni8 = tcg_gen_vec_add8_i64,
> + .fniv = tcg_gen_add8_vec,
> + .fno = gen_helper_gvec_add8,
> + };
> + tcg_gen_gvec_3(dofs, aofs, bofs, opsz, clsz, &g);
> +}
> +
> +void tcg_gen_gvec_add16(uint32_t dofs, uint32_t aofs, uint32_t bofs,
> + uint32_t opsz, uint32_t clsz)
> +{
> + static const GVecGen3 g = {
> + .fni8 = tcg_gen_vec_add16_i64,
> + .fniv = tcg_gen_add16_vec,
> + .fno = gen_helper_gvec_add16,
> + };
> + tcg_gen_gvec_3(dofs, aofs, bofs, opsz, clsz, &g);
> +}
> +
> +void tcg_gen_gvec_add32(uint32_t dofs, uint32_t aofs, uint32_t bofs,
> + uint32_t opsz, uint32_t clsz)
> +{
> + static const GVecGen3 g = {
> + .fni4 = tcg_gen_add_i32,
> + .fniv = tcg_gen_add32_vec,
> + .fno = gen_helper_gvec_add32,
> + };
> + tcg_gen_gvec_3(dofs, aofs, bofs, opsz, clsz, &g);
> +}
> +
> +void tcg_gen_gvec_add64(uint32_t dofs, uint32_t aofs, uint32_t bofs,
> + uint32_t opsz, uint32_t clsz)
> +{
> + static const GVecGen3 g = {
> + .fni8 = tcg_gen_add_i64,
> + .fniv = tcg_gen_add64_vec,
> + .fno = gen_helper_gvec_add64,
> + .prefer_i64 = TCG_TARGET_REG_BITS == 64,
> + };
> + tcg_gen_gvec_3(dofs, aofs, bofs, opsz, clsz, &g);
> +}
> +
> +static void gen_subv_mask(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b, TCGv_i64 m)
> +{
> + TCGv_i64 t1 = tcg_temp_new_i64();
> + TCGv_i64 t2 = tcg_temp_new_i64();
> + TCGv_i64 t3 = tcg_temp_new_i64();
> +
> + tcg_gen_or_i64(t1, a, m);
> + tcg_gen_andc_i64(t2, b, m);
> + tcg_gen_eqv_i64(t3, a, b);
> + tcg_gen_sub_i64(d, t1, t2);
> + tcg_gen_and_i64(t3, t3, m);
> + tcg_gen_xor_i64(d, d, t3);
> +
> + tcg_temp_free_i64(t1);
> + tcg_temp_free_i64(t2);
> + tcg_temp_free_i64(t3);
> +}
> +
> +void tcg_gen_vec_sub8_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
> +{
> + TCGv_i64 m = tcg_const_i64(REP8(0x80));
> + gen_subv_mask(d, a, b, m);
> + tcg_temp_free_i64(m);
> +}
> +
> +void tcg_gen_vec_sub16_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
> +{
> + TCGv_i64 m = tcg_const_i64(REP16(0x8000));
> + gen_subv_mask(d, a, b, m);
> + tcg_temp_free_i64(m);
> +}
> +
> +void tcg_gen_vec_sub32_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
> +{
> + TCGv_i64 t1 = tcg_temp_new_i64();
> + TCGv_i64 t2 = tcg_temp_new_i64();
> +
> + tcg_gen_andi_i64(t1, b, ~0xffffffffull);
> + tcg_gen_sub_i64(t2, a, b);
> + tcg_gen_sub_i64(t1, a, t1);
> + tcg_gen_deposit_i64(d, t1, t2, 0, 32);
> +
> + tcg_temp_free_i64(t1);
> + tcg_temp_free_i64(t2);
> +}
> +
> +void tcg_gen_gvec_sub8(uint32_t dofs, uint32_t aofs, uint32_t bofs,
> + uint32_t opsz, uint32_t clsz)
> +{
> + static const GVecGen3 g = {
> + .fni8 = tcg_gen_vec_sub8_i64,
> + .fniv = tcg_gen_sub8_vec,
> + .fno = gen_helper_gvec_sub8,
> + };
> + tcg_gen_gvec_3(dofs, aofs, bofs, opsz, clsz, &g);
> +}
> +
> +void tcg_gen_gvec_sub16(uint32_t dofs, uint32_t aofs, uint32_t bofs,
> + uint32_t opsz, uint32_t clsz)
> +{
> + static const GVecGen3 g = {
> + .fni8 = tcg_gen_vec_sub16_i64,
> + .fniv = tcg_gen_sub16_vec,
> + .fno = gen_helper_gvec_sub16,
> + };
> + tcg_gen_gvec_3(dofs, aofs, bofs, opsz, clsz, &g);
> +}
> +
> +void tcg_gen_gvec_sub32(uint32_t dofs, uint32_t aofs, uint32_t bofs,
> + uint32_t opsz, uint32_t clsz)
> +{
> + static const GVecGen3 g = {
> + .fni4 = tcg_gen_sub_i32,
> + .fniv = tcg_gen_sub32_vec,
> + .fno = gen_helper_gvec_sub32,
> + };
> + tcg_gen_gvec_3(dofs, aofs, bofs, opsz, clsz, &g);
> +}
> +
> +void tcg_gen_gvec_sub64(uint32_t dofs, uint32_t aofs, uint32_t bofs,
> + uint32_t opsz, uint32_t clsz)
> +{
> + static const GVecGen3 g = {
> + .fni8 = tcg_gen_sub_i64,
> + .fniv = tcg_gen_sub64_vec,
> + .fno = gen_helper_gvec_sub64,
> + .prefer_i64 = TCG_TARGET_REG_BITS == 64,
> + };
> + tcg_gen_gvec_3(dofs, aofs, bofs, opsz, clsz, &g);
> +}
> +
> +static void gen_negv_mask(TCGv_i64 d, TCGv_i64 b, TCGv_i64 m)
> +{
> + TCGv_i64 t2 = tcg_temp_new_i64();
> + TCGv_i64 t3 = tcg_temp_new_i64();
> +
> + tcg_gen_andc_i64(t3, m, b);
> + tcg_gen_andc_i64(t2, b, m);
> + tcg_gen_sub_i64(d, m, t2);
> + tcg_gen_xor_i64(d, d, t3);
> +
> + tcg_temp_free_i64(t2);
> + tcg_temp_free_i64(t3);
> +}
> +
> +void tcg_gen_vec_neg8_i64(TCGv_i64 d, TCGv_i64 b)
> +{
> + TCGv_i64 m = tcg_const_i64(REP8(0x80));
> + gen_negv_mask(d, b, m);
> + tcg_temp_free_i64(m);
> +}
> +
> +void tcg_gen_vec_neg16_i64(TCGv_i64 d, TCGv_i64 b)
> +{
> + TCGv_i64 m = tcg_const_i64(REP16(0x8000));
> + gen_negv_mask(d, b, m);
> + tcg_temp_free_i64(m);
> +}
> +
> +void tcg_gen_vec_neg32_i64(TCGv_i64 d, TCGv_i64 b)
> +{
> + TCGv_i64 t1 = tcg_temp_new_i64();
> + TCGv_i64 t2 = tcg_temp_new_i64();
> +
> + tcg_gen_andi_i64(t1, b, ~0xffffffffull);
> + tcg_gen_neg_i64(t2, b);
> + tcg_gen_neg_i64(t1, t1);
> + tcg_gen_deposit_i64(d, t1, t2, 0, 32);
> +
> + tcg_temp_free_i64(t1);
> + tcg_temp_free_i64(t2);
> +}
> +
> +void tcg_gen_gvec_neg8(uint32_t dofs, uint32_t aofs,
> + uint32_t opsz, uint32_t clsz)
> +{
> + static const GVecGen2 g = {
> + .fni8 = tcg_gen_vec_neg8_i64,
> + .fniv = tcg_gen_neg8_vec,
> + .fno = gen_helper_gvec_neg8,
> + };
> + tcg_gen_gvec_2(dofs, aofs, opsz, clsz, &g);
> +}
> +
> +void tcg_gen_gvec_neg16(uint32_t dofs, uint32_t aofs,
> + uint32_t opsz, uint32_t clsz)
> +{
> + static const GVecGen2 g = {
> + .fni8 = tcg_gen_vec_neg16_i64,
> + .fniv = tcg_gen_neg16_vec,
> + .fno = gen_helper_gvec_neg16,
> + };
> + tcg_gen_gvec_2(dofs, aofs, opsz, clsz, &g);
> +}
> +
> +void tcg_gen_gvec_neg32(uint32_t dofs, uint32_t aofs,
> + uint32_t opsz, uint32_t clsz)
> +{
> + static const GVecGen2 g = {
> + .fni4 = tcg_gen_neg_i32,
> + .fniv = tcg_gen_neg32_vec,
> + .fno = gen_helper_gvec_neg32,
> + };
> + tcg_gen_gvec_2(dofs, aofs, opsz, clsz, &g);
> +}
> +
> +void tcg_gen_gvec_neg64(uint32_t dofs, uint32_t aofs,
> + uint32_t opsz, uint32_t clsz)
> +{
> + static const GVecGen2 g = {
> + .fni8 = tcg_gen_neg_i64,
> + .fniv = tcg_gen_neg64_vec,
> + .fno = gen_helper_gvec_neg64,
> + .prefer_i64 = TCG_TARGET_REG_BITS == 64,
> + };
> + tcg_gen_gvec_2(dofs, aofs, opsz, clsz, &g);
> +}
> +
> +void tcg_gen_gvec_and(uint32_t dofs, uint32_t aofs, uint32_t bofs,
> + uint32_t opsz, uint32_t clsz)
> +{
> + static const GVecGen3 g = {
> + .fni8 = tcg_gen_and_i64,
> + .fniv = tcg_gen_and_vec,
> + .fno = gen_helper_gvec_and,
> + .prefer_i64 = TCG_TARGET_REG_BITS == 64,
> + };
> + tcg_gen_gvec_3(dofs, aofs, bofs, opsz, clsz, &g);
> +}
> +
> +void tcg_gen_gvec_or(uint32_t dofs, uint32_t aofs, uint32_t bofs,
> + uint32_t opsz, uint32_t clsz)
> +{
> + static const GVecGen3 g = {
> + .fni8 = tcg_gen_or_i64,
> + .fniv = tcg_gen_or_vec,
> + .fno = gen_helper_gvec_or,
> + .prefer_i64 = TCG_TARGET_REG_BITS == 64,
> + };
> + tcg_gen_gvec_3(dofs, aofs, bofs, opsz, clsz, &g);
> +}
> +
> +void tcg_gen_gvec_xor(uint32_t dofs, uint32_t aofs, uint32_t bofs,
> + uint32_t opsz, uint32_t clsz)
> +{
> + static const GVecGen3 g = {
> + .fni8 = tcg_gen_xor_i64,
> + .fniv = tcg_gen_xor_vec,
> + .fno = gen_helper_gvec_xor,
> + .prefer_i64 = TCG_TARGET_REG_BITS == 64,
> + };
> + tcg_gen_gvec_3(dofs, aofs, bofs, opsz, clsz, &g);
> +}
> +
> +void tcg_gen_gvec_andc(uint32_t dofs, uint32_t aofs, uint32_t bofs,
> + uint32_t opsz, uint32_t clsz)
> +{
> + static const GVecGen3 g = {
> + .fni8 = tcg_gen_andc_i64,
> + .fniv = tcg_gen_andc_vec,
> + .fno = gen_helper_gvec_andc,
> + .prefer_i64 = TCG_TARGET_REG_BITS == 64,
> + };
> + tcg_gen_gvec_3(dofs, aofs, bofs, opsz, clsz, &g);
> +}
> +
> +void tcg_gen_gvec_orc(uint32_t dofs, uint32_t aofs, uint32_t bofs,
> + uint32_t opsz, uint32_t clsz)
> +{
> + static const GVecGen3 g = {
> + .fni8 = tcg_gen_orc_i64,
> + .fniv = tcg_gen_orc_vec,
> + .fno = gen_helper_gvec_orc,
> + .prefer_i64 = TCG_TARGET_REG_BITS == 64,
> + };
> + tcg_gen_gvec_3(dofs, aofs, bofs, opsz, clsz, &g);
> +}
> diff --git a/accel/tcg/Makefile.objs b/accel/tcg/Makefile.objs
> index 228cd84fa4..d381a02f34 100644
> --- a/accel/tcg/Makefile.objs
> +++ b/accel/tcg/Makefile.objs
> @@ -1,6 +1,6 @@
> obj-$(CONFIG_SOFTMMU) += tcg-all.o
> obj-$(CONFIG_SOFTMMU) += cputlb.o
> -obj-y += tcg-runtime.o
> +obj-y += tcg-runtime.o tcg-runtime-gvec.o
> obj-y += cpu-exec.o cpu-exec-common.o translate-all.o
> obj-y += translator.o
--
Alex Bennée
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [Qemu-devel] [PATCH v3 3/6] target/arm: Align vector registers
2017-09-16 2:34 ` [Qemu-devel] [PATCH v3 3/6] target/arm: Align vector registers Richard Henderson
@ 2017-09-26 22:33 ` Alex Bennée
0 siblings, 0 replies; 14+ messages in thread
From: Alex Bennée @ 2017-09-26 22:33 UTC (permalink / raw)
To: Richard Henderson; +Cc: qemu-devel, f4bug
Richard Henderson <richard.henderson@linaro.org> writes:
> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
Signed-off-by: Alex Bennée <alex.bennee@linaro.org>
> ---
> target/arm/cpu.h | 2 +-
> 1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/target/arm/cpu.h b/target/arm/cpu.h
> index 98b9b26fd3..c346bd148f 100644
> --- a/target/arm/cpu.h
> +++ b/target/arm/cpu.h
> @@ -486,7 +486,7 @@ typedef struct CPUARMState {
> * the two execution states, and means we do not need to explicitly
> * map these registers when changing states.
> */
> - float64 regs[64];
> + float64 regs[64] QEMU_ALIGNED(16);
>
> uint32_t xregs[16];
> /* We store these fpcsr fields separately for convenience. */
--
Alex Bennée
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [Qemu-devel] [PATCH v3 0/6] TCG vectorization and example conversion
2017-09-16 2:34 [Qemu-devel] [PATCH v3 0/6] TCG vectorization and example conversion Richard Henderson
` (6 preceding siblings ...)
2017-09-16 2:35 ` [Qemu-devel] [PATCH v3 0/6] TCG vectorization and example conversion Richard Henderson
@ 2017-09-26 22:58 ` no-reply
7 siblings, 0 replies; 14+ messages in thread
From: no-reply @ 2017-09-26 22:58 UTC (permalink / raw)
To: richard.henderson; +Cc: famz, qemu-devel, alex.bennee, f4bug
Hi,
This series seems to have some coding style problems. See output below for
more information:
Type: series
Message-id: 20170916023417.14599-1-richard.henderson@linaro.org
Subject: [Qemu-devel] [PATCH v3 0/6] TCG vectorization and example conversion
=== TEST SCRIPT BEGIN ===
#!/bin/bash
BASE=base
n=1
total=$(git log --oneline $BASE.. | wc -l)
failed=0
git config --local diff.renamelimit 0
git config --local diff.renames True
commits="$(git log --format=%H --reverse $BASE..)"
for c in $commits; do
echo "Checking PATCH $n/$total: $(git log -n 1 --format=%s $c)..."
if ! git show $c --format=email | ./scripts/checkpatch.pl --mailback -; then
failed=1
echo
fi
n=$((n+1))
done
exit $failed
=== TEST SCRIPT END ===
Updating 3c8cf5a9c21ff8782164d1def7f44bd888713384
Switched to a new branch 'test'
7f8bff3639 tcg/aarch64: Add vector operations
107700b998 tcg/i386: Add vector operations
63c5d729cd target/arm: Use vector infrastructure for aa64 add/sub/logic
66bd1ba117 target/arm: Align vector registers
bcf88636c0 tcg: Add vector expanders
00e32ea5b2 tcg: Add types and operations for host vectors
=== OUTPUT BEGIN ===
Checking PATCH 1/6: tcg: Add types and operations for host vectors...
Checking PATCH 2/6: tcg: Add vector expanders...
ERROR: spaces required around that '&' (ctx:WxO)
#284: FILE: accel/tcg/tcg-runtime-gvec.c:241:
+ *(vec64 *)(d + i) = *(vec64 *)(a + i) &~ *(vec64 *)(b + i);
^
ERROR: space prohibited after that '~' (ctx:OxW)
#284: FILE: accel/tcg/tcg-runtime-gvec.c:241:
+ *(vec64 *)(d + i) = *(vec64 *)(a + i) &~ *(vec64 *)(b + i);
^
ERROR: spaces required around that '|' (ctx:WxO)
#295: FILE: accel/tcg/tcg-runtime-gvec.c:252:
+ *(vec64 *)(d + i) = *(vec64 *)(a + i) |~ *(vec64 *)(b + i);
^
ERROR: space prohibited after that '~' (ctx:OxW)
#295: FILE: accel/tcg/tcg-runtime-gvec.c:252:
+ *(vec64 *)(d + i) = *(vec64 *)(a + i) |~ *(vec64 *)(b + i);
^
ERROR: trailing statements should be on next line
#589: FILE: tcg/tcg-op-gvec.c:198:
+ } if (TCG_TARGET_REG_BITS == 64) {
total: 5 errors, 0 warnings, 1342 lines checked
Your patch has style problems, please review. If any of these errors
are false positives report them to the maintainer, see
CHECKPATCH in MAINTAINERS.
Checking PATCH 3/6: target/arm: Align vector registers...
Checking PATCH 4/6: target/arm: Use vector infrastructure for aa64 add/sub/logic...
Checking PATCH 5/6: tcg/i386: Add vector operations...
WARNING: architecture specific defines should be avoided
#50: FILE: tcg/i386/tcg-target.h:93:
+#ifdef __SSE2__
WARNING: architecture specific defines should be avoided
#55: FILE: tcg/i386/tcg-target.h:98:
+#ifdef __AVX2__
total: 0 errors, 2 warnings, 722 lines checked
Your patch has style problems, please review. If any of these errors
are false positives report them to the maintainer, see
CHECKPATCH in MAINTAINERS.
Checking PATCH 6/6: tcg/aarch64: Add vector operations...
=== OUTPUT END ===
Test command exited with code: 1
---
Email generated automatically by Patchew [http://patchew.org/].
Please send your feedback to patchew-devel@freelists.org
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [Qemu-devel] [PATCH v3 4/6] target/arm: Use vector infrastructure for aa64 add/sub/logic
2017-09-16 2:34 ` [Qemu-devel] [PATCH v3 4/6] target/arm: Use vector infrastructure for aa64 add/sub/logic Richard Henderson
@ 2017-09-26 23:12 ` Alex Bennée
0 siblings, 0 replies; 14+ messages in thread
From: Alex Bennée @ 2017-09-26 23:12 UTC (permalink / raw)
To: Richard Henderson; +Cc: qemu-devel, f4bug
Richard Henderson <richard.henderson@linaro.org> writes:
> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
> ---
> target/arm/translate-a64.c | 216 ++++++++++++++++++++++++++++++---------------
> 1 file changed, 143 insertions(+), 73 deletions(-)
>
> diff --git a/target/arm/translate-a64.c b/target/arm/translate-a64.c
> index a3984c9a0d..4759cc9829 100644
> --- a/target/arm/translate-a64.c
> +++ b/target/arm/translate-a64.c
> @@ -21,6 +21,7 @@
> #include "cpu.h"
> #include "exec/exec-all.h"
> #include "tcg-op.h"
> +#include "tcg-op-gvec.h"
> #include "qemu/log.h"
> #include "arm_ldst.h"
> #include "translate.h"
> @@ -82,6 +83,7 @@ typedef void NeonGenTwoDoubleOPFn(TCGv_i64, TCGv_i64, TCGv_i64, TCGv_ptr);
> typedef void NeonGenOneOpFn(TCGv_i64, TCGv_i64);
> typedef void CryptoTwoOpEnvFn(TCGv_ptr, TCGv_i32, TCGv_i32);
> typedef void CryptoThreeOpEnvFn(TCGv_ptr, TCGv_i32, TCGv_i32, TCGv_i32);
> +typedef void GVecGenTwoFn(uint32_t, uint32_t, uint32_t, uint32_t, uint32_t);
>
> /* initialize TCG globals. */
> void a64_translate_init(void)
> @@ -537,6 +539,21 @@ static inline int vec_reg_offset(DisasContext *s, int regno,
> return offs;
> }
>
> +/* Return the offset info CPUARMState of the "whole" vector register Qn. */
> +static inline int vec_full_reg_offset(DisasContext *s, int regno)
> +{
> + assert_fp_access_checked(s);
> + return offsetof(CPUARMState, vfp.regs[regno * 2]);
> +}
> +
> +/* Return the byte size of the "whole" vector register, VL / 8. */
> +static inline int vec_full_reg_size(DisasContext *s)
> +{
> + /* FIXME SVE: We should put the composite ZCR_EL* value into tb->flags.
> + In the meantime this is just the AdvSIMD length of 128. */
> + return 128 / 8;
> +}
> +
> /* Return the offset into CPUARMState of a slice (from
> * the least significant end) of FP register Qn (ie
> * Dn, Sn, Hn or Bn).
> @@ -9036,85 +9053,125 @@ static void disas_simd_three_reg_diff(DisasContext *s, uint32_t insn)
> }
> }
>
> +static void gen_bsl_i64(TCGv_i64 rd, TCGv_i64 rn, TCGv_i64 rm)
> +{
> + tcg_gen_xor_i64(rn, rn, rm);
> + tcg_gen_and_i64(rn, rn, rd);
> + tcg_gen_xor_i64(rd, rm, rn);
> +}
> +
> +static void gen_bit_i64(TCGv_i64 rd, TCGv_i64 rn, TCGv_i64 rm)
> +{
> + tcg_gen_xor_i64(rn, rn, rd);
> + tcg_gen_and_i64(rn, rn, rm);
> + tcg_gen_xor_i64(rd, rd, rn);
> +}
> +
> +static void gen_bif_i64(TCGv_i64 rd, TCGv_i64 rn, TCGv_i64 rm)
> +{
> + tcg_gen_xor_i64(rn, rn, rd);
> + tcg_gen_andc_i64(rn, rn, rm);
> + tcg_gen_xor_i64(rd, rd, rn);
> +}
> +
> +static void gen_bsl_vec(TCGv_vec rd, TCGv_vec rn, TCGv_vec rm)
> +{
> + tcg_gen_xor_vec(rn, rn, rm);
> + tcg_gen_and_vec(rn, rn, rd);
> + tcg_gen_xor_vec(rd, rm, rn);
> +}
> +
> +static void gen_bit_vec(TCGv_vec rd, TCGv_vec rn, TCGv_vec rm)
> +{
> + tcg_gen_xor_vec(rn, rn, rd);
> + tcg_gen_and_vec(rn, rn, rm);
> + tcg_gen_xor_vec(rd, rd, rn);
> +}
> +
> +static void gen_bif_vec(TCGv_vec rd, TCGv_vec rn, TCGv_vec rm)
> +{
> + tcg_gen_xor_vec(rn, rn, rd);
> + tcg_gen_andc_vec(rn, rn, rm);
> + tcg_gen_xor_vec(rd, rd, rn);
> +}
> +
> /* Logic op (opcode == 3) subgroup of C3.6.16. */
> static void disas_simd_3same_logic(DisasContext *s, uint32_t insn)
> {
> + static const GVecGen3 bsl_op = {
> + .fni8 = gen_bsl_i64,
> + .fniv = gen_bsl_vec,
> + .prefer_i64 = TCG_TARGET_REG_BITS == 64,
> + .load_dest = true
> + };
> + static const GVecGen3 bit_op = {
> + .fni8 = gen_bit_i64,
> + .fniv = gen_bit_vec,
> + .prefer_i64 = TCG_TARGET_REG_BITS == 64,
> + .load_dest = true
> + };
> + static const GVecGen3 bif_op = {
> + .fni8 = gen_bif_i64,
> + .fniv = gen_bif_vec,
> + .prefer_i64 = TCG_TARGET_REG_BITS == 64,
> + .load_dest = true
> + };
> +
> int rd = extract32(insn, 0, 5);
> int rn = extract32(insn, 5, 5);
> int rm = extract32(insn, 16, 5);
> int size = extract32(insn, 22, 2);
> bool is_u = extract32(insn, 29, 1);
> bool is_q = extract32(insn, 30, 1);
> - TCGv_i64 tcg_op1, tcg_op2, tcg_res[2];
> - int pass;
> + GVecGenTwoFn *gvec_fn;
> + const GVecGen3 *gvec_op;
>
> if (!fp_access_check(s)) {
> return;
> }
>
> - tcg_op1 = tcg_temp_new_i64();
> - tcg_op2 = tcg_temp_new_i64();
> - tcg_res[0] = tcg_temp_new_i64();
> - tcg_res[1] = tcg_temp_new_i64();
> -
> - for (pass = 0; pass < (is_q ? 2 : 1); pass++) {
> - read_vec_element(s, tcg_op1, rn, pass, MO_64);
> - read_vec_element(s, tcg_op2, rm, pass, MO_64);
> -
> - if (!is_u) {
> - switch (size) {
> - case 0: /* AND */
> - tcg_gen_and_i64(tcg_res[pass], tcg_op1, tcg_op2);
> - break;
> - case 1: /* BIC */
> - tcg_gen_andc_i64(tcg_res[pass], tcg_op1, tcg_op2);
> - break;
> - case 2: /* ORR */
> - tcg_gen_or_i64(tcg_res[pass], tcg_op1, tcg_op2);
> - break;
> - case 3: /* ORN */
> - tcg_gen_orc_i64(tcg_res[pass], tcg_op1, tcg_op2);
> - break;
> - }
> - } else {
> - if (size != 0) {
> - /* B* ops need res loaded to operate on */
> - read_vec_element(s, tcg_res[pass], rd, pass, MO_64);
> - }
> -
> - switch (size) {
> - case 0: /* EOR */
> - tcg_gen_xor_i64(tcg_res[pass], tcg_op1, tcg_op2);
> - break;
> - case 1: /* BSL bitwise select */
> - tcg_gen_xor_i64(tcg_op1, tcg_op1, tcg_op2);
> - tcg_gen_and_i64(tcg_op1, tcg_op1, tcg_res[pass]);
> - tcg_gen_xor_i64(tcg_res[pass], tcg_op2, tcg_op1);
> - break;
> - case 2: /* BIT, bitwise insert if true */
> - tcg_gen_xor_i64(tcg_op1, tcg_op1, tcg_res[pass]);
> - tcg_gen_and_i64(tcg_op1, tcg_op1, tcg_op2);
> - tcg_gen_xor_i64(tcg_res[pass], tcg_res[pass], tcg_op1);
> - break;
> - case 3: /* BIF, bitwise insert if false */
> - tcg_gen_xor_i64(tcg_op1, tcg_op1, tcg_res[pass]);
> - tcg_gen_andc_i64(tcg_op1, tcg_op1, tcg_op2);
> - tcg_gen_xor_i64(tcg_res[pass], tcg_res[pass], tcg_op1);
> - break;
> - }
> - }
> - }
> + switch (size + 4 * is_u) {
> + case 0: /* AND */
> + gvec_fn = tcg_gen_gvec_and;
> + goto do_fn;
> + case 1: /* BIC */
> + gvec_fn = tcg_gen_gvec_andc;
> + goto do_fn;
> + case 2: /* ORR */
> + gvec_fn = tcg_gen_gvec_or;
> + goto do_fn;
> + case 3: /* ORN */
> + gvec_fn = tcg_gen_gvec_orc;
> + goto do_fn;
> + case 4: /* EOR */
> + gvec_fn = tcg_gen_gvec_xor;
> + goto do_fn;
> + do_fn:
> + gvec_fn(vec_full_reg_offset(s, rd),
> + vec_full_reg_offset(s, rn),
> + vec_full_reg_offset(s, rm),
> + is_q ? 16 : 8, vec_full_reg_size(s));
> + return;
> +
> + case 5: /* BSL bitwise select */
> + gvec_op = &bsl_op;
> + goto do_op;
> + case 6: /* BIT, bitwise insert if true */
> + gvec_op = &bit_op;
> + goto do_op;
> + case 7: /* BIF, bitwise insert if false */
> + gvec_op = &bif_op;
> + goto do_op;
> + do_op:
> + tcg_gen_gvec_3(vec_full_reg_offset(s, rd),
> + vec_full_reg_offset(s, rn),
> + vec_full_reg_offset(s, rm),
> + is_q ? 16 : 8, vec_full_reg_size(s), gvec_op);
> + return;
>
> - write_vec_element(s, tcg_res[0], rd, 0, MO_64);
> - if (!is_q) {
> - tcg_gen_movi_i64(tcg_res[1], 0);
> + default:
> + g_assert_not_reached();
> }
> - write_vec_element(s, tcg_res[1], rd, 1, MO_64);
> -
> - tcg_temp_free_i64(tcg_op1);
> - tcg_temp_free_i64(tcg_op2);
> - tcg_temp_free_i64(tcg_res[0]);
> - tcg_temp_free_i64(tcg_res[1]);
> }
>
> /* Helper functions for 32 bit comparisons */
> @@ -9375,6 +9432,7 @@ static void disas_simd_3same_int(DisasContext *s, uint32_t insn)
> int rn = extract32(insn, 5, 5);
> int rd = extract32(insn, 0, 5);
> int pass;
> + GVecGenTwoFn *gvec_op;
>
> switch (opcode) {
> case 0x13: /* MUL, PMUL */
> @@ -9414,6 +9472,28 @@ static void disas_simd_3same_int(DisasContext *s, uint32_t insn)
> return;
> }
>
> + switch (opcode) {
> + case 0x10: /* ADD, SUB */
> + {
> + static GVecGenTwoFn * const fns[4][2] = {
> + { tcg_gen_gvec_add8, tcg_gen_gvec_sub8 },
> + { tcg_gen_gvec_add16, tcg_gen_gvec_sub16 },
> + { tcg_gen_gvec_add32, tcg_gen_gvec_sub32 },
> + { tcg_gen_gvec_add64, tcg_gen_gvec_sub64 },
> + };
> + gvec_op = fns[size][u];
> + goto do_gvec;
> + }
> + break;
> +
> + do_gvec:
> + gvec_op(vec_full_reg_offset(s, rd),
> + vec_full_reg_offset(s, rn),
> + vec_full_reg_offset(s, rm),
> + is_q ? 16 : 8, vec_full_reg_size(s));
> + return;
> + }
> +
> if (size == 3) {
> assert(is_q);
> for (pass = 0; pass < 2; pass++) {
> @@ -9586,16 +9666,6 @@ static void disas_simd_3same_int(DisasContext *s, uint32_t insn)
> genfn = fns[size][u];
> break;
> }
> - case 0x10: /* ADD, SUB */
> - {
> - static NeonGenTwoOpFn * const fns[3][2] = {
> - { gen_helper_neon_add_u8, gen_helper_neon_sub_u8 },
> - { gen_helper_neon_add_u16, gen_helper_neon_sub_u16 },
> - { tcg_gen_add_i32, tcg_gen_sub_i32 },
> - };
> - genfn = fns[size][u];
> - break;
> - }
> case 0x11: /* CMTST, CMEQ */
> {
> static NeonGenTwoOpFn * const fns[3][2] = {
--
Alex Bennée
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [Qemu-devel] [PATCH v3 1/6] tcg: Add types and operations for host vectors
2017-09-26 19:28 ` Alex Bennée
@ 2017-09-27 16:18 ` Richard Henderson
0 siblings, 0 replies; 14+ messages in thread
From: Richard Henderson @ 2017-09-27 16:18 UTC (permalink / raw)
To: Alex Bennée; +Cc: qemu-devel, f4bug
On 09/26/2017 12:28 PM, Alex Bennée wrote:
>> * TCGv_ptr : a host pointer type
>> + * TCGv_vec : a host vector type; the exact size is not exposed
>> + to the CPU front-end code.
>
> Isn't this a guest vector type (which is pointed to by a host pointer)?
No, it's a host vector, which we have created in response to expanding a guest
vector operation.
> A one line comment wouldn't go amiss here. This looks like we are
> allocating a new temp of the same type as an existing temp?
>
>> +TCGv_vec tcg_temp_new_vec_matching(TCGv_vec match)
Yes.
>> +All of the vector ops have a final constant argument that specifies the
>> +length of the vector operation LEN as 64 << LEN bits.
>
> That doesn't scan well. So would a 4 lane operation be encoded as 64 <<
> 4? Is this because we are using the bottom bits for something?
64 << 0 = 64
64 << 1 = 128
64 << 2 = 256.
I've fixed up the wording a bit.
>> + Copy C across the entire vector.
>> + At present the only supported values for C are 0 and -1.
>
> I guess this is why the size in unimportant? This is for clearing or
> setting the whole of the vector? What does len mean in this case?
Yes. Len still means the length of the whole vector.
Elsewhere there's a comment about maybe using dupi{8,16,32,64}_vec instead.
However I wanted to put that off until we do some more conversions and see
exactly what's going to be needed.
>> +* and_vec v0, v1, v2, len
>> +* or_vec v0, v1, v2, len
>> +* xor_vec v0, v1, v2, len
>> +* andc_vec v0, v1, v2, len
>> +* orc_vec v0, v1, v2, len
>> +* not_vec v0, v1, len
>> +
>> + Similarly, logical operations.
>
> Similarly, logical operations with and without complement?
Sure.
r~
^ permalink raw reply [flat|nested] 14+ messages in thread
end of thread, other threads:[~2017-09-27 16:19 UTC | newest]
Thread overview: 14+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2017-09-16 2:34 [Qemu-devel] [PATCH v3 0/6] TCG vectorization and example conversion Richard Henderson
2017-09-16 2:34 ` [Qemu-devel] [PATCH v3 1/6] tcg: Add types and operations for host vectors Richard Henderson
2017-09-26 19:28 ` Alex Bennée
2017-09-27 16:18 ` Richard Henderson
2017-09-16 2:34 ` [Qemu-devel] [PATCH v3 2/6] tcg: Add vector expanders Richard Henderson
2017-09-26 22:31 ` Alex Bennée
2017-09-16 2:34 ` [Qemu-devel] [PATCH v3 3/6] target/arm: Align vector registers Richard Henderson
2017-09-26 22:33 ` Alex Bennée
2017-09-16 2:34 ` [Qemu-devel] [PATCH v3 4/6] target/arm: Use vector infrastructure for aa64 add/sub/logic Richard Henderson
2017-09-26 23:12 ` Alex Bennée
2017-09-16 2:34 ` [Qemu-devel] [PATCH v3 5/6] tcg/i386: Add vector operations Richard Henderson
2017-09-16 2:34 ` [Qemu-devel] [PATCH v3 6/6] tcg/aarch64: " Richard Henderson
2017-09-16 2:35 ` [Qemu-devel] [PATCH v3 0/6] TCG vectorization and example conversion Richard Henderson
2017-09-26 22:58 ` no-reply
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.