* [Qemu-devel] [PATCH v3 0/6] TCG vectorization and example conversion
@ 2017-09-16  2:34 Richard Henderson
  2017-09-16  2:34 ` [Qemu-devel] [PATCH v3 1/6] tcg: Add types and operations for host vectors Richard Henderson
                   ` (7 more replies)
  0 siblings, 8 replies; 14+ messages in thread
From: Richard Henderson @ 2017-09-16  2:34 UTC (permalink / raw)
  To: qemu-devel; +Cc: alex.bennee, f4bug

Now addressing the complex vector op issue: I now expose TCGv_vec to
target front-ends, but keep it opaque with respect to the vector size.
One can thus compose vector operations, as demonstrated in target/arm/.

The actual host vector length now becomes an argument to the *_vec
opcodes.  It's a little awkward, but does prevent an explosion of
opcode values.
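
As a rough illustration (not part of the series), here is how a front-end
might compose the ops from patch 1 on a hypothetical 16-byte register file
kept in env; gen_vadd32_sketch, the offsets and the cpu_env pointer are
assumptions for the example, not code from the patches:

static void gen_vadd32_sketch(TCGv_ptr cpu_env, uint32_t d_ofs,
                              uint32_t a_ofs, uint32_t b_ofs)
{
    TCGv_vec t0 = tcg_temp_new_vec(TCG_TYPE_V128);
    TCGv_vec t1 = tcg_temp_new_vec(TCG_TYPE_V128);

    tcg_gen_ld_vec(t0, cpu_env, a_ofs);    /* load both 128-bit inputs */
    tcg_gen_ld_vec(t1, cpu_env, b_ofs);
    tcg_gen_add32_vec(t0, t0, t1);         /* per-element 32-bit add */
    tcg_gen_st_vec(t0, cpu_env, d_ofs);    /* store the 128-bit result */

    tcg_temp_free_vec(t0);
    tcg_temp_free_vec(t1);
}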

All R-b tags dropped because all patches have been rewritten or heavily modified.

Whacha think?


r~


Richard Henderson (6):
  tcg: Add types and operations for host vectors
  tcg: Add vector expanders
  target/arm: Align vector registers
  target/arm: Use vector infrastructure for aa64 add/sub/logic
  tcg/i386: Add vector operations
  tcg/aarch64: Add vector operations

 Makefile.target              |   2 +-
 accel/tcg/tcg-runtime.h      |  24 ++
 target/arm/cpu.h             |   2 +-
 tcg/aarch64/tcg-target.h     |  20 +-
 tcg/i386/tcg-target.h        |  36 +-
 tcg/tcg-gvec-desc.h          |  49 +++
 tcg/tcg-op-gvec.h            | 143 ++++++++
 tcg/tcg-op.h                 |  26 ++
 tcg/tcg-opc.h                |  37 ++
 tcg/tcg.h                    |  34 ++
 accel/tcg/tcg-runtime-gvec.c | 255 +++++++++++++
 target/arm/translate-a64.c   | 216 +++++++----
 tcg/aarch64/tcg-target.inc.c | 340 ++++++++++++++---
 tcg/i386/tcg-target.inc.c    | 423 ++++++++++++++++++---
 tcg/tcg-op-gvec.c            | 853 +++++++++++++++++++++++++++++++++++++++++++
 tcg/tcg-op.c                 | 234 ++++++++++++
 tcg/tcg.c                    |  77 +++-
 accel/tcg/Makefile.objs      |   2 +-
 tcg/README                   |  46 +++
 19 files changed, 2651 insertions(+), 168 deletions(-)
 create mode 100644 tcg/tcg-gvec-desc.h
 create mode 100644 tcg/tcg-op-gvec.h
 create mode 100644 accel/tcg/tcg-runtime-gvec.c
 create mode 100644 tcg/tcg-op-gvec.c

-- 
2.13.5


* [Qemu-devel] [PATCH v3 1/6] tcg: Add types and operations for host vectors
  2017-09-16  2:34 [Qemu-devel] [PATCH v3 0/6] TCG vectorization and example conversion Richard Henderson
@ 2017-09-16  2:34 ` Richard Henderson
  2017-09-26 19:28   ` Alex Bennée
  2017-09-16  2:34 ` [Qemu-devel] [PATCH v3 2/6] tcg: Add vector expanders Richard Henderson
                   ` (6 subsequent siblings)
  7 siblings, 1 reply; 14+ messages in thread
From: Richard Henderson @ 2017-09-16  2:34 UTC (permalink / raw)
  To: qemu-devel; +Cc: alex.bennee, f4bug

Nothing uses or enables them yet.

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/tcg-op.h  |  26 +++++++
 tcg/tcg-opc.h |  37 ++++++++++
 tcg/tcg.h     |  34 +++++++++
 tcg/tcg-op.c  | 234 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 tcg/tcg.c     |  77 ++++++++++++++++++-
 tcg/README    |  46 ++++++++++++
 6 files changed, 453 insertions(+), 1 deletion(-)

diff --git a/tcg/tcg-op.h b/tcg/tcg-op.h
index 5d3278f243..b9b0b9f46f 100644
--- a/tcg/tcg-op.h
+++ b/tcg/tcg-op.h
@@ -915,6 +915,32 @@ void tcg_gen_atomic_or_fetch_i64(TCGv_i64, TCGv, TCGv_i64, TCGArg, TCGMemOp);
 void tcg_gen_atomic_xor_fetch_i32(TCGv_i32, TCGv, TCGv_i32, TCGArg, TCGMemOp);
 void tcg_gen_atomic_xor_fetch_i64(TCGv_i64, TCGv, TCGv_i64, TCGArg, TCGMemOp);
 
+void tcg_gen_mov_vec(TCGv_vec, TCGv_vec);
+void tcg_gen_movi_vec(TCGv_vec, tcg_target_long);
+void tcg_gen_add8_vec(TCGv_vec r, TCGv_vec a, TCGv_vec b);
+void tcg_gen_add16_vec(TCGv_vec r, TCGv_vec a, TCGv_vec b);
+void tcg_gen_add32_vec(TCGv_vec r, TCGv_vec a, TCGv_vec b);
+void tcg_gen_add64_vec(TCGv_vec r, TCGv_vec a, TCGv_vec b);
+void tcg_gen_sub8_vec(TCGv_vec r, TCGv_vec a, TCGv_vec b);
+void tcg_gen_sub16_vec(TCGv_vec r, TCGv_vec a, TCGv_vec b);
+void tcg_gen_sub32_vec(TCGv_vec r, TCGv_vec a, TCGv_vec b);
+void tcg_gen_sub64_vec(TCGv_vec r, TCGv_vec a, TCGv_vec b);
+void tcg_gen_and_vec(TCGv_vec r, TCGv_vec a, TCGv_vec b);
+void tcg_gen_or_vec(TCGv_vec r, TCGv_vec a, TCGv_vec b);
+void tcg_gen_xor_vec(TCGv_vec r, TCGv_vec a, TCGv_vec b);
+void tcg_gen_andc_vec(TCGv_vec r, TCGv_vec a, TCGv_vec b);
+void tcg_gen_orc_vec(TCGv_vec r, TCGv_vec a, TCGv_vec b);
+void tcg_gen_not_vec(TCGv_vec r, TCGv_vec a);
+void tcg_gen_neg8_vec(TCGv_vec r, TCGv_vec a);
+void tcg_gen_neg16_vec(TCGv_vec r, TCGv_vec a);
+void tcg_gen_neg32_vec(TCGv_vec r, TCGv_vec a);
+void tcg_gen_neg64_vec(TCGv_vec r, TCGv_vec a);
+
+void tcg_gen_ld_vec(TCGv_vec r, TCGv_ptr base, TCGArg offset);
+void tcg_gen_st_vec(TCGv_vec r, TCGv_ptr base, TCGArg offset);
+void tcg_gen_ldz_vec(TCGv_vec r, TCGv_ptr base, TCGArg offset, TCGType sz);
+void tcg_gen_stl_vec(TCGv_vec r, TCGv_ptr base, TCGArg offset, TCGType sz);
+
 #if TARGET_LONG_BITS == 64
 #define tcg_gen_movi_tl tcg_gen_movi_i64
 #define tcg_gen_mov_tl tcg_gen_mov_i64
diff --git a/tcg/tcg-opc.h b/tcg/tcg-opc.h
index 956fb1e9f3..8200184fa9 100644
--- a/tcg/tcg-opc.h
+++ b/tcg/tcg-opc.h
@@ -204,8 +204,45 @@ DEF(qemu_ld_i64, DATA64_ARGS, TLADDR_ARGS, 1,
 DEF(qemu_st_i64, 0, TLADDR_ARGS + DATA64_ARGS, 1,
     TCG_OPF_CALL_CLOBBER | TCG_OPF_SIDE_EFFECTS | TCG_OPF_64BIT)
 
+/* Host vector support.  */
+
+#define IMPLVEC  \
+    IMPL(TCG_TARGET_HAS_v64 | TCG_TARGET_HAS_v128 | TCG_TARGET_HAS_v256)
+
+DEF(mov_vec, 1, 1, 1, TCG_OPF_NOT_PRESENT)
+
+/* ??? Simple, but perhaps dupiN would be more descriptive.  */
+DEF(movi_vec, 1, 0, 2, TCG_OPF_NOT_PRESENT)
+
+DEF(ld_vec, 1, 1, 2, IMPLVEC)
+DEF(ldz_vec, 1, 1, 3, IMPLVEC)
+DEF(st_vec, 0, 2, 2, IMPLVEC)
+
+DEF(add8_vec, 1, 2, 1, IMPLVEC)
+DEF(add16_vec, 1, 2, 1, IMPLVEC)
+DEF(add32_vec, 1, 2, 1, IMPLVEC)
+DEF(add64_vec, 1, 2, 1, IMPLVEC)
+
+DEF(sub8_vec, 1, 2, 1, IMPLVEC)
+DEF(sub16_vec, 1, 2, 1, IMPLVEC)
+DEF(sub32_vec, 1, 2, 1, IMPLVEC)
+DEF(sub64_vec, 1, 2, 1, IMPLVEC)
+
+DEF(neg8_vec, 1, 1, 1, IMPLVEC | IMPL(TCG_TARGET_HAS_neg_vec))
+DEF(neg16_vec, 1, 1, 1, IMPLVEC | IMPL(TCG_TARGET_HAS_neg_vec))
+DEF(neg32_vec, 1, 1, 1, IMPLVEC | IMPL(TCG_TARGET_HAS_neg_vec))
+DEF(neg64_vec, 1, 1, 1, IMPLVEC | IMPL(TCG_TARGET_HAS_neg_vec))
+
+DEF(and_vec, 1, 2, 1, IMPLVEC)
+DEF(or_vec, 1, 2, 1, IMPLVEC)
+DEF(xor_vec, 1, 2, 1, IMPLVEC)
+DEF(andc_vec, 1, 2, 1, IMPLVEC | IMPL(TCG_TARGET_HAS_andc_vec))
+DEF(orc_vec, 1, 2, 1, IMPLVEC | IMPL(TCG_TARGET_HAS_orc_vec))
+DEF(not_vec, 1, 1, 1, IMPLVEC | IMPL(TCG_TARGET_HAS_not_vec))
+
 #undef TLADDR_ARGS
 #undef DATA64_ARGS
 #undef IMPL
 #undef IMPL64
+#undef IMPLVEC
 #undef DEF
diff --git a/tcg/tcg.h b/tcg/tcg.h
index 25662c36d4..7cd356e87f 100644
--- a/tcg/tcg.h
+++ b/tcg/tcg.h
@@ -173,6 +173,16 @@ typedef uint64_t TCGRegSet;
 # error "Missing unsigned widening multiply"
 #endif
 
+#ifndef TCG_TARGET_HAS_v64
+#define TCG_TARGET_HAS_v64              0
+#define TCG_TARGET_HAS_v128             0
+#define TCG_TARGET_HAS_v256             0
+#define TCG_TARGET_HAS_neg_vec          0
+#define TCG_TARGET_HAS_not_vec          0
+#define TCG_TARGET_HAS_andc_vec         0
+#define TCG_TARGET_HAS_orc_vec          0
+#endif
+
 #ifndef TARGET_INSN_START_EXTRA_WORDS
 # define TARGET_INSN_START_WORDS 1
 #else
@@ -249,6 +259,11 @@ typedef struct TCGPool {
 typedef enum TCGType {
     TCG_TYPE_I32,
     TCG_TYPE_I64,
+
+    TCG_TYPE_V64,
+    TCG_TYPE_V128,
+    TCG_TYPE_V256,
+
     TCG_TYPE_COUNT, /* number of different types */
 
     /* An alias for the size of the host register.  */
@@ -399,6 +414,8 @@ typedef tcg_target_ulong TCGArg;
     * TCGv_i32 : 32 bit integer type
     * TCGv_i64 : 64 bit integer type
     * TCGv_ptr : a host pointer type
+    * TCGv_vec : a host vector type; the exact size is not exposed
+                 to the CPU front-end code.
     * TCGv : an integer type the same size as target_ulong
              (an alias for either TCGv_i32 or TCGv_i64)
    The compiler's type checking will complain if you mix them
@@ -424,6 +441,7 @@ typedef tcg_target_ulong TCGArg;
 typedef struct TCGv_i32_d *TCGv_i32;
 typedef struct TCGv_i64_d *TCGv_i64;
 typedef struct TCGv_ptr_d *TCGv_ptr;
+typedef struct TCGv_vec_d *TCGv_vec;
 typedef TCGv_ptr TCGv_env;
 #if TARGET_LONG_BITS == 32
 #define TCGv TCGv_i32
@@ -448,6 +466,11 @@ static inline TCGv_ptr QEMU_ARTIFICIAL MAKE_TCGV_PTR(intptr_t i)
     return (TCGv_ptr)i;
 }
 
+static inline TCGv_vec QEMU_ARTIFICIAL MAKE_TCGV_VEC(intptr_t i)
+{
+    return (TCGv_vec)i;
+}
+
 static inline intptr_t QEMU_ARTIFICIAL GET_TCGV_I32(TCGv_i32 t)
 {
     return (intptr_t)t;
@@ -463,6 +486,11 @@ static inline intptr_t QEMU_ARTIFICIAL GET_TCGV_PTR(TCGv_ptr t)
     return (intptr_t)t;
 }
 
+static inline intptr_t QEMU_ARTIFICIAL GET_TCGV_VEC(TCGv_vec t)
+{
+    return (intptr_t)t;
+}
+
 #if TCG_TARGET_REG_BITS == 32
 #define TCGV_LOW(t) MAKE_TCGV_I32(GET_TCGV_I64(t))
 #define TCGV_HIGH(t) MAKE_TCGV_I32(GET_TCGV_I64(t) + 1)
@@ -471,15 +499,18 @@ static inline intptr_t QEMU_ARTIFICIAL GET_TCGV_PTR(TCGv_ptr t)
 #define TCGV_EQUAL_I32(a, b) (GET_TCGV_I32(a) == GET_TCGV_I32(b))
 #define TCGV_EQUAL_I64(a, b) (GET_TCGV_I64(a) == GET_TCGV_I64(b))
 #define TCGV_EQUAL_PTR(a, b) (GET_TCGV_PTR(a) == GET_TCGV_PTR(b))
+#define TCGV_EQUAL_VEC(a, b) (GET_TCGV_VEC(a) == GET_TCGV_VEC(b))
 
 /* Dummy definition to avoid compiler warnings.  */
 #define TCGV_UNUSED_I32(x) x = MAKE_TCGV_I32(-1)
 #define TCGV_UNUSED_I64(x) x = MAKE_TCGV_I64(-1)
 #define TCGV_UNUSED_PTR(x) x = MAKE_TCGV_PTR(-1)
+#define TCGV_UNUSED_VEC(x) x = MAKE_TCGV_VEC(-1)
 
 #define TCGV_IS_UNUSED_I32(x) (GET_TCGV_I32(x) == -1)
 #define TCGV_IS_UNUSED_I64(x) (GET_TCGV_I64(x) == -1)
 #define TCGV_IS_UNUSED_PTR(x) (GET_TCGV_PTR(x) == -1)
+#define TCGV_IS_UNUSED_VEC(x) (GET_TCGV_VEC(x) == -1)
 
 /* call flags */
 /* Helper does not read globals (either directly or through an exception). It
@@ -790,9 +821,12 @@ TCGv_i64 tcg_global_reg_new_i64(TCGReg reg, const char *name);
 
 TCGv_i32 tcg_temp_new_internal_i32(int temp_local);
 TCGv_i64 tcg_temp_new_internal_i64(int temp_local);
+TCGv_vec tcg_temp_new_vec(TCGType type);
+TCGv_vec tcg_temp_new_vec_matching(TCGv_vec match);
 
 void tcg_temp_free_i32(TCGv_i32 arg);
 void tcg_temp_free_i64(TCGv_i64 arg);
+void tcg_temp_free_vec(TCGv_vec arg);
 
 static inline TCGv_i32 tcg_global_mem_new_i32(TCGv_ptr reg, intptr_t offset,
                                               const char *name)
diff --git a/tcg/tcg-op.c b/tcg/tcg-op.c
index 688d91755b..50b3177e5f 100644
--- a/tcg/tcg-op.c
+++ b/tcg/tcg-op.c
@@ -3072,3 +3072,237 @@ static void tcg_gen_mov2_i64(TCGv_i64 r, TCGv_i64 a, TCGv_i64 b)
 GEN_ATOMIC_HELPER(xchg, mov2, 0)
 
 #undef GEN_ATOMIC_HELPER
+
+static void tcg_gen_op2_vec(TCGOpcode opc, TCGv_vec r, TCGv_vec a)
+{
+    TCGArg ri = GET_TCGV_VEC(r);
+    TCGArg ai = GET_TCGV_VEC(a);
+    TCGTemp *rt = &tcg_ctx.temps[ri];
+    TCGTemp *at = &tcg_ctx.temps[ai];
+    TCGType type = rt->base_type;
+
+    tcg_debug_assert(at->base_type == type);
+    tcg_gen_op3(&tcg_ctx, opc, ri, ai, type - TCG_TYPE_V64);
+}
+
+static void tcg_gen_op3_vec(TCGOpcode opc, TCGv_vec r, TCGv_vec a, TCGv_vec b)
+{
+    TCGArg ri = GET_TCGV_VEC(r);
+    TCGArg ai = GET_TCGV_VEC(a);
+    TCGArg bi = GET_TCGV_VEC(b);
+    TCGTemp *rt = &tcg_ctx.temps[ri];
+    TCGTemp *at = &tcg_ctx.temps[ai];
+    TCGTemp *bt = &tcg_ctx.temps[bi];
+    TCGType type = rt->base_type;
+
+    tcg_debug_assert(at->base_type == type);
+    tcg_debug_assert(bt->base_type == type);
+    tcg_gen_op4(&tcg_ctx, opc, ri, ai, bi, type - TCG_TYPE_V64);
+}
+
+void tcg_gen_mov_vec(TCGv_vec r, TCGv_vec a)
+{
+    if (!TCGV_EQUAL_VEC(r, a)) {
+        tcg_gen_op2_vec(INDEX_op_mov_vec, r, a);
+    }
+}
+
+void tcg_gen_movi_vec(TCGv_vec r, tcg_target_long a)
+{
+    TCGArg ri = GET_TCGV_VEC(r);
+    TCGTemp *rt = &tcg_ctx.temps[ri];
+    TCGType type = rt->base_type;
+
+    tcg_debug_assert(a == 0 || a == -1);
+    tcg_gen_op3(&tcg_ctx, INDEX_op_movi_vec, ri, a, type - TCG_TYPE_V64);
+}
+
+void tcg_gen_ld_vec(TCGv_vec r, TCGv_ptr b, TCGArg o)
+{
+    TCGArg ri = GET_TCGV_VEC(r);
+    TCGArg bi = GET_TCGV_PTR(b);
+    TCGTemp *rt = &tcg_ctx.temps[ri];
+    TCGType type = rt->base_type;
+
+    tcg_gen_op4(&tcg_ctx, INDEX_op_ld_vec, ri, bi, o, type - TCG_TYPE_V64);
+}
+
+void tcg_gen_st_vec(TCGv_vec r, TCGv_ptr b, TCGArg o)
+{
+    TCGArg ri = GET_TCGV_VEC(r);
+    TCGArg bi = GET_TCGV_PTR(b);
+    TCGTemp *rt = &tcg_ctx.temps[ri];
+    TCGType type = rt->base_type;
+
+    tcg_gen_op4(&tcg_ctx, INDEX_op_st_vec, ri, bi, o, type - TCG_TYPE_V64);
+}
+
+/* Load data into a vector R from B+O using TYPE.  If R is wider than TYPE,
+   fill the high bits with zeros.  */
+void tcg_gen_ldz_vec(TCGv_vec r, TCGv_ptr b, TCGArg o, TCGType type)
+{
+    TCGArg ri = GET_TCGV_VEC(r);
+    TCGArg bi = GET_TCGV_PTR(b);
+    TCGTemp *rt = &tcg_ctx.temps[ri];
+    TCGType btype = rt->base_type;
+
+    if (type < btype) {
+        tcg_gen_op5(&tcg_ctx, INDEX_op_ldz_vec, ri, bi, o,
+                    type - TCG_TYPE_V64, btype - TCG_TYPE_V64);
+    } else {
+        tcg_debug_assert(type == btype);
+        tcg_gen_op4(&tcg_ctx, INDEX_op_ld_vec, ri, bi, o, type - TCG_TYPE_V64);
+    }
+}
+
+/* Store data from vector R into B+O using TYPE.  If R is wider than TYPE,
+   store only the low bits.  */
+void tcg_gen_stl_vec(TCGv_vec r, TCGv_ptr b, TCGArg o, TCGType type)
+{
+    TCGArg ri = GET_TCGV_VEC(r);
+    TCGArg bi = GET_TCGV_PTR(b);
+    TCGTemp *rt = &tcg_ctx.temps[ri];
+    TCGType btype = rt->base_type;
+
+    tcg_debug_assert(type <= btype);
+    tcg_gen_op4(&tcg_ctx, INDEX_op_st_vec, ri, bi, o, type - TCG_TYPE_V64);
+}
+
+void tcg_gen_add8_vec(TCGv_vec r, TCGv_vec a, TCGv_vec b)
+{
+    tcg_gen_op3_vec(INDEX_op_add8_vec, r, a, b);
+}
+
+void tcg_gen_add16_vec(TCGv_vec r, TCGv_vec a, TCGv_vec b)
+{
+    tcg_gen_op3_vec(INDEX_op_add16_vec, r, a, b);
+}
+
+void tcg_gen_add32_vec(TCGv_vec r, TCGv_vec a, TCGv_vec b)
+{
+    tcg_gen_op3_vec(INDEX_op_add32_vec, r, a, b);
+}
+
+void tcg_gen_add64_vec(TCGv_vec r, TCGv_vec a, TCGv_vec b)
+{
+    tcg_gen_op3_vec(INDEX_op_add64_vec, r, a, b);
+}
+
+void tcg_gen_sub8_vec(TCGv_vec r, TCGv_vec a, TCGv_vec b)
+{
+    tcg_gen_op3_vec(INDEX_op_sub8_vec, r, a, b);
+}
+
+void tcg_gen_sub16_vec(TCGv_vec r, TCGv_vec a, TCGv_vec b)
+{
+    tcg_gen_op3_vec(INDEX_op_sub16_vec, r, a, b);
+}
+
+void tcg_gen_sub32_vec(TCGv_vec r, TCGv_vec a, TCGv_vec b)
+{
+    tcg_gen_op3_vec(INDEX_op_sub32_vec, r, a, b);
+}
+
+void tcg_gen_sub64_vec(TCGv_vec r, TCGv_vec a, TCGv_vec b)
+{
+    tcg_gen_op3_vec(INDEX_op_sub64_vec, r, a, b);
+}
+
+void tcg_gen_and_vec(TCGv_vec r, TCGv_vec a, TCGv_vec b)
+{
+    tcg_gen_op3_vec(INDEX_op_and_vec, r, a, b);
+}
+
+void tcg_gen_or_vec(TCGv_vec r, TCGv_vec a, TCGv_vec b)
+{
+    tcg_gen_op3_vec(INDEX_op_or_vec, r, a, b);
+}
+
+void tcg_gen_xor_vec(TCGv_vec r, TCGv_vec a, TCGv_vec b)
+{
+    tcg_gen_op3_vec(INDEX_op_xor_vec, r, a, b);
+}
+
+void tcg_gen_andc_vec(TCGv_vec r, TCGv_vec a, TCGv_vec b)
+{
+    if (TCG_TARGET_HAS_andc_vec) {
+        tcg_gen_op3_vec(INDEX_op_andc_vec, r, a, b);
+    } else {
+        TCGv_vec t = tcg_temp_new_vec_matching(r);
+        tcg_gen_not_vec(t, b);
+        tcg_gen_and_vec(r, a, t);
+        tcg_temp_free_vec(t);
+    }
+}
+
+void tcg_gen_orc_vec(TCGv_vec r, TCGv_vec a, TCGv_vec b)
+{
+    if (TCG_TARGET_HAS_orc_vec) {
+        tcg_gen_op3_vec(INDEX_op_orc_vec, r, a, b);
+    } else {
+        TCGv_vec t = tcg_temp_new_vec_matching(r);
+        tcg_gen_not_vec(t, b);
+        tcg_gen_or_vec(r, a, t);
+        tcg_temp_free_vec(t);
+    }
+}
+
+void tcg_gen_not_vec(TCGv_vec r, TCGv_vec a)
+{
+    if (TCG_TARGET_HAS_not_vec) {
+        tcg_gen_op2_vec(INDEX_op_not_vec, r, a);
+    } else {
+        TCGv_vec t = tcg_temp_new_vec_matching(r);
+        tcg_gen_movi_vec(t, -1);
+        tcg_gen_xor_vec(r, a, t);
+        tcg_temp_free_vec(t);
+    }
+}
+
+void tcg_gen_neg8_vec(TCGv_vec r, TCGv_vec a)
+{
+    if (TCG_TARGET_HAS_neg_vec) {
+        tcg_gen_op2_vec(INDEX_op_neg8_vec, r, a);
+    } else {
+        TCGv_vec t = tcg_temp_new_vec_matching(r);
+        tcg_gen_movi_vec(t, 0);
+        tcg_gen_sub8_vec(r, t, a);
+        tcg_temp_free_vec(t);
+    }
+}
+
+void tcg_gen_neg16_vec(TCGv_vec r, TCGv_vec a)
+{
+    if (TCG_TARGET_HAS_neg_vec) {
+        tcg_gen_op2_vec(INDEX_op_neg16_vec, r, a);
+    } else {
+        TCGv_vec t = tcg_temp_new_vec_matching(r);
+        tcg_gen_movi_vec(t, 0);
+        tcg_gen_sub16_vec(r, t, a);
+        tcg_temp_free_vec(t);
+    }
+}
+
+void tcg_gen_neg32_vec(TCGv_vec r, TCGv_vec a)
+{
+    if (TCG_TARGET_HAS_neg_vec) {
+        tcg_gen_op2_vec(INDEX_op_neg32_vec, r, a);
+    } else {
+        TCGv_vec t = tcg_temp_new_vec_matching(r);
+        tcg_gen_movi_vec(t, 0);
+        tcg_gen_sub32_vec(r, t, a);
+        tcg_temp_free_vec(t);
+    }
+}
+
+void tcg_gen_neg64_vec(TCGv_vec r, TCGv_vec a)
+{
+    if (TCG_TARGET_HAS_neg_vec) {
+        tcg_gen_op2_vec(INDEX_op_neg64_vec, r, a);
+    } else {
+        TCGv_vec t = tcg_temp_new_vec_matching(r);
+        tcg_gen_movi_vec(t, 0);
+        tcg_gen_sub64_vec(r, t, a);
+        tcg_temp_free_vec(t);
+    }
+}
diff --git a/tcg/tcg.c b/tcg/tcg.c
index dff9999bc6..a4d55efdf0 100644
--- a/tcg/tcg.c
+++ b/tcg/tcg.c
@@ -116,7 +116,7 @@ static int tcg_target_const_match(tcg_target_long val, TCGType type,
 static bool tcg_out_ldst_finalize(TCGContext *s);
 #endif
 
-static TCGRegSet tcg_target_available_regs[2];
+static TCGRegSet tcg_target_available_regs[TCG_TYPE_COUNT];
 static TCGRegSet tcg_target_call_clobber_regs;
 
 #if TCG_TARGET_INSN_UNIT_SIZE == 1
@@ -664,6 +664,44 @@ TCGv_i64 tcg_temp_new_internal_i64(int temp_local)
     return MAKE_TCGV_I64(idx);
 }
 
+TCGv_vec tcg_temp_new_vec(TCGType type)
+{
+    int idx;
+
+#ifdef CONFIG_DEBUG_TCG
+    switch (type) {
+    case TCG_TYPE_V64:
+        assert(TCG_TARGET_HAS_v64);
+        break;
+    case TCG_TYPE_V128:
+        assert(TCG_TARGET_HAS_v128);
+        break;
+    case TCG_TYPE_V256:
+        assert(TCG_TARGET_HAS_v256);
+        break;
+    default:
+        g_assert_not_reached();
+    }
+#endif
+
+    idx = tcg_temp_new_internal(type, 0);
+    return MAKE_TCGV_VEC(idx);
+}
+
+TCGv_vec tcg_temp_new_vec_matching(TCGv_vec match)
+{
+    TCGContext *s = &tcg_ctx;
+    int idx = GET_TCGV_VEC(match);
+    TCGTemp *ts;
+
+    tcg_debug_assert(idx >= s->nb_globals && idx < s->nb_temps);
+    ts = &s->temps[idx];
+    tcg_debug_assert(ts->temp_allocated != 0);
+
+    idx = tcg_temp_new_internal(ts->base_type, 0);
+    return MAKE_TCGV_VEC(idx);
+}
+
 static void tcg_temp_free_internal(int idx)
 {
     TCGContext *s = &tcg_ctx;
@@ -696,6 +734,11 @@ void tcg_temp_free_i64(TCGv_i64 arg)
     tcg_temp_free_internal(GET_TCGV_I64(arg));
 }
 
+void tcg_temp_free_vec(TCGv_vec arg)
+{
+    tcg_temp_free_internal(GET_TCGV_VEC(arg));
+}
+
 TCGv_i32 tcg_const_i32(int32_t val)
 {
     TCGv_i32 t0;
@@ -753,6 +796,9 @@ int tcg_check_temp_count(void)
    Test the runtime variable that controls each opcode.  */
 bool tcg_op_supported(TCGOpcode op)
 {
+    const bool have_vec
+        = TCG_TARGET_HAS_v64 | TCG_TARGET_HAS_v128 | TCG_TARGET_HAS_v256;
+
     switch (op) {
     case INDEX_op_discard:
     case INDEX_op_set_label:
@@ -966,6 +1012,35 @@ bool tcg_op_supported(TCGOpcode op)
     case INDEX_op_mulsh_i64:
         return TCG_TARGET_HAS_mulsh_i64;
 
+    case INDEX_op_mov_vec:
+    case INDEX_op_movi_vec:
+    case INDEX_op_ld_vec:
+    case INDEX_op_ldz_vec:
+    case INDEX_op_st_vec:
+    case INDEX_op_add8_vec:
+    case INDEX_op_add16_vec:
+    case INDEX_op_add32_vec:
+    case INDEX_op_add64_vec:
+    case INDEX_op_sub8_vec:
+    case INDEX_op_sub16_vec:
+    case INDEX_op_sub32_vec:
+    case INDEX_op_sub64_vec:
+    case INDEX_op_and_vec:
+    case INDEX_op_or_vec:
+    case INDEX_op_xor_vec:
+        return have_vec;
+    case INDEX_op_not_vec:
+        return have_vec && TCG_TARGET_HAS_not_vec;
+    case INDEX_op_neg8_vec:
+    case INDEX_op_neg16_vec:
+    case INDEX_op_neg32_vec:
+    case INDEX_op_neg64_vec:
+        return have_vec && TCG_TARGET_HAS_neg_vec;
+    case INDEX_op_andc_vec:
+        return have_vec && TCG_TARGET_HAS_andc_vec;
+    case INDEX_op_orc_vec:
+        return have_vec && TCG_TARGET_HAS_orc_vec;
+
     case NB_OPS:
         break;
     }
diff --git a/tcg/README b/tcg/README
index 03bfb6acd4..3bf3af67db 100644
--- a/tcg/README
+++ b/tcg/README
@@ -503,6 +503,52 @@ of the memory access.
 For a 32-bit host, qemu_ld/st_i64 is guaranteed to only be used with a
 64-bit memory access specified in flags.
 
+********* Host vector operations
+
+All of the vector ops have a final constant argument LEN that specifies the
+length of the vector operation as 64 << LEN bits.
+
+* mov_vec   v0, v1, len
+* ld_vec    v0, t1, len
+* st_vec    v0, t1, len
+
+  Move, load and store.
+
+* movi_vec  v0, c, len
+
+  Copy C across the entire vector.
+  At present the only supported values for C are 0 and -1.
+
+* add8_vec    v0, v1, v2, len
+* add16_vec   v0, v1, v2, len
+* add32_vec   v0, v1, v2, len
+* add64_vec   v0, v1, v2, len
+
+  v0 = v1 + v2, in elements of 8/16/32/64 bits, across len.
+
+* sub8_vec    v0, v1, v2, len
+* sub16_vec   v0, v1, v2, len
+* sub32_vec   v0, v1, v2, len
+* sub64_vec   v0, v1, v2, len
+
+  Similarly, v0 = v1 - v2.
+
+* neg8_vec    v0, v1, len
+* neg16_vec   v0, v1, len
+* neg32_vec   v0, v1, len
+* neg64_vec   v0, v1, len
+
+  Similarly, v0 = -v1.
+
+* and_vec     v0, v1, v2, len
+* or_vec      v0, v1, v2, len
+* xor_vec     v0, v1, v2, len
+* andc_vec    v0, v1, v2, len
+* orc_vec     v0, v1, v2, len
+* not_vec     v0, v1, len
+
+  Similarly, logical operations.
+
 *********
 
 Note 1: Some shortcuts are defined when the last operand is known to be
-- 
2.13.5
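
As a rough illustration (not part of the patch), the LEN argument described
in the README hunk above is just the TCGType biased by TCG_TYPE_V64, so the
vector width in bits can be recovered as 64 << LEN; vec_len_to_bits below is
a hypothetical helper written only for this example:

static int vec_len_to_bits(TCGType type)
{
    int len = type - TCG_TYPE_V64;   /* 0 for V64, 1 for V128, 2 for V256 */
    return 64 << len;                /* i.e. 64, 128 or 256 bits */
}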


* [Qemu-devel] [PATCH v3 2/6] tcg: Add vector expanders
  2017-09-16  2:34 [Qemu-devel] [PATCH v3 0/6] TCG vectorization and example conversion Richard Henderson
  2017-09-16  2:34 ` [Qemu-devel] [PATCH v3 1/6] tcg: Add types and operations for host vectors Richard Henderson
@ 2017-09-16  2:34 ` Richard Henderson
  2017-09-26 22:31   ` Alex Bennée
  2017-09-16  2:34 ` [Qemu-devel] [PATCH v3 3/6] target/arm: Align vector registers Richard Henderson
                   ` (5 subsequent siblings)
  7 siblings, 1 reply; 14+ messages in thread
From: Richard Henderson @ 2017-09-16  2:34 UTC (permalink / raw)
  To: qemu-devel; +Cc: alex.bennee, f4bug

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 Makefile.target              |   2 +-
 accel/tcg/tcg-runtime.h      |  24 ++
 tcg/tcg-gvec-desc.h          |  49 +++
 tcg/tcg-op-gvec.h            | 143 ++++++++
 accel/tcg/tcg-runtime-gvec.c | 255 +++++++++++++
 tcg/tcg-op-gvec.c            | 853 +++++++++++++++++++++++++++++++++++++++++++
 accel/tcg/Makefile.objs      |   2 +-
 7 files changed, 1326 insertions(+), 2 deletions(-)
 create mode 100644 tcg/tcg-gvec-desc.h
 create mode 100644 tcg/tcg-op-gvec.h
 create mode 100644 accel/tcg/tcg-runtime-gvec.c
 create mode 100644 tcg/tcg-op-gvec.c

diff --git a/Makefile.target b/Makefile.target
index 6361f957fb..f9967feef5 100644
--- a/Makefile.target
+++ b/Makefile.target
@@ -94,7 +94,7 @@ all: $(PROGS) stap
 obj-y += exec.o
 obj-y += accel/
 obj-$(CONFIG_TCG) += tcg/tcg.o tcg/tcg-op.o tcg/optimize.o
-obj-$(CONFIG_TCG) += tcg/tcg-common.o
+obj-$(CONFIG_TCG) += tcg/tcg-common.o tcg/tcg-op-gvec.o
 obj-$(CONFIG_TCG_INTERPRETER) += tcg/tci.o
 obj-$(CONFIG_TCG_INTERPRETER) += disas/tci.o
 obj-y += fpu/softfloat.o
diff --git a/accel/tcg/tcg-runtime.h b/accel/tcg/tcg-runtime.h
index c41d38a557..61c0ce39d3 100644
--- a/accel/tcg/tcg-runtime.h
+++ b/accel/tcg/tcg-runtime.h
@@ -134,3 +134,27 @@ GEN_ATOMIC_HELPERS(xor_fetch)
 GEN_ATOMIC_HELPERS(xchg)
 
 #undef GEN_ATOMIC_HELPERS
+
+DEF_HELPER_FLAGS_3(gvec_mov, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_4(gvec_add8, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(gvec_add16, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(gvec_add32, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(gvec_add64, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_4(gvec_sub8, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(gvec_sub16, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(gvec_sub32, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(gvec_sub64, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_3(gvec_neg8, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+DEF_HELPER_FLAGS_3(gvec_neg16, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+DEF_HELPER_FLAGS_3(gvec_neg32, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+DEF_HELPER_FLAGS_3(gvec_neg64, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_3(gvec_not, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(gvec_and, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(gvec_or, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(gvec_xor, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(gvec_andc, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(gvec_orc, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
diff --git a/tcg/tcg-gvec-desc.h b/tcg/tcg-gvec-desc.h
new file mode 100644
index 0000000000..8ba9a8168d
--- /dev/null
+++ b/tcg/tcg-gvec-desc.h
@@ -0,0 +1,49 @@
+/*
+ *  Generic vector operation descriptor
+ *
+ *  Copyright (c) 2017 Linaro
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+/* ??? These bit widths are set for ARM SVE, maxing out at 256 byte vectors. */
+#define SIMD_OPRSZ_SHIFT   0
+#define SIMD_OPRSZ_BITS    5
+
+#define SIMD_MAXSZ_SHIFT   (SIMD_OPRSZ_SHIFT + SIMD_OPRSZ_BITS)
+#define SIMD_MAXSZ_BITS    5
+
+#define SIMD_DATA_SHIFT    (SIMD_MAXSZ_SHIFT + SIMD_MAXSZ_BITS)
+#define SIMD_DATA_BITS     (32 - SIMD_DATA_SHIFT)
+
+/* Create a descriptor from components.  */
+uint32_t simd_desc(uint32_t oprsz, uint32_t maxsz, int32_t data);
+
+/* Extract the operation size from a descriptor.  */
+static inline intptr_t simd_oprsz(uint32_t desc)
+{
+    return (extract32(desc, SIMD_OPRSZ_SHIFT, SIMD_OPRSZ_BITS) + 1) * 8;
+}
+
+/* Extract the max vector size from a descriptor.  */
+static inline intptr_t simd_maxsz(uint32_t desc)
+{
+    return (extract32(desc, SIMD_MAXSZ_SHIFT, SIMD_MAXSZ_BITS) + 1) * 8;
+}
+
+/* Extract the operation-specific data from a descriptor.  */
+static inline int32_t simd_data(uint32_t desc)
+{
+    return sextract32(desc, SIMD_DATA_SHIFT, SIMD_DATA_BITS);
+}
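
/* Illustration, not part of the patch: a worked example of the descriptor
 * packing declared above, for simd_desc(oprsz = 16, maxsz = 16, data = 0):
 *   oprsz: 16/8 - 1 = 1  -> bits [4:0]   = 1
 *   maxsz: 16/8 - 1 = 1  -> bits [9:5]   = 1
 *   data:  0             -> bits [31:10] = 0
 * giving desc = 0x21, so that
 *
 *     uint32_t desc = simd_desc(16, 16, 0);            // == 0x21
 *     tcg_debug_assert(simd_oprsz(desc) == 16);
 *     tcg_debug_assert(simd_maxsz(desc) == 16);
 *     tcg_debug_assert(simd_data(desc) == 0);
 */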
diff --git a/tcg/tcg-op-gvec.h b/tcg/tcg-op-gvec.h
new file mode 100644
index 0000000000..28bd77f1dc
--- /dev/null
+++ b/tcg/tcg-op-gvec.h
@@ -0,0 +1,143 @@
+/*
+ *  Generic vector operation expansion
+ *
+ *  Copyright (c) 2017 Linaro
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+/*
+ * "Generic" vectors.  All operands are given as offsets from ENV,
+ * and therefore cannot also be allocated via tcg_global_mem_new_*.
+ * OPRSZ is the byte size of the vector upon which the operation is performed.
+ * MAXSZ is the byte size of the full vector; bytes beyond OPRSZ are cleared.
+ *
+ * All sizes must be 8 or any multiple of 16.
+ * When OPRSZ is 8, the alignment may be 8, otherwise must be 16.
+ * Operands may completely, but not partially, overlap.
+ */
+
+/* Expand a call to a gvec-style helper, with pointers to two vector
+   operands, and a descriptor (see tcg-gvec-desc.h).  */
+typedef void (gen_helper_gvec_2)(TCGv_ptr, TCGv_ptr, TCGv_i32);
+void tcg_gen_gvec_2_ool(uint32_t dofs, uint32_t aofs,
+                        uint32_t oprsz, uint32_t maxsz, int32_t data,
+                        gen_helper_gvec_2 *fn);
+
+/* Similarly, passing an extra pointer (e.g. env or float_status).  */
+typedef void (gen_helper_gvec_2_ptr)(TCGv_ptr, TCGv_ptr, TCGv_ptr, TCGv_i32);
+void tcg_gen_gvec_2_ptr(uint32_t dofs, uint32_t aofs,
+                        TCGv_ptr ptr, uint32_t oprsz, uint32_t maxsz,
+                        int32_t data, gen_helper_gvec_2_ptr *fn);
+
+/* Similarly, with three vector operands.  */
+typedef void (gen_helper_gvec_3)(TCGv_ptr, TCGv_ptr, TCGv_ptr, TCGv_i32);
+void tcg_gen_gvec_3_ool(uint32_t dofs, uint32_t aofs, uint32_t bofs,
+                        uint32_t oprsz, uint32_t maxsz, int32_t data,
+                        gen_helper_gvec_3 *fn);
+
+typedef void (gen_helper_gvec_3_ptr)(TCGv_ptr, TCGv_ptr, TCGv_ptr,
+                                     TCGv_ptr, TCGv_i32);
+void tcg_gen_gvec_3_ptr(uint32_t dofs, uint32_t aofs, uint32_t bofs,
+                        TCGv_ptr ptr, uint32_t oprsz, uint32_t maxsz,
+                        int32_t data, gen_helper_gvec_3_ptr *fn);
+
+/* Expand a gvec operation.  Either inline or out-of-line depending on
+   the actual vector size and the operations supported by the host.  */
+typedef struct {
+    /* Expand inline as a 64-bit or 32-bit integer.
+       Only one of these will be non-NULL.  */
+    void (*fni8)(TCGv_i64, TCGv_i64);
+    void (*fni4)(TCGv_i32, TCGv_i32);
+    /* Expand inline with a host vector type.  */
+    void (*fniv)(TCGv_vec, TCGv_vec);
+    /* Expand out-of-line helper w/descriptor.  */
+    gen_helper_gvec_2 *fno;
+    /* Prefer i64 to v64.  */
+    bool prefer_i64;
+} GVecGen2;
+
+typedef struct {
+    /* Expand inline as a 64-bit or 32-bit integer.
+       Only one of these will be non-NULL.  */
+    void (*fni8)(TCGv_i64, TCGv_i64, TCGv_i64);
+    void (*fni4)(TCGv_i32, TCGv_i32, TCGv_i32);
+    /* Expand inline with a host vector type.  */
+    void (*fniv)(TCGv_vec, TCGv_vec, TCGv_vec);
+    /* Expand out-of-line helper w/descriptor.  */
+    gen_helper_gvec_3 *fno;
+    /* Prefer i64 to v64.  */
+    bool prefer_i64;
+    /* Load dest as a 3rd source operand.  */
+    bool load_dest;
+} GVecGen3;
+
+void tcg_gen_gvec_2(uint32_t dofs, uint32_t aofs,
+                    uint32_t opsz, uint32_t clsz, const GVecGen2 *);
+void tcg_gen_gvec_3(uint32_t dofs, uint32_t aofs, uint32_t bofs,
+                    uint32_t opsz, uint32_t clsz, const GVecGen3 *);
+
+/* Expand a specific vector operation.  */
+
+#define DEF(X) \
+    void tcg_gen_gvec_##X(uint32_t dofs, uint32_t aofs, \
+                          uint32_t opsz, uint32_t clsz)
+
+DEF(mov);
+DEF(not);
+DEF(neg8);
+DEF(neg16);
+DEF(neg32);
+DEF(neg64);
+
+#undef DEF
+#define DEF(X) \
+    void tcg_gen_gvec_##X(uint32_t dofs, uint32_t aofs, uint32_t bofs, \
+                          uint32_t opsz, uint32_t clsz)
+
+DEF(add8);
+DEF(add16);
+DEF(add32);
+DEF(add64);
+
+DEF(sub8);
+DEF(sub16);
+DEF(sub32);
+DEF(sub64);
+
+DEF(and);
+DEF(or);
+DEF(xor);
+DEF(andc);
+DEF(orc);
+
+#undef DEF
+
+/*
+ * 64-bit vector operations.  Use these when the register has been allocated
+ * with tcg_global_mem_new_i64, and so we cannot also address it via pointer.
+ * OPRSZ = MAXSZ = 8.
+ */
+
+void tcg_gen_vec_neg8_i64(TCGv_i64 d, TCGv_i64 a);
+void tcg_gen_vec_neg16_i64(TCGv_i64 d, TCGv_i64 a);
+void tcg_gen_vec_neg32_i64(TCGv_i64 d, TCGv_i64 a);
+
+void tcg_gen_vec_add8_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b);
+void tcg_gen_vec_add16_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b);
+void tcg_gen_vec_add32_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b);
+
+void tcg_gen_vec_sub8_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b);
+void tcg_gen_vec_sub16_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b);
+void tcg_gen_vec_sub32_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b);
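
/* Illustration, not part of the patch: with the declarations above, a
 * front-end whose 16-byte vector registers live in env would expand a
 * vector add of 32-bit elements entirely via env offsets, e.g.
 *
 *     tcg_gen_gvec_add32(offsetof(CPUFooState, vreg[d]),
 *                        offsetof(CPUFooState, vreg[n]),
 *                        offsetof(CPUFooState, vreg[m]),
 *                        16, 16);
 *
 * CPUFooState and vreg[] are hypothetical names; opsz = clsz = 16 means the
 * whole 16-byte register is operated on and nothing beyond it is cleared.
 */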
diff --git a/accel/tcg/tcg-runtime-gvec.c b/accel/tcg/tcg-runtime-gvec.c
new file mode 100644
index 0000000000..c75e76367c
--- /dev/null
+++ b/accel/tcg/tcg-runtime-gvec.c
@@ -0,0 +1,255 @@
+/*
+ *  Generic vectorized operation runtime
+ *
+ *  Copyright (c) 2017 Linaro
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "qemu/osdep.h"
+#include "qemu/host-utils.h"
+#include "cpu.h"
+#include "exec/helper-proto.h"
+#include "tcg-gvec-desc.h"
+
+
+/* Virtually all hosts support 16-byte vectors.  Those that don't can emulate
+   them via GCC's generic vector extension.  This turns out to be simpler and
+   more reliable than getting the compiler to autovectorize.
+
+   In tcg-op-gvec.c, we asserted that both the size and alignment
+   of the data are multiples of 16.  */
+
+typedef uint8_t vec8 __attribute__((vector_size(16)));
+typedef uint16_t vec16 __attribute__((vector_size(16)));
+typedef uint32_t vec32 __attribute__((vector_size(16)));
+typedef uint64_t vec64 __attribute__((vector_size(16)));
+
+static inline void clear_high(void *d, intptr_t oprsz, uint32_t desc)
+{
+    intptr_t maxsz = simd_maxsz(desc);
+    intptr_t i;
+
+    if (unlikely(maxsz > oprsz)) {
+        for (i = oprsz; i < maxsz; i += sizeof(vec64)) {
+            *(vec64 *)(d + i) = (vec64){ 0 };
+        }
+    }
+}
+
+void HELPER(gvec_add8)(void *d, void *a, void *b, uint32_t desc)
+{
+    intptr_t oprsz = simd_oprsz(desc);
+    intptr_t i;
+
+    for (i = 0; i < oprsz; i += sizeof(vec8)) {
+        *(vec8 *)(d + i) = *(vec8 *)(a + i) + *(vec8 *)(b + i);
+    }
+    clear_high(d, oprsz, desc);
+}
+
+void HELPER(gvec_add16)(void *d, void *a, void *b, uint32_t desc)
+{
+    intptr_t oprsz = simd_oprsz(desc);
+    intptr_t i;
+
+    for (i = 0; i < oprsz; i += sizeof(vec16)) {
+        *(vec16 *)(d + i) = *(vec16 *)(a + i) + *(vec16 *)(b + i);
+    }
+    clear_high(d, oprsz, desc);
+}
+
+void HELPER(gvec_add32)(void *d, void *a, void *b, uint32_t desc)
+{
+    intptr_t oprsz = simd_oprsz(desc);
+    intptr_t i;
+
+    for (i = 0; i < oprsz; i += sizeof(vec32)) {
+        *(vec32 *)(d + i) = *(vec32 *)(a + i) + *(vec32 *)(b + i);
+    }
+    clear_high(d, oprsz, desc);
+}
+
+void HELPER(gvec_add64)(void *d, void *a, void *b, uint32_t desc)
+{
+    intptr_t oprsz = simd_oprsz(desc);
+    intptr_t i;
+
+    for (i = 0; i < oprsz; i += sizeof(vec64)) {
+        *(vec64 *)(d + i) = *(vec64 *)(a + i) + *(vec64 *)(b + i);
+    }
+    clear_high(d, oprsz, desc);
+}
+
+void HELPER(gvec_sub8)(void *d, void *a, void *b, uint32_t desc)
+{
+    intptr_t oprsz = simd_oprsz(desc);
+    intptr_t i;
+
+    for (i = 0; i < oprsz; i += sizeof(vec8)) {
+        *(vec8 *)(d + i) = *(vec8 *)(a + i) - *(vec8 *)(b + i);
+    }
+    clear_high(d, oprsz, desc);
+}
+
+void HELPER(gvec_sub16)(void *d, void *a, void *b, uint32_t desc)
+{
+    intptr_t oprsz = simd_oprsz(desc);
+    intptr_t i;
+
+    for (i = 0; i < oprsz; i += sizeof(vec16)) {
+        *(vec16 *)(d + i) = *(vec16 *)(a + i) - *(vec16 *)(b + i);
+    }
+    clear_high(d, oprsz, desc);
+}
+
+void HELPER(gvec_sub32)(void *d, void *a, void *b, uint32_t desc)
+{
+    intptr_t oprsz = simd_oprsz(desc);
+    intptr_t i;
+
+    for (i = 0; i < oprsz; i += sizeof(vec32)) {
+        *(vec32 *)(d + i) = *(vec32 *)(a + i) - *(vec32 *)(b + i);
+    }
+    clear_high(d, oprsz, desc);
+}
+
+void HELPER(gvec_sub64)(void *d, void *a, void *b, uint32_t desc)
+{
+    intptr_t oprsz = simd_oprsz(desc);
+    intptr_t i;
+
+    for (i = 0; i < oprsz; i += sizeof(vec64)) {
+        *(vec64 *)(d + i) = *(vec64 *)(a + i) - *(vec64 *)(b + i);
+    }
+    clear_high(d, oprsz, desc);
+}
+
+void HELPER(gvec_neg8)(void *d, void *a, uint32_t desc)
+{
+    intptr_t oprsz = simd_oprsz(desc);
+    intptr_t i;
+
+    for (i = 0; i < oprsz; i += sizeof(vec8)) {
+        *(vec8 *)(d + i) = -*(vec8 *)(a + i);
+    }
+    clear_high(d, oprsz, desc);
+}
+
+void HELPER(gvec_neg16)(void *d, void *a, uint32_t desc)
+{
+    intptr_t oprsz = simd_oprsz(desc);
+    intptr_t i;
+
+    for (i = 0; i < oprsz; i += sizeof(vec16)) {
+        *(vec16 *)(d + i) = -*(vec16 *)(a + i);
+    }
+    clear_high(d, oprsz, desc);
+}
+
+void HELPER(gvec_neg32)(void *d, void *a, uint32_t desc)
+{
+    intptr_t oprsz = simd_oprsz(desc);
+    intptr_t i;
+
+    for (i = 0; i < oprsz; i += sizeof(vec32)) {
+        *(vec32 *)(d + i) = -*(vec32 *)(a + i);
+    }
+    clear_high(d, oprsz, desc);
+}
+
+void HELPER(gvec_neg64)(void *d, void *a, uint32_t desc)
+{
+    intptr_t oprsz = simd_oprsz(desc);
+    intptr_t i;
+
+    for (i = 0; i < oprsz; i += sizeof(vec64)) {
+        *(vec64 *)(d + i) = -*(vec64 *)(a + i);
+    }
+    clear_high(d, oprsz, desc);
+}
+
+void HELPER(gvec_mov)(void *d, void *a, uint32_t desc)
+{
+    intptr_t oprsz = simd_oprsz(desc);
+
+    memcpy(d, a, oprsz);
+    clear_high(d, oprsz, desc);
+}
+
+void HELPER(gvec_not)(void *d, void *a, uint32_t desc)
+{
+    intptr_t oprsz = simd_oprsz(desc);
+    intptr_t i;
+
+    for (i = 0; i < oprsz; i += sizeof(vec64)) {
+        *(vec64 *)(d + i) = ~*(vec64 *)(a + i);
+    }
+    clear_high(d, oprsz, desc);
+}
+
+void HELPER(gvec_and)(void *d, void *a, void *b, uint32_t desc)
+{
+    intptr_t oprsz = simd_oprsz(desc);
+    intptr_t i;
+
+    for (i = 0; i < oprsz; i += sizeof(vec64)) {
+        *(vec64 *)(d + i) = *(vec64 *)(a + i) & *(vec64 *)(b + i);
+    }
+    clear_high(d, oprsz, desc);
+}
+
+void HELPER(gvec_or)(void *d, void *a, void *b, uint32_t desc)
+{
+    intptr_t oprsz = simd_oprsz(desc);
+    intptr_t i;
+
+    for (i = 0; i < oprsz; i += sizeof(vec64)) {
+        *(vec64 *)(d + i) = *(vec64 *)(a + i) | *(vec64 *)(b + i);
+    }
+    clear_high(d, oprsz, desc);
+}
+
+void HELPER(gvec_xor)(void *d, void *a, void *b, uint32_t desc)
+{
+    intptr_t oprsz = simd_oprsz(desc);
+    intptr_t i;
+
+    for (i = 0; i < oprsz; i += sizeof(vec64)) {
+        *(vec64 *)(d + i) = *(vec64 *)(a + i) ^ *(vec64 *)(b + i);
+    }
+    clear_high(d, oprsz, desc);
+}
+
+void HELPER(gvec_andc)(void *d, void *a, void *b, uint32_t desc)
+{
+    intptr_t oprsz = simd_oprsz(desc);
+    intptr_t i;
+
+    for (i = 0; i < oprsz; i += sizeof(vec64)) {
+        *(vec64 *)(d + i) = *(vec64 *)(a + i) &~ *(vec64 *)(b + i);
+    }
+    clear_high(d, oprsz, desc);
+}
+
+void HELPER(gvec_orc)(void *d, void *a, void *b, uint32_t desc)
+{
+    intptr_t oprsz = simd_oprsz(desc);
+    intptr_t i;
+
+    for (i = 0; i < oprsz; i += sizeof(vec64)) {
+        *(vec64 *)(d + i) = *(vec64 *)(a + i) |~ *(vec64 *)(b + i);
+    }
+    clear_high(d, oprsz, desc);
+}
diff --git a/tcg/tcg-op-gvec.c b/tcg/tcg-op-gvec.c
new file mode 100644
index 0000000000..7464321eba
--- /dev/null
+++ b/tcg/tcg-op-gvec.c
@@ -0,0 +1,853 @@
+/*
+ *  Generic vector operation expansion
+ *
+ *  Copyright (c) 2017 Linaro
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "qemu/osdep.h"
+#include "qemu-common.h"
+#include "tcg.h"
+#include "tcg-op.h"
+#include "tcg-op-gvec.h"
+#include "tcg-gvec-desc.h"
+
+#define REP8(x)    ((x) * 0x0101010101010101ull)
+#define REP16(x)   ((x) * 0x0001000100010001ull)
+
+#define MAX_UNROLL  4
+
+/* Verify vector size and alignment rules.  OFS should be the OR of all
+   of the operand offsets so that we can check them all at once.  */
+static void check_size_align(uint32_t oprsz, uint32_t maxsz, uint32_t ofs)
+{
+    uint32_t align = maxsz > 16 || oprsz >= 16 ? 15 : 7;
+    tcg_debug_assert(oprsz > 0);
+    tcg_debug_assert(oprsz <= maxsz);
+    tcg_debug_assert((oprsz & align) == 0);
+    tcg_debug_assert((maxsz & align) == 0);
+    tcg_debug_assert((ofs & align) == 0);
+}
+
+/* Verify vector overlap rules for two operands.  */
+static void check_overlap_2(uint32_t d, uint32_t a, uint32_t s)
+{
+    tcg_debug_assert(d == a || d + s <= a || a + s <= d);
+}
+
+/* Verify vector overlap rules for three operands.  */
+static void check_overlap_3(uint32_t d, uint32_t a, uint32_t b, uint32_t s)
+{
+    check_overlap_2(d, a, s);
+    check_overlap_2(d, b, s);
+    check_overlap_2(a, b, s);
+}
+
+/* Create a descriptor from components.  */
+uint32_t simd_desc(uint32_t oprsz, uint32_t maxsz, int32_t data)
+{
+    uint32_t desc = 0;
+
+    assert(oprsz % 8 == 0 && oprsz <= (8 << SIMD_OPRSZ_BITS));
+    assert(maxsz % 8 == 0 && maxsz <= (8 << SIMD_MAXSZ_BITS));
+    assert(data == sextract32(data, 0, SIMD_DATA_BITS));
+
+    oprsz = (oprsz / 8) - 1;
+    maxsz = (maxsz / 8) - 1;
+    desc = deposit32(desc, SIMD_OPRSZ_SHIFT, SIMD_OPRSZ_BITS, oprsz);
+    desc = deposit32(desc, SIMD_MAXSZ_SHIFT, SIMD_MAXSZ_BITS, maxsz);
+    desc = deposit32(desc, SIMD_DATA_SHIFT, SIMD_DATA_BITS, data);
+
+    return desc;
+}
+
+/* Generate a call to a gvec-style helper with two vector operands.  */
+void tcg_gen_gvec_2_ool(uint32_t dofs, uint32_t aofs,
+                        uint32_t oprsz, uint32_t maxsz, int32_t data,
+                        gen_helper_gvec_2 *fn)
+{
+    TCGv_ptr a0, a1;
+    TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));
+
+    a0 = tcg_temp_new_ptr();
+    a1 = tcg_temp_new_ptr();
+
+    tcg_gen_addi_ptr(a0, tcg_ctx.tcg_env, dofs);
+    tcg_gen_addi_ptr(a1, tcg_ctx.tcg_env, aofs);
+
+    fn(a0, a1, desc);
+
+    tcg_temp_free_ptr(a0);
+    tcg_temp_free_ptr(a1);
+    tcg_temp_free_i32(desc);
+}
+
+/* Generate a call to a gvec-style helper with three vector operands.  */
+void tcg_gen_gvec_3_ool(uint32_t dofs, uint32_t aofs, uint32_t bofs,
+                        uint32_t oprsz, uint32_t maxsz, int32_t data,
+                        gen_helper_gvec_3 *fn)
+{
+    TCGv_ptr a0, a1, a2;
+    TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));
+
+    a0 = tcg_temp_new_ptr();
+    a1 = tcg_temp_new_ptr();
+    a2 = tcg_temp_new_ptr();
+
+    tcg_gen_addi_ptr(a0, tcg_ctx.tcg_env, dofs);
+    tcg_gen_addi_ptr(a1, tcg_ctx.tcg_env, aofs);
+    tcg_gen_addi_ptr(a2, tcg_ctx.tcg_env, bofs);
+
+    fn(a0, a1, a2, desc);
+
+    tcg_temp_free_ptr(a0);
+    tcg_temp_free_ptr(a1);
+    tcg_temp_free_ptr(a2);
+    tcg_temp_free_i32(desc);
+}
+
+/* Generate a call to a gvec-style helper with two vector operands
+   and an extra pointer operand.  */
+void tcg_gen_gvec_2_ptr(uint32_t dofs, uint32_t aofs,
+                        TCGv_ptr ptr, uint32_t oprsz, uint32_t maxsz,
+                        int32_t data, gen_helper_gvec_2_ptr *fn)
+{
+    TCGv_ptr a0, a1;
+    TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));
+
+    a0 = tcg_temp_new_ptr();
+    a1 = tcg_temp_new_ptr();
+
+    tcg_gen_addi_ptr(a0, tcg_ctx.tcg_env, dofs);
+    tcg_gen_addi_ptr(a1, tcg_ctx.tcg_env, aofs);
+
+    fn(a0, a1, ptr, desc);
+
+    tcg_temp_free_ptr(a0);
+    tcg_temp_free_ptr(a1);
+    tcg_temp_free_i32(desc);
+}
+
+/* Generate a call to a gvec-style helper with three vector operands
+   and an extra pointer operand.  */
+void tcg_gen_gvec_3_ptr(uint32_t dofs, uint32_t aofs, uint32_t bofs,
+                        TCGv_ptr ptr, uint32_t oprsz, uint32_t maxsz,
+                        int32_t data, gen_helper_gvec_3_ptr *fn)
+{
+    TCGv_ptr a0, a1, a2;
+    TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));
+
+    a0 = tcg_temp_new_ptr();
+    a1 = tcg_temp_new_ptr();
+    a2 = tcg_temp_new_ptr();
+
+    tcg_gen_addi_ptr(a0, tcg_ctx.tcg_env, dofs);
+    tcg_gen_addi_ptr(a1, tcg_ctx.tcg_env, aofs);
+    tcg_gen_addi_ptr(a2, tcg_ctx.tcg_env, bofs);
+
+    fn(a0, a1, a2, ptr, desc);
+
+    tcg_temp_free_ptr(a0);
+    tcg_temp_free_ptr(a1);
+    tcg_temp_free_ptr(a2);
+    tcg_temp_free_i32(desc);
+}
+
+/* Return true if we want to implement something of OPRSZ bytes
+   in units of LNSZ.  This limits the expansion of inline code.  */
+static inline bool check_size_impl(uint32_t oprsz, uint32_t lnsz)
+{
+    uint32_t lnct = oprsz / lnsz;
+    return lnct >= 1 && lnct <= MAX_UNROLL;
+}
+
+/* Clear MAXSZ bytes at DOFS.  */
+static void expand_clr(uint32_t dofs, uint32_t maxsz)
+{
+    if (maxsz >= 16 && TCG_TARGET_HAS_v128) {
+        TCGv_vec zero;
+
+        if (maxsz >= 32 && TCG_TARGET_HAS_v256) {
+            zero = tcg_temp_new_vec(TCG_TYPE_V256);
+            tcg_gen_movi_vec(zero, 0);
+
+            for (; maxsz >= 32; dofs += 32, maxsz -= 32) {
+                tcg_gen_stl_vec(zero, tcg_ctx.tcg_env, dofs, TCG_TYPE_V256);
+            }
+        } else {
+            zero = tcg_temp_new_vec(TCG_TYPE_V128);
+            tcg_gen_movi_vec(zero, 0);
+        }
+        for (; maxsz >= 16; dofs += 16, maxsz -= 16) {
+            tcg_gen_stl_vec(zero, tcg_ctx.tcg_env, dofs, TCG_TYPE_V128);
+        }
+
+        tcg_temp_free_vec(zero);
+    } else if (TCG_TARGET_REG_BITS == 64) {
+        TCGv_i64 zero = tcg_const_i64(0);
+
+        for (; maxsz >= 8; dofs += 8, maxsz -= 8) {
+            tcg_gen_st_i64(zero, tcg_ctx.tcg_env, dofs);
+        }
+
+        tcg_temp_free_i64(zero);
+    } else if (TCG_TARGET_HAS_v64) {
+        TCGv_vec zero = tcg_temp_new_vec(TCG_TYPE_V64);
+
+        tcg_gen_movi_vec(zero, 0);
+        for (; maxsz >= 8; dofs += 8, maxsz -= 8) {
+            tcg_gen_st_vec(zero, tcg_ctx.tcg_env, dofs);
+        }
+
+        tcg_temp_free_vec(zero);
+    } else {
+        TCGv_i32 zero = tcg_const_i32(0);
+
+        for (; maxsz >= 4; dofs += 4, maxsz -= 4) {
+            tcg_gen_st_i32(zero, tcg_ctx.tcg_env, dofs);
+        }
+
+        tcg_temp_free_i32(zero);
+    }
+}
+
+/* Expand OPSZ bytes worth of two-operand operations using i32 elements.  */
+static void expand_2_i32(uint32_t dofs, uint32_t aofs, uint32_t opsz,
+                         void (*fni)(TCGv_i32, TCGv_i32))
+{
+    TCGv_i32 t0 = tcg_temp_new_i32();
+    uint32_t i;
+
+    for (i = 0; i < opsz; i += 4) {
+        tcg_gen_ld_i32(t0, tcg_ctx.tcg_env, aofs + i);
+        fni(t0, t0);
+        tcg_gen_st_i32(t0, tcg_ctx.tcg_env, dofs + i);
+    }
+    tcg_temp_free_i32(t0);
+}
+
+/* Expand OPSZ bytes worth of three-operand operations using i32 elements.  */
+static void expand_3_i32(uint32_t dofs, uint32_t aofs,
+                         uint32_t bofs, uint32_t opsz, bool load_dest,
+                         void (*fni)(TCGv_i32, TCGv_i32, TCGv_i32))
+{
+    TCGv_i32 t0 = tcg_temp_new_i32();
+    TCGv_i32 t1 = tcg_temp_new_i32();
+    TCGv_i32 t2 = tcg_temp_new_i32();
+    uint32_t i;
+
+    for (i = 0; i < opsz; i += 4) {
+        tcg_gen_ld_i32(t0, tcg_ctx.tcg_env, aofs + i);
+        tcg_gen_ld_i32(t1, tcg_ctx.tcg_env, bofs + i);
+        if (load_dest) {
+            tcg_gen_ld_i32(t2, tcg_ctx.tcg_env, dofs + i);
+        }
+        fni(t2, t0, t1);
+        tcg_gen_st_i32(t2, tcg_ctx.tcg_env, dofs + i);
+    }
+    tcg_temp_free_i32(t2);
+    tcg_temp_free_i32(t1);
+    tcg_temp_free_i32(t0);
+}
+
+/* Expand OPSZ bytes worth of two-operand operations using i64 elements.  */
+static void expand_2_i64(uint32_t dofs, uint32_t aofs, uint32_t opsz,
+                         void (*fni)(TCGv_i64, TCGv_i64))
+{
+    TCGv_i64 t0 = tcg_temp_new_i64();
+    uint32_t i;
+
+    for (i = 0; i < opsz; i += 8) {
+        tcg_gen_ld_i64(t0, tcg_ctx.tcg_env, aofs + i);
+        fni(t0, t0);
+        tcg_gen_st_i64(t0, tcg_ctx.tcg_env, dofs + i);
+    }
+    tcg_temp_free_i64(t0);
+}
+
+/* Expand OPSZ bytes worth of three-operand operations using i64 elements.  */
+static void expand_3_i64(uint32_t dofs, uint32_t aofs,
+                         uint32_t bofs, uint32_t opsz, bool load_dest,
+                         void (*fni)(TCGv_i64, TCGv_i64, TCGv_i64))
+{
+    TCGv_i64 t0 = tcg_temp_new_i64();
+    TCGv_i64 t1 = tcg_temp_new_i64();
+    TCGv_i64 t2 = tcg_temp_new_i64();
+    uint32_t i;
+
+    for (i = 0; i < opsz; i += 8) {
+        tcg_gen_ld_i64(t0, tcg_ctx.tcg_env, aofs + i);
+        tcg_gen_ld_i64(t1, tcg_ctx.tcg_env, bofs + i);
+        if (load_dest) {
+            tcg_gen_ld_i64(t2, tcg_ctx.tcg_env, dofs + i);
+        }
+        fni(t2, t0, t1);
+        tcg_gen_st_i64(t2, tcg_ctx.tcg_env, dofs + i);
+    }
+    tcg_temp_free_i64(t2);
+    tcg_temp_free_i64(t1);
+    tcg_temp_free_i64(t0);
+}
+
+/* Expand OPSZ bytes worth of two-operand operations using host vectors.  */
+static void expand_2_vec(uint32_t dofs, uint32_t aofs,
+                         uint32_t opsz, uint32_t tysz, TCGType type,
+                         void (*fni)(TCGv_vec, TCGv_vec))
+{
+    TCGv_vec t0 = tcg_temp_new_vec(type);
+    uint32_t i;
+
+    for (i = 0; i < opsz; i += tysz) {
+        tcg_gen_ld_vec(t0, tcg_ctx.tcg_env, aofs + i);
+        fni(t0, t0);
+        tcg_gen_st_vec(t0, tcg_ctx.tcg_env, dofs + i);
+    }
+    tcg_temp_free_vec(t0);
+}
+
+/* Expand OPSZ bytes worth of three-operand operations using host vectors.  */
+static void expand_3_vec(uint32_t dofs, uint32_t aofs,
+                         uint32_t bofs, uint32_t opsz,
+                         uint32_t tysz, TCGType type, bool load_dest,
+                         void (*fni)(TCGv_vec, TCGv_vec, TCGv_vec))
+{
+    TCGv_vec t0 = tcg_temp_new_vec(type);
+    TCGv_vec t1 = tcg_temp_new_vec(type);
+    TCGv_vec t2 = tcg_temp_new_vec(type);
+    uint32_t i;
+
+    for (i = 0; i < opsz; i += tysz) {
+        tcg_gen_ld_vec(t0, tcg_ctx.tcg_env, aofs + i);
+        tcg_gen_ld_vec(t1, tcg_ctx.tcg_env, bofs + i);
+        if (load_dest) {
+            tcg_gen_ld_vec(t2, tcg_ctx.tcg_env, dofs + i);
+        }
+        fni(t2, t0, t1);
+        tcg_gen_st_vec(t2, tcg_ctx.tcg_env, dofs + i);
+    }
+    tcg_temp_free_vec(t2);
+    tcg_temp_free_vec(t1);
+    tcg_temp_free_vec(t0);
+}
+
+/* Expand a vector two-operand operation.  */
+void tcg_gen_gvec_2(uint32_t dofs, uint32_t aofs,
+                    uint32_t oprsz, uint32_t maxsz, const GVecGen2 *g)
+{
+    check_size_align(oprsz, maxsz, dofs | aofs);
+    check_overlap_2(dofs, aofs, maxsz);
+
+    /* Quick check for sizes we won't support inline.  */
+    if (oprsz > MAX_UNROLL * 32 || maxsz > MAX_UNROLL * 32) {
+        goto do_ool;
+    }
+
+    /* Recall that ARM SVE allows vector sizes that are not a power of 2.
+       Expand with successively smaller host vector sizes.  The intent is
+       that e.g. oprsz == 80 would be expanded with 2x32 + 1x16.  */
+    /* ??? For maxsz > oprsz, the host may be able to use an op-sized
+       operation, zeroing the balance of the register.  We can then
+       use a cl-sized store to implement the clearing without an extra
+       store operation.  This is true for aarch64 and x86_64 hosts.  */
+
+    if (TCG_TARGET_HAS_v256 && check_size_impl(oprsz, 32)) {
+        uint32_t done = QEMU_ALIGN_DOWN(oprsz, 32);
+        expand_2_vec(dofs, aofs, done, 32, TCG_TYPE_V256, g->fniv);
+        dofs += done;
+        aofs += done;
+        oprsz -= done;
+        maxsz -= done;
+    }
+
+    if (TCG_TARGET_HAS_v128 && check_size_impl(oprsz, 16)) {
+        uint32_t done = QEMU_ALIGN_DOWN(oprsz, 16);
+        expand_2_vec(dofs, aofs, done, 16, TCG_TYPE_V128, g->fniv);
+        dofs += done;
+        aofs += done;
+        oprsz -= done;
+        maxsz -= done;
+    }
+
+    if (check_size_impl(oprsz, 8)) {
+        uint32_t done = QEMU_ALIGN_DOWN(oprsz, 8);
+        if (TCG_TARGET_HAS_v64 && !g->prefer_i64) {
+            expand_2_vec(dofs, aofs, done, 8, TCG_TYPE_V64, g->fniv);
+        } else if (g->fni8) {
+            expand_2_i64(dofs, aofs, done, g->fni8);
+        } else {
+            done = 0;
+        }
+        dofs += done;
+        aofs += done;
+        oprsz -= done;
+        maxsz -= done;
+    }
+
+    if (check_size_impl(oprsz, 4)) {
+        uint32_t done = QEMU_ALIGN_DOWN(oprsz, 4);
+        expand_2_i32(dofs, aofs, done, g->fni4);
+        dofs += done;
+        aofs += done;
+        oprsz -= done;
+        maxsz -= done;
+    }
+
+    if (oprsz == 0) {
+        if (maxsz != 0) {
+            expand_clr(dofs, maxsz);
+        }
+        return;
+    }
+
+ do_ool:
+    tcg_gen_gvec_2_ool(dofs, aofs, oprsz, maxsz, 0, g->fno);
+}
+
+/* Expand a vector three-operand operation.  */
+void tcg_gen_gvec_3(uint32_t dofs, uint32_t aofs, uint32_t bofs,
+                    uint32_t oprsz, uint32_t maxsz, const GVecGen3 *g)
+{
+    check_size_align(oprsz, maxsz, dofs | aofs | bofs);
+    check_overlap_3(dofs, aofs, bofs, maxsz);
+
+    /* Quick check for sizes we won't support inline.  */
+    if (oprsz > MAX_UNROLL * 32 || maxsz > MAX_UNROLL * 32) {
+        goto do_ool;
+    }
+
+    /* Recall that ARM SVE allows vector sizes that are not a power of 2.
+       Expand with successively smaller host vector sizes.  The intent is
+       that e.g. oprsz == 80 would be expanded with 2x32 + 1x16.  */
+    /* ??? For maxsz > oprsz, the host may be able to use an op-sized
+       operation, zeroing the balance of the register.  We can then
+       use a cl-sized store to implement the clearing without an extra
+       store operation.  This is true for aarch64 and x86_64 hosts.  */
+
+    if (TCG_TARGET_HAS_v256 && check_size_impl(oprsz, 32)) {
+        uint32_t done = QEMU_ALIGN_DOWN(oprsz, 32);
+        expand_3_vec(dofs, aofs, bofs, done, 32, TCG_TYPE_V256,
+                     g->load_dest, g->fniv);
+        dofs += done;
+        aofs += done;
+        bofs += done;
+        oprsz -= done;
+        maxsz -= done;
+    }
+
+    if (TCG_TARGET_HAS_v128 && check_size_impl(oprsz, 16)) {
+        uint32_t done = QEMU_ALIGN_DOWN(oprsz, 16);
+        expand_3_vec(dofs, aofs, bofs, done, 16, TCG_TYPE_V128,
+                     g->load_dest, g->fniv);
+        dofs += done;
+        aofs += done;
+        bofs += done;
+        oprsz -= done;
+        maxsz -= done;
+    }
+
+    if (check_size_impl(oprsz, 8)) {
+        uint32_t done = QEMU_ALIGN_DOWN(oprsz, 8);
+        if (TCG_TARGET_HAS_v64 && !g->prefer_i64) {
+            expand_3_vec(dofs, aofs, bofs, done, 8, TCG_TYPE_V64,
+                         g->load_dest, g->fniv);
+        } else if (g->fni8) {
+            expand_3_i64(dofs, aofs, bofs, done, g->load_dest, g->fni8);
+        } else {
+            done = 0;
+        }
+        dofs += done;
+        aofs += done;
+        bofs += done;
+        oprsz -= done;
+        maxsz -= done;
+    }
+
+    if (check_size_impl(oprsz, 4)) {
+        uint32_t done = QEMU_ALIGN_DOWN(oprsz, 4);
+        expand_3_i32(dofs, aofs, bofs, done, g->load_dest, g->fni4);
+        dofs += done;
+        aofs += done;
+        bofs += done;
+        oprsz -= done;
+        maxsz -= done;
+    }
+
+    if (oprsz == 0) {
+        if (maxsz != 0) {
+            expand_clr(dofs, maxsz);
+        }
+        return;
+    }
+
+ do_ool:
+    tcg_gen_gvec_3_ool(dofs, aofs, bofs, oprsz, maxsz, 0, g->fno);
+}
+
+/*
+ * Expand specific vector operations.
+ */
+
+void tcg_gen_gvec_mov(uint32_t dofs, uint32_t aofs,
+                      uint32_t opsz, uint32_t clsz)
+{
+    static const GVecGen2 g = {
+        .fni8 = tcg_gen_mov_i64,
+        .fniv = tcg_gen_mov_vec,
+        .fno = gen_helper_gvec_mov,
+        .prefer_i64 = TCG_TARGET_REG_BITS == 64,
+    };
+    tcg_gen_gvec_2(dofs, aofs, opsz, clsz, &g);
+}
+
+void tcg_gen_gvec_not(uint32_t dofs, uint32_t aofs,
+                      uint32_t opsz, uint32_t clsz)
+{
+    static const GVecGen2 g = {
+        .fni8 = tcg_gen_not_i64,
+        .fniv = tcg_gen_not_vec,
+        .fno = gen_helper_gvec_not,
+        .prefer_i64 = TCG_TARGET_REG_BITS == 64,
+    };
+    tcg_gen_gvec_2(dofs, aofs, opsz, clsz, &g);
+}
+
+static void gen_addv_mask(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b, TCGv_i64 m)
+{
+    TCGv_i64 t1 = tcg_temp_new_i64();
+    TCGv_i64 t2 = tcg_temp_new_i64();
+    TCGv_i64 t3 = tcg_temp_new_i64();
+
+    tcg_gen_andc_i64(t1, a, m);
+    tcg_gen_andc_i64(t2, b, m);
+    tcg_gen_xor_i64(t3, a, b);
+    tcg_gen_add_i64(d, t1, t2);
+    tcg_gen_and_i64(t3, t3, m);
+    tcg_gen_xor_i64(d, d, t3);
+
+    tcg_temp_free_i64(t1);
+    tcg_temp_free_i64(t2);
+    tcg_temp_free_i64(t3);
+}
+
+void tcg_gen_vec_add8_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
+{
+    TCGv_i64 m = tcg_const_i64(REP8(0x80));
+    gen_addv_mask(d, a, b, m);
+    tcg_temp_free_i64(m);
+}
+
+void tcg_gen_vec_add16_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
+{
+    TCGv_i64 m = tcg_const_i64(REP16(0x8000));
+    gen_addv_mask(d, a, b, m);
+    tcg_temp_free_i64(m);
+}
+
+void tcg_gen_vec_add32_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
+{
+    TCGv_i64 t1 = tcg_temp_new_i64();
+    TCGv_i64 t2 = tcg_temp_new_i64();
+
+    tcg_gen_andi_i64(t1, a, ~0xffffffffull);
+    tcg_gen_add_i64(t2, a, b);
+    tcg_gen_add_i64(t1, t1, b);
+    tcg_gen_deposit_i64(d, t1, t2, 0, 32);
+
+    tcg_temp_free_i64(t1);
+    tcg_temp_free_i64(t2);
+}
+
+void tcg_gen_gvec_add8(uint32_t dofs, uint32_t aofs, uint32_t bofs,
+                       uint32_t opsz, uint32_t clsz)
+{
+    static const GVecGen3 g = {
+        .fni8 = tcg_gen_vec_add8_i64,
+        .fniv = tcg_gen_add8_vec,
+        .fno = gen_helper_gvec_add8,
+    };
+    tcg_gen_gvec_3(dofs, aofs, bofs, opsz, clsz, &g);
+}
+
+void tcg_gen_gvec_add16(uint32_t dofs, uint32_t aofs, uint32_t bofs,
+                        uint32_t opsz, uint32_t clsz)
+{
+    static const GVecGen3 g = {
+        .fni8 = tcg_gen_vec_add16_i64,
+        .fniv = tcg_gen_add16_vec,
+        .fno = gen_helper_gvec_add16,
+    };
+    tcg_gen_gvec_3(dofs, aofs, bofs, opsz, clsz, &g);
+}
+
+void tcg_gen_gvec_add32(uint32_t dofs, uint32_t aofs, uint32_t bofs,
+                        uint32_t opsz, uint32_t clsz)
+{
+    static const GVecGen3 g = {
+        .fni4 = tcg_gen_add_i32,
+        .fniv = tcg_gen_add32_vec,
+        .fno = gen_helper_gvec_add32,
+    };
+    tcg_gen_gvec_3(dofs, aofs, bofs, opsz, clsz, &g);
+}
+
+void tcg_gen_gvec_add64(uint32_t dofs, uint32_t aofs, uint32_t bofs,
+                        uint32_t opsz, uint32_t clsz)
+{
+    static const GVecGen3 g = {
+        .fni8 = tcg_gen_add_i64,
+        .fniv = tcg_gen_add64_vec,
+        .fno = gen_helper_gvec_add64,
+        .prefer_i64 = TCG_TARGET_REG_BITS == 64,
+    };
+    tcg_gen_gvec_3(dofs, aofs, bofs, opsz, clsz, &g);
+}
+
+static void gen_subv_mask(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b, TCGv_i64 m)
+{
+    TCGv_i64 t1 = tcg_temp_new_i64();
+    TCGv_i64 t2 = tcg_temp_new_i64();
+    TCGv_i64 t3 = tcg_temp_new_i64();
+
+    tcg_gen_or_i64(t1, a, m);
+    tcg_gen_andc_i64(t2, b, m);
+    tcg_gen_eqv_i64(t3, a, b);
+    tcg_gen_sub_i64(d, t1, t2);
+    tcg_gen_and_i64(t3, t3, m);
+    tcg_gen_xor_i64(d, d, t3);
+
+    tcg_temp_free_i64(t1);
+    tcg_temp_free_i64(t2);
+    tcg_temp_free_i64(t3);
+}
+
+void tcg_gen_vec_sub8_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
+{
+    TCGv_i64 m = tcg_const_i64(REP8(0x80));
+    gen_subv_mask(d, a, b, m);
+    tcg_temp_free_i64(m);
+}
+
+void tcg_gen_vec_sub16_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
+{
+    TCGv_i64 m = tcg_const_i64(REP16(0x8000));
+    gen_subv_mask(d, a, b, m);
+    tcg_temp_free_i64(m);
+}
+
+void tcg_gen_vec_sub32_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
+{
+    TCGv_i64 t1 = tcg_temp_new_i64();
+    TCGv_i64 t2 = tcg_temp_new_i64();
+
+    tcg_gen_andi_i64(t1, b, ~0xffffffffull);
+    tcg_gen_sub_i64(t2, a, b);
+    tcg_gen_sub_i64(t1, a, t1);
+    tcg_gen_deposit_i64(d, t1, t2, 0, 32);
+
+    tcg_temp_free_i64(t1);
+    tcg_temp_free_i64(t2);
+}
+
+void tcg_gen_gvec_sub8(uint32_t dofs, uint32_t aofs, uint32_t bofs,
+                       uint32_t opsz, uint32_t clsz)
+{
+    static const GVecGen3 g = {
+        .fni8 = tcg_gen_vec_sub8_i64,
+        .fniv = tcg_gen_sub8_vec,
+        .fno = gen_helper_gvec_sub8,
+    };
+    tcg_gen_gvec_3(dofs, aofs, bofs, opsz, clsz, &g);
+}
+
+void tcg_gen_gvec_sub16(uint32_t dofs, uint32_t aofs, uint32_t bofs,
+                        uint32_t opsz, uint32_t clsz)
+{
+    static const GVecGen3 g = {
+        .fni8 = tcg_gen_vec_sub16_i64,
+        .fniv = tcg_gen_sub16_vec,
+        .fno = gen_helper_gvec_sub16,
+    };
+    tcg_gen_gvec_3(dofs, aofs, bofs, opsz, clsz, &g);
+}
+
+void tcg_gen_gvec_sub32(uint32_t dofs, uint32_t aofs, uint32_t bofs,
+                        uint32_t opsz, uint32_t clsz)
+{
+    static const GVecGen3 g = {
+        .fni4 = tcg_gen_sub_i32,
+        .fniv = tcg_gen_sub32_vec,
+        .fno = gen_helper_gvec_sub32,
+    };
+    tcg_gen_gvec_3(dofs, aofs, bofs, opsz, clsz, &g);
+}
+
+void tcg_gen_gvec_sub64(uint32_t dofs, uint32_t aofs, uint32_t bofs,
+                        uint32_t opsz, uint32_t clsz)
+{
+    static const GVecGen3 g = {
+        .fni8 = tcg_gen_sub_i64,
+        .fniv = tcg_gen_sub64_vec,
+        .fno = gen_helper_gvec_sub64,
+        .prefer_i64 = TCG_TARGET_REG_BITS == 64,
+    };
+    tcg_gen_gvec_3(dofs, aofs, bofs, opsz, clsz, &g);
+}
+
+static void gen_negv_mask(TCGv_i64 d, TCGv_i64 b, TCGv_i64 m)
+{
+    TCGv_i64 t2 = tcg_temp_new_i64();
+    TCGv_i64 t3 = tcg_temp_new_i64();
+
+    tcg_gen_andc_i64(t3, m, b);
+    tcg_gen_andc_i64(t2, b, m);
+    tcg_gen_sub_i64(d, m, t2);
+    tcg_gen_xor_i64(d, d, t3);
+
+    tcg_temp_free_i64(t2);
+    tcg_temp_free_i64(t3);
+}
+
+void tcg_gen_vec_neg8_i64(TCGv_i64 d, TCGv_i64 b)
+{
+    TCGv_i64 m = tcg_const_i64(REP8(0x80));
+    gen_negv_mask(d, b, m);
+    tcg_temp_free_i64(m);
+}
+
+void tcg_gen_vec_neg16_i64(TCGv_i64 d, TCGv_i64 b)
+{
+    TCGv_i64 m = tcg_const_i64(REP16(0x8000));
+    gen_negv_mask(d, b, m);
+    tcg_temp_free_i64(m);
+}
+
+void tcg_gen_vec_neg32_i64(TCGv_i64 d, TCGv_i64 b)
+{
+    TCGv_i64 t1 = tcg_temp_new_i64();
+    TCGv_i64 t2 = tcg_temp_new_i64();
+
+    tcg_gen_andi_i64(t1, b, ~0xffffffffull);
+    tcg_gen_neg_i64(t2, b);
+    tcg_gen_neg_i64(t1, t1);
+    tcg_gen_deposit_i64(d, t1, t2, 0, 32);
+
+    tcg_temp_free_i64(t1);
+    tcg_temp_free_i64(t2);
+}
+
+void tcg_gen_gvec_neg8(uint32_t dofs, uint32_t aofs,
+                       uint32_t opsz, uint32_t clsz)
+{
+    static const GVecGen2 g = {
+        .fni8 = tcg_gen_vec_neg8_i64,
+        .fniv = tcg_gen_neg8_vec,
+        .fno = gen_helper_gvec_neg8,
+    };
+    tcg_gen_gvec_2(dofs, aofs, opsz, clsz, &g);
+}
+
+void tcg_gen_gvec_neg16(uint32_t dofs, uint32_t aofs,
+                        uint32_t opsz, uint32_t clsz)
+{
+    static const GVecGen2 g = {
+        .fni8 = tcg_gen_vec_neg16_i64,
+        .fniv = tcg_gen_neg16_vec,
+        .fno = gen_helper_gvec_neg16,
+    };
+    tcg_gen_gvec_2(dofs, aofs, opsz, clsz, &g);
+}
+
+void tcg_gen_gvec_neg32(uint32_t dofs, uint32_t aofs,
+                        uint32_t opsz, uint32_t clsz)
+{
+    static const GVecGen2 g = {
+        .fni4 = tcg_gen_neg_i32,
+        .fniv = tcg_gen_neg32_vec,
+        .fno = gen_helper_gvec_neg32,
+    };
+    tcg_gen_gvec_2(dofs, aofs, opsz, clsz, &g);
+}
+
+void tcg_gen_gvec_neg64(uint32_t dofs, uint32_t aofs,
+                        uint32_t opsz, uint32_t clsz)
+{
+    static const GVecGen2 g = {
+        .fni8 = tcg_gen_neg_i64,
+        .fniv = tcg_gen_neg64_vec,
+        .fno = gen_helper_gvec_neg64,
+        .prefer_i64 = TCG_TARGET_REG_BITS == 64,
+    };
+    tcg_gen_gvec_2(dofs, aofs, opsz, clsz, &g);
+}
+
+void tcg_gen_gvec_and(uint32_t dofs, uint32_t aofs, uint32_t bofs,
+                      uint32_t opsz, uint32_t clsz)
+{
+    static const GVecGen3 g = {
+        .fni8 = tcg_gen_and_i64,
+        .fniv = tcg_gen_and_vec,
+        .fno = gen_helper_gvec_and,
+        .prefer_i64 = TCG_TARGET_REG_BITS == 64,
+    };
+    tcg_gen_gvec_3(dofs, aofs, bofs, opsz, clsz, &g);
+}
+
+void tcg_gen_gvec_or(uint32_t dofs, uint32_t aofs, uint32_t bofs,
+                     uint32_t opsz, uint32_t clsz)
+{
+    static const GVecGen3 g = {
+        .fni8 = tcg_gen_or_i64,
+        .fniv = tcg_gen_or_vec,
+        .fno = gen_helper_gvec_or,
+        .prefer_i64 = TCG_TARGET_REG_BITS == 64,
+    };
+    tcg_gen_gvec_3(dofs, aofs, bofs, opsz, clsz, &g);
+}
+
+void tcg_gen_gvec_xor(uint32_t dofs, uint32_t aofs, uint32_t bofs,
+                      uint32_t opsz, uint32_t clsz)
+{
+    static const GVecGen3 g = {
+        .fni8 = tcg_gen_xor_i64,
+        .fniv = tcg_gen_xor_vec,
+        .fno = gen_helper_gvec_xor,
+        .prefer_i64 = TCG_TARGET_REG_BITS == 64,
+    };
+    tcg_gen_gvec_3(dofs, aofs, bofs, opsz, clsz, &g);
+}
+
+void tcg_gen_gvec_andc(uint32_t dofs, uint32_t aofs, uint32_t bofs,
+                       uint32_t opsz, uint32_t clsz)
+{
+    static const GVecGen3 g = {
+        .fni8 = tcg_gen_andc_i64,
+        .fniv = tcg_gen_andc_vec,
+        .fno = gen_helper_gvec_andc,
+        .prefer_i64 = TCG_TARGET_REG_BITS == 64,
+    };
+    tcg_gen_gvec_3(dofs, aofs, bofs, opsz, clsz, &g);
+}
+
+void tcg_gen_gvec_orc(uint32_t dofs, uint32_t aofs, uint32_t bofs,
+                      uint32_t opsz, uint32_t clsz)
+{
+    static const GVecGen3 g = {
+        .fni8 = tcg_gen_orc_i64,
+        .fniv = tcg_gen_orc_vec,
+        .fno = gen_helper_gvec_orc,
+        .prefer_i64 = TCG_TARGET_REG_BITS == 64,
+    };
+    tcg_gen_gvec_3(dofs, aofs, bofs, opsz, clsz, &g);
+}
diff --git a/accel/tcg/Makefile.objs b/accel/tcg/Makefile.objs
index 228cd84fa4..d381a02f34 100644
--- a/accel/tcg/Makefile.objs
+++ b/accel/tcg/Makefile.objs
@@ -1,6 +1,6 @@
 obj-$(CONFIG_SOFTMMU) += tcg-all.o
 obj-$(CONFIG_SOFTMMU) += cputlb.o
-obj-y += tcg-runtime.o
+obj-y += tcg-runtime.o tcg-runtime-gvec.o
 obj-y += cpu-exec.o cpu-exec-common.o translate-all.o
 obj-y += translator.o
 
-- 
2.13.5

^ permalink raw reply related	[flat|nested] 14+ messages in thread
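
The expanders above walk the available host vector sizes from largest to
smallest, peeling off as many aligned chunks as each size covers before
falling back to i64/i32 pieces and, finally, the out-of-line helper for any
remainder.  A minimal model of that size walk in plain C (illustrative only;
the real code also consults TCG_TARGET_HAS_v* and the fni8/fni4 callbacks
before committing to the 8- and 4-byte steps):

    #include <stdint.h>
    #include <stdio.h>

    /* Largest host vector first: v256 -> v128 -> v64/i64 -> i32.  */
    static void expand_sizes(uint32_t oprsz)
    {
        static const uint32_t steps[] = { 32, 16, 8, 4 };
        printf("oprsz %u ->", oprsz);
        for (int i = 0; i < 4 && oprsz; i++) {
            uint32_t done = oprsz & -steps[i];   /* QEMU_ALIGN_DOWN */
            for (uint32_t j = 0; j < done / steps[i]; j++) {
                printf(" %u", steps[i]);
            }
            oprsz -= done;
        }
        if (oprsz) {
            printf(" +helper(%u)", oprsz);       /* leftover goes out of line */
        }
        printf("\n");
    }

    int main(void)
    {
        expand_sizes(80);   /* 80 -> 32 32 16, as in the comment above */
        expand_sizes(24);   /* 24 -> 16 8 */
        return 0;
    }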
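
gen_addv_mask() and gen_subv_mask() get per-lane arithmetic out of a plain
64-bit add/sub by clearing the top bit of every lane first, so a carry can
never cross a lane boundary, and then repairing the top bits carry-lessly
with xor.  The add identity is easy to check in isolation; a standalone
sketch in plain C (not QEMU code; REP8 simply replicates a byte, mirroring
the mask constant used above):

    #include <stdint.h>
    #include <stdio.h>

    #define REP8(x)  ((uint64_t)(x) * 0x0101010101010101ull)

    /* Eight independent 8-bit additions inside one uint64_t.  */
    static uint64_t addv8(uint64_t a, uint64_t b)
    {
        uint64_t m = REP8(0x80);            /* top bit of each byte lane */
        uint64_t lo = (a & ~m) + (b & ~m);  /* carries stay inside lanes */
        return lo ^ ((a ^ b) & m);          /* restore top bits, no carry */
    }

    int main(void)
    {
        uint64_t a = 0x00ff7f0102030405ull;
        uint64_t b = REP8(0x01);
        /* Per-byte sums: 01 00 80 02 03 04 05 06.  */
        printf("%016llx\n", (unsigned long long)addv8(a, b));
        return 0;
    }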

* [Qemu-devel] [PATCH v3 3/6] target/arm: Align vector registers
  2017-09-16  2:34 [Qemu-devel] [PATCH v3 0/6] TCG vectorization and example conversion Richard Henderson
  2017-09-16  2:34 ` [Qemu-devel] [PATCH v3 1/6] tcg: Add types and operations for host vectors Richard Henderson
  2017-09-16  2:34 ` [Qemu-devel] [PATCH v3 2/6] tcg: Add vector expanders Richard Henderson
@ 2017-09-16  2:34 ` Richard Henderson
  2017-09-26 22:33   ` Alex Bennée
  2017-09-16  2:34 ` [Qemu-devel] [PATCH v3 4/6] target/arm: Use vector infrastructure for aa64 add/sub/logic Richard Henderson
                   ` (4 subsequent siblings)
  7 siblings, 1 reply; 14+ messages in thread
From: Richard Henderson @ 2017-09-16  2:34 UTC (permalink / raw)
  To: qemu-devel; +Cc: alex.bennee, f4bug

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 target/arm/cpu.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/target/arm/cpu.h b/target/arm/cpu.h
index 98b9b26fd3..c346bd148f 100644
--- a/target/arm/cpu.h
+++ b/target/arm/cpu.h
@@ -486,7 +486,7 @@ typedef struct CPUARMState {
          * the two execution states, and means we do not need to explicitly
          * map these registers when changing states.
          */
-        float64 regs[64];
+        float64 regs[64] QEMU_ALIGNED(16);
 
         uint32_t xregs[16];
         /* We store these fpcsr fields separately for convenience.  */
-- 
2.13.5

^ permalink raw reply related	[flat|nested] 14+ messages in thread
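
The alignment annotation matters because the gvec expansion introduced
earlier in the series now accesses the Q registers with 8- and 16-byte host
vector loads and stores; keeping vfp.regs on a 16-byte boundary keeps those
accesses naturally aligned.  A small standalone illustration (the struct
here is invented for the demo; QEMU_ALIGNED(16) is assumed to expand to
__attribute__((aligned(16)))):

    #include <assert.h>
    #include <stdint.h>

    typedef struct {
        uint32_t xregs[16];
        uint64_t regs[64] __attribute__((aligned(16)));
    } DemoVFPState;

    int main(void)
    {
        DemoVFPState s;
        /* Each Qn occupies regs[2n..2n+1] and starts on a 16-byte boundary. */
        for (int n = 0; n < 32; n++) {
            assert(((uintptr_t)&s.regs[n * 2] & 15) == 0);
        }
        return 0;
    }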

* [Qemu-devel] [PATCH v3 4/6] target/arm: Use vector infrastructure for aa64 add/sub/logic
  2017-09-16  2:34 [Qemu-devel] [PATCH v3 0/6] TCG vectorization and example conversion Richard Henderson
                   ` (2 preceding siblings ...)
  2017-09-16  2:34 ` [Qemu-devel] [PATCH v3 3/6] target/arm: Align vector registers Richard Henderson
@ 2017-09-16  2:34 ` Richard Henderson
  2017-09-26 23:12   ` Alex Bennée
  2017-09-16  2:34 ` [Qemu-devel] [PATCH v3 5/6] tcg/i386: Add vector operations Richard Henderson
                   ` (3 subsequent siblings)
  7 siblings, 1 reply; 14+ messages in thread
From: Richard Henderson @ 2017-09-16  2:34 UTC (permalink / raw)
  To: qemu-devel; +Cc: alex.bennee, f4bug

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 target/arm/translate-a64.c | 216 ++++++++++++++++++++++++++++++---------------
 1 file changed, 143 insertions(+), 73 deletions(-)

diff --git a/target/arm/translate-a64.c b/target/arm/translate-a64.c
index a3984c9a0d..4759cc9829 100644
--- a/target/arm/translate-a64.c
+++ b/target/arm/translate-a64.c
@@ -21,6 +21,7 @@
 #include "cpu.h"
 #include "exec/exec-all.h"
 #include "tcg-op.h"
+#include "tcg-op-gvec.h"
 #include "qemu/log.h"
 #include "arm_ldst.h"
 #include "translate.h"
@@ -82,6 +83,7 @@ typedef void NeonGenTwoDoubleOPFn(TCGv_i64, TCGv_i64, TCGv_i64, TCGv_ptr);
 typedef void NeonGenOneOpFn(TCGv_i64, TCGv_i64);
 typedef void CryptoTwoOpEnvFn(TCGv_ptr, TCGv_i32, TCGv_i32);
 typedef void CryptoThreeOpEnvFn(TCGv_ptr, TCGv_i32, TCGv_i32, TCGv_i32);
+typedef void GVecGenTwoFn(uint32_t, uint32_t, uint32_t, uint32_t, uint32_t);
 
 /* initialize TCG globals.  */
 void a64_translate_init(void)
@@ -537,6 +539,21 @@ static inline int vec_reg_offset(DisasContext *s, int regno,
     return offs;
 }
 
+/* Return the offset into CPUARMState of the "whole" vector register Qn.  */
+static inline int vec_full_reg_offset(DisasContext *s, int regno)
+{
+    assert_fp_access_checked(s);
+    return offsetof(CPUARMState, vfp.regs[regno * 2]);
+}
+
+/* Return the byte size of the "whole" vector register, VL / 8.  */
+static inline int vec_full_reg_size(DisasContext *s)
+{
+    /* FIXME SVE: We should put the composite ZCR_EL* value into tb->flags.
+       In the meantime this is just the AdvSIMD length of 128 bits.  */
+    return 128 / 8;
+}
+
 /* Return the offset into CPUARMState of a slice (from
  * the least significant end) of FP register Qn (ie
  * Dn, Sn, Hn or Bn).
@@ -9036,85 +9053,125 @@ static void disas_simd_three_reg_diff(DisasContext *s, uint32_t insn)
     }
 }
 
+static void gen_bsl_i64(TCGv_i64 rd, TCGv_i64 rn, TCGv_i64 rm)
+{
+    tcg_gen_xor_i64(rn, rn, rm);
+    tcg_gen_and_i64(rn, rn, rd);
+    tcg_gen_xor_i64(rd, rm, rn);
+}
+
+static void gen_bit_i64(TCGv_i64 rd, TCGv_i64 rn, TCGv_i64 rm)
+{
+    tcg_gen_xor_i64(rn, rn, rd);
+    tcg_gen_and_i64(rn, rn, rm);
+    tcg_gen_xor_i64(rd, rd, rn);
+}
+
+static void gen_bif_i64(TCGv_i64 rd, TCGv_i64 rn, TCGv_i64 rm)
+{
+    tcg_gen_xor_i64(rn, rn, rd);
+    tcg_gen_andc_i64(rn, rn, rm);
+    tcg_gen_xor_i64(rd, rd, rn);
+}
+
+static void gen_bsl_vec(TCGv_vec rd, TCGv_vec rn, TCGv_vec rm)
+{
+    tcg_gen_xor_vec(rn, rn, rm);
+    tcg_gen_and_vec(rn, rn, rd);
+    tcg_gen_xor_vec(rd, rm, rn);
+}
+
+static void gen_bit_vec(TCGv_vec rd, TCGv_vec rn, TCGv_vec rm)
+{
+    tcg_gen_xor_vec(rn, rn, rd);
+    tcg_gen_and_vec(rn, rn, rm);
+    tcg_gen_xor_vec(rd, rd, rn);
+}
+
+static void gen_bif_vec(TCGv_vec rd, TCGv_vec rn, TCGv_vec rm)
+{
+    tcg_gen_xor_vec(rn, rn, rd);
+    tcg_gen_andc_vec(rn, rn, rm);
+    tcg_gen_xor_vec(rd, rd, rn);
+}
+
 /* Logic op (opcode == 3) subgroup of C3.6.16. */
 static void disas_simd_3same_logic(DisasContext *s, uint32_t insn)
 {
+    static const GVecGen3 bsl_op = {
+        .fni8 = gen_bsl_i64,
+        .fniv = gen_bsl_vec,
+        .prefer_i64 = TCG_TARGET_REG_BITS == 64,
+        .load_dest = true
+    };
+    static const GVecGen3 bit_op = {
+        .fni8 = gen_bit_i64,
+        .fniv = gen_bit_vec,
+        .prefer_i64 = TCG_TARGET_REG_BITS == 64,
+        .load_dest = true
+    };
+    static const GVecGen3 bif_op = {
+        .fni8 = gen_bif_i64,
+        .fniv = gen_bif_vec,
+        .prefer_i64 = TCG_TARGET_REG_BITS == 64,
+        .load_dest = true
+    };
+
     int rd = extract32(insn, 0, 5);
     int rn = extract32(insn, 5, 5);
     int rm = extract32(insn, 16, 5);
     int size = extract32(insn, 22, 2);
     bool is_u = extract32(insn, 29, 1);
     bool is_q = extract32(insn, 30, 1);
-    TCGv_i64 tcg_op1, tcg_op2, tcg_res[2];
-    int pass;
+    GVecGenTwoFn *gvec_fn;
+    const GVecGen3 *gvec_op;
 
     if (!fp_access_check(s)) {
         return;
     }
 
-    tcg_op1 = tcg_temp_new_i64();
-    tcg_op2 = tcg_temp_new_i64();
-    tcg_res[0] = tcg_temp_new_i64();
-    tcg_res[1] = tcg_temp_new_i64();
-
-    for (pass = 0; pass < (is_q ? 2 : 1); pass++) {
-        read_vec_element(s, tcg_op1, rn, pass, MO_64);
-        read_vec_element(s, tcg_op2, rm, pass, MO_64);
-
-        if (!is_u) {
-            switch (size) {
-            case 0: /* AND */
-                tcg_gen_and_i64(tcg_res[pass], tcg_op1, tcg_op2);
-                break;
-            case 1: /* BIC */
-                tcg_gen_andc_i64(tcg_res[pass], tcg_op1, tcg_op2);
-                break;
-            case 2: /* ORR */
-                tcg_gen_or_i64(tcg_res[pass], tcg_op1, tcg_op2);
-                break;
-            case 3: /* ORN */
-                tcg_gen_orc_i64(tcg_res[pass], tcg_op1, tcg_op2);
-                break;
-            }
-        } else {
-            if (size != 0) {
-                /* B* ops need res loaded to operate on */
-                read_vec_element(s, tcg_res[pass], rd, pass, MO_64);
-            }
-
-            switch (size) {
-            case 0: /* EOR */
-                tcg_gen_xor_i64(tcg_res[pass], tcg_op1, tcg_op2);
-                break;
-            case 1: /* BSL bitwise select */
-                tcg_gen_xor_i64(tcg_op1, tcg_op1, tcg_op2);
-                tcg_gen_and_i64(tcg_op1, tcg_op1, tcg_res[pass]);
-                tcg_gen_xor_i64(tcg_res[pass], tcg_op2, tcg_op1);
-                break;
-            case 2: /* BIT, bitwise insert if true */
-                tcg_gen_xor_i64(tcg_op1, tcg_op1, tcg_res[pass]);
-                tcg_gen_and_i64(tcg_op1, tcg_op1, tcg_op2);
-                tcg_gen_xor_i64(tcg_res[pass], tcg_res[pass], tcg_op1);
-                break;
-            case 3: /* BIF, bitwise insert if false */
-                tcg_gen_xor_i64(tcg_op1, tcg_op1, tcg_res[pass]);
-                tcg_gen_andc_i64(tcg_op1, tcg_op1, tcg_op2);
-                tcg_gen_xor_i64(tcg_res[pass], tcg_res[pass], tcg_op1);
-                break;
-            }
-        }
-    }
+    switch (size + 4 * is_u) {
+    case 0: /* AND */
+        gvec_fn = tcg_gen_gvec_and;
+        goto do_fn;
+    case 1: /* BIC */
+        gvec_fn = tcg_gen_gvec_andc;
+        goto do_fn;
+    case 2: /* ORR */
+        gvec_fn = tcg_gen_gvec_or;
+        goto do_fn;
+    case 3: /* ORN */
+        gvec_fn = tcg_gen_gvec_orc;
+        goto do_fn;
+    case 4: /* EOR */
+        gvec_fn = tcg_gen_gvec_xor;
+        goto do_fn;
+    do_fn:
+        gvec_fn(vec_full_reg_offset(s, rd),
+                vec_full_reg_offset(s, rn),
+                vec_full_reg_offset(s, rm),
+                is_q ? 16 : 8, vec_full_reg_size(s));
+        return;
+
+    case 5: /* BSL bitwise select */
+        gvec_op = &bsl_op;
+        goto do_op;
+    case 6: /* BIT, bitwise insert if true */
+        gvec_op = &bit_op;
+        goto do_op;
+    case 7: /* BIF, bitwise insert if false */
+        gvec_op = &bif_op;
+        goto do_op;
+    do_op:
+        tcg_gen_gvec_3(vec_full_reg_offset(s, rd),
+                       vec_full_reg_offset(s, rn),
+                       vec_full_reg_offset(s, rm),
+                       is_q ? 16 : 8, vec_full_reg_size(s), gvec_op);
+        return;
 
-    write_vec_element(s, tcg_res[0], rd, 0, MO_64);
-    if (!is_q) {
-        tcg_gen_movi_i64(tcg_res[1], 0);
+    default:
+        g_assert_not_reached();
     }
-    write_vec_element(s, tcg_res[1], rd, 1, MO_64);
-
-    tcg_temp_free_i64(tcg_op1);
-    tcg_temp_free_i64(tcg_op2);
-    tcg_temp_free_i64(tcg_res[0]);
-    tcg_temp_free_i64(tcg_res[1]);
 }
 
 /* Helper functions for 32 bit comparisons */
@@ -9375,6 +9432,7 @@ static void disas_simd_3same_int(DisasContext *s, uint32_t insn)
     int rn = extract32(insn, 5, 5);
     int rd = extract32(insn, 0, 5);
     int pass;
+    GVecGenTwoFn *gvec_op;
 
     switch (opcode) {
     case 0x13: /* MUL, PMUL */
@@ -9414,6 +9472,28 @@ static void disas_simd_3same_int(DisasContext *s, uint32_t insn)
         return;
     }
 
+    switch (opcode) {
+    case 0x10: /* ADD, SUB */
+        {
+            static GVecGenTwoFn * const fns[4][2] = {
+                { tcg_gen_gvec_add8, tcg_gen_gvec_sub8 },
+                { tcg_gen_gvec_add16, tcg_gen_gvec_sub16 },
+                { tcg_gen_gvec_add32, tcg_gen_gvec_sub32 },
+                { tcg_gen_gvec_add64, tcg_gen_gvec_sub64 },
+            };
+            gvec_op = fns[size][u];
+            goto do_gvec;
+        }
+        break;
+
+    do_gvec:
+        gvec_op(vec_full_reg_offset(s, rd),
+                vec_full_reg_offset(s, rn),
+                vec_full_reg_offset(s, rm),
+                is_q ? 16 : 8, vec_full_reg_size(s));
+        return;
+    }
+
     if (size == 3) {
         assert(is_q);
         for (pass = 0; pass < 2; pass++) {
@@ -9586,16 +9666,6 @@ static void disas_simd_3same_int(DisasContext *s, uint32_t insn)
                 genfn = fns[size][u];
                 break;
             }
-            case 0x10: /* ADD, SUB */
-            {
-                static NeonGenTwoOpFn * const fns[3][2] = {
-                    { gen_helper_neon_add_u8, gen_helper_neon_sub_u8 },
-                    { gen_helper_neon_add_u16, gen_helper_neon_sub_u16 },
-                    { tcg_gen_add_i32, tcg_gen_sub_i32 },
-                };
-                genfn = fns[size][u];
-                break;
-            }
             case 0x11: /* CMTST, CMEQ */
             {
                 static NeonGenTwoOpFn * const fns[3][2] = {
-- 
2.13.5

^ permalink raw reply related	[flat|nested] 14+ messages in thread
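
The xor/and/xor sequences above are branch-free forms of the AArch64
BSL/BIT/BIF bit-selects, which is what lets one GVecGen3 description serve
both the i64 and the host-vector expansion paths.  A quick standalone check
of the identities against the obvious per-bit selects (plain C, illustrative
only):

    #include <assert.h>
    #include <stdint.h>

    /* d selects: take n where d is 1, m where d is 0.  */
    static uint64_t bsl(uint64_t d, uint64_t n, uint64_t m)
    {
        return ((n ^ m) & d) ^ m;
    }

    /* Insert n into d wherever m is 1.  */
    static uint64_t bit(uint64_t d, uint64_t n, uint64_t m)
    {
        return ((n ^ d) & m) ^ d;
    }

    /* Insert n into d wherever m is 0.  */
    static uint64_t bif(uint64_t d, uint64_t n, uint64_t m)
    {
        return ((n ^ d) & ~m) ^ d;
    }

    int main(void)
    {
        uint64_t d = 0xff00ff00ff00ff00ull;
        uint64_t n = 0x0123456789abcdefull;
        uint64_t m = 0xaaaaaaaaaaaaaaaaull;

        assert(bsl(d, n, m) == ((n & d) | (m & ~d)));
        assert(bit(d, n, m) == ((n & m) | (d & ~m)));
        assert(bif(d, n, m) == ((n & ~m) | (d & m)));
        return 0;
    }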

* [Qemu-devel] [PATCH v3 5/6] tcg/i386: Add vector operations
  2017-09-16  2:34 [Qemu-devel] [PATCH v3 0/6] TCG vectorization and example conversion Richard Henderson
                   ` (3 preceding siblings ...)
  2017-09-16  2:34 ` [Qemu-devel] [PATCH v3 4/6] target/arm: Use vector infrastructure for aa64 add/sub/logic Richard Henderson
@ 2017-09-16  2:34 ` Richard Henderson
  2017-09-16  2:34 ` [Qemu-devel] [PATCH v3 6/6] tcg/aarch64: " Richard Henderson
                   ` (2 subsequent siblings)
  7 siblings, 0 replies; 14+ messages in thread
From: Richard Henderson @ 2017-09-16  2:34 UTC (permalink / raw)
  To: qemu-devel; +Cc: alex.bennee, f4bug

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/i386/tcg-target.h     |  36 +++-
 tcg/i386/tcg-target.inc.c | 423 +++++++++++++++++++++++++++++++++++++++++-----
 2 files changed, 413 insertions(+), 46 deletions(-)

diff --git a/tcg/i386/tcg-target.h b/tcg/i386/tcg-target.h
index b89dababf4..df69f8db91 100644
--- a/tcg/i386/tcg-target.h
+++ b/tcg/i386/tcg-target.h
@@ -30,11 +30,10 @@
 
 #ifdef __x86_64__
 # define TCG_TARGET_REG_BITS  64
-# define TCG_TARGET_NB_REGS   16
 #else
 # define TCG_TARGET_REG_BITS  32
-# define TCG_TARGET_NB_REGS    8
 #endif
+# define TCG_TARGET_NB_REGS   24
 
 typedef enum {
     TCG_REG_EAX = 0,
@@ -56,6 +55,19 @@ typedef enum {
     TCG_REG_R13,
     TCG_REG_R14,
     TCG_REG_R15,
+
+    /* SSE registers; 64-bit has access to 8 more, but we won't
+       need more than a few and using only the first 8 minimizes
+       the need for a rex prefix on the sse instructions.  */
+    TCG_REG_XMM0,
+    TCG_REG_XMM1,
+    TCG_REG_XMM2,
+    TCG_REG_XMM3,
+    TCG_REG_XMM4,
+    TCG_REG_XMM5,
+    TCG_REG_XMM6,
+    TCG_REG_XMM7,
+
     TCG_REG_RAX = TCG_REG_EAX,
     TCG_REG_RCX = TCG_REG_ECX,
     TCG_REG_RDX = TCG_REG_EDX,
@@ -78,6 +90,17 @@ typedef enum {
 extern bool have_bmi1;
 extern bool have_popcnt;
 
+#ifdef __SSE2__
+#define have_sse2  true
+#else
+extern bool have_sse2;
+#endif
+#ifdef __AVX2__
+#define have_avx2  true
+#else
+extern bool have_avx2;
+#endif
+
 /* optional instructions */
 #define TCG_TARGET_HAS_div2_i32         1
 #define TCG_TARGET_HAS_rot_i32          1
@@ -146,6 +169,15 @@ extern bool have_popcnt;
 #define TCG_TARGET_HAS_mulsh_i64        0
 #endif
 
+#define TCG_TARGET_HAS_v64              have_sse2
+#define TCG_TARGET_HAS_v128             have_sse2
+#define TCG_TARGET_HAS_v256             have_avx2
+
+#define TCG_TARGET_HAS_andc_vec         1
+#define TCG_TARGET_HAS_orc_vec          0
+#define TCG_TARGET_HAS_not_vec          0
+#define TCG_TARGET_HAS_neg_vec          0
+
 #define TCG_TARGET_deposit_i32_valid(ofs, len) \
     (((ofs) == 0 && (len) == 8) || ((ofs) == 8 && (len) == 8) || \
      ((ofs) == 0 && (len) == 16))
diff --git a/tcg/i386/tcg-target.inc.c b/tcg/i386/tcg-target.inc.c
index 69e49c9f58..df3be932d5 100644
--- a/tcg/i386/tcg-target.inc.c
+++ b/tcg/i386/tcg-target.inc.c
@@ -28,10 +28,11 @@
 static const char * const tcg_target_reg_names[TCG_TARGET_NB_REGS] = {
 #if TCG_TARGET_REG_BITS == 64
     "%rax", "%rcx", "%rdx", "%rbx", "%rsp", "%rbp", "%rsi", "%rdi",
-    "%r8",  "%r9",  "%r10", "%r11", "%r12", "%r13", "%r14", "%r15",
 #else
     "%eax", "%ecx", "%edx", "%ebx", "%esp", "%ebp", "%esi", "%edi",
 #endif
+    "%r8",  "%r9",  "%r10", "%r11", "%r12", "%r13", "%r14", "%r15",
+    "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7",
 };
 #endif
 
@@ -61,6 +62,14 @@ static const int tcg_target_reg_alloc_order[] = {
     TCG_REG_EDX,
     TCG_REG_EAX,
 #endif
+    TCG_REG_XMM0,
+    TCG_REG_XMM1,
+    TCG_REG_XMM2,
+    TCG_REG_XMM3,
+    TCG_REG_XMM4,
+    TCG_REG_XMM5,
+    TCG_REG_XMM6,
+    TCG_REG_XMM7,
 };
 
 static const int tcg_target_call_iarg_regs[] = {
@@ -94,7 +103,7 @@ static const int tcg_target_call_oarg_regs[] = {
 #define TCG_CT_CONST_I32 0x400
 #define TCG_CT_CONST_WSZ 0x800
 
-/* Registers used with L constraint, which are the first argument 
+/* Registers used with L constraint, which are the first argument
    registers on x86_64, and two random call clobbered registers on
    i386. */
 #if TCG_TARGET_REG_BITS == 64
@@ -126,6 +135,16 @@ static bool have_cmov;
 bool have_bmi1;
 bool have_popcnt;
 
+#ifndef have_sse2
+bool have_sse2;
+#endif
+#ifdef have_avx2
+#define have_avx1  have_avx2
+#else
+static bool have_avx1;
+bool have_avx2;
+#endif
+
 #ifdef CONFIG_CPUID_H
 static bool have_movbe;
 static bool have_bmi2;
@@ -192,14 +211,17 @@ static const char *target_parse_constraint(TCGArgConstraint *ct,
         tcg_regset_set_reg(ct->u.regs, TCG_REG_EDI);
         break;
     case 'q':
+        /* A register that can be used as a byte operand.  */
         ct->ct |= TCG_CT_REG;
         ct->u.regs = TCG_TARGET_REG_BITS == 64 ? 0xffff : 0xf;
         break;
     case 'Q':
+        /* A register with an addressable second byte (e.g. %ah).  */
         ct->ct |= TCG_CT_REG;
         ct->u.regs = 0xf;
         break;
     case 'r':
+        /* A general register.  */
         ct->ct |= TCG_CT_REG;
         ct->u.regs = TCG_TARGET_REG_BITS == 64 ? 0xffff : 0xff;
         break;
@@ -207,6 +229,11 @@ static const char *target_parse_constraint(TCGArgConstraint *ct,
         /* With TZCNT/LZCNT, we can have operand-size as an input.  */
         ct->ct |= TCG_CT_CONST_WSZ;
         break;
+    case 'x':
+        /* A vector register.  */
+        ct->ct |= TCG_CT_REG;
+        ct->u.regs = 0xff0000;
+        break;
 
         /* qemu_ld/st address constraint */
     case 'L':
@@ -277,8 +304,9 @@ static inline int tcg_target_const_match(tcg_target_long val, TCGType type,
 # define P_REXB_RM	0
 # define P_GS           0
 #endif
-#define P_SIMDF3        0x10000         /* 0xf3 opcode prefix */
-#define P_SIMDF2        0x20000         /* 0xf2 opcode prefix */
+#define P_SIMDF3        0x20000         /* 0xf3 opcode prefix */
+#define P_SIMDF2        0x40000         /* 0xf2 opcode prefix */
+#define P_VEXL          0x80000         /* Set VEX.L = 1 */
 
 #define OPC_ARITH_EvIz	(0x81)
 #define OPC_ARITH_EvIb	(0x83)
@@ -310,11 +338,30 @@ static inline int tcg_target_const_match(tcg_target_long val, TCGType type,
 #define OPC_MOVL_Iv     (0xb8)
 #define OPC_MOVBE_GyMy  (0xf0 | P_EXT38)
 #define OPC_MOVBE_MyGy  (0xf1 | P_EXT38)
+#define OPC_MOVDQA_GyMy (0x6f | P_EXT | P_DATA16)
+#define OPC_MOVDQA_MyGy (0x7f | P_EXT | P_DATA16)
+#define OPC_MOVDQU_GyMy (0x6f | P_EXT | P_SIMDF3)
+#define OPC_MOVDQU_MyGy (0x7f | P_EXT | P_SIMDF3)
+#define OPC_MOVQ_GyMy   (0x7e | P_EXT | P_SIMDF3)
+#define OPC_MOVQ_MyGy   (0xd6 | P_EXT | P_DATA16)
 #define OPC_MOVSBL	(0xbe | P_EXT)
 #define OPC_MOVSWL	(0xbf | P_EXT)
 #define OPC_MOVSLQ	(0x63 | P_REXW)
 #define OPC_MOVZBL	(0xb6 | P_EXT)
 #define OPC_MOVZWL	(0xb7 | P_EXT)
+#define OPC_PADDB       (0xfc | P_EXT | P_DATA16)
+#define OPC_PADDW       (0xfd | P_EXT | P_DATA16)
+#define OPC_PADDD       (0xfe | P_EXT | P_DATA16)
+#define OPC_PADDQ       (0xd4 | P_EXT | P_DATA16)
+#define OPC_PAND        (0xdb | P_EXT | P_DATA16)
+#define OPC_PANDN       (0xdf | P_EXT | P_DATA16)
+#define OPC_PCMPEQB     (0x74 | P_EXT | P_DATA16)
+#define OPC_POR         (0xeb | P_EXT | P_DATA16)
+#define OPC_PSUBB       (0xf8 | P_EXT | P_DATA16)
+#define OPC_PSUBW       (0xf9 | P_EXT | P_DATA16)
+#define OPC_PSUBD       (0xfa | P_EXT | P_DATA16)
+#define OPC_PSUBQ       (0xfb | P_EXT | P_DATA16)
+#define OPC_PXOR        (0xef | P_EXT | P_DATA16)
 #define OPC_POP_r32	(0x58)
 #define OPC_POPCNT      (0xb8 | P_EXT | P_SIMDF3)
 #define OPC_PUSH_r32	(0x50)
@@ -330,6 +377,7 @@ static inline int tcg_target_const_match(tcg_target_long val, TCGType type,
 #define OPC_SHRX        (0xf7 | P_EXT38 | P_SIMDF2)
 #define OPC_TESTL	(0x85)
 #define OPC_TZCNT       (0xbc | P_EXT | P_SIMDF3)
+#define OPC_VZEROUPPER  (0x77 | P_EXT)
 #define OPC_XCHG_ax_r32	(0x90)
 
 #define OPC_GRP3_Ev	(0xf7)
@@ -479,11 +527,20 @@ static void tcg_out_modrm(TCGContext *s, int opc, int r, int rm)
     tcg_out8(s, 0xc0 | (LOWREGMASK(r) << 3) | LOWREGMASK(rm));
 }
 
-static void tcg_out_vex_modrm(TCGContext *s, int opc, int r, int v, int rm)
+static void tcg_out_vex_opc(TCGContext *s, int opc, int r, int v,
+                            int rm, int index)
 {
     int tmp;
 
-    if ((opc & (P_REXW | P_EXT | P_EXT38)) || (rm & 8)) {
+    /* Use the two byte form if possible, which cannot encode
+       VEX.W, VEX.B, VEX.X, or an m-mmmm field other than P_EXT.  */
+    if ((opc & (P_EXT | P_EXT38 | P_REXW)) == P_EXT
+        && ((rm | index) & 8) == 0) {
+        /* Two byte VEX prefix.  */
+        tcg_out8(s, 0xc5);
+
+        tmp = (r & 8 ? 0 : 0x80);              /* VEX.R */
+    } else {
         /* Three byte VEX prefix.  */
         tcg_out8(s, 0xc4);
 
@@ -493,20 +550,17 @@ static void tcg_out_vex_modrm(TCGContext *s, int opc, int r, int v, int rm)
         } else if (opc & P_EXT) {
             tmp = 1;
         } else {
-            tcg_abort();
+            g_assert_not_reached();
         }
-        tmp |= 0x40;                       /* VEX.X */
-        tmp |= (r & 8 ? 0 : 0x80);         /* VEX.R */
-        tmp |= (rm & 8 ? 0 : 0x20);        /* VEX.B */
+        tmp |= (r & 8 ? 0 : 0x80);             /* VEX.R */
+        tmp |= (index & 8 ? 0 : 0x40);         /* VEX.X */
+        tmp |= (rm & 8 ? 0 : 0x20);            /* VEX.B */
         tcg_out8(s, tmp);
 
-        tmp = (opc & P_REXW ? 0x80 : 0);   /* VEX.W */
-    } else {
-        /* Two byte VEX prefix.  */
-        tcg_out8(s, 0xc5);
-
-        tmp = (r & 8 ? 0 : 0x80);          /* VEX.R */
+        tmp = (opc & P_REXW ? 0x80 : 0);       /* VEX.W */
     }
+
+    tmp |= (opc & P_VEXL ? 0x04 : 0);      /* VEX.L */
     /* VEX.pp */
     if (opc & P_DATA16) {
         tmp |= 1;                          /* 0x66 */
@@ -518,6 +572,11 @@ static void tcg_out_vex_modrm(TCGContext *s, int opc, int r, int v, int rm)
     tmp |= (~v & 15) << 3;                 /* VEX.vvvv */
     tcg_out8(s, tmp);
     tcg_out8(s, opc);
+}
+
+static void tcg_out_vex_modrm(TCGContext *s, int opc, int r, int v, int rm)
+{
+    tcg_out_vex_opc(s, opc, r, v, rm, 0);
     tcg_out8(s, 0xc0 | (LOWREGMASK(r) << 3) | LOWREGMASK(rm));
 }
 
@@ -526,8 +585,8 @@ static void tcg_out_vex_modrm(TCGContext *s, int opc, int r, int v, int rm)
    mode for absolute addresses, ~RM is the size of the immediate operand
    that will follow the instruction.  */
 
-static void tcg_out_modrm_sib_offset(TCGContext *s, int opc, int r, int rm,
-                                     int index, int shift, intptr_t offset)
+static void tcg_out_sib_offset(TCGContext *s, int r, int rm, int index,
+                               int shift, intptr_t offset)
 {
     int mod, len;
 
@@ -538,7 +597,6 @@ static void tcg_out_modrm_sib_offset(TCGContext *s, int opc, int r, int rm,
             intptr_t pc = (intptr_t)s->code_ptr + 5 + ~rm;
             intptr_t disp = offset - pc;
             if (disp == (int32_t)disp) {
-                tcg_out_opc(s, opc, r, 0, 0);
                 tcg_out8(s, (LOWREGMASK(r) << 3) | 5);
                 tcg_out32(s, disp);
                 return;
@@ -548,7 +606,6 @@ static void tcg_out_modrm_sib_offset(TCGContext *s, int opc, int r, int rm,
                use of the MODRM+SIB encoding and is therefore larger than
                rip-relative addressing.  */
             if (offset == (int32_t)offset) {
-                tcg_out_opc(s, opc, r, 0, 0);
                 tcg_out8(s, (LOWREGMASK(r) << 3) | 4);
                 tcg_out8(s, (4 << 3) | 5);
                 tcg_out32(s, offset);
@@ -556,10 +613,9 @@ static void tcg_out_modrm_sib_offset(TCGContext *s, int opc, int r, int rm,
             }
 
             /* ??? The memory isn't directly addressable.  */
-            tcg_abort();
+            g_assert_not_reached();
         } else {
             /* Absolute address.  */
-            tcg_out_opc(s, opc, r, 0, 0);
             tcg_out8(s, (r << 3) | 5);
             tcg_out32(s, offset);
             return;
@@ -582,7 +638,6 @@ static void tcg_out_modrm_sib_offset(TCGContext *s, int opc, int r, int rm,
        that would be used for %esp is the escape to the two byte form.  */
     if (index < 0 && LOWREGMASK(rm) != TCG_REG_ESP) {
         /* Single byte MODRM format.  */
-        tcg_out_opc(s, opc, r, rm, 0);
         tcg_out8(s, mod | (LOWREGMASK(r) << 3) | LOWREGMASK(rm));
     } else {
         /* Two byte MODRM+SIB format.  */
@@ -596,7 +651,6 @@ static void tcg_out_modrm_sib_offset(TCGContext *s, int opc, int r, int rm,
             tcg_debug_assert(index != TCG_REG_ESP);
         }
 
-        tcg_out_opc(s, opc, r, rm, index);
         tcg_out8(s, mod | (LOWREGMASK(r) << 3) | 4);
         tcg_out8(s, (shift << 6) | (LOWREGMASK(index) << 3) | LOWREGMASK(rm));
     }
@@ -608,6 +662,21 @@ static void tcg_out_modrm_sib_offset(TCGContext *s, int opc, int r, int rm,
     }
 }
 
+static void tcg_out_modrm_sib_offset(TCGContext *s, int opc, int r, int rm,
+                                     int index, int shift, intptr_t offset)
+{
+    tcg_out_opc(s, opc, r, rm < 0 ? 0 : rm, index < 0 ? 0 : index);
+    tcg_out_sib_offset(s, r, rm, index, shift, offset);
+}
+
+static void tcg_out_vex_modrm_sib_offset(TCGContext *s, int opc, int r, int v,
+                                         int rm, int index, int shift,
+                                         intptr_t offset)
+{
+    tcg_out_vex_opc(s, opc, r, v, rm < 0 ? 0 : rm, index < 0 ? 0 : index);
+    tcg_out_sib_offset(s, r, rm, index, shift, offset);
+}
+
 /* A simplification of the above with no index or shift.  */
 static inline void tcg_out_modrm_offset(TCGContext *s, int opc, int r,
                                         int rm, intptr_t offset)
@@ -615,6 +684,31 @@ static inline void tcg_out_modrm_offset(TCGContext *s, int opc, int r,
     tcg_out_modrm_sib_offset(s, opc, r, rm, -1, 0, offset);
 }
 
+static inline void tcg_out_vex_modrm_offset(TCGContext *s, int opc, int r,
+                                            int v, int rm, intptr_t offset)
+{
+    tcg_out_vex_modrm_sib_offset(s, opc, r, v, rm, -1, 0, offset);
+}
+
+static void tcg_out_maybe_vex_modrm(TCGContext *s, int opc, int r, int rm)
+{
+    if (have_avx1) {
+        tcg_out_vex_modrm(s, opc, r, 0, rm);
+    } else {
+        tcg_out_modrm(s, opc, r, rm);
+    }
+}
+
+static void tcg_out_maybe_vex_modrm_offset(TCGContext *s, int opc, int r,
+                                           int rm, intptr_t offset)
+{
+    if (have_avx1) {
+        tcg_out_vex_modrm_offset(s, opc, r, 0, rm, offset);
+    } else {
+        tcg_out_modrm_offset(s, opc, r, rm, offset);
+    }
+}
+
 /* Generate dest op= src.  Uses the same ARITH_* codes as tgen_arithi.  */
 static inline void tgen_arithr(TCGContext *s, int subop, int dest, int src)
 {
@@ -625,12 +719,34 @@ static inline void tgen_arithr(TCGContext *s, int subop, int dest, int src)
     tcg_out_modrm(s, OPC_ARITH_GvEv + (subop << 3) + ext, dest, src);
 }
 
-static inline void tcg_out_mov(TCGContext *s, TCGType type,
-                               TCGReg ret, TCGReg arg)
+static void tcg_out_mov(TCGContext *s, TCGType type, TCGReg ret, TCGReg arg)
 {
-    if (arg != ret) {
-        int opc = OPC_MOVL_GvEv + (type == TCG_TYPE_I64 ? P_REXW : 0);
-        tcg_out_modrm(s, opc, ret, arg);
+    if (arg == ret) {
+        return;
+    }
+    switch (type) {
+    case TCG_TYPE_I32:
+        tcg_debug_assert(ret < 16 && arg < 16);
+        tcg_out_modrm(s, OPC_MOVL_GvEv, ret, arg);
+        break;
+    case TCG_TYPE_I64:
+        tcg_debug_assert(ret < 16 && arg < 16);
+        tcg_out_modrm(s, OPC_MOVL_GvEv | P_REXW, ret, arg);
+        break;
+    case TCG_TYPE_V64:
+        tcg_debug_assert(ret >= 16 && arg >= 16);
+        tcg_out_maybe_vex_modrm(s, OPC_MOVQ_GyMy, ret, arg);
+        break;
+    case TCG_TYPE_V128:
+        tcg_debug_assert(ret >= 16 && arg >= 16);
+        tcg_out_maybe_vex_modrm(s, OPC_MOVDQA_GyMy, ret, arg);
+        break;
+    case TCG_TYPE_V256:
+        tcg_debug_assert(ret >= 16 && arg >= 16);
+        tcg_out_vex_modrm(s, OPC_MOVDQA_GyMy | P_VEXL, ret, 0, arg);
+        break;
+    default:
+        g_assert_not_reached();
     }
 }
 
@@ -638,6 +754,36 @@ static void tcg_out_movi(TCGContext *s, TCGType type,
                          TCGReg ret, tcg_target_long arg)
 {
     tcg_target_long diff;
+    int opc;
+
+    switch (type) {
+    case TCG_TYPE_I32:
+    case TCG_TYPE_I64:
+        tcg_debug_assert(ret < 16);
+        break;
+
+    case TCG_TYPE_V64:
+    case TCG_TYPE_V128:
+    case TCG_TYPE_V256:
+        tcg_debug_assert(ret >= 16);
+        /* ??? Revisit this as the implementation progresses.  */
+        if (arg == 0) {
+            opc = OPC_PXOR;
+        } else if (arg == -1) {
+            opc = OPC_PCMPEQB;
+        } else {
+            g_assert_not_reached();
+        }
+        if (have_avx1) {
+            tcg_out_vex_modrm(s, opc, ret, ret, ret);
+        } else {
+            tcg_out_modrm(s, opc, ret, ret);
+        }
+        return;
+
+    default:
+        g_assert_not_reached();
+    }
 
     if (arg == 0) {
         tgen_arithr(s, ARITH_XOR, ret, ret);
@@ -702,18 +848,64 @@ static inline void tcg_out_pop(TCGContext *s, int reg)
     tcg_out_opc(s, OPC_POP_r32 + LOWREGMASK(reg), 0, reg, 0);
 }
 
-static inline void tcg_out_ld(TCGContext *s, TCGType type, TCGReg ret,
-                              TCGReg arg1, intptr_t arg2)
+static void tcg_out_ld(TCGContext *s, TCGType type, TCGReg ret,
+                       TCGReg arg1, intptr_t arg2)
 {
-    int opc = OPC_MOVL_GvEv + (type == TCG_TYPE_I64 ? P_REXW : 0);
-    tcg_out_modrm_offset(s, opc, ret, arg1, arg2);
+    switch (type) {
+    case TCG_TYPE_I64:
+        tcg_debug_assert(ret < 16);
+        tcg_out_modrm_offset(s, OPC_MOVL_GvEv | P_REXW, ret, arg1, arg2);
+        break;
+    case TCG_TYPE_I32:
+        tcg_debug_assert(ret < 16);
+        tcg_out_modrm_offset(s, OPC_MOVL_GvEv, ret, arg1, arg2);
+        break;
+    case TCG_TYPE_V64:
+        tcg_debug_assert(ret >= 16);
+        tcg_out_maybe_vex_modrm_offset(s, OPC_MOVQ_GyMy, ret, arg1, arg2);
+        break;
+    case TCG_TYPE_V128:
+        tcg_debug_assert(ret >= 16);
+        tcg_out_maybe_vex_modrm_offset(s, OPC_MOVDQU_GyMy, ret, arg1, arg2);
+        break;
+    case TCG_TYPE_V256:
+        tcg_debug_assert(ret >= 16);
+        tcg_out_vex_modrm_offset(s, OPC_MOVDQU_GyMy | P_VEXL,
+                                 ret, 0, arg1, arg2);
+        break;
+    default:
+        g_assert_not_reached();
+    }
 }
 
-static inline void tcg_out_st(TCGContext *s, TCGType type, TCGReg arg,
-                              TCGReg arg1, intptr_t arg2)
+static void tcg_out_st(TCGContext *s, TCGType type, TCGReg arg,
+                       TCGReg arg1, intptr_t arg2)
 {
-    int opc = OPC_MOVL_EvGv + (type == TCG_TYPE_I64 ? P_REXW : 0);
-    tcg_out_modrm_offset(s, opc, arg, arg1, arg2);
+    switch (type) {
+    case TCG_TYPE_I64:
+        tcg_debug_assert(arg < 16);
+        tcg_out_modrm_offset(s, OPC_MOVL_EvGv | P_REXW, arg, arg1, arg2);
+        break;
+    case TCG_TYPE_I32:
+        tcg_debug_assert(arg < 16);
+        tcg_out_modrm_offset(s, OPC_MOVL_EvGv, arg, arg1, arg2);
+        break;
+    case TCG_TYPE_V64:
+        tcg_debug_assert(arg >= 16);
+        tcg_out_maybe_vex_modrm_offset(s, OPC_MOVQ_MyGy, arg, arg1, arg2);
+        break;
+    case TCG_TYPE_V128:
+        tcg_debug_assert(arg >= 16);
+        tcg_out_maybe_vex_modrm_offset(s, OPC_MOVDQU_MyGy, arg, arg1, arg2);
+        break;
+    case TCG_TYPE_V256:
+        tcg_debug_assert(arg >= 16);
+        tcg_out_vex_modrm_offset(s, OPC_MOVDQU_MyGy | P_VEXL,
+                                 arg, 0, arg1, arg2);
+        break;
+    default:
+        g_assert_not_reached();
+    }
 }
 
 static bool tcg_out_sti(TCGContext *s, TCGType type, TCGArg val,
@@ -725,6 +917,8 @@ static bool tcg_out_sti(TCGContext *s, TCGType type, TCGArg val,
             return false;
         }
         rexw = P_REXW;
+    } else if (type != TCG_TYPE_I32) {
+        return false;
     }
     tcg_out_modrm_offset(s, OPC_MOVL_EvIz | rexw, 0, base, ofs);
     tcg_out32(s, val);
@@ -2254,19 +2448,110 @@ static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
         }
         break;
 
+    case INDEX_op_add8_vec:
+        c = OPC_PADDB;
+        goto gen_simd;
+    case INDEX_op_add16_vec:
+        c = OPC_PADDW;
+        goto gen_simd;
+    case INDEX_op_add32_vec:
+        c = OPC_PADDD;
+        goto gen_simd;
+    case INDEX_op_add64_vec:
+        c = OPC_PADDQ;
+        goto gen_simd;
+    case INDEX_op_sub8_vec:
+        c = OPC_PSUBB;
+        goto gen_simd;
+    case INDEX_op_sub16_vec:
+        c = OPC_PSUBW;
+        goto gen_simd;
+    case INDEX_op_sub32_vec:
+        c = OPC_PSUBD;
+        goto gen_simd;
+    case INDEX_op_sub64_vec:
+        c = OPC_PSUBQ;
+        goto gen_simd;
+    case INDEX_op_and_vec:
+        c = OPC_PAND;
+        goto gen_simd;
+    case INDEX_op_or_vec:
+        c = OPC_POR;
+        goto gen_simd;
+    case INDEX_op_xor_vec:
+        c = OPC_PXOR;
+    gen_simd:
+        if (args[3] == 2) {
+            c |= P_VEXL;
+        }
+        if (have_avx1) {
+            tcg_out_vex_modrm(s, c, a0, a1, a2);
+        } else {
+            tcg_out_modrm(s, c, a0, a2);
+        }
+        break;
+    case INDEX_op_andc_vec:
+        c = OPC_PANDN;
+        if (args[3] == 2) {
+            c |= P_VEXL;
+        }
+        if (have_avx1) {
+            tcg_out_vex_modrm(s, c, a0, a2, a1);
+        } else {
+            tcg_out_modrm(s, c, a0, a1);
+        }
+        break;
+
+    case INDEX_op_ld_vec:
+    case INDEX_op_ldz_vec:
+        switch (args[3]) {
+        case 0:
+            tcg_out_ld(s, TCG_TYPE_V64, a0, a1, a2);
+            break;
+        case 1:
+            tcg_out_ld(s, TCG_TYPE_V128, a0, a1, a2);
+            break;
+        case 2:
+            tcg_out_ld(s, TCG_TYPE_V256, a0, a1, a2);
+            break;
+        default:
+            g_assert_not_reached();
+        }
+        break;
+
+    case INDEX_op_st_vec:
+        switch (args[3]) {
+        case 0:
+            tcg_out_st(s, TCG_TYPE_V64, a0, a1, a2);
+            break;
+        case 1:
+            tcg_out_st(s, TCG_TYPE_V128, a0, a1, a2);
+            break;
+        case 2:
+            tcg_out_st(s, TCG_TYPE_V256, a0, a1, a2);
+            break;
+        default:
+            g_assert_not_reached();
+        }
+        break;
+
     case INDEX_op_mb:
         tcg_out_mb(s, a0);
         break;
     case INDEX_op_mov_i32:  /* Always emitted via tcg_out_mov.  */
     case INDEX_op_mov_i64:
+    case INDEX_op_mov_vec:
     case INDEX_op_movi_i32: /* Always emitted via tcg_out_movi.  */
     case INDEX_op_movi_i64:
+    case INDEX_op_movi_vec:
     case INDEX_op_call:     /* Always emitted via tcg_out_call.  */
     default:
         tcg_abort();
     }
 
 #undef OP_32_64
+#undef OP_128_256
+#undef OP_64_128_256
 }
 
 static const TCGTargetOpDef *tcg_target_op_def(TCGOpcode op)
@@ -2292,6 +2577,10 @@ static const TCGTargetOpDef *tcg_target_op_def(TCGOpcode op)
         = { .args_ct_str = { "r", "r", "L", "L" } };
     static const TCGTargetOpDef L_L_L_L
         = { .args_ct_str = { "L", "L", "L", "L" } };
+    static const TCGTargetOpDef x_0_x = { .args_ct_str = { "x", "0", "x" } };
+    static const TCGTargetOpDef x_x_0 = { .args_ct_str = { "x", "x", "0" } };
+    static const TCGTargetOpDef x_x_x = { .args_ct_str = { "x", "x", "x" } };
+    static const TCGTargetOpDef x_r = { .args_ct_str = { "x", "r" } };
 
     switch (op) {
     case INDEX_op_goto_ptr:
@@ -2493,6 +2782,26 @@ static const TCGTargetOpDef *tcg_target_op_def(TCGOpcode op)
             return &s2;
         }
 
+    case INDEX_op_ld_vec:
+    case INDEX_op_ldz_vec:
+    case INDEX_op_st_vec:
+        return &x_r;
+
+    case INDEX_op_add8_vec:
+    case INDEX_op_add16_vec:
+    case INDEX_op_add32_vec:
+    case INDEX_op_add64_vec:
+    case INDEX_op_sub8_vec:
+    case INDEX_op_sub16_vec:
+    case INDEX_op_sub32_vec:
+    case INDEX_op_sub64_vec:
+    case INDEX_op_and_vec:
+    case INDEX_op_or_vec:
+    case INDEX_op_xor_vec:
+        return have_avx1 ? &x_x_x : &x_0_x;
+    case INDEX_op_andc_vec:
+        return have_avx1 ? &x_x_x : &x_x_0;
+
     default:
         break;
     }
@@ -2577,6 +2886,9 @@ static void tcg_target_qemu_prologue(TCGContext *s)
 
     tcg_out_addi(s, TCG_REG_CALL_STACK, stack_addend);
 
+    if (have_avx2) {
+        tcg_out_vex_opc(s, OPC_VZEROUPPER, 0, 0, 0, 0);
+    }
     for (i = ARRAY_SIZE(tcg_target_callee_save_regs) - 1; i >= 0; i--) {
         tcg_out_pop(s, tcg_target_callee_save_regs[i]);
     }
@@ -2598,9 +2910,16 @@ static void tcg_out_nop_fill(tcg_insn_unit *p, int count)
 static void tcg_target_init(TCGContext *s)
 {
 #ifdef CONFIG_CPUID_H
-    unsigned a, b, c, d;
+    unsigned a, b, c, d, b7 = 0;
     int max = __get_cpuid_max(0, 0);
 
+    if (max >= 7) {
+        /* BMI1 is available on AMD Piledriver and Intel Haswell CPUs.  */
+        __cpuid_count(7, 0, a, b7, c, d);
+        have_bmi1 = (b7 & bit_BMI) != 0;
+        have_bmi2 = (b7 & bit_BMI2) != 0;
+    }
+
     if (max >= 1) {
         __cpuid(1, a, b, c, d);
 #ifndef have_cmov
@@ -2609,17 +2928,26 @@ static void tcg_target_init(TCGContext *s)
            available, we'll use a small forward branch.  */
         have_cmov = (d & bit_CMOV) != 0;
 #endif
+#ifndef have_sse2
+        have_sse2 = (d & bit_SSE2) != 0;
+#endif
         /* MOVBE is only available on Intel Atom and Haswell CPUs, so we
            need to probe for it.  */
         have_movbe = (c & bit_MOVBE) != 0;
         have_popcnt = (c & bit_POPCNT) != 0;
-    }
 
-    if (max >= 7) {
-        /* BMI1 is available on AMD Piledriver and Intel Haswell CPUs.  */
-        __cpuid_count(7, 0, a, b, c, d);
-        have_bmi1 = (b & bit_BMI) != 0;
-        have_bmi2 = (b & bit_BMI2) != 0;
+#ifndef have_avx2
+        /* There are a number of things we must check before we can be
+           sure of not hitting invalid opcode.  */
+        if (c & bit_OSXSAVE) {
+            unsigned xcrl, xcrh;
+            asm ("xgetbv" : "=a" (xcrl), "=d" (xcrh) : "c" (0));
+            if ((xcrl & 6) == 6) {
+                have_avx1 = (c & bit_AVX) != 0;
+                have_avx2 = (b7 & bit_AVX2) != 0;
+            }
+        }
+#endif
     }
 
     max = __get_cpuid_max(0x8000000, 0);
@@ -2636,6 +2964,13 @@ static void tcg_target_init(TCGContext *s)
     } else {
         tcg_target_available_regs[TCG_TYPE_I32] = 0xff;
     }
+    if (have_sse2) {
+        tcg_target_available_regs[TCG_TYPE_V64] = 0xff0000;
+        tcg_target_available_regs[TCG_TYPE_V128] = 0xff0000;
+    }
+    if (have_avx2) {
+        tcg_target_available_regs[TCG_TYPE_V256] = 0xff0000;
+    }
 
     tcg_target_call_clobber_regs = 0;
     tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_EAX);
-- 
2.13.5

^ permalink raw reply related	[flat|nested] 14+ messages in thread
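
The cpuid rework above follows the standard rule that AVX instructions may
only be emitted once the OS has enabled XSAVE and XCR0 advertises both SSE
and AVX state (bits 1 and 2), on top of the CPUID feature bits themselves.
Roughly the same probe as a standalone program, for reference only (assumes
GCC/Clang's <cpuid.h>; bit positions are the usual ones from the Intel SDM,
and none of this is the backend code itself):

    #include <stdbool.h>
    #include <stdio.h>
    #include <cpuid.h>

    static bool host_has_avx2(void)
    {
        unsigned a, b, c, d;

        if (!__get_cpuid(1, &a, &b, &c, &d)) {
            return false;
        }
        if (!(c & (1u << 27))) {        /* OSXSAVE: xgetbv is usable */
            return false;
        }
        unsigned xcrl, xcrh;
        asm("xgetbv" : "=a"(xcrl), "=d"(xcrh) : "c"(0));
        (void)xcrh;
        if ((xcrl & 6) != 6) {          /* XCR0: SSE and AVX state enabled */
            return false;
        }
        if (!(c & (1u << 28))) {        /* AVX */
            return false;
        }
        if (__get_cpuid_max(0, 0) < 7) {
            return false;
        }
        __cpuid_count(7, 0, a, b, c, d);
        return (b & (1u << 5)) != 0;    /* AVX2 */
    }

    int main(void)
    {
        printf("avx2: %d\n", host_has_avx2());
        return 0;
    }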

* [Qemu-devel] [PATCH v3 6/6] tcg/aarch64: Add vector operations
  2017-09-16  2:34 [Qemu-devel] [PATCH v3 0/6] TCG vectorization and example conversion Richard Henderson
                   ` (4 preceding siblings ...)
  2017-09-16  2:34 ` [Qemu-devel] [PATCH v3 5/6] tcg/i386: Add vector operations Richard Henderson
@ 2017-09-16  2:34 ` Richard Henderson
  2017-09-16  2:35 ` [Qemu-devel] [PATCH v3 0/6] TCG vectorization and example conversion Richard Henderson
  2017-09-26 22:58 ` no-reply
  7 siblings, 0 replies; 14+ messages in thread
From: Richard Henderson @ 2017-09-16  2:34 UTC (permalink / raw)
  To: qemu-devel; +Cc: alex.bennee, f4bug

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/aarch64/tcg-target.h     |  20 ++-
 tcg/aarch64/tcg-target.inc.c | 340 +++++++++++++++++++++++++++++++++++++------
 2 files changed, 315 insertions(+), 45 deletions(-)

diff --git a/tcg/aarch64/tcg-target.h b/tcg/aarch64/tcg-target.h
index c2525066ab..c3e8c4480f 100644
--- a/tcg/aarch64/tcg-target.h
+++ b/tcg/aarch64/tcg-target.h
@@ -31,13 +31,22 @@ typedef enum {
     TCG_REG_SP = 31,
     TCG_REG_XZR = 31,
 
+    TCG_REG_V0 = 32, TCG_REG_V1, TCG_REG_V2, TCG_REG_V3,
+    TCG_REG_V4, TCG_REG_V5, TCG_REG_V6, TCG_REG_V7,
+    TCG_REG_V8, TCG_REG_V9, TCG_REG_V10, TCG_REG_V11,
+    TCG_REG_V12, TCG_REG_V13, TCG_REG_V14, TCG_REG_V15,
+    TCG_REG_V16, TCG_REG_V17, TCG_REG_V18, TCG_REG_V19,
+    TCG_REG_V20, TCG_REG_V21, TCG_REG_V22, TCG_REG_V23,
+    TCG_REG_V24, TCG_REG_V25, TCG_REG_V26, TCG_REG_V27,
+    TCG_REG_V28, TCG_REG_V29, TCG_REG_V30, TCG_REG_V31,
+
     /* Aliases.  */
     TCG_REG_FP = TCG_REG_X29,
     TCG_REG_LR = TCG_REG_X30,
     TCG_AREG0  = TCG_REG_X19,
 } TCGReg;
 
-#define TCG_TARGET_NB_REGS 32
+#define TCG_TARGET_NB_REGS 64
 
 /* used for function call generation */
 #define TCG_REG_CALL_STACK              TCG_REG_SP
@@ -113,6 +122,15 @@ typedef enum {
 #define TCG_TARGET_HAS_mulsh_i64        1
 #define TCG_TARGET_HAS_direct_jump      1
 
+#define TCG_TARGET_HAS_v64              1
+#define TCG_TARGET_HAS_v128             1
+#define TCG_TARGET_HAS_v256             0
+
+#define TCG_TARGET_HAS_andc_vec         1
+#define TCG_TARGET_HAS_orc_vec          1
+#define TCG_TARGET_HAS_not_vec          1
+#define TCG_TARGET_HAS_neg_vec          1
+
 #define TCG_TARGET_DEFAULT_MO (0)
 
 static inline void flush_icache_range(uintptr_t start, uintptr_t stop)
diff --git a/tcg/aarch64/tcg-target.inc.c b/tcg/aarch64/tcg-target.inc.c
index 150530f30e..4b401cfe6c 100644
--- a/tcg/aarch64/tcg-target.inc.c
+++ b/tcg/aarch64/tcg-target.inc.c
@@ -20,10 +20,15 @@ QEMU_BUILD_BUG_ON(TCG_TYPE_I32 != 0 || TCG_TYPE_I64 != 1);
 
 #ifdef CONFIG_DEBUG_TCG
 static const char * const tcg_target_reg_names[TCG_TARGET_NB_REGS] = {
-    "%x0", "%x1", "%x2", "%x3", "%x4", "%x5", "%x6", "%x7",
-    "%x8", "%x9", "%x10", "%x11", "%x12", "%x13", "%x14", "%x15",
-    "%x16", "%x17", "%x18", "%x19", "%x20", "%x21", "%x22", "%x23",
-    "%x24", "%x25", "%x26", "%x27", "%x28", "%fp", "%x30", "%sp",
+    "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7",
+    "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15",
+    "x16", "x17", "x18", "x19", "x20", "x21", "x22", "x23",
+    "x24", "x25", "x26", "x27", "x28", "fp", "x30", "sp",
+
+    "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
+    "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15",
+    "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23",
+    "v24", "v25", "v26", "v27", "v28", "fp", "v30", "v31",
 };
 #endif /* CONFIG_DEBUG_TCG */
 
@@ -43,6 +48,14 @@ static const int tcg_target_reg_alloc_order[] = {
     /* X19 reserved for AREG0 */
     /* X29 reserved as fp */
     /* X30 reserved as temporary */
+
+    TCG_REG_V0, TCG_REG_V1, TCG_REG_V2, TCG_REG_V3,
+    TCG_REG_V4, TCG_REG_V5, TCG_REG_V6, TCG_REG_V7,
+    /* V8 - V15 are call-saved, and skipped.  */
+    TCG_REG_V16, TCG_REG_V17, TCG_REG_V18, TCG_REG_V19,
+    TCG_REG_V20, TCG_REG_V21, TCG_REG_V22, TCG_REG_V23,
+    TCG_REG_V24, TCG_REG_V25, TCG_REG_V26, TCG_REG_V27,
+    TCG_REG_V28, TCG_REG_V29, TCG_REG_V30, TCG_REG_V31,
 };
 
 static const int tcg_target_call_iarg_regs[8] = {
@@ -119,10 +132,14 @@ static const char *target_parse_constraint(TCGArgConstraint *ct,
                                            const char *ct_str, TCGType type)
 {
     switch (*ct_str++) {
-    case 'r':
+    case 'r': /* general registers */
         ct->ct |= TCG_CT_REG;
         ct->u.regs = 0xffffffffu;
         break;
+    case 'w': /* advsimd registers */
+        ct->ct |= TCG_CT_REG;
+        ct->u.regs = 0xffffffff00000000ull;
+        break;
     case 'l': /* qemu_ld / qemu_st address, data_reg */
         ct->ct |= TCG_CT_REG;
         ct->u.regs = 0xffffffffu;
@@ -290,6 +307,12 @@ typedef enum {
     I3312_LDRSHX    = 0x38000000 | LDST_LD_S_X << 22 | MO_16 << 30,
     I3312_LDRSWX    = 0x38000000 | LDST_LD_S_X << 22 | MO_32 << 30,
 
+    I3312_LDRVD     = 0x3c000000 | LDST_LD << 22 | MO_64 << 30,
+    I3312_STRVD     = 0x3c000000 | LDST_ST << 22 | MO_64 << 30,
+
+    I3312_LDRVQ     = 0x3c000000 | 3 << 22 | 0 << 30,
+    I3312_STRVQ     = 0x3c000000 | 2 << 22 | 0 << 30,
+
     I3312_TO_I3310  = 0x00200800,
     I3312_TO_I3313  = 0x01000000,
 
@@ -374,8 +397,33 @@ typedef enum {
     I3510_EON       = 0x4a200000,
     I3510_ANDS      = 0x6a000000,
 
-    NOP             = 0xd503201f,
+    /* AdvSIMD modified immediate */
+    I3606_MOVI      = 0x0f000400,
+
+    /* AdvSIMD three same.  */
+    I3616_ADD_B     = 0x0e208400,
+    I3616_ADD_H     = 0x0e608400,
+    I3616_ADD_S     = 0x0ea08400,
+    I3616_ADD_D     = 0x4ee08400,
+    I3616_AND       = 0x0e201c00,
+    I3616_BIC       = 0x0e601c00,
+    I3616_EOR       = 0x2e201c00,
+    I3616_ORR       = 0x0ea01c00,
+    I3616_ORN       = 0x0ee01c00,
+    I3616_SUB_B     = 0x2e208400,
+    I3616_SUB_H     = 0x2e608400,
+    I3616_SUB_S     = 0x2ea08400,
+    I3616_SUB_D     = 0x6ee08400,
+
+    /* AdvSIMD two-reg misc.  */
+    I3617_NOT       = 0x2e205800,
+    I3617_NEG_B     = 0x2e20b800,
+    I3617_NEG_H     = 0x2e60b800,
+    I3617_NEG_S     = 0x2ea0b800,
+    I3617_NEG_D     = 0x6ee0b800,
+
     /* System instructions.  */
+    NOP             = 0xd503201f,
     DMB_ISH         = 0xd50338bf,
     DMB_LD          = 0x00000100,
     DMB_ST          = 0x00000200,
@@ -520,26 +568,47 @@ static void tcg_out_insn_3509(TCGContext *s, AArch64Insn insn, TCGType ext,
     tcg_out32(s, insn | ext << 31 | rm << 16 | ra << 10 | rn << 5 | rd);
 }
 
+static void tcg_out_insn_3606(TCGContext *s, AArch64Insn insn, bool q,
+                              TCGReg rd, bool op, int cmode, uint8_t imm8)
+{
+    tcg_out32(s, insn | q << 30 | op << 29 | cmode << 12 | (rd & 0x1f)
+              | (imm8 & 0xe0) << 16 | (imm8 & 0x1f) << 5);
+}
+
+static void tcg_out_insn_3616(TCGContext *s, AArch64Insn insn, bool q,
+                              TCGReg rd, TCGReg rn, TCGReg rm)
+{
+    tcg_out32(s, insn | q << 30 | (rm & 0x1f) << 16
+              | (rn & 0x1f) << 5 | (rd & 0x1f));
+}
+
+static void tcg_out_insn_3617(TCGContext *s, AArch64Insn insn, bool q,
+                              TCGReg rd, TCGReg rn)
+{
+    tcg_out32(s, insn | q << 30 | (rn & 0x1f) << 5 | (rd & 0x1f));
+}
+
 static void tcg_out_insn_3310(TCGContext *s, AArch64Insn insn,
                               TCGReg rd, TCGReg base, TCGType ext,
                               TCGReg regoff)
 {
     /* Note the AArch64Insn constants above are for C3.3.12.  Adjust.  */
     tcg_out32(s, insn | I3312_TO_I3310 | regoff << 16 |
-              0x4000 | ext << 13 | base << 5 | rd);
+              0x4000 | ext << 13 | base << 5 | (rd & 0x1f));
 }
 
 static void tcg_out_insn_3312(TCGContext *s, AArch64Insn insn,
                               TCGReg rd, TCGReg rn, intptr_t offset)
 {
-    tcg_out32(s, insn | (offset & 0x1ff) << 12 | rn << 5 | rd);
+    tcg_out32(s, insn | (offset & 0x1ff) << 12 | rn << 5 | (rd & 0x1f));
 }
 
 static void tcg_out_insn_3313(TCGContext *s, AArch64Insn insn,
                               TCGReg rd, TCGReg rn, uintptr_t scaled_uimm)
 {
     /* Note the AArch64Insn constants above are for C3.3.12.  Adjust.  */
-    tcg_out32(s, insn | I3312_TO_I3313 | scaled_uimm << 10 | rn << 5 | rd);
+    tcg_out32(s, insn | I3312_TO_I3313 | scaled_uimm << 10
+              | rn << 5 | (rd & 0x1f));
 }
 
 /* Register to register move using ORR (shifted register with no shift). */
@@ -594,6 +663,24 @@ static void tcg_out_movi(TCGContext *s, TCGType type, TCGReg rd,
     int s0, s1;
     AArch64Insn opc;
 
+    switch (type) {
+    case TCG_TYPE_I32:
+    case TCG_TYPE_I64:
+        tcg_debug_assert(rd < 32);
+        break;
+
+    case TCG_TYPE_V64:
+    case TCG_TYPE_V128:
+        tcg_debug_assert(rd >= 32);
+        /* ??? Revisit this as the implementation progresses.  */
+        tcg_debug_assert(value == 0);
+        tcg_out_insn(s, 3606, MOVI, 0, rd, 0, 0, 0);
+        return;
+
+    default:
+        g_assert_not_reached();
+    }
+
     /* For 32-bit values, discard potential garbage in value.  For 64-bit
        values within [2**31, 2**32-1], we can create smaller sequences by
        interpreting this as a negative 32-bit number, while ensuring that
@@ -669,15 +756,13 @@ static void tcg_out_movi(TCGContext *s, TCGType type, TCGReg rd,
 /* Define something more legible for general use.  */
 #define tcg_out_ldst_r  tcg_out_insn_3310
 
-static void tcg_out_ldst(TCGContext *s, AArch64Insn insn,
-                         TCGReg rd, TCGReg rn, intptr_t offset)
+static void tcg_out_ldst(TCGContext *s, AArch64Insn insn, TCGReg rd,
+                         TCGReg rn, intptr_t offset, int lgsize)
 {
-    TCGMemOp size = (uint32_t)insn >> 30;
-
     /* If the offset is naturally aligned and in range, then we can
        use the scaled uimm12 encoding */
-    if (offset >= 0 && !(offset & ((1 << size) - 1))) {
-        uintptr_t scaled_uimm = offset >> size;
+    if (offset >= 0 && !(offset & ((1 << lgsize) - 1))) {
+        uintptr_t scaled_uimm = offset >> lgsize;
         if (scaled_uimm <= 0xfff) {
             tcg_out_insn_3313(s, insn, rd, rn, scaled_uimm);
             return;
@@ -695,32 +780,94 @@ static void tcg_out_ldst(TCGContext *s, AArch64Insn insn,
     tcg_out_ldst_r(s, insn, rd, rn, TCG_TYPE_I64, TCG_REG_TMP);
 }
 
-static inline void tcg_out_mov(TCGContext *s,
-                               TCGType type, TCGReg ret, TCGReg arg)
+static void tcg_out_mov(TCGContext *s, TCGType type, TCGReg ret, TCGReg arg)
 {
-    if (ret != arg) {
+    if (ret == arg) {
+        return;
+    }
+    switch (type) {
+    case TCG_TYPE_I32:
+    case TCG_TYPE_I64:
+        tcg_debug_assert(ret < 32 && arg < 32);
         tcg_out_movr(s, type, ret, arg);
+        break;
+
+    case TCG_TYPE_V64:
+        tcg_debug_assert(ret >= 32 && arg >= 32);
+        tcg_out_insn(s, 3616, ORR, 0, ret, arg, arg);
+        break;
+    case TCG_TYPE_V128:
+        tcg_debug_assert(ret >= 32 && arg >= 32);
+        tcg_out_insn(s, 3616, ORR, 1, ret, arg, arg);
+        break;
+
+    default:
+        g_assert_not_reached();
     }
 }
 
-static inline void tcg_out_ld(TCGContext *s, TCGType type, TCGReg arg,
-                              TCGReg arg1, intptr_t arg2)
+static void tcg_out_ld(TCGContext *s, TCGType type, TCGReg arg,
+                       TCGReg arg1, intptr_t arg2)
 {
-    tcg_out_ldst(s, type == TCG_TYPE_I32 ? I3312_LDRW : I3312_LDRX,
-                 arg, arg1, arg2);
+    AArch64Insn insn;
+    int lgsz;
+
+    switch (type) {
+    case TCG_TYPE_I32:
+        insn = I3312_LDRW;
+        lgsz = 2;
+        break;
+    case TCG_TYPE_I64:
+        insn = I3312_LDRX;
+        lgsz = 3;
+        break;
+    case TCG_TYPE_V64:
+        insn = I3312_LDRVD;
+        lgsz = 3;
+        break;
+    case TCG_TYPE_V128:
+        insn = I3312_LDRVQ;
+        lgsz = 4;
+        break;
+    default:
+        g_assert_not_reached();
+    }
+    tcg_out_ldst(s, insn, arg, arg1, arg2, lgsz);
 }
 
-static inline void tcg_out_st(TCGContext *s, TCGType type, TCGReg arg,
-                              TCGReg arg1, intptr_t arg2)
+static void tcg_out_st(TCGContext *s, TCGType type, TCGReg arg,
+                       TCGReg arg1, intptr_t arg2)
 {
-    tcg_out_ldst(s, type == TCG_TYPE_I32 ? I3312_STRW : I3312_STRX,
-                 arg, arg1, arg2);
+    AArch64Insn insn;
+    int lgsz;
+
+    switch (type) {
+    case TCG_TYPE_I32:
+        insn = I3312_STRW;
+        lgsz = 2;
+        break;
+    case TCG_TYPE_I64:
+        insn = I3312_STRX;
+        lgsz = 3;
+        break;
+    case TCG_TYPE_V64:
+        insn = I3312_STRVD;
+        lgsz = 3;
+        break;
+    case TCG_TYPE_V128:
+        insn = I3312_STRVQ;
+        lgsz = 4;
+        break;
+    default:
+        g_assert_not_reached();
+    }
+    tcg_out_ldst(s, insn, arg, arg1, arg2, lgsz);
 }
 
 static inline bool tcg_out_sti(TCGContext *s, TCGType type, TCGArg val,
                                TCGReg base, intptr_t ofs)
 {
-    if (val == 0) {
+    if (type <= TCG_TYPE_I64 && val == 0) {
         tcg_out_st(s, type, TCG_REG_XZR, base, ofs);
         return true;
     }
@@ -1210,14 +1357,15 @@ static void tcg_out_tlb_read(TCGContext *s, TCGReg addr_reg, TCGMemOp opc,
     /* Merge "low bits" from tlb offset, load the tlb comparator into X0.
        X0 = load [X2 + (tlb_offset & 0x000fff)] */
     tcg_out_ldst(s, TARGET_LONG_BITS == 32 ? I3312_LDRW : I3312_LDRX,
-                 TCG_REG_X0, TCG_REG_X2, tlb_offset & 0xfff);
+                 TCG_REG_X0, TCG_REG_X2, tlb_offset & 0xfff,
+                 TARGET_LONG_BITS == 32 ? 2 : 3);
 
     /* Load the tlb addend. Do that early to avoid stalling.
        X1 = load [X2 + (tlb_offset & 0xfff) + offsetof(addend)] */
     tcg_out_ldst(s, I3312_LDRX, TCG_REG_X1, TCG_REG_X2,
                  (tlb_offset & 0xfff) + (offsetof(CPUTLBEntry, addend)) -
                  (is_read ? offsetof(CPUTLBEntry, addr_read)
-                  : offsetof(CPUTLBEntry, addr_write)));
+                  : offsetof(CPUTLBEntry, addr_write)), 3);
 
     /* Perform the address comparison. */
     tcg_out_cmp(s, (TARGET_LONG_BITS == 64), TCG_REG_X0, TCG_REG_X3, 0);
@@ -1435,49 +1583,49 @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc,
 
     case INDEX_op_ld8u_i32:
     case INDEX_op_ld8u_i64:
-        tcg_out_ldst(s, I3312_LDRB, a0, a1, a2);
+        tcg_out_ldst(s, I3312_LDRB, a0, a1, a2, 0);
         break;
     case INDEX_op_ld8s_i32:
-        tcg_out_ldst(s, I3312_LDRSBW, a0, a1, a2);
+        tcg_out_ldst(s, I3312_LDRSBW, a0, a1, a2, 0);
         break;
     case INDEX_op_ld8s_i64:
-        tcg_out_ldst(s, I3312_LDRSBX, a0, a1, a2);
+        tcg_out_ldst(s, I3312_LDRSBX, a0, a1, a2, 0);
         break;
     case INDEX_op_ld16u_i32:
     case INDEX_op_ld16u_i64:
-        tcg_out_ldst(s, I3312_LDRH, a0, a1, a2);
+        tcg_out_ldst(s, I3312_LDRH, a0, a1, a2, 1);
         break;
     case INDEX_op_ld16s_i32:
-        tcg_out_ldst(s, I3312_LDRSHW, a0, a1, a2);
+        tcg_out_ldst(s, I3312_LDRSHW, a0, a1, a2, 1);
         break;
     case INDEX_op_ld16s_i64:
-        tcg_out_ldst(s, I3312_LDRSHX, a0, a1, a2);
+        tcg_out_ldst(s, I3312_LDRSHX, a0, a1, a2, 1);
         break;
     case INDEX_op_ld_i32:
     case INDEX_op_ld32u_i64:
-        tcg_out_ldst(s, I3312_LDRW, a0, a1, a2);
+        tcg_out_ldst(s, I3312_LDRW, a0, a1, a2, 2);
         break;
     case INDEX_op_ld32s_i64:
-        tcg_out_ldst(s, I3312_LDRSWX, a0, a1, a2);
+        tcg_out_ldst(s, I3312_LDRSWX, a0, a1, a2, 2);
         break;
     case INDEX_op_ld_i64:
-        tcg_out_ldst(s, I3312_LDRX, a0, a1, a2);
+        tcg_out_ldst(s, I3312_LDRX, a0, a1, a2, 3);
         break;
 
     case INDEX_op_st8_i32:
     case INDEX_op_st8_i64:
-        tcg_out_ldst(s, I3312_STRB, REG0(0), a1, a2);
+        tcg_out_ldst(s, I3312_STRB, REG0(0), a1, a2, 0);
         break;
     case INDEX_op_st16_i32:
     case INDEX_op_st16_i64:
-        tcg_out_ldst(s, I3312_STRH, REG0(0), a1, a2);
+        tcg_out_ldst(s, I3312_STRH, REG0(0), a1, a2, 1);
         break;
     case INDEX_op_st_i32:
     case INDEX_op_st32_i64:
-        tcg_out_ldst(s, I3312_STRW, REG0(0), a1, a2);
+        tcg_out_ldst(s, I3312_STRW, REG0(0), a1, a2, 2);
         break;
     case INDEX_op_st_i64:
-        tcg_out_ldst(s, I3312_STRX, REG0(0), a1, a2);
+        tcg_out_ldst(s, I3312_STRX, REG0(0), a1, a2, 3);
         break;
 
     case INDEX_op_add_i32:
@@ -1774,13 +1922,77 @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc,
         tcg_out_mb(s, a0);
         break;
 
+    case INDEX_op_ld_vec:
+    case INDEX_op_ldz_vec:
+        tcg_out_ld(s, TCG_TYPE_V64 + args[3], a0, a1, a2);
+        break;
+    case INDEX_op_st_vec:
+        tcg_out_st(s, TCG_TYPE_V64 + args[3], a0, a1, a2);
+        break;
+    case INDEX_op_add8_vec:
+        tcg_out_insn(s, 3616, ADD_B, args[3], a0, a1, a2);
+        break;
+    case INDEX_op_add16_vec:
+        tcg_out_insn(s, 3616, ADD_H, args[3], a0, a1, a2);
+        break;
+    case INDEX_op_add32_vec:
+        tcg_out_insn(s, 3616, ADD_S, args[3], a0, a1, a2);
+        break;
+    case INDEX_op_add64_vec:
+        tcg_out_insn(s, 3616, ADD_D, 1, a0, a1, a2);
+        break;
+    case INDEX_op_sub8_vec:
+        tcg_out_insn(s, 3616, SUB_B, args[3], a0, a1, a2);
+        break;
+    case INDEX_op_sub16_vec:
+        tcg_out_insn(s, 3616, SUB_H, args[3], a0, a1, a2);
+        break;
+    case INDEX_op_sub32_vec:
+        tcg_out_insn(s, 3616, SUB_S, args[3], a0, a1, a2);
+        break;
+    case INDEX_op_sub64_vec:
+        tcg_out_insn(s, 3616, SUB_D, 1, a0, a1, a2);
+        break;
+    case INDEX_op_neg8_vec:
+        tcg_out_insn(s, 3617, NEG_B, a2, a0, a1);
+        break;
+    case INDEX_op_neg16_vec:
+        tcg_out_insn(s, 3617, NEG_H, a2, a0, a1);
+        break;
+    case INDEX_op_neg32_vec:
+        tcg_out_insn(s, 3617, NEG_S, a2, a0, a1);
+        break;
+    case INDEX_op_neg64_vec:
+        tcg_out_insn(s, 3617, NEG_D, 1, a0, a1);
+        break;
+    case INDEX_op_and_vec:
+        tcg_out_insn(s, 3616, AND, args[3], a0, a1, a2);
+        break;
+    case INDEX_op_or_vec:
+        tcg_out_insn(s, 3616, ORR, args[3], a0, a1, a2);
+        break;
+    case INDEX_op_xor_vec:
+        tcg_out_insn(s, 3616, EOR, args[3], a0, a1, a2);
+        break;
+    case INDEX_op_andc_vec:
+        tcg_out_insn(s, 3616, BIC, args[3], a0, a1, a2);
+        break;
+    case INDEX_op_orc_vec:
+        tcg_out_insn(s, 3616, ORN, args[3], a0, a1, a2);
+        break;
+    case INDEX_op_not_vec:
+        tcg_out_insn(s, 3617, NOT, a2, a0, a1);
+        break;
+
     case INDEX_op_mov_i32:  /* Always emitted via tcg_out_mov.  */
     case INDEX_op_mov_i64:
+    case INDEX_op_mov_vec:
     case INDEX_op_movi_i32: /* Always emitted via tcg_out_movi.  */
     case INDEX_op_movi_i64:
+    case INDEX_op_movi_vec:
     case INDEX_op_call:     /* Always emitted via tcg_out_call.  */
     default:
-        tcg_abort();
+        g_assert_not_reached();
     }
 
 #undef REG0
@@ -1790,11 +2002,14 @@ static const TCGTargetOpDef *tcg_target_op_def(TCGOpcode op)
 {
     static const TCGTargetOpDef r = { .args_ct_str = { "r" } };
     static const TCGTargetOpDef r_r = { .args_ct_str = { "r", "r" } };
+    static const TCGTargetOpDef w_w = { .args_ct_str = { "w", "w" } };
+    static const TCGTargetOpDef w_r = { .args_ct_str = { "w", "r" } };
     static const TCGTargetOpDef r_l = { .args_ct_str = { "r", "l" } };
     static const TCGTargetOpDef r_rA = { .args_ct_str = { "r", "rA" } };
     static const TCGTargetOpDef rZ_r = { .args_ct_str = { "rZ", "r" } };
     static const TCGTargetOpDef lZ_l = { .args_ct_str = { "lZ", "l" } };
     static const TCGTargetOpDef r_r_r = { .args_ct_str = { "r", "r", "r" } };
+    static const TCGTargetOpDef w_w_w = { .args_ct_str = { "w", "w", "w" } };
     static const TCGTargetOpDef r_r_ri = { .args_ct_str = { "r", "r", "ri" } };
     static const TCGTargetOpDef r_r_rA = { .args_ct_str = { "r", "r", "rA" } };
     static const TCGTargetOpDef r_r_rL = { .args_ct_str = { "r", "r", "rL" } };
@@ -1938,6 +2153,33 @@ static const TCGTargetOpDef *tcg_target_op_def(TCGOpcode op)
     case INDEX_op_sub2_i64:
         return &add2;
 
+    case INDEX_op_add8_vec:
+    case INDEX_op_add16_vec:
+    case INDEX_op_add32_vec:
+    case INDEX_op_add64_vec:
+    case INDEX_op_sub8_vec:
+    case INDEX_op_sub16_vec:
+    case INDEX_op_sub32_vec:
+    case INDEX_op_sub64_vec:
+    case INDEX_op_and_vec:
+    case INDEX_op_or_vec:
+    case INDEX_op_xor_vec:
+    case INDEX_op_andc_vec:
+    case INDEX_op_orc_vec:
+        return &w_w_w;
+
+    case INDEX_op_not_vec:
+    case INDEX_op_neg8_vec:
+    case INDEX_op_neg16_vec:
+    case INDEX_op_neg32_vec:
+    case INDEX_op_neg64_vec:
+        return &w_w;
+
+    case INDEX_op_ld_vec:
+    case INDEX_op_ldz_vec:
+    case INDEX_op_st_vec:
+        return &w_r;
+
     default:
         return NULL;
     }
@@ -1947,8 +2189,10 @@ static void tcg_target_init(TCGContext *s)
 {
     tcg_target_available_regs[TCG_TYPE_I32] = 0xffffffffu;
     tcg_target_available_regs[TCG_TYPE_I64] = 0xffffffffu;
+    tcg_target_available_regs[TCG_TYPE_V64] = 0xffffffff00000000ull;
+    tcg_target_available_regs[TCG_TYPE_V128] = 0xffffffff00000000ull;
 
-    tcg_target_call_clobber_regs = 0xfffffffu;
+    tcg_target_call_clobber_regs = -1ull;
     tcg_regset_reset_reg(tcg_target_call_clobber_regs, TCG_REG_X19);
     tcg_regset_reset_reg(tcg_target_call_clobber_regs, TCG_REG_X20);
     tcg_regset_reset_reg(tcg_target_call_clobber_regs, TCG_REG_X21);
@@ -1960,6 +2204,14 @@ static void tcg_target_init(TCGContext *s)
     tcg_regset_reset_reg(tcg_target_call_clobber_regs, TCG_REG_X27);
     tcg_regset_reset_reg(tcg_target_call_clobber_regs, TCG_REG_X28);
     tcg_regset_reset_reg(tcg_target_call_clobber_regs, TCG_REG_X29);
+    tcg_regset_reset_reg(tcg_target_call_clobber_regs, TCG_REG_V8);
+    tcg_regset_reset_reg(tcg_target_call_clobber_regs, TCG_REG_V9);
+    tcg_regset_reset_reg(tcg_target_call_clobber_regs, TCG_REG_V10);
+    tcg_regset_reset_reg(tcg_target_call_clobber_regs, TCG_REG_V11);
+    tcg_regset_reset_reg(tcg_target_call_clobber_regs, TCG_REG_V12);
+    tcg_regset_reset_reg(tcg_target_call_clobber_regs, TCG_REG_V13);
+    tcg_regset_reset_reg(tcg_target_call_clobber_regs, TCG_REG_V14);
+    tcg_regset_reset_reg(tcg_target_call_clobber_regs, TCG_REG_V15);
 
     s->reserved_regs = 0;
     tcg_regset_set_reg(s->reserved_regs, TCG_REG_SP);
-- 
2.13.5

^ permalink raw reply related	[flat|nested] 14+ messages in thread

* Re: [Qemu-devel] [PATCH v3 0/6] TCG vectorization and example conversion
  2017-09-16  2:34 [Qemu-devel] [PATCH v3 0/6] TCG vectorization and example conversion Richard Henderson
                   ` (5 preceding siblings ...)
  2017-09-16  2:34 ` [Qemu-devel] [PATCH v3 6/6] tcg/aarch64: " Richard Henderson
@ 2017-09-16  2:35 ` Richard Henderson
  2017-09-26 22:58 ` no-reply
  7 siblings, 0 replies; 14+ messages in thread
From: Richard Henderson @ 2017-09-16  2:35 UTC (permalink / raw)
  To: qemu-devel; +Cc: alex.bennee, f4bug

On 09/15/2017 07:34 PM, Richard Henderson wrote:
> Now addressing the complex vector op issue.  I now expose TCGv_vec
> to target front-ends, but opaque wrt the vector size.  One can thus
> compose vector operations, as demonstrated in target/arm/.
> 
> The actual host vector length now becomes an argument to the *_vec
> opcodes.  It's a little awkward, but does prevent an explosion of
> opcode values.
> 
> All R-b dropped because all patches rewritten or heavily modified.

Bah.  Forgot to mention that this depends on tcg-next.  Full tree at

  git://github.com/rth7680/qemu.git native-vector-registers-3


r~

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [Qemu-devel] [PATCH v3 1/6] tcg: Add types and operations for host vectors
  2017-09-16  2:34 ` [Qemu-devel] [PATCH v3 1/6] tcg: Add types and operations for host vectors Richard Henderson
@ 2017-09-26 19:28   ` Alex Bennée
  2017-09-27 16:18     ` Richard Henderson
  0 siblings, 1 reply; 14+ messages in thread
From: Alex Bennée @ 2017-09-26 19:28 UTC (permalink / raw)
  To: Richard Henderson; +Cc: qemu-devel, f4bug


Richard Henderson <richard.henderson@linaro.org> writes:

> Nothing uses or enables them yet.
>
> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
> ---
>  tcg/tcg-op.h  |  26 +++++++
>  tcg/tcg-opc.h |  37 ++++++++++
>  tcg/tcg.h     |  34 +++++++++
>  tcg/tcg-op.c  | 234 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
>  tcg/tcg.c     |  77 ++++++++++++++++++-
>  tcg/README    |  46 ++++++++++++
>  6 files changed, 453 insertions(+), 1 deletion(-)
>
> diff --git a/tcg/tcg-op.h b/tcg/tcg-op.h
> index 5d3278f243..b9b0b9f46f 100644
> --- a/tcg/tcg-op.h
> +++ b/tcg/tcg-op.h
> @@ -915,6 +915,32 @@ void tcg_gen_atomic_or_fetch_i64(TCGv_i64, TCGv, TCGv_i64, TCGArg, TCGMemOp);
>  void tcg_gen_atomic_xor_fetch_i32(TCGv_i32, TCGv, TCGv_i32, TCGArg, TCGMemOp);
>  void tcg_gen_atomic_xor_fetch_i64(TCGv_i64, TCGv, TCGv_i64, TCGArg, TCGMemOp);
>
> +void tcg_gen_mov_vec(TCGv_vec, TCGv_vec);
> +void tcg_gen_movi_vec(TCGv_vec, tcg_target_long);
> +void tcg_gen_add8_vec(TCGv_vec r, TCGv_vec a, TCGv_vec b);
> +void tcg_gen_add16_vec(TCGv_vec r, TCGv_vec a, TCGv_vec b);
> +void tcg_gen_add32_vec(TCGv_vec r, TCGv_vec a, TCGv_vec b);
> +void tcg_gen_add64_vec(TCGv_vec r, TCGv_vec a, TCGv_vec b);
> +void tcg_gen_sub8_vec(TCGv_vec r, TCGv_vec a, TCGv_vec b);
> +void tcg_gen_sub16_vec(TCGv_vec r, TCGv_vec a, TCGv_vec b);
> +void tcg_gen_sub32_vec(TCGv_vec r, TCGv_vec a, TCGv_vec b);
> +void tcg_gen_sub64_vec(TCGv_vec r, TCGv_vec a, TCGv_vec b);
> +void tcg_gen_and_vec(TCGv_vec r, TCGv_vec a, TCGv_vec b);
> +void tcg_gen_or_vec(TCGv_vec r, TCGv_vec a, TCGv_vec b);
> +void tcg_gen_xor_vec(TCGv_vec r, TCGv_vec a, TCGv_vec b);
> +void tcg_gen_andc_vec(TCGv_vec r, TCGv_vec a, TCGv_vec b);
> +void tcg_gen_orc_vec(TCGv_vec r, TCGv_vec a, TCGv_vec b);
> +void tcg_gen_not_vec(TCGv_vec r, TCGv_vec a);
> +void tcg_gen_neg8_vec(TCGv_vec r, TCGv_vec a);
> +void tcg_gen_neg16_vec(TCGv_vec r, TCGv_vec a);
> +void tcg_gen_neg32_vec(TCGv_vec r, TCGv_vec a);
> +void tcg_gen_neg64_vec(TCGv_vec r, TCGv_vec a);
> +
> +void tcg_gen_ld_vec(TCGv_vec r, TCGv_ptr base, TCGArg offset);
> +void tcg_gen_st_vec(TCGv_vec r, TCGv_ptr base, TCGArg offset);
> +void tcg_gen_ldz_vec(TCGv_vec r, TCGv_ptr base, TCGArg offset, TCGType sz);
> +void tcg_gen_stl_vec(TCGv_vec r, TCGv_ptr base, TCGArg offset, TCGType sz);
> +
>  #if TARGET_LONG_BITS == 64
>  #define tcg_gen_movi_tl tcg_gen_movi_i64
>  #define tcg_gen_mov_tl tcg_gen_mov_i64
> diff --git a/tcg/tcg-opc.h b/tcg/tcg-opc.h
> index 956fb1e9f3..8200184fa9 100644
> --- a/tcg/tcg-opc.h
> +++ b/tcg/tcg-opc.h
> @@ -204,8 +204,45 @@ DEF(qemu_ld_i64, DATA64_ARGS, TLADDR_ARGS, 1,
>  DEF(qemu_st_i64, 0, TLADDR_ARGS + DATA64_ARGS, 1,
>      TCG_OPF_CALL_CLOBBER | TCG_OPF_SIDE_EFFECTS | TCG_OPF_64BIT)
>
> +/* Host vector support.  */
> +
> +#define IMPLVEC  \
> +    IMPL(TCG_TARGET_HAS_v64 | TCG_TARGET_HAS_v128 | TCG_TARGET_HAS_v256)
> +
> +DEF(mov_vec, 1, 1, 1, TCG_OPF_NOT_PRESENT)
> +
> +/* ??? Simple, but perhaps dupiN would be more descriptive.  */
> +DEF(movi_vec, 1, 0, 2, TCG_OPF_NOT_PRESENT)
> +
> +DEF(ld_vec, 1, 1, 2, IMPLVEC)
> +DEF(ldz_vec, 1, 1, 3, IMPLVEC)
> +DEF(st_vec, 0, 2, 2, IMPLVEC)
> +
> +DEF(add8_vec, 1, 2, 1, IMPLVEC)
> +DEF(add16_vec, 1, 2, 1, IMPLVEC)
> +DEF(add32_vec, 1, 2, 1, IMPLVEC)
> +DEF(add64_vec, 1, 2, 1, IMPLVEC)
> +
> +DEF(sub8_vec, 1, 2, 1, IMPLVEC)
> +DEF(sub16_vec, 1, 2, 1, IMPLVEC)
> +DEF(sub32_vec, 1, 2, 1, IMPLVEC)
> +DEF(sub64_vec, 1, 2, 1, IMPLVEC)
> +
> +DEF(neg8_vec, 1, 1, 1, IMPLVEC | IMPL(TCG_TARGET_HAS_neg_vec))
> +DEF(neg16_vec, 1, 1, 1, IMPLVEC | IMPL(TCG_TARGET_HAS_neg_vec))
> +DEF(neg32_vec, 1, 1, 1, IMPLVEC | IMPL(TCG_TARGET_HAS_neg_vec))
> +DEF(neg64_vec, 1, 1, 1, IMPLVEC | IMPL(TCG_TARGET_HAS_neg_vec))
> +
> +DEF(and_vec, 1, 2, 1, IMPLVEC)
> +DEF(or_vec, 1, 2, 1, IMPLVEC)
> +DEF(xor_vec, 1, 2, 1, IMPLVEC)
> +DEF(andc_vec, 1, 2, 1, IMPLVEC | IMPL(TCG_TARGET_HAS_andc_vec))
> +DEF(orc_vec, 1, 2, 1, IMPLVEC | IMPL(TCG_TARGET_HAS_orc_vec))
> +DEF(not_vec, 1, 1, 1, IMPLVEC | IMPL(TCG_TARGET_HAS_not_vec))
> +
>  #undef TLADDR_ARGS
>  #undef DATA64_ARGS
>  #undef IMPL
>  #undef IMPL64
> +#undef IMPLVEC
>  #undef DEF
> diff --git a/tcg/tcg.h b/tcg/tcg.h
> index 25662c36d4..7cd356e87f 100644
> --- a/tcg/tcg.h
> +++ b/tcg/tcg.h
> @@ -173,6 +173,16 @@ typedef uint64_t TCGRegSet;
>  # error "Missing unsigned widening multiply"
>  #endif
>
> +#ifndef TCG_TARGET_HAS_v64
> +#define TCG_TARGET_HAS_v64              0
> +#define TCG_TARGET_HAS_v128             0
> +#define TCG_TARGET_HAS_v256             0
> +#define TCG_TARGET_HAS_neg_vec          0
> +#define TCG_TARGET_HAS_not_vec          0
> +#define TCG_TARGET_HAS_andc_vec         0
> +#define TCG_TARGET_HAS_orc_vec          0
> +#endif
> +
>  #ifndef TARGET_INSN_START_EXTRA_WORDS
>  # define TARGET_INSN_START_WORDS 1
>  #else
> @@ -249,6 +259,11 @@ typedef struct TCGPool {
>  typedef enum TCGType {
>      TCG_TYPE_I32,
>      TCG_TYPE_I64,
> +
> +    TCG_TYPE_V64,
> +    TCG_TYPE_V128,
> +    TCG_TYPE_V256,
> +
>      TCG_TYPE_COUNT, /* number of different types */
>
>      /* An alias for the size of the host register.  */
> @@ -399,6 +414,8 @@ typedef tcg_target_ulong TCGArg;
>      * TCGv_i32 : 32 bit integer type
>      * TCGv_i64 : 64 bit integer type
>      * TCGv_ptr : a host pointer type
> +    * TCGv_vec : a host vector type; the exact size is not exposed
> +                 to the CPU front-end code.

Isn't this a guest vector type (which is pointed to by a host pointer)?

>      * TCGv : an integer type the same size as target_ulong
>               (an alias for either TCGv_i32 or TCGv_i64)
>     The compiler's type checking will complain if you mix them
> @@ -424,6 +441,7 @@ typedef tcg_target_ulong TCGArg;
>  typedef struct TCGv_i32_d *TCGv_i32;
>  typedef struct TCGv_i64_d *TCGv_i64;
>  typedef struct TCGv_ptr_d *TCGv_ptr;
> +typedef struct TCGv_vec_d *TCGv_vec;
>  typedef TCGv_ptr TCGv_env;
>  #if TARGET_LONG_BITS == 32
>  #define TCGv TCGv_i32
> @@ -448,6 +466,11 @@ static inline TCGv_ptr QEMU_ARTIFICIAL MAKE_TCGV_PTR(intptr_t i)
>      return (TCGv_ptr)i;
>  }
>
> +static inline TCGv_vec QEMU_ARTIFICIAL MAKE_TCGV_VEC(intptr_t i)
> +{
> +    return (TCGv_vec)i;
> +}
> +
>  static inline intptr_t QEMU_ARTIFICIAL GET_TCGV_I32(TCGv_i32 t)
>  {
>      return (intptr_t)t;
> @@ -463,6 +486,11 @@ static inline intptr_t QEMU_ARTIFICIAL GET_TCGV_PTR(TCGv_ptr t)
>      return (intptr_t)t;
>  }
>
> +static inline intptr_t QEMU_ARTIFICIAL GET_TCGV_VEC(TCGv_vec t)
> +{
> +    return (intptr_t)t;
> +}
> +
>  #if TCG_TARGET_REG_BITS == 32
>  #define TCGV_LOW(t) MAKE_TCGV_I32(GET_TCGV_I64(t))
>  #define TCGV_HIGH(t) MAKE_TCGV_I32(GET_TCGV_I64(t) + 1)
> @@ -471,15 +499,18 @@ static inline intptr_t QEMU_ARTIFICIAL GET_TCGV_PTR(TCGv_ptr t)
>  #define TCGV_EQUAL_I32(a, b) (GET_TCGV_I32(a) == GET_TCGV_I32(b))
>  #define TCGV_EQUAL_I64(a, b) (GET_TCGV_I64(a) == GET_TCGV_I64(b))
>  #define TCGV_EQUAL_PTR(a, b) (GET_TCGV_PTR(a) == GET_TCGV_PTR(b))
> +#define TCGV_EQUAL_VEC(a, b) (GET_TCGV_VEC(a) == GET_TCGV_VEC(b))
>
>  /* Dummy definition to avoid compiler warnings.  */
>  #define TCGV_UNUSED_I32(x) x = MAKE_TCGV_I32(-1)
>  #define TCGV_UNUSED_I64(x) x = MAKE_TCGV_I64(-1)
>  #define TCGV_UNUSED_PTR(x) x = MAKE_TCGV_PTR(-1)
> +#define TCGV_UNUSED_VEC(x) x = MAKE_TCGV_VEC(-1)
>
>  #define TCGV_IS_UNUSED_I32(x) (GET_TCGV_I32(x) == -1)
>  #define TCGV_IS_UNUSED_I64(x) (GET_TCGV_I64(x) == -1)
>  #define TCGV_IS_UNUSED_PTR(x) (GET_TCGV_PTR(x) == -1)
> +#define TCGV_IS_UNUSED_VEC(x) (GET_TCGV_VEC(x) == -1)
>
>  /* call flags */
>  /* Helper does not read globals (either directly or through an exception). It
> @@ -790,9 +821,12 @@ TCGv_i64 tcg_global_reg_new_i64(TCGReg reg, const char *name);
>
>  TCGv_i32 tcg_temp_new_internal_i32(int temp_local);
>  TCGv_i64 tcg_temp_new_internal_i64(int temp_local);
> +TCGv_vec tcg_temp_new_vec(TCGType type);
> +TCGv_vec tcg_temp_new_vec_matching(TCGv_vec match);
>
>  void tcg_temp_free_i32(TCGv_i32 arg);
>  void tcg_temp_free_i64(TCGv_i64 arg);
> +void tcg_temp_free_vec(TCGv_vec arg);
>
>  static inline TCGv_i32 tcg_global_mem_new_i32(TCGv_ptr reg, intptr_t offset,
>                                                const char *name)
> diff --git a/tcg/tcg-op.c b/tcg/tcg-op.c
> index 688d91755b..50b3177e5f 100644
> --- a/tcg/tcg-op.c
> +++ b/tcg/tcg-op.c
> @@ -3072,3 +3072,237 @@ static void tcg_gen_mov2_i64(TCGv_i64 r, TCGv_i64 a, TCGv_i64 b)
>  GEN_ATOMIC_HELPER(xchg, mov2, 0)
>
>  #undef GEN_ATOMIC_HELPER
> +
> +static void tcg_gen_op2_vec(TCGOpcode opc, TCGv_vec r, TCGv_vec a)
> +{
> +    TCGArg ri = GET_TCGV_VEC(r);
> +    TCGArg ai = GET_TCGV_VEC(a);
> +    TCGTemp *rt = &tcg_ctx.temps[ri];
> +    TCGTemp *at = &tcg_ctx.temps[ai];
> +    TCGType type = rt->base_type;
> +
> +    tcg_debug_assert(at->base_type == type);
> +    tcg_gen_op3(&tcg_ctx, opc, ri, ai, type - TCG_TYPE_V64);
> +}
> +
> +static void tcg_gen_op3_vec(TCGOpcode opc, TCGv_vec r, TCGv_vec a, TCGv_vec b)
> +{
> +    TCGArg ri = GET_TCGV_VEC(r);
> +    TCGArg ai = GET_TCGV_VEC(a);
> +    TCGArg bi = GET_TCGV_VEC(b);
> +    TCGTemp *rt = &tcg_ctx.temps[ri];
> +    TCGTemp *at = &tcg_ctx.temps[ai];
> +    TCGTemp *bt = &tcg_ctx.temps[bi];
> +    TCGType type = rt->base_type;
> +
> +    tcg_debug_assert(at->base_type == type);
> +    tcg_debug_assert(bt->base_type == type);
> +    tcg_gen_op4(&tcg_ctx, opc, ri, ai, bi, type - TCG_TYPE_V64);
> +}
> +
> +void tcg_gen_mov_vec(TCGv_vec r, TCGv_vec a)
> +{
> +    if (!TCGV_EQUAL_VEC(r, a)) {
> +        tcg_gen_op2_vec(INDEX_op_mov_vec, r, a);
> +    }
> +}
> +
> +void tcg_gen_movi_vec(TCGv_vec r, tcg_target_long a)
> +{
> +    TCGArg ri = GET_TCGV_VEC(r);
> +    TCGTemp *rt = &tcg_ctx.temps[ri];
> +    TCGType type = rt->base_type;
> +
> +    tcg_debug_assert(a == 0 || a == -1);
> +    tcg_gen_op3(&tcg_ctx, INDEX_op_movi_vec, ri, a, type - TCG_TYPE_V64);
> +}
> +
> +void tcg_gen_ld_vec(TCGv_vec r, TCGv_ptr b, TCGArg o)
> +{
> +    TCGArg ri = GET_TCGV_VEC(r);
> +    TCGArg bi = GET_TCGV_PTR(b);
> +    TCGTemp *rt = &tcg_ctx.temps[ri];
> +    TCGType type = rt->base_type;
> +
> +    tcg_gen_op4(&tcg_ctx, INDEX_op_ld_vec, ri, bi, o, type - TCG_TYPE_V64);
> +}
> +
> +void tcg_gen_st_vec(TCGv_vec r, TCGv_ptr b, TCGArg o)
> +{
> +    TCGArg ri = GET_TCGV_VEC(r);
> +    TCGArg bi = GET_TCGV_PTR(b);
> +    TCGTemp *rt = &tcg_ctx.temps[ri];
> +    TCGType type = rt->base_type;
> +
> +    tcg_gen_op4(&tcg_ctx, INDEX_op_st_vec, ri, bi, o, type - TCG_TYPE_V64);
> +}
> +
> +/* Load data into a vector R from B+O using TYPE.  If R is wider than TYPE,
> +   fill the high bits with zeros.  */
> +void tcg_gen_ldz_vec(TCGv_vec r, TCGv_ptr b, TCGArg o, TCGType type)
> +{
> +    TCGArg ri = GET_TCGV_VEC(r);
> +    TCGArg bi = GET_TCGV_PTR(b);
> +    TCGTemp *rt = &tcg_ctx.temps[ri];
> +    TCGType btype = rt->base_type;
> +
> +    if (type < btype) {
> +        tcg_gen_op5(&tcg_ctx, INDEX_op_ldz_vec, ri, bi, o,
> +                    type - TCG_TYPE_V64, btype - TCG_TYPE_V64);
> +    } else {
> +        tcg_debug_assert(type == btype);
> +        tcg_gen_op4(&tcg_ctx, INDEX_op_ld_vec, ri, bi, o, type - TCG_TYPE_V64);
> +    }
> +}
> +
> +/* Store data from vector R into B+O using TYPE.  If R is wider than TYPE,
> +   store only the low bits.  */
> +void tcg_gen_stl_vec(TCGv_vec r, TCGv_ptr b, TCGArg o, TCGType type)
> +{
> +    TCGArg ri = GET_TCGV_VEC(r);
> +    TCGArg bi = GET_TCGV_PTR(b);
> +    TCGTemp *rt = &tcg_ctx.temps[ri];
> +    TCGType btype = rt->base_type;
> +
> +    tcg_debug_assert(type <= btype);
> +    tcg_gen_op4(&tcg_ctx, INDEX_op_st_vec, ri, bi, o, type - TCG_TYPE_V64);
> +}
> +
> +void tcg_gen_add8_vec(TCGv_vec r, TCGv_vec a, TCGv_vec b)
> +{
> +    tcg_gen_op3_vec(INDEX_op_add8_vec, r, a, b);
> +}
> +
> +void tcg_gen_add16_vec(TCGv_vec r, TCGv_vec a, TCGv_vec b)
> +{
> +    tcg_gen_op3_vec(INDEX_op_add16_vec, r, a, b);
> +}
> +
> +void tcg_gen_add32_vec(TCGv_vec r, TCGv_vec a, TCGv_vec b)
> +{
> +    tcg_gen_op3_vec(INDEX_op_add32_vec, r, a, b);
> +}
> +
> +void tcg_gen_add64_vec(TCGv_vec r, TCGv_vec a, TCGv_vec b)
> +{
> +    tcg_gen_op3_vec(INDEX_op_add64_vec, r, a, b);
> +}
> +
> +void tcg_gen_sub8_vec(TCGv_vec r, TCGv_vec a, TCGv_vec b)
> +{
> +    tcg_gen_op3_vec(INDEX_op_sub8_vec, r, a, b);
> +}
> +
> +void tcg_gen_sub16_vec(TCGv_vec r, TCGv_vec a, TCGv_vec b)
> +{
> +    tcg_gen_op3_vec(INDEX_op_sub16_vec, r, a, b);
> +}
> +
> +void tcg_gen_sub32_vec(TCGv_vec r, TCGv_vec a, TCGv_vec b)
> +{
> +    tcg_gen_op3_vec(INDEX_op_sub32_vec, r, a, b);
> +}
> +
> +void tcg_gen_sub64_vec(TCGv_vec r, TCGv_vec a, TCGv_vec b)
> +{
> +    tcg_gen_op3_vec(INDEX_op_sub64_vec, r, a, b);
> +}
> +
> +void tcg_gen_and_vec(TCGv_vec r, TCGv_vec a, TCGv_vec b)
> +{
> +    tcg_gen_op3_vec(INDEX_op_and_vec, r, a, b);
> +}
> +
> +void tcg_gen_or_vec(TCGv_vec r, TCGv_vec a, TCGv_vec b)
> +{
> +    tcg_gen_op3_vec(INDEX_op_or_vec, r, a, b);
> +}
> +
> +void tcg_gen_xor_vec(TCGv_vec r, TCGv_vec a, TCGv_vec b)
> +{
> +    tcg_gen_op3_vec(INDEX_op_xor_vec, r, a, b);
> +}
> +
> +void tcg_gen_andc_vec(TCGv_vec r, TCGv_vec a, TCGv_vec b)
> +{
> +    if (TCG_TARGET_HAS_andc_vec) {
> +        tcg_gen_op3_vec(INDEX_op_andc_vec, r, a, b);
> +    } else {
> +        TCGv_vec t = tcg_temp_new_vec_matching(r);
> +        tcg_gen_not_vec(t, b);
> +        tcg_gen_and_vec(r, a, t);
> +        tcg_temp_free_vec(t);
> +    }
> +}
> +
> +void tcg_gen_orc_vec(TCGv_vec r, TCGv_vec a, TCGv_vec b)
> +{
> +    if (TCG_TARGET_HAS_orc_vec) {
> +        tcg_gen_op3_vec(INDEX_op_orc_vec, r, a, b);
> +    } else {
> +        TCGv_vec t = tcg_temp_new_vec_matching(r);
> +        tcg_gen_not_vec(t, b);
> +        tcg_gen_or_vec(r, a, t);
> +        tcg_temp_free_vec(t);
> +    }
> +}
> +
> +void tcg_gen_not_vec(TCGv_vec r, TCGv_vec a)
> +{
> +    if (TCG_TARGET_HAS_not_vec) {
> +        tcg_gen_op2_vec(INDEX_op_not_vec, r, a);
> +    } else {
> +        TCGv_vec t = tcg_temp_new_vec_matching(r);
> +        tcg_gen_movi_vec(t, -1);
> +        tcg_gen_xor_vec(r, a, t);
> +        tcg_temp_free_vec(t);
> +    }
> +}
> +
> +void tcg_gen_neg8_vec(TCGv_vec r, TCGv_vec a)
> +{
> +    if (TCG_TARGET_HAS_neg_vec) {
> +        tcg_gen_op2_vec(INDEX_op_neg8_vec, r, a);
> +    } else {
> +        TCGv_vec t = tcg_temp_new_vec_matching(r);
> +        tcg_gen_movi_vec(t, 0);
> +        tcg_gen_sub8_vec(r, t, a);
> +        tcg_temp_free_vec(t);
> +    }
> +}
> +
> +void tcg_gen_neg16_vec(TCGv_vec r, TCGv_vec a)
> +{
> +    if (TCG_TARGET_HAS_neg_vec) {
> +        tcg_gen_op2_vec(INDEX_op_neg16_vec, r, a);
> +    } else {
> +        TCGv_vec t = tcg_temp_new_vec_matching(r);
> +        tcg_gen_movi_vec(t, 0);
> +        tcg_gen_sub16_vec(r, t, a);
> +        tcg_temp_free_vec(t);
> +    }
> +}
> +
> +void tcg_gen_neg32_vec(TCGv_vec r, TCGv_vec a)
> +{
> +    if (TCG_TARGET_HAS_neg_vec) {
> +        tcg_gen_op2_vec(INDEX_op_neg32_vec, r, a);
> +    } else {
> +        TCGv_vec t = tcg_temp_new_vec_matching(r);
> +        tcg_gen_movi_vec(t, 0);
> +        tcg_gen_sub32_vec(r, t, a);
> +        tcg_temp_free_vec(t);
> +    }
> +}
> +
> +void tcg_gen_neg64_vec(TCGv_vec r, TCGv_vec a)
> +{
> +    if (TCG_TARGET_HAS_neg_vec) {
> +        tcg_gen_op2_vec(INDEX_op_neg64_vec, r, a);
> +    } else {
> +        TCGv_vec t = tcg_temp_new_vec_matching(r);
> +        tcg_gen_movi_vec(t, 0);
> +        tcg_gen_sub64_vec(r, t, a);
> +        tcg_temp_free_vec(t);
> +    }
> +}
> diff --git a/tcg/tcg.c b/tcg/tcg.c
> index dff9999bc6..a4d55efdf0 100644
> --- a/tcg/tcg.c
> +++ b/tcg/tcg.c
> @@ -116,7 +116,7 @@ static int tcg_target_const_match(tcg_target_long val, TCGType type,
>  static bool tcg_out_ldst_finalize(TCGContext *s);
>  #endif
>
> -static TCGRegSet tcg_target_available_regs[2];
> +static TCGRegSet tcg_target_available_regs[TCG_TYPE_COUNT];
>  static TCGRegSet tcg_target_call_clobber_regs;
>
>  #if TCG_TARGET_INSN_UNIT_SIZE == 1
> @@ -664,6 +664,44 @@ TCGv_i64 tcg_temp_new_internal_i64(int temp_local)
>      return MAKE_TCGV_I64(idx);
>  }
>
> +TCGv_vec tcg_temp_new_vec(TCGType type)
> +{
> +    int idx;
> +
> +#ifdef CONFIG_DEBUG_TCG
> +    switch (type) {
> +    case TCG_TYPE_V64:
> +        assert(TCG_TARGET_HAS_v64);
> +        break;
> +    case TCG_TYPE_V128:
> +        assert(TCG_TARGET_HAS_v128);
> +        break;
> +    case TCG_TYPE_V256:
> +        assert(TCG_TARGET_HAS_v256);
> +        break;
> +    default:
> +        g_assert_not_reached();
> +    }
> +#endif
> +
> +    idx = tcg_temp_new_internal(type, 0);
> +    return MAKE_TCGV_VEC(idx);
> +}
> +

A one line comment wouldn't go amiss here. This looks like we are
allocating a new temp of the same type as an existing temp?
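Something like the following would do; just a sketch of the wording, not
insisting on it:

  /* Allocate a new temporary with the same vector type as an existing
     temporary MATCH.  */
  TCGv_vec tcg_temp_new_vec_matching(TCGv_vec match);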

> +TCGv_vec tcg_temp_new_vec_matching(TCGv_vec match)
> +{
> +    TCGContext *s = &tcg_ctx;
> +    int idx = GET_TCGV_VEC(match);
> +    TCGTemp *ts;
> +
> +    tcg_debug_assert(idx >= s->nb_globals && idx < s->nb_temps);
> +    ts = &s->temps[idx];
> +    tcg_debug_assert(ts->temp_allocated != 0);
> +
> +    idx = tcg_temp_new_internal(ts->base_type, 0);
> +    return MAKE_TCGV_VEC(idx);
> +}
> +
>  static void tcg_temp_free_internal(int idx)
>  {
>      TCGContext *s = &tcg_ctx;
> @@ -696,6 +734,11 @@ void tcg_temp_free_i64(TCGv_i64 arg)
>      tcg_temp_free_internal(GET_TCGV_I64(arg));
>  }
>
> +void tcg_temp_free_vec(TCGv_vec arg)
> +{
> +    tcg_temp_free_internal(GET_TCGV_VEC(arg));
> +}
> +
>  TCGv_i32 tcg_const_i32(int32_t val)
>  {
>      TCGv_i32 t0;
> @@ -753,6 +796,9 @@ int tcg_check_temp_count(void)
>     Test the runtime variable that controls each opcode.  */
>  bool tcg_op_supported(TCGOpcode op)
>  {
> +    const bool have_vec
> +        = TCG_TARGET_HAS_v64 | TCG_TARGET_HAS_v128 | TCG_TARGET_HAS_v256;
> +
>      switch (op) {
>      case INDEX_op_discard:
>      case INDEX_op_set_label:
> @@ -966,6 +1012,35 @@ bool tcg_op_supported(TCGOpcode op)
>      case INDEX_op_mulsh_i64:
>          return TCG_TARGET_HAS_mulsh_i64;
>
> +    case INDEX_op_mov_vec:
> +    case INDEX_op_movi_vec:
> +    case INDEX_op_ld_vec:
> +    case INDEX_op_ldz_vec:
> +    case INDEX_op_st_vec:
> +    case INDEX_op_add8_vec:
> +    case INDEX_op_add16_vec:
> +    case INDEX_op_add32_vec:
> +    case INDEX_op_add64_vec:
> +    case INDEX_op_sub8_vec:
> +    case INDEX_op_sub16_vec:
> +    case INDEX_op_sub32_vec:
> +    case INDEX_op_sub64_vec:
> +    case INDEX_op_and_vec:
> +    case INDEX_op_or_vec:
> +    case INDEX_op_xor_vec:
> +        return have_vec;
> +    case INDEX_op_not_vec:
> +        return have_vec && TCG_TARGET_HAS_not_vec;
> +    case INDEX_op_neg8_vec:
> +    case INDEX_op_neg16_vec:
> +    case INDEX_op_neg32_vec:
> +    case INDEX_op_neg64_vec:
> +        return have_vec && TCG_TARGET_HAS_neg_vec;
> +    case INDEX_op_andc_vec:
> +        return have_vec && TCG_TARGET_HAS_andc_vec;
> +    case INDEX_op_orc_vec:
> +        return have_vec && TCG_TARGET_HAS_orc_vec;
> +
>      case NB_OPS:
>          break;
>      }
> diff --git a/tcg/README b/tcg/README
> index 03bfb6acd4..3bf3af67db 100644
> --- a/tcg/README
> +++ b/tcg/README
> @@ -503,6 +503,52 @@ of the memory access.
>  For a 32-bit host, qemu_ld/st_i64 is guaranteed to only be used with a
>  64-bit memory access specified in flags.
>
> +********* Host vector operations
> +
> +All of the vector ops have a final constant argument that specifies the
> +length of the vector operation LEN as 64 << LEN bits.

That doesn't scan well. So would a 4 lane operation be encoded as 64 <<
4? Is this because we are using the bottom bits for something?
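
FWIW from the expanders in tcg-op.c above it looks like LEN is derived
from the TCG vector type rather than from a lane count, i.e. (a sketch,
assuming I'm reading the "type - TCG_TYPE_V64" arithmetic correctly):

  /* LEN = log2(vector width in bits / 64):
       len 0 -> 64-bit vector  (TCG_TYPE_V64)
       len 1 -> 128-bit vector (TCG_TYPE_V128)
       len 2 -> 256-bit vector (TCG_TYPE_V256)  */
  len = type - TCG_TYPE_V64;

so "64 << LEN" is the total vector width in bits, independent of the
element size already encoded in the opcode name.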

> +
> +* mov_vec   v0, v1, len
> +* ld_vec    v0, t1, len
> +* st_vec    v0, t1, len
> +
> +  Move, load and store.
> +
> +* movi_vec  v0, c, len
> +
> +  Copy C across the entire vector.
> +  At present the only supported values for C are 0 and -1.

I guess this is why the size is unimportant? This is for clearing or
setting the whole of the vector? What does len mean in this case?
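
Judging from the expanders above, the only uses so far are exactly
that: materialising an all-zeros or all-ones vector for the fallback
paths, e.g. (lifted from tcg_gen_not_vec above):

  TCGv_vec t = tcg_temp_new_vec_matching(r);
  tcg_gen_movi_vec(t, -1);      /* set every bit of t */
  tcg_gen_xor_vec(r, a, t);     /* r = a ^ all-ones, i.e. ~a */
  tcg_temp_free_vec(t);

and len presumably just follows the type of the destination temp, as
for the other generator functions.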

> +
> +* add8_vec    v0, v1, v2, len
> +* add16_vec   v0, v1, v2, len
> +* add32_vec   v0, v1, v2, len
> +* add64_vec   v0, v1, v2, len
> +
> +  v0 = v1 + v2, in elements of 8/16/32/64 bits, across len.
> +
> +* sub8_vec    v0, v1, v2, len
> +* sub16_vec   v0, v1, v2, len
> +* sub32_vec   v0, v1, v2, len
> +* sub64_vec   v0, v1, v2, len
> +
> +  Similarly, v0 = v1 - v2.
> +
> +* neg8_vec    v0, v1, len
> +* neg16_vec   v0, v1, len
> +* neg32_vec   v0, v1, len
> +* neg64_vec   v0, v1, len
> +
> +  Similarly, v0 = -v1.
> +
> +* and_vec     v0, v1, v2, len
> +* or_vec      v0, v1, v2, len
> +* xor_vec     v0, v1, v2, len
> +* andc_vec    v0, v1, v2, len
> +* orc_vec     v0, v1, v2, len
> +* not_vec     v0, v1, len
> +
> +  Similarly, logical operations.

Similarly, logical operations with and without complement?

> +
>  *********
>
>  Note 1: Some shortcuts are defined when the last operand is known to be


--
Alex Bennée

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [Qemu-devel] [PATCH v3 2/6] tcg: Add vector expanders
  2017-09-16  2:34 ` [Qemu-devel] [PATCH v3 2/6] tcg: Add vector expanders Richard Henderson
@ 2017-09-26 22:31   ` Alex Bennée
  0 siblings, 0 replies; 14+ messages in thread
From: Alex Bennée @ 2017-09-26 22:31 UTC (permalink / raw)
  To: Richard Henderson; +Cc: qemu-devel, f4bug


Richard Henderson <richard.henderson@linaro.org> writes:

> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>

Other than live comments:

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
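
For anyone else reading tcg-gvec-desc.h below, the encode/decode round
trip of the descriptor works out like this (a quick sketch, not part of
the patch):

  uint32_t desc = simd_desc(16, 16, 0);  /* 16-byte op, 16-byte register */
  assert(simd_oprsz(desc) == 16);        /* (field + 1) * 8 */
  assert(simd_maxsz(desc) == 16);
  assert(simd_data(desc) == 0);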

> ---
>  Makefile.target              |   2 +-
>  accel/tcg/tcg-runtime.h      |  24 ++
>  tcg/tcg-gvec-desc.h          |  49 +++
>  tcg/tcg-op-gvec.h            | 143 ++++++++
>  accel/tcg/tcg-runtime-gvec.c | 255 +++++++++++++
>  tcg/tcg-op-gvec.c            | 853 +++++++++++++++++++++++++++++++++++++++++++
>  accel/tcg/Makefile.objs      |   2 +-
>  7 files changed, 1326 insertions(+), 2 deletions(-)
>  create mode 100644 tcg/tcg-gvec-desc.h
>  create mode 100644 tcg/tcg-op-gvec.h
>  create mode 100644 accel/tcg/tcg-runtime-gvec.c
>  create mode 100644 tcg/tcg-op-gvec.c
>
> diff --git a/Makefile.target b/Makefile.target
> index 6361f957fb..f9967feef5 100644
> --- a/Makefile.target
> +++ b/Makefile.target
> @@ -94,7 +94,7 @@ all: $(PROGS) stap
>  obj-y += exec.o
>  obj-y += accel/
>  obj-$(CONFIG_TCG) += tcg/tcg.o tcg/tcg-op.o tcg/optimize.o
> -obj-$(CONFIG_TCG) += tcg/tcg-common.o
> +obj-$(CONFIG_TCG) += tcg/tcg-common.o tcg/tcg-op-gvec.o
>  obj-$(CONFIG_TCG_INTERPRETER) += tcg/tci.o
>  obj-$(CONFIG_TCG_INTERPRETER) += disas/tci.o
>  obj-y += fpu/softfloat.o
> diff --git a/accel/tcg/tcg-runtime.h b/accel/tcg/tcg-runtime.h
> index c41d38a557..61c0ce39d3 100644
> --- a/accel/tcg/tcg-runtime.h
> +++ b/accel/tcg/tcg-runtime.h
> @@ -134,3 +134,27 @@ GEN_ATOMIC_HELPERS(xor_fetch)
>  GEN_ATOMIC_HELPERS(xchg)
>
>  #undef GEN_ATOMIC_HELPERS
> +
> +DEF_HELPER_FLAGS_3(gvec_mov, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
> +
> +DEF_HELPER_FLAGS_4(gvec_add8, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
> +DEF_HELPER_FLAGS_4(gvec_add16, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
> +DEF_HELPER_FLAGS_4(gvec_add32, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
> +DEF_HELPER_FLAGS_4(gvec_add64, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
> +
> +DEF_HELPER_FLAGS_4(gvec_sub8, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
> +DEF_HELPER_FLAGS_4(gvec_sub16, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
> +DEF_HELPER_FLAGS_4(gvec_sub32, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
> +DEF_HELPER_FLAGS_4(gvec_sub64, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
> +
> +DEF_HELPER_FLAGS_3(gvec_neg8, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
> +DEF_HELPER_FLAGS_3(gvec_neg16, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
> +DEF_HELPER_FLAGS_3(gvec_neg32, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
> +DEF_HELPER_FLAGS_3(gvec_neg64, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
> +
> +DEF_HELPER_FLAGS_3(gvec_not, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
> +DEF_HELPER_FLAGS_4(gvec_and, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
> +DEF_HELPER_FLAGS_4(gvec_or, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
> +DEF_HELPER_FLAGS_4(gvec_xor, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
> +DEF_HELPER_FLAGS_4(gvec_andc, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
> +DEF_HELPER_FLAGS_4(gvec_orc, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
> diff --git a/tcg/tcg-gvec-desc.h b/tcg/tcg-gvec-desc.h
> new file mode 100644
> index 0000000000..8ba9a8168d
> --- /dev/null
> +++ b/tcg/tcg-gvec-desc.h
> @@ -0,0 +1,49 @@
> +/*
> + *  Generic vector operation descriptor
> + *
> + *  Copyright (c) 2017 Linaro
> + *
> + * This library is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2 of the License, or (at your option) any later version.
> + *
> + * This library is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with this library; if not, see <http://www.gnu.org/licenses/>.
> + */
> +
> +/* ??? These bit widths are set for ARM SVE, maxing out at 256 byte vectors. */
> +#define SIMD_OPRSZ_SHIFT   0
> +#define SIMD_OPRSZ_BITS    5
> +
> +#define SIMD_MAXSZ_SHIFT   (SIMD_OPRSZ_SHIFT + SIMD_OPRSZ_BITS)
> +#define SIMD_MAXSZ_BITS    5
> +
> +#define SIMD_DATA_SHIFT    (SIMD_MAXSZ_SHIFT + SIMD_MAXSZ_BITS)
> +#define SIMD_DATA_BITS     (32 - SIMD_DATA_SHIFT)
> +
> +/* Create a descriptor from components.  */
> +uint32_t simd_desc(uint32_t oprsz, uint32_t maxsz, int32_t data);
> +
> +/* Extract the operation size from a descriptor.  */
> +static inline intptr_t simd_oprsz(uint32_t desc)
> +{
> +    return (extract32(desc, SIMD_OPRSZ_SHIFT, SIMD_OPRSZ_BITS) + 1) * 8;
> +}
> +
> +/* Extract the max vector size from a descriptor.  */
> +static inline intptr_t simd_maxsz(uint32_t desc)
> +{
> +    return (extract32(desc, SIMD_MAXSZ_SHIFT, SIMD_MAXSZ_BITS) + 1) * 8;
> +}
> +
> +/* Extract the operation-specific data from a descriptor.  */
> +static inline int32_t simd_data(uint32_t desc)
> +{
> +    return sextract32(desc, SIMD_DATA_SHIFT, SIMD_DATA_BITS);
> +}
> diff --git a/tcg/tcg-op-gvec.h b/tcg/tcg-op-gvec.h
> new file mode 100644
> index 0000000000..28bd77f1dc
> --- /dev/null
> +++ b/tcg/tcg-op-gvec.h
> @@ -0,0 +1,143 @@
> +/*
> + *  Generic vector operation expansion
> + *
> + *  Copyright (c) 2017 Linaro
> + *
> + * This library is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2 of the License, or (at your option) any later version.
> + *
> + * This library is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with this library; if not, see <http://www.gnu.org/licenses/>.
> + */
> +
> +/*
> + * "Generic" vectors.  All operands are given as offsets from ENV,
> + * and therefore cannot also be allocated via tcg_global_mem_new_*.
> + * OPRSZ is the byte size of the vector upon which the operation is performed.
> + * MAXSZ is the byte size of the full vector; bytes beyond OPSZ are cleared.
> + *
> + * All sizes must be 8 or any multiple of 16.
> + * When OPRSZ is 8, the alignment may be 8, otherwise must be 16.
> + * Operands may completely, but not partially, overlap.
> + */
> +
> +/* Expand a call to a gvec-style helper, with pointers to two vector
> +   operands, and a descriptor (see tcg-gvec-desc.h).  */
> +typedef void (gen_helper_gvec_2)(TCGv_ptr, TCGv_ptr, TCGv_i32);
> +void tcg_gen_gvec_2_ool(uint32_t dofs, uint32_t aofs,
> +                        uint32_t oprsz, uint32_t maxsz, int32_t data,
> +                        gen_helper_gvec_2 *fn);
> +
> +/* Similarly, passing an extra pointer (e.g. env or float_status).  */
> +typedef void (gen_helper_gvec_2_ptr)(TCGv_ptr, TCGv_ptr, TCGv_ptr, TCGv_i32);
> +void tcg_gen_gvec_2_ptr(uint32_t dofs, uint32_t aofs,
> +                        TCGv_ptr ptr, uint32_t oprsz, uint32_t maxsz,
> +                        int32_t data, gen_helper_gvec_2_ptr *fn);
> +
> +/* Similarly, with three vector operands.  */
> +typedef void (gen_helper_gvec_3)(TCGv_ptr, TCGv_ptr, TCGv_ptr, TCGv_i32);
> +void tcg_gen_gvec_3_ool(uint32_t dofs, uint32_t aofs, uint32_t bofs,
> +                        uint32_t oprsz, uint32_t maxsz, int32_t data,
> +                        gen_helper_gvec_3 *fn);
> +
> +typedef void (gen_helper_gvec_3_ptr)(TCGv_ptr, TCGv_ptr, TCGv_ptr,
> +                                     TCGv_ptr, TCGv_i32);
> +void tcg_gen_gvec_3_ptr(uint32_t dofs, uint32_t aofs, uint32_t bofs,
> +                        TCGv_ptr ptr, uint32_t oprsz, uint32_t maxsz,
> +                        int32_t data, gen_helper_gvec_3_ptr *fn);
> +
> +/* Expand a gvec operation.  Either inline or out-of-line depending on
> +   the actual vector size and the operations supported by the host.  */
> +typedef struct {
> +    /* Expand inline as a 64-bit or 32-bit integer.
> +       Only one of these will be non-NULL.  */
> +    void (*fni8)(TCGv_i64, TCGv_i64);
> +    void (*fni4)(TCGv_i32, TCGv_i32);
> +    /* Expand inline with a host vector type.  */
> +    void (*fniv)(TCGv_vec, TCGv_vec);
> +    /* Expand out-of-line helper w/descriptor.  */
> +    gen_helper_gvec_2 *fno;
> +    /* Prefer i64 to v64.  */
> +    bool prefer_i64;
> +} GVecGen2;
> +
> +typedef struct {
> +    /* Expand inline as a 64-bit or 32-bit integer.
> +       Only one of these will be non-NULL.  */
> +    void (*fni8)(TCGv_i64, TCGv_i64, TCGv_i64);
> +    void (*fni4)(TCGv_i32, TCGv_i32, TCGv_i32);
> +    /* Expand inline with a host vector type.  */
> +    void (*fniv)(TCGv_vec, TCGv_vec, TCGv_vec);
> +    /* Expand out-of-line helper w/descriptor.  */
> +    gen_helper_gvec_3 *fno;
> +    /* Prefer i64 to v64.  */
> +    bool prefer_i64;
> +    /* Load dest as a 3rd source operand.  */
> +    bool load_dest;
> +} GVecGen3;
> +
> +void tcg_gen_gvec_2(uint32_t dofs, uint32_t aofs,
> +                    uint32_t opsz, uint32_t clsz, const GVecGen2 *);
> +void tcg_gen_gvec_3(uint32_t dofs, uint32_t aofs, uint32_t bofs,
> +                    uint32_t opsz, uint32_t clsz, const GVecGen3 *);
> +
> +/* Expand a specific vector operation.  */
> +
> +#define DEF(X) \
> +    void tcg_gen_gvec_##X(uint32_t dofs, uint32_t aofs, \
> +                          uint32_t opsz, uint32_t clsz)
> +
> +DEF(mov);
> +DEF(not);
> +DEF(neg8);
> +DEF(neg16);
> +DEF(neg32);
> +DEF(neg64);
> +
> +#undef DEF
> +#define DEF(X) \
> +    void tcg_gen_gvec_##X(uint32_t dofs, uint32_t aofs, uint32_t bofs, \
> +                          uint32_t opsz, uint32_t clsz)
> +
> +DEF(add8);
> +DEF(add16);
> +DEF(add32);
> +DEF(add64);
> +
> +DEF(sub8);
> +DEF(sub16);
> +DEF(sub32);
> +DEF(sub64);
> +
> +DEF(and);
> +DEF(or);
> +DEF(xor);
> +DEF(andc);
> +DEF(orc);
> +
> +#undef DEF
> +
> +/*
> + * 64-bit vector operations.  Use these when the register has been allocated
> + * with tcg_global_mem_new_i64, and so we cannot also address it via pointer.
> + * OPRSZ = MAXSZ = 8.
> + */
> +
> +void tcg_gen_vec_neg8_i64(TCGv_i64 d, TCGv_i64 a);
> +void tcg_gen_vec_neg16_i64(TCGv_i64 d, TCGv_i64 a);
> +void tcg_gen_vec_neg32_i64(TCGv_i64 d, TCGv_i64 a);
> +
> +void tcg_gen_vec_add8_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b);
> +void tcg_gen_vec_add16_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b);
> +void tcg_gen_vec_add32_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b);
> +
> +void tcg_gen_vec_sub8_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b);
> +void tcg_gen_vec_sub16_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b);
> +void tcg_gen_vec_sub32_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b);
> diff --git a/accel/tcg/tcg-runtime-gvec.c b/accel/tcg/tcg-runtime-gvec.c
> new file mode 100644
> index 0000000000..c75e76367c
> --- /dev/null
> +++ b/accel/tcg/tcg-runtime-gvec.c
> @@ -0,0 +1,255 @@
> +/*
> + *  Generic vectorized operation runtime
> + *
> + *  Copyright (c) 2017 Linaro
> + *
> + * This library is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2 of the License, or (at your option) any later version.
> + *
> + * This library is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with this library; if not, see <http://www.gnu.org/licenses/>.
> + */
> +
> +#include "qemu/osdep.h"
> +#include "qemu/host-utils.h"
> +#include "cpu.h"
> +#include "exec/helper-proto.h"
> +#include "tcg-gvec-desc.h"
> +
> +
> +/* Virtually all hosts support 16-byte vectors.  Those that don't can emulate
> +   them via GCC's generic vector extension.  This turns out to be simpler and
> +   more reliable than getting the compiler to autovectorize.
> +
> +   In tcg-op-gvec.c, we asserted that both the size and alignment
> +   of the data are multiples of 16.  */
> +
> +typedef uint8_t vec8 __attribute__((vector_size(16)));
> +typedef uint16_t vec16 __attribute__((vector_size(16)));
> +typedef uint32_t vec32 __attribute__((vector_size(16)));
> +typedef uint64_t vec64 __attribute__((vector_size(16)));
> +
> +static inline void clear_high(void *d, intptr_t oprsz, uint32_t desc)
> +{
> +    intptr_t maxsz = simd_maxsz(desc);
> +    intptr_t i;
> +
> +    if (unlikely(maxsz > oprsz)) {
> +        for (i = oprsz; i < maxsz; i += sizeof(vec64)) {
> +            *(vec64 *)(d + i) = (vec64){ 0 };
> +        }
> +    }
> +}
> +
> +void HELPER(gvec_add8)(void *d, void *a, void *b, uint32_t desc)
> +{
> +    intptr_t oprsz = simd_oprsz(desc);
> +    intptr_t i;
> +
> +    for (i = 0; i < oprsz; i += sizeof(vec8)) {
> +        *(vec8 *)(d + i) = *(vec8 *)(a + i) + *(vec8 *)(b + i);
> +    }
> +    clear_high(d, oprsz, desc);
> +}
> +
> +void HELPER(gvec_add16)(void *d, void *a, void *b, uint32_t desc)
> +{
> +    intptr_t oprsz = simd_oprsz(desc);
> +    intptr_t i;
> +
> +    for (i = 0; i < oprsz; i += sizeof(vec16)) {
> +        *(vec16 *)(d + i) = *(vec16 *)(a + i) + *(vec16 *)(b + i);
> +    }
> +    clear_high(d, oprsz, desc);
> +}
> +
> +void HELPER(gvec_add32)(void *d, void *a, void *b, uint32_t desc)
> +{
> +    intptr_t oprsz = simd_oprsz(desc);
> +    intptr_t i;
> +
> +    for (i = 0; i < oprsz; i += sizeof(vec32)) {
> +        *(vec32 *)(d + i) = *(vec32 *)(a + i) + *(vec32 *)(b + i);
> +    }
> +    clear_high(d, oprsz, desc);
> +}
> +
> +void HELPER(gvec_add64)(void *d, void *a, void *b, uint32_t desc)
> +{
> +    intptr_t oprsz = simd_oprsz(desc);
> +    intptr_t i;
> +
> +    for (i = 0; i < oprsz; i += sizeof(vec64)) {
> +        *(vec64 *)(d + i) = *(vec64 *)(a + i) + *(vec64 *)(b + i);
> +    }
> +    clear_high(d, oprsz, desc);
> +}
> +
> +void HELPER(gvec_sub8)(void *d, void *a, void *b, uint32_t desc)
> +{
> +    intptr_t oprsz = simd_oprsz(desc);
> +    intptr_t i;
> +
> +    for (i = 0; i < oprsz; i += sizeof(vec8)) {
> +        *(vec8 *)(d + i) = *(vec8 *)(a + i) - *(vec8 *)(b + i);
> +    }
> +    clear_high(d, oprsz, desc);
> +}
> +
> +void HELPER(gvec_sub16)(void *d, void *a, void *b, uint32_t desc)
> +{
> +    intptr_t oprsz = simd_oprsz(desc);
> +    intptr_t i;
> +
> +    for (i = 0; i < oprsz; i += sizeof(vec16)) {
> +        *(vec16 *)(d + i) = *(vec16 *)(a + i) - *(vec16 *)(b + i);
> +    }
> +    clear_high(d, oprsz, desc);
> +}
> +
> +void HELPER(gvec_sub32)(void *d, void *a, void *b, uint32_t desc)
> +{
> +    intptr_t oprsz = simd_oprsz(desc);
> +    intptr_t i;
> +
> +    for (i = 0; i < oprsz; i += sizeof(vec32)) {
> +        *(vec32 *)(d + i) = *(vec32 *)(a + i) - *(vec32 *)(b + i);
> +    }
> +    clear_high(d, oprsz, desc);
> +}
> +
> +void HELPER(gvec_sub64)(void *d, void *a, void *b, uint32_t desc)
> +{
> +    intptr_t oprsz = simd_oprsz(desc);
> +    intptr_t i;
> +
> +    for (i = 0; i < oprsz; i += sizeof(vec64)) {
> +        *(vec64 *)(d + i) = *(vec64 *)(a + i) - *(vec64 *)(b + i);
> +    }
> +    clear_high(d, oprsz, desc);
> +}
> +
> +void HELPER(gvec_neg8)(void *d, void *a, uint32_t desc)
> +{
> +    intptr_t oprsz = simd_oprsz(desc);
> +    intptr_t i;
> +
> +    for (i = 0; i < oprsz; i += sizeof(vec8)) {
> +        *(vec8 *)(d + i) = -*(vec8 *)(a + i);
> +    }
> +    clear_high(d, oprsz, desc);
> +}
> +
> +void HELPER(gvec_neg16)(void *d, void *a, uint32_t desc)
> +{
> +    intptr_t oprsz = simd_oprsz(desc);
> +    intptr_t i;
> +
> +    for (i = 0; i < oprsz; i += sizeof(vec16)) {
> +        *(vec16 *)(d + i) = -*(vec16 *)(a + i);
> +    }
> +    clear_high(d, oprsz, desc);
> +}
> +
> +void HELPER(gvec_neg32)(void *d, void *a, uint32_t desc)
> +{
> +    intptr_t oprsz = simd_oprsz(desc);
> +    intptr_t i;
> +
> +    for (i = 0; i < oprsz; i += sizeof(vec32)) {
> +        *(vec32 *)(d + i) = -*(vec32 *)(a + i);
> +    }
> +    clear_high(d, oprsz, desc);
> +}
> +
> +void HELPER(gvec_neg64)(void *d, void *a, uint32_t desc)
> +{
> +    intptr_t oprsz = simd_oprsz(desc);
> +    intptr_t i;
> +
> +    for (i = 0; i < oprsz; i += sizeof(vec64)) {
> +        *(vec64 *)(d + i) = -*(vec64 *)(a + i);
> +    }
> +    clear_high(d, oprsz, desc);
> +}
> +
> +void HELPER(gvec_mov)(void *d, void *a, uint32_t desc)
> +{
> +    intptr_t oprsz = simd_oprsz(desc);
> +
> +    memcpy(d, a, oprsz);
> +    clear_high(d, oprsz, desc);
> +}
> +
> +void HELPER(gvec_not)(void *d, void *a, uint32_t desc)
> +{
> +    intptr_t oprsz = simd_oprsz(desc);
> +    intptr_t i;
> +
> +    for (i = 0; i < oprsz; i += sizeof(vec64)) {
> +        *(vec64 *)(d + i) = ~*(vec64 *)(a + i);
> +    }
> +    clear_high(d, oprsz, desc);
> +}
> +
> +void HELPER(gvec_and)(void *d, void *a, void *b, uint32_t desc)
> +{
> +    intptr_t oprsz = simd_oprsz(desc);
> +    intptr_t i;
> +
> +    for (i = 0; i < oprsz; i += sizeof(vec64)) {
> +        *(vec64 *)(d + i) = *(vec64 *)(a + i) & *(vec64 *)(b + i);
> +    }
> +    clear_high(d, oprsz, desc);
> +}
> +
> +void HELPER(gvec_or)(void *d, void *a, void *b, uint32_t desc)
> +{
> +    intptr_t oprsz = simd_oprsz(desc);
> +    intptr_t i;
> +
> +    for (i = 0; i < oprsz; i += sizeof(vec64)) {
> +        *(vec64 *)(d + i) = *(vec64 *)(a + i) | *(vec64 *)(b + i);
> +    }
> +    clear_high(d, oprsz, desc);
> +}
> +
> +void HELPER(gvec_xor)(void *d, void *a, void *b, uint32_t desc)
> +{
> +    intptr_t oprsz = simd_oprsz(desc);
> +    intptr_t i;
> +
> +    for (i = 0; i < oprsz; i += sizeof(vec64)) {
> +        *(vec64 *)(d + i) = *(vec64 *)(a + i) ^ *(vec64 *)(b + i);
> +    }
> +    clear_high(d, oprsz, desc);
> +}
> +
> +void HELPER(gvec_andc)(void *d, void *a, void *b, uint32_t desc)
> +{
> +    intptr_t oprsz = simd_oprsz(desc);
> +    intptr_t i;
> +
> +    for (i = 0; i < oprsz; i += sizeof(vec64)) {
> +        *(vec64 *)(d + i) = *(vec64 *)(a + i) &~ *(vec64 *)(b + i);
> +    }
> +    clear_high(d, oprsz, desc);
> +}
> +
> +void HELPER(gvec_orc)(void *d, void *a, void *b, uint32_t desc)
> +{
> +    intptr_t oprsz = simd_oprsz(desc);
> +    intptr_t i;
> +
> +    for (i = 0; i < oprsz; i += sizeof(vec64)) {
> +        *(vec64 *)(d + i) = *(vec64 *)(a + i) |~ *(vec64 *)(b + i);
> +    }
> +    clear_high(d, oprsz, desc);
> +}
> diff --git a/tcg/tcg-op-gvec.c b/tcg/tcg-op-gvec.c
> new file mode 100644
> index 0000000000..7464321eba
> --- /dev/null
> +++ b/tcg/tcg-op-gvec.c
> @@ -0,0 +1,853 @@
> +/*
> + *  Generic vector operation expansion
> + *
> + *  Copyright (c) 2017 Linaro
> + *
> + * This library is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2 of the License, or (at your option) any later version.
> + *
> + * This library is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with this library; if not, see <http://www.gnu.org/licenses/>.
> + */
> +
> +#include "qemu/osdep.h"
> +#include "qemu-common.h"
> +#include "tcg.h"
> +#include "tcg-op.h"
> +#include "tcg-op-gvec.h"
> +#include "tcg-gvec-desc.h"
> +
> +#define REP8(x)    ((x) * 0x0101010101010101ull)
> +#define REP16(x)   ((x) * 0x0001000100010001ull)
> +
> +#define MAX_UNROLL  4
> +
> +/* Verify vector size and alignment rules.  OFS should be the OR of all
> +   of the operand offsets so that we can check them all at once.  */
> +static void check_size_align(uint32_t oprsz, uint32_t maxsz, uint32_t ofs)
> +{
> +    uint32_t align = maxsz > 16 || oprsz >= 16 ? 15 : 7;
> +    tcg_debug_assert(oprsz > 0);
> +    tcg_debug_assert(oprsz <= maxsz);
> +    tcg_debug_assert((oprsz & align) == 0);
> +    tcg_debug_assert((maxsz & align) == 0);
> +    tcg_debug_assert((ofs & align) == 0);
> +}
> +
> +/* Verify vector overlap rules for two operands.  */
> +static void check_overlap_2(uint32_t d, uint32_t a, uint32_t s)
> +{
> +    tcg_debug_assert(d == a || d + s <= a || a + s <= d);
> +}
> +
> +/* Verify vector overlap rules for three operands.  */
> +static void check_overlap_3(uint32_t d, uint32_t a, uint32_t b, uint32_t s)
> +{
> +    check_overlap_2(d, a, s);
> +    check_overlap_2(d, b, s);
> +    check_overlap_2(a, b, s);
> +}
> +
> +/* Create a descriptor from components.  */
> +uint32_t simd_desc(uint32_t oprsz, uint32_t maxsz, int32_t data)
> +{
> +    uint32_t desc = 0;
> +
> +    assert(oprsz % 8 == 0 && oprsz <= (8 << SIMD_OPRSZ_BITS));
> +    assert(maxsz % 8 == 0 && maxsz <= (8 << SIMD_MAXSZ_BITS));
> +    assert(data == sextract32(data, 0, SIMD_DATA_BITS));
> +
> +    oprsz = (oprsz / 8) - 1;
> +    maxsz = (maxsz / 8) - 1;
> +    desc = deposit32(desc, SIMD_OPRSZ_SHIFT, SIMD_OPRSZ_BITS, oprsz);
> +    desc = deposit32(desc, SIMD_MAXSZ_SHIFT, SIMD_MAXSZ_BITS, maxsz);
> +    desc = deposit32(desc, SIMD_DATA_SHIFT, SIMD_DATA_BITS, data);
> +
> +    return desc;
> +}
> +
> +/* Generate a call to a gvec-style helper with two vector operands.  */
> +void tcg_gen_gvec_2_ool(uint32_t dofs, uint32_t aofs,
> +                        uint32_t oprsz, uint32_t maxsz, int32_t data,
> +                        gen_helper_gvec_2 *fn)
> +{
> +    TCGv_ptr a0, a1;
> +    TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));
> +
> +    a0 = tcg_temp_new_ptr();
> +    a1 = tcg_temp_new_ptr();
> +
> +    tcg_gen_addi_ptr(a0, tcg_ctx.tcg_env, dofs);
> +    tcg_gen_addi_ptr(a1, tcg_ctx.tcg_env, aofs);
> +
> +    fn(a0, a1, desc);
> +
> +    tcg_temp_free_ptr(a0);
> +    tcg_temp_free_ptr(a1);
> +    tcg_temp_free_i32(desc);
> +}
> +
> +/* Generate a call to a gvec-style helper with three vector operands.  */
> +void tcg_gen_gvec_3_ool(uint32_t dofs, uint32_t aofs, uint32_t bofs,
> +                        uint32_t oprsz, uint32_t maxsz, int32_t data,
> +                        gen_helper_gvec_3 *fn)
> +{
> +    TCGv_ptr a0, a1, a2;
> +    TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));
> +
> +    a0 = tcg_temp_new_ptr();
> +    a1 = tcg_temp_new_ptr();
> +    a2 = tcg_temp_new_ptr();
> +
> +    tcg_gen_addi_ptr(a0, tcg_ctx.tcg_env, dofs);
> +    tcg_gen_addi_ptr(a1, tcg_ctx.tcg_env, aofs);
> +    tcg_gen_addi_ptr(a2, tcg_ctx.tcg_env, bofs);
> +
> +    fn(a0, a1, a2, desc);
> +
> +    tcg_temp_free_ptr(a0);
> +    tcg_temp_free_ptr(a1);
> +    tcg_temp_free_ptr(a2);
> +    tcg_temp_free_i32(desc);
> +}
> +
> +/* Generate a call to a gvec-style helper with two vector operands
> +   and an extra pointer operand.  */
> +void tcg_gen_gvec_2_ptr(uint32_t dofs, uint32_t aofs,
> +                        TCGv_ptr ptr, uint32_t oprsz, uint32_t maxsz,
> +                        int32_t data, gen_helper_gvec_2_ptr *fn)
> +{
> +    TCGv_ptr a0, a1;
> +    TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));
> +
> +    a0 = tcg_temp_new_ptr();
> +    a1 = tcg_temp_new_ptr();
> +
> +    tcg_gen_addi_ptr(a0, tcg_ctx.tcg_env, dofs);
> +    tcg_gen_addi_ptr(a1, tcg_ctx.tcg_env, aofs);
> +
> +    fn(a0, a1, ptr, desc);
> +
> +    tcg_temp_free_ptr(a0);
> +    tcg_temp_free_ptr(a1);
> +    tcg_temp_free_i32(desc);
> +}
> +
> +/* Generate a call to a gvec-style helper with three vector operands
> +   and an extra pointer operand.  */
> +void tcg_gen_gvec_3_ptr(uint32_t dofs, uint32_t aofs, uint32_t bofs,
> +                        TCGv_ptr ptr, uint32_t oprsz, uint32_t maxsz,
> +                        int32_t data, gen_helper_gvec_3_ptr *fn)
> +{
> +    TCGv_ptr a0, a1, a2;
> +    TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));
> +
> +    a0 = tcg_temp_new_ptr();
> +    a1 = tcg_temp_new_ptr();
> +    a2 = tcg_temp_new_ptr();
> +
> +    tcg_gen_addi_ptr(a0, tcg_ctx.tcg_env, dofs);
> +    tcg_gen_addi_ptr(a1, tcg_ctx.tcg_env, aofs);
> +    tcg_gen_addi_ptr(a2, tcg_ctx.tcg_env, bofs);
> +
> +    fn(a0, a1, a2, ptr, desc);
> +
> +    tcg_temp_free_ptr(a0);
> +    tcg_temp_free_ptr(a1);
> +    tcg_temp_free_ptr(a2);
> +    tcg_temp_free_i32(desc);
> +}
> +
> +/* Return true if we want to implement something of OPRSZ bytes
> +   in units of LNSZ.  This limits the expansion of inline code.  */
> +static inline bool check_size_impl(uint32_t oprsz, uint32_t lnsz)
> +{
> +    uint32_t lnct = oprsz / lnsz;
> +    return lnct >= 1 && lnct <= MAX_UNROLL;
> +}
> +
> +/* Clear MAXSZ bytes at DOFS.  */
> +static void expand_clr(uint32_t dofs, uint32_t maxsz)
> +{
> +    if (maxsz >= 16 && TCG_TARGET_HAS_v128) {
> +        TCGv_vec zero;
> +
> +        if (maxsz >= 32 && TCG_TARGET_HAS_v256) {
> +            zero = tcg_temp_new_vec(TCG_TYPE_V256);
> +            tcg_gen_movi_vec(zero, 0);
> +
> +            for (; maxsz >= 32; dofs += 32, maxsz -= 32) {
> +                tcg_gen_stl_vec(zero, tcg_ctx.tcg_env, dofs, TCG_TYPE_V256);
> +            }
> +        } else {
> +            zero = tcg_temp_new_vec(TCG_TYPE_V128);
> +            tcg_gen_movi_vec(zero, 0);
> +        }
> +        for (; maxsz >= 16; dofs += 16, maxsz -= 16) {
> +            tcg_gen_stl_vec(zero, tcg_ctx.tcg_env, dofs, TCG_TYPE_V128);
> +        }
> +
> +        tcg_temp_free_vec(zero);
> +    } if (TCG_TARGET_REG_BITS == 64) {
> +        TCGv_i64 zero = tcg_const_i64(0);
> +
> +        for (; maxsz >= 8; dofs += 8, maxsz -= 8) {
> +            tcg_gen_st_i64(zero, tcg_ctx.tcg_env, dofs);
> +        }
> +
> +        tcg_temp_free_i64(zero);
> +    } else if (TCG_TARGET_HAS_v64) {
> +        TCGv_vec zero = tcg_temp_new_vec(TCG_TYPE_V64);
> +
> +        tcg_gen_movi_vec(zero, 0);
> +        for (; maxsz >= 8; dofs += 8, maxsz -= 8) {
> +            tcg_gen_st_vec(zero, tcg_ctx.tcg_env, dofs);
> +        }
> +
> +        tcg_temp_free_vec(zero);
> +    } else {
> +        TCGv_i32 zero = tcg_const_i32(0);
> +
> +        for (; maxsz >= 4; dofs += 4, maxsz -= 4) {
> +            tcg_gen_st_i32(zero, tcg_ctx.tcg_env, dofs);
> +        }
> +
> +        tcg_temp_free_i32(zero);
> +    }
> +}
> +
> +/* Expand OPSZ bytes worth of two-operand operations using i32 elements.  */
> +static void expand_2_i32(uint32_t dofs, uint32_t aofs, uint32_t opsz,
> +                         void (*fni)(TCGv_i32, TCGv_i32))
> +{
> +    TCGv_i32 t0 = tcg_temp_new_i32();
> +    uint32_t i;
> +
> +    for (i = 0; i < opsz; i += 4) {
> +        tcg_gen_ld_i32(t0, tcg_ctx.tcg_env, aofs + i);
> +        fni(t0, t0);
> +        tcg_gen_st_i32(t0, tcg_ctx.tcg_env, dofs + i);
> +    }
> +    tcg_temp_free_i32(t0);
> +}
> +
> +/* Expand OPSZ bytes worth of three-operand operations using i32 elements.  */
> +static void expand_3_i32(uint32_t dofs, uint32_t aofs,
> +                         uint32_t bofs, uint32_t opsz, bool load_dest,
> +                         void (*fni)(TCGv_i32, TCGv_i32, TCGv_i32))
> +{
> +    TCGv_i32 t0 = tcg_temp_new_i32();
> +    TCGv_i32 t1 = tcg_temp_new_i32();
> +    TCGv_i32 t2 = tcg_temp_new_i32();
> +    uint32_t i;
> +
> +    for (i = 0; i < opsz; i += 4) {
> +        tcg_gen_ld_i32(t0, tcg_ctx.tcg_env, aofs + i);
> +        tcg_gen_ld_i32(t1, tcg_ctx.tcg_env, bofs + i);
> +        if (load_dest) {
> +            tcg_gen_ld_i32(t2, tcg_ctx.tcg_env, dofs + i);
> +        }
> +        fni(t2, t0, t1);
> +        tcg_gen_st_i32(t2, tcg_ctx.tcg_env, dofs + i);
> +    }
> +    tcg_temp_free_i32(t2);
> +    tcg_temp_free_i32(t1);
> +    tcg_temp_free_i32(t0);
> +}
> +
> +/* Expand OPSZ bytes worth of two-operand operations using i64 elements.  */
> +static void expand_2_i64(uint32_t dofs, uint32_t aofs, uint32_t opsz,
> +                         void (*fni)(TCGv_i64, TCGv_i64))
> +{
> +    TCGv_i64 t0 = tcg_temp_new_i64();
> +    uint32_t i;
> +
> +    for (i = 0; i < opsz; i += 8) {
> +        tcg_gen_ld_i64(t0, tcg_ctx.tcg_env, aofs + i);
> +        fni(t0, t0);
> +        tcg_gen_st_i64(t0, tcg_ctx.tcg_env, dofs + i);
> +    }
> +    tcg_temp_free_i64(t0);
> +}
> +
> +/* Expand OPSZ bytes worth of three-operand operations using i64 elements.  */
> +static void expand_3_i64(uint32_t dofs, uint32_t aofs,
> +                         uint32_t bofs, uint32_t opsz, bool load_dest,
> +                         void (*fni)(TCGv_i64, TCGv_i64, TCGv_i64))
> +{
> +    TCGv_i64 t0 = tcg_temp_new_i64();
> +    TCGv_i64 t1 = tcg_temp_new_i64();
> +    TCGv_i64 t2 = tcg_temp_new_i64();
> +    uint32_t i;
> +
> +    for (i = 0; i < opsz; i += 8) {
> +        tcg_gen_ld_i64(t0, tcg_ctx.tcg_env, aofs + i);
> +        tcg_gen_ld_i64(t1, tcg_ctx.tcg_env, bofs + i);
> +        if (load_dest) {
> +            tcg_gen_ld_i64(t2, tcg_ctx.tcg_env, dofs + i);
> +        }
> +        fni(t2, t0, t1);
> +        tcg_gen_st_i64(t2, tcg_ctx.tcg_env, dofs + i);
> +    }
> +    tcg_temp_free_i64(t2);
> +    tcg_temp_free_i64(t1);
> +    tcg_temp_free_i64(t0);
> +}
> +
> +/* Expand OPSZ bytes worth of two-operand operations using host vectors.  */
> +static void expand_2_vec(uint32_t dofs, uint32_t aofs,
> +                         uint32_t opsz, uint32_t tysz, TCGType type,
> +                         void (*fni)(TCGv_vec, TCGv_vec))
> +{
> +    TCGv_vec t0 = tcg_temp_new_vec(type);
> +    uint32_t i;
> +
> +    for (i = 0; i < opsz; i += tysz) {
> +        tcg_gen_ld_vec(t0, tcg_ctx.tcg_env, aofs + i);
> +        fni(t0, t0);
> +        tcg_gen_st_vec(t0, tcg_ctx.tcg_env, dofs + i);
> +    }
> +    tcg_temp_free_vec(t0);
> +}
> +
> +/* Expand OPSZ bytes worth of three-operand operations using host vectors.  */
> +static void expand_3_vec(uint32_t dofs, uint32_t aofs,
> +                         uint32_t bofs, uint32_t opsz,
> +                         uint32_t tysz, TCGType type, bool load_dest,
> +                         void (*fni)(TCGv_vec, TCGv_vec, TCGv_vec))
> +{
> +    TCGv_vec t0 = tcg_temp_new_vec(type);
> +    TCGv_vec t1 = tcg_temp_new_vec(type);
> +    TCGv_vec t2 = tcg_temp_new_vec(type);
> +    uint32_t i;
> +
> +    for (i = 0; i < opsz; i += tysz) {
> +        tcg_gen_ld_vec(t0, tcg_ctx.tcg_env, aofs + i);
> +        tcg_gen_ld_vec(t1, tcg_ctx.tcg_env, bofs + i);
> +        if (load_dest) {
> +            tcg_gen_ld_vec(t2, tcg_ctx.tcg_env, dofs + i);
> +        }
> +        fni(t2, t0, t1);
> +        tcg_gen_st_vec(t2, tcg_ctx.tcg_env, dofs + i);
> +    }
> +    tcg_temp_free_vec(t2);
> +    tcg_temp_free_vec(t1);
> +    tcg_temp_free_vec(t0);
> +}
> +
> +/* Expand a vector two-operand operation.  */
> +void tcg_gen_gvec_2(uint32_t dofs, uint32_t aofs,
> +                    uint32_t oprsz, uint32_t maxsz, const GVecGen2 *g)
> +{
> +    check_size_align(oprsz, maxsz, dofs | aofs);
> +    check_overlap_2(dofs, aofs, maxsz);
> +
> +    /* Quick check for sizes we won't support inline.  */
> +    if (oprsz > MAX_UNROLL * 32 || maxsz > MAX_UNROLL * 32) {
> +        goto do_ool;
> +    }
> +
> +    /* Recall that ARM SVE allows vector sizes that are not a power of 2.
> +       Expand with successively smaller host vector sizes.  The intent is
> +       that e.g. oprsz == 80 would be expanded with 2x32 + 1x16.  */
> +    /* ??? For maxsz > oprsz, the host may be able to use an op-sized
> +       operation, zeroing the balance of the register.  We can then
> +       use a cl-sized store to implement the clearing without an extra
> +       store operation.  This is true for aarch64 and x86_64 hosts.  */
> +
> +    if (TCG_TARGET_HAS_v256 && check_size_impl(oprsz, 32)) {
> +        uint32_t done = QEMU_ALIGN_DOWN(oprsz, 32);
> +        expand_2_vec(dofs, aofs, done, 32, TCG_TYPE_V256, g->fniv);
> +        dofs += done;
> +        aofs += done;
> +        oprsz -= done;
> +        maxsz -= done;
> +    }
> +
> +    if (TCG_TARGET_HAS_v128 && check_size_impl(oprsz, 16)) {
> +        uint32_t done = QEMU_ALIGN_DOWN(oprsz, 16);
> +        expand_2_vec(dofs, aofs, done, 16, TCG_TYPE_V128, g->fniv);
> +        dofs += done;
> +        aofs += done;
> +        oprsz -= done;
> +        maxsz -= done;
> +    }
> +
> +    if (check_size_impl(oprsz, 8)) {
> +        uint32_t done = QEMU_ALIGN_DOWN(oprsz, 8);
> +        if (TCG_TARGET_HAS_v64 && !g->prefer_i64) {
> +            expand_2_vec(dofs, aofs, done, 8, TCG_TYPE_V64, g->fniv);
> +        } else if (g->fni8) {
> +            expand_2_i64(dofs, aofs, done, g->fni8);
> +        } else {
> +            done = 0;
> +        }
> +        dofs += done;
> +        aofs += done;
> +        oprsz -= done;
> +        maxsz -= done;
> +    }
> +
> +    if (check_size_impl(oprsz, 4)) {
> +        uint32_t done = QEMU_ALIGN_DOWN(oprsz, 4);
> +        expand_2_i32(dofs, aofs, done, g->fni4);
> +        dofs += done;
> +        aofs += done;
> +        oprsz -= done;
> +        maxsz -= done;
> +    }
> +
> +    if (oprsz == 0) {
> +        if (maxsz != 0) {
> +            expand_clr(dofs, maxsz);
> +        }
> +        return;
> +    }
> +
> + do_ool:
> +    tcg_gen_gvec_2_ool(dofs, aofs, oprsz, maxsz, 0, g->fno);
> +}
> +
> +/* Expand a vector three-operand operation.  */
> +void tcg_gen_gvec_3(uint32_t dofs, uint32_t aofs, uint32_t bofs,
> +                    uint32_t oprsz, uint32_t maxsz, const GVecGen3 *g)
> +{
> +    check_size_align(oprsz, maxsz, dofs | aofs | bofs);
> +    check_overlap_3(dofs, aofs, bofs, maxsz);
> +
> +    /* Quick check for sizes we won't support inline.  */
> +    if (oprsz > MAX_UNROLL * 32 || maxsz > MAX_UNROLL * 32) {
> +        goto do_ool;
> +    }
> +
> +    /* Recall that ARM SVE allows vector sizes that are not a power of 2.
> +       Expand with successively smaller host vector sizes.  The intent is
> +       that e.g. oprsz == 80 would be expanded with 2x32 + 1x16.  */
> +    /* ??? For maxsz > oprsz, the host may be able to use an op-sized
> +       operation, zeroing the balance of the register.  We can then
> +       use a cl-sized store to implement the clearing without an extra
> +       store operation.  This is true for aarch64 and x86_64 hosts.  */
> +
> +    if (TCG_TARGET_HAS_v256 && check_size_impl(oprsz, 32)) {
> +        uint32_t done = QEMU_ALIGN_DOWN(oprsz, 32);
> +        expand_3_vec(dofs, aofs, bofs, done, 32, TCG_TYPE_V256,
> +                     g->load_dest, g->fniv);
> +        dofs += done;
> +        aofs += done;
> +        bofs += done;
> +        oprsz -= done;
> +        maxsz -= done;
> +    }
> +
> +    if (TCG_TARGET_HAS_v128 && check_size_impl(oprsz, 16)) {
> +        uint32_t done = QEMU_ALIGN_DOWN(oprsz, 16);
> +        expand_3_vec(dofs, aofs, bofs, done, 16, TCG_TYPE_V128,
> +                     g->load_dest, g->fniv);
> +        dofs += done;
> +        aofs += done;
> +        bofs += done;
> +        oprsz -= done;
> +        maxsz -= done;
> +    }
> +
> +    if (check_size_impl(oprsz, 8)) {
> +        uint32_t done = QEMU_ALIGN_DOWN(oprsz, 8);
> +        if (TCG_TARGET_HAS_v64 && !g->prefer_i64) {
> +            expand_3_vec(dofs, aofs, bofs, done, 8, TCG_TYPE_V64,
> +                         g->load_dest, g->fniv);
> +        } else if (g->fni8) {
> +            expand_3_i64(dofs, aofs, bofs, done, g->load_dest, g->fni8);
> +        } else {
> +            done = 0;
> +        }
> +        dofs += done;
> +        aofs += done;
> +        bofs += done;
> +        oprsz -= done;
> +        maxsz -= done;
> +    }
> +
> +    if (check_size_impl(oprsz, 4)) {
> +        uint32_t done = QEMU_ALIGN_DOWN(oprsz, 4);
> +        expand_3_i32(dofs, aofs, bofs, done, g->load_dest, g->fni4);
> +        dofs += done;
> +        aofs += done;
> +        bofs += done;
> +        oprsz -= done;
> +        maxsz -= done;
> +    }
> +
> +    if (oprsz == 0) {
> +        if (maxsz != 0) {
> +            expand_clr(dofs, maxsz);
> +        }
> +        return;
> +    }
> +
> + do_ool:
> +    tcg_gen_gvec_3_ool(dofs, aofs, bofs, oprsz, maxsz, 0, g->fno);
> +}
> +
> +/*
> + * Expand specific vector operations.
> + */
> +
> +void tcg_gen_gvec_mov(uint32_t dofs, uint32_t aofs,
> +                      uint32_t opsz, uint32_t clsz)
> +{
> +    static const GVecGen2 g = {
> +        .fni8 = tcg_gen_mov_i64,
> +        .fniv = tcg_gen_mov_vec,
> +        .fno = gen_helper_gvec_mov,
> +        .prefer_i64 = TCG_TARGET_REG_BITS == 64,
> +    };
> +    tcg_gen_gvec_2(dofs, aofs, opsz, clsz, &g);
> +}
> +
> +void tcg_gen_gvec_not(uint32_t dofs, uint32_t aofs,
> +                      uint32_t opsz, uint32_t clsz)
> +{
> +    static const GVecGen2 g = {
> +        .fni8 = tcg_gen_not_i64,
> +        .fniv = tcg_gen_not_vec,
> +        .fno = gen_helper_gvec_not,
> +        .prefer_i64 = TCG_TARGET_REG_BITS == 64,
> +    };
> +    tcg_gen_gvec_2(dofs, aofs, opsz, clsz, &g);
> +}
> +
> +static void gen_addv_mask(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b, TCGv_i64 m)
> +{
> +    TCGv_i64 t1 = tcg_temp_new_i64();
> +    TCGv_i64 t2 = tcg_temp_new_i64();
> +    TCGv_i64 t3 = tcg_temp_new_i64();
> +
> +    tcg_gen_andc_i64(t1, a, m);
> +    tcg_gen_andc_i64(t2, b, m);
> +    tcg_gen_xor_i64(t3, a, b);
> +    tcg_gen_add_i64(d, t1, t2);
> +    tcg_gen_and_i64(t3, t3, m);
> +    tcg_gen_xor_i64(d, d, t3);
> +
> +    tcg_temp_free_i64(t1);
> +    tcg_temp_free_i64(t2);
> +    tcg_temp_free_i64(t3);
> +}
> +
> +void tcg_gen_vec_add8_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
> +{
> +    TCGv_i64 m = tcg_const_i64(REP8(0x80));
> +    gen_addv_mask(d, a, b, m);
> +    tcg_temp_free_i64(m);
> +}
> +
> +void tcg_gen_vec_add16_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
> +{
> +    TCGv_i64 m = tcg_const_i64(REP16(0x8000));
> +    gen_addv_mask(d, a, b, m);
> +    tcg_temp_free_i64(m);
> +}
> +
> +void tcg_gen_vec_add32_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
> +{
> +    TCGv_i64 t1 = tcg_temp_new_i64();
> +    TCGv_i64 t2 = tcg_temp_new_i64();
> +
> +    tcg_gen_andi_i64(t1, a, ~0xffffffffull);
> +    tcg_gen_add_i64(t2, a, b);
> +    tcg_gen_add_i64(t1, t1, b);
> +    tcg_gen_deposit_i64(d, t1, t2, 0, 32);
> +
> +    tcg_temp_free_i64(t1);
> +    tcg_temp_free_i64(t2);
> +}
> +
> +void tcg_gen_gvec_add8(uint32_t dofs, uint32_t aofs, uint32_t bofs,
> +                       uint32_t opsz, uint32_t clsz)
> +{
> +    static const GVecGen3 g = {
> +        .fni8 = tcg_gen_vec_add8_i64,
> +        .fniv = tcg_gen_add8_vec,
> +        .fno = gen_helper_gvec_add8,
> +    };
> +    tcg_gen_gvec_3(dofs, aofs, bofs, opsz, clsz, &g);
> +}
> +
> +void tcg_gen_gvec_add16(uint32_t dofs, uint32_t aofs, uint32_t bofs,
> +                        uint32_t opsz, uint32_t clsz)
> +{
> +    static const GVecGen3 g = {
> +        .fni8 = tcg_gen_vec_add16_i64,
> +        .fniv = tcg_gen_add16_vec,
> +        .fno = gen_helper_gvec_add16,
> +    };
> +    tcg_gen_gvec_3(dofs, aofs, bofs, opsz, clsz, &g);
> +}
> +
> +void tcg_gen_gvec_add32(uint32_t dofs, uint32_t aofs, uint32_t bofs,
> +                        uint32_t opsz, uint32_t clsz)
> +{
> +    static const GVecGen3 g = {
> +        .fni4 = tcg_gen_add_i32,
> +        .fniv = tcg_gen_add32_vec,
> +        .fno = gen_helper_gvec_add32,
> +    };
> +    tcg_gen_gvec_3(dofs, aofs, bofs, opsz, clsz, &g);
> +}
> +
> +void tcg_gen_gvec_add64(uint32_t dofs, uint32_t aofs, uint32_t bofs,
> +                        uint32_t opsz, uint32_t clsz)
> +{
> +    static const GVecGen3 g = {
> +        .fni8 = tcg_gen_add_i64,
> +        .fniv = tcg_gen_add64_vec,
> +        .fno = gen_helper_gvec_add64,
> +        .prefer_i64 = TCG_TARGET_REG_BITS == 64,
> +    };
> +    tcg_gen_gvec_3(dofs, aofs, bofs, opsz, clsz, &g);
> +}
> +
> +static void gen_subv_mask(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b, TCGv_i64 m)
> +{
> +    TCGv_i64 t1 = tcg_temp_new_i64();
> +    TCGv_i64 t2 = tcg_temp_new_i64();
> +    TCGv_i64 t3 = tcg_temp_new_i64();
> +
> +    tcg_gen_or_i64(t1, a, m);
> +    tcg_gen_andc_i64(t2, b, m);
> +    tcg_gen_eqv_i64(t3, a, b);
> +    tcg_gen_sub_i64(d, t1, t2);
> +    tcg_gen_and_i64(t3, t3, m);
> +    tcg_gen_xor_i64(d, d, t3);
> +
> +    tcg_temp_free_i64(t1);
> +    tcg_temp_free_i64(t2);
> +    tcg_temp_free_i64(t3);
> +}
> +
> +void tcg_gen_vec_sub8_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
> +{
> +    TCGv_i64 m = tcg_const_i64(REP8(0x80));
> +    gen_subv_mask(d, a, b, m);
> +    tcg_temp_free_i64(m);
> +}
> +
> +void tcg_gen_vec_sub16_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
> +{
> +    TCGv_i64 m = tcg_const_i64(REP16(0x8000));
> +    gen_subv_mask(d, a, b, m);
> +    tcg_temp_free_i64(m);
> +}
> +
> +void tcg_gen_vec_sub32_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
> +{
> +    TCGv_i64 t1 = tcg_temp_new_i64();
> +    TCGv_i64 t2 = tcg_temp_new_i64();
> +
> +    tcg_gen_andi_i64(t1, b, ~0xffffffffull);
> +    tcg_gen_sub_i64(t2, a, b);
> +    tcg_gen_sub_i64(t1, a, t1);
> +    tcg_gen_deposit_i64(d, t1, t2, 0, 32);
> +
> +    tcg_temp_free_i64(t1);
> +    tcg_temp_free_i64(t2);
> +}
> +
> +void tcg_gen_gvec_sub8(uint32_t dofs, uint32_t aofs, uint32_t bofs,
> +                       uint32_t opsz, uint32_t clsz)
> +{
> +    static const GVecGen3 g = {
> +        .fni8 = tcg_gen_vec_sub8_i64,
> +        .fniv = tcg_gen_sub8_vec,
> +        .fno = gen_helper_gvec_sub8,
> +    };
> +    tcg_gen_gvec_3(dofs, aofs, bofs, opsz, clsz, &g);
> +}
> +
> +void tcg_gen_gvec_sub16(uint32_t dofs, uint32_t aofs, uint32_t bofs,
> +                        uint32_t opsz, uint32_t clsz)
> +{
> +    static const GVecGen3 g = {
> +        .fni8 = tcg_gen_vec_sub16_i64,
> +        .fniv = tcg_gen_sub16_vec,
> +        .fno = gen_helper_gvec_sub16,
> +    };
> +    tcg_gen_gvec_3(dofs, aofs, bofs, opsz, clsz, &g);
> +}
> +
> +void tcg_gen_gvec_sub32(uint32_t dofs, uint32_t aofs, uint32_t bofs,
> +                        uint32_t opsz, uint32_t clsz)
> +{
> +    static const GVecGen3 g = {
> +        .fni4 = tcg_gen_sub_i32,
> +        .fniv = tcg_gen_sub32_vec,
> +        .fno = gen_helper_gvec_sub32,
> +    };
> +    tcg_gen_gvec_3(dofs, aofs, bofs, opsz, clsz, &g);
> +}
> +
> +void tcg_gen_gvec_sub64(uint32_t dofs, uint32_t aofs, uint32_t bofs,
> +                        uint32_t opsz, uint32_t clsz)
> +{
> +    static const GVecGen3 g = {
> +        .fni8 = tcg_gen_sub_i64,
> +        .fniv = tcg_gen_sub64_vec,
> +        .fno = gen_helper_gvec_sub64,
> +        .prefer_i64 = TCG_TARGET_REG_BITS == 64,
> +    };
> +    tcg_gen_gvec_3(dofs, aofs, bofs, opsz, clsz, &g);
> +}
> +
> +static void gen_negv_mask(TCGv_i64 d, TCGv_i64 b, TCGv_i64 m)
> +{
> +    TCGv_i64 t2 = tcg_temp_new_i64();
> +    TCGv_i64 t3 = tcg_temp_new_i64();
> +
> +    tcg_gen_andc_i64(t3, m, b);
> +    tcg_gen_andc_i64(t2, b, m);
> +    tcg_gen_sub_i64(d, m, t2);
> +    tcg_gen_xor_i64(d, d, t3);
> +
> +    tcg_temp_free_i64(t2);
> +    tcg_temp_free_i64(t3);
> +}
> +
> +void tcg_gen_vec_neg8_i64(TCGv_i64 d, TCGv_i64 b)
> +{
> +    TCGv_i64 m = tcg_const_i64(REP8(0x80));
> +    gen_negv_mask(d, b, m);
> +    tcg_temp_free_i64(m);
> +}
> +
> +void tcg_gen_vec_neg16_i64(TCGv_i64 d, TCGv_i64 b)
> +{
> +    TCGv_i64 m = tcg_const_i64(REP16(0x8000));
> +    gen_negv_mask(d, b, m);
> +    tcg_temp_free_i64(m);
> +}
> +
> +void tcg_gen_vec_neg32_i64(TCGv_i64 d, TCGv_i64 b)
> +{
> +    TCGv_i64 t1 = tcg_temp_new_i64();
> +    TCGv_i64 t2 = tcg_temp_new_i64();
> +
> +    tcg_gen_andi_i64(t1, b, ~0xffffffffull);
> +    tcg_gen_neg_i64(t2, b);
> +    tcg_gen_neg_i64(t1, t1);
> +    tcg_gen_deposit_i64(d, t1, t2, 0, 32);
> +
> +    tcg_temp_free_i64(t1);
> +    tcg_temp_free_i64(t2);
> +}
> +
> +void tcg_gen_gvec_neg8(uint32_t dofs, uint32_t aofs,
> +                       uint32_t opsz, uint32_t clsz)
> +{
> +    static const GVecGen2 g = {
> +        .fni8 = tcg_gen_vec_neg8_i64,
> +        .fniv = tcg_gen_neg8_vec,
> +        .fno = gen_helper_gvec_neg8,
> +    };
> +    tcg_gen_gvec_2(dofs, aofs, opsz, clsz, &g);
> +}
> +
> +void tcg_gen_gvec_neg16(uint32_t dofs, uint32_t aofs,
> +                        uint32_t opsz, uint32_t clsz)
> +{
> +    static const GVecGen2 g = {
> +        .fni8 = tcg_gen_vec_neg16_i64,
> +        .fniv = tcg_gen_neg16_vec,
> +        .fno = gen_helper_gvec_neg16,
> +    };
> +    tcg_gen_gvec_2(dofs, aofs, opsz, clsz, &g);
> +}
> +
> +void tcg_gen_gvec_neg32(uint32_t dofs, uint32_t aofs,
> +                        uint32_t opsz, uint32_t clsz)
> +{
> +    static const GVecGen2 g = {
> +        .fni4 = tcg_gen_neg_i32,
> +        .fniv = tcg_gen_neg32_vec,
> +        .fno = gen_helper_gvec_neg32,
> +    };
> +    tcg_gen_gvec_2(dofs, aofs, opsz, clsz, &g);
> +}
> +
> +void tcg_gen_gvec_neg64(uint32_t dofs, uint32_t aofs,
> +                        uint32_t opsz, uint32_t clsz)
> +{
> +    static const GVecGen2 g = {
> +        .fni8 = tcg_gen_neg_i64,
> +        .fniv = tcg_gen_neg64_vec,
> +        .fno = gen_helper_gvec_neg64,
> +        .prefer_i64 = TCG_TARGET_REG_BITS == 64,
> +    };
> +    tcg_gen_gvec_2(dofs, aofs, opsz, clsz, &g);
> +}
> +
> +void tcg_gen_gvec_and(uint32_t dofs, uint32_t aofs, uint32_t bofs,
> +                      uint32_t opsz, uint32_t clsz)
> +{
> +    static const GVecGen3 g = {
> +        .fni8 = tcg_gen_and_i64,
> +        .fniv = tcg_gen_and_vec,
> +        .fno = gen_helper_gvec_and,
> +        .prefer_i64 = TCG_TARGET_REG_BITS == 64,
> +    };
> +    tcg_gen_gvec_3(dofs, aofs, bofs, opsz, clsz, &g);
> +}
> +
> +void tcg_gen_gvec_or(uint32_t dofs, uint32_t aofs, uint32_t bofs,
> +                     uint32_t opsz, uint32_t clsz)
> +{
> +    static const GVecGen3 g = {
> +        .fni8 = tcg_gen_or_i64,
> +        .fniv = tcg_gen_or_vec,
> +        .fno = gen_helper_gvec_or,
> +        .prefer_i64 = TCG_TARGET_REG_BITS == 64,
> +    };
> +    tcg_gen_gvec_3(dofs, aofs, bofs, opsz, clsz, &g);
> +}
> +
> +void tcg_gen_gvec_xor(uint32_t dofs, uint32_t aofs, uint32_t bofs,
> +                      uint32_t opsz, uint32_t clsz)
> +{
> +    static const GVecGen3 g = {
> +        .fni8 = tcg_gen_xor_i64,
> +        .fniv = tcg_gen_xor_vec,
> +        .fno = gen_helper_gvec_xor,
> +        .prefer_i64 = TCG_TARGET_REG_BITS == 64,
> +    };
> +    tcg_gen_gvec_3(dofs, aofs, bofs, opsz, clsz, &g);
> +}
> +
> +void tcg_gen_gvec_andc(uint32_t dofs, uint32_t aofs, uint32_t bofs,
> +                       uint32_t opsz, uint32_t clsz)
> +{
> +    static const GVecGen3 g = {
> +        .fni8 = tcg_gen_andc_i64,
> +        .fniv = tcg_gen_andc_vec,
> +        .fno = gen_helper_gvec_andc,
> +        .prefer_i64 = TCG_TARGET_REG_BITS == 64,
> +    };
> +    tcg_gen_gvec_3(dofs, aofs, bofs, opsz, clsz, &g);
> +}
> +
> +void tcg_gen_gvec_orc(uint32_t dofs, uint32_t aofs, uint32_t bofs,
> +                      uint32_t opsz, uint32_t clsz)
> +{
> +    static const GVecGen3 g = {
> +        .fni8 = tcg_gen_orc_i64,
> +        .fniv = tcg_gen_orc_vec,
> +        .fno = gen_helper_gvec_orc,
> +        .prefer_i64 = TCG_TARGET_REG_BITS == 64,
> +    };
> +    tcg_gen_gvec_3(dofs, aofs, bofs, opsz, clsz, &g);
> +}
> diff --git a/accel/tcg/Makefile.objs b/accel/tcg/Makefile.objs
> index 228cd84fa4..d381a02f34 100644
> --- a/accel/tcg/Makefile.objs
> +++ b/accel/tcg/Makefile.objs
> @@ -1,6 +1,6 @@
>  obj-$(CONFIG_SOFTMMU) += tcg-all.o
>  obj-$(CONFIG_SOFTMMU) += cputlb.o
> -obj-y += tcg-runtime.o
> +obj-y += tcg-runtime.o tcg-runtime-gvec.o
>  obj-y += cpu-exec.o cpu-exec-common.o translate-all.o
>  obj-y += translator.o


--
Alex Bennée

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [Qemu-devel] [PATCH v3 3/6] target/arm: Align vector registers
  2017-09-16  2:34 ` [Qemu-devel] [PATCH v3 3/6] target/arm: Align vector registers Richard Henderson
@ 2017-09-26 22:33   ` Alex Bennée
  0 siblings, 0 replies; 14+ messages in thread
From: Alex Bennée @ 2017-09-26 22:33 UTC (permalink / raw)
  To: Richard Henderson; +Cc: qemu-devel, f4bug


Richard Henderson <richard.henderson@linaro.org> writes:

> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>

Signed-off-by: Alex Bennée <alex.bennee@linaro.org>

> ---
>  target/arm/cpu.h | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/target/arm/cpu.h b/target/arm/cpu.h
> index 98b9b26fd3..c346bd148f 100644
> --- a/target/arm/cpu.h
> +++ b/target/arm/cpu.h
> @@ -486,7 +486,7 @@ typedef struct CPUARMState {
>           * the two execution states, and means we do not need to explicitly
>           * map these registers when changing states.
>           */
> -        float64 regs[64];
> +        float64 regs[64] QEMU_ALIGNED(16);
>
>          uint32_t xregs[16];
>          /* We store these fpcsr fields separately for convenience.  */


--
Alex Bennée

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [Qemu-devel] [PATCH v3 0/6] TCG vectorization and example conversion
  2017-09-16  2:34 [Qemu-devel] [PATCH v3 0/6] TCG vectorization and example conversion Richard Henderson
                   ` (6 preceding siblings ...)
  2017-09-16  2:35 ` [Qemu-devel] [PATCH v3 0/6] TCG vectorization and example conversion Richard Henderson
@ 2017-09-26 22:58 ` no-reply
  7 siblings, 0 replies; 14+ messages in thread
From: no-reply @ 2017-09-26 22:58 UTC (permalink / raw)
  To: richard.henderson; +Cc: famz, qemu-devel, alex.bennee, f4bug

Hi,

This series seems to have some coding style problems. See output below for
more information:

Type: series
Message-id: 20170916023417.14599-1-richard.henderson@linaro.org
Subject: [Qemu-devel] [PATCH v3 0/6] TCG vectorization and example conversion

=== TEST SCRIPT BEGIN ===
#!/bin/bash

BASE=base
n=1
total=$(git log --oneline $BASE.. | wc -l)
failed=0

git config --local diff.renamelimit 0
git config --local diff.renames True

commits="$(git log --format=%H --reverse $BASE..)"
for c in $commits; do
    echo "Checking PATCH $n/$total: $(git log -n 1 --format=%s $c)..."
    if ! git show $c --format=email | ./scripts/checkpatch.pl --mailback -; then
        failed=1
        echo
    fi
    n=$((n+1))
done

exit $failed
=== TEST SCRIPT END ===

Updating 3c8cf5a9c21ff8782164d1def7f44bd888713384
Switched to a new branch 'test'
7f8bff3639 tcg/aarch64: Add vector operations
107700b998 tcg/i386: Add vector operations
63c5d729cd target/arm: Use vector infrastructure for aa64 add/sub/logic
66bd1ba117 target/arm: Align vector registers
bcf88636c0 tcg: Add vector expanders
00e32ea5b2 tcg: Add types and operations for host vectors

=== OUTPUT BEGIN ===
Checking PATCH 1/6: tcg: Add types and operations for host vectors...
Checking PATCH 2/6: tcg: Add vector expanders...
ERROR: spaces required around that '&' (ctx:WxO)
#284: FILE: accel/tcg/tcg-runtime-gvec.c:241:
+        *(vec64 *)(d + i) = *(vec64 *)(a + i) &~ *(vec64 *)(b + i);
                                               ^

ERROR: space prohibited after that '~' (ctx:OxW)
#284: FILE: accel/tcg/tcg-runtime-gvec.c:241:
+        *(vec64 *)(d + i) = *(vec64 *)(a + i) &~ *(vec64 *)(b + i);
                                                ^

ERROR: spaces required around that '|' (ctx:WxO)
#295: FILE: accel/tcg/tcg-runtime-gvec.c:252:
+        *(vec64 *)(d + i) = *(vec64 *)(a + i) |~ *(vec64 *)(b + i);
                                               ^

ERROR: space prohibited after that '~' (ctx:OxW)
#295: FILE: accel/tcg/tcg-runtime-gvec.c:252:
+        *(vec64 *)(d + i) = *(vec64 *)(a + i) |~ *(vec64 *)(b + i);
                                                ^

ERROR: trailing statements should be on next line
#589: FILE: tcg/tcg-op-gvec.c:198:
+    } if (TCG_TARGET_REG_BITS == 64) {

total: 5 errors, 0 warnings, 1342 lines checked

Your patch has style problems, please review.  If any of these errors
are false positives report them to the maintainer, see
CHECKPATCH in MAINTAINERS.

Checking PATCH 3/6: target/arm: Align vector registers...
Checking PATCH 4/6: target/arm: Use vector infrastructure for aa64 add/sub/logic...
Checking PATCH 5/6: tcg/i386: Add vector operations...
WARNING: architecture specific defines should be avoided
#50: FILE: tcg/i386/tcg-target.h:93:
+#ifdef __SSE2__

WARNING: architecture specific defines should be avoided
#55: FILE: tcg/i386/tcg-target.h:98:
+#ifdef __AVX2__

total: 0 errors, 2 warnings, 722 lines checked

Your patch has style problems, please review.  If any of these errors
are false positives report them to the maintainer, see
CHECKPATCH in MAINTAINERS.
Checking PATCH 6/6: tcg/aarch64: Add vector operations...
=== OUTPUT END ===

Test command exited with code: 1


---
Email generated automatically by Patchew [http://patchew.org/].
Please send your feedback to patchew-devel@freelists.org

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [Qemu-devel] [PATCH v3 4/6] target/arm: Use vector infrastructure for aa64 add/sub/logic
  2017-09-16  2:34 ` [Qemu-devel] [PATCH v3 4/6] target/arm: Use vector infrastructure for aa64 add/sub/logic Richard Henderson
@ 2017-09-26 23:12   ` Alex Bennée
  0 siblings, 0 replies; 14+ messages in thread
From: Alex Bennée @ 2017-09-26 23:12 UTC (permalink / raw)
  To: Richard Henderson; +Cc: qemu-devel, f4bug


Richard Henderson <richard.henderson@linaro.org> writes:

> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>

> ---
>  target/arm/translate-a64.c | 216 ++++++++++++++++++++++++++++++---------------
>  1 file changed, 143 insertions(+), 73 deletions(-)
>
> diff --git a/target/arm/translate-a64.c b/target/arm/translate-a64.c
> index a3984c9a0d..4759cc9829 100644
> --- a/target/arm/translate-a64.c
> +++ b/target/arm/translate-a64.c
> @@ -21,6 +21,7 @@
>  #include "cpu.h"
>  #include "exec/exec-all.h"
>  #include "tcg-op.h"
> +#include "tcg-op-gvec.h"
>  #include "qemu/log.h"
>  #include "arm_ldst.h"
>  #include "translate.h"
> @@ -82,6 +83,7 @@ typedef void NeonGenTwoDoubleOPFn(TCGv_i64, TCGv_i64, TCGv_i64, TCGv_ptr);
>  typedef void NeonGenOneOpFn(TCGv_i64, TCGv_i64);
>  typedef void CryptoTwoOpEnvFn(TCGv_ptr, TCGv_i32, TCGv_i32);
>  typedef void CryptoThreeOpEnvFn(TCGv_ptr, TCGv_i32, TCGv_i32, TCGv_i32);
> +typedef void GVecGenTwoFn(uint32_t, uint32_t, uint32_t, uint32_t, uint32_t);
>
>  /* initialize TCG globals.  */
>  void a64_translate_init(void)
> @@ -537,6 +539,21 @@ static inline int vec_reg_offset(DisasContext *s, int regno,
>      return offs;
>  }
>
> +/* Return the offset into CPUARMState of the "whole" vector register Qn.  */
> +static inline int vec_full_reg_offset(DisasContext *s, int regno)
> +{
> +    assert_fp_access_checked(s);
> +    return offsetof(CPUARMState, vfp.regs[regno * 2]);
> +}
> +
> +/* Return the byte size of the "whole" vector register, VL / 8.  */
> +static inline int vec_full_reg_size(DisasContext *s)
> +{
> +    /* FIXME SVE: We should put the composite ZCR_EL* value into tb->flags.
> +       In the meantime this is just the AdvSIMD length of 128.  */
> +    return 128 / 8;
> +}
> +
>  /* Return the offset into CPUARMState of a slice (from
>   * the least significant end) of FP register Qn (ie
>   * Dn, Sn, Hn or Bn).
> @@ -9036,85 +9053,125 @@ static void disas_simd_three_reg_diff(DisasContext *s, uint32_t insn)
>      }
>  }
>
> +static void gen_bsl_i64(TCGv_i64 rd, TCGv_i64 rn, TCGv_i64 rm)
> +{
> +    tcg_gen_xor_i64(rn, rn, rm);
> +    tcg_gen_and_i64(rn, rn, rd);
> +    tcg_gen_xor_i64(rd, rm, rn);
> +}
> +
> +static void gen_bit_i64(TCGv_i64 rd, TCGv_i64 rn, TCGv_i64 rm)
> +{
> +    tcg_gen_xor_i64(rn, rn, rd);
> +    tcg_gen_and_i64(rn, rn, rm);
> +    tcg_gen_xor_i64(rd, rd, rn);
> +}
> +
> +static void gen_bif_i64(TCGv_i64 rd, TCGv_i64 rn, TCGv_i64 rm)
> +{
> +    tcg_gen_xor_i64(rn, rn, rd);
> +    tcg_gen_andc_i64(rn, rn, rm);
> +    tcg_gen_xor_i64(rd, rd, rn);
> +}
> +
> +static void gen_bsl_vec(TCGv_vec rd, TCGv_vec rn, TCGv_vec rm)
> +{
> +    tcg_gen_xor_vec(rn, rn, rm);
> +    tcg_gen_and_vec(rn, rn, rd);
> +    tcg_gen_xor_vec(rd, rm, rn);
> +}
> +
> +static void gen_bit_vec(TCGv_vec rd, TCGv_vec rn, TCGv_vec rm)
> +{
> +    tcg_gen_xor_vec(rn, rn, rd);
> +    tcg_gen_and_vec(rn, rn, rm);
> +    tcg_gen_xor_vec(rd, rd, rn);
> +}
> +
> +static void gen_bif_vec(TCGv_vec rd, TCGv_vec rn, TCGv_vec rm)
> +{
> +    tcg_gen_xor_vec(rn, rn, rd);
> +    tcg_gen_andc_vec(rn, rn, rm);
> +    tcg_gen_xor_vec(rd, rd, rn);
> +}
> +
>  /* Logic op (opcode == 3) subgroup of C3.6.16. */
>  static void disas_simd_3same_logic(DisasContext *s, uint32_t insn)
>  {
> +    static const GVecGen3 bsl_op = {
> +        .fni8 = gen_bsl_i64,
> +        .fniv = gen_bsl_vec,
> +        .prefer_i64 = TCG_TARGET_REG_BITS == 64,
> +        .load_dest = true
> +    };
> +    static const GVecGen3 bit_op = {
> +        .fni8 = gen_bit_i64,
> +        .fniv = gen_bit_vec,
> +        .prefer_i64 = TCG_TARGET_REG_BITS == 64,
> +        .load_dest = true
> +    };
> +    static const GVecGen3 bif_op = {
> +        .fni8 = gen_bif_i64,
> +        .fniv = gen_bif_vec,
> +        .prefer_i64 = TCG_TARGET_REG_BITS == 64,
> +        .load_dest = true
> +    };
> +
>      int rd = extract32(insn, 0, 5);
>      int rn = extract32(insn, 5, 5);
>      int rm = extract32(insn, 16, 5);
>      int size = extract32(insn, 22, 2);
>      bool is_u = extract32(insn, 29, 1);
>      bool is_q = extract32(insn, 30, 1);
> -    TCGv_i64 tcg_op1, tcg_op2, tcg_res[2];
> -    int pass;
> +    GVecGenTwoFn *gvec_fn;
> +    const GVecGen3 *gvec_op;
>
>      if (!fp_access_check(s)) {
>          return;
>      }
>
> -    tcg_op1 = tcg_temp_new_i64();
> -    tcg_op2 = tcg_temp_new_i64();
> -    tcg_res[0] = tcg_temp_new_i64();
> -    tcg_res[1] = tcg_temp_new_i64();
> -
> -    for (pass = 0; pass < (is_q ? 2 : 1); pass++) {
> -        read_vec_element(s, tcg_op1, rn, pass, MO_64);
> -        read_vec_element(s, tcg_op2, rm, pass, MO_64);
> -
> -        if (!is_u) {
> -            switch (size) {
> -            case 0: /* AND */
> -                tcg_gen_and_i64(tcg_res[pass], tcg_op1, tcg_op2);
> -                break;
> -            case 1: /* BIC */
> -                tcg_gen_andc_i64(tcg_res[pass], tcg_op1, tcg_op2);
> -                break;
> -            case 2: /* ORR */
> -                tcg_gen_or_i64(tcg_res[pass], tcg_op1, tcg_op2);
> -                break;
> -            case 3: /* ORN */
> -                tcg_gen_orc_i64(tcg_res[pass], tcg_op1, tcg_op2);
> -                break;
> -            }
> -        } else {
> -            if (size != 0) {
> -                /* B* ops need res loaded to operate on */
> -                read_vec_element(s, tcg_res[pass], rd, pass, MO_64);
> -            }
> -
> -            switch (size) {
> -            case 0: /* EOR */
> -                tcg_gen_xor_i64(tcg_res[pass], tcg_op1, tcg_op2);
> -                break;
> -            case 1: /* BSL bitwise select */
> -                tcg_gen_xor_i64(tcg_op1, tcg_op1, tcg_op2);
> -                tcg_gen_and_i64(tcg_op1, tcg_op1, tcg_res[pass]);
> -                tcg_gen_xor_i64(tcg_res[pass], tcg_op2, tcg_op1);
> -                break;
> -            case 2: /* BIT, bitwise insert if true */
> -                tcg_gen_xor_i64(tcg_op1, tcg_op1, tcg_res[pass]);
> -                tcg_gen_and_i64(tcg_op1, tcg_op1, tcg_op2);
> -                tcg_gen_xor_i64(tcg_res[pass], tcg_res[pass], tcg_op1);
> -                break;
> -            case 3: /* BIF, bitwise insert if false */
> -                tcg_gen_xor_i64(tcg_op1, tcg_op1, tcg_res[pass]);
> -                tcg_gen_andc_i64(tcg_op1, tcg_op1, tcg_op2);
> -                tcg_gen_xor_i64(tcg_res[pass], tcg_res[pass], tcg_op1);
> -                break;
> -            }
> -        }
> -    }
> +    switch (size + 4 * is_u) {
> +    case 0: /* AND */
> +        gvec_fn = tcg_gen_gvec_and;
> +        goto do_fn;
> +    case 1: /* BIC */
> +        gvec_fn = tcg_gen_gvec_andc;
> +        goto do_fn;
> +    case 2: /* ORR */
> +        gvec_fn = tcg_gen_gvec_or;
> +        goto do_fn;
> +    case 3: /* ORN */
> +        gvec_fn = tcg_gen_gvec_orc;
> +        goto do_fn;
> +    case 4: /* EOR */
> +        gvec_fn = tcg_gen_gvec_xor;
> +        goto do_fn;
> +    do_fn:
> +        gvec_fn(vec_full_reg_offset(s, rd),
> +                vec_full_reg_offset(s, rn),
> +                vec_full_reg_offset(s, rm),
> +                is_q ? 16 : 8, vec_full_reg_size(s));
> +        return;
> +
> +    case 5: /* BSL bitwise select */
> +        gvec_op = &bsl_op;
> +        goto do_op;
> +    case 6: /* BIT, bitwise insert if true */
> +        gvec_op = &bit_op;
> +        goto do_op;
> +    case 7: /* BIF, bitwise insert if false */
> +        gvec_op = &bif_op;
> +        goto do_op;
> +    do_op:
> +        tcg_gen_gvec_3(vec_full_reg_offset(s, rd),
> +                       vec_full_reg_offset(s, rn),
> +                       vec_full_reg_offset(s, rm),
> +                       is_q ? 16 : 8, vec_full_reg_size(s), gvec_op);
> +        return;
>
> -    write_vec_element(s, tcg_res[0], rd, 0, MO_64);
> -    if (!is_q) {
> -        tcg_gen_movi_i64(tcg_res[1], 0);
> +    default:
> +        g_assert_not_reached();
>      }
> -    write_vec_element(s, tcg_res[1], rd, 1, MO_64);
> -
> -    tcg_temp_free_i64(tcg_op1);
> -    tcg_temp_free_i64(tcg_op2);
> -    tcg_temp_free_i64(tcg_res[0]);
> -    tcg_temp_free_i64(tcg_res[1]);
>  }
>
>  /* Helper functions for 32 bit comparisons */
> @@ -9375,6 +9432,7 @@ static void disas_simd_3same_int(DisasContext *s, uint32_t insn)
>      int rn = extract32(insn, 5, 5);
>      int rd = extract32(insn, 0, 5);
>      int pass;
> +    GVecGenTwoFn *gvec_op;
>
>      switch (opcode) {
>      case 0x13: /* MUL, PMUL */
> @@ -9414,6 +9472,28 @@ static void disas_simd_3same_int(DisasContext *s, uint32_t insn)
>          return;
>      }
>
> +    switch (opcode) {
> +    case 0x10: /* ADD, SUB */
> +        {
> +            static GVecGenTwoFn * const fns[4][2] = {
> +                { tcg_gen_gvec_add8, tcg_gen_gvec_sub8 },
> +                { tcg_gen_gvec_add16, tcg_gen_gvec_sub16 },
> +                { tcg_gen_gvec_add32, tcg_gen_gvec_sub32 },
> +                { tcg_gen_gvec_add64, tcg_gen_gvec_sub64 },
> +            };
> +            gvec_op = fns[size][u];
> +            goto do_gvec;
> +        }
> +        break;
> +
> +    do_gvec:
> +        gvec_op(vec_full_reg_offset(s, rd),
> +                vec_full_reg_offset(s, rn),
> +                vec_full_reg_offset(s, rm),
> +                is_q ? 16 : 8, vec_full_reg_size(s));
> +        return;
> +    }
> +
>      if (size == 3) {
>          assert(is_q);
>          for (pass = 0; pass < 2; pass++) {
> @@ -9586,16 +9666,6 @@ static void disas_simd_3same_int(DisasContext *s, uint32_t insn)
>                  genfn = fns[size][u];
>                  break;
>              }
> -            case 0x10: /* ADD, SUB */
> -            {
> -                static NeonGenTwoOpFn * const fns[3][2] = {
> -                    { gen_helper_neon_add_u8, gen_helper_neon_sub_u8 },
> -                    { gen_helper_neon_add_u16, gen_helper_neon_sub_u16 },
> -                    { tcg_gen_add_i32, tcg_gen_sub_i32 },
> -                };
> -                genfn = fns[size][u];
> -                break;
> -            }
>              case 0x11: /* CMTST, CMEQ */
>              {
>                  static NeonGenTwoOpFn * const fns[3][2] = {


--
Alex Bennée

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [Qemu-devel] [PATCH v3 1/6] tcg: Add types and operations for host vectors
  2017-09-26 19:28   ` Alex Bennée
@ 2017-09-27 16:18     ` Richard Henderson
  0 siblings, 0 replies; 14+ messages in thread
From: Richard Henderson @ 2017-09-27 16:18 UTC (permalink / raw)
  To: Alex Bennée; +Cc: qemu-devel, f4bug

On 09/26/2017 12:28 PM, Alex Bennée wrote:
>>      * TCGv_ptr : a host pointer type
>> +    * TCGv_vec : a host vector type; the exact size is not exposed
>> +                 to the CPU front-end code.
> 
> Isn't this a guest vector type (which is pointed to by a host pointer)?

No, it's a host vector, which we have created in response to expanding a guest
vector operation.
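
For illustration, a minimal sketch of such a composition, built only from
functions that appear in this series -- the aofs/bofs/dofs offsets and the
choice of TCG_TYPE_V128 are placeholders, not taken from any particular hunk:

    TCGv_vec t0 = tcg_temp_new_vec(TCG_TYPE_V128);
    TCGv_vec t1 = tcg_temp_new_vec(TCG_TYPE_V128);

    tcg_gen_ld_vec(t0, tcg_ctx.tcg_env, aofs);    /* load one guest slice */
    tcg_gen_ld_vec(t1, tcg_ctx.tcg_env, bofs);    /* load the other slice */
    tcg_gen_add32_vec(t0, t0, t1);                /* 32-bit lane add */
    tcg_gen_st_vec(t0, tcg_ctx.tcg_env, dofs);    /* store the result */

    tcg_temp_free_vec(t0);
    tcg_temp_free_vec(t1);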

> A one line comment wouldn't go amiss here. This looks like we are
> allocating a new temp of the same type as an existing temp?
> 
>> +TCGv_vec tcg_temp_new_vec_matching(TCGv_vec match)

Yes.
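That is, assuming some existing temp "v", the intended usage is roughly:

    TCGv_vec t = tcg_temp_new_vec_matching(v);   /* same TCG_TYPE_V* as v */

(a hypothetical one-liner, shown only to illustrate the intent).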

>> +All of the vector ops have a final constant argument that specifies the
>> +length of the vector operation LEN as 64 << LEN bits.
> 
> That doesn't scan well. So would a 4 lane operation be encoded as 64 <<
> 4? Is this because we are using the bottom bits for something?

64 << 0 = 64
64 << 1 = 128
64 << 2 = 256.

I've fixed up the wording a bit.
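
Put as code (an illustrative helper, not something added by the patch):

    /* LEN encodes the operation width as 64 << LEN bits.  */
    static unsigned vec_len_bits(unsigned len)
    {
        return 64u << len;   /* 0 -> 64, 1 -> 128, 2 -> 256 */
    }

so a 128-bit operation passes LEN = 1 rather than a lane count.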

>> +  Copy C across the entire vector.
>> +  At present the only supported values for C are 0 and -1.
> 
> I guess this is why the size is unimportant? This is for clearing or
> setting the whole of the vector? What does len mean in this case?

Yes.  Len still means the length of the whole vector.

Elsewhere there's a comment about maybe using dupi{8,16,32,64}_vec instead.
However I wanted to put that off until we do some more conversions and see
exactly what's going to be needed.


>> +* and_vec     v0, v1, v2, len
>> +* or_vec      v0, v1, v2, len
>> +* xor_vec     v0, v1, v2, len
>> +* andc_vec    v0, v1, v2, len
>> +* orc_vec     v0, v1, v2, len
>> +* not_vec     v0, v1, len
>> +
>> +  Similarly, logical operations.
> 
> Similarly, logical operations with and without complement?

Sure.


r~

^ permalink raw reply	[flat|nested] 14+ messages in thread

end of thread, other threads:[~2017-09-27 16:19 UTC | newest]

Thread overview: 14+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2017-09-16  2:34 [Qemu-devel] [PATCH v3 0/6] TCG vectorization and example conversion Richard Henderson
2017-09-16  2:34 ` [Qemu-devel] [PATCH v3 1/6] tcg: Add types and operations for host vectors Richard Henderson
2017-09-26 19:28   ` Alex Bennée
2017-09-27 16:18     ` Richard Henderson
2017-09-16  2:34 ` [Qemu-devel] [PATCH v3 2/6] tcg: Add vector expanders Richard Henderson
2017-09-26 22:31   ` Alex Bennée
2017-09-16  2:34 ` [Qemu-devel] [PATCH v3 3/6] target/arm: Align vector registers Richard Henderson
2017-09-26 22:33   ` Alex Bennée
2017-09-16  2:34 ` [Qemu-devel] [PATCH v3 4/6] target/arm: Use vector infrastructure for aa64 add/sub/logic Richard Henderson
2017-09-26 23:12   ` Alex Bennée
2017-09-16  2:34 ` [Qemu-devel] [PATCH v3 5/6] tcg/i386: Add vector operations Richard Henderson
2017-09-16  2:34 ` [Qemu-devel] [PATCH v3 6/6] tcg/aarch64: " Richard Henderson
2017-09-16  2:35 ` [Qemu-devel] [PATCH v3 0/6] TCG vectorization and example conversion Richard Henderson
2017-09-26 22:58 ` no-reply
