From: "Alex Bennée" <alex.bennee@linaro.org>
To: Richard Henderson <richard.henderson@linaro.org>
Cc: qemu-devel@nongnu.org, peter.maydell@linaro.org
Subject: Re: [Qemu-devel] [PATCH v11 04/20] tcg: Add generic vector expanders
Date: Tue, 06 Feb 2018 10:59:25 +0000
Message-ID: <87wozqwfhu.fsf@linaro.org>
In-Reply-To: <20180126045742.5487-5-richard.henderson@linaro.org>


Richard Henderson <richard.henderson@linaro.org> writes:

> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>

> ---
>  Makefile.target              |    2 +-
>  accel/tcg/tcg-runtime.h      |   29 +
>  tcg/tcg-gvec-desc.h          |   49 ++
>  tcg/tcg-op-gvec.h            |  198 +++++++
>  tcg/tcg-op.h                 |    1 +
>  tcg/tcg-opc.h                |    6 +
>  tcg/tcg.h                    |   27 +
>  accel/tcg/tcg-runtime-gvec.c |  325 +++++++++++
>  tcg/tcg-op-gvec.c            | 1308 ++++++++++++++++++++++++++++++++++++++++++
>  tcg/tcg-op-vec.c             |   33 +-
>  tcg/tcg.c                    |   13 +-
>  accel/tcg/Makefile.objs      |    2 +-
>  configure                    |   48 ++
>  13 files changed, 2023 insertions(+), 18 deletions(-)
>  create mode 100644 tcg/tcg-gvec-desc.h
>  create mode 100644 tcg/tcg-op-gvec.h
>  create mode 100644 accel/tcg/tcg-runtime-gvec.c
>  create mode 100644 tcg/tcg-op-gvec.c
>
> diff --git a/Makefile.target b/Makefile.target
> index 7f30a1e725..6549481096 100644
> --- a/Makefile.target
> +++ b/Makefile.target
> @@ -93,7 +93,7 @@ all: $(PROGS) stap
>  # cpu emulator library
>  obj-y += exec.o
>  obj-y += accel/
> -obj-$(CONFIG_TCG) += tcg/tcg.o tcg/tcg-op.o tcg/tcg-op-vec.o
> +obj-$(CONFIG_TCG) += tcg/tcg.o tcg/tcg-op.o tcg/tcg-op-vec.o tcg/tcg-op-gvec.o
>  obj-$(CONFIG_TCG) += tcg/tcg-common.o tcg/optimize.o
>  obj-$(CONFIG_TCG_INTERPRETER) += tcg/tci.o
>  obj-$(CONFIG_TCG_INTERPRETER) += disas/tci.o
> diff --git a/accel/tcg/tcg-runtime.h b/accel/tcg/tcg-runtime.h
> index 1df17d0ba9..76ee41ce58 100644
> --- a/accel/tcg/tcg-runtime.h
> +++ b/accel/tcg/tcg-runtime.h
> @@ -134,3 +134,32 @@ GEN_ATOMIC_HELPERS(xor_fetch)
>  GEN_ATOMIC_HELPERS(xchg)
>
>  #undef GEN_ATOMIC_HELPERS
> +
> +DEF_HELPER_FLAGS_3(gvec_mov, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
> +
> +DEF_HELPER_FLAGS_3(gvec_dup8, TCG_CALL_NO_RWG, void, ptr, i32, i32)
> +DEF_HELPER_FLAGS_3(gvec_dup16, TCG_CALL_NO_RWG, void, ptr, i32, i32)
> +DEF_HELPER_FLAGS_3(gvec_dup32, TCG_CALL_NO_RWG, void, ptr, i32, i32)
> +DEF_HELPER_FLAGS_3(gvec_dup64, TCG_CALL_NO_RWG, void, ptr, i32, i64)
> +
> +DEF_HELPER_FLAGS_4(gvec_add8, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
> +DEF_HELPER_FLAGS_4(gvec_add16, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
> +DEF_HELPER_FLAGS_4(gvec_add32, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
> +DEF_HELPER_FLAGS_4(gvec_add64, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
> +
> +DEF_HELPER_FLAGS_4(gvec_sub8, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
> +DEF_HELPER_FLAGS_4(gvec_sub16, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
> +DEF_HELPER_FLAGS_4(gvec_sub32, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
> +DEF_HELPER_FLAGS_4(gvec_sub64, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
> +
> +DEF_HELPER_FLAGS_3(gvec_neg8, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
> +DEF_HELPER_FLAGS_3(gvec_neg16, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
> +DEF_HELPER_FLAGS_3(gvec_neg32, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
> +DEF_HELPER_FLAGS_3(gvec_neg64, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
> +
> +DEF_HELPER_FLAGS_3(gvec_not, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
> +DEF_HELPER_FLAGS_4(gvec_and, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
> +DEF_HELPER_FLAGS_4(gvec_or, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
> +DEF_HELPER_FLAGS_4(gvec_xor, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
> +DEF_HELPER_FLAGS_4(gvec_andc, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
> +DEF_HELPER_FLAGS_4(gvec_orc, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
> diff --git a/tcg/tcg-gvec-desc.h b/tcg/tcg-gvec-desc.h
> new file mode 100644
> index 0000000000..3b4c2d9c69
> --- /dev/null
> +++ b/tcg/tcg-gvec-desc.h
> @@ -0,0 +1,49 @@
> +/*
> + * Generic vector operation descriptor
> + *
> + * Copyright (c) 2018 Linaro
> + *
> + * This library is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2 of the License, or (at your option) any later version.
> + *
> + * This library is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with this library; if not, see <http://www.gnu.org/licenses/>.
> + */
> +
> +/* ??? These bit widths are set for ARM SVE, maxing out at 256 byte vectors. */
> +#define SIMD_OPRSZ_SHIFT   0
> +#define SIMD_OPRSZ_BITS    5
> +
> +#define SIMD_MAXSZ_SHIFT   (SIMD_OPRSZ_SHIFT + SIMD_OPRSZ_BITS)
> +#define SIMD_MAXSZ_BITS    5
> +
> +#define SIMD_DATA_SHIFT    (SIMD_MAXSZ_SHIFT + SIMD_MAXSZ_BITS)
> +#define SIMD_DATA_BITS     (32 - SIMD_DATA_SHIFT)
> +
> +/* Create a descriptor from components.  */
> +uint32_t simd_desc(uint32_t oprsz, uint32_t maxsz, int32_t data);
> +
> +/* Extract the operation size from a descriptor.  */
> +static inline intptr_t simd_oprsz(uint32_t desc)
> +{
> +    return (extract32(desc, SIMD_OPRSZ_SHIFT, SIMD_OPRSZ_BITS) + 1) * 8;
> +}
> +
> +/* Extract the max vector size from a descriptor.  */
> +static inline intptr_t simd_maxsz(uint32_t desc)
> +{
> +    return (extract32(desc, SIMD_MAXSZ_SHIFT, SIMD_MAXSZ_BITS) + 1) * 8;
> +}
> +
> +/* Extract the operation-specific data from a descriptor.  */
> +static inline int32_t simd_data(uint32_t desc)
> +{
> +    return sextract32(desc, SIMD_DATA_SHIFT, SIMD_DATA_BITS);
> +}
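
For reference, a quick worked example of the descriptor encoding, computed
from the shift/width definitions above (illustration only, not part of the
patch; assumes this header and <assert.h> are available and the snippet is
linked against the simd_desc() definition):

    uint32_t desc = simd_desc(16, 16, 0);  /* (16 / 8) - 1 = 1 in each size field */
    assert(desc == 0x21);                  /* 1 | (1 << 5) */
    assert(simd_oprsz(desc) == 16);
    assert(simd_maxsz(desc) == 16);
    assert(simd_data(desc) == 0);
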
> diff --git a/tcg/tcg-op-gvec.h b/tcg/tcg-op-gvec.h
> new file mode 100644
> index 0000000000..5a7d640a9d
> --- /dev/null
> +++ b/tcg/tcg-op-gvec.h
> @@ -0,0 +1,198 @@
> +/*
> + * Generic vector operation expansion
> + *
> + * Copyright (c) 2018 Linaro
> + *
> + * This library is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2 of the License, or (at your option) any later version.
> + *
> + * This library is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with this library; if not, see <http://www.gnu.org/licenses/>.
> + */
> +
> +/*
> + * "Generic" vectors.  All operands are given as offsets from ENV,
> + * and therefore cannot also be allocated via tcg_global_mem_new_*.
> + * OPRSZ is the byte size of the vector upon which the operation is performed.
> + * MAXSZ is the byte size of the full vector; bytes beyond OPRSZ are cleared.
> + *
> + * All sizes must be 8 or any multiple of 16.
> + * When OPRSZ is 8, the alignment may be 8, otherwise must be 16.
> + * Operands may completely, but not partially, overlap.
> + */
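
To make the calling convention above concrete, here is a minimal usage
sketch. The CPU state type, field name and register indices are made up
for illustration; only the tcg_gen_gvec_add() signature declared below is
from the patch:

    /* Element-wise 32-bit add of two 16-byte vector registers held in env. */
    tcg_gen_gvec_add(MO_32,
                     offsetof(CPUFooState, vreg[rd]),  /* dofs: destination */
                     offsetof(CPUFooState, vreg[rn]),  /* aofs: first source */
                     offsetof(CPUFooState, vreg[rm]),  /* bofs: second source */
                     16 /* oprsz */, 16 /* maxsz */);
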
> +
> +/* Expand a call to a gvec-style helper, with pointers to two vector
> +   operands, and a descriptor (see tcg-gvec-desc.h).  */
> +typedef void gen_helper_gvec_2(TCGv_ptr, TCGv_ptr, TCGv_i32);
> +void tcg_gen_gvec_2_ool(uint32_t dofs, uint32_t aofs,
> +                        uint32_t oprsz, uint32_t maxsz, int32_t data,
> +                        gen_helper_gvec_2 *fn);
> +
> +/* Similarly, passing an extra pointer (e.g. env or float_status).  */
> +typedef void gen_helper_gvec_2_ptr(TCGv_ptr, TCGv_ptr, TCGv_ptr, TCGv_i32);
> +void tcg_gen_gvec_2_ptr(uint32_t dofs, uint32_t aofs,
> +                        TCGv_ptr ptr, uint32_t oprsz, uint32_t maxsz,
> +                        int32_t data, gen_helper_gvec_2_ptr *fn);
> +
> +/* Similarly, with three vector operands.  */
> +typedef void gen_helper_gvec_3(TCGv_ptr, TCGv_ptr, TCGv_ptr, TCGv_i32);
> +void tcg_gen_gvec_3_ool(uint32_t dofs, uint32_t aofs, uint32_t bofs,
> +                        uint32_t oprsz, uint32_t maxsz, int32_t data,
> +                        gen_helper_gvec_3 *fn);
> +
> +/* Similarly, with four vector operands.  */
> +typedef void gen_helper_gvec_4(TCGv_ptr, TCGv_ptr, TCGv_ptr,
> +                               TCGv_ptr, TCGv_i32);
> +void tcg_gen_gvec_4_ool(uint32_t dofs, uint32_t aofs, uint32_t bofs,
> +                        uint32_t cofs, uint32_t oprsz, uint32_t maxsz,
> +                        int32_t data, gen_helper_gvec_4 *fn);
> +
> +/* Similarly, with five vector operands.  */
> +typedef void gen_helper_gvec_5(TCGv_ptr, TCGv_ptr, TCGv_ptr, TCGv_ptr,
> +                               TCGv_ptr, TCGv_i32);
> +void tcg_gen_gvec_5_ool(uint32_t dofs, uint32_t aofs, uint32_t bofs,
> +                        uint32_t cofs, uint32_t xofs, uint32_t oprsz,
> +                        uint32_t maxsz, int32_t data, gen_helper_gvec_5 *fn);
> +
> +typedef void gen_helper_gvec_3_ptr(TCGv_ptr, TCGv_ptr, TCGv_ptr,
> +                                   TCGv_ptr, TCGv_i32);
> +void tcg_gen_gvec_3_ptr(uint32_t dofs, uint32_t aofs, uint32_t bofs,
> +                        TCGv_ptr ptr, uint32_t oprsz, uint32_t maxsz,
> +                        int32_t data, gen_helper_gvec_3_ptr *fn);
> +
> +typedef void gen_helper_gvec_4_ptr(TCGv_ptr, TCGv_ptr, TCGv_ptr,
> +                                   TCGv_ptr, TCGv_ptr, TCGv_i32);
> +void tcg_gen_gvec_4_ptr(uint32_t dofs, uint32_t aofs, uint32_t bofs,
> +                        uint32_t cofs, TCGv_ptr ptr, uint32_t oprsz,
> +                        uint32_t maxsz, int32_t data,
> +                        gen_helper_gvec_4_ptr *fn);
> +
> +/* Expand a gvec operation.  Either inline or out-of-line depending on
> +   the actual vector size and the operations supported by the host.  */
> +typedef struct {
> +    /* Expand inline as a 64-bit or 32-bit integer.
> +       Only one of these will be non-NULL.  */
> +    void (*fni8)(TCGv_i64, TCGv_i64);
> +    void (*fni4)(TCGv_i32, TCGv_i32);
> +    /* Expand inline with a host vector type.  */
> +    void (*fniv)(unsigned, TCGv_vec, TCGv_vec);
> +    /* Expand out-of-line helper w/descriptor.  */
> +    gen_helper_gvec_2 *fno;
> +    /* The opcode, if any, to which this corresponds.  */
> +    TCGOpcode opc;
> +    /* The data argument to the out-of-line helper.  */
> +    int32_t data;
> +    /* The vector element size, if applicable.  */
> +    uint8_t vece;
> +    /* Prefer i64 to v64.  */
> +    bool prefer_i64;
> +} GVecGen2;
> +
> +typedef struct {
> +    /* Expand inline as a 64-bit or 32-bit integer.
> +       Only one of these will be non-NULL.  */
> +    void (*fni8)(TCGv_i64, TCGv_i64, TCGv_i64);
> +    void (*fni4)(TCGv_i32, TCGv_i32, TCGv_i32);
> +    /* Expand inline with a host vector type.  */
> +    void (*fniv)(unsigned, TCGv_vec, TCGv_vec, TCGv_vec);
> +    /* Expand out-of-line helper w/descriptor.  */
> +    gen_helper_gvec_3 *fno;
> +    /* The opcode, if any, to which this corresponds.  */
> +    TCGOpcode opc;
> +    /* The data argument to the out-of-line helper.  */
> +    int32_t data;
> +    /* The vector element size, if applicable.  */
> +    uint8_t vece;
> +    /* Prefer i64 to v64.  */
> +    bool prefer_i64;
> +    /* Load dest as a 3rd source operand.  */
> +    bool load_dest;
> +} GVecGen3;
> +
> +typedef struct {
> +    /* Expand inline as a 64-bit or 32-bit integer.
> +       Only one of these will be non-NULL.  */
> +    void (*fni8)(TCGv_i64, TCGv_i64, TCGv_i64, TCGv_i64);
> +    void (*fni4)(TCGv_i32, TCGv_i32, TCGv_i32, TCGv_i32);
> +    /* Expand inline with a host vector type.  */
> +    void (*fniv)(unsigned, TCGv_vec, TCGv_vec, TCGv_vec, TCGv_vec);
> +    /* Expand out-of-line helper w/descriptor.  */
> +    gen_helper_gvec_4 *fno;
> +    /* The opcode, if any, to which this corresponds.  */
> +    TCGOpcode opc;
> +    /* The data argument to the out-of-line helper.  */
> +    int32_t data;
> +    /* The vector element size, if applicable.  */
> +    uint8_t vece;
> +    /* Prefer i64 to v64.  */
> +    bool prefer_i64;
> +} GVecGen4;
> +
> +void tcg_gen_gvec_2(uint32_t dofs, uint32_t aofs,
> +                    uint32_t oprsz, uint32_t maxsz, const GVecGen2 *);
> +void tcg_gen_gvec_3(uint32_t dofs, uint32_t aofs, uint32_t bofs,
> +                    uint32_t oprsz, uint32_t maxsz, const GVecGen3 *);
> +void tcg_gen_gvec_4(uint32_t dofs, uint32_t aofs, uint32_t bofs, uint32_t cofs,
> +                    uint32_t oprsz, uint32_t maxsz, const GVecGen4 *);
> +
> +/* Expand a specific vector operation.  */
> +
> +void tcg_gen_gvec_mov(unsigned vece, uint32_t dofs, uint32_t aofs,
> +                      uint32_t oprsz, uint32_t maxsz);
> +void tcg_gen_gvec_not(unsigned vece, uint32_t dofs, uint32_t aofs,
> +                      uint32_t oprsz, uint32_t maxsz);
> +void tcg_gen_gvec_neg(unsigned vece, uint32_t dofs, uint32_t aofs,
> +                      uint32_t oprsz, uint32_t maxsz);
> +
> +void tcg_gen_gvec_add(unsigned vece, uint32_t dofs, uint32_t aofs,
> +                      uint32_t bofs, uint32_t oprsz, uint32_t maxsz);
> +void tcg_gen_gvec_sub(unsigned vece, uint32_t dofs, uint32_t aofs,
> +                      uint32_t bofs, uint32_t oprsz, uint32_t maxsz);
> +
> +void tcg_gen_gvec_and(unsigned vece, uint32_t dofs, uint32_t aofs,
> +                      uint32_t bofs, uint32_t oprsz, uint32_t maxsz);
> +void tcg_gen_gvec_or(unsigned vece, uint32_t dofs, uint32_t aofs,
> +                     uint32_t bofs, uint32_t oprsz, uint32_t maxsz);
> +void tcg_gen_gvec_xor(unsigned vece, uint32_t dofs, uint32_t aofs,
> +                      uint32_t bofs, uint32_t oprsz, uint32_t maxsz);
> +void tcg_gen_gvec_andc(unsigned vece, uint32_t dofs, uint32_t aofs,
> +                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz);
> +void tcg_gen_gvec_orc(unsigned vece, uint32_t dofs, uint32_t aofs,
> +                      uint32_t bofs, uint32_t oprsz, uint32_t maxsz);
> +
> +void tcg_gen_gvec_dup_mem(unsigned vece, uint32_t dofs, uint32_t aofs,
> +                          uint32_t s, uint32_t m);
> +void tcg_gen_gvec_dup_i32(unsigned vece, uint32_t dofs, uint32_t s,
> +                          uint32_t m, TCGv_i32);
> +void tcg_gen_gvec_dup_i64(unsigned vece, uint32_t dofs, uint32_t s,
> +                          uint32_t m, TCGv_i64);
> +
> +void tcg_gen_gvec_dup8i(uint32_t dofs, uint32_t s, uint32_t m, uint8_t x);
> +void tcg_gen_gvec_dup16i(uint32_t dofs, uint32_t s, uint32_t m, uint16_t x);
> +void tcg_gen_gvec_dup32i(uint32_t dofs, uint32_t s, uint32_t m, uint32_t x);
> +void tcg_gen_gvec_dup64i(uint32_t dofs, uint32_t s, uint32_t m, uint64_t x);
> +
> +/*
> + * 64-bit vector operations.  Use these when the register has been allocated
> + * with tcg_global_mem_new_i64, and so we cannot also address it via pointer.
> + * OPRSZ = MAXSZ = 8.
> + */
> +
> +void tcg_gen_vec_neg8_i64(TCGv_i64 d, TCGv_i64 a);
> +void tcg_gen_vec_neg16_i64(TCGv_i64 d, TCGv_i64 a);
> +void tcg_gen_vec_neg32_i64(TCGv_i64 d, TCGv_i64 a);
> +
> +void tcg_gen_vec_add8_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b);
> +void tcg_gen_vec_add16_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b);
> +void tcg_gen_vec_add32_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b);
> +
> +void tcg_gen_vec_sub8_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b);
> +void tcg_gen_vec_sub16_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b);
> +void tcg_gen_vec_sub32_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b);
> diff --git a/tcg/tcg-op.h b/tcg/tcg-op.h
> index a684ab5890..f8ba63340e 100644
> --- a/tcg/tcg-op.h
> +++ b/tcg/tcg-op.h
> @@ -914,6 +914,7 @@ void tcg_gen_dup8i_vec(TCGv_vec, uint32_t);
>  void tcg_gen_dup16i_vec(TCGv_vec, uint32_t);
>  void tcg_gen_dup32i_vec(TCGv_vec, uint32_t);
>  void tcg_gen_dup64i_vec(TCGv_vec, uint64_t);
> +void tcg_gen_dupi_vec(unsigned vece, TCGv_vec, uint64_t);
>  void tcg_gen_add_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b);
>  void tcg_gen_sub_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b);
>  void tcg_gen_and_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b);
> diff --git a/tcg/tcg-opc.h b/tcg/tcg-opc.h
> index b851ad4bca..801b0b1e16 100644
> --- a/tcg/tcg-opc.h
> +++ b/tcg/tcg-opc.h
> @@ -228,6 +228,12 @@ DEF(andc_vec, 1, 2, 0, IMPLVEC | IMPL(TCG_TARGET_HAS_andc_vec))
>  DEF(orc_vec, 1, 2, 0, IMPLVEC | IMPL(TCG_TARGET_HAS_orc_vec))
>  DEF(not_vec, 1, 1, 0, IMPLVEC | IMPL(TCG_TARGET_HAS_not_vec))
>
> +DEF(last_generic, 0, 0, 0, TCG_OPF_NOT_PRESENT)
> +
> +#if TCG_TARGET_MAYBE_vec
> +#include "tcg-target.opc.h"
> +#endif
> +
>  #undef TLADDR_ARGS
>  #undef DATA64_ARGS
>  #undef IMPL
> diff --git a/tcg/tcg.h b/tcg/tcg.h
> index dce483b0ee..ec8f1bc72e 100644
> --- a/tcg/tcg.h
> +++ b/tcg/tcg.h
> @@ -1207,6 +1207,33 @@ uintptr_t tcg_qemu_tb_exec(CPUArchState *env, uint8_t *tb_ptr);
>
>  void tcg_register_jit(void *buf, size_t buf_size);
>
> +#if TCG_TARGET_MAYBE_vec
> +/* Return zero if the tuple (opc, type, vece) is unsupportable;
> +   return > 0 if it is directly supportable;
> +   return < 0 if we must call tcg_expand_vec_op.  */
> +int tcg_can_emit_vec_op(TCGOpcode, TCGType, unsigned);
> +#else
> +static inline int tcg_can_emit_vec_op(TCGOpcode o, TCGType t, unsigned ve)
> +{
> +    return 0;
> +}
> +#endif
> +
> +/* Expand the tuple (opc, type, vece) on the given arguments.  */
> +void tcg_expand_vec_op(TCGOpcode, TCGType, unsigned, TCGArg, ...);
> +
> +/* Replicate a constant C according to the log2 of the element size.  */
> +uint64_t dup_const(unsigned vece, uint64_t c);
> +
> +#define dup_const(VECE, C)                                         \
> +    (__builtin_constant_p(VECE)                                    \
> +     ? (  (VECE) == MO_8  ? 0x0101010101010101ull * (uint8_t)(C)   \
> +        : (VECE) == MO_16 ? 0x0001000100010001ull * (uint16_t)(C)  \
> +        : (VECE) == MO_32 ? 0x0000000100000001ull * (uint32_t)(C)  \
> +        : dup_const(VECE, C))                                      \
> +     : dup_const(VECE, C))
> +
> +
>  /*
>   * Memory helpers that will be used by TCG generated code.
>   */
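
As a quick illustration of what the dup_const() replication above produces
(the values follow directly from the multipliers in the macro; not part of
the patch):

    dup_const(MO_8,  0x5a);                   /* 0x5a5a5a5a5a5a5a5aull */
    dup_const(MO_16, 0x00ab);                 /* 0x00ab00ab00ab00abull */
    dup_const(MO_32, 0x1234);                 /* 0x0000123400001234ull */
    dup_const(MO_64, 0x1122334455667788ull);  /* constant unchanged */
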
> diff --git a/accel/tcg/tcg-runtime-gvec.c b/accel/tcg/tcg-runtime-gvec.c
> new file mode 100644
> index 0000000000..e093922225
> --- /dev/null
> +++ b/accel/tcg/tcg-runtime-gvec.c
> @@ -0,0 +1,325 @@
> +/*
> + * Generic vectorized operation runtime
> + *
> + * Copyright (c) 2018 Linaro
> + *
> + * This library is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2 of the License, or (at your option) any later version.
> + *
> + * This library is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with this library; if not, see <http://www.gnu.org/licenses/>.
> + */
> +
> +#include "qemu/osdep.h"
> +#include "qemu/host-utils.h"
> +#include "cpu.h"
> +#include "exec/helper-proto.h"
> +#include "tcg-gvec-desc.h"
> +
> +
> +/* Virtually all hosts support 16-byte vectors.  Those that don't can emulate
> + * them via GCC's generic vector extension.  This turns out to be simpler and
> + * more reliable than getting the compiler to autovectorize.
> + *
> + * In tcg-op-gvec.c, we asserted that both the size and alignment of the data
> + * are multiples of 16.
> + *
> + * When the compiler does not support all of the operations we require, the
> + * loops are written so that we can always fall back on the base types.
> + */
> +#ifdef CONFIG_VECTOR16
> +typedef uint8_t vec8 __attribute__((vector_size(16)));
> +typedef uint16_t vec16 __attribute__((vector_size(16)));
> +typedef uint32_t vec32 __attribute__((vector_size(16)));
> +typedef uint64_t vec64 __attribute__((vector_size(16)));
> +
> +typedef int8_t svec8 __attribute__((vector_size(16)));
> +typedef int16_t svec16 __attribute__((vector_size(16)));
> +typedef int32_t svec32 __attribute__((vector_size(16)));
> +typedef int64_t svec64 __attribute__((vector_size(16)));
> +
> +#define DUP16(X)  { X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, X }
> +#define DUP8(X)   { X, X, X, X, X, X, X, X }
> +#define DUP4(X)   { X, X, X, X }
> +#define DUP2(X)   { X, X }
> +#else
> +typedef uint8_t vec8;
> +typedef uint16_t vec16;
> +typedef uint32_t vec32;
> +typedef uint64_t vec64;
> +
> +typedef int8_t svec8;
> +typedef int16_t svec16;
> +typedef int32_t svec32;
> +typedef int64_t svec64;
> +
> +#define DUP16(X)  X
> +#define DUP8(X)   X
> +#define DUP4(X)   X
> +#define DUP2(X)   X
> +#endif /* CONFIG_VECTOR16 */
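
The trick the dual set of typedefs relies on is that the helper loops step
by sizeof(vecN), so the same loop body works whether vecN is a 16-byte GCC
vector or the plain scalar type. A standalone sketch of the idea, with
made-up names (HAVE_VECTOR16 standing in for CONFIG_VECTOR16):

    #include <stdint.h>
    #include <stddef.h>

    #ifdef HAVE_VECTOR16
    typedef uint32_t demo_vec32 __attribute__((vector_size(16)));
    #else
    typedef uint32_t demo_vec32;   /* scalar fallback */
    #endif

    /* Steps 16 bytes per iteration with the extension, 4 bytes without. */
    static void demo_add32(void *d, void *a, void *b, size_t bytes)
    {
        for (size_t i = 0; i < bytes; i += sizeof(demo_vec32)) {
            *(demo_vec32 *)((char *)d + i) = *(demo_vec32 *)((char *)a + i)
                                           + *(demo_vec32 *)((char *)b + i);
        }
    }
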
> +
> +static inline void clear_high(void *d, intptr_t oprsz, uint32_t desc)
> +{
> +    intptr_t maxsz = simd_maxsz(desc);
> +    intptr_t i;
> +
> +    if (unlikely(maxsz > oprsz)) {
> +        for (i = oprsz; i < maxsz; i += sizeof(uint64_t)) {
> +            *(uint64_t *)(d + i) = 0;
> +        }
> +    }
> +}
> +
> +void HELPER(gvec_add8)(void *d, void *a, void *b, uint32_t desc)
> +{
> +    intptr_t oprsz = simd_oprsz(desc);
> +    intptr_t i;
> +
> +    for (i = 0; i < oprsz; i += sizeof(vec8)) {
> +        *(vec8 *)(d + i) = *(vec8 *)(a + i) + *(vec8 *)(b + i);
> +    }
> +    clear_high(d, oprsz, desc);
> +}
> +
> +void HELPER(gvec_add16)(void *d, void *a, void *b, uint32_t desc)
> +{
> +    intptr_t oprsz = simd_oprsz(desc);
> +    intptr_t i;
> +
> +    for (i = 0; i < oprsz; i += sizeof(vec16)) {
> +        *(vec16 *)(d + i) = *(vec16 *)(a + i) + *(vec16 *)(b + i);
> +    }
> +    clear_high(d, oprsz, desc);
> +}
> +
> +void HELPER(gvec_add32)(void *d, void *a, void *b, uint32_t desc)
> +{
> +    intptr_t oprsz = simd_oprsz(desc);
> +    intptr_t i;
> +
> +    for (i = 0; i < oprsz; i += sizeof(vec32)) {
> +        *(vec32 *)(d + i) = *(vec32 *)(a + i) + *(vec32 *)(b + i);
> +    }
> +    clear_high(d, oprsz, desc);
> +}
> +
> +void HELPER(gvec_add64)(void *d, void *a, void *b, uint32_t desc)
> +{
> +    intptr_t oprsz = simd_oprsz(desc);
> +    intptr_t i;
> +
> +    for (i = 0; i < oprsz; i += sizeof(vec64)) {
> +        *(vec64 *)(d + i) = *(vec64 *)(a + i) + *(vec64 *)(b + i);
> +    }
> +    clear_high(d, oprsz, desc);
> +}
> +
> +void HELPER(gvec_sub8)(void *d, void *a, void *b, uint32_t desc)
> +{
> +    intptr_t oprsz = simd_oprsz(desc);
> +    intptr_t i;
> +
> +    for (i = 0; i < oprsz; i += sizeof(vec8)) {
> +        *(vec8 *)(d + i) = *(vec8 *)(a + i) - *(vec8 *)(b + i);
> +    }
> +    clear_high(d, oprsz, desc);
> +}
> +
> +void HELPER(gvec_sub16)(void *d, void *a, void *b, uint32_t desc)
> +{
> +    intptr_t oprsz = simd_oprsz(desc);
> +    intptr_t i;
> +
> +    for (i = 0; i < oprsz; i += sizeof(vec16)) {
> +        *(vec16 *)(d + i) = *(vec16 *)(a + i) - *(vec16 *)(b + i);
> +    }
> +    clear_high(d, oprsz, desc);
> +}
> +
> +void HELPER(gvec_sub32)(void *d, void *a, void *b, uint32_t desc)
> +{
> +    intptr_t oprsz = simd_oprsz(desc);
> +    intptr_t i;
> +
> +    for (i = 0; i < oprsz; i += sizeof(vec32)) {
> +        *(vec32 *)(d + i) = *(vec32 *)(a + i) - *(vec32 *)(b + i);
> +    }
> +    clear_high(d, oprsz, desc);
> +}
> +
> +void HELPER(gvec_sub64)(void *d, void *a, void *b, uint32_t desc)
> +{
> +    intptr_t oprsz = simd_oprsz(desc);
> +    intptr_t i;
> +
> +    for (i = 0; i < oprsz; i += sizeof(vec64)) {
> +        *(vec64 *)(d + i) = *(vec64 *)(a + i) - *(vec64 *)(b + i);
> +    }
> +    clear_high(d, oprsz, desc);
> +}
> +
> +void HELPER(gvec_neg8)(void *d, void *a, uint32_t desc)
> +{
> +    intptr_t oprsz = simd_oprsz(desc);
> +    intptr_t i;
> +
> +    for (i = 0; i < oprsz; i += sizeof(vec8)) {
> +        *(vec8 *)(d + i) = -*(vec8 *)(a + i);
> +    }
> +    clear_high(d, oprsz, desc);
> +}
> +
> +void HELPER(gvec_neg16)(void *d, void *a, uint32_t desc)
> +{
> +    intptr_t oprsz = simd_oprsz(desc);
> +    intptr_t i;
> +
> +    for (i = 0; i < oprsz; i += sizeof(vec16)) {
> +        *(vec16 *)(d + i) = -*(vec16 *)(a + i);
> +    }
> +    clear_high(d, oprsz, desc);
> +}
> +
> +void HELPER(gvec_neg32)(void *d, void *a, uint32_t desc)
> +{
> +    intptr_t oprsz = simd_oprsz(desc);
> +    intptr_t i;
> +
> +    for (i = 0; i < oprsz; i += sizeof(vec32)) {
> +        *(vec32 *)(d + i) = -*(vec32 *)(a + i);
> +    }
> +    clear_high(d, oprsz, desc);
> +}
> +
> +void HELPER(gvec_neg64)(void *d, void *a, uint32_t desc)
> +{
> +    intptr_t oprsz = simd_oprsz(desc);
> +    intptr_t i;
> +
> +    for (i = 0; i < oprsz; i += sizeof(vec64)) {
> +        *(vec64 *)(d + i) = -*(vec64 *)(a + i);
> +    }
> +    clear_high(d, oprsz, desc);
> +}
> +
> +void HELPER(gvec_mov)(void *d, void *a, uint32_t desc)
> +{
> +    intptr_t oprsz = simd_oprsz(desc);
> +
> +    memcpy(d, a, oprsz);
> +    clear_high(d, oprsz, desc);
> +}
> +
> +void HELPER(gvec_dup64)(void *d, uint32_t desc, uint64_t c)
> +{
> +    intptr_t oprsz = simd_oprsz(desc);
> +    intptr_t i;
> +
> +    if (c == 0) {
> +        oprsz = 0;
> +    } else {
> +        for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
> +            *(uint64_t *)(d + i) = c;
> +        }
> +    }
> +    clear_high(d, oprsz, desc);
> +}
> +
> +void HELPER(gvec_dup32)(void *d, uint32_t desc, uint32_t c)
> +{
> +    intptr_t oprsz = simd_oprsz(desc);
> +    intptr_t i;
> +
> +    if (c == 0) {
> +        oprsz = 0;
> +    } else {
> +        for (i = 0; i < oprsz; i += sizeof(uint32_t)) {
> +            *(uint32_t *)(d + i) = c;
> +        }
> +    }
> +    clear_high(d, oprsz, desc);
> +}
> +
> +void HELPER(gvec_dup16)(void *d, uint32_t desc, uint32_t c)
> +{
> +    HELPER(gvec_dup32)(d, desc, 0x00010001 * (c & 0xffff));
> +}
> +
> +void HELPER(gvec_dup8)(void *d, uint32_t desc, uint32_t c)
> +{
> +    HELPER(gvec_dup32)(d, desc, 0x01010101 * (c & 0xff));
> +}
> +
> +void HELPER(gvec_not)(void *d, void *a, uint32_t desc)
> +{
> +    intptr_t oprsz = simd_oprsz(desc);
> +    intptr_t i;
> +
> +    for (i = 0; i < oprsz; i += sizeof(vec64)) {
> +        *(vec64 *)(d + i) = ~*(vec64 *)(a + i);
> +    }
> +    clear_high(d, oprsz, desc);
> +}
> +
> +void HELPER(gvec_and)(void *d, void *a, void *b, uint32_t desc)
> +{
> +    intptr_t oprsz = simd_oprsz(desc);
> +    intptr_t i;
> +
> +    for (i = 0; i < oprsz; i += sizeof(vec64)) {
> +        *(vec64 *)(d + i) = *(vec64 *)(a + i) & *(vec64 *)(b + i);
> +    }
> +    clear_high(d, oprsz, desc);
> +}
> +
> +void HELPER(gvec_or)(void *d, void *a, void *b, uint32_t desc)
> +{
> +    intptr_t oprsz = simd_oprsz(desc);
> +    intptr_t i;
> +
> +    for (i = 0; i < oprsz; i += sizeof(vec64)) {
> +        *(vec64 *)(d + i) = *(vec64 *)(a + i) | *(vec64 *)(b + i);
> +    }
> +    clear_high(d, oprsz, desc);
> +}
> +
> +void HELPER(gvec_xor)(void *d, void *a, void *b, uint32_t desc)
> +{
> +    intptr_t oprsz = simd_oprsz(desc);
> +    intptr_t i;
> +
> +    for (i = 0; i < oprsz; i += sizeof(vec64)) {
> +        *(vec64 *)(d + i) = *(vec64 *)(a + i) ^ *(vec64 *)(b + i);
> +    }
> +    clear_high(d, oprsz, desc);
> +}
> +
> +void HELPER(gvec_andc)(void *d, void *a, void *b, uint32_t desc)
> +{
> +    intptr_t oprsz = simd_oprsz(desc);
> +    intptr_t i;
> +
> +    for (i = 0; i < oprsz; i += sizeof(vec64)) {
> +        *(vec64 *)(d + i) = *(vec64 *)(a + i) &~ *(vec64 *)(b + i);
> +    }
> +    clear_high(d, oprsz, desc);
> +}
> +
> +void HELPER(gvec_orc)(void *d, void *a, void *b, uint32_t desc)
> +{
> +    intptr_t oprsz = simd_oprsz(desc);
> +    intptr_t i;
> +
> +    for (i = 0; i < oprsz; i += sizeof(vec64)) {
> +        *(vec64 *)(d + i) = *(vec64 *)(a + i) |~ *(vec64 *)(b + i);
> +    }
> +    clear_high(d, oprsz, desc);
> +}
> diff --git a/tcg/tcg-op-gvec.c b/tcg/tcg-op-gvec.c
> new file mode 100644
> index 0000000000..85570c983a
> --- /dev/null
> +++ b/tcg/tcg-op-gvec.c
> @@ -0,0 +1,1308 @@
> +/*
> + * Generic vector operation expansion
> + *
> + * Copyright (c) 2018 Linaro
> + *
> + * This library is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2 of the License, or (at your option) any later version.
> + *
> + * This library is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with this library; if not, see <http://www.gnu.org/licenses/>.
> + */
> +
> +#include "qemu/osdep.h"
> +#include "qemu-common.h"
> +#include "tcg.h"
> +#include "tcg-op.h"
> +#include "tcg-op-gvec.h"
> +#include "tcg-gvec-desc.h"
> +
> +#define MAX_UNROLL  4
> +
> +/* Verify vector size and alignment rules.  OFS should be the OR of all
> +   of the operand offsets so that we can check them all at once.  */
> +static void check_size_align(uint32_t oprsz, uint32_t maxsz, uint32_t ofs)
> +{
> +    uint32_t align = maxsz > 16 || oprsz >= 16 ? 15 : 7;
> +    tcg_debug_assert(oprsz > 0);
> +    tcg_debug_assert(oprsz <= maxsz);
> +    tcg_debug_assert((oprsz & align) == 0);
> +    tcg_debug_assert((maxsz & align) == 0);
> +    tcg_debug_assert((ofs & align) == 0);
> +}
> +
> +/* Verify vector overlap rules for two operands.  */
> +static void check_overlap_2(uint32_t d, uint32_t a, uint32_t s)
> +{
> +    tcg_debug_assert(d == a || d + s <= a || a + s <= d);
> +}
> +
> +/* Verify vector overlap rules for three operands.  */
> +static void check_overlap_3(uint32_t d, uint32_t a, uint32_t b, uint32_t s)
> +{
> +    check_overlap_2(d, a, s);
> +    check_overlap_2(d, b, s);
> +    check_overlap_2(a, b, s);
> +}
> +
> +/* Verify vector overlap rules for four operands.  */
> +static void check_overlap_4(uint32_t d, uint32_t a, uint32_t b,
> +                            uint32_t c, uint32_t s)
> +{
> +    check_overlap_2(d, a, s);
> +    check_overlap_2(d, b, s);
> +    check_overlap_2(d, c, s);
> +    check_overlap_2(a, b, s);
> +    check_overlap_2(a, c, s);
> +    check_overlap_2(b, c, s);
> +}
> +
> +/* Create a descriptor from components.  */
> +uint32_t simd_desc(uint32_t oprsz, uint32_t maxsz, int32_t data)
> +{
> +    uint32_t desc = 0;
> +
> +    assert(oprsz % 8 == 0 && oprsz <= (8 << SIMD_OPRSZ_BITS));
> +    assert(maxsz % 8 == 0 && maxsz <= (8 << SIMD_MAXSZ_BITS));
> +    assert(data == sextract32(data, 0, SIMD_DATA_BITS));
> +
> +    oprsz = (oprsz / 8) - 1;
> +    maxsz = (maxsz / 8) - 1;
> +    desc = deposit32(desc, SIMD_OPRSZ_SHIFT, SIMD_OPRSZ_BITS, oprsz);
> +    desc = deposit32(desc, SIMD_MAXSZ_SHIFT, SIMD_MAXSZ_BITS, maxsz);
> +    desc = deposit32(desc, SIMD_DATA_SHIFT, SIMD_DATA_BITS, data);
> +
> +    return desc;
> +}
> +
> +/* Generate a call to a gvec-style helper with two vector operands.  */
> +void tcg_gen_gvec_2_ool(uint32_t dofs, uint32_t aofs,
> +                        uint32_t oprsz, uint32_t maxsz, int32_t data,
> +                        gen_helper_gvec_2 *fn)
> +{
> +    TCGv_ptr a0, a1;
> +    TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));
> +
> +    a0 = tcg_temp_new_ptr();
> +    a1 = tcg_temp_new_ptr();
> +
> +    tcg_gen_addi_ptr(a0, cpu_env, dofs);
> +    tcg_gen_addi_ptr(a1, cpu_env, aofs);
> +
> +    fn(a0, a1, desc);
> +
> +    tcg_temp_free_ptr(a0);
> +    tcg_temp_free_ptr(a1);
> +    tcg_temp_free_i32(desc);
> +}
> +
> +/* Generate a call to a gvec-style helper with three vector operands.  */
> +void tcg_gen_gvec_3_ool(uint32_t dofs, uint32_t aofs, uint32_t bofs,
> +                        uint32_t oprsz, uint32_t maxsz, int32_t data,
> +                        gen_helper_gvec_3 *fn)
> +{
> +    TCGv_ptr a0, a1, a2;
> +    TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));
> +
> +    a0 = tcg_temp_new_ptr();
> +    a1 = tcg_temp_new_ptr();
> +    a2 = tcg_temp_new_ptr();
> +
> +    tcg_gen_addi_ptr(a0, cpu_env, dofs);
> +    tcg_gen_addi_ptr(a1, cpu_env, aofs);
> +    tcg_gen_addi_ptr(a2, cpu_env, bofs);
> +
> +    fn(a0, a1, a2, desc);
> +
> +    tcg_temp_free_ptr(a0);
> +    tcg_temp_free_ptr(a1);
> +    tcg_temp_free_ptr(a2);
> +    tcg_temp_free_i32(desc);
> +}
> +
> +/* Generate a call to a gvec-style helper with four vector operands.  */
> +void tcg_gen_gvec_4_ool(uint32_t dofs, uint32_t aofs, uint32_t bofs,
> +                        uint32_t cofs, uint32_t oprsz, uint32_t maxsz,
> +                        int32_t data, gen_helper_gvec_4 *fn)
> +{
> +    TCGv_ptr a0, a1, a2, a3;
> +    TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));
> +
> +    a0 = tcg_temp_new_ptr();
> +    a1 = tcg_temp_new_ptr();
> +    a2 = tcg_temp_new_ptr();
> +    a3 = tcg_temp_new_ptr();
> +
> +    tcg_gen_addi_ptr(a0, cpu_env, dofs);
> +    tcg_gen_addi_ptr(a1, cpu_env, aofs);
> +    tcg_gen_addi_ptr(a2, cpu_env, bofs);
> +    tcg_gen_addi_ptr(a3, cpu_env, cofs);
> +
> +    fn(a0, a1, a2, a3, desc);
> +
> +    tcg_temp_free_ptr(a0);
> +    tcg_temp_free_ptr(a1);
> +    tcg_temp_free_ptr(a2);
> +    tcg_temp_free_ptr(a3);
> +    tcg_temp_free_i32(desc);
> +}
> +
> +/* Generate a call to a gvec-style helper with five vector operands.  */
> +void tcg_gen_gvec_5_ool(uint32_t dofs, uint32_t aofs, uint32_t bofs,
> +                        uint32_t cofs, uint32_t xofs, uint32_t oprsz,
> +                        uint32_t maxsz, int32_t data, gen_helper_gvec_5 *fn)
> +{
> +    TCGv_ptr a0, a1, a2, a3, a4;
> +    TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));
> +
> +    a0 = tcg_temp_new_ptr();
> +    a1 = tcg_temp_new_ptr();
> +    a2 = tcg_temp_new_ptr();
> +    a3 = tcg_temp_new_ptr();
> +    a4 = tcg_temp_new_ptr();
> +
> +    tcg_gen_addi_ptr(a0, cpu_env, dofs);
> +    tcg_gen_addi_ptr(a1, cpu_env, aofs);
> +    tcg_gen_addi_ptr(a2, cpu_env, bofs);
> +    tcg_gen_addi_ptr(a3, cpu_env, cofs);
> +    tcg_gen_addi_ptr(a4, cpu_env, xofs);
> +
> +    fn(a0, a1, a2, a3, a4, desc);
> +
> +    tcg_temp_free_ptr(a0);
> +    tcg_temp_free_ptr(a1);
> +    tcg_temp_free_ptr(a2);
> +    tcg_temp_free_ptr(a3);
> +    tcg_temp_free_ptr(a4);
> +    tcg_temp_free_i32(desc);
> +}
> +
> +/* Generate a call to a gvec-style helper with two vector operands
> +   and an extra pointer operand.  */
> +void tcg_gen_gvec_2_ptr(uint32_t dofs, uint32_t aofs,
> +                        TCGv_ptr ptr, uint32_t oprsz, uint32_t maxsz,
> +                        int32_t data, gen_helper_gvec_2_ptr *fn)
> +{
> +    TCGv_ptr a0, a1;
> +    TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));
> +
> +    a0 = tcg_temp_new_ptr();
> +    a1 = tcg_temp_new_ptr();
> +
> +    tcg_gen_addi_ptr(a0, cpu_env, dofs);
> +    tcg_gen_addi_ptr(a1, cpu_env, aofs);
> +
> +    fn(a0, a1, ptr, desc);
> +
> +    tcg_temp_free_ptr(a0);
> +    tcg_temp_free_ptr(a1);
> +    tcg_temp_free_i32(desc);
> +}
> +
> +/* Generate a call to a gvec-style helper with three vector operands
> +   and an extra pointer operand.  */
> +void tcg_gen_gvec_3_ptr(uint32_t dofs, uint32_t aofs, uint32_t bofs,
> +                        TCGv_ptr ptr, uint32_t oprsz, uint32_t maxsz,
> +                        int32_t data, gen_helper_gvec_3_ptr *fn)
> +{
> +    TCGv_ptr a0, a1, a2;
> +    TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));
> +
> +    a0 = tcg_temp_new_ptr();
> +    a1 = tcg_temp_new_ptr();
> +    a2 = tcg_temp_new_ptr();
> +
> +    tcg_gen_addi_ptr(a0, cpu_env, dofs);
> +    tcg_gen_addi_ptr(a1, cpu_env, aofs);
> +    tcg_gen_addi_ptr(a2, cpu_env, bofs);
> +
> +    fn(a0, a1, a2, ptr, desc);
> +
> +    tcg_temp_free_ptr(a0);
> +    tcg_temp_free_ptr(a1);
> +    tcg_temp_free_ptr(a2);
> +    tcg_temp_free_i32(desc);
> +}
> +
> +/* Generate a call to a gvec-style helper with four vector operands
> +   and an extra pointer operand.  */
> +void tcg_gen_gvec_4_ptr(uint32_t dofs, uint32_t aofs, uint32_t bofs,
> +                        uint32_t cofs, TCGv_ptr ptr, uint32_t oprsz,
> +                        uint32_t maxsz, int32_t data,
> +                        gen_helper_gvec_4_ptr *fn)
> +{
> +    TCGv_ptr a0, a1, a2, a3;
> +    TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));
> +
> +    a0 = tcg_temp_new_ptr();
> +    a1 = tcg_temp_new_ptr();
> +    a2 = tcg_temp_new_ptr();
> +    a3 = tcg_temp_new_ptr();
> +
> +    tcg_gen_addi_ptr(a0, cpu_env, dofs);
> +    tcg_gen_addi_ptr(a1, cpu_env, aofs);
> +    tcg_gen_addi_ptr(a2, cpu_env, bofs);
> +    tcg_gen_addi_ptr(a3, cpu_env, cofs);
> +
> +    fn(a0, a1, a2, a3, ptr, desc);
> +
> +    tcg_temp_free_ptr(a0);
> +    tcg_temp_free_ptr(a1);
> +    tcg_temp_free_ptr(a2);
> +    tcg_temp_free_ptr(a3);
> +    tcg_temp_free_i32(desc);
> +}
> +
> +/* Return true if we want to implement something of OPRSZ bytes
> +   in units of LNSZ.  This limits the expansion of inline code.  */
> +static inline bool check_size_impl(uint32_t oprsz, uint32_t lnsz)
> +{
> +    uint32_t lnct = oprsz / lnsz;
> +    return lnct >= 1 && lnct <= MAX_UNROLL;
> +}
> +
> +static void expand_clr(uint32_t dofs, uint32_t maxsz);
> +
> +/* Duplicate C as per VECE.  */
> +uint64_t (dup_const)(unsigned vece, uint64_t c)
> +{
> +    switch (vece) {
> +    case MO_8:
> +        return 0x0101010101010101ull * (uint8_t)c;
> +    case MO_16:
> +        return 0x0001000100010001ull * (uint16_t)c;
> +    case MO_32:
> +        return 0x0000000100000001ull * (uint32_t)c;
> +    case MO_64:
> +        return c;
> +    default:
> +        g_assert_not_reached();
> +    }
> +}
> +
> +/* Duplicate IN into OUT as per VECE.  */
> +static void gen_dup_i32(unsigned vece, TCGv_i32 out, TCGv_i32 in)
> +{
> +    switch (vece) {
> +    case MO_8:
> +        tcg_gen_ext8u_i32(out, in);
> +        tcg_gen_muli_i32(out, out, 0x01010101);
> +        break;
> +    case MO_16:
> +        tcg_gen_deposit_i32(out, in, in, 16, 16);
> +        break;
> +    case MO_32:
> +        tcg_gen_mov_i32(out, in);
> +        break;
> +    default:
> +        g_assert_not_reached();
> +    }
> +}
> +
> +static void gen_dup_i64(unsigned vece, TCGv_i64 out, TCGv_i64 in)
> +{
> +    switch (vece) {
> +    case MO_8:
> +        tcg_gen_ext8u_i64(out, in);
> +        tcg_gen_muli_i64(out, out, 0x0101010101010101ull);
> +        break;
> +    case MO_16:
> +        tcg_gen_ext16u_i64(out, in);
> +        tcg_gen_muli_i64(out, out, 0x0001000100010001ull);
> +        break;
> +    case MO_32:
> +        tcg_gen_deposit_i64(out, in, in, 32, 32);
> +        break;
> +    case MO_64:
> +        tcg_gen_mov_i64(out, in);
> +        break;
> +    default:
> +        g_assert_not_reached();
> +    }
> +}
> +
> +/* Set OPRSZ bytes at DOFS to replications of IN_32, IN_64 or IN_C.
> + * Only one of IN_32 or IN_64 may be set;
> + * IN_C is used if IN_32 and IN_64 are unset.
> + */
> +static void do_dup(unsigned vece, uint32_t dofs, uint32_t oprsz,
> +                   uint32_t maxsz, TCGv_i32 in_32, TCGv_i64 in_64,
> +                   uint64_t in_c)
> +{
> +    TCGType type;
> +    TCGv_i64 t_64;
> +    TCGv_i32 t_32, t_desc;
> +    TCGv_ptr t_ptr;
> +    uint32_t i;
> +
> +    assert(vece <= (in_32 ? MO_32 : MO_64));
> +    assert(in_32 == NULL || in_64 == NULL);
> +
> +    /* If we're storing 0, expand oprsz to maxsz.  */
> +    if (in_32 == NULL && in_64 == NULL) {
> +        in_c = dup_const(vece, in_c);
> +        if (in_c == 0) {
> +            oprsz = maxsz;
> +        }
> +    }
> +
> +    type = 0;
> +    if (TCG_TARGET_HAS_v256 && check_size_impl(oprsz, 32)) {
> +        type = TCG_TYPE_V256;
> +    } else if (TCG_TARGET_HAS_v128 && check_size_impl(oprsz, 16)) {
> +        type = TCG_TYPE_V128;
> +    } else if (TCG_TARGET_HAS_v64 && check_size_impl(oprsz, 8)
> +               /* Prefer integer when 64-bit host and no variable dup.  */
> +               && !(TCG_TARGET_REG_BITS == 64 && in_32 == NULL
> +                    && (in_64 == NULL || vece == MO_64))) {
> +        type = TCG_TYPE_V64;
> +    }
> +
> +    /* Implement inline with a vector type, if possible.  */
> +    if (type != 0) {
> +        TCGv_vec t_vec = tcg_temp_new_vec(type);
> +
> +        if (in_32) {
> +            tcg_gen_dup_i32_vec(vece, t_vec, in_32);
> +        } else if (in_64) {
> +            tcg_gen_dup_i64_vec(vece, t_vec, in_64);
> +        } else {
> +            switch (vece) {
> +            case MO_8:
> +                tcg_gen_dup8i_vec(t_vec, in_c);
> +                break;
> +            case MO_16:
> +                tcg_gen_dup16i_vec(t_vec, in_c);
> +                break;
> +            case MO_32:
> +                tcg_gen_dup32i_vec(t_vec, in_c);
> +                break;
> +            default:
> +                tcg_gen_dup64i_vec(t_vec, in_c);
> +                break;
> +            }
> +        }
> +
> +        i = 0;
> +        if (TCG_TARGET_HAS_v256) {
> +            for (; i + 32 <= oprsz; i += 32) {
> +                tcg_gen_stl_vec(t_vec, cpu_env, dofs + i, TCG_TYPE_V256);
> +            }
> +        }
> +        if (TCG_TARGET_HAS_v128) {
> +            for (; i + 16 <= oprsz; i += 16) {
> +                tcg_gen_stl_vec(t_vec, cpu_env, dofs + i, TCG_TYPE_V128);
> +            }
> +        }
> +        if (TCG_TARGET_HAS_v64) {
> +            for (; i < oprsz; i += 8) {
> +                tcg_gen_stl_vec(t_vec, cpu_env, dofs + i, TCG_TYPE_V64);
> +            }
> +        }
> +        tcg_temp_free_vec(t_vec);
> +        goto done;
> +    }
> +
> +    /* Otherwise, inline with an integer type, unless "large".  */
> +    if (check_size_impl(oprsz, TCG_TARGET_REG_BITS / 8)) {
> +        t_64 = NULL;
> +        t_32 = NULL;
> +
> +        if (in_32) {
> +            /* We are given a 32-bit variable input.  For a 64-bit host,
> +               use a 64-bit operation unless the 32-bit operation would
> +               be simple enough.  */
> +            if (TCG_TARGET_REG_BITS == 64
> +                && (vece != MO_32 || !check_size_impl(oprsz, 4))) {
> +                t_64 = tcg_temp_new_i64();
> +                tcg_gen_extu_i32_i64(t_64, in_32);
> +                gen_dup_i64(vece, t_64, t_64);
> +            } else {
> +                t_32 = tcg_temp_new_i32();
> +                gen_dup_i32(vece, t_32, in_32);
> +            }
> +        } else if (in_64) {
> +            /* We are given a 64-bit variable input.  */
> +            t_64 = tcg_temp_new_i64();
> +            gen_dup_i64(vece, t_64, in_64);
> +        } else {
> +            /* We are given a constant input.  */
> +            /* For 64-bit hosts, use 64-bit constants for "simple" constants
> +               or when we'd need too many 32-bit stores, or when a 64-bit
> +               constant is really required.  */
> +            if (vece == MO_64
> +                || (TCG_TARGET_REG_BITS == 64
> +                    && (in_c == 0 || in_c == -1
> +                        || !check_size_impl(oprsz, 4)))) {
> +                t_64 = tcg_const_i64(in_c);
> +            } else {
> +                t_32 = tcg_const_i32(in_c);
> +            }
> +        }
> +
> +        /* Implement inline if we picked an implementation size above.  */
> +        if (t_32) {
> +            for (i = 0; i < oprsz; i += 4) {
> +                tcg_gen_st_i32(t_32, cpu_env, dofs + i);
> +            }
> +            tcg_temp_free_i32(t_32);
> +            goto done;
> +        }
> +        if (t_64) {
> +            for (i = 0; i < oprsz; i += 8) {
> +                tcg_gen_st_i64(t_64, cpu_env, dofs + i);
> +            }
> +            tcg_temp_free_i64(t_64);
> +            goto done;
> +        }
> +    }
> +
> +    /* Otherwise implement out of line.  */
> +    t_ptr = tcg_temp_new_ptr();
> +    tcg_gen_addi_ptr(t_ptr, cpu_env, dofs);
> +    t_desc = tcg_const_i32(simd_desc(oprsz, maxsz, 0));
> +
> +    if (vece == MO_64) {
> +        if (in_64) {
> +            gen_helper_gvec_dup64(t_ptr, t_desc, in_64);
> +        } else {
> +            t_64 = tcg_const_i64(in_c);
> +            gen_helper_gvec_dup64(t_ptr, t_desc, t_64);
> +            tcg_temp_free_i64(t_64);
> +        }
> +    } else {
> +        typedef void dup_fn(TCGv_ptr, TCGv_i32, TCGv_i32);
> +        static dup_fn * const fns[3] = {
> +            gen_helper_gvec_dup8,
> +            gen_helper_gvec_dup16,
> +            gen_helper_gvec_dup32
> +        };
> +
> +        if (in_32) {
> +            fns[vece](t_ptr, t_desc, in_32);
> +        } else {
> +            t_32 = tcg_temp_new_i32();
> +            if (in_64) {
> +                tcg_gen_extrl_i64_i32(t_32, in_64);
> +            } else if (vece == MO_8) {
> +                tcg_gen_movi_i32(t_32, in_c & 0xff);
> +            } else if (vece == MO_16) {
> +                tcg_gen_movi_i32(t_32, in_c & 0xffff);
> +            } else {
> +                tcg_gen_movi_i32(t_32, in_c);
> +            }
> +            fns[vece](t_ptr, t_desc, t_32);
> +            tcg_temp_free_i32(t_32);
> +        }
> +    }
> +
> +    tcg_temp_free_ptr(t_ptr);
> +    tcg_temp_free_i32(t_desc);
> +    return;
> +
> + done:
> +    if (oprsz < maxsz) {
> +        expand_clr(dofs + oprsz, maxsz - oprsz);
> +    }
> +}
> +
> +/* Likewise, but with zero.  */
> +static void expand_clr(uint32_t dofs, uint32_t maxsz)
> +{
> +    do_dup(MO_8, dofs, maxsz, maxsz, NULL, NULL, 0);
> +}
> +
> +/* Expand OPSZ bytes worth of two-operand operations using i32 elements.  */
> +static void expand_2_i32(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
> +                         void (*fni)(TCGv_i32, TCGv_i32))
> +{
> +    TCGv_i32 t0 = tcg_temp_new_i32();
> +    uint32_t i;
> +
> +    for (i = 0; i < oprsz; i += 4) {
> +        tcg_gen_ld_i32(t0, cpu_env, aofs + i);
> +        fni(t0, t0);
> +        tcg_gen_st_i32(t0, cpu_env, dofs + i);
> +    }
> +    tcg_temp_free_i32(t0);
> +}
> +
> +/* Expand OPSZ bytes worth of three-operand operations using i32 elements.  */
> +static void expand_3_i32(uint32_t dofs, uint32_t aofs,
> +                         uint32_t bofs, uint32_t oprsz, bool load_dest,
> +                         void (*fni)(TCGv_i32, TCGv_i32, TCGv_i32))
> +{
> +    TCGv_i32 t0 = tcg_temp_new_i32();
> +    TCGv_i32 t1 = tcg_temp_new_i32();
> +    TCGv_i32 t2 = tcg_temp_new_i32();
> +    uint32_t i;
> +
> +    for (i = 0; i < oprsz; i += 4) {
> +        tcg_gen_ld_i32(t0, cpu_env, aofs + i);
> +        tcg_gen_ld_i32(t1, cpu_env, bofs + i);
> +        if (load_dest) {
> +            tcg_gen_ld_i32(t2, cpu_env, dofs + i);
> +        }
> +        fni(t2, t0, t1);
> +        tcg_gen_st_i32(t2, cpu_env, dofs + i);
> +    }
> +    tcg_temp_free_i32(t2);
> +    tcg_temp_free_i32(t1);
> +    tcg_temp_free_i32(t0);
> +}
> +
> +/* Expand OPSZ bytes worth of four-operand operations using i32 elements.  */
> +static void expand_4_i32(uint32_t dofs, uint32_t aofs, uint32_t bofs,
> +                         uint32_t cofs, uint32_t oprsz,
> +                         void (*fni)(TCGv_i32, TCGv_i32, TCGv_i32, TCGv_i32))
> +{
> +    TCGv_i32 t0 = tcg_temp_new_i32();
> +    TCGv_i32 t1 = tcg_temp_new_i32();
> +    TCGv_i32 t2 = tcg_temp_new_i32();
> +    TCGv_i32 t3 = tcg_temp_new_i32();
> +    uint32_t i;
> +
> +    for (i = 0; i < oprsz; i += 4) {
> +        tcg_gen_ld_i32(t1, cpu_env, aofs + i);
> +        tcg_gen_ld_i32(t2, cpu_env, bofs + i);
> +        tcg_gen_ld_i32(t3, cpu_env, cofs + i);
> +        fni(t0, t1, t2, t3);
> +        tcg_gen_st_i32(t0, cpu_env, dofs + i);
> +    }
> +    tcg_temp_free_i32(t3);
> +    tcg_temp_free_i32(t2);
> +    tcg_temp_free_i32(t1);
> +    tcg_temp_free_i32(t0);
> +}
> +
> +/* Expand OPSZ bytes worth of two-operand operations using i64 elements.  */
> +static void expand_2_i64(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
> +                         void (*fni)(TCGv_i64, TCGv_i64))
> +{
> +    TCGv_i64 t0 = tcg_temp_new_i64();
> +    uint32_t i;
> +
> +    for (i = 0; i < oprsz; i += 8) {
> +        tcg_gen_ld_i64(t0, cpu_env, aofs + i);
> +        fni(t0, t0);
> +        tcg_gen_st_i64(t0, cpu_env, dofs + i);
> +    }
> +    tcg_temp_free_i64(t0);
> +}
> +
> +/* Expand OPSZ bytes worth of three-operand operations using i64 elements.  */
> +static void expand_3_i64(uint32_t dofs, uint32_t aofs,
> +                         uint32_t bofs, uint32_t oprsz, bool load_dest,
> +                         void (*fni)(TCGv_i64, TCGv_i64, TCGv_i64))
> +{
> +    TCGv_i64 t0 = tcg_temp_new_i64();
> +    TCGv_i64 t1 = tcg_temp_new_i64();
> +    TCGv_i64 t2 = tcg_temp_new_i64();
> +    uint32_t i;
> +
> +    for (i = 0; i < oprsz; i += 8) {
> +        tcg_gen_ld_i64(t0, cpu_env, aofs + i);
> +        tcg_gen_ld_i64(t1, cpu_env, bofs + i);
> +        if (load_dest) {
> +            tcg_gen_ld_i64(t2, cpu_env, dofs + i);
> +        }
> +        fni(t2, t0, t1);
> +        tcg_gen_st_i64(t2, cpu_env, dofs + i);
> +    }
> +    tcg_temp_free_i64(t2);
> +    tcg_temp_free_i64(t1);
> +    tcg_temp_free_i64(t0);
> +}
> +
> +/* Expand OPSZ bytes worth of four-operand operations using i64 elements.  */
> +static void expand_4_i64(uint32_t dofs, uint32_t aofs, uint32_t bofs,
> +                         uint32_t cofs, uint32_t oprsz,
> +                         void (*fni)(TCGv_i64, TCGv_i64, TCGv_i64, TCGv_i64))
> +{
> +    TCGv_i64 t0 = tcg_temp_new_i64();
> +    TCGv_i64 t1 = tcg_temp_new_i64();
> +    TCGv_i64 t2 = tcg_temp_new_i64();
> +    TCGv_i64 t3 = tcg_temp_new_i64();
> +    uint32_t i;
> +
> +    for (i = 0; i < oprsz; i += 8) {
> +        tcg_gen_ld_i64(t1, cpu_env, aofs + i);
> +        tcg_gen_ld_i64(t2, cpu_env, bofs + i);
> +        tcg_gen_ld_i64(t3, cpu_env, cofs + i);
> +        fni(t0, t1, t2, t3);
> +        tcg_gen_st_i64(t0, cpu_env, dofs + i);
> +    }
> +    tcg_temp_free_i64(t3);
> +    tcg_temp_free_i64(t2);
> +    tcg_temp_free_i64(t1);
> +    tcg_temp_free_i64(t0);
> +}
> +
> +/* Expand OPSZ bytes worth of two-operand operations using host vectors.  */
> +static void expand_2_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
> +                         uint32_t oprsz, uint32_t tysz, TCGType type,
> +                         void (*fni)(unsigned, TCGv_vec, TCGv_vec))
> +{
> +    TCGv_vec t0 = tcg_temp_new_vec(type);
> +    uint32_t i;
> +
> +    for (i = 0; i < oprsz; i += tysz) {
> +        tcg_gen_ld_vec(t0, cpu_env, aofs + i);
> +        fni(vece, t0, t0);
> +        tcg_gen_st_vec(t0, cpu_env, dofs + i);
> +    }
> +    tcg_temp_free_vec(t0);
> +}
> +
> +/* Expand OPSZ bytes worth of three-operand operations using host vectors.  */
> +static void expand_3_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
> +                         uint32_t bofs, uint32_t oprsz,
> +                         uint32_t tysz, TCGType type, bool load_dest,
> +                         void (*fni)(unsigned, TCGv_vec, TCGv_vec, TCGv_vec))
> +{
> +    TCGv_vec t0 = tcg_temp_new_vec(type);
> +    TCGv_vec t1 = tcg_temp_new_vec(type);
> +    TCGv_vec t2 = tcg_temp_new_vec(type);
> +    uint32_t i;
> +
> +    for (i = 0; i < oprsz; i += tysz) {
> +        tcg_gen_ld_vec(t0, cpu_env, aofs + i);
> +        tcg_gen_ld_vec(t1, cpu_env, bofs + i);
> +        if (load_dest) {
> +            tcg_gen_ld_vec(t2, cpu_env, dofs + i);
> +        }
> +        fni(vece, t2, t0, t1);
> +        tcg_gen_st_vec(t2, cpu_env, dofs + i);
> +    }
> +    tcg_temp_free_vec(t2);
> +    tcg_temp_free_vec(t1);
> +    tcg_temp_free_vec(t0);
> +}
> +
> +/* Expand OPSZ bytes worth of four-operand operations using host vectors.  */
> +static void expand_4_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
> +                         uint32_t bofs, uint32_t cofs, uint32_t oprsz,
> +                         uint32_t tysz, TCGType type,
> +                         void (*fni)(unsigned, TCGv_vec, TCGv_vec,
> +                                     TCGv_vec, TCGv_vec))
> +{
> +    TCGv_vec t0 = tcg_temp_new_vec(type);
> +    TCGv_vec t1 = tcg_temp_new_vec(type);
> +    TCGv_vec t2 = tcg_temp_new_vec(type);
> +    TCGv_vec t3 = tcg_temp_new_vec(type);
> +    uint32_t i;
> +
> +    for (i = 0; i < oprsz; i += tysz) {
> +        tcg_gen_ld_vec(t1, cpu_env, aofs + i);
> +        tcg_gen_ld_vec(t2, cpu_env, bofs + i);
> +        tcg_gen_ld_vec(t3, cpu_env, cofs + i);
> +        fni(vece, t0, t1, t2, t3);
> +        tcg_gen_st_vec(t0, cpu_env, dofs + i);
> +    }
> +    tcg_temp_free_vec(t3);
> +    tcg_temp_free_vec(t2);
> +    tcg_temp_free_vec(t1);
> +    tcg_temp_free_vec(t0);
> +}
> +
> +/* Expand a vector two-operand operation.  */
> +void tcg_gen_gvec_2(uint32_t dofs, uint32_t aofs,
> +                    uint32_t oprsz, uint32_t maxsz, const GVecGen2 *g)
> +{
> +    check_size_align(oprsz, maxsz, dofs | aofs);
> +    check_overlap_2(dofs, aofs, maxsz);
> +
> +    /* Recall that ARM SVE allows vector sizes that are not a power of 2.
> +       Expand with successively smaller host vector sizes.  The intent is
> +       that e.g. oprsz == 80 would be expanded with 2x32 + 1x16.  */
> +    /* ??? For maxsz > oprsz, the host may be able to use an opr-sized
> +       operation, zeroing the balance of the register.  We can then
> +       use a max-sized store to implement the clearing without an extra
> +       store operation.  This is true for aarch64 and x86_64 hosts.  */
> +
> +    if (TCG_TARGET_HAS_v256 && g->fniv && check_size_impl(oprsz, 32)
> +        && (!g->opc || tcg_can_emit_vec_op(g->opc, TCG_TYPE_V256, g->vece))) {
> +        uint32_t some = QEMU_ALIGN_DOWN(oprsz, 32);
> +        expand_2_vec(g->vece, dofs, aofs, some, 32, TCG_TYPE_V256, g->fniv);
> +        if (some == oprsz) {
> +            goto done;
> +        }
> +        dofs += some;
> +        aofs += some;
> +        oprsz -= some;
> +        maxsz -= some;
> +    }
> +
> +    if (TCG_TARGET_HAS_v128 && g->fniv && check_size_impl(oprsz, 16)
> +        && (!g->opc || tcg_can_emit_vec_op(g->opc, TCG_TYPE_V128, g->vece))) {
> +        expand_2_vec(g->vece, dofs, aofs, oprsz, 16, TCG_TYPE_V128, g->fniv);
> +    } else if (TCG_TARGET_HAS_v64 && !g->prefer_i64
> +               && g->fniv && check_size_impl(oprsz, 8)
> +               && (!g->opc
> +                   || tcg_can_emit_vec_op(g->opc, TCG_TYPE_V64, g->vece))) {
> +        expand_2_vec(g->vece, dofs, aofs, oprsz, 8, TCG_TYPE_V64, g->fniv);
> +    } else if (g->fni8 && check_size_impl(oprsz, 8)) {
> +        expand_2_i64(dofs, aofs, oprsz, g->fni8);
> +    } else if (g->fni4 && check_size_impl(oprsz, 4)) {
> +        expand_2_i32(dofs, aofs, oprsz, g->fni4);
> +    } else {
> +        assert(g->fno != NULL);
> +        tcg_gen_gvec_2_ool(dofs, aofs, oprsz, maxsz, g->data, g->fno);
> +        return;
> +    }
> +
> + done:
> +    if (oprsz < maxsz) {
> +        expand_clr(dofs + oprsz, maxsz - oprsz);
> +    }
> +}
> +
> +/* Expand a vector three-operand operation.  */
> +void tcg_gen_gvec_3(uint32_t dofs, uint32_t aofs, uint32_t bofs,
> +                    uint32_t oprsz, uint32_t maxsz, const GVecGen3 *g)
> +{
> +    check_size_align(oprsz, maxsz, dofs | aofs | bofs);
> +    check_overlap_3(dofs, aofs, bofs, maxsz);
> +
> +    /* Recall that ARM SVE allows vector sizes that are not a power of 2.
> +       Expand with successively smaller host vector sizes.  The intent is
> +       that e.g. oprsz == 80 would be expanded with 2x32 + 1x16.  */
> +
> +    if (TCG_TARGET_HAS_v256 && g->fniv && check_size_impl(oprsz, 32)
> +        && (!g->opc || tcg_can_emit_vec_op(g->opc, TCG_TYPE_V256, g->vece))) {
> +        uint32_t some = QEMU_ALIGN_DOWN(oprsz, 32);
> +        expand_3_vec(g->vece, dofs, aofs, bofs, some, 32, TCG_TYPE_V256,
> +                     g->load_dest, g->fniv);
> +        if (some == oprsz) {
> +            goto done;
> +        }
> +        dofs += some;
> +        aofs += some;
> +        bofs += some;
> +        oprsz -= some;
> +        maxsz -= some;
> +    }
> +
> +    if (TCG_TARGET_HAS_v128 && g->fniv && check_size_impl(oprsz, 16)
> +        && (!g->opc || tcg_can_emit_vec_op(g->opc, TCG_TYPE_V128, g->vece))) {
> +        expand_3_vec(g->vece, dofs, aofs, bofs, oprsz, 16, TCG_TYPE_V128,
> +                     g->load_dest, g->fniv);
> +    } else if (TCG_TARGET_HAS_v64 && !g->prefer_i64
> +               && g->fniv && check_size_impl(oprsz, 8)
> +               && (!g->opc
> +                   || tcg_can_emit_vec_op(g->opc, TCG_TYPE_V64, g->vece))) {
> +        expand_3_vec(g->vece, dofs, aofs, bofs, oprsz, 8, TCG_TYPE_V64,
> +                     g->load_dest, g->fniv);
> +    } else if (g->fni8 && check_size_impl(oprsz, 8)) {
> +        expand_3_i64(dofs, aofs, bofs, oprsz, g->load_dest, g->fni8);
> +    } else if (g->fni4 && check_size_impl(oprsz, 4)) {
> +        expand_3_i32(dofs, aofs, bofs, oprsz, g->load_dest, g->fni4);
> +    } else {
> +        assert(g->fno != NULL);
> +        tcg_gen_gvec_3_ool(dofs, aofs, bofs, oprsz, maxsz, g->data, g->fno);
> +        return;
> +    }
> +
> + done:
> +    if (oprsz < maxsz) {
> +        expand_clr(dofs + oprsz, maxsz - oprsz);
> +    }
> +}
> +
> +/* Expand a vector four-operand operation.  */
> +void tcg_gen_gvec_4(uint32_t dofs, uint32_t aofs, uint32_t bofs, uint32_t cofs,
> +                    uint32_t oprsz, uint32_t maxsz, const GVecGen4 *g)
> +{
> +    check_size_align(oprsz, maxsz, dofs | aofs | bofs | cofs);
> +    check_overlap_4(dofs, aofs, bofs, cofs, maxsz);
> +
> +    /* Recall that ARM SVE allows vector sizes that are not a power of 2.
> +       Expand with successively smaller host vector sizes.  The intent is
> +       that e.g. oprsz == 80 would be expanded with 2x32 + 1x16.  */
> +
> +    if (TCG_TARGET_HAS_v256 && g->fniv && check_size_impl(oprsz, 32)
> +        && (!g->opc || tcg_can_emit_vec_op(g->opc, TCG_TYPE_V256, g->vece))) {
> +        uint32_t some = QEMU_ALIGN_DOWN(oprsz, 32);
> +        expand_4_vec(g->vece, dofs, aofs, bofs, cofs, some,
> +                     32, TCG_TYPE_V256, g->fniv);
> +        if (some == oprsz) {
> +            goto done;
> +        }
> +        dofs += some;
> +        aofs += some;
> +        bofs += some;
> +        cofs += some;
> +        oprsz -= some;
> +        maxsz -= some;
> +    }
> +
> +    if (TCG_TARGET_HAS_v128 && g->fniv && check_size_impl(oprsz, 16)
> +        && (!g->opc || tcg_can_emit_vec_op(g->opc, TCG_TYPE_V128, g->vece))) {
> +        expand_4_vec(g->vece, dofs, aofs, bofs, cofs, oprsz,
> +                     16, TCG_TYPE_V128, g->fniv);
> +    } else if (TCG_TARGET_HAS_v64 && !g->prefer_i64
> +               && g->fniv && check_size_impl(oprsz, 8)
> +                && (!g->opc
> +                    || tcg_can_emit_vec_op(g->opc, TCG_TYPE_V64, g->vece))) {
> +        expand_4_vec(g->vece, dofs, aofs, bofs, cofs, oprsz,
> +                     8, TCG_TYPE_V64, g->fniv);
> +    } else if (g->fni8 && check_size_impl(oprsz, 8)) {
> +        expand_4_i64(dofs, aofs, bofs, cofs, oprsz, g->fni8);
> +    } else if (g->fni4 && check_size_impl(oprsz, 4)) {
> +        expand_4_i32(dofs, aofs, bofs, cofs, oprsz, g->fni4);
> +    } else {
> +        assert(g->fno != NULL);
> +        tcg_gen_gvec_4_ool(dofs, aofs, bofs, cofs,
> +                           oprsz, maxsz, g->data, g->fno);
> +        return;
> +    }
> +
> + done:
> +    if (oprsz < maxsz) {
> +        expand_clr(dofs + oprsz, maxsz - oprsz);
> +    }
> +}
> +
> +/*
> + * Expand specific vector operations.
> + */
> +
> +static void vec_mov2(unsigned vece, TCGv_vec a, TCGv_vec b)
> +{
> +    tcg_gen_mov_vec(a, b);
> +}
> +
> +void tcg_gen_gvec_mov(unsigned vece, uint32_t dofs, uint32_t aofs,
> +                      uint32_t oprsz, uint32_t maxsz)
> +{
> +    static const GVecGen2 g = {
> +        .fni8 = tcg_gen_mov_i64,
> +        .fniv = vec_mov2,
> +        .fno = gen_helper_gvec_mov,
> +        .prefer_i64 = TCG_TARGET_REG_BITS == 64,
> +    };
> +    if (dofs != aofs) {
> +        tcg_gen_gvec_2(dofs, aofs, oprsz, maxsz, &g);
> +    } else {
> +        check_size_align(oprsz, maxsz, dofs);
> +        if (oprsz < maxsz) {
> +            expand_clr(dofs + oprsz, maxsz - oprsz);
> +        }
> +    }
> +}
> +
> +void tcg_gen_gvec_dup_i32(unsigned vece, uint32_t dofs, uint32_t oprsz,
> +                          uint32_t maxsz, TCGv_i32 in)
> +{
> +    check_size_align(oprsz, maxsz, dofs);
> +    tcg_debug_assert(vece <= MO_32);
> +    do_dup(vece, dofs, oprsz, maxsz, in, NULL, 0);
> +}
> +
> +void tcg_gen_gvec_dup_i64(unsigned vece, uint32_t dofs, uint32_t oprsz,
> +                          uint32_t maxsz, TCGv_i64 in)
> +{
> +    check_size_align(oprsz, maxsz, dofs);
> +    tcg_debug_assert(vece <= MO_64);
> +    do_dup(vece, dofs, oprsz, maxsz, NULL, in, 0);
> +}
> +
> +void tcg_gen_gvec_dup_mem(unsigned vece, uint32_t dofs, uint32_t aofs,
> +                          uint32_t oprsz, uint32_t maxsz)
> +{
> +    if (vece <= MO_32) {
> +        TCGv_i32 in = tcg_temp_new_i32();
> +        switch (vece) {
> +        case MO_8:
> +            tcg_gen_ld8u_i32(in, cpu_env, aofs);
> +            break;
> +        case MO_16:
> +            tcg_gen_ld16u_i32(in, cpu_env, aofs);
> +            break;
> +        case MO_32:
> +            tcg_gen_ld_i32(in, cpu_env, aofs);
> +            break;
> +        }
> +        tcg_gen_gvec_dup_i32(vece, dofs, oprsz, maxsz, in);
> +        tcg_temp_free_i32(in);
> +    } else if (vece == MO_64) {
> +        TCGv_i64 in = tcg_temp_new_i64();
> +        tcg_gen_ld_i64(in, cpu_env, aofs);
> +        tcg_gen_gvec_dup_i64(MO_64, dofs, oprsz, maxsz, in);
> +        tcg_temp_free_i64(in);
> +    } else {
> +        /* 128-bit duplicate.  */
> +        /* ??? Dup to 256-bit vector.  */
> +        int i;
> +
> +        tcg_debug_assert(vece == 4);
> +        tcg_debug_assert(oprsz >= 16);
> +        if (TCG_TARGET_HAS_v128) {
> +            TCGv_vec in = tcg_temp_new_vec(TCG_TYPE_V128);
> +
> +            tcg_gen_ld_vec(in, cpu_env, aofs);
> +            for (i = 0; i < oprsz; i += 16) {
> +                tcg_gen_st_vec(in, cpu_env, dofs + i);
> +            }
> +            tcg_temp_free_vec(in);
> +        } else {
> +            TCGv_i64 in0 = tcg_temp_new_i64();
> +            TCGv_i64 in1 = tcg_temp_new_i64();
> +
> +            tcg_gen_ld_i64(in0, cpu_env, aofs);
> +            tcg_gen_ld_i64(in1, cpu_env, aofs + 8);
> +            for (i = 0; i < oprsz; i += 16) {
> +                tcg_gen_st_i64(in0, cpu_env, dofs + i);
> +                tcg_gen_st_i64(in1, cpu_env, dofs + i + 8);
> +            }
> +            tcg_temp_free_i64(in0);
> +            tcg_temp_free_i64(in1);
> +        }
> +    }
> +}
> +
> +void tcg_gen_gvec_dup64i(uint32_t dofs, uint32_t oprsz,
> +                         uint32_t maxsz, uint64_t x)
> +{
> +    check_size_align(oprsz, maxsz, dofs);
> +    do_dup(MO_64, dofs, oprsz, maxsz, NULL, NULL, x);
> +}
> +
> +void tcg_gen_gvec_dup32i(uint32_t dofs, uint32_t oprsz,
> +                         uint32_t maxsz, uint32_t x)
> +{
> +    check_size_align(oprsz, maxsz, dofs);
> +    do_dup(MO_32, dofs, oprsz, maxsz, NULL, NULL, x);
> +}
> +
> +void tcg_gen_gvec_dup16i(uint32_t dofs, uint32_t oprsz,
> +                         uint32_t maxsz, uint16_t x)
> +{
> +    check_size_align(oprsz, maxsz, dofs);
> +    do_dup(MO_16, dofs, oprsz, maxsz, NULL, NULL, x);
> +}
> +
> +void tcg_gen_gvec_dup8i(uint32_t dofs, uint32_t oprsz,
> +                         uint32_t maxsz, uint8_t x)
> +{
> +    check_size_align(oprsz, maxsz, dofs);
> +    do_dup(MO_8, dofs, oprsz, maxsz, NULL, NULL, x);
> +}
> +
> +void tcg_gen_gvec_not(unsigned vece, uint32_t dofs, uint32_t aofs,
> +                      uint32_t oprsz, uint32_t maxsz)
> +{
> +    static const GVecGen2 g = {
> +        .fni8 = tcg_gen_not_i64,
> +        .fniv = tcg_gen_not_vec,
> +        .fno = gen_helper_gvec_not,
> +        .prefer_i64 = TCG_TARGET_REG_BITS == 64,
> +    };
> +    tcg_gen_gvec_2(dofs, aofs, oprsz, maxsz, &g);
> +}
> +
> +/* Perform a vector addition using normal addition and a mask.  The mask
> +   should be the sign bit of each lane.  This 6-operation form is more
> +   efficient than separate additions when there are 4 or more lanes in
> +   the 64-bit operation.  */
> +static void gen_addv_mask(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b, TCGv_i64 m)
> +{
> +    TCGv_i64 t1 = tcg_temp_new_i64();
> +    TCGv_i64 t2 = tcg_temp_new_i64();
> +    TCGv_i64 t3 = tcg_temp_new_i64();
> +
> +    tcg_gen_andc_i64(t1, a, m);
> +    tcg_gen_andc_i64(t2, b, m);
> +    tcg_gen_xor_i64(t3, a, b);
> +    tcg_gen_add_i64(d, t1, t2);
> +    tcg_gen_and_i64(t3, t3, m);
> +    tcg_gen_xor_i64(d, d, t3);
> +
> +    tcg_temp_free_i64(t1);
> +    tcg_temp_free_i64(t2);
> +    tcg_temp_free_i64(t3);
> +}
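
The masked-add trick is neat but non-obvious, so to convince myself:
clearing the per-lane sign bit stops any carry from crossing a lane
boundary, and the final xor with ((a ^ b) & m) puts the correct top
bit of each lane back.  A throwaway host-side check for the byte case
(my own scratch test, not part of the patch):

    #include <stdint.h>
    #include <stdio.h>

    /* Same sequence as gen_addv_mask, evaluated with plain integers. */
    static uint64_t mask_add8(uint64_t a, uint64_t b)
    {
        uint64_t m = 0x8080808080808080ull;   /* sign bit of each byte */
        uint64_t t1 = a & ~m;
        uint64_t t2 = b & ~m;
        uint64_t t3 = (a ^ b) & m;
        return (t1 + t2) ^ t3;
    }

    /* Reference: lane-by-lane 8-bit addition. */
    static uint64_t ref_add8(uint64_t a, uint64_t b)
    {
        uint64_t d = 0;
        for (int i = 0; i < 64; i += 8) {
            d |= (((a >> i) + (b >> i)) & 0xff) << i;
        }
        return d;
    }

    int main(void)
    {
        uint64_t a = 0x80ff7f0102030405ull, b = 0x01017f80fffefdfcull;
        printf("%016llx\n%016llx\n",
               (unsigned long long)mask_add8(a, b),
               (unsigned long long)ref_add8(a, b));
        return 0;
    }

Both lines print the same value here, which matches the comment above
that this is 6 operations per 64-bit chunk (andc, andc, xor, add, and,
xor).
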
> +
> +void tcg_gen_vec_add8_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
> +{
> +    TCGv_i64 m = tcg_const_i64(dup_const(MO_8, 0x80));
> +    gen_addv_mask(d, a, b, m);
> +    tcg_temp_free_i64(m);
> +}
> +
> +void tcg_gen_vec_add16_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
> +{
> +    TCGv_i64 m = tcg_const_i64(dup_const(MO_16, 0x8000));
> +    gen_addv_mask(d, a, b, m);
> +    tcg_temp_free_i64(m);
> +}
> +
> +void tcg_gen_vec_add32_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
> +{
> +    TCGv_i64 t1 = tcg_temp_new_i64();
> +    TCGv_i64 t2 = tcg_temp_new_i64();
> +
> +    tcg_gen_andi_i64(t1, a, ~0xffffffffull);
> +    tcg_gen_add_i64(t2, a, b);
> +    tcg_gen_add_i64(t1, t1, b);
> +    tcg_gen_deposit_i64(d, t1, t2, 0, 32);
> +
> +    tcg_temp_free_i64(t1);
> +    tcg_temp_free_i64(t2);
> +}
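
Similarly for the two-lane 32-bit variant just above: t2 carries the
correct low lane (the carry escapes into the high half, which we
discard), t1 computes the high lane with a's low half zeroed so
nothing can carry into it, and the deposit stitches the two halves
together.  A quick numeric check with made-up inputs:

    a  = 0x00000002ffffffff          /* lanes {2, 0xffffffff} */
    b  = 0x0000000100000001          /* lanes {1, 1} */
    t1 = (a & ~0xffffffff) + b  = 0x0000000300000001   /* high lane ok */
    t2 = a + b                  = 0x0000000400000000   /* low lane ok  */
    deposit(t1, t2, 0, 32)      = 0x0000000300000000   /* lanes {3, 0} */
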
> +
> +void tcg_gen_gvec_add(unsigned vece, uint32_t dofs, uint32_t aofs,
> +                      uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
> +{
> +    static const GVecGen3 g[4] = {
> +        { .fni8 = tcg_gen_vec_add8_i64,
> +          .fniv = tcg_gen_add_vec,
> +          .fno = gen_helper_gvec_add8,
> +          .opc = INDEX_op_add_vec,
> +          .vece = MO_8 },
> +        { .fni8 = tcg_gen_vec_add16_i64,
> +          .fniv = tcg_gen_add_vec,
> +          .fno = gen_helper_gvec_add16,
> +          .opc = INDEX_op_add_vec,
> +          .vece = MO_16 },
> +        { .fni4 = tcg_gen_add_i32,
> +          .fniv = tcg_gen_add_vec,
> +          .fno = gen_helper_gvec_add32,
> +          .opc = INDEX_op_add_vec,
> +          .vece = MO_32 },
> +        { .fni8 = tcg_gen_add_i64,
> +          .fniv = tcg_gen_add_vec,
> +          .fno = gen_helper_gvec_add64,
> +          .opc = INDEX_op_add_vec,
> +          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
> +          .vece = MO_64 },
> +    };
> +
> +    tcg_debug_assert(vece <= MO_64);
> +    tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
> +}
> +
> +/* Perform a vector subtraction using normal subtraction and a mask.
> +   Compare gen_addv_mask above.  */
> +static void gen_subv_mask(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b, TCGv_i64 m)
> +{
> +    TCGv_i64 t1 = tcg_temp_new_i64();
> +    TCGv_i64 t2 = tcg_temp_new_i64();
> +    TCGv_i64 t3 = tcg_temp_new_i64();
> +
> +    tcg_gen_or_i64(t1, a, m);
> +    tcg_gen_andc_i64(t2, b, m);
> +    tcg_gen_eqv_i64(t3, a, b);
> +    tcg_gen_sub_i64(d, t1, t2);
> +    tcg_gen_and_i64(t3, t3, m);
> +    tcg_gen_xor_i64(d, d, t3);
> +
> +    tcg_temp_free_i64(t1);
> +    tcg_temp_free_i64(t2);
> +    tcg_temp_free_i64(t3);
> +}
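
Spelling the subtraction variant out the same way, the identity is

    d = ((a | m) - (b & ~m)) ^ (~(a ^ b) & m)

i.e. force the sign bit of every a lane on and every b lane off so no
borrow can cross a lane boundary, then fix up the top bit of each lane
with the xor.  gen_negv_mask() further down is the same identity with
a == 0.
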
> +
> +void tcg_gen_vec_sub8_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
> +{
> +    TCGv_i64 m = tcg_const_i64(dup_const(MO_8, 0x80));
> +    gen_subv_mask(d, a, b, m);
> +    tcg_temp_free_i64(m);
> +}
> +
> +void tcg_gen_vec_sub16_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
> +{
> +    TCGv_i64 m = tcg_const_i64(dup_const(MO_16, 0x8000));
> +    gen_subv_mask(d, a, b, m);
> +    tcg_temp_free_i64(m);
> +}
> +
> +void tcg_gen_vec_sub32_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
> +{
> +    TCGv_i64 t1 = tcg_temp_new_i64();
> +    TCGv_i64 t2 = tcg_temp_new_i64();
> +
> +    tcg_gen_andi_i64(t1, b, ~0xffffffffull);
> +    tcg_gen_sub_i64(t2, a, b);
> +    tcg_gen_sub_i64(t1, a, t1);
> +    tcg_gen_deposit_i64(d, t1, t2, 0, 32);
> +
> +    tcg_temp_free_i64(t1);
> +    tcg_temp_free_i64(t2);
> +}
> +
> +void tcg_gen_gvec_sub(unsigned vece, uint32_t dofs, uint32_t aofs,
> +                      uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
> +{
> +    static const GVecGen3 g[4] = {
> +        { .fni8 = tcg_gen_vec_sub8_i64,
> +          .fniv = tcg_gen_sub_vec,
> +          .fno = gen_helper_gvec_sub8,
> +          .opc = INDEX_op_sub_vec,
> +          .vece = MO_8 },
> +        { .fni8 = tcg_gen_vec_sub16_i64,
> +          .fniv = tcg_gen_sub_vec,
> +          .fno = gen_helper_gvec_sub16,
> +          .opc = INDEX_op_sub_vec,
> +          .vece = MO_16 },
> +        { .fni4 = tcg_gen_sub_i32,
> +          .fniv = tcg_gen_sub_vec,
> +          .fno = gen_helper_gvec_sub32,
> +          .opc = INDEX_op_sub_vec,
> +          .vece = MO_32 },
> +        { .fni8 = tcg_gen_sub_i64,
> +          .fniv = tcg_gen_sub_vec,
> +          .fno = gen_helper_gvec_sub64,
> +          .opc = INDEX_op_sub_vec,
> +          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
> +          .vece = MO_64 },
> +    };
> +
> +    tcg_debug_assert(vece <= MO_64);
> +    tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
> +}
> +
> +/* Perform a vector negation using normal negation and a mask.
> +   Compare gen_subv_mask above.  */
> +static void gen_negv_mask(TCGv_i64 d, TCGv_i64 b, TCGv_i64 m)
> +{
> +    TCGv_i64 t2 = tcg_temp_new_i64();
> +    TCGv_i64 t3 = tcg_temp_new_i64();
> +
> +    tcg_gen_andc_i64(t3, m, b);
> +    tcg_gen_andc_i64(t2, b, m);
> +    tcg_gen_sub_i64(d, m, t2);
> +    tcg_gen_xor_i64(d, d, t3);
> +
> +    tcg_temp_free_i64(t2);
> +    tcg_temp_free_i64(t3);
> +}
> +
> +void tcg_gen_vec_neg8_i64(TCGv_i64 d, TCGv_i64 b)
> +{
> +    TCGv_i64 m = tcg_const_i64(dup_const(MO_8, 0x80));
> +    gen_negv_mask(d, b, m);
> +    tcg_temp_free_i64(m);
> +}
> +
> +void tcg_gen_vec_neg16_i64(TCGv_i64 d, TCGv_i64 b)
> +{
> +    TCGv_i64 m = tcg_const_i64(dup_const(MO_16, 0x8000));
> +    gen_negv_mask(d, b, m);
> +    tcg_temp_free_i64(m);
> +}
> +
> +void tcg_gen_vec_neg32_i64(TCGv_i64 d, TCGv_i64 b)
> +{
> +    TCGv_i64 t1 = tcg_temp_new_i64();
> +    TCGv_i64 t2 = tcg_temp_new_i64();
> +
> +    tcg_gen_andi_i64(t1, b, ~0xffffffffull);
> +    tcg_gen_neg_i64(t2, b);
> +    tcg_gen_neg_i64(t1, t1);
> +    tcg_gen_deposit_i64(d, t1, t2, 0, 32);
> +
> +    tcg_temp_free_i64(t1);
> +    tcg_temp_free_i64(t2);
> +}
> +
> +void tcg_gen_gvec_neg(unsigned vece, uint32_t dofs, uint32_t aofs,
> +                      uint32_t oprsz, uint32_t maxsz)
> +{
> +    static const GVecGen2 g[4] = {
> +        { .fni8 = tcg_gen_vec_neg8_i64,
> +          .fniv = tcg_gen_neg_vec,
> +          .fno = gen_helper_gvec_neg8,
> +          .opc = INDEX_op_neg_vec,
> +          .vece = MO_8 },
> +        { .fni8 = tcg_gen_vec_neg16_i64,
> +          .fniv = tcg_gen_neg_vec,
> +          .fno = gen_helper_gvec_neg16,
> +          .opc = INDEX_op_neg_vec,
> +          .vece = MO_16 },
> +        { .fni4 = tcg_gen_neg_i32,
> +          .fniv = tcg_gen_neg_vec,
> +          .fno = gen_helper_gvec_neg32,
> +          .opc = INDEX_op_neg_vec,
> +          .vece = MO_32 },
> +        { .fni8 = tcg_gen_neg_i64,
> +          .fniv = tcg_gen_neg_vec,
> +          .fno = gen_helper_gvec_neg64,
> +          .opc = INDEX_op_neg_vec,
> +          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
> +          .vece = MO_64 },
> +    };
> +
> +    tcg_debug_assert(vece <= MO_64);
> +    tcg_gen_gvec_2(dofs, aofs, oprsz, maxsz, &g[vece]);
> +}
> +
> +void tcg_gen_gvec_and(unsigned vece, uint32_t dofs, uint32_t aofs,
> +                      uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
> +{
> +    static const GVecGen3 g = {
> +        .fni8 = tcg_gen_and_i64,
> +        .fniv = tcg_gen_and_vec,
> +        .fno = gen_helper_gvec_and,
> +        .opc = INDEX_op_and_vec,
> +        .prefer_i64 = TCG_TARGET_REG_BITS == 64,
> +    };
> +    tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
> +}
> +
> +void tcg_gen_gvec_or(unsigned vece, uint32_t dofs, uint32_t aofs,
> +                     uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
> +{
> +    static const GVecGen3 g = {
> +        .fni8 = tcg_gen_or_i64,
> +        .fniv = tcg_gen_or_vec,
> +        .fno = gen_helper_gvec_or,
> +        .opc = INDEX_op_or_vec,
> +        .prefer_i64 = TCG_TARGET_REG_BITS == 64,
> +    };
> +    tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
> +}
> +
> +void tcg_gen_gvec_xor(unsigned vece, uint32_t dofs, uint32_t aofs,
> +                      uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
> +{
> +    static const GVecGen3 g = {
> +        .fni8 = tcg_gen_xor_i64,
> +        .fniv = tcg_gen_xor_vec,
> +        .fno = gen_helper_gvec_xor,
> +        .opc = INDEX_op_xor_vec,
> +        .prefer_i64 = TCG_TARGET_REG_BITS == 64,
> +    };
> +    tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
> +}
> +
> +void tcg_gen_gvec_andc(unsigned vece, uint32_t dofs, uint32_t aofs,
> +                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
> +{
> +    static const GVecGen3 g = {
> +        .fni8 = tcg_gen_andc_i64,
> +        .fniv = tcg_gen_andc_vec,
> +        .fno = gen_helper_gvec_andc,
> +        .opc = INDEX_op_andc_vec,
> +        .prefer_i64 = TCG_TARGET_REG_BITS == 64,
> +    };
> +    tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
> +}
> +
> +void tcg_gen_gvec_orc(unsigned vece, uint32_t dofs, uint32_t aofs,
> +                      uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
> +{
> +    static const GVecGen3 g = {
> +        .fni8 = tcg_gen_orc_i64,
> +        .fniv = tcg_gen_orc_vec,
> +        .fno = gen_helper_gvec_orc,
> +        .opc = INDEX_op_orc_vec,
> +        .prefer_i64 = TCG_TARGET_REG_BITS == 64,
> +    };
> +    tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
> +}
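
For what it's worth, this is roughly how I would expect a front end to
consume the new expanders (the offsets and the "vreg" field below are
made up for illustration; the real users arrive in the target/arm
patches later in the series):

    /* d = a + b, 32-bit lanes, over a 16-byte guest vector register;
       oprsz == maxsz == 16 so there is no tail to clear.
       "vreg" is a hypothetical CPUArchState field.  */
    uint32_t dofs = offsetof(CPUArchState, vreg) + rd * 16;
    uint32_t aofs = offsetof(CPUArchState, vreg) + rn * 16;
    uint32_t bofs = offsetof(CPUArchState, vreg) + rm * 16;
    tcg_gen_gvec_add(MO_32, dofs, aofs, bofs, 16, 16);

and the expander picks fniv/fni8/fni4/fno automatically depending on
what the host backend supports.
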
> diff --git a/tcg/tcg-op-vec.c b/tcg/tcg-op-vec.c
> index 9e4678878b..ac5b69ccf6 100644
> --- a/tcg/tcg-op-vec.c
> +++ b/tcg/tcg-op-vec.c
> @@ -73,7 +73,8 @@ static void vec_gen_op2(TCGOpcode opc, unsigned vece, TCGv_vec r, TCGv_vec a)
>      TCGTemp *at = tcgv_vec_temp(a);
>      TCGType type = rt->base_type;
>
> -    tcg_debug_assert(at->base_type == type);
> +    /* Must have enough inputs for the output.  */
> +    tcg_debug_assert(at->base_type >= type);
>      vec_gen_2(opc, type, vece, temp_arg(rt), temp_arg(at));
>  }
>
> @@ -85,8 +86,9 @@ static void vec_gen_op3(TCGOpcode opc, unsigned vece,
>      TCGTemp *bt = tcgv_vec_temp(b);
>      TCGType type = rt->base_type;
>
> -    tcg_debug_assert(at->base_type == type);
> -    tcg_debug_assert(bt->base_type == type);
> +    /* Must have enough inputs for the output.  */
> +    tcg_debug_assert(at->base_type >= type);
> +    tcg_debug_assert(bt->base_type >= type);
>      vec_gen_3(opc, type, vece, temp_arg(rt), temp_arg(at), temp_arg(bt));
>  }
>
> @@ -99,7 +101,7 @@ void tcg_gen_mov_vec(TCGv_vec r, TCGv_vec a)
>
>  #define MO_REG  (TCG_TARGET_REG_BITS == 64 ? MO_64 : MO_32)
>
> -static void tcg_gen_dupi_vec(TCGv_vec r, unsigned vece, TCGArg a)
> +static void do_dupi_vec(TCGv_vec r, unsigned vece, TCGArg a)
>  {
>      TCGTemp *rt = tcgv_vec_temp(r);
>      vec_gen_2(INDEX_op_dupi_vec, rt->base_type, vece, temp_arg(rt), a);
> @@ -108,14 +110,14 @@ static void tcg_gen_dupi_vec(TCGv_vec r, unsigned vece, TCGArg a)
>  TCGv_vec tcg_const_zeros_vec(TCGType type)
>  {
>      TCGv_vec ret = tcg_temp_new_vec(type);
> -    tcg_gen_dupi_vec(ret, MO_REG, 0);
> +    do_dupi_vec(ret, MO_REG, 0);
>      return ret;
>  }
>
>  TCGv_vec tcg_const_ones_vec(TCGType type)
>  {
>      TCGv_vec ret = tcg_temp_new_vec(type);
> -    tcg_gen_dupi_vec(ret, MO_REG, -1);
> +    do_dupi_vec(ret, MO_REG, -1);
>      return ret;
>  }
>
> @@ -134,9 +136,9 @@ TCGv_vec tcg_const_ones_vec_matching(TCGv_vec m)
>  void tcg_gen_dup64i_vec(TCGv_vec r, uint64_t a)
>  {
>      if (TCG_TARGET_REG_BITS == 32 && a == deposit64(a, 32, 32, a)) {
> -        tcg_gen_dupi_vec(r, MO_32, a);
> +        do_dupi_vec(r, MO_32, a);
>      } else if (TCG_TARGET_REG_BITS == 64 || a == (uint64_t)(int32_t)a) {
> -        tcg_gen_dupi_vec(r, MO_64, a);
> +        do_dupi_vec(r, MO_64, a);
>      } else {
>          TCGv_i64 c = tcg_const_i64(a);
>          tcg_gen_dup_i64_vec(MO_64, r, c);
> @@ -146,17 +148,22 @@ void tcg_gen_dup64i_vec(TCGv_vec r, uint64_t a)
>
>  void tcg_gen_dup32i_vec(TCGv_vec r, uint32_t a)
>  {
> -    tcg_gen_dupi_vec(r, MO_REG, ((TCGArg)-1 / 0xffffffffu) * a);
> +    do_dupi_vec(r, MO_REG, dup_const(MO_32, a));
>  }
>
>  void tcg_gen_dup16i_vec(TCGv_vec r, uint32_t a)
>  {
> -    tcg_gen_dupi_vec(r, MO_REG, ((TCGArg)-1 / 0xffff) * (a & 0xffff));
> +    do_dupi_vec(r, MO_REG, dup_const(MO_16, a));
>  }
>
>  void tcg_gen_dup8i_vec(TCGv_vec r, uint32_t a)
>  {
> -    tcg_gen_dupi_vec(r, MO_REG, ((TCGArg)-1 / 0xff) * (a & 0xff));
> +    do_dupi_vec(r, MO_REG, dup_const(MO_8, a));
> +}
> +
> +void tcg_gen_dupi_vec(unsigned vece, TCGv_vec r, uint64_t a)
> +{
> +    do_dupi_vec(r, MO_REG, dup_const(vece, a));
>  }
>
>  void tcg_gen_dup_i64_vec(unsigned vece, TCGv_vec r, TCGv_i64 a)
> @@ -167,14 +174,14 @@ void tcg_gen_dup_i64_vec(unsigned vece, TCGv_vec r, TCGv_i64 a)
>
>      if (TCG_TARGET_REG_BITS == 64) {
>          TCGArg ai = tcgv_i64_arg(a);
> -        vec_gen_2(INDEX_op_dup_vec, type, MO_64, ri, ai);
> +        vec_gen_2(INDEX_op_dup_vec, type, vece, ri, ai);
>      } else if (vece == MO_64) {
>          TCGArg al = tcgv_i32_arg(TCGV_LOW(a));
>          TCGArg ah = tcgv_i32_arg(TCGV_HIGH(a));
>          vec_gen_3(INDEX_op_dup2_vec, type, MO_64, ri, al, ah);
>      } else {
>          TCGArg ai = tcgv_i32_arg(TCGV_LOW(a));
> -        vec_gen_2(INDEX_op_dup_vec, type, MO_64, ri, ai);
> +        vec_gen_2(INDEX_op_dup_vec, type, vece, ri, ai);
>      }
>  }
>
> diff --git a/tcg/tcg.c b/tcg/tcg.c
> index 42f0acdf8e..0862cff58a 100644
> --- a/tcg/tcg.c
> +++ b/tcg/tcg.c
> @@ -1403,10 +1403,10 @@ bool tcg_op_supported(TCGOpcode op)
>      case INDEX_op_orc_vec:
>          return have_vec && TCG_TARGET_HAS_orc_vec;
>
> -    case NB_OPS:
> -        break;
> +    default:
> +        tcg_debug_assert(op > INDEX_op_last_generic && op < NB_OPS);
> +        return true;
>      }
> -    g_assert_not_reached();
>  }
>
>  /* Note: we convert the 64 bit args to 32 bit and do some alignment
> @@ -3733,3 +3733,10 @@ void tcg_register_jit(void *buf, size_t buf_size)
>  {
>  }
>  #endif /* ELF_HOST_MACHINE */
> +
> +#if !TCG_TARGET_MAYBE_vec
> +void tcg_expand_vec_op(TCGOpcode o, TCGType t, unsigned e, TCGArg a0, ...)
> +{
> +    g_assert_not_reached();
> +}
> +#endif
> diff --git a/accel/tcg/Makefile.objs b/accel/tcg/Makefile.objs
> index 228cd84fa4..d381a02f34 100644
> --- a/accel/tcg/Makefile.objs
> +++ b/accel/tcg/Makefile.objs
> @@ -1,6 +1,6 @@
>  obj-$(CONFIG_SOFTMMU) += tcg-all.o
>  obj-$(CONFIG_SOFTMMU) += cputlb.o
> -obj-y += tcg-runtime.o
> +obj-y += tcg-runtime.o tcg-runtime-gvec.o
>  obj-y += cpu-exec.o cpu-exec-common.o translate-all.o
>  obj-y += translator.o
>
> diff --git a/configure b/configure
> index 044c6fafe2..951253acad 100755
> --- a/configure
> +++ b/configure
> @@ -4958,6 +4958,50 @@ if compile_prog "" "" ; then
>    atomic64=yes
>  fi
>
> +########################################
> +# See if 16-byte vector operations are supported.
> +# Even without a vector unit the compiler may expand these.
> +# There is a bug in old GCC for PPC that crashes here.
> +# Unfortunately it's the system compiler for CentOS 7.
> +
> +cat > $TMPC << EOF
> +typedef unsigned char U1 __attribute__((vector_size(16)));
> +typedef unsigned short U2 __attribute__((vector_size(16)));
> +typedef unsigned int U4 __attribute__((vector_size(16)));
> +typedef unsigned long long U8 __attribute__((vector_size(16)));
> +typedef signed char S1 __attribute__((vector_size(16)));
> +typedef signed short S2 __attribute__((vector_size(16)));
> +typedef signed int S4 __attribute__((vector_size(16)));
> +typedef signed long long S8 __attribute__((vector_size(16)));
> +static U1 a1, b1;
> +static U2 a2, b2;
> +static U4 a4, b4;
> +static U8 a8, b8;
> +static S1 c1;
> +static S2 c2;
> +static S4 c4;
> +static S8 c8;
> +static int i;
> +int main(void)
> +{
> +  a1 += b1; a2 += b2; a4 += b4; a8 += b8;
> +  a1 -= b1; a2 -= b2; a4 -= b4; a8 -= b8;
> +  a1 *= b1; a2 *= b2; a4 *= b4; a8 *= b8;
> +  a1 &= b1; a2 &= b2; a4 &= b4; a8 &= b8;
> +  a1 |= b1; a2 |= b2; a4 |= b4; a8 |= b8;
> +  a1 ^= b1; a2 ^= b2; a4 ^= b4; a8 ^= b8;
> +  a1 <<= i; a2 <<= i; a4 <<= i; a8 <<= i;
> +  a1 >>= i; a2 >>= i; a4 >>= i; a8 >>= i;
> +  c1 >>= i; c2 >>= i; c4 >>= i; c8 >>= i;
> +  return 0;
> +}
> +EOF
> +
> +vector16=no
> +if compile_prog "" "" ; then
> +  vector16=yes
> +fi
> +
>  ########################################
>  # check if getauxval is available.
>
> @@ -6226,6 +6270,10 @@ if test "$atomic64" = "yes" ; then
>    echo "CONFIG_ATOMIC64=y" >> $config_host_mak
>  fi
>
> +if test "$vector16" = "yes" ; then
> +  echo "CONFIG_VECTOR16=y" >> $config_host_mak
> +fi
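
I assume CONFIG_VECTOR16 is what gates the GCC vector-extension types
in the new tcg-runtime-gvec.c, something along these lines (paraphrased
from memory of the earlier hunk, not quoted verbatim):

    #ifdef CONFIG_VECTOR16
    typedef uint8_t vec8 __attribute__((vector_size(16)));
    #else
    typedef uint64_t vec8;   /* helpers fall back to 64-bit chunks */
    #endif

so hosts stuck with the broken PPC GCC just lose the vectorised
helpers rather than failing to build.
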
> +
>  if test "$getauxval" = "yes" ; then
>    echo "CONFIG_GETAUXVAL=y" >> $config_host_mak
>  fi


--
Alex Bennée

Thread overview: 44+ messages
2018-01-26  4:57 [Qemu-devel] [PATCH v11 00/20] tcg: generic vector operations Richard Henderson
2018-01-26  4:57 ` [Qemu-devel] [PATCH v11 01/20] tcg: Allow multiple word entries into the constant pool Richard Henderson
2018-02-06  8:51   ` Alex Bennée
2018-01-26  4:57 ` [Qemu-devel] [PATCH v11 02/20] tcg: Add types and basic operations for host vectors Richard Henderson
2018-02-06  8:53   ` Alex Bennée
2018-01-26  4:57 ` [Qemu-devel] [PATCH v11 03/20] tcg: Standardize integral arguments to expanders Richard Henderson
2018-02-06  8:57   ` Alex Bennée
2018-01-26  4:57 ` [Qemu-devel] [PATCH v11 04/20] tcg: Add generic vector expanders Richard Henderson
2018-02-06 10:59   ` Alex Bennée [this message]
2018-01-26  4:57 ` [Qemu-devel] [PATCH v11 05/20] tcg: Add generic vector ops for constant shifts Richard Henderson
2018-02-06 11:00   ` Alex Bennée
2018-01-26  4:57 ` [Qemu-devel] [PATCH v11 06/20] tcg: Add generic vector ops for comparisons Richard Henderson
2018-02-06 11:01   ` Alex Bennée
2018-01-26  4:57 ` [Qemu-devel] [PATCH v11 07/20] tcg: Add generic vector ops for multiplication Richard Henderson
2018-02-06 11:02   ` Alex Bennée
2018-01-26  4:57 ` [Qemu-devel] [PATCH v11 08/20] tcg: Add generic helpers for saturating arithmetic Richard Henderson
2018-02-06 11:03   ` Alex Bennée
2018-01-26  4:57 ` [Qemu-devel] [PATCH v11 09/20] tcg: Add generic vector helpers with a scalar operand Richard Henderson
2018-02-06 11:04   ` Alex Bennée
2018-01-26  4:57 ` [Qemu-devel] [PATCH v11 10/20] tcg/optimize: Handle vector opcodes during optimize Richard Henderson
2018-02-06 11:07   ` Alex Bennée
2018-01-26  4:57 ` [Qemu-devel] [PATCH v11 11/20] target/arm: Align vector registers Richard Henderson
2018-01-26  4:57 ` [Qemu-devel] [PATCH v11 12/20] target/arm: Use vector infrastructure for aa64 add/sub/logic Richard Henderson
2018-01-26  4:57 ` [Qemu-devel] [PATCH v11 13/20] target/arm: Use vector infrastructure for aa64 mov/not/neg Richard Henderson
2018-02-06 11:08   ` Alex Bennée
2018-01-26  4:57 ` [Qemu-devel] [PATCH v11 14/20] target/arm: Use vector infrastructure for aa64 dup/movi Richard Henderson
2018-02-06 11:09   ` Alex Bennée
2018-01-26  4:57 ` [Qemu-devel] [PATCH v11 15/20] target/arm: Use vector infrastructure for aa64 constant shifts Richard Henderson
2018-02-05 11:14   ` Peter Maydell
2018-01-26  4:57 ` [Qemu-devel] [PATCH v11 16/20] target/arm: Use vector infrastructure for aa64 compares Richard Henderson
2018-02-06 11:10   ` Alex Bennée
2018-01-26  4:57 ` [Qemu-devel] [PATCH v11 17/20] target/arm: Use vector infrastructure for aa64 multiplies Richard Henderson
2018-02-06 11:11   ` Alex Bennée
2018-01-26  4:57 ` [Qemu-devel] [PATCH v11 18/20] target/arm: Use vector infrastructure for aa64 orr/bic immediate Richard Henderson
2018-02-06 11:13   ` Alex Bennée
2018-01-26  4:57 ` [Qemu-devel] [PATCH v11 19/20] tcg/i386: Add vector operations Richard Henderson
2018-01-26  4:57 ` [Qemu-devel] [PATCH v11 20/20] tcg/aarch64: " Richard Henderson
2018-02-06 11:15   ` Alex Bennée
2018-01-26 17:25 ` [Qemu-devel] [PATCH v11 00/20] tcg: generic " no-reply
2018-02-06 11:24 ` Alex Bennée
2018-02-06 12:07   ` Philippe Mathieu-Daudé
2018-02-06 12:36     ` Alex Bennée
2018-02-06 16:24 ` Alex Bennée
2018-02-06 20:57   ` Alex Bennée
