From: "Alex Bennée" <alex.bennee@linaro.org>
To: Richard Henderson <richard.henderson@linaro.org>
Cc: qemu-devel@nongnu.org, philmd@linaro.org
Subject: Re: [PATCH v6 15/36] tcg: Add guest load/store primitives for TCGv_i128
Date: Wed, 01 Feb 2023 09:52:13 +0000
Message-ID: <874js5u2pu.fsf@linaro.org>
In-Reply-To: <20230130214844.1158612-16-richard.henderson@linaro.org>


Richard Henderson <richard.henderson@linaro.org> writes:

> These are not yet considering atomicity of the 16-byte value;
> this is a direct replacement for the current target code which
> uses a pair of 8-byte operations.
>
> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
> ---
>  include/exec/cpu_ldst.h |  10 +++
>  include/tcg/tcg-op.h    |   2 +
>  accel/tcg/cputlb.c      | 112 +++++++++++++++++++++++++++++++++
>  accel/tcg/user-exec.c   |  66 ++++++++++++++++++++
>  tcg/tcg-op.c            | 134 ++++++++++++++++++++++++++++++++++++++++
>  5 files changed, 324 insertions(+)
>
> diff --git a/include/exec/cpu_ldst.h b/include/exec/cpu_ldst.h
> index d0c7c0d5fe..09b55cc0ee 100644
> --- a/include/exec/cpu_ldst.h
> +++ b/include/exec/cpu_ldst.h
> @@ -220,6 +220,11 @@ uint32_t cpu_ldl_le_mmu(CPUArchState *env, abi_ptr ptr,
>  uint64_t cpu_ldq_le_mmu(CPUArchState *env, abi_ptr ptr,
>                          MemOpIdx oi, uintptr_t ra);
>  
> +Int128 cpu_ld16_be_mmu(CPUArchState *env, abi_ptr addr,
> +                       MemOpIdx oi, uintptr_t ra);
> +Int128 cpu_ld16_le_mmu(CPUArchState *env, abi_ptr addr,
> +                       MemOpIdx oi, uintptr_t ra);
> +
>  void cpu_stb_mmu(CPUArchState *env, abi_ptr ptr, uint8_t val,
>                   MemOpIdx oi, uintptr_t ra);
>  void cpu_stw_be_mmu(CPUArchState *env, abi_ptr ptr, uint16_t val,
> @@ -235,6 +240,11 @@ void cpu_stl_le_mmu(CPUArchState *env, abi_ptr ptr, uint32_t val,
>  void cpu_stq_le_mmu(CPUArchState *env, abi_ptr ptr, uint64_t val,
>                      MemOpIdx oi, uintptr_t ra);
>  
> +void cpu_st16_be_mmu(CPUArchState *env, abi_ptr addr, Int128 val,
> +                     MemOpIdx oi, uintptr_t ra);
> +void cpu_st16_le_mmu(CPUArchState *env, abi_ptr addr, Int128 val,
> +                     MemOpIdx oi, uintptr_t ra);
> +
>  uint32_t cpu_atomic_cmpxchgb_mmu(CPUArchState *env, target_ulong addr,
>                                   uint32_t cmpv, uint32_t newv,
>                                   MemOpIdx oi, uintptr_t retaddr);
> diff --git a/include/tcg/tcg-op.h b/include/tcg/tcg-op.h
> index c4276767d1..e5f5b63c37 100644
> --- a/include/tcg/tcg-op.h
> +++ b/include/tcg/tcg-op.h
> @@ -845,6 +845,8 @@ void tcg_gen_qemu_ld_i32(TCGv_i32, TCGv, TCGArg, MemOp);
>  void tcg_gen_qemu_st_i32(TCGv_i32, TCGv, TCGArg, MemOp);
>  void tcg_gen_qemu_ld_i64(TCGv_i64, TCGv, TCGArg, MemOp);
>  void tcg_gen_qemu_st_i64(TCGv_i64, TCGv, TCGArg, MemOp);
> +void tcg_gen_qemu_ld_i128(TCGv_i128, TCGv, TCGArg, MemOp);
> +void tcg_gen_qemu_st_i128(TCGv_i128, TCGv, TCGArg, MemOp);
>  
>  static inline void tcg_gen_qemu_ld8u(TCGv ret, TCGv addr, int mem_index)
>  {
> diff --git a/accel/tcg/cputlb.c b/accel/tcg/cputlb.c
> index 4e040a1cb9..e3604ad313 100644
> --- a/accel/tcg/cputlb.c
> +++ b/accel/tcg/cputlb.c
> @@ -2187,6 +2187,64 @@ uint64_t cpu_ldq_le_mmu(CPUArchState *env, abi_ptr addr,
>      return cpu_load_helper(env, addr, oi, ra, helper_le_ldq_mmu);
>  }
>  
> +Int128 cpu_ld16_be_mmu(CPUArchState *env, abi_ptr addr,
> +                       MemOpIdx oi, uintptr_t ra)
> +{
> +    MemOp mop = get_memop(oi);
> +    int mmu_idx = get_mmuidx(oi);
> +    MemOpIdx new_oi;
> +    unsigned a_bits;
> +    uint64_t h, l;
> +
> +    tcg_debug_assert((mop & (MO_BSWAP|MO_SSIZE)) == (MO_BE|MO_128));
> +    a_bits = get_alignment_bits(mop);
> +
> +    /* Handle CPU specific unaligned behaviour */
> +    if (addr & ((1 << a_bits) - 1)) {
> +        cpu_unaligned_access(env_cpu(env), addr, MMU_DATA_LOAD,
> +                             mmu_idx, ra);
> +    }
> +
> +    /* Construct an unaligned 64-bit replacement MemOpIdx. */
> +    mop = (mop & ~(MO_SIZE | MO_AMASK)) | MO_64 | MO_UNALN;
> +    new_oi = make_memop_idx(mop, mmu_idx);
> +
> +    h = helper_be_ldq_mmu(env, addr, new_oi, ra);
> +    l = helper_be_ldq_mmu(env, addr + 8, new_oi, ra);
> +
> +    qemu_plugin_vcpu_mem_cb(env_cpu(env), addr, oi, QEMU_PLUGIN_MEM_R);
> +    return int128_make128(l, h);
> +}
> +
> +Int128 cpu_ld16_le_mmu(CPUArchState *env, abi_ptr addr,
> +                       MemOpIdx oi, uintptr_t ra)
> +{
> +    MemOp mop = get_memop(oi);
> +    int mmu_idx = get_mmuidx(oi);
> +    MemOpIdx new_oi;
> +    unsigned a_bits;
> +    uint64_t h, l;
> +
> +    tcg_debug_assert((mop & (MO_BSWAP|MO_SSIZE)) == (MO_LE|MO_128));

Why not use validate_memop() for this, like elsewhere in cputlb.c?
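
For reference, a minimal sketch of what I mean (this is my
recollection of the existing static helper in cputlb.c, so treat the
exact shape as an assumption):

    static void validate_memop(MemOpIdx oi, MemOp expected)
    {
    #ifdef CONFIG_DEBUG_TCG
        /* Check only size and byte order; alignment is handled separately. */
        MemOp have = get_memop(oi) & (MO_SIZE | MO_BSWAP);
        assert(have == expected);
    #endif
    }

so the new helper could just do:

    validate_memop(oi, MO_BE | MO_128);

(it masks MO_SIZE rather than MO_SSIZE, but I assume that doesn't
matter here since a full-width 128-bit access can't be
sign-extending).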

<snip>
>  
> +void cpu_st16_be_mmu(CPUArchState *env, abi_ptr addr, Int128 val,
> +                     MemOpIdx oi, uintptr_t ra)
> +{
> +    MemOp mop = get_memop(oi);
> +    int mmu_idx = get_mmuidx(oi);
> +    MemOpIdx new_oi;
> +    unsigned a_bits;
> +
> +    tcg_debug_assert((mop & (MO_BSWAP|MO_SSIZE)) == (MO_BE|MO_128));

Ditto for the other new 16-byte helpers (cpu_ld16_le_mmu,
cpu_st16_be_mmu and cpu_st16_le_mmu).

> diff --git a/tcg/tcg-op.c b/tcg/tcg-op.c
> index cb83d2375d..33ef325f6e 100644
> --- a/tcg/tcg-op.c
> +++ b/tcg/tcg-op.c
> @@ -3109,6 +3109,140 @@ void tcg_gen_qemu_st_i64(TCGv_i64 val, TCGv addr, TCGArg idx, MemOp memop)
>      }
>  }
>

I'm confused because the TCG ops in this patch still expand to pairs
of i64 operations and the atomicity handling hasn't come in yet.
Worth splitting the patch?

Anyway:

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>


> +static void canonicalize_memop_i128_as_i64(MemOp ret[2], MemOp orig)
> +{
> +    MemOp mop_1 = orig, mop_2;
> +
> +    tcg_debug_assert((orig & MO_SIZE) == MO_128);
> +    tcg_debug_assert((orig & MO_SIGN) == 0);
> +
> +    /* Use a memory ordering implemented by the host. */
> +    if (!TCG_TARGET_HAS_MEMORY_BSWAP && (orig & MO_BSWAP)) {
> +        mop_1 &= ~MO_BSWAP;
> +    }
> +
> +    /* Reduce the size to 64-bit. */
> +    mop_1 = (mop_1 & ~MO_SIZE) | MO_64;
> +
> +    /* Retain the alignment constraints of the original. */
> +    switch (orig & MO_AMASK) {
> +    case MO_UNALN:
> +    case MO_ALIGN_2:
> +    case MO_ALIGN_4:
> +        mop_2 = mop_1;
> +        break;
> +    case MO_ALIGN_8:
> +        /* Prefer MO_ALIGN+MO_64 to MO_ALIGN_8+MO_64. */
> +        mop_1 = (mop_1 & ~MO_AMASK) | MO_ALIGN;
> +        mop_2 = mop_1;
> +        break;
> +    case MO_ALIGN:
> +        /* Second has 8-byte alignment; first has 16-byte alignment. */
> +        mop_2 = mop_1;
> +        mop_1 = (mop_1 & ~MO_AMASK) | MO_ALIGN_16;
> +        break;
> +    case MO_ALIGN_16:
> +    case MO_ALIGN_32:
> +    case MO_ALIGN_64:
> +        /* Second has 8-byte alignment; first retains original. */
> +        mop_2 = (mop_1 & ~MO_AMASK) | MO_ALIGN;
> +        break;
> +    default:
> +        g_assert_not_reached();
> +    }
> +    ret[0] = mop_1;
> +    ret[1] = mop_2;
> +}
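
(To check my understanding of the alignment splitting here: on a host
with TCG_TARGET_HAS_MEMORY_BSWAP, a naturally aligned big-endian
16-byte access, MO_128 | MO_BE | MO_ALIGN, would come out as

    ret[0] = MO_64 | MO_BE | MO_ALIGN_16;  /* first access, 16-byte aligned */
    ret[1] = MO_64 | MO_BE | MO_ALIGN;     /* second access, 8-byte aligned */

so the first of the two 8-byte operations still enforces the original
16-byte alignment constraint.)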
> +
> +void tcg_gen_qemu_ld_i128(TCGv_i128 val, TCGv addr, TCGArg idx, MemOp memop)
> +{
> +    MemOp mop[2];
> +    TCGv addr_p8;
> +    TCGv_i64 x, y;
> +
> +    canonicalize_memop_i128_as_i64(mop, memop);
> +
> +    tcg_gen_req_mo(TCG_MO_LD_LD | TCG_MO_ST_LD);
> +    addr = plugin_prep_mem_callbacks(addr);
> +
> +    /* TODO: respect atomicity of the operation. */
> +    /* TODO: allow the tcg backend to see the whole operation. */
> +
> +    /*
> +     * Since there are no global TCGv_i128, there is no visible state
> +     * changed if the second load faults.  Load directly into the two
> +     * subwords.
> +     */
> +    if ((memop & MO_BSWAP) == MO_LE) {
> +        x = TCGV128_LOW(val);
> +        y = TCGV128_HIGH(val);
> +    } else {
> +        x = TCGV128_HIGH(val);
> +        y = TCGV128_LOW(val);
> +    }
> +
> +    gen_ldst_i64(INDEX_op_qemu_ld_i64, x, addr, mop[0], idx);
> +
> +    if ((mop[0] ^ memop) & MO_BSWAP) {
> +        tcg_gen_bswap64_i64(x, x);
> +    }
> +
> +    addr_p8 = tcg_temp_new();
> +    tcg_gen_addi_tl(addr_p8, addr, 8);
> +    gen_ldst_i64(INDEX_op_qemu_ld_i64, y, addr_p8, mop[1], idx);
> +    tcg_temp_free(addr_p8);
> +
> +    if ((mop[0] ^ memop) & MO_BSWAP) {
> +        tcg_gen_bswap64_i64(y, y);
> +    }
> +
> +    plugin_gen_mem_callbacks(addr, make_memop_idx(memop, idx),
> +                             QEMU_PLUGIN_MEM_R);
> +}
> +
> +void tcg_gen_qemu_st_i128(TCGv_i128 val, TCGv addr, TCGArg idx, MemOp memop)
> +{
> +    MemOp mop[2];
> +    TCGv addr_p8;
> +    TCGv_i64 x, y;
> +
> +    canonicalize_memop_i128_as_i64(mop, memop);
> +
> +    tcg_gen_req_mo(TCG_MO_ST_LD | TCG_MO_ST_ST);
> +    addr = plugin_prep_mem_callbacks(addr);
> +
> +    /* TODO: respect atomicity of the operation. */
> +    /* TODO: allow the tcg backend to see the whole operation. */
> +
> +    if ((memop & MO_BSWAP) == MO_LE) {
> +        x = TCGV128_LOW(val);
> +        y = TCGV128_HIGH(val);
> +    } else {
> +        x = TCGV128_HIGH(val);
> +        y = TCGV128_LOW(val);
> +    }
> +
> +    addr_p8 = tcg_temp_new();
> +    if ((mop[0] ^ memop) & MO_BSWAP) {
> +        TCGv_i64 t = tcg_temp_new_i64();
> +
> +        tcg_gen_bswap64_i64(t, x);
> +        gen_ldst_i64(INDEX_op_qemu_st_i64, t, addr, mop[0], idx);
> +        tcg_gen_bswap64_i64(t, y);
> +        tcg_gen_addi_tl(addr_p8, addr, 8);
> +        gen_ldst_i64(INDEX_op_qemu_st_i64, t, addr_p8, mop[1], idx);
> +        tcg_temp_free_i64(t);
> +    } else {
> +        gen_ldst_i64(INDEX_op_qemu_st_i64, x, addr, mop[0], idx);
> +        tcg_gen_addi_tl(addr_p8, addr, 8);
> +        gen_ldst_i64(INDEX_op_qemu_st_i64, y, addr_p8, mop[1], idx);
> +    }
> +    tcg_temp_free(addr_p8);
> +
> +    plugin_gen_mem_callbacks(addr, make_memop_idx(memop, idx),
> +                             QEMU_PLUGIN_MEM_W);
> +}
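
As a usage sketch (assuming the TCGv_i128 temp allocation from patch
13, and with get_mem_index() standing in for however the target looks
up its mmu index), I'd expect a converted frontend to end up with
something like:

    TCGv_i128 t16 = tcg_temp_new_i128();

    tcg_gen_qemu_ld_i128(t16, addr, get_mem_index(s),
                         MO_BE | MO_128 | MO_ALIGN);
    /* ... operate on TCGV128_LOW(t16) / TCGV128_HIGH(t16) ... */
    tcg_gen_qemu_st_i128(t16, addr, get_mem_index(s),
                         MO_BE | MO_128 | MO_ALIGN);

    tcg_temp_free_i128(t16);

which is certainly tidier than open-coding the two 8-byte accesses in
each target.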
> +
>  static void tcg_gen_ext_i32(TCGv_i32 ret, TCGv_i32 val, MemOp opc)
>  {
>      switch (opc & MO_SSIZE) {


-- 
Alex Bennée
Virtualisation Tech Lead @ Linaro

