Re: [PATCH v5 11/22] target/arm: Implement LDG, STG, ST2G instructions

From: Peter Maydell <peter.maydell@linaro.org>
To: Richard Henderson <richard.henderson@linaro.org>
Cc: qemu-arm <qemu-arm@nongnu.org>, QEMU Developers <qemu-devel@nongnu.org>
Subject: Re: [PATCH v5 11/22] target/arm: Implement LDG, STG, ST2G instructions
Date: Thu, 5 Dec 2019 17:07:16 +0000	[thread overview]
Message-ID: <CAFEAcA_0pBg57n8wq2j0tS94ewJf1mrYsJYzaWYcQAKvfBFm-g@mail.gmail.com> (raw)
In-Reply-To: <20191011134744.2477-12-richard.henderson@linaro.org>

On Fri, 11 Oct 2019 at 14:50, Richard Henderson
<richard.henderson@linaro.org> wrote:
>
> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
> ---
> v2: Split out allocation_tag_mem.  Handle atomicity of stores.
> v3: Add X[t] input to these insns; require pre-cleaned addresses.
> v5: Fix !32-byte aligned operation of st2g.
> ---
>  target/arm/helper-a64.h    |   5 ++
>  target/arm/mte_helper.c    | 154 +++++++++++++++++++++++++++++++++++++
>  target/arm/translate-a64.c | 115 +++++++++++++++++++++++++++
>  3 files changed, 274 insertions(+)
>

> --- a/target/arm/mte_helper.c
> +++ b/target/arm/mte_helper.c
> @@ -25,8 +25,21 @@
>  #include "exec/helper-proto.h"
>
>
> +static uint8_t *allocation_tag_mem(CPUARMState *env, uint64_t ptr,
> +                                   bool write, uintptr_t ra)
> +{
> +    /* Tag storage not implemented.  */
> +    return NULL;
> +}
> +
>  static int get_allocation_tag(CPUARMState *env, uint64_t ptr, uintptr_t ra)
>  {
> +    uint8_t *mem = allocation_tag_mem(env, ptr, false, ra);
> +
> +    if (mem) {
> +        int ofs = extract32(ptr, LOG2_TAG_GRANULE, 1) * 4;
> +        return extract32(atomic_read(mem), ofs, 4);

Can we have a comment somewhere describing what our tag
storage looks like? I guess from the code that we're doing
it as a byte array where each byte stores two 4-bit tags
(in which order?), but documenting it would be nice.

> +    }
>      /* Tag storage not implemented.  */
>      return -1;
>  }

> +static void do_st2g(CPUARMState *env, uint64_t ptr1, uint64_t xt,
> +                    uintptr_t ra, stg_store1 store1)
> +{
> +    int el, tag;
> +    uint64_t ptr2, sctlr;
> +    uint8_t *mem1, *mem2;
> +
> +    check_tag_aligned(env, ptr1, ra);
> +
> +    el = arm_current_el(env);
> +    sctlr = arm_sctlr(env, el);
> +    tag = allocation_tag_from_addr(xt);
> +
> +    /*
> +     * Trap if accessing an invalid page(s).
> +     * This takes priority over !allocation_tag_access_enabled.
> +     */
> +    mem1 = allocation_tag_mem(env, ptr1, true, ra);
> +
> +    if (ptr1 & TAG_GRANULE) {
> +        /* The two stores are unaligned and modify two bytes.  */
> +        ptr2 = ptr1 + TAG_GRANULE;
> +        mem2 = allocation_tag_mem(env, ptr2, true, ra);
> +
> +        /* Store if page supports tags and access is enabled.  */
> +        if ((mem1 || mem2) && allocation_tag_access_enabled(env, el, sctlr)) {
> +            if (mem1) {
> +                store1(ptr1, mem1, tag);
> +            }
> +            if (mem2) {
> +                store1(ptr2, mem2, tag);
> +            }
> +        }
> +    } else {
> +        /* The two stores are aligned 32, and modify one byte.  */

Not sure what the '32' means here?

> +        if (mem1 && allocation_tag_access_enabled(env, el, sctlr)) {
> +            tag |= tag << 4;
> +            atomic_set(mem1, tag);
> +        }
> +    }
> +}

> diff --git a/target/arm/translate-a64.c b/target/arm/translate-a64.c
> index cf341c98d3..c17b36ebb2 100644
> --- a/target/arm/translate-a64.c
> +++ b/target/arm/translate-a64.c
> @@ -3559,6 +3559,118 @@ static void disas_ldst_single_struct(DisasContext *s, uint32_t insn)
>      }
>  }
>
> +/*
> + * Load/Store memory tags
> + *
> + *  31 30 29         24     22  21     12    10      5      0
> + * +-----+-------------+-----+---+------+-----+------+------+
> + * | 1 1 | 0 1 1 0 0 1 | op1 | 1 | imm9 | op2 |  Rn  |  Rt  |
> + * +-----+-------------+-----+---+------+-----+------+------+
> + */
> +static void disas_ldst_tag(DisasContext *s, uint32_t insn)
> +{
> +    int rt = extract32(insn, 0, 5);
> +    int rn = extract32(insn, 5, 5);
> +    uint64_t offset = sextract64(insn, 12, 9) << LOG2_TAG_GRANULE;
> +    int op2 = extract32(insn, 10, 3);

Typo? op2 is only 2 bits wide (bits [11:10]), not 3.

> +    int op1 = extract32(insn, 22, 2);

The Arm ARM calls this field 'opc', fwiw.

> +    bool is_load = false, is_pair = false, is_zero = false;
> +    int index = 0;
> +    TCGv_i64 dirty_addr, clean_addr, tcg_rt;
> +
> +    if ((insn & 0xff200000) != 0xd9200000
> +        || !dc_isar_feature(aa64_mte_insn_reg, s)) {
> +        goto do_unallocated;
> +    }

Bits 28:24 are already checked by the decode that got us here.

I did wonder about maybe doing the decode of
[31:30] and [21] in the caller (which would match the
structure of the decode tables in the manual), but
we do the same sort of thing for bit [31] in
disas_ldst_multiple_struct() and disas_ldst_single_struct(),
so this is fine.

Not all the insns in this encoding group are present
for the mte_insn_reg cut-down implementation:
LDGM, STGM and STZGM should UNDEF unless we have
full-fat MTE. We haven't added any of those in this patch,
but it might affect how you want to structure the
conditional for doing the feature bit test. (Looking
ahead, patch 13 which adds those insns doesn't update the
feature bit test.)

> +
> +    switch (op1) {
> +    case 0: /* STG */
> +        if (op2 != 0) {
> +            /* STG */
> +            index = op2 - 2;

What does 'index' represent? It looks from the rest of
the code like it's some sort of tristate between
'preindex', 'postindex' and 'not indexed'; if so
a comment explaining what the valid values and meanings
are would be helpful. Alternatively, follow the approach
of disas_ldst_reg_imm9() and just have separate
'post_index' and 'writeback' bools.

> +            break;
> +        }
> +        goto do_unallocated;
> +    case 1:
> +        if (op2 != 0) {
> +            /* STZG */
> +            is_zero = true;
> +            index = op2 - 2;
> +        } else {
> +            /* LDG */
> +            is_load = true;
> +        }
> +        break;
> +    case 2:
> +        if (op2 != 0) {
> +            /* ST2G */
> +            is_pair = true;
> +            index = op2 - 2;
> +            break;
> +        }
> +        goto do_unallocated;
> +    case 3:
> +        if (op2 != 0) {
> +            /* STZ2G */
> +            is_pair = is_zero = true;
> +            index = op2 - 2;
> +            break;
> +        }
> +        goto do_unallocated;
> +
> +    default:
> +    do_unallocated:
> +        unallocated_encoding(s);
> +        return;
> +    }

Should there be a
    if (rn == 31) {
        gen_check_sp_alignment(s);
    }
here?

> +
> +    dirty_addr = read_cpu_reg_sp(s, rn, true);
> +    if (index <= 0) {
> +        /* pre-index or signed offset */
> +        tcg_gen_addi_i64(dirty_addr, dirty_addr, offset);
> +    }
> +
> +    clean_addr = clean_data_tbi(s, dirty_addr, false);
> +    tcg_rt = cpu_reg(s, rt);

I think this is only correct for LDG, where the Rt field
is 'specifies the Xt register to use'; for STG and ST2G
it's an '<Xn|SP>' form where 31 means "use SP" and you
want cpu_reg_sp() for those.

> +
> +    if (is_load) {
> +        gen_helper_ldg(tcg_rt, cpu_env, clean_addr, tcg_rt);
> +    } else if (tb_cflags(s->base.tb) & CF_PARALLEL) {
> +        if (is_pair) {
> +            gen_helper_st2g_parallel(cpu_env, clean_addr, tcg_rt);
> +        } else {
> +            gen_helper_stg_parallel(cpu_env, clean_addr, tcg_rt);
> +        }
> +    } else {
> +        if (is_pair) {
> +            gen_helper_st2g(cpu_env, clean_addr, tcg_rt);
> +        } else {
> +            gen_helper_stg(cpu_env, clean_addr, tcg_rt);
> +        }
> +    }
> +
> +    if (is_zero) {
> +        TCGv_i64 tcg_zero = tcg_const_i64(0);
> +        int mem_index = get_mem_index(s);
> +        int i, n = (1 + is_pair) << LOG2_TAG_GRANULE;
> +
> +        for (i = 0; i < n; i += 8) {
> +            tcg_gen_qemu_st_i64(tcg_zero, clean_addr, mem_index, MO_Q);
> +            tcg_gen_addi_i64(clean_addr, clean_addr, 8);
> +        }
> +        tcg_temp_free_i64(tcg_zero);
> +    }
> +
> +    if (index != 0) {
> +        /* pre-index or post-index */
> +        if (index > 0) {
> +            /* post-index */
> +            tcg_gen_addi_i64(dirty_addr, dirty_addr, offset);
> +        }
> +        tcg_gen_mov_i64(cpu_reg_sp(s, rn), dirty_addr);
> +    }
> +}

thanks
-- PMM