Re: [Qemu-devel] [RFC v6 10/14] softmmu: Simplify helper_*_st_name, wrap unaligned code

From: "Alex Bennée" <alex.bennee@linaro.org>
To: Alvise Rigo <a.rigo@virtualopensystems.com>
Cc: mttcg@listserver.greensocs.com, claudio.fontana@huawei.com,
	qemu-devel@nongnu.org, pbonzini@redhat.com,
	jani.kokkonen@huawei.com, tech@virtualopensystems.com,
	rth@twiddle.net
Subject: Re: [Qemu-devel] [RFC v6 10/14] softmmu: Simplify helper_*_st_name, wrap unaligned code
Date: Thu, 07 Jan 2016 14:46:12 +0000	[thread overview]
Message-ID: <8760z5cury.fsf@linaro.org> (raw)
In-Reply-To: <1450082498-27109-11-git-send-email-a.rigo@virtualopensystems.com>

Alvise Rigo <a.rigo@virtualopensystems.com> writes:

> Attempting to simplify the helper_*_st_name, wrap the
> do_unaligned_access code into an inline function.
> Remove also the goto statement.

As I said in the other thread I think these sort of clean-ups can come
before the ll/sc implementations and potentially get merged ahead of the
rest of it.

>
> Suggested-by: Jani Kokkonen <jani.kokkonen@huawei.com>
> Suggested-by: Claudio Fontana <claudio.fontana@huawei.com>
> Signed-off-by: Alvise Rigo <a.rigo@virtualopensystems.com>
> ---
>  softmmu_template.h | 96 ++++++++++++++++++++++++++++++++++--------------------
>  1 file changed, 60 insertions(+), 36 deletions(-)
>
> diff --git a/softmmu_template.h b/softmmu_template.h
> index d3d5902..92f92b1 100644
> --- a/softmmu_template.h
> +++ b/softmmu_template.h
> @@ -370,6 +370,32 @@ static inline void glue(io_write, SUFFIX)(CPUArchState *env,
>                                   iotlbentry->attrs);
>  }
>
> +static inline void glue(helper_le_st_name, _do_unl_access)(CPUArchState *env,
> +                                                           DATA_TYPE val,
> +                                                           target_ulong addr,
> +                                                           TCGMemOpIdx oi,
> +                                                           unsigned mmu_idx,
> +                                                           uintptr_t retaddr)
> +{
> +    int i;
> +
> +    if ((get_memop(oi) & MO_AMASK) == MO_ALIGN) {
> +        cpu_unaligned_access(ENV_GET_CPU(env), addr, MMU_DATA_STORE,
> +                             mmu_idx, retaddr);
> +    }
> +    /* XXX: not efficient, but simple */
> +    /* Note: relies on the fact that tlb_fill() does not remove the
> +     * previous page from the TLB cache.  */
> +    for (i = DATA_SIZE - 1; i >= 0; i--) {
> +        /* Little-endian extract.  */
> +        uint8_t val8 = val >> (i * 8);
> +        /* Note the adjustment at the beginning of the function.
> +           Undo that for the recursion.  */
> +        glue(helper_ret_stb, MMUSUFFIX)(env, addr + i, val8,
> +                                        oi, retaddr + GETPC_ADJ);
> +    }
> +}

There is still duplication of 99% of the code here which is silly given
the compiler inlines the code anyway. If we gave the helper a more
generic name and passed the endianess in via args I would hope the
compiler did the sensible thing and constant fold the code. Something
like:

static inline void glue(helper_generic_st_name, _do_unl_access)
                        (CPUArchState *env,
                        bool little_endian,
                        DATA_TYPE val,
                        target_ulong addr,
                        TCGMemOpIdx oi,
                        unsigned mmu_idx,
                        uintptr_t retaddr)
{
    int i;

    if ((get_memop(oi) & MO_AMASK) == MO_ALIGN) {
        cpu_unaligned_access(ENV_GET_CPU(env), addr, MMU_DATA_STORE,
                             mmu_idx, retaddr);
    }
    /* Note: relies on the fact that tlb_fill() does not remove the
     * previous page from the TLB cache.  */
    for (i = DATA_SIZE - 1; i >= 0; i--) {
        if (little_endian) {
                /* Little-endian extract.  */
                uint8_t val8 = val >> (i * 8);
        } else {
                /* Big-endian extract.  */
                uint8_t val8 = val >> (((DATA_SIZE - 1) * 8) - (i * 8));
        }
        /* Note the adjustment at the beginning of the function.
           Undo that for the recursion.  */
        glue(helper_ret_stb, MMUSUFFIX)(env, addr + i, val8,
                                        oi, retaddr + GETPC_ADJ);
    }
}

> +
>  void helper_le_st_name(CPUArchState *env, target_ulong addr, DATA_TYPE val,
>                         TCGMemOpIdx oi, uintptr_t retaddr)
>  {
> @@ -433,7 +459,8 @@ void helper_le_st_name(CPUArchState *env, target_ulong addr, DATA_TYPE val,
>              return;
>          } else {
>              if ((addr & (DATA_SIZE - 1)) != 0) {
> -                goto do_unaligned_access;
> +                glue(helper_le_st_name, _do_unl_access)(env, val, addr, mmu_idx,
> +                                                        oi, retaddr);
>              }
>              iotlbentry = &env->iotlb[mmu_idx][index];
>
> @@ -449,23 +476,8 @@ void helper_le_st_name(CPUArchState *env, target_ulong addr, DATA_TYPE val,
>      if (DATA_SIZE > 1
>          && unlikely((addr & ~TARGET_PAGE_MASK) + DATA_SIZE - 1
>                       >= TARGET_PAGE_SIZE)) {
> -        int i;
> -    do_unaligned_access:
> -        if ((get_memop(oi) & MO_AMASK) == MO_ALIGN) {
> -            cpu_unaligned_access(ENV_GET_CPU(env), addr, MMU_DATA_STORE,
> -                                 mmu_idx, retaddr);
> -        }
> -        /* XXX: not efficient, but simple */
> -        /* Note: relies on the fact that tlb_fill() does not remove the
> -         * previous page from the TLB cache.  */
> -        for (i = DATA_SIZE - 1; i >= 0; i--) {
> -            /* Little-endian extract.  */
> -            uint8_t val8 = val >> (i * 8);
> -            /* Note the adjustment at the beginning of the function.
> -               Undo that for the recursion.  */
> -            glue(helper_ret_stb, MMUSUFFIX)(env, addr + i, val8,
> -                                            oi, retaddr + GETPC_ADJ);
> -        }
> +        glue(helper_le_st_name, _do_unl_access)(env, val, addr, oi, mmu_idx,
> +                                                retaddr);
>          return;
>      }
>
> @@ -485,6 +497,32 @@ void helper_le_st_name(CPUArchState *env, target_ulong addr, DATA_TYPE val,
>  }
>
>  #if DATA_SIZE > 1
> +static inline void glue(helper_be_st_name, _do_unl_access)(CPUArchState *env,
> +                                                           DATA_TYPE val,
> +                                                           target_ulong addr,
> +                                                           TCGMemOpIdx oi,
> +                                                           unsigned mmu_idx,
> +                                                           uintptr_t retaddr)
> +{
> +    int i;
> +
> +    if ((get_memop(oi) & MO_AMASK) == MO_ALIGN) {
> +        cpu_unaligned_access(ENV_GET_CPU(env), addr, MMU_DATA_STORE,
> +                             mmu_idx, retaddr);
> +    }
> +    /* XXX: not efficient, but simple */
> +    /* Note: relies on the fact that tlb_fill() does not remove the
> +     * previous page from the TLB cache.  */
> +    for (i = DATA_SIZE - 1; i >= 0; i--) {
> +        /* Big-endian extract.  */
> +        uint8_t val8 = val >> (((DATA_SIZE - 1) * 8) - (i * 8));
> +        /* Note the adjustment at the beginning of the function.
> +           Undo that for the recursion.  */
> +        glue(helper_ret_stb, MMUSUFFIX)(env, addr + i, val8,
> +                                        oi, retaddr + GETPC_ADJ);
> +    }
> +}

Not that it matters if you combine to two as suggested because anything
not called shouldn't generate the code.

>  void helper_be_st_name(CPUArchState *env, target_ulong addr, DATA_TYPE val,
>                         TCGMemOpIdx oi, uintptr_t retaddr)
>  {
> @@ -548,7 +586,8 @@ void helper_be_st_name(CPUArchState *env, target_ulong addr, DATA_TYPE val,
>              return;
>          } else {
>              if ((addr & (DATA_SIZE - 1)) != 0) {
> -                goto do_unaligned_access;
> +                glue(helper_be_st_name, _do_unl_access)(env, val, addr, mmu_idx,
> +                                                        oi, retaddr);
>              }
>              iotlbentry = &env->iotlb[mmu_idx][index];
>
> @@ -564,23 +603,8 @@ void helper_be_st_name(CPUArchState *env, target_ulong addr, DATA_TYPE val,
>      if (DATA_SIZE > 1
>          && unlikely((addr & ~TARGET_PAGE_MASK) + DATA_SIZE - 1
>                       >= TARGET_PAGE_SIZE)) {
> -        int i;
> -    do_unaligned_access:
> -        if ((get_memop(oi) & MO_AMASK) == MO_ALIGN) {
> -            cpu_unaligned_access(ENV_GET_CPU(env), addr, MMU_DATA_STORE,
> -                                 mmu_idx, retaddr);
> -        }
> -        /* XXX: not efficient, but simple */
> -        /* Note: relies on the fact that tlb_fill() does not remove the
> -         * previous page from the TLB cache.  */
> -        for (i = DATA_SIZE - 1; i >= 0; i--) {
> -            /* Big-endian extract.  */
> -            uint8_t val8 = val >> (((DATA_SIZE - 1) * 8) - (i * 8));
> -            /* Note the adjustment at the beginning of the function.
> -               Undo that for the recursion.  */
> -            glue(helper_ret_stb, MMUSUFFIX)(env, addr + i, val8,
> -                                            oi, retaddr + GETPC_ADJ);
> -        }
> +        glue(helper_be_st_name, _do_unl_access)(env, val, addr, oi, mmu_idx,
> +                                                retaddr);
>          return;
>      }

--
Alex Bennée