All of lore.kernel.org
 help / color / mirror / Atom feed
From: Aleksandar Markovic <aleksandar.m.mail@gmail.com>
To: Mateja Marjanovic <mateja.marjanovic@rt-rk.com>
Cc: "Aleksandar Rikalo" <arikalo@wavecomp.com>,
	"Richard Henderson" <richard.henderson@linaro.org>,
	"QEMU Developers" <qemu-devel@nongnu.org>,
	"Aleksandar Markovic" <amarkovic@wavecomp.com>,
	"Philippe Mathieu-Daudé" <philmd@redhat.com>,
	"Aurelien Jarno" <aurelien@aurel32.net>
Subject: Re: [Qemu-devel] [PATCH v6 3/4] target/mips: Optimize ILVL.<B|H|W|D> MSA instructions
Date: Sat, 13 Apr 2019 18:15:29 +0200	[thread overview]
Message-ID: <CAL1e-=h4HXc4601oU2ZRfBNSe1yQ_zKDvpMtAmE6zOuzSMxryA@mail.gmail.com> (raw)
Message-ID: <20190413161529.Zv837AefgjhxebmFFfLelPKbiJdy57nBjoaOEJjKRa8@z> (raw)
In-Reply-To: <1554383690-28338-4-git-send-email-mateja.marjanovic@rt-rk.com>

On Thu, Apr 4, 2019 at 3:16 PM Mateja Marjanovic
<mateja.marjanovic@rt-rk.com> wrote:
>
> From: Mateja Marjanovic <Mateja.Marjanovic@rt-rk.com>
>
> Optimized ILVL.<B|H|W|D> instructions, using a hybrid
> approach. For byte data elements, use a helper with an
> unrolled loop (much better performance), for halfword,
> word and doubleword data elements use directly tcg
> registers and logic performed on them.
>
> Performance measurement is done by executing the
> instructions a large number of times on a computer
> with Intel Core i7-3770 CPU @ 3.40GHz×8.
>
> ==================================================
> ||  instr  ||  helper  ||   tcg    ||  hybrid   ||
> ==================================================
> || ilvl.b: || 59.91 ms || 74.41 ms ||  59.24 ms || <-- helper
> || ilvl.h: || 41.33 ms || 33.08 ms ||  32.96 ms || <-- tcg
> || ilvl.w: || 30.99 ms || 22.87 ms ||  22.81 ms || <-- tcg
> || ilvl.d: || 26.40 ms || 19.64 ms ||  19.45 ms || <-- tcg
> ==================================================
>
> Signed-off-by: Mateja Marjanovic <mateja.marjanovic@rt-rk.com>
> ---
>  target/mips/helper.h     |   3 +-
>  target/mips/msa_helper.c |  33 ++++++---
>  target/mips/translate.c  | 184 ++++++++++++++++++++++++++++++++++++++++++++++-
>  3 files changed, 210 insertions(+), 10 deletions(-)
>
> diff --git a/target/mips/helper.h b/target/mips/helper.h
> index 82f6a40..cd73723 100644
> --- a/target/mips/helper.h
> +++ b/target/mips/helper.h
> @@ -862,7 +862,6 @@ DEF_HELPER_5(msa_sld_df, void, env, i32, i32, i32, i32)
>  DEF_HELPER_5(msa_splat_df, void, env, i32, i32, i32, i32)
>  DEF_HELPER_5(msa_pckev_df, void, env, i32, i32, i32, i32)
>  DEF_HELPER_5(msa_pckod_df, void, env, i32, i32, i32, i32)
> -DEF_HELPER_5(msa_ilvl_df, void, env, i32, i32, i32, i32)
>  DEF_HELPER_5(msa_ilvr_df, void, env, i32, i32, i32, i32)
>  DEF_HELPER_5(msa_vshf_df, void, env, i32, i32, i32, i32)
>  DEF_HELPER_5(msa_srar_df, void, env, i32, i32, i32, i32)
> @@ -946,6 +945,8 @@ DEF_HELPER_4(msa_insert_h, void, env, i32, i32, i32)
>  DEF_HELPER_4(msa_insert_w, void, env, i32, i32, i32)
>  DEF_HELPER_4(msa_insert_d, void, env, i32, i32, i32)
>
> +DEF_HELPER_4(msa_ilvl_b, void, env, i32, i32, i32)
> +
>  DEF_HELPER_4(msa_fclass_df, void, env, i32, i32, i32)
>  DEF_HELPER_4(msa_ftrunc_s_df, void, env, i32, i32, i32)
>  DEF_HELPER_4(msa_ftrunc_u_df, void, env, i32, i32, i32)
> diff --git a/target/mips/msa_helper.c b/target/mips/msa_helper.c
> index d5c3842..84bbe6f 100644
> --- a/target/mips/msa_helper.c
> +++ b/target/mips/msa_helper.c
> @@ -1184,14 +1184,6 @@ MSA_FN_DF(pckod_df)
>
>  #define MSA_DO(DF)                      \
>      do {                                \
> -        pwx->DF[2*i]   = L##DF(pwt, i); \
> -        pwx->DF[2*i+1] = L##DF(pws, i); \
> -    } while (0)
> -MSA_FN_DF(ilvl_df)
> -#undef MSA_DO
> -
> -#define MSA_DO(DF)                      \
> -    do {                                \
>          pwx->DF[2*i]   = R##DF(pwt, i); \
>          pwx->DF[2*i+1] = R##DF(pws, i); \
>      } while (0)
> @@ -1232,6 +1224,31 @@ void helper_msa_splati_df(CPUMIPSState *env, uint32_t df, uint32_t wd,
>      msa_splat_df(df, pwd, pws, n);
>  }
>
> +void helper_msa_ilvl_b(CPUMIPSState *env, uint32_t wd,
> +                       uint32_t ws, uint32_t wt)
> +{
> +    wr_t *pwd = &(env->active_fpu.fpr[wd].wr);
> +    wr_t *pws = &(env->active_fpu.fpr[ws].wr);
> +    wr_t *pwt = &(env->active_fpu.fpr[wt].wr);
> +
> +    pwd->b[0]  = pwt->b[8];
> +    pwd->b[1]  = pws->b[8];
> +    pwd->b[2]  = pwt->b[9];
> +    pwd->b[3]  = pws->b[9];
> +    pwd->b[4]  = pwt->b[10];
> +    pwd->b[5]  = pws->b[10];
> +    pwd->b[6]  = pwt->b[11];
> +    pwd->b[7]  = pws->b[11];
> +    pwd->b[8]  = pwt->b[12];
> +    pwd->b[9]  = pws->b[12];
> +    pwd->b[10] = pwt->b[13];
> +    pwd->b[11] = pws->b[13];
> +    pwd->b[12] = pwt->b[14];
> +    pwd->b[13] = pws->b[14];
> +    pwd->b[14] = pwt->b[15];
> +    pwd->b[15] = pws->b[15];
> +}
> +
>  void helper_msa_copy_s_b(CPUMIPSState *env, uint32_t rd,
>                           uint32_t ws, uint32_t n)
>  {
> diff --git a/target/mips/translate.c b/target/mips/translate.c
> index 3057669..6c6811e 100644
> --- a/target/mips/translate.c
> +++ b/target/mips/translate.c
> @@ -28885,6 +28885,173 @@ static void gen_msa_bit(CPUMIPSState *env, DisasContext *ctx)
>  }
>
>  /*
> + * [MSA] ILVL.B wd, ws, wt
> + *
> + *   Vector Interleave Left (byte data elements)
> + *
> + */
> +static inline void gen_ilvl_b(CPUMIPSState *env, uint32_t wd,
> +                              uint32_t ws, uint32_t wt)
> +{
> +    TCGv_i64 t1 = tcg_temp_new_i64();
> +    TCGv_i64 t2 = tcg_temp_new_i64();
> +    uint64_t mask = 0x00000000000000ffULL;
> +
> +    tcg_gen_andi_i64(t1, msa_wr_d[wt * 2 + 1], mask);
> +    tcg_gen_mov_i64(t2, t1);
> +    tcg_gen_andi_i64(t1, msa_wr_d[ws * 2 + 1], mask);
> +    tcg_gen_shli_i64(t1, t1, 8);
> +    tcg_gen_or_i64(t2, t2, t1);
> +
> +    mask <<= 8;
> +    tcg_gen_andi_i64(t1, msa_wr_d[wt * 2 + 1], mask);
> +    tcg_gen_shli_i64(t1, t1, 8);
> +    tcg_gen_or_i64(t2, t2, t1);
> +    tcg_gen_andi_i64(t1, msa_wr_d[ws * 2 + 1], mask);
> +    tcg_gen_shli_i64(t1, t1, 16);
> +    tcg_gen_or_i64(t2, t2, t1);
> +
> +    mask <<= 8;
> +    tcg_gen_andi_i64(t1, msa_wr_d[wt * 2 + 1], mask);
> +    tcg_gen_shli_i64(t1, t1, 16);
> +    tcg_gen_or_i64(t2, t2, t1);
> +    tcg_gen_andi_i64(t1, msa_wr_d[ws * 2 + 1], mask);
> +    tcg_gen_shli_i64(t1, t1, 24);
> +    tcg_gen_or_i64(t2, t2, t1);
> +
> +    mask <<= 8;
> +    tcg_gen_andi_i64(t1, msa_wr_d[wt * 2 + 1], mask);
> +    tcg_gen_shli_i64(t1, t1, 24);
> +    tcg_gen_or_i64(t2, t2, t1);
> +    tcg_gen_andi_i64(t1, msa_wr_d[ws * 2 + 1], mask);
> +    tcg_gen_shli_i64(t1, t1, 32);
> +    tcg_gen_or_i64(msa_wr_d[wd * 2], t2, t1);
> +
> +    mask <<= 8;
> +    tcg_gen_andi_i64(t1, msa_wr_d[wt * 2 + 1], mask);
> +    tcg_gen_shri_i64(t1, t1, 32);
> +    tcg_gen_mov_i64(t2, t1);
> +    tcg_gen_andi_i64(t1, msa_wr_d[ws * 2 + 1], mask);
> +    tcg_gen_shri_i64(t1, t1, 24);
> +    tcg_gen_or_i64(t2, t2, t1);
> +
> +    mask <<= 8;
> +    tcg_gen_andi_i64(t1, msa_wr_d[wt * 2 + 1], mask);
> +    tcg_gen_shri_i64(t1, t1, 24);
> +    tcg_gen_or_i64(t2, t2, t1);
> +    tcg_gen_andi_i64(t1, msa_wr_d[ws * 2 + 1], mask);
> +    tcg_gen_shri_i64(t1, t1, 16);
> +    tcg_gen_or_i64(t2, t2, t1);
> +
> +    mask <<= 8;
> +    tcg_gen_andi_i64(t1, msa_wr_d[wt * 2 + 1], mask);
> +    tcg_gen_shri_i64(t1, t1, 16);
> +    tcg_gen_or_i64(t2, t2, t1);
> +    tcg_gen_andi_i64(t1, msa_wr_d[ws * 2 + 1], mask);
> +    tcg_gen_shri_i64(t1, t1, 8);
> +    tcg_gen_or_i64(t2, t2, t1);
> +
> +    mask <<= 8;
> +    tcg_gen_andi_i64(t1, msa_wr_d[wt * 2 + 1], mask);
> +    tcg_gen_shri_i64(t1, t1, 8);
> +    tcg_gen_or_i64(t2, t2, t1);
> +    tcg_gen_andi_i64(t1, msa_wr_d[ws * 2 + 1], mask);
> +    tcg_gen_or_i64(msa_wr_d[wd * 2 + 1], t2, t1);
> +
> +    tcg_temp_free_i64(t1);
> +    tcg_temp_free_i64(t2);
> +}
> +
> +/*
> + * [MSA] ILVL.H wd, ws, wt
> + *
> + *   Vector Interleave Left (halfword data elements)
> + *
> + */
> +static inline void gen_ilvl_h(CPUMIPSState *env, uint32_t wd,
> +                              uint32_t ws, uint32_t wt)
> +{
> +    TCGv_i64 t1 = tcg_temp_new_i64();
> +    TCGv_i64 t2 = tcg_temp_new_i64();
> +    uint64_t mask = 0x000000000000ffffULL;
> +
> +    tcg_gen_andi_i64(t1, msa_wr_d[wt * 2 + 1], mask);
> +    tcg_gen_mov_i64(t2, t1);
> +    tcg_gen_andi_i64(t1, msa_wr_d[ws * 2 + 1], mask);
> +    tcg_gen_shli_i64(t1, t1, 16);
> +    tcg_gen_or_i64(t2, t2, t1);
> +
> +    mask <<= 16;
> +    tcg_gen_andi_i64(t1, msa_wr_d[wt * 2 + 1], mask);
> +    tcg_gen_shli_i64(t1, t1, 16);
> +    tcg_gen_or_i64(t2, t2, t1);
> +    tcg_gen_andi_i64(t1, msa_wr_d[ws * 2 + 1], mask);
> +    tcg_gen_shli_i64(t1, t1, 32);
> +    tcg_gen_or_i64(msa_wr_d[wd * 2], t2, t1);
> +
> +    mask <<= 16;
> +    tcg_gen_andi_i64(t1, msa_wr_d[wt * 2 + 1], mask);
> +    tcg_gen_shri_i64(t1, t1, 32);
> +    tcg_gen_mov_i64(t2, t1);
> +    tcg_gen_andi_i64(t1, msa_wr_d[ws * 2 + 1], mask);
> +    tcg_gen_shri_i64(t1, t1, 16);
> +    tcg_gen_or_i64(t2, t2, t1);
> +
> +    mask <<= 16;
> +    tcg_gen_andi_i64(t1, msa_wr_d[wt * 2 + 1], mask);
> +    tcg_gen_shri_i64(t1, t1, 16);
> +    tcg_gen_or_i64(t2, t2, t1);
> +    tcg_gen_andi_i64(t1, msa_wr_d[ws * 2 + 1], mask);
> +    tcg_gen_or_i64(msa_wr_d[wd * 2 + 1], t2, t1);
> +
> +    tcg_temp_free_i64(t1);
> +    tcg_temp_free_i64(t2);
> +}
> +
> +/*
> + * [MSA] ILVL.W wd, ws, wt
> + *
> + *   Vector Interleave Left (word data elements)
> + *
> + */
> +static inline void gen_ilvl_w(CPUMIPSState *env, uint32_t wd,
> +                              uint32_t ws, uint32_t wt)
> +{
> +    TCGv_i64 t1 = tcg_temp_new_i64();
> +    TCGv_i64 t2 = tcg_temp_new_i64();
> +    uint64_t mask = 0x00000000ffffffffULL;
> +
> +    tcg_gen_andi_i64(t1, msa_wr_d[wt * 2 + 1], mask);
> +    tcg_gen_mov_i64(t2, t1);
> +    tcg_gen_andi_i64(t1, msa_wr_d[ws * 2 + 1], mask);
> +    tcg_gen_shli_i64(t1, t1, 32);
> +    tcg_gen_or_i64(msa_wr_d[wd * 2], t2, t1);
> +
> +    mask <<= 32;
> +    tcg_gen_andi_i64(t1, msa_wr_d[wt * 2 + 1], mask);
> +    tcg_gen_shri_i64(t1, t1, 32);
> +    tcg_gen_mov_i64(t2, t1);
> +    tcg_gen_andi_i64(t1, msa_wr_d[ws * 2 + 1], mask);
> +    tcg_gen_or_i64(msa_wr_d[wd * 2 + 1], t2, t1);
> +
> +    tcg_temp_free_i64(t1);
> +    tcg_temp_free_i64(t2);
> +}
> +
> +/*
> + * [MSA] ILVL.D wd, ws, wt
> + *
> + *   Vector Interleave Left (doubleword data elements)
> + *
> + */
> +static inline void gen_ilvl_d(CPUMIPSState *env, uint32_t wd,
> +                              uint32_t ws, uint32_t wt)
> +{
> +    tcg_gen_mov_i64(msa_wr_d[wd * 2 + 1], msa_wr_d[ws * 2 + 1]);
> +    tcg_gen_mov_i64(msa_wr_d[wd * 2], msa_wr_d[wt * 2 + 1]);

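This code introduces a bug for the case wd == wt: the first move overwrites
the upper doubleword of wd (which is also wt) before the second move reads it,
so the lower doubleword of wd ends up with the value from ws instead of wt.
You keep repeating the same mistake over and over again.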

Please see also my comments for ILVR.D.

Thanks,
Aleksandar

> +}
> +
> +/*
>   * [MSA] ILVOD.B wd, ws, wt
>   *
>   *   Vector Interleave Odd (byte data elements)
> @@ -29177,7 +29344,22 @@ static void gen_msa_3r(CPUMIPSState *env, DisasContext *ctx)
>          gen_helper_msa_div_s_df(cpu_env, tdf, twd, tws, twt);
>          break;
>      case OPC_ILVL_df:
> -        gen_helper_msa_ilvl_df(cpu_env, tdf, twd, tws, twt);
> +        switch (df) {
> +        case DF_BYTE:
> +            gen_helper_msa_ilvl_b(cpu_env, twd, tws, twt);
> +            break;
> +        case DF_HALF:
> +            gen_ilvl_h(env, wd, ws, wt);
> +            break;
> +        case DF_WORD:
> +            gen_ilvl_w(env, wd, ws, wt);
> +            break;
> +        case DF_DOUBLE:
> +            gen_ilvl_d(env, wd, ws, wt);
> +            break;
> +        default:
> +            assert(0);
> +        }
>          break;
>      case OPC_BNEG_df:
>          gen_helper_msa_bneg_df(cpu_env, tdf, twd, tws, twt);
> --
> 2.7.4
>
>


  reply	other threads:[~2019-04-13 16:16 UTC|newest]

Thread overview: 29+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2019-04-04 13:14 [Qemu-devel] [PATCH v6 0/4] target/mips: Optimize MSA interleave instructions Mateja Marjanovic
2019-04-04 13:14 ` [Qemu-devel] [PATCH v6 1/4] target/mips: Optimize ILVOD.<B|H|W|D> MSA instructions Mateja Marjanovic
2019-04-04 13:47   ` Philippe Mathieu-Daudé
2019-04-13 16:09   ` Aleksandar Markovic
2019-04-13 16:09     ` Aleksandar Markovic
2019-04-04 13:14 ` [Qemu-devel] [PATCH v6 2/4] target/mips: Optimize ILVEV.<B|H|W|D> " Mateja Marjanovic
2019-04-04 13:42   ` Philippe Mathieu-Daudé
2019-04-04 18:19     ` Aleksandar Markovic
2019-04-04 19:17       ` Philippe Mathieu-Daudé
2019-04-05  0:26         ` Aleksandar Markovic
2019-04-05  0:26           ` Aleksandar Markovic
2019-04-17 12:45     ` Mateja Marjanovic
2019-04-17 12:45       ` Mateja Marjanovic
2019-04-13 16:05   ` Aleksandar Markovic
2019-04-13 16:05     ` Aleksandar Markovic
2019-04-15 13:48     ` Mateja Marjanovic
2019-04-15 13:48       ` Mateja Marjanovic
2019-04-04 13:14 ` [Qemu-devel] [PATCH v6 3/4] target/mips: Optimize ILVL.<B|H|W|D> " Mateja Marjanovic
2019-04-13 16:15   ` Aleksandar Markovic [this message]
2019-04-13 16:15     ` Aleksandar Markovic
2019-04-04 13:14 ` [Qemu-devel] [PATCH v6 4/4] target/mips: Optimize ILVR.<B|H|W|D> " Mateja Marjanovic
2019-04-13 16:05   ` Aleksandar Markovic
2019-04-13 16:05     ` Aleksandar Markovic
2019-04-15 11:24     ` Mateja Marjanovic
2019-04-15 11:24       ` Mateja Marjanovic
2019-04-16 21:20       ` Aleksandar Markovic
2019-04-16 21:20         ` Aleksandar Markovic
2019-04-17  8:16         ` Mateja Marjanovic
2019-04-17  8:16           ` Mateja Marjanovic

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to='CAL1e-=h4HXc4601oU2ZRfBNSe1yQ_zKDvpMtAmE6zOuzSMxryA@mail.gmail.com' \
    --to=aleksandar.m.mail@gmail.com \
    --cc=amarkovic@wavecomp.com \
    --cc=arikalo@wavecomp.com \
    --cc=aurelien@aurel32.net \
    --cc=mateja.marjanovic@rt-rk.com \
    --cc=philmd@redhat.com \
    --cc=qemu-devel@nongnu.org \
    --cc=richard.henderson@linaro.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.