From: Aleksandar Markovic <aleksandar.m.mail@gmail.com>
To: Mateja Marjanovic <mateja.marjanovic@rt-rk.com>
Cc: "Aleksandar Rikalo" <arikalo@wavecomp.com>,
"Richard Henderson" <richard.henderson@linaro.org>,
"QEMU Developers" <qemu-devel@nongnu.org>,
"Aleksandar Markovic" <amarkovic@wavecomp.com>,
"Philippe Mathieu-Daudé" <philmd@redhat.com>,
"Aurelien Jarno" <aurelien@aurel32.net>
Subject: Re: [Qemu-devel] [PATCH v6 4/4] target/mips: Optimize ILVR.<B|H|W|D> MSA instructions
Date: Sat, 13 Apr 2019 18:05:21 +0200 [thread overview]
Message-ID: <CAL1e-=i3H+aMaOCqMBkK4r9E4Ezqtyqw=VbwNmJZ6Wnc3in5PA@mail.gmail.com> (raw)
Message-ID: <20190413160521.whwuQx9kS7B0hcdwa26gAulXsRKhRjXrAH0kehmG9I4@z> (raw)
In-Reply-To: <1554383690-28338-5-git-send-email-mateja.marjanovic@rt-rk.com>
On Thu, Apr 4, 2019 at 3:16 PM Mateja Marjanovic
<mateja.marjanovic@rt-rk.com> wrote:
>
> From: Mateja Marjanovic <Mateja.Marjanovic@rt-rk.com>
>
> Optimized ILVR.<B|H|W|D> instructions, using a hybrid
Optimized -> Optimize
> approach. For byte data elements, use a helper with an
> unrolled loop (much better performance), for halfword,
(much better performance) -> (having much better performance
than direct tcg translation)
> word and doubleword data elements use directly tcg
> registers and logic performed on them.
>
> Performance measurement is done by executing the
> instructions a large number of times on a computer
> with Intel Core i7-3770 CPU @ 3.40GHz×8.
>
> ===================================================
> || instr || helper || tcg || hybrid ||
> ===================================================
> || ilvr.b: || 62.87 ms || 74.76 ms || 61.52 ms || <-- helper
> || ilvr.h: || 44.11 ms || 33.00 ms || 33.55 ms || <-- tcg
> || ilvr.w: || 34.97 ms || 23.06 ms || 22.67 ms || <-- tcg
> || ilvr.d: || 27.33 ms || 19.87 ms || 20.02 ms || <-- tcg
> ===================================================
>
instr -> instruction
|| 61.52 ms || <-- helper -> || 61.52 ms (helper) ||
and similarly for the other three rows.
> Signed-off-by: Mateja Marjanovic <mateja.marjanovic@rt-rk.com>
> ---
> target/mips/helper.h | 2 +-
> target/mips/msa_helper.c | 33 +++++++++++----
> target/mips/translate.c | 107 ++++++++++++++++++++++++++++++++++++++++++++++-
> 3 files changed, 132 insertions(+), 10 deletions(-)
>
> diff --git a/target/mips/helper.h b/target/mips/helper.h
> index cd73723..d4755ef 100644
> --- a/target/mips/helper.h
> +++ b/target/mips/helper.h
> @@ -862,7 +862,6 @@ DEF_HELPER_5(msa_sld_df, void, env, i32, i32, i32, i32)
> DEF_HELPER_5(msa_splat_df, void, env, i32, i32, i32, i32)
> DEF_HELPER_5(msa_pckev_df, void, env, i32, i32, i32, i32)
> DEF_HELPER_5(msa_pckod_df, void, env, i32, i32, i32, i32)
> -DEF_HELPER_5(msa_ilvr_df, void, env, i32, i32, i32, i32)
> DEF_HELPER_5(msa_vshf_df, void, env, i32, i32, i32, i32)
> DEF_HELPER_5(msa_srar_df, void, env, i32, i32, i32, i32)
> DEF_HELPER_5(msa_srlr_df, void, env, i32, i32, i32, i32)
> @@ -946,6 +945,7 @@ DEF_HELPER_4(msa_insert_w, void, env, i32, i32, i32)
> DEF_HELPER_4(msa_insert_d, void, env, i32, i32, i32)
>
> DEF_HELPER_4(msa_ilvl_b, void, env, i32, i32, i32)
> +DEF_HELPER_4(msa_ilvr_b, void, env, i32, i32, i32)
>
> DEF_HELPER_4(msa_fclass_df, void, env, i32, i32, i32)
> DEF_HELPER_4(msa_ftrunc_s_df, void, env, i32, i32, i32)
> diff --git a/target/mips/msa_helper.c b/target/mips/msa_helper.c
> index 84bbe6f..2470cef 100644
> --- a/target/mips/msa_helper.c
> +++ b/target/mips/msa_helper.c
> @@ -1181,14 +1181,6 @@ MSA_FN_DF(pckev_df)
> } while (0)
> MSA_FN_DF(pckod_df)
> #undef MSA_DO
> -
> -#define MSA_DO(DF) \
> - do { \
> - pwx->DF[2*i] = R##DF(pwt, i); \
> - pwx->DF[2*i+1] = R##DF(pws, i); \
> - } while (0)
> -MSA_FN_DF(ilvr_df)
> -#undef MSA_DO
> #undef MSA_LOOP_COND
>
> #define MSA_LOOP_COND(DF) \
> @@ -1249,6 +1241,31 @@ void helper_msa_ilvl_b(CPUMIPSState *env, uint32_t wd,
> pwd->b[15] = pws->b[15];
> }
>
> +void helper_msa_ilvr_b(CPUMIPSState *env, uint32_t wd,
> + uint32_t ws, uint32_t wt)
> +{
> + wr_t *pwd = &(env->active_fpu.fpr[wd].wr);
> + wr_t *pws = &(env->active_fpu.fpr[ws].wr);
> + wr_t *pwt = &(env->active_fpu.fpr[wt].wr);
> +
Why do we use env->active_fpu.fpr[wd].wr here, while for the other instructions
in this patch we access msa_wr_d[] directly?
> + pwd->b[15] = pws->b[7];
> + pwd->b[14] = pwt->b[7];
> + pwd->b[13] = pws->b[6];
> + pwd->b[12] = pwt->b[6];
> + pwd->b[11] = pws->b[5];
> + pwd->b[10] = pwt->b[5];
> + pwd->b[9] = pws->b[4];
> + pwd->b[8] = pwt->b[4];
> + pwd->b[7] = pws->b[3];
> + pwd->b[6] = pwt->b[3];
> + pwd->b[5] = pws->b[2];
> + pwd->b[4] = pwt->b[2];
> + pwd->b[3] = pws->b[1];
> + pwd->b[2] = pwt->b[1];
> + pwd->b[1] = pws->b[0];
> + pwd->b[0] = pwt->b[0];
> +}
> +
> void helper_msa_copy_s_b(CPUMIPSState *env, uint32_t rd,
> uint32_t ws, uint32_t n)
> {
> diff --git a/target/mips/translate.c b/target/mips/translate.c
> index 6c6811e..90332fb 100644
> --- a/target/mips/translate.c
> +++ b/target/mips/translate.c
> @@ -28885,6 +28885,96 @@ static void gen_msa_bit(CPUMIPSState *env, DisasContext *ctx)
> }
>
> /*
> + * [MSA] ILVR.H wd, ws, wt
> + *
> + * Vector Interleave Right (halfword data elements)
> + *
> + */
> +static inline void gen_ilvr_h(CPUMIPSState *env, uint32_t wd,
> + uint32_t ws, uint32_t wt)
> +{
> + TCGv_i64 t1 = tcg_temp_new_i64();
> + TCGv_i64 t2 = tcg_temp_new_i64();
> + uint64_t mask = 0x000000000000ffffULL;
> +
> + tcg_gen_andi_i64(t1, msa_wr_d[wt * 2], mask);
> + tcg_gen_mov_i64(t2, t1);
> + tcg_gen_andi_i64(t1, msa_wr_d[ws * 2], mask);
> + tcg_gen_shli_i64(t1, t1, 16);
> + tcg_gen_or_i64(t2, t2, t1);
> +
> + mask <<= 16;
> + tcg_gen_andi_i64(t1, msa_wr_d[wt * 2], mask);
> + tcg_gen_shli_i64(t1, t1, 16);
> + tcg_gen_or_i64(t2, t2, t1);
> + tcg_gen_andi_i64(t1, msa_wr_d[ws * 2], mask);
> + tcg_gen_shli_i64(t1, t1, 32);
> + tcg_gen_or_i64(msa_wr_d[wd * 2], t2, t1);
> +
> + mask <<= 16;
> + tcg_gen_andi_i64(t1, msa_wr_d[wt * 2], mask);
> + tcg_gen_shri_i64(t1, t1, 32);
> + tcg_gen_mov_i64(t2, t1);
> + tcg_gen_andi_i64(t1, msa_wr_d[ws * 2], mask);
> + tcg_gen_shri_i64(t1, t1, 16);
> + tcg_gen_or_i64(t2, t2, t1);
> +
> + mask <<= 16;
> + tcg_gen_andi_i64(t1, msa_wr_d[wt * 2], mask);
> + tcg_gen_shri_i64(t1, t1, 16);
> + tcg_gen_or_i64(t2, t2, t1);
> + tcg_gen_andi_i64(t1, msa_wr_d[ws * 2], mask);
> + tcg_gen_or_i64(msa_wr_d[wd * 2 + 1], t2, t1);
> +
> + tcg_temp_free_i64(t1);
> + tcg_temp_free_i64(t2);
> +}
> +
> +/*
> + * [MSA] ILVR.W wd, ws, wt
> + *
> + * Vector Interleave Right (word data elements)
> + *
> + */
> +static inline void gen_ilvr_w(CPUMIPSState *env, uint32_t wd,
> + uint32_t ws, uint32_t wt)
> +{
> + TCGv_i64 t1 = tcg_temp_new_i64();
> + TCGv_i64 t2 = tcg_temp_new_i64();
> + uint64_t mask = 0x00000000ffffffffULL;
Use tcg_const_i64(). The same for the previous function.
> +
> + tcg_gen_andi_i64(t1, msa_wr_d[wt * 2], mask);
> + tcg_gen_mov_i64(t2, t1);
> + tcg_gen_andi_i64(t1, msa_wr_d[ws * 2], mask);
> + tcg_gen_shli_i64(t1, t1, 32);
> + tcg_gen_or_i64(msa_wr_d[wd * 2], t2, t1);
> +
> + mask <<= 32;
Just assign the constant value to the mask; there is no need for a shift operation.
The same applies to the other similar cases in this patch.
> + tcg_gen_andi_i64(t1, msa_wr_d[wt * 2], mask);
> + tcg_gen_shri_i64(t1, t1, 32);
> + tcg_gen_mov_i64(t2, t1);
> + tcg_gen_andi_i64(t1, msa_wr_d[ws * 2], mask);
> + tcg_gen_or_i64(msa_wr_d[wd * 2 + 1], t2, t1);
> +
> + tcg_temp_free_i64(t1);
> + tcg_temp_free_i64(t2);
> +}
> +
> +/*
> + * [MSA] ILVR.D wd, ws, wt
> + *
> + * Vector Interleave Right (doubleword data elements)
> + *
> + */
> +static inline void gen_ilvr_d(CPUMIPSState *env, uint32_t wd,
> + uint32_t ws, uint32_t wt)
> +{
> + tcg_gen_mov_i64(msa_wr_d[wd * 2 + 1], msa_wr_d[ws * 2]);
> + tcg_gen_mov_i64(msa_wr_d[wd * 2], msa_wr_d[wt * 2]);
> +}
> +
This function seems to be identical to gen_ilvev_d(). If that is the case,
please rename gen_ilvev_d() to gen_ilvev_ilvr_d() in this patch,
and use it for handling both ILVEV.D and ILVR.D.
> +
> +/*
> * [MSA] ILVL.B wd, ws, wt
> *
> * Vector Interleave Left (byte data elements)
> @@ -29380,7 +29470,22 @@ static void gen_msa_3r(CPUMIPSState *env, DisasContext *ctx)
> gen_helper_msa_div_u_df(cpu_env, tdf, twd, tws, twt);
> break;
> case OPC_ILVR_df:
> - gen_helper_msa_ilvr_df(cpu_env, tdf, twd, tws, twt);
> + switch (df) {
> + case DF_BYTE:
> + gen_helper_msa_ilvr_b(cpu_env, twd, tws, twt);
> + break;
> + case DF_HALF:
> + gen_ilvr_h(env, wd, ws, wt);
> + break;
> + case DF_WORD:
> + gen_ilvr_w(env, wd, ws, wt);
> + break;
> + case DF_DOUBLE:
> + gen_ilvr_d(env, wd, ws, wt);
> + break;
> + default:
> + assert(0);
> + }
> break;
> case OPC_BINSL_df:
> gen_helper_msa_binsl_df(cpu_env, tdf, twd, tws, twt);
> --
> 2.7.4
>
>
Thanks,
Aleksandar
next prev parent reply other threads:[~2019-04-13 16:06 UTC|newest]
Thread overview: 29+ messages / expand[flat|nested] mbox.gz Atom feed top
2019-04-04 13:14 [Qemu-devel] [PATCH v6 0/4] target/mips: Optimize MSA interleave instructions Mateja Marjanovic
2019-04-04 13:14 ` [Qemu-devel] [PATCH v6 1/4] target/mips: Optimize ILVOD.<B|H|W|D> MSA instructions Mateja Marjanovic
2019-04-04 13:47 ` Philippe Mathieu-Daudé
2019-04-13 16:09 ` Aleksandar Markovic
2019-04-13 16:09 ` Aleksandar Markovic
2019-04-04 13:14 ` [Qemu-devel] [PATCH v6 2/4] target/mips: Optimize ILVEV.<B|H|W|D> " Mateja Marjanovic
2019-04-04 13:42 ` Philippe Mathieu-Daudé
2019-04-04 18:19 ` Aleksandar Markovic
2019-04-04 19:17 ` Philippe Mathieu-Daudé
2019-04-05 0:26 ` Aleksandar Markovic
2019-04-05 0:26 ` Aleksandar Markovic
2019-04-17 12:45 ` Mateja Marjanovic
2019-04-17 12:45 ` Mateja Marjanovic
2019-04-13 16:05 ` Aleksandar Markovic
2019-04-13 16:05 ` Aleksandar Markovic
2019-04-15 13:48 ` Mateja Marjanovic
2019-04-15 13:48 ` Mateja Marjanovic
2019-04-04 13:14 ` [Qemu-devel] [PATCH v6 3/4] target/mips: Optimize ILVL.<B|H|W|D> " Mateja Marjanovic
2019-04-13 16:15 ` Aleksandar Markovic
2019-04-13 16:15 ` Aleksandar Markovic
2019-04-04 13:14 ` [Qemu-devel] [PATCH v6 4/4] target/mips: Optimize ILVR.<B|H|W|D> " Mateja Marjanovic
2019-04-13 16:05 ` Aleksandar Markovic [this message]
2019-04-13 16:05 ` Aleksandar Markovic
2019-04-15 11:24 ` Mateja Marjanovic
2019-04-15 11:24 ` Mateja Marjanovic
2019-04-16 21:20 ` Aleksandar Markovic
2019-04-16 21:20 ` Aleksandar Markovic
2019-04-17 8:16 ` Mateja Marjanovic
2019-04-17 8:16 ` Mateja Marjanovic
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to='CAL1e-=i3H+aMaOCqMBkK4r9E4Ezqtyqw=VbwNmJZ6Wnc3in5PA@mail.gmail.com' \
--to=aleksandar.m.mail@gmail.com \
--cc=amarkovic@wavecomp.com \
--cc=arikalo@wavecomp.com \
--cc=aurelien@aurel32.net \
--cc=mateja.marjanovic@rt-rk.com \
--cc=philmd@redhat.com \
--cc=qemu-devel@nongnu.org \
--cc=richard.henderson@linaro.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.