All of lore.kernel.org
 help / color / mirror / Atom feed
* [Qemu-devel] [PATCH 0/3] target-arm: cache tbflags in CPUARMState
@ 2016-09-14  9:56 Paolo Bonzini
  2016-09-14  9:56 ` [Qemu-devel] [PATCH 1/3] target-arm: introduce cpu_dynamic_tb_cpu_flags Paolo Bonzini
                   ` (4 more replies)
  0 siblings, 5 replies; 11+ messages in thread
From: Paolo Bonzini @ 2016-09-14  9:56 UTC (permalink / raw)
  To: qemu-devel

Computing TranslationBlock flags is pretty expensive on ARM, especially
32-bit.  Because tbflags are computed on every tb lookup, it is not
unlikely to see cpu_get_tb_cpu_state close to the top of the profile
now that QHT makes the hash table much more efficient.

However, most tbflags only change when the EL is switched or after
MSR instructions.  Based on this observation, this series caches these
tbflags in CPUARMState, resulting in a 10-15% speedup on 32-bit code.

Paolo

Paolo Bonzini (3):
  target-arm: introduce cpu_dynamic_tb_cpu_flags
  target-arm: add env->tbflags
  target-arm: cache most tbflags

 target-arm/cpu.c           |  2 ++
 target-arm/cpu.h           | 58 ++++++++++++++++++++++++++++++++--------------
 target-arm/helper.c        |  2 ++
 target-arm/helper.h        |  1 +
 target-arm/op_helper.c     |  7 ++++++
 target-arm/translate-a64.c |  4 ++++
 target-arm/translate.c     | 12 ++++++++--
 target-arm/translate.h     |  1 +
 8 files changed, 68 insertions(+), 19 deletions(-)

-- 
2.7.4

^ permalink raw reply	[flat|nested] 11+ messages in thread

* [Qemu-devel] [PATCH 1/3] target-arm: introduce cpu_dynamic_tb_cpu_flags
  2016-09-14  9:56 [Qemu-devel] [PATCH 0/3] target-arm: cache tbflags in CPUARMState Paolo Bonzini
@ 2016-09-14  9:56 ` Paolo Bonzini
  2016-09-14  9:56 ` [Qemu-devel] [PATCH 2/3] target-arm: add env->tbflags Paolo Bonzini
                   ` (3 subsequent siblings)
  4 siblings, 0 replies; 11+ messages in thread
From: Paolo Bonzini @ 2016-09-14  9:56 UTC (permalink / raw)
  To: qemu-devel

For now, this just moves computation of the TranslationBlock flags to a
separate function.  Later on, dynamic flags will be distinct from static
flags in that static flags need not be recomputed on every
TranslationBlock lookup; hence the name of the function.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 target-arm/cpu.h | 43 ++++++++++++++++++++++++++++---------------
 1 file changed, 28 insertions(+), 15 deletions(-)

diff --git a/target-arm/cpu.h b/target-arm/cpu.h
index 76d824d..ef195bd 100644
--- a/target-arm/cpu.h
+++ b/target-arm/cpu.h
@@ -2319,31 +2319,30 @@ static inline bool arm_cpu_bswap_data(CPUARMState *env)
 }
 #endif
 
-static inline void cpu_get_tb_cpu_state(CPUARMState *env, target_ulong *pc,
-                                        target_ulong *cs_base, uint32_t *flags)
+static inline uint32_t cpu_dynamic_tb_cpu_flags(CPUARMState *env)
 {
+    uint32_t flags = 0;
+
     if (is_a64(env)) {
-        *pc = env->pc;
-        *flags = ARM_TBFLAG_AARCH64_STATE_MASK;
+        flags |= ARM_TBFLAG_AARCH64_STATE_MASK;
     } else {
-        *pc = env->regs[15];
-        *flags = (env->thumb << ARM_TBFLAG_THUMB_SHIFT)
+        flags |= (env->thumb << ARM_TBFLAG_THUMB_SHIFT)
             | (env->vfp.vec_len << ARM_TBFLAG_VECLEN_SHIFT)
             | (env->vfp.vec_stride << ARM_TBFLAG_VECSTRIDE_SHIFT)
             | (env->condexec_bits << ARM_TBFLAG_CONDEXEC_SHIFT)
             | (arm_sctlr_b(env) << ARM_TBFLAG_SCTLR_B_SHIFT);
         if (!(access_secure_reg(env))) {
-            *flags |= ARM_TBFLAG_NS_MASK;
+            flags |= ARM_TBFLAG_NS_MASK;
         }
         if (env->vfp.xregs[ARM_VFP_FPEXC] & (1 << 30)
             || arm_el_is_aa64(env, 1)) {
-            *flags |= ARM_TBFLAG_VFPEN_MASK;
+            flags |= ARM_TBFLAG_VFPEN_MASK;
         }
-        *flags |= (extract32(env->cp15.c15_cpar, 0, 2)
+        flags |= (extract32(env->cp15.c15_cpar, 0, 2)
                    << ARM_TBFLAG_XSCALE_CPAR_SHIFT);
     }
 
-    *flags |= (cpu_mmu_index(env, false) << ARM_TBFLAG_MMUIDX_SHIFT);
+    flags |= (cpu_mmu_index(env, false) << ARM_TBFLAG_MMUIDX_SHIFT);
     /* The SS_ACTIVE and PSTATE_SS bits correspond to the state machine
      * states defined in the ARM ARM for software singlestep:
      *  SS_ACTIVE   PSTATE.SS   State
@@ -2352,21 +2351,35 @@ static inline void cpu_get_tb_cpu_state(CPUARMState *env, target_ulong *pc,
      *     1            1       Active-not-pending
      */
     if (arm_singlestep_active(env)) {
-        *flags |= ARM_TBFLAG_SS_ACTIVE_MASK;
+        flags |= ARM_TBFLAG_SS_ACTIVE_MASK;
         if (is_a64(env)) {
             if (env->pstate & PSTATE_SS) {
-                *flags |= ARM_TBFLAG_PSTATE_SS_MASK;
+                flags |= ARM_TBFLAG_PSTATE_SS_MASK;
             }
         } else {
             if (env->uncached_cpsr & PSTATE_SS) {
-                *flags |= ARM_TBFLAG_PSTATE_SS_MASK;
+                flags |= ARM_TBFLAG_PSTATE_SS_MASK;
             }
         }
     }
     if (arm_cpu_data_is_big_endian(env)) {
-        *flags |= ARM_TBFLAG_BE_DATA_MASK;
+        flags |= ARM_TBFLAG_BE_DATA_MASK;
     }
-    *flags |= fp_exception_el(env) << ARM_TBFLAG_FPEXC_EL_SHIFT;
+    flags |= fp_exception_el(env) << ARM_TBFLAG_FPEXC_EL_SHIFT;
+
+    return flags;
+}
+
+static inline void cpu_get_tb_cpu_state(CPUARMState *env, target_ulong *pc,
+                                        target_ulong *cs_base, uint32_t *flags)
+{
+    *flags = cpu_dynamic_tb_cpu_flags(env);
+
+    if (is_a64(env)) {
+        *pc = env->pc;
+    } else {
+        *pc = env->regs[15];
+    }
 
     *cs_base = 0;
 }
-- 
2.7.4

^ permalink raw reply	[flat|nested] 11+ messages in thread

* [Qemu-devel] [PATCH 2/3] target-arm: add env->tbflags
  2016-09-14  9:56 [Qemu-devel] [PATCH 0/3] target-arm: cache tbflags in CPUARMState Paolo Bonzini
  2016-09-14  9:56 ` [Qemu-devel] [PATCH 1/3] target-arm: introduce cpu_dynamic_tb_cpu_flags Paolo Bonzini
@ 2016-09-14  9:56 ` Paolo Bonzini
  2016-09-26 22:00   ` Peter Maydell
  2016-09-14  9:56 ` [Qemu-devel] [PATCH 3/3] target-arm: cache most tbflags Paolo Bonzini
                   ` (2 subsequent siblings)
  4 siblings, 1 reply; 11+ messages in thread
From: Paolo Bonzini @ 2016-09-14  9:56 UTC (permalink / raw)
  To: qemu-devel

Computing TranslationBlock flags is pretty expensive on ARM, especially
32-bit.  In order to limit the cost we want to cache as many of them
as possible.  Therefore, the flags are split in two parts.  Static flags
come directly from a new CPUARMState field env->tbflags, and are updated
whenever EL changes (i.e. on exceptions and exception returns) or after
MSR instructions.

Dynamic flags are computed on the fly by cpu_get_tb_cpu_state (which
calls cpu_dynamic_tb_cpu_flags to retrieve them), same as before.
As of this patch, all flags are dynamic and env->tbflags is always 0,
so this patch adds the infrastructure but does not do any caching yet.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 target-arm/cpu.c           |  2 ++
 target-arm/cpu.h           | 10 +++++++++-
 target-arm/helper.c        |  2 ++
 target-arm/helper.h        |  1 +
 target-arm/op_helper.c     |  7 +++++++
 target-arm/translate-a64.c |  4 ++++
 target-arm/translate.c     | 12 ++++++++++--
 target-arm/translate.h     |  1 +
 8 files changed, 36 insertions(+), 3 deletions(-)

diff --git a/target-arm/cpu.c b/target-arm/cpu.c
index ce8b8f4..189ceab 100644
--- a/target-arm/cpu.c
+++ b/target-arm/cpu.c
@@ -225,6 +225,8 @@ static void arm_cpu_reset(CPUState *s)
                               &env->vfp.fp_status);
     set_float_detect_tininess(float_tininess_before_rounding,
                               &env->vfp.standard_fp_status);
+
+    env->tbflags = cpu_get_tb_cpu_flags(env);
     tlb_flush(s, 1);
 
 #ifndef CONFIG_USER_ONLY
diff --git a/target-arm/cpu.h b/target-arm/cpu.h
index ef195bd..5918df5 100644
--- a/target-arm/cpu.h
+++ b/target-arm/cpu.h
@@ -155,6 +155,7 @@ typedef struct CPUARMState {
      */
     uint32_t pstate;
     uint32_t aarch64; /* 1 if CPU is in aarch64 state; inverse of PSTATE.nRW */
+    uint32_t tbflags;
 
     /* Frequently accessed CPSR bits are stored separately for efficiency.
        This contains all the other bits.  Use cpsr_{read,write} to access
@@ -2370,10 +2371,17 @@ static inline uint32_t cpu_dynamic_tb_cpu_flags(CPUARMState *env)
     return flags;
 }
 
+static inline uint32_t cpu_get_tb_cpu_flags(CPUARMState *env)
+{
+    uint32_t flags = 0;
+
+    return flags;
+}
+
 static inline void cpu_get_tb_cpu_state(CPUARMState *env, target_ulong *pc,
                                         target_ulong *cs_base, uint32_t *flags)
 {
-    *flags = cpu_dynamic_tb_cpu_flags(env);
+    *flags = env->tbflags | cpu_dynamic_tb_cpu_flags(env);
 
     if (is_a64(env)) {
         *pc = env->pc;
diff --git a/target-arm/helper.c b/target-arm/helper.c
index bdb842c..0b4f6de 100644
--- a/target-arm/helper.c
+++ b/target-arm/helper.c
@@ -6068,6 +6068,7 @@ void arm_v7m_cpu_do_interrupt(CPUState *cs)
     addr = ldl_phys(cs->as, env->v7m.vecbase + env->v7m.exception * 4);
     env->regs[15] = addr & 0xfffffffe;
     env->thumb = addr & 1;
+    env->tbflags = cpu_get_tb_cpu_flags(env);
 }
 
 /* Function used to synchronize QEMU's AArch64 register set with AArch32
@@ -6642,6 +6643,7 @@ void arm_cpu_do_interrupt(CPUState *cs)
         arm_cpu_do_interrupt_aarch32(cs);
     }
 
+    env->tbflags = cpu_get_tb_cpu_flags(env);
     arm_call_el_change_hook(cpu);
 
     if (!kvm_enabled()) {
diff --git a/target-arm/helper.h b/target-arm/helper.h
index 84aa637..21a0e35 100644
--- a/target-arm/helper.h
+++ b/target-arm/helper.h
@@ -73,6 +73,7 @@ DEF_HELPER_2(get_cp_reg64, i64, env, ptr)
 DEF_HELPER_3(msr_i_pstate, void, env, i32, i32)
 DEF_HELPER_1(clear_pstate_ss, void, env)
 DEF_HELPER_1(exception_return, void, env)
+DEF_HELPER_FLAGS_1(compute_tbflags, TCG_CALL_NO_WG_SE, i32, env)
 
 DEF_HELPER_2(get_r13_banked, i32, env, i32)
 DEF_HELPER_3(set_r13_banked, void, env, i32, i32)
diff --git a/target-arm/op_helper.c b/target-arm/op_helper.c
index be27b21..e5d7cfc 100644
--- a/target-arm/op_helper.c
+++ b/target-arm/op_helper.c
@@ -479,6 +479,7 @@ void HELPER(cpsr_write_eret)(CPUARMState *env, uint32_t val)
 {
     cpsr_write(env, val, CPSR_ERET_MASK, CPSRWriteExceptionReturn);
 
+    env->tbflags = cpu_get_tb_cpu_flags(env);
     arm_call_el_change_hook(arm_env_get_cpu(env));
 }
 
@@ -975,6 +976,7 @@ void HELPER(exception_return)(CPUARMState *env)
         env->pc = env->elr_el[cur_el];
     }
 
+    env->tbflags = cpu_get_tb_cpu_flags(env);
     arm_call_el_change_hook(arm_env_get_cpu(env));
 
     return;
@@ -1326,3 +1328,8 @@ uint32_t HELPER(ror_cc)(CPUARMState *env, uint32_t x, uint32_t i)
         return ((uint32_t)x >> shift) | (x << (32 - shift));
     }
 }
+
+uint32_t HELPER(compute_tbflags)(CPUARMState *env)
+{
+    return cpu_get_tb_cpu_flags(env);
+}
diff --git a/target-arm/translate-a64.c b/target-arm/translate-a64.c
index f5e29d2..7c355b0 100644
--- a/target-arm/translate-a64.c
+++ b/target-arm/translate-a64.c
@@ -1507,6 +1507,10 @@ static void handle_sys(DisasContext *s, uint32_t insn, bool isread,
         }
     }
 
+    if (!isread) {
+        gen_helper_compute_tbflags(cpu_tbflags, cpu_env);
+    }
+
     if ((s->tb->cflags & CF_USE_ICOUNT) && (ri->type & ARM_CP_IO)) {
         /* I/O operations must end the TB here (whether read or write) */
         gen_io_end();
diff --git a/target-arm/translate.c b/target-arm/translate.c
index bd5d5cb..b4d8fe7 100644
--- a/target-arm/translate.c
+++ b/target-arm/translate.c
@@ -68,6 +68,7 @@ TCGv_i64 cpu_exclusive_val;
 TCGv_i64 cpu_exclusive_test;
 TCGv_i32 cpu_exclusive_info;
 #endif
+TCGv_i32 cpu_tbflags;
 
 /* FIXME:  These should be removed.  */
 static TCGv_i32 cpu_F0s, cpu_F1s;
@@ -107,6 +108,8 @@ void arm_translate_init(void)
     cpu_exclusive_info = tcg_global_mem_new_i32(cpu_env,
         offsetof(CPUARMState, exclusive_info), "exclusive_info");
 #endif
+    cpu_tbflags = tcg_global_mem_new_i32(cpu_env,
+        offsetof(CPUARMState, tbflags), "tbflags");
 
     a64_translate_init();
 }
@@ -1135,6 +1138,7 @@ static void gen_exception_insn(DisasContext *s, int offset, int excp,
 /* Force a TB lookup after an instruction that changes the CPU state.  */
 static inline void gen_lookup_tb(DisasContext *s)
 {
+    gen_helper_compute_tbflags(cpu_tbflags, cpu_env);
     tcg_gen_movi_i32(cpu_R[15], s->pc & ~1);
     s->is_jmp = DISAS_JUMP;
 }
@@ -7630,12 +7634,16 @@ static int disas_coproc_insn(DisasContext *s, uint32_t insn)
             /* I/O operations must end the TB here (whether read or write) */
             gen_io_end();
             gen_lookup_tb(s);
-        } else if (!isread && !(ri->type & ARM_CP_SUPPRESS_TB_END)) {
+        } else if (!isread) {
             /* We default to ending the TB on a coprocessor register write,
              * but allow this to be suppressed by the register definition
              * (usually only necessary to work around guest bugs).
              */
-            gen_lookup_tb(s);
+            if (ri->type & ARM_CP_SUPPRESS_TB_END) {
+                gen_helper_compute_tbflags(cpu_tbflags, cpu_env);
+            } else {
+                gen_lookup_tb(s);
+            }
         }
 
         return 0;
diff --git a/target-arm/translate.h b/target-arm/translate.h
index dbd7ac8..d269f4c 100644
--- a/target-arm/translate.h
+++ b/target-arm/translate.h
@@ -81,6 +81,7 @@ extern TCGv_i64 cpu_exclusive_val;
 extern TCGv_i64 cpu_exclusive_test;
 extern TCGv_i32 cpu_exclusive_info;
 #endif
+extern TCGv_i32 cpu_tbflags;
 
 static inline int arm_dc_feature(DisasContext *dc, int feature)
 {
-- 
2.7.4

^ permalink raw reply	[flat|nested] 11+ messages in thread

* [Qemu-devel] [PATCH 3/3] target-arm: cache most tbflags
  2016-09-14  9:56 [Qemu-devel] [PATCH 0/3] target-arm: cache tbflags in CPUARMState Paolo Bonzini
  2016-09-14  9:56 ` [Qemu-devel] [PATCH 1/3] target-arm: introduce cpu_dynamic_tb_cpu_flags Paolo Bonzini
  2016-09-14  9:56 ` [Qemu-devel] [PATCH 2/3] target-arm: add env->tbflags Paolo Bonzini
@ 2016-09-14  9:56 ` Paolo Bonzini
  2016-09-26 10:04 ` [Qemu-devel] [PATCH 0/3] target-arm: cache tbflags in CPUARMState Laurent Desnogues
  2016-11-10 11:42 ` Alex Bennée
  4 siblings, 0 replies; 11+ messages in thread
From: Paolo Bonzini @ 2016-09-14  9:56 UTC (permalink / raw)
  To: qemu-devel

Finally, this patch makes most flags static.  The only remaining
dynamic flags (only used in aarch32 mode) are for Thumb mode and
conditional execution.  These are modified more often than the
others, and are cheap therefore they are looked up directly from
env on every TB lookup.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 target-arm/cpu.h | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/target-arm/cpu.h b/target-arm/cpu.h
index 5918df5..0b72740 100644
--- a/target-arm/cpu.h
+++ b/target-arm/cpu.h
@@ -2320,17 +2320,15 @@ static inline bool arm_cpu_bswap_data(CPUARMState *env)
 }
 #endif
 
-static inline uint32_t cpu_dynamic_tb_cpu_flags(CPUARMState *env)
+static inline uint32_t cpu_get_tb_cpu_flags(CPUARMState *env)
 {
     uint32_t flags = 0;
 
     if (is_a64(env)) {
         flags |= ARM_TBFLAG_AARCH64_STATE_MASK;
     } else {
-        flags |= (env->thumb << ARM_TBFLAG_THUMB_SHIFT)
-            | (env->vfp.vec_len << ARM_TBFLAG_VECLEN_SHIFT)
+        flags |= (env->vfp.vec_len << ARM_TBFLAG_VECLEN_SHIFT)
             | (env->vfp.vec_stride << ARM_TBFLAG_VECSTRIDE_SHIFT)
-            | (env->condexec_bits << ARM_TBFLAG_CONDEXEC_SHIFT)
             | (arm_sctlr_b(env) << ARM_TBFLAG_SCTLR_B_SHIFT);
         if (!(access_secure_reg(env))) {
             flags |= ARM_TBFLAG_NS_MASK;
@@ -2371,10 +2369,15 @@ static inline uint32_t cpu_dynamic_tb_cpu_flags(CPUARMState *env)
     return flags;
 }
 
-static inline uint32_t cpu_get_tb_cpu_flags(CPUARMState *env)
+static inline uint32_t cpu_dynamic_tb_cpu_flags(CPUARMState *env)
 {
     uint32_t flags = 0;
 
+    if (!is_a64(env)) {
+        flags |= (env->thumb << ARM_TBFLAG_THUMB_SHIFT)
+            | (env->condexec_bits << ARM_TBFLAG_CONDEXEC_SHIFT);
+    }
+
     return flags;
 }
 
-- 
2.7.4

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [Qemu-devel] [PATCH 0/3] target-arm: cache tbflags in CPUARMState
  2016-09-14  9:56 [Qemu-devel] [PATCH 0/3] target-arm: cache tbflags in CPUARMState Paolo Bonzini
                   ` (2 preceding siblings ...)
  2016-09-14  9:56 ` [Qemu-devel] [PATCH 3/3] target-arm: cache most tbflags Paolo Bonzini
@ 2016-09-26 10:04 ` Laurent Desnogues
  2016-09-26 11:13   ` Laurent Desnogues
  2016-11-10 11:42 ` Alex Bennée
  4 siblings, 1 reply; 11+ messages in thread
From: Laurent Desnogues @ 2016-09-26 10:04 UTC (permalink / raw)
  To: Paolo Bonzini; +Cc: qemu-devel

Hello,

On Wed, Sep 14, 2016 at 11:56 AM, Paolo Bonzini <pbonzini@redhat.com> wrote:
> Computing TranslationBlock flags is pretty expensive on ARM, especially
> 32-bit.  Because tbflags are computed on every tb lookup, it is not
> unlikely to see cpu_get_tb_cpu_state close to the top of the profile
> now that QHT makes the hash table much more efficient.
>
> However, most tbflags only change when the EL is switched or after
> MSR instructions.  Based on this observation, this series caches these
> tbflags in CPUARMState, resulting in a 10-15% speedup on 32-bit code.

I like that patch!

I quickly tested with some softmmu images on both AArch32 and AArch64
and I can confirm the speedup.

As far as your patch goes:

Tested-by: Laurent Desnogues <laurent.desnogues@gmail.com>
Reviewed-by: Laurent Desnogues <laurent.desnogues@gmail.com>

Thanks,

Laurent

PS - BTW, I couldn't run any user mode program since they segfault on
mainline for some reason I have no time to look into.  The v2.7.0 tag
works.


> Paolo
>
> Paolo Bonzini (3):
>   target-arm: introduce cpu_dynamic_tb_cpu_flags
>   target-arm: add env->tbflags
>   target-arm: cache most tbflags
>
>  target-arm/cpu.c           |  2 ++
>  target-arm/cpu.h           | 58 ++++++++++++++++++++++++++++++++--------------
>  target-arm/helper.c        |  2 ++
>  target-arm/helper.h        |  1 +
>  target-arm/op_helper.c     |  7 ++++++
>  target-arm/translate-a64.c |  4 ++++
>  target-arm/translate.c     | 12 ++++++++--
>  target-arm/translate.h     |  1 +
>  8 files changed, 68 insertions(+), 19 deletions(-)
>
> --
> 2.7.4
>
>

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [Qemu-devel] [PATCH 0/3] target-arm: cache tbflags in CPUARMState
  2016-09-26 10:04 ` [Qemu-devel] [PATCH 0/3] target-arm: cache tbflags in CPUARMState Laurent Desnogues
@ 2016-09-26 11:13   ` Laurent Desnogues
  0 siblings, 0 replies; 11+ messages in thread
From: Laurent Desnogues @ 2016-09-26 11:13 UTC (permalink / raw)
  To: Paolo Bonzini; +Cc: qemu-devel

On Mon, Sep 26, 2016 at 12:04 PM, Laurent Desnogues
<laurent.desnogues@gmail.com> wrote:
> Hello,
>
> On Wed, Sep 14, 2016 at 11:56 AM, Paolo Bonzini <pbonzini@redhat.com> wrote:
>> Computing TranslationBlock flags is pretty expensive on ARM, especially
>> 32-bit.  Because tbflags are computed on every tb lookup, it is not
>> unlikely to see cpu_get_tb_cpu_state close to the top of the profile
>> now that QHT makes the hash table much more efficient.
>>
>> However, most tbflags only change when the EL is switched or after
>> MSR instructions.  Based on this observation, this series caches these
>> tbflags in CPUARMState, resulting in a 10-15% speedup on 32-bit code.
>
> I like that patch!
>
> I quickly tested with some softmmu images on both AArch32 and AArch64
> and I can confirm the speedup.
>
> As far as your patch goes:
>
> Tested-by: Laurent Desnogues <laurent.desnogues@gmail.com>
> Reviewed-by: Laurent Desnogues <laurent.desnogues@gmail.com>
>
> Thanks,
>
> Laurent
>
> PS - BTW, I couldn't run any user mode program since they segfault on
> mainline for some reason I have no time to look into.  The v2.7.0 tag
> works.

It turned out this was a mistake on my side.

I ran one SPEC2k test with the patch in user mode, and got a few
percent improvements for both AArch32 and AArch64.

Thanks,

Laurent

>
>> Paolo
>>
>> Paolo Bonzini (3):
>>   target-arm: introduce cpu_dynamic_tb_cpu_flags
>>   target-arm: add env->tbflags
>>   target-arm: cache most tbflags
>>
>>  target-arm/cpu.c           |  2 ++
>>  target-arm/cpu.h           | 58 ++++++++++++++++++++++++++++++++--------------
>>  target-arm/helper.c        |  2 ++
>>  target-arm/helper.h        |  1 +
>>  target-arm/op_helper.c     |  7 ++++++
>>  target-arm/translate-a64.c |  4 ++++
>>  target-arm/translate.c     | 12 ++++++++--
>>  target-arm/translate.h     |  1 +
>>  8 files changed, 68 insertions(+), 19 deletions(-)
>>
>> --
>> 2.7.4
>>
>>

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [Qemu-devel] [PATCH 2/3] target-arm: add env->tbflags
  2016-09-14  9:56 ` [Qemu-devel] [PATCH 2/3] target-arm: add env->tbflags Paolo Bonzini
@ 2016-09-26 22:00   ` Peter Maydell
  2016-09-27  8:15     ` Paolo Bonzini
  0 siblings, 1 reply; 11+ messages in thread
From: Peter Maydell @ 2016-09-26 22:00 UTC (permalink / raw)
  To: Paolo Bonzini; +Cc: QEMU Developers

On 14 September 2016 at 02:56, Paolo Bonzini <pbonzini@redhat.com> wrote:
> Computing TranslationBlock flags is pretty expensive on ARM, especially
> 32-bit.  In order to limit the cost we want to cache as many of them
> as possible.  Therefore, the flags are split in two parts.  Static flags
> come directly from a new CPUARMState field env->tbflags, and are updated
> whenever EL changes (i.e. on exceptions and exception returns) or after
> MSR instructions.

Doing this for all MSR writes is a bit sad, because a lot of them
don't actually change the TB flags, and quite a few of them which
previously we were able to code to not have to do a helper call
at all (direct writes to fields) now get a pointless helper call.

You're also recalculating more often than stated here, in that
you also recalc on any gen_lookup_tb() call in the 32-bit
decoder. (This is just as well because for instance vec_len
and vec_stride aren't set via the cp15 system register write
path.)

You're treating the PSTATE_SS flag as static, but you don't
have anything which causes a recalculation of it on the code
path which changes it (gen_ss_advance()).

The 32-bit SETEND instruction changes CPSR_E, which has
an effect on the BE_DATA_MASK flag, but I don't think
that code path will cause us to recalculate flags.

I found this patch kind of difficult to review because
it isn't obvious why we recalculate the static flags at
the points where we do (ie whether those points are
necessary and sufficient for correct behaviour). Most
of the comments above are the result of my looking at
whether some particular flags that I suspected of being
tricky were handled correctly :-)

Is there a possibility of having an assertion somewhere that
the static flags we think we have actually match the
old dynamic flags?

> Dynamic flags are computed on the fly by cpu_get_tb_cpu_state (which
> calls cpu_dynamic_tb_cpu_flags to retrieve them), same as before.
> As of this patch, all flags are dynamic and env->tbflags is always 0,
> so this patch adds the infrastructure but does not do any caching yet.
>
> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
> ---
>  target-arm/cpu.c           |  2 ++
>  target-arm/cpu.h           | 10 +++++++++-
>  target-arm/helper.c        |  2 ++
>  target-arm/helper.h        |  1 +
>  target-arm/op_helper.c     |  7 +++++++
>  target-arm/translate-a64.c |  4 ++++
>  target-arm/translate.c     | 12 ++++++++++--
>  target-arm/translate.h     |  1 +
>  8 files changed, 36 insertions(+), 3 deletions(-)
>
> diff --git a/target-arm/cpu.c b/target-arm/cpu.c
> index ce8b8f4..189ceab 100644
> --- a/target-arm/cpu.c
> +++ b/target-arm/cpu.c
> @@ -225,6 +225,8 @@ static void arm_cpu_reset(CPUState *s)
>                                &env->vfp.fp_status);
>      set_float_detect_tininess(float_tininess_before_rounding,
>                                &env->vfp.standard_fp_status);
> +
> +    env->tbflags = cpu_get_tb_cpu_flags(env);
>      tlb_flush(s, 1);
>
>  #ifndef CONFIG_USER_ONLY
> diff --git a/target-arm/cpu.h b/target-arm/cpu.h
> index ef195bd..5918df5 100644
> --- a/target-arm/cpu.h
> +++ b/target-arm/cpu.h
> @@ -155,6 +155,7 @@ typedef struct CPUARMState {
>       */
>      uint32_t pstate;
>      uint32_t aarch64; /* 1 if CPU is in aarch64 state; inverse of PSTATE.nRW */
> +    uint32_t tbflags;
>
>      /* Frequently accessed CPSR bits are stored separately for efficiency.
>         This contains all the other bits.  Use cpsr_{read,write} to access
> @@ -2370,10 +2371,17 @@ static inline uint32_t cpu_dynamic_tb_cpu_flags(CPUARMState *env)
>      return flags;
>  }
>
> +static inline uint32_t cpu_get_tb_cpu_flags(CPUARMState *env)
> +{
> +    uint32_t flags = 0;
> +
> +    return flags;
> +}
> +
>  static inline void cpu_get_tb_cpu_state(CPUARMState *env, target_ulong *pc,
>                                          target_ulong *cs_base, uint32_t *flags)
>  {
> -    *flags = cpu_dynamic_tb_cpu_flags(env);
> +    *flags = env->tbflags | cpu_dynamic_tb_cpu_flags(env);
>
>      if (is_a64(env)) {
>          *pc = env->pc;
> diff --git a/target-arm/helper.c b/target-arm/helper.c
> index bdb842c..0b4f6de 100644
> --- a/target-arm/helper.c
> +++ b/target-arm/helper.c
> @@ -6068,6 +6068,7 @@ void arm_v7m_cpu_do_interrupt(CPUState *cs)
>      addr = ldl_phys(cs->as, env->v7m.vecbase + env->v7m.exception * 4);
>      env->regs[15] = addr & 0xfffffffe;
>      env->thumb = addr & 1;
> +    env->tbflags = cpu_get_tb_cpu_flags(env);
>  }
>
>  /* Function used to synchronize QEMU's AArch64 register set with AArch32
> @@ -6642,6 +6643,7 @@ void arm_cpu_do_interrupt(CPUState *cs)
>          arm_cpu_do_interrupt_aarch32(cs);
>      }
>
> +    env->tbflags = cpu_get_tb_cpu_flags(env);
>      arm_call_el_change_hook(cpu);
>
>      if (!kvm_enabled()) {
> diff --git a/target-arm/helper.h b/target-arm/helper.h
> index 84aa637..21a0e35 100644
> --- a/target-arm/helper.h
> +++ b/target-arm/helper.h
> @@ -73,6 +73,7 @@ DEF_HELPER_2(get_cp_reg64, i64, env, ptr)
>  DEF_HELPER_3(msr_i_pstate, void, env, i32, i32)
>  DEF_HELPER_1(clear_pstate_ss, void, env)
>  DEF_HELPER_1(exception_return, void, env)
> +DEF_HELPER_FLAGS_1(compute_tbflags, TCG_CALL_NO_WG_SE, i32, env)
>
>  DEF_HELPER_2(get_r13_banked, i32, env, i32)
>  DEF_HELPER_3(set_r13_banked, void, env, i32, i32)
> diff --git a/target-arm/op_helper.c b/target-arm/op_helper.c
> index be27b21..e5d7cfc 100644
> --- a/target-arm/op_helper.c
> +++ b/target-arm/op_helper.c
> @@ -479,6 +479,7 @@ void HELPER(cpsr_write_eret)(CPUARMState *env, uint32_t val)
>  {
>      cpsr_write(env, val, CPSR_ERET_MASK, CPSRWriteExceptionReturn);
>
> +    env->tbflags = cpu_get_tb_cpu_flags(env);
>      arm_call_el_change_hook(arm_env_get_cpu(env));
>  }
>
> @@ -975,6 +976,7 @@ void HELPER(exception_return)(CPUARMState *env)
>          env->pc = env->elr_el[cur_el];
>      }
>
> +    env->tbflags = cpu_get_tb_cpu_flags(env);
>      arm_call_el_change_hook(arm_env_get_cpu(env));
>
>      return;
> @@ -1326,3 +1328,8 @@ uint32_t HELPER(ror_cc)(CPUARMState *env, uint32_t x, uint32_t i)
>          return ((uint32_t)x >> shift) | (x << (32 - shift));
>      }
>  }
> +
> +uint32_t HELPER(compute_tbflags)(CPUARMState *env)
> +{
> +    return cpu_get_tb_cpu_flags(env);
> +}
> diff --git a/target-arm/translate-a64.c b/target-arm/translate-a64.c
> index f5e29d2..7c355b0 100644
> --- a/target-arm/translate-a64.c
> +++ b/target-arm/translate-a64.c
> @@ -1507,6 +1507,10 @@ static void handle_sys(DisasContext *s, uint32_t insn, bool isread,
>          }
>      }
>
> +    if (!isread) {
> +        gen_helper_compute_tbflags(cpu_tbflags, cpu_env);
> +    }
> +
>      if ((s->tb->cflags & CF_USE_ICOUNT) && (ri->type & ARM_CP_IO)) {
>          /* I/O operations must end the TB here (whether read or write) */
>          gen_io_end();
> diff --git a/target-arm/translate.c b/target-arm/translate.c
> index bd5d5cb..b4d8fe7 100644
> --- a/target-arm/translate.c
> +++ b/target-arm/translate.c
> @@ -68,6 +68,7 @@ TCGv_i64 cpu_exclusive_val;
>  TCGv_i64 cpu_exclusive_test;
>  TCGv_i32 cpu_exclusive_info;
>  #endif
> +TCGv_i32 cpu_tbflags;
>
>  /* FIXME:  These should be removed.  */
>  static TCGv_i32 cpu_F0s, cpu_F1s;
> @@ -107,6 +108,8 @@ void arm_translate_init(void)
>      cpu_exclusive_info = tcg_global_mem_new_i32(cpu_env,
>          offsetof(CPUARMState, exclusive_info), "exclusive_info");
>  #endif
> +    cpu_tbflags = tcg_global_mem_new_i32(cpu_env,
> +        offsetof(CPUARMState, tbflags), "tbflags");
>
>      a64_translate_init();
>  }
> @@ -1135,6 +1138,7 @@ static void gen_exception_insn(DisasContext *s, int offset, int excp,
>  /* Force a TB lookup after an instruction that changes the CPU state.  */
>  static inline void gen_lookup_tb(DisasContext *s)
>  {
> +    gen_helper_compute_tbflags(cpu_tbflags, cpu_env);

Why do we need to do this for 32-bit ARM but there's nothing
similar in the 64-bit ARM decoder?

>      tcg_gen_movi_i32(cpu_R[15], s->pc & ~1);
>      s->is_jmp = DISAS_JUMP;
>  }
> @@ -7630,12 +7634,16 @@ static int disas_coproc_insn(DisasContext *s, uint32_t insn)
>              /* I/O operations must end the TB here (whether read or write) */
>              gen_io_end();
>              gen_lookup_tb(s);
> -        } else if (!isread && !(ri->type & ARM_CP_SUPPRESS_TB_END)) {
> +        } else if (!isread) {
>              /* We default to ending the TB on a coprocessor register write,
>               * but allow this to be suppressed by the register definition
>               * (usually only necessary to work around guest bugs).
>               */
> -            gen_lookup_tb(s);
> +            if (ri->type & ARM_CP_SUPPRESS_TB_END) {
> +                gen_helper_compute_tbflags(cpu_tbflags, cpu_env);
> +            } else {
> +                gen_lookup_tb(s);
> +            }
>          }
>
>          return 0;
> diff --git a/target-arm/translate.h b/target-arm/translate.h
> index dbd7ac8..d269f4c 100644
> --- a/target-arm/translate.h
> +++ b/target-arm/translate.h
> @@ -81,6 +81,7 @@ extern TCGv_i64 cpu_exclusive_val;
>  extern TCGv_i64 cpu_exclusive_test;
>  extern TCGv_i32 cpu_exclusive_info;
>  #endif
> +extern TCGv_i32 cpu_tbflags;
>
>  static inline int arm_dc_feature(DisasContext *dc, int feature)
>  {
> --
> 2.7.4

thanks
-- PMM

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [Qemu-devel] [PATCH 2/3] target-arm: add env->tbflags
  2016-09-26 22:00   ` Peter Maydell
@ 2016-09-27  8:15     ` Paolo Bonzini
  0 siblings, 0 replies; 11+ messages in thread
From: Paolo Bonzini @ 2016-09-27  8:15 UTC (permalink / raw)
  To: Peter Maydell; +Cc: QEMU Developers

> Doing this for all MSR writes is a bit sad, because a lot of them
> don't actually change the TB flags, and quite a few of them which
> previously we were able to code to not have to do a helper call
> at all (direct writes to fields) now get a pointless helper call.

True.  On the other hand, MSR writes terminate the TB so you are
losing all the TB state anyway.  Before these patches you weren't
recomputing the TB flags in the common case of adjacent MSR writes
on the same page (so QEMU could use linked TBs), now you are.
However, given the speedup from the patch, I felt it was premature
optimization.

If there is a case where you get the helper in the profile, it is
possible to add a new ARM_CP_KEEP_TBFLAGS flag to ARMCPRegInfo.

> You're also recalculating more often than stated here, in that
> you also recalc on any gen_lookup_tb() call in the 32-bit
> decoder. (This is just as well because for instance vec_len
> and vec_stride aren't set via the cp15 system register write
> path.)

Right.  This was of course on purpose, but the commit message
was imprecise.

> You're treating the PSTATE_SS flag as static, but you don't
> have anything which causes a recalculation of it on the code
> path which changes it (gen_ss_advance()).
> 
> The 32-bit SETEND instruction changes CPSR_E, which has
> an effect on the BE_DATA_MASK flag, but I don't think
> that code path will cause us to recalculate flags.

This actually points to a bigger deficiency, in that---even
outside the PSTATE_SS and SETEND code paths---both pstate_write
and cpsr_write need to recompute the flags.  But I think that's
the only other case left.

Do you prefer to have the setend and clear_pstate_ss helpers
call cpsr_write/pstate_write, or do you prefer to inline the
modification to the tbflags?

> I found this patch kind of difficult to review because
> it isn't obvious why we recalculate the static flags at
> the points where we do (ie whether those points are
> necessary and sufficient for correct behaviour). Most
> of the comments above are the result of my looking at
> whether some particular flags that I suspected of being
> tricky were handled correctly :-)

You definitely have a point here.  Adding an assertion requires
looking at CPUARMState in gen_intermediate_code.  You're not really
supposed to do that, but I guess it's okay as long as it's for
debugging purposes.

Paolo

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [Qemu-devel] [PATCH 0/3] target-arm: cache tbflags in CPUARMState
  2016-09-14  9:56 [Qemu-devel] [PATCH 0/3] target-arm: cache tbflags in CPUARMState Paolo Bonzini
                   ` (3 preceding siblings ...)
  2016-09-26 10:04 ` [Qemu-devel] [PATCH 0/3] target-arm: cache tbflags in CPUARMState Laurent Desnogues
@ 2016-11-10 11:42 ` Alex Bennée
  2016-11-10 12:19   ` Paolo Bonzini
  4 siblings, 1 reply; 11+ messages in thread
From: Alex Bennée @ 2016-11-10 11:42 UTC (permalink / raw)
  To: Paolo Bonzini; +Cc: qemu-devel


Paolo Bonzini <pbonzini@redhat.com> writes:

> Computing TranslationBlock flags is pretty expensive on ARM, especially
> 32-bit.  Because tbflags are computed on every tb lookup, it is not
> unlikely to see cpu_get_tb_cpu_state close to the top of the profile
> now that QHT makes the hash table much more efficient.
>
> However, most tbflags only change when the EL is switched or after
> MSR instructions.  Based on this observation, this series caches these
> tbflags in CPUARMState, resulting in a 10-15% speedup on 32-bit code.

Hi,

I'm starting to clear out my review queue but I notice these now longer
apply cleanly to master. Where you going to re-issue the series once
you'd addressed Peter's concerns?

My general comments are I think this is a good idea but my concern is
ensuring state changes get picked up and we don't end up with
inconsistent state between real and cached values. I still have the
scars from my last attempt to rationalise cpu.h pstate, aarch64,
uncached_cpsr and spsr!

Cheers,

--
Alex Bennée

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [Qemu-devel] [PATCH 0/3] target-arm: cache tbflags in CPUARMState
  2016-11-10 11:42 ` Alex Bennée
@ 2016-11-10 12:19   ` Paolo Bonzini
  2016-11-10 13:37     ` Alex Bennée
  0 siblings, 1 reply; 11+ messages in thread
From: Paolo Bonzini @ 2016-11-10 12:19 UTC (permalink / raw)
  To: Alex Bennée; +Cc: qemu-devel



On 10/11/2016 12:42, Alex Bennée wrote:
> 
> Paolo Bonzini <pbonzini@redhat.com> writes:
> 
>> Computing TranslationBlock flags is pretty expensive on ARM, especially
>> 32-bit.  Because tbflags are computed on every tb lookup, it is not
>> unlikely to see cpu_get_tb_cpu_state close to the top of the profile
>> now that QHT makes the hash table much more efficient.
>>
>> However, most tbflags only change when the EL is switched or after
>> MSR instructions.  Based on this observation, this series caches these
>> tbflags in CPUARMState, resulting in a 10-15% speedup on 32-bit code.
> 
> Hi,
> 
> I'm starting to clear out my review queue but I notice these now longer
> apply cleanly to master. Where you going to re-issue the series once
> you'd addressed Peter's concerns?

Yes, I didn't want to send them two days before soft freeze and Peter's
vacation. :)

Paolo

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [Qemu-devel] [PATCH 0/3] target-arm: cache tbflags in CPUARMState
  2016-11-10 12:19   ` Paolo Bonzini
@ 2016-11-10 13:37     ` Alex Bennée
  0 siblings, 0 replies; 11+ messages in thread
From: Alex Bennée @ 2016-11-10 13:37 UTC (permalink / raw)
  To: Paolo Bonzini; +Cc: qemu-devel


Paolo Bonzini <pbonzini@redhat.com> writes:

> On 10/11/2016 12:42, Alex Bennée wrote:
>>
>> Paolo Bonzini <pbonzini@redhat.com> writes:
>>
>>> Computing TranslationBlock flags is pretty expensive on ARM, especially
>>> 32-bit.  Because tbflags are computed on every tb lookup, it is not
>>> unlikely to see cpu_get_tb_cpu_state close to the top of the profile
>>> now that QHT makes the hash table much more efficient.
>>>
>>> However, most tbflags only change when the EL is switched or after
>>> MSR instructions.  Based on this observation, this series caches these
>>> tbflags in CPUARMState, resulting in a 10-15% speedup on 32-bit code.
>>
>> Hi,
>>
>> I'm starting to clear out my review queue but I notice these now longer
>> apply cleanly to master. Where you going to re-issue the series once
>> you'd addressed Peter's concerns?
>
> Yes, I didn't want to send them two days before soft freeze and Peter's
> vacation. :)

Cool. Well I'll happily look at it while he's away as I think I need to
learn more about the flag mechanism anyway. Unless there are any
regressions that have come up during soft-freeze that I need to look at?

--
Alex Bennée

^ permalink raw reply	[flat|nested] 11+ messages in thread

end of thread, other threads:[~2016-11-10 13:37 UTC | newest]

Thread overview: 11+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2016-09-14  9:56 [Qemu-devel] [PATCH 0/3] target-arm: cache tbflags in CPUARMState Paolo Bonzini
2016-09-14  9:56 ` [Qemu-devel] [PATCH 1/3] target-arm: introduce cpu_dynamic_tb_cpu_flags Paolo Bonzini
2016-09-14  9:56 ` [Qemu-devel] [PATCH 2/3] target-arm: add env->tbflags Paolo Bonzini
2016-09-26 22:00   ` Peter Maydell
2016-09-27  8:15     ` Paolo Bonzini
2016-09-14  9:56 ` [Qemu-devel] [PATCH 3/3] target-arm: cache most tbflags Paolo Bonzini
2016-09-26 10:04 ` [Qemu-devel] [PATCH 0/3] target-arm: cache tbflags in CPUARMState Laurent Desnogues
2016-09-26 11:13   ` Laurent Desnogues
2016-11-10 11:42 ` Alex Bennée
2016-11-10 12:19   ` Paolo Bonzini
2016-11-10 13:37     ` Alex Bennée

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.