All of lore.kernel.org
 help / color / mirror / Atom feed
* [Qemu-devel] [PATCH 0/4] target/arm: Reduce overhead of cpu_get_tb_cpu_state
@ 2019-02-14  4:06 Richard Henderson
  2019-02-14  4:06 ` [Qemu-devel] [PATCH 1/4] target/arm: Split out recompute_hflags et al Richard Henderson
                   ` (6 more replies)
  0 siblings, 7 replies; 17+ messages in thread
From: Richard Henderson @ 2019-02-14  4:06 UTC (permalink / raw)
  To: qemu-devel; +Cc: peter.maydell, alex.bennee, cota

We've talked about this before, caching state to reduce the
amount of computation that happens looking up each TB.

I know that Peter has been concerned that we would not be able to 
reliably maintain all of the places that need to be updates to
keep this up-to-date.

Well, modulo dirty tricks within linux-user, it appears as if
exception delivery and return, plus after every TB-ending write
to a system register is sufficient.

There seems to be a noticable improvement, although wall-time
is harder to come by -- all of my system-level measurements
include user input, and my user-level measurements seem to be
too small to matter.


r~


Richard Henderson (4):
  target/arm: Split out recompute_hflags et al
  target/arm: Rebuild hflags at el changes and MSR writes
  target/arm: Assert hflags is correct in cpu_get_tb_cpu_state
  target/arm: Rely on hflags correct in cpu_get_tb_cpu_state

 target/arm/cpu.h           |  22 ++-
 target/arm/helper.h        |   3 +
 target/arm/internals.h     |   4 +
 linux-user/syscall.c       |   1 +
 target/arm/cpu.c           |   1 +
 target/arm/helper-a64.c    |   3 +
 target/arm/helper.c        | 267 ++++++++++++++++++++++---------------
 target/arm/machine.c       |   1 +
 target/arm/op_helper.c     |   1 +
 target/arm/translate-a64.c |   6 +-
 target/arm/translate.c     |  14 +-
 11 files changed, 204 insertions(+), 119 deletions(-)

-- 
2.17.1

^ permalink raw reply	[flat|nested] 17+ messages in thread

* [Qemu-devel] [PATCH 1/4] target/arm: Split out recompute_hflags et al
  2019-02-14  4:06 [Qemu-devel] [PATCH 0/4] target/arm: Reduce overhead of cpu_get_tb_cpu_state Richard Henderson
@ 2019-02-14  4:06 ` Richard Henderson
  2019-02-19 11:06   ` Alex Bennée
  2019-02-14  4:06 ` [Qemu-devel] [PATCH 2/4] target/arm: Rebuild hflags at el changes and MSR writes Richard Henderson
                   ` (5 subsequent siblings)
  6 siblings, 1 reply; 17+ messages in thread
From: Richard Henderson @ 2019-02-14  4:06 UTC (permalink / raw)
  To: qemu-devel; +Cc: peter.maydell, alex.bennee, cota

We will use these to minimize the computation for every call to
cpu_get_tb_cpu_state.  For now, the env->hflags variable is not used.

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 target/arm/cpu.h       |  22 +++-
 target/arm/helper.h    |   3 +
 target/arm/internals.h |   3 +
 target/arm/helper.c    | 268 ++++++++++++++++++++++++-----------------
 4 files changed, 180 insertions(+), 116 deletions(-)

diff --git a/target/arm/cpu.h b/target/arm/cpu.h
index 47238e4245..8b0dea947b 100644
--- a/target/arm/cpu.h
+++ b/target/arm/cpu.h
@@ -240,6 +240,9 @@ typedef struct CPUARMState {
     uint32_t pstate;
     uint32_t aarch64; /* 1 if CPU is in aarch64 state; inverse of PSTATE.nRW */
 
+    /* Cached TBFLAGS state.  See below for which bits are included.  */
+    uint32_t hflags;
+
     /* Frequently accessed CPSR bits are stored separately for efficiency.
        This contains all the other bits.  Use cpsr_{read,write} to access
        the whole CPSR.  */
@@ -3019,25 +3022,28 @@ static inline bool arm_cpu_data_is_big_endian(CPUARMState *env)
 
 #include "exec/cpu-all.h"
 
-/* Bit usage in the TB flags field: bit 31 indicates whether we are
+/*
+ * Bit usage in the TB flags field: bit 31 indicates whether we are
  * in 32 or 64 bit mode. The meaning of the other bits depends on that.
  * We put flags which are shared between 32 and 64 bit mode at the top
  * of the word, and flags which apply to only one mode at the bottom.
+ *
+ * Unless otherwise noted, these bits are cached in env->hflags.
  */
 FIELD(TBFLAG_ANY, AARCH64_STATE, 31, 1)
 FIELD(TBFLAG_ANY, MMUIDX, 28, 3)
 FIELD(TBFLAG_ANY, SS_ACTIVE, 27, 1)
-FIELD(TBFLAG_ANY, PSTATE_SS, 26, 1)
+FIELD(TBFLAG_ANY, PSTATE_SS, 26, 1)     /* Not cached. */
 /* Target EL if we take a floating-point-disabled exception */
 FIELD(TBFLAG_ANY, FPEXC_EL, 24, 2)
 FIELD(TBFLAG_ANY, BE_DATA, 23, 1)
 
 /* Bit usage when in AArch32 state: */
-FIELD(TBFLAG_A32, THUMB, 0, 1)
+FIELD(TBFLAG_A32, THUMB, 0, 1)          /* Not cached. */
 FIELD(TBFLAG_A32, VECLEN, 1, 3)
 FIELD(TBFLAG_A32, VECSTRIDE, 4, 2)
 FIELD(TBFLAG_A32, VFPEN, 7, 1)
-FIELD(TBFLAG_A32, CONDEXEC, 8, 8)
+FIELD(TBFLAG_A32, CONDEXEC, 8, 8)       /* Not cached. */
 FIELD(TBFLAG_A32, SCTLR_B, 16, 1)
 /* We store the bottom two bits of the CPAR as TB flags and handle
  * checks on the other bits at runtime
@@ -3059,7 +3065,7 @@ FIELD(TBFLAG_A64, SVEEXC_EL, 2, 2)
 FIELD(TBFLAG_A64, ZCR_LEN, 4, 4)
 FIELD(TBFLAG_A64, PAUTH_ACTIVE, 8, 1)
 FIELD(TBFLAG_A64, BT, 9, 1)
-FIELD(TBFLAG_A64, BTYPE, 10, 2)
+FIELD(TBFLAG_A64, BTYPE, 10, 2)         /* Not cached. */
 FIELD(TBFLAG_A64, TBID, 12, 2)
 
 static inline bool bswap_code(bool sctlr_b)
@@ -3144,6 +3150,12 @@ void arm_register_pre_el_change_hook(ARMCPU *cpu, ARMELChangeHookFn *hook,
 void arm_register_el_change_hook(ARMCPU *cpu, ARMELChangeHookFn *hook, void
         *opaque);
 
+/**
+ * arm_rebuild_hflags:
+ * Rebuild the cached TBFLAGS for arbitrary changed processor state.
+ */
+void arm_rebuild_hflags(CPUARMState *env);
+
 /**
  * aa32_vfp_dreg:
  * Return a pointer to the Dn register within env in 32-bit mode.
diff --git a/target/arm/helper.h b/target/arm/helper.h
index 53a38188c6..e3c98913e6 100644
--- a/target/arm/helper.h
+++ b/target/arm/helper.h
@@ -653,6 +653,9 @@ DEF_HELPER_FLAGS_6(gvec_fmla_idx_s, TCG_CALL_NO_RWG,
 DEF_HELPER_FLAGS_6(gvec_fmla_idx_d, TCG_CALL_NO_RWG,
                    void, ptr, ptr, ptr, ptr, ptr, i32)
 
+DEF_HELPER_FLAGS_2(rebuild_hflags_a32, TCG_CALL_NO_RWG, void, env, i32)
+DEF_HELPER_FLAGS_2(rebuild_hflags_a64, TCG_CALL_NO_RWG, void, env, i32)
+
 #ifdef TARGET_AARCH64
 #include "helper-a64.h"
 #include "helper-sve.h"
diff --git a/target/arm/internals.h b/target/arm/internals.h
index a4bd1becb7..8c1b813364 100644
--- a/target/arm/internals.h
+++ b/target/arm/internals.h
@@ -968,4 +968,7 @@ ARMVAParameters aa64_va_parameters_both(CPUARMState *env, uint64_t va,
 ARMVAParameters aa64_va_parameters(CPUARMState *env, uint64_t va,
                                    ARMMMUIdx mmu_idx, bool data);
 
+uint32_t rebuild_hflags_a32(CPUARMState *env, int el);
+uint32_t rebuild_hflags_a64(CPUARMState *env, int el);
+
 #endif
diff --git a/target/arm/helper.c b/target/arm/helper.c
index 520ceea7a4..7a77f53ba8 100644
--- a/target/arm/helper.c
+++ b/target/arm/helper.c
@@ -13745,122 +13745,15 @@ ARMMMUIdx arm_stage1_mmu_idx(CPUARMState *env)
 }
 #endif
 
-void cpu_get_tb_cpu_state(CPUARMState *env, target_ulong *pc,
-                          target_ulong *cs_base, uint32_t *pflags)
+static uint32_t common_hflags(CPUARMState *env, int el, ARMMMUIdx mmu_idx,
+                              int fp_el, uint32_t flags)
 {
-    ARMMMUIdx mmu_idx = arm_mmu_idx(env);
-    int current_el = arm_current_el(env);
-    int fp_el = fp_exception_el(env, current_el);
-    uint32_t flags = 0;
-
-    if (is_a64(env)) {
-        ARMCPU *cpu = arm_env_get_cpu(env);
-        uint64_t sctlr;
-
-        *pc = env->pc;
-        flags = FIELD_DP32(flags, TBFLAG_ANY, AARCH64_STATE, 1);
-
-        /* Get control bits for tagged addresses.  */
-        {
-            ARMMMUIdx stage1 = stage_1_mmu_idx(mmu_idx);
-            ARMVAParameters p0 = aa64_va_parameters_both(env, 0, stage1);
-            int tbii, tbid;
-
-            /* FIXME: ARMv8.1-VHE S2 translation regime.  */
-            if (regime_el(env, stage1) < 2) {
-                ARMVAParameters p1 = aa64_va_parameters_both(env, -1, stage1);
-                tbid = (p1.tbi << 1) | p0.tbi;
-                tbii = tbid & ~((p1.tbid << 1) | p0.tbid);
-            } else {
-                tbid = p0.tbi;
-                tbii = tbid & !p0.tbid;
-            }
-
-            flags = FIELD_DP32(flags, TBFLAG_A64, TBII, tbii);
-            flags = FIELD_DP32(flags, TBFLAG_A64, TBID, tbid);
-        }
-
-        if (cpu_isar_feature(aa64_sve, cpu)) {
-            int sve_el = sve_exception_el(env, current_el);
-            uint32_t zcr_len;
-
-            /* If SVE is disabled, but FP is enabled,
-             * then the effective len is 0.
-             */
-            if (sve_el != 0 && fp_el == 0) {
-                zcr_len = 0;
-            } else {
-                zcr_len = sve_zcr_len_for_el(env, current_el);
-            }
-            flags = FIELD_DP32(flags, TBFLAG_A64, SVEEXC_EL, sve_el);
-            flags = FIELD_DP32(flags, TBFLAG_A64, ZCR_LEN, zcr_len);
-        }
-
-        if (current_el == 0) {
-            /* FIXME: ARMv8.1-VHE S2 translation regime.  */
-            sctlr = env->cp15.sctlr_el[1];
-        } else {
-            sctlr = env->cp15.sctlr_el[current_el];
-        }
-        if (cpu_isar_feature(aa64_pauth, cpu)) {
-            /*
-             * In order to save space in flags, we record only whether
-             * pauth is "inactive", meaning all insns are implemented as
-             * a nop, or "active" when some action must be performed.
-             * The decision of which action to take is left to a helper.
-             */
-            if (sctlr & (SCTLR_EnIA | SCTLR_EnIB | SCTLR_EnDA | SCTLR_EnDB)) {
-                flags = FIELD_DP32(flags, TBFLAG_A64, PAUTH_ACTIVE, 1);
-            }
-        }
-
-        if (cpu_isar_feature(aa64_bti, cpu)) {
-            /* Note that SCTLR_EL[23].BT == SCTLR_BT1.  */
-            if (sctlr & (current_el == 0 ? SCTLR_BT0 : SCTLR_BT1)) {
-                flags = FIELD_DP32(flags, TBFLAG_A64, BT, 1);
-            }
-            flags = FIELD_DP32(flags, TBFLAG_A64, BTYPE, env->btype);
-        }
-    } else {
-        *pc = env->regs[15];
-        flags = FIELD_DP32(flags, TBFLAG_A32, THUMB, env->thumb);
-        flags = FIELD_DP32(flags, TBFLAG_A32, VECLEN, env->vfp.vec_len);
-        flags = FIELD_DP32(flags, TBFLAG_A32, VECSTRIDE, env->vfp.vec_stride);
-        flags = FIELD_DP32(flags, TBFLAG_A32, CONDEXEC, env->condexec_bits);
-        flags = FIELD_DP32(flags, TBFLAG_A32, SCTLR_B, arm_sctlr_b(env));
-        flags = FIELD_DP32(flags, TBFLAG_A32, NS, !access_secure_reg(env));
-        if (env->vfp.xregs[ARM_VFP_FPEXC] & (1 << 30)
-            || arm_el_is_aa64(env, 1)) {
-            flags = FIELD_DP32(flags, TBFLAG_A32, VFPEN, 1);
-        }
-        flags = FIELD_DP32(flags, TBFLAG_A32, XSCALE_CPAR, env->cp15.c15_cpar);
-    }
-
     flags = FIELD_DP32(flags, TBFLAG_ANY, MMUIDX, arm_to_core_mmu_idx(mmu_idx));
+    flags = FIELD_DP32(flags, TBFLAG_ANY, FPEXC_EL, fp_el);
 
-    /* The SS_ACTIVE and PSTATE_SS bits correspond to the state machine
-     * states defined in the ARM ARM for software singlestep:
-     *  SS_ACTIVE   PSTATE.SS   State
-     *     0            x       Inactive (the TB flag for SS is always 0)
-     *     1            0       Active-pending
-     *     1            1       Active-not-pending
-     */
-    if (arm_singlestep_active(env)) {
-        flags = FIELD_DP32(flags, TBFLAG_ANY, SS_ACTIVE, 1);
-        if (is_a64(env)) {
-            if (env->pstate & PSTATE_SS) {
-                flags = FIELD_DP32(flags, TBFLAG_ANY, PSTATE_SS, 1);
-            }
-        } else {
-            if (env->uncached_cpsr & PSTATE_SS) {
-                flags = FIELD_DP32(flags, TBFLAG_ANY, PSTATE_SS, 1);
-            }
-        }
-    }
     if (arm_cpu_data_is_big_endian(env)) {
         flags = FIELD_DP32(flags, TBFLAG_ANY, BE_DATA, 1);
     }
-    flags = FIELD_DP32(flags, TBFLAG_ANY, FPEXC_EL, fp_el);
 
     if (arm_v7m_is_handler_mode(env)) {
         flags = FIELD_DP32(flags, TBFLAG_A32, HANDLER, 1);
@@ -13876,8 +13769,161 @@ void cpu_get_tb_cpu_state(CPUARMState *env, target_ulong *pc,
         flags = FIELD_DP32(flags, TBFLAG_A32, STACKCHECK, 1);
     }
 
-    *pflags = flags;
+    if (arm_singlestep_active(env)) {
+        flags = FIELD_DP32(flags, TBFLAG_ANY, SS_ACTIVE, 1);
+    }
+
+    return flags;
+}
+
+uint32_t rebuild_hflags_a32(CPUARMState *env, int el)
+{
+    uint32_t flags = 0;
+    ARMMMUIdx mmu_idx;
+    int fp_el;
+
+    flags = FIELD_DP32(flags, TBFLAG_A32, VECLEN, env->vfp.vec_len);
+    flags = FIELD_DP32(flags, TBFLAG_A32, VECSTRIDE, env->vfp.vec_stride);
+    flags = FIELD_DP32(flags, TBFLAG_A32, SCTLR_B, arm_sctlr_b(env));
+    flags = FIELD_DP32(flags, TBFLAG_A32, NS, !access_secure_reg(env));
+    if (env->vfp.xregs[ARM_VFP_FPEXC] & (1 << 30)
+        || arm_el_is_aa64(env, 1)) {
+        flags = FIELD_DP32(flags, TBFLAG_A32, VFPEN, 1);
+    }
+    flags = FIELD_DP32(flags, TBFLAG_A32, XSCALE_CPAR, env->cp15.c15_cpar);
+
+    mmu_idx = arm_mmu_idx(env);
+    fp_el = fp_exception_el(env, el);
+    return common_hflags(env, el, mmu_idx, fp_el, flags);
+}
+
+uint32_t rebuild_hflags_a64(CPUARMState *env, int el)
+{
+    ARMCPU *cpu = arm_env_get_cpu(env);
+    ARMMMUIdx mmu_idx = arm_mmu_idx(env);
+    ARMMMUIdx stage1 = stage_1_mmu_idx(mmu_idx);
+    ARMVAParameters p0 = aa64_va_parameters_both(env, 0, stage1);
+    int fp_el = fp_exception_el(env, el);
+    uint32_t flags = 0;
+    uint64_t sctlr;
+    int tbii, tbid;
+
+    flags = FIELD_DP32(flags, TBFLAG_ANY, AARCH64_STATE, 1);
+
+    /* Get control bits for tagged addresses.  */
+    /* FIXME: ARMv8.1-VHE S2 translation regime.  */
+    if (regime_el(env, stage1) < 2) {
+        ARMVAParameters p1 = aa64_va_parameters_both(env, -1, stage1);
+        tbid = (p1.tbi << 1) | p0.tbi;
+        tbii = tbid & ~((p1.tbid << 1) | p0.tbid);
+    } else {
+        tbid = p0.tbi;
+        tbii = tbid & !p0.tbid;
+    }
+
+    flags = FIELD_DP32(flags, TBFLAG_A64, TBII, tbii);
+    flags = FIELD_DP32(flags, TBFLAG_A64, TBID, tbid);
+
+    if (cpu_isar_feature(aa64_sve, cpu)) {
+        int sve_el = sve_exception_el(env, el);
+        uint32_t zcr_len;
+
+        /* If SVE is disabled, but FP is enabled,
+         * then the effective len is 0.
+         */
+        if (sve_el != 0 && fp_el == 0) {
+            zcr_len = 0;
+        } else {
+            zcr_len = sve_zcr_len_for_el(env, el);
+        }
+        flags = FIELD_DP32(flags, TBFLAG_A64, SVEEXC_EL, sve_el);
+        flags = FIELD_DP32(flags, TBFLAG_A64, ZCR_LEN, zcr_len);
+    }
+
+    if (el == 0) {
+        /* FIXME: ARMv8.1-VHE S2 translation regime.  */
+        sctlr = env->cp15.sctlr_el[1];
+    } else {
+        sctlr = env->cp15.sctlr_el[el];
+    }
+    if (cpu_isar_feature(aa64_pauth, cpu)) {
+        /*
+         * In order to save space in flags, we record only whether
+         * pauth is "inactive", meaning all insns are implemented as
+         * a nop, or "active" when some action must be performed.
+         * The decision of which action to take is left to a helper.
+         */
+        if (sctlr & (SCTLR_EnIA | SCTLR_EnIB | SCTLR_EnDA | SCTLR_EnDB)) {
+            flags = FIELD_DP32(flags, TBFLAG_A64, PAUTH_ACTIVE, 1);
+        }
+    }
+
+    if (cpu_isar_feature(aa64_bti, cpu)) {
+        /* Note that SCTLR_EL[23].BT == SCTLR_BT1.  */
+        if (sctlr & (el == 0 ? SCTLR_BT0 : SCTLR_BT1)) {
+            flags = FIELD_DP32(flags, TBFLAG_A64, BT, 1);
+        }
+        flags = FIELD_DP32(flags, TBFLAG_A64, BTYPE, env->btype);
+    }
+
+    return common_hflags(env, el, mmu_idx, fp_el, flags);
+}
+
+void arm_rebuild_hflags(CPUARMState *env)
+{
+    int el = arm_current_el(env);
+    env->hflags = (is_a64(env)
+                   ? rebuild_hflags_a64(env, el)
+                   : rebuild_hflags_a32(env, el));
+}
+
+void HELPER(rebuild_hflags_a32)(CPUARMState *env, uint32_t el)
+{
+    tcg_debug_assert(!is_a64(env));
+    env->hflags = rebuild_hflags_a32(env, el);
+}
+
+void HELPER(rebuild_hflags_a64)(CPUARMState *env, uint32_t el)
+{
+    tcg_debug_assert(is_a64(env));
+    env->hflags = rebuild_hflags_a64(env, el);
+}
+
+void cpu_get_tb_cpu_state(CPUARMState *env, target_ulong *pc,
+                          target_ulong *cs_base, uint32_t *pflags)
+{
+    int current_el = arm_current_el(env);
+    uint32_t flags;
+    uint32_t pstate_for_ss;
+
     *cs_base = 0;
+    if (is_a64(env)) {
+        *pc = env->pc;
+        flags = rebuild_hflags_a64(env, current_el);
+        flags = FIELD_DP32(flags, TBFLAG_A64, BTYPE, env->btype);
+        pstate_for_ss = env->pstate;
+    } else {
+        *pc = env->regs[15];
+        flags = rebuild_hflags_a32(env, current_el);
+        flags = FIELD_DP32(flags, TBFLAG_A32, THUMB, env->thumb);
+        flags = FIELD_DP32(flags, TBFLAG_A32, CONDEXEC, env->condexec_bits);
+        pstate_for_ss = env->uncached_cpsr;
+    }
+
+    /* The SS_ACTIVE and PSTATE_SS bits correspond to the state machine
+     * states defined in the ARM ARM for software singlestep:
+     *  SS_ACTIVE   PSTATE.SS   State
+     *     0            x       Inactive (the TB flag for SS is always 0)
+     *     1            0       Active-pending
+     *     1            1       Active-not-pending
+     * SS_ACTIVE is set in hflags; PSTATE_SS is computed every TB.
+     */
+    if (FIELD_EX32(flags, TBFLAG_ANY, SS_ACTIVE)
+        && (pstate_for_ss & PSTATE_SS)) {
+        flags = FIELD_DP32(flags, TBFLAG_ANY, PSTATE_SS, 1);
+    }
+
+    *pflags = flags;
 }
 
 #ifdef TARGET_AARCH64
-- 
2.17.1

^ permalink raw reply related	[flat|nested] 17+ messages in thread

* [Qemu-devel] [PATCH 2/4] target/arm: Rebuild hflags at el changes and MSR writes
  2019-02-14  4:06 [Qemu-devel] [PATCH 0/4] target/arm: Reduce overhead of cpu_get_tb_cpu_state Richard Henderson
  2019-02-14  4:06 ` [Qemu-devel] [PATCH 1/4] target/arm: Split out recompute_hflags et al Richard Henderson
@ 2019-02-14  4:06 ` Richard Henderson
  2019-02-19 14:48   ` Alex Bennée
  2019-02-19 19:43   ` Alex Bennée
  2019-02-14  4:06 ` [Qemu-devel] [PATCH 3/4] target/arm: Assert hflags is correct in cpu_get_tb_cpu_state Richard Henderson
                   ` (4 subsequent siblings)
  6 siblings, 2 replies; 17+ messages in thread
From: Richard Henderson @ 2019-02-14  4:06 UTC (permalink / raw)
  To: qemu-devel; +Cc: peter.maydell, alex.bennee, cota

Now setting, but not relying upon, env->hflags.

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 target/arm/internals.h     |  1 +
 linux-user/syscall.c       |  1 +
 target/arm/cpu.c           |  1 +
 target/arm/helper-a64.c    |  3 +++
 target/arm/helper.c        |  2 ++
 target/arm/machine.c       |  1 +
 target/arm/op_helper.c     |  1 +
 target/arm/translate-a64.c |  6 +++++-
 target/arm/translate.c     | 14 ++++++++++++--
 9 files changed, 27 insertions(+), 3 deletions(-)

diff --git a/target/arm/internals.h b/target/arm/internals.h
index 8c1b813364..235f4fafec 100644
--- a/target/arm/internals.h
+++ b/target/arm/internals.h
@@ -970,5 +970,6 @@ ARMVAParameters aa64_va_parameters(CPUARMState *env, uint64_t va,
 
 uint32_t rebuild_hflags_a32(CPUARMState *env, int el);
 uint32_t rebuild_hflags_a64(CPUARMState *env, int el);
+void rebuild_hflags_any(CPUARMState *env);
 
 #endif
diff --git a/linux-user/syscall.c b/linux-user/syscall.c
index 5bbb72f3d5..123f342bdc 100644
--- a/linux-user/syscall.c
+++ b/linux-user/syscall.c
@@ -9691,6 +9691,7 @@ static abi_long do_syscall1(void *cpu_env, int num, abi_long arg1,
                     aarch64_sve_narrow_vq(env, vq);
                 }
                 env->vfp.zcr_el[1] = vq - 1;
+                arm_rebuild_hflags(env);
                 ret = vq * 16;
             }
             return ret;
diff --git a/target/arm/cpu.c b/target/arm/cpu.c
index edf6e0e1f1..e4da513eb3 100644
--- a/target/arm/cpu.c
+++ b/target/arm/cpu.c
@@ -390,6 +390,7 @@ static void arm_cpu_reset(CPUState *s)
 
     hw_breakpoint_update_all(cpu);
     hw_watchpoint_update_all(cpu);
+    arm_rebuild_hflags(env);
 }
 
 bool arm_cpu_exec_interrupt(CPUState *cs, int interrupt_request)
diff --git a/target/arm/helper-a64.c b/target/arm/helper-a64.c
index 101fa6d3ea..f51cb75444 100644
--- a/target/arm/helper-a64.c
+++ b/target/arm/helper-a64.c
@@ -995,6 +995,7 @@ void HELPER(exception_return)(CPUARMState *env, uint64_t new_pc)
         } else {
             env->regs[15] = new_pc & ~0x3;
         }
+        rebuild_hflags_a32(env, new_el);
         qemu_log_mask(CPU_LOG_INT, "Exception return from AArch64 EL%d to "
                       "AArch32 EL%d PC 0x%" PRIx32 "\n",
                       cur_el, new_el, env->regs[15]);
@@ -1006,10 +1007,12 @@ void HELPER(exception_return)(CPUARMState *env, uint64_t new_pc)
         }
         aarch64_restore_sp(env, new_el);
         env->pc = new_pc;
+        rebuild_hflags_a64(env, new_el);
         qemu_log_mask(CPU_LOG_INT, "Exception return from AArch64 EL%d to "
                       "AArch64 EL%d PC 0x%" PRIx64 "\n",
                       cur_el, new_el, env->pc);
     }
+
     /*
      * Note that cur_el can never be 0.  If new_el is 0, then
      * el0_a64 is return_to_aa64, else el0_a64 is ignored.
diff --git a/target/arm/helper.c b/target/arm/helper.c
index 7a77f53ba8..d8249f0eae 100644
--- a/target/arm/helper.c
+++ b/target/arm/helper.c
@@ -9081,6 +9081,7 @@ static void take_aarch32_exception(CPUARMState *env, int new_mode,
         env->regs[14] = env->regs[15] + offset;
     }
     env->regs[15] = newpc;
+    env->hflags = rebuild_hflags_a32(env, arm_current_el(env));
 }
 
 static void arm_cpu_do_interrupt_aarch32_hyp(CPUState *cs)
@@ -9426,6 +9427,7 @@ static void arm_cpu_do_interrupt_aarch64(CPUState *cs)
 
     pstate_write(env, PSTATE_DAIF | new_mode);
     env->aarch64 = 1;
+    env->hflags = rebuild_hflags_a64(env, new_el);
     aarch64_restore_sp(env, new_el);
 
     env->pc = addr;
diff --git a/target/arm/machine.c b/target/arm/machine.c
index b292549614..61889801dd 100644
--- a/target/arm/machine.c
+++ b/target/arm/machine.c
@@ -743,6 +743,7 @@ static int cpu_post_load(void *opaque, int version_id)
     if (!kvm_enabled()) {
         pmu_op_finish(&cpu->env);
     }
+    arm_rebuild_hflags(&cpu->env);
 
     return 0;
 }
diff --git a/target/arm/op_helper.c b/target/arm/op_helper.c
index c998eadfaa..8d459d9a84 100644
--- a/target/arm/op_helper.c
+++ b/target/arm/op_helper.c
@@ -571,6 +571,7 @@ void HELPER(cpsr_write_eret)(CPUARMState *env, uint32_t val)
      */
     env->regs[15] &= (env->thumb ? ~1 : ~3);
 
+    rebuild_hflags_a32(env, arm_current_el(env));
     qemu_mutex_lock_iothread();
     arm_call_el_change_hook(arm_env_get_cpu(env));
     qemu_mutex_unlock_iothread();
diff --git a/target/arm/translate-a64.c b/target/arm/translate-a64.c
index e002251ac6..ad665c1609 100644
--- a/target/arm/translate-a64.c
+++ b/target/arm/translate-a64.c
@@ -1841,11 +1841,15 @@ static void handle_sys(DisasContext *s, uint32_t insn, bool isread,
         /* I/O operations must end the TB here (whether read or write) */
         gen_io_end();
         s->base.is_jmp = DISAS_UPDATE;
-    } else if (!isread && !(ri->type & ARM_CP_SUPPRESS_TB_END)) {
+    }
+    if (!isread && !(ri->type & ARM_CP_SUPPRESS_TB_END)) {
         /* We default to ending the TB on a coprocessor register write,
          * but allow this to be suppressed by the register definition
          * (usually only necessary to work around guest bugs).
          */
+        TCGv_i32 tcg_el = tcg_const_i32(s->current_el);
+        gen_helper_rebuild_hflags_a64(cpu_env, tcg_el);
+        tcg_temp_free_i32(tcg_el);
         s->base.is_jmp = DISAS_UPDATE;
     }
 }
diff --git a/target/arm/translate.c b/target/arm/translate.c
index 66cf28c8cb..36842a27b3 100644
--- a/target/arm/translate.c
+++ b/target/arm/translate.c
@@ -8432,6 +8432,8 @@ static int disas_coproc_insn(DisasContext *s, uint32_t insn)
     ri = get_arm_cp_reginfo(s->cp_regs,
             ENCODE_CP_REG(cpnum, is64, s->ns, crn, crm, opc1, opc2));
     if (ri) {
+        bool need_exit_tb;
+
         /* Check access permissions */
         if (!cp_access_ok(s->current_el, ri, isread)) {
             return 1;
@@ -8604,15 +8606,23 @@ static int disas_coproc_insn(DisasContext *s, uint32_t insn)
             }
         }
 
+        need_exit_tb = false;
         if ((tb_cflags(s->base.tb) & CF_USE_ICOUNT) && (ri->type & ARM_CP_IO)) {
             /* I/O operations must end the TB here (whether read or write) */
             gen_io_end();
-            gen_lookup_tb(s);
-        } else if (!isread && !(ri->type & ARM_CP_SUPPRESS_TB_END)) {
+            need_exit_tb = true;
+        }
+        if (!isread && !(ri->type & ARM_CP_SUPPRESS_TB_END)) {
             /* We default to ending the TB on a coprocessor register write,
              * but allow this to be suppressed by the register definition
              * (usually only necessary to work around guest bugs).
              */
+            TCGv_i32 tcg_el = tcg_const_i32(s->current_el);
+            gen_helper_rebuild_hflags_a32(cpu_env, tcg_el);
+            tcg_temp_free_i32(tcg_el);
+            need_exit_tb = true;
+        }
+        if (need_exit_tb) {
             gen_lookup_tb(s);
         }
 
-- 
2.17.1

^ permalink raw reply related	[flat|nested] 17+ messages in thread

* [Qemu-devel] [PATCH 3/4] target/arm: Assert hflags is correct in cpu_get_tb_cpu_state
  2019-02-14  4:06 [Qemu-devel] [PATCH 0/4] target/arm: Reduce overhead of cpu_get_tb_cpu_state Richard Henderson
  2019-02-14  4:06 ` [Qemu-devel] [PATCH 1/4] target/arm: Split out recompute_hflags et al Richard Henderson
  2019-02-14  4:06 ` [Qemu-devel] [PATCH 2/4] target/arm: Rebuild hflags at el changes and MSR writes Richard Henderson
@ 2019-02-14  4:06 ` Richard Henderson
  2019-02-19 14:53   ` Alex Bennée
  2019-02-19 15:23   ` Alex Bennée
  2019-02-14  4:06 ` [Qemu-devel] [PATCH 4/4] target/arm: Rely on hflags " Richard Henderson
                   ` (3 subsequent siblings)
  6 siblings, 2 replies; 17+ messages in thread
From: Richard Henderson @ 2019-02-14  4:06 UTC (permalink / raw)
  To: qemu-devel; +Cc: peter.maydell, alex.bennee, cota

Make sure that we are updating env->hflags everywhere required.

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 target/arm/helper.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/target/arm/helper.c b/target/arm/helper.c
index d8249f0eae..3c8724883d 100644
--- a/target/arm/helper.c
+++ b/target/arm/helper.c
@@ -13902,11 +13902,13 @@ void cpu_get_tb_cpu_state(CPUARMState *env, target_ulong *pc,
     if (is_a64(env)) {
         *pc = env->pc;
         flags = rebuild_hflags_a64(env, current_el);
+        assert(flags == env->hflags);
         flags = FIELD_DP32(flags, TBFLAG_A64, BTYPE, env->btype);
         pstate_for_ss = env->pstate;
     } else {
         *pc = env->regs[15];
         flags = rebuild_hflags_a32(env, current_el);
+        assert(flags == env->hflags);
         flags = FIELD_DP32(flags, TBFLAG_A32, THUMB, env->thumb);
         flags = FIELD_DP32(flags, TBFLAG_A32, CONDEXEC, env->condexec_bits);
         pstate_for_ss = env->uncached_cpsr;
-- 
2.17.1

^ permalink raw reply related	[flat|nested] 17+ messages in thread

* [Qemu-devel] [PATCH 4/4] target/arm: Rely on hflags correct in cpu_get_tb_cpu_state
  2019-02-14  4:06 [Qemu-devel] [PATCH 0/4] target/arm: Reduce overhead of cpu_get_tb_cpu_state Richard Henderson
                   ` (2 preceding siblings ...)
  2019-02-14  4:06 ` [Qemu-devel] [PATCH 3/4] target/arm: Assert hflags is correct in cpu_get_tb_cpu_state Richard Henderson
@ 2019-02-14  4:06 ` Richard Henderson
  2019-02-19 20:17   ` Alex Bennée
  2019-02-14 10:28 ` [Qemu-devel] [PATCH 0/4] target/arm: Reduce overhead of cpu_get_tb_cpu_state Laurent Desnogues
                   ` (2 subsequent siblings)
  6 siblings, 1 reply; 17+ messages in thread
From: Richard Henderson @ 2019-02-14  4:06 UTC (permalink / raw)
  To: qemu-devel; +Cc: peter.maydell, alex.bennee, cota

This is the payoff.

>From perf record -g data of ubuntu 18 boot and shutdown:

BEFORE:

-   23.02%     2.82%  qemu-system-aar  [.] helper_lookup_tb_ptr
   - 20.22% helper_lookup_tb_ptr
      + 10.05% tb_htable_lookup
      - 9.13% cpu_get_tb_cpu_state
           3.20% aa64_va_parameters_both
           0.55% fp_exception_el

-   11.66%     4.74%  qemu-system-aar  [.] cpu_get_tb_cpu_state
   - 6.96% cpu_get_tb_cpu_state
        3.63% aa64_va_parameters_both
        0.60% fp_exception_el
        0.53% sve_exception_el

AFTER:

-   16.40%     3.40%  qemu-system-aar  [.] helper_lookup_tb_ptr
   - 13.03% helper_lookup_tb_ptr
      + 11.19% tb_htable_lookup
        0.55% cpu_get_tb_cpu_state

     0.98%     0.71%  qemu-system-aar  [.] cpu_get_tb_cpu_state

     0.87%     0.24%  qemu-system-aar  [.] rebuild_hflags_a64

Before, helper_lookup_tb_ptr is the second hottest function in the
application, consuming almost a quarter of the runtime.  Within the
entire execution, cpu_get_tb_cpu_state consumes about 12%.

After, helper_lookup_tb_ptr has dropped to the fourth hottest function,
with consumption dropping to a sixth of the runtime.  Within the
entire execution, cpu_get_tb_cpu_state has dropped below 1%, and the
supporting function to rebuild hflags also consumes about 1%.

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 target/arm/helper.c | 9 ++-------
 1 file changed, 2 insertions(+), 7 deletions(-)

diff --git a/target/arm/helper.c b/target/arm/helper.c
index 3c8724883d..1bdb87267e 100644
--- a/target/arm/helper.c
+++ b/target/arm/helper.c
@@ -13894,21 +13894,16 @@ void HELPER(rebuild_hflags_a64)(CPUARMState *env, uint32_t el)
 void cpu_get_tb_cpu_state(CPUARMState *env, target_ulong *pc,
                           target_ulong *cs_base, uint32_t *pflags)
 {
-    int current_el = arm_current_el(env);
-    uint32_t flags;
+    uint32_t flags = env->hflags;
     uint32_t pstate_for_ss;
 
     *cs_base = 0;
-    if (is_a64(env)) {
+    if (FIELD_EX32(flags, TBFLAG_ANY, AARCH64_STATE)) {
         *pc = env->pc;
-        flags = rebuild_hflags_a64(env, current_el);
-        assert(flags == env->hflags);
         flags = FIELD_DP32(flags, TBFLAG_A64, BTYPE, env->btype);
         pstate_for_ss = env->pstate;
     } else {
         *pc = env->regs[15];
-        flags = rebuild_hflags_a32(env, current_el);
-        assert(flags == env->hflags);
         flags = FIELD_DP32(flags, TBFLAG_A32, THUMB, env->thumb);
         flags = FIELD_DP32(flags, TBFLAG_A32, CONDEXEC, env->condexec_bits);
         pstate_for_ss = env->uncached_cpsr;
-- 
2.17.1

^ permalink raw reply related	[flat|nested] 17+ messages in thread

* Re: [Qemu-devel] [PATCH 0/4] target/arm: Reduce overhead of cpu_get_tb_cpu_state
  2019-02-14  4:06 [Qemu-devel] [PATCH 0/4] target/arm: Reduce overhead of cpu_get_tb_cpu_state Richard Henderson
                   ` (3 preceding siblings ...)
  2019-02-14  4:06 ` [Qemu-devel] [PATCH 4/4] target/arm: Rely on hflags " Richard Henderson
@ 2019-02-14 10:28 ` Laurent Desnogues
  2019-02-14 11:05 ` Alex Bennée
  2019-02-14 17:05 ` Emilio G. Cota
  6 siblings, 0 replies; 17+ messages in thread
From: Laurent Desnogues @ 2019-02-14 10:28 UTC (permalink / raw)
  To: Richard Henderson
  Cc: qemu-devel, Peter Maydell, Emilio G. Cota, Alex Bennée

Hi Richard,

On Thu, Feb 14, 2019 at 5:07 AM Richard Henderson
<richard.henderson@linaro.org> wrote:
>
> We've talked about this before, caching state to reduce the
> amount of computation that happens looking up each TB.
>
> I know that Peter has been concerned that we would not be able to
> reliably maintain all of the places that need to be updates to
> keep this up-to-date.
>
> Well, modulo dirty tricks within linux-user, it appears as if
> exception delivery and return, plus after every TB-ending write
> to a system register is sufficient.
>
> There seems to be a noticable improvement, although wall-time
> is harder to come by -- all of my system-level measurements
> include user input, and my user-level measurements seem to be
> too small to matter.

FWIW this patch series made a run of linux-user AArch64 176.gcc 166.i
go from 29.5s down to 24.5s (on an E5-2650 v2).  Though that'd need
more benchmarks, that looks quite good to me.

Thanks,

Laurent

>
> r~
>
>
> Richard Henderson (4):
>   target/arm: Split out recompute_hflags et al
>   target/arm: Rebuild hflags at el changes and MSR writes
>   target/arm: Assert hflags is correct in cpu_get_tb_cpu_state
>   target/arm: Rely on hflags correct in cpu_get_tb_cpu_state
>
>  target/arm/cpu.h           |  22 ++-
>  target/arm/helper.h        |   3 +
>  target/arm/internals.h     |   4 +
>  linux-user/syscall.c       |   1 +
>  target/arm/cpu.c           |   1 +
>  target/arm/helper-a64.c    |   3 +
>  target/arm/helper.c        | 267 ++++++++++++++++++++++---------------
>  target/arm/machine.c       |   1 +
>  target/arm/op_helper.c     |   1 +
>  target/arm/translate-a64.c |   6 +-
>  target/arm/translate.c     |  14 +-
>  11 files changed, 204 insertions(+), 119 deletions(-)
>
> --
> 2.17.1
>
>

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [Qemu-devel] [PATCH 0/4] target/arm: Reduce overhead of cpu_get_tb_cpu_state
  2019-02-14  4:06 [Qemu-devel] [PATCH 0/4] target/arm: Reduce overhead of cpu_get_tb_cpu_state Richard Henderson
                   ` (4 preceding siblings ...)
  2019-02-14 10:28 ` [Qemu-devel] [PATCH 0/4] target/arm: Reduce overhead of cpu_get_tb_cpu_state Laurent Desnogues
@ 2019-02-14 11:05 ` Alex Bennée
  2019-02-14 17:05 ` Emilio G. Cota
  6 siblings, 0 replies; 17+ messages in thread
From: Alex Bennée @ 2019-02-14 11:05 UTC (permalink / raw)
  To: Richard Henderson; +Cc: qemu-devel, peter.maydell, cota


Richard Henderson <richard.henderson@linaro.org> writes:

> We've talked about this before, caching state to reduce the
> amount of computation that happens looking up each TB.
>
> I know that Peter has been concerned that we would not be able to
> reliably maintain all of the places that need to be updates to
> keep this up-to-date.
>
> Well, modulo dirty tricks within linux-user, it appears as if
> exception delivery and return, plus after every TB-ending write
> to a system register is sufficient.
>
> There seems to be a noticable improvement, although wall-time
> is harder to come by -- all of my system-level measurements
> include user input, and my user-level measurements seem to be
> too small to matter.

I'll run some of my benchmarks on it but I'm almost certain it will help
as it is one of the highest hits on "perf top" when running a loaded
guest.

>
>
> r~
>
>
> Richard Henderson (4):
>   target/arm: Split out recompute_hflags et al
>   target/arm: Rebuild hflags at el changes and MSR writes
>   target/arm: Assert hflags is correct in cpu_get_tb_cpu_state
>   target/arm: Rely on hflags correct in cpu_get_tb_cpu_state
>
>  target/arm/cpu.h           |  22 ++-
>  target/arm/helper.h        |   3 +
>  target/arm/internals.h     |   4 +
>  linux-user/syscall.c       |   1 +
>  target/arm/cpu.c           |   1 +
>  target/arm/helper-a64.c    |   3 +
>  target/arm/helper.c        | 267 ++++++++++++++++++++++---------------
>  target/arm/machine.c       |   1 +
>  target/arm/op_helper.c     |   1 +
>  target/arm/translate-a64.c |   6 +-
>  target/arm/translate.c     |  14 +-
>  11 files changed, 204 insertions(+), 119 deletions(-)


--
Alex Bennée

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [Qemu-devel] [PATCH 0/4] target/arm: Reduce overhead of cpu_get_tb_cpu_state
  2019-02-14  4:06 [Qemu-devel] [PATCH 0/4] target/arm: Reduce overhead of cpu_get_tb_cpu_state Richard Henderson
                   ` (5 preceding siblings ...)
  2019-02-14 11:05 ` Alex Bennée
@ 2019-02-14 17:05 ` Emilio G. Cota
  6 siblings, 0 replies; 17+ messages in thread
From: Emilio G. Cota @ 2019-02-14 17:05 UTC (permalink / raw)
  To: Richard Henderson; +Cc: qemu-devel, peter.maydell, alex.bennee

On Wed, Feb 13, 2019 at 20:06:48 -0800, Richard Henderson wrote:
> We've talked about this before, caching state to reduce the
> amount of computation that happens looking up each TB.
> 
> I know that Peter has been concerned that we would not be able to 
> reliably maintain all of the places that need to be updates to
> keep this up-to-date.
> 
> Well, modulo dirty tricks within linux-user, it appears as if
> exception delivery and return, plus after every TB-ending write
> to a system register is sufficient.
> 
> There seems to be a noticable improvement, although wall-time
> is harder to come by -- all of my system-level measurements
> include user input, and my user-level measurements seem to be
> too small to matter.

Thanks for this!

Some SPEC06int user-mode numbers (before vs. after)

                   aarch64-linux-user speedup for SPEC06int (test set)
                      Host: Intel(R) Xeon(R) Gold 6142 CPU @ 2.60GHz

                      2 +-----------------------------------------+
                        |                                         |
                    1.9 |-+.........................a+-+r.......+-|
                        |                            +-+          |
                        |                            * *          |
                    1.8 |-+..........................*.*........+-|
                        |       +-+                  * *          |
                    1.7 |-+.....+-+...............+-+*.*...+-+..+-|
                        |       * *         +-+   * ** *   +-+    |
                    1.6 |-+.....*.*..........|....*.**.*+-+*.*..+-|
                        |       * *         *|*   * ** *+-+* *    |
                    1.5 |-+.....*.*.........*|*...*.**.**.**.*..+-|
                        |       * *         +-+   * ** ** ** *    |
                        |       * *         * *   * ** ** ** *    |
                    1.4 |-+.....*.*.........*.*...*.**.**.**.*+-+-|
                        |       * *   +-+   * *   * ** ** ** ** * |
                    1.3 |-+.....*.*...+-+...*.*...*.**.**.**.**.*-|
                        | +-+   * *   * *   * *   * ** ** ** ** * |
                    1.2 |-+-+...*.*...*.*...*.*...*.**.**.**.**.*-|
                        | * *   * *   * *   * *   * ** ** ** ** * |
                        | * *   * *   * *+-+* *   * ** ** ** ** * |
                    1.1 |-*.*...*.*...*.**.**.*...*.**.**.**.**.*-|
                        | * *+-+* *+-+* ** ** *+-+* ** ** ** ** * |
                      1 +-----------------------------------------+
              400.per401.b40344454462.li464471.483.xalangeomean
 png: https://imgur.com/RjkYYJ5

That is, a 1.4x average speedup.

		Emilio

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [Qemu-devel] [PATCH 1/4] target/arm: Split out recompute_hflags et al
  2019-02-14  4:06 ` [Qemu-devel] [PATCH 1/4] target/arm: Split out recompute_hflags et al Richard Henderson
@ 2019-02-19 11:06   ` Alex Bennée
  2019-02-19 15:06     ` Richard Henderson
  0 siblings, 1 reply; 17+ messages in thread
From: Alex Bennée @ 2019-02-19 11:06 UTC (permalink / raw)
  To: Richard Henderson; +Cc: qemu-devel, peter.maydell, cota


Richard Henderson <richard.henderson@linaro.org> writes:

> We will use these to minimize the computation for every call to
> cpu_get_tb_cpu_state.  For now, the env->hflags variable is not used.
>
> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
> ---
>  target/arm/cpu.h       |  22 +++-
>  target/arm/helper.h    |   3 +
>  target/arm/internals.h |   3 +
>  target/arm/helper.c    | 268 ++++++++++++++++++++++++-----------------
>  4 files changed, 180 insertions(+), 116 deletions(-)
>
> diff --git a/target/arm/cpu.h b/target/arm/cpu.h
> index 47238e4245..8b0dea947b 100644
> --- a/target/arm/cpu.h
> +++ b/target/arm/cpu.h
> @@ -240,6 +240,9 @@ typedef struct CPUARMState {
>      uint32_t pstate;
>      uint32_t aarch64; /* 1 if CPU is in aarch64 state; inverse of PSTATE.nRW */
>
> +    /* Cached TBFLAGS state.  See below for which bits are included.  */
> +    uint32_t hflags;
> +
>      /* Frequently accessed CPSR bits are stored separately for efficiency.
>         This contains all the other bits.  Use cpsr_{read,write} to access
>         the whole CPSR.  */
> @@ -3019,25 +3022,28 @@ static inline bool arm_cpu_data_is_big_endian(CPUARMState *env)
>
>  #include "exec/cpu-all.h"
>
> -/* Bit usage in the TB flags field: bit 31 indicates whether we are
> +/*
> + * Bit usage in the TB flags field: bit 31 indicates whether we are
>   * in 32 or 64 bit mode. The meaning of the other bits depends on that.
>   * We put flags which are shared between 32 and 64 bit mode at the top
>   * of the word, and flags which apply to only one mode at the bottom.
> + *
> + * Unless otherwise noted, these bits are cached in env->hflags.
>   */
>  FIELD(TBFLAG_ANY, AARCH64_STATE, 31, 1)
>  FIELD(TBFLAG_ANY, MMUIDX, 28, 3)
>  FIELD(TBFLAG_ANY, SS_ACTIVE, 27, 1)
> -FIELD(TBFLAG_ANY, PSTATE_SS, 26, 1)
> +FIELD(TBFLAG_ANY, PSTATE_SS, 26, 1)     /* Not cached. */
>  /* Target EL if we take a floating-point-disabled exception */
>  FIELD(TBFLAG_ANY, FPEXC_EL, 24, 2)
>  FIELD(TBFLAG_ANY, BE_DATA, 23, 1)
>
>  /* Bit usage when in AArch32 state: */
> -FIELD(TBFLAG_A32, THUMB, 0, 1)
> +FIELD(TBFLAG_A32, THUMB, 0, 1)          /* Not cached. */
>  FIELD(TBFLAG_A32, VECLEN, 1, 3)
>  FIELD(TBFLAG_A32, VECSTRIDE, 4, 2)
>  FIELD(TBFLAG_A32, VFPEN, 7, 1)
> -FIELD(TBFLAG_A32, CONDEXEC, 8, 8)
> +FIELD(TBFLAG_A32, CONDEXEC, 8, 8)       /* Not cached. */
>  FIELD(TBFLAG_A32, SCTLR_B, 16, 1)
>  /* We store the bottom two bits of the CPAR as TB flags and handle
>   * checks on the other bits at runtime
> @@ -3059,7 +3065,7 @@ FIELD(TBFLAG_A64, SVEEXC_EL, 2, 2)
>  FIELD(TBFLAG_A64, ZCR_LEN, 4, 4)
>  FIELD(TBFLAG_A64, PAUTH_ACTIVE, 8, 1)
>  FIELD(TBFLAG_A64, BT, 9, 1)
> -FIELD(TBFLAG_A64, BTYPE, 10, 2)
> +FIELD(TBFLAG_A64, BTYPE, 10, 2)         /* Not cached. */
>  FIELD(TBFLAG_A64, TBID, 12, 2)
>
>  static inline bool bswap_code(bool sctlr_b)
> @@ -3144,6 +3150,12 @@ void arm_register_pre_el_change_hook(ARMCPU *cpu, ARMELChangeHookFn *hook,
>  void arm_register_el_change_hook(ARMCPU *cpu, ARMELChangeHookFn *hook, void
>          *opaque);
>
> +/**
> + * arm_rebuild_hflags:
> + * Rebuild the cached TBFLAGS for arbitrary changed processor state.
> + */
> +void arm_rebuild_hflags(CPUARMState *env);
> +
>  /**
>   * aa32_vfp_dreg:
>   * Return a pointer to the Dn register within env in 32-bit mode.
> diff --git a/target/arm/helper.h b/target/arm/helper.h
> index 53a38188c6..e3c98913e6 100644
> --- a/target/arm/helper.h
> +++ b/target/arm/helper.h
> @@ -653,6 +653,9 @@ DEF_HELPER_FLAGS_6(gvec_fmla_idx_s, TCG_CALL_NO_RWG,
>  DEF_HELPER_FLAGS_6(gvec_fmla_idx_d, TCG_CALL_NO_RWG,
>                     void, ptr, ptr, ptr, ptr, ptr, i32)
>
> +DEF_HELPER_FLAGS_2(rebuild_hflags_a32, TCG_CALL_NO_RWG, void, env, i32)
> +DEF_HELPER_FLAGS_2(rebuild_hflags_a64, TCG_CALL_NO_RWG, void, env, i32)
> +
>  #ifdef TARGET_AARCH64
>  #include "helper-a64.h"
>  #include "helper-sve.h"
> diff --git a/target/arm/internals.h b/target/arm/internals.h
> index a4bd1becb7..8c1b813364 100644
> --- a/target/arm/internals.h
> +++ b/target/arm/internals.h
> @@ -968,4 +968,7 @@ ARMVAParameters aa64_va_parameters_both(CPUARMState *env, uint64_t va,
>  ARMVAParameters aa64_va_parameters(CPUARMState *env, uint64_t va,
>                                     ARMMMUIdx mmu_idx, bool data);
>
> +uint32_t rebuild_hflags_a32(CPUARMState *env, int el);
> +uint32_t rebuild_hflags_a64(CPUARMState *env, int el);
> +
>  #endif
> diff --git a/target/arm/helper.c b/target/arm/helper.c
> index 520ceea7a4..7a77f53ba8 100644
> --- a/target/arm/helper.c
> +++ b/target/arm/helper.c
> @@ -13745,122 +13745,15 @@ ARMMMUIdx arm_stage1_mmu_idx(CPUARMState *env)
>  }
>  #endif
>
> -void cpu_get_tb_cpu_state(CPUARMState *env, target_ulong *pc,
> -                          target_ulong *cs_base, uint32_t *pflags)
> +static uint32_t common_hflags(CPUARMState *env, int el, ARMMMUIdx mmu_idx,
> +                              int fp_el, uint32_t flags)
>  {
> -    ARMMMUIdx mmu_idx = arm_mmu_idx(env);
> -    int current_el = arm_current_el(env);
> -    int fp_el = fp_exception_el(env, current_el);
> -    uint32_t flags = 0;
> -
> -    if (is_a64(env)) {
> -        ARMCPU *cpu = arm_env_get_cpu(env);
> -        uint64_t sctlr;
> -
> -        *pc = env->pc;
> -        flags = FIELD_DP32(flags, TBFLAG_ANY, AARCH64_STATE, 1);
> -
> -        /* Get control bits for tagged addresses.  */
> -        {
> -            ARMMMUIdx stage1 = stage_1_mmu_idx(mmu_idx);
> -            ARMVAParameters p0 = aa64_va_parameters_both(env, 0, stage1);
> -            int tbii, tbid;
> -
> -            /* FIXME: ARMv8.1-VHE S2 translation regime.  */
> -            if (regime_el(env, stage1) < 2) {
> -                ARMVAParameters p1 = aa64_va_parameters_both(env, -1, stage1);
> -                tbid = (p1.tbi << 1) | p0.tbi;
> -                tbii = tbid & ~((p1.tbid << 1) | p0.tbid);
> -            } else {
> -                tbid = p0.tbi;
> -                tbii = tbid & !p0.tbid;
> -            }
> -
> -            flags = FIELD_DP32(flags, TBFLAG_A64, TBII, tbii);
> -            flags = FIELD_DP32(flags, TBFLAG_A64, TBID, tbid);
> -        }
> -
> -        if (cpu_isar_feature(aa64_sve, cpu)) {
> -            int sve_el = sve_exception_el(env, current_el);
> -            uint32_t zcr_len;
> -
> -            /* If SVE is disabled, but FP is enabled,
> -             * then the effective len is 0.
> -             */
> -            if (sve_el != 0 && fp_el == 0) {
> -                zcr_len = 0;
> -            } else {
> -                zcr_len = sve_zcr_len_for_el(env, current_el);
> -            }
> -            flags = FIELD_DP32(flags, TBFLAG_A64, SVEEXC_EL, sve_el);
> -            flags = FIELD_DP32(flags, TBFLAG_A64, ZCR_LEN, zcr_len);
> -        }
> -
> -        if (current_el == 0) {
> -            /* FIXME: ARMv8.1-VHE S2 translation regime.  */
> -            sctlr = env->cp15.sctlr_el[1];
> -        } else {
> -            sctlr = env->cp15.sctlr_el[current_el];
> -        }
> -        if (cpu_isar_feature(aa64_pauth, cpu)) {
> -            /*
> -             * In order to save space in flags, we record only whether
> -             * pauth is "inactive", meaning all insns are implemented as
> -             * a nop, or "active" when some action must be performed.
> -             * The decision of which action to take is left to a helper.
> -             */
> -            if (sctlr & (SCTLR_EnIA | SCTLR_EnIB | SCTLR_EnDA | SCTLR_EnDB)) {
> -                flags = FIELD_DP32(flags, TBFLAG_A64, PAUTH_ACTIVE, 1);
> -            }
> -        }
> -
> -        if (cpu_isar_feature(aa64_bti, cpu)) {
> -            /* Note that SCTLR_EL[23].BT == SCTLR_BT1.  */
> -            if (sctlr & (current_el == 0 ? SCTLR_BT0 : SCTLR_BT1)) {
> -                flags = FIELD_DP32(flags, TBFLAG_A64, BT, 1);
> -            }
> -            flags = FIELD_DP32(flags, TBFLAG_A64, BTYPE, env->btype);
> -        }
> -    } else {
> -        *pc = env->regs[15];
> -        flags = FIELD_DP32(flags, TBFLAG_A32, THUMB, env->thumb);
> -        flags = FIELD_DP32(flags, TBFLAG_A32, VECLEN, env->vfp.vec_len);
> -        flags = FIELD_DP32(flags, TBFLAG_A32, VECSTRIDE, env->vfp.vec_stride);
> -        flags = FIELD_DP32(flags, TBFLAG_A32, CONDEXEC, env->condexec_bits);
> -        flags = FIELD_DP32(flags, TBFLAG_A32, SCTLR_B, arm_sctlr_b(env));
> -        flags = FIELD_DP32(flags, TBFLAG_A32, NS, !access_secure_reg(env));
> -        if (env->vfp.xregs[ARM_VFP_FPEXC] & (1 << 30)
> -            || arm_el_is_aa64(env, 1)) {
> -            flags = FIELD_DP32(flags, TBFLAG_A32, VFPEN, 1);
> -        }
> -        flags = FIELD_DP32(flags, TBFLAG_A32, XSCALE_CPAR, env->cp15.c15_cpar);
> -    }
> -
>      flags = FIELD_DP32(flags, TBFLAG_ANY, MMUIDX, arm_to_core_mmu_idx(mmu_idx));
> +    flags = FIELD_DP32(flags, TBFLAG_ANY, FPEXC_EL, fp_el);
>
> -    /* The SS_ACTIVE and PSTATE_SS bits correspond to the state machine
> -     * states defined in the ARM ARM for software singlestep:
> -     *  SS_ACTIVE   PSTATE.SS   State
> -     *     0            x       Inactive (the TB flag for SS is always 0)
> -     *     1            0       Active-pending
> -     *     1            1       Active-not-pending
> -     */
> -    if (arm_singlestep_active(env)) {
> -        flags = FIELD_DP32(flags, TBFLAG_ANY, SS_ACTIVE, 1);
> -        if (is_a64(env)) {
> -            if (env->pstate & PSTATE_SS) {
> -                flags = FIELD_DP32(flags, TBFLAG_ANY, PSTATE_SS, 1);
> -            }
> -        } else {
> -            if (env->uncached_cpsr & PSTATE_SS) {
> -                flags = FIELD_DP32(flags, TBFLAG_ANY, PSTATE_SS, 1);
> -            }
> -        }
> -    }
>      if (arm_cpu_data_is_big_endian(env)) {
>          flags = FIELD_DP32(flags, TBFLAG_ANY, BE_DATA, 1);
>      }
> -    flags = FIELD_DP32(flags, TBFLAG_ANY, FPEXC_EL, fp_el);
>
>      if (arm_v7m_is_handler_mode(env)) {
>          flags = FIELD_DP32(flags, TBFLAG_A32, HANDLER, 1);
> @@ -13876,8 +13769,161 @@ void cpu_get_tb_cpu_state(CPUARMState *env, target_ulong *pc,
>          flags = FIELD_DP32(flags, TBFLAG_A32, STACKCHECK, 1);
>      }
>
> -    *pflags = flags;
> +    if (arm_singlestep_active(env)) {
> +        flags = FIELD_DP32(flags, TBFLAG_ANY, SS_ACTIVE, 1);
> +    }
> +
> +    return flags;
> +}
> +
> +uint32_t rebuild_hflags_a32(CPUARMState *env, int el)
> +{
> +    uint32_t flags = 0;
> +    ARMMMUIdx mmu_idx;
> +    int fp_el;
> +
> +    flags = FIELD_DP32(flags, TBFLAG_A32, VECLEN, env->vfp.vec_len);
> +    flags = FIELD_DP32(flags, TBFLAG_A32, VECSTRIDE, env->vfp.vec_stride);
> +    flags = FIELD_DP32(flags, TBFLAG_A32, SCTLR_B, arm_sctlr_b(env));
> +    flags = FIELD_DP32(flags, TBFLAG_A32, NS, !access_secure_reg(env));
> +    if (env->vfp.xregs[ARM_VFP_FPEXC] & (1 << 30)
> +        || arm_el_is_aa64(env, 1)) {
> +        flags = FIELD_DP32(flags, TBFLAG_A32, VFPEN, 1);
> +    }
> +    flags = FIELD_DP32(flags, TBFLAG_A32, XSCALE_CPAR, env->cp15.c15_cpar);
> +
> +    mmu_idx = arm_mmu_idx(env);
> +    fp_el = fp_exception_el(env, el);
> +    return common_hflags(env, el, mmu_idx, fp_el, flags);
> +}
> +
> +uint32_t rebuild_hflags_a64(CPUARMState *env, int el)
> +{
> +    ARMCPU *cpu = arm_env_get_cpu(env);
> +    ARMMMUIdx mmu_idx = arm_mmu_idx(env);
> +    ARMMMUIdx stage1 = stage_1_mmu_idx(mmu_idx);
> +    ARMVAParameters p0 = aa64_va_parameters_both(env, 0, stage1);
> +    int fp_el = fp_exception_el(env, el);
> +    uint32_t flags = 0;
> +    uint64_t sctlr;
> +    int tbii, tbid;
> +
> +    flags = FIELD_DP32(flags, TBFLAG_ANY, AARCH64_STATE, 1);
> +
> +    /* Get control bits for tagged addresses.  */
> +    /* FIXME: ARMv8.1-VHE S2 translation regime.  */

This is technically a TODO isn't it?

> +    if (regime_el(env, stage1) < 2) {
> +        ARMVAParameters p1 = aa64_va_parameters_both(env, -1, stage1);
> +        tbid = (p1.tbi << 1) | p0.tbi;
> +        tbii = tbid & ~((p1.tbid << 1) | p0.tbid);
> +    } else {
> +        tbid = p0.tbi;
> +        tbii = tbid & !p0.tbid;
> +    }
> +
> +    flags = FIELD_DP32(flags, TBFLAG_A64, TBII, tbii);
> +    flags = FIELD_DP32(flags, TBFLAG_A64, TBID, tbid);
> +
> +    if (cpu_isar_feature(aa64_sve, cpu)) {
> +        int sve_el = sve_exception_el(env, el);
> +        uint32_t zcr_len;
> +
> +        /* If SVE is disabled, but FP is enabled,
> +         * then the effective len is 0.
> +         */
> +        if (sve_el != 0 && fp_el == 0) {
> +            zcr_len = 0;
> +        } else {
> +            zcr_len = sve_zcr_len_for_el(env, el);
> +        }
> +        flags = FIELD_DP32(flags, TBFLAG_A64, SVEEXC_EL, sve_el);
> +        flags = FIELD_DP32(flags, TBFLAG_A64, ZCR_LEN, zcr_len);
> +    }
> +
> +    if (el == 0) {
> +        /* FIXME: ARMv8.1-VHE S2 translation regime.  */
> +        sctlr = env->cp15.sctlr_el[1];
> +    } else {
> +        sctlr = env->cp15.sctlr_el[el];
> +    }
> +    if (cpu_isar_feature(aa64_pauth, cpu)) {
> +        /*
> +         * In order to save space in flags, we record only whether
> +         * pauth is "inactive", meaning all insns are implemented as
> +         * a nop, or "active" when some action must be performed.
> +         * The decision of which action to take is left to a helper.
> +         */
> +        if (sctlr & (SCTLR_EnIA | SCTLR_EnIB | SCTLR_EnDA | SCTLR_EnDB)) {
> +            flags = FIELD_DP32(flags, TBFLAG_A64, PAUTH_ACTIVE, 1);
> +        }
> +    }
> +
> +    if (cpu_isar_feature(aa64_bti, cpu)) {
> +        /* Note that SCTLR_EL[23].BT == SCTLR_BT1.  */
> +        if (sctlr & (el == 0 ? SCTLR_BT0 : SCTLR_BT1)) {
> +            flags = FIELD_DP32(flags, TBFLAG_A64, BT, 1);
> +        }
> +        flags = FIELD_DP32(flags, TBFLAG_A64, BTYPE, env->btype);
> +    }
> +
> +    return common_hflags(env, el, mmu_idx, fp_el, flags);
> +}
> +
> +void arm_rebuild_hflags(CPUARMState *env)
> +{
> +    int el = arm_current_el(env);
> +    env->hflags = (is_a64(env)
> +                   ? rebuild_hflags_a64(env, el)
> +                   : rebuild_hflags_a32(env, el));
> +}
> +
> +void HELPER(rebuild_hflags_a32)(CPUARMState *env, uint32_t el)
> +{
> +    tcg_debug_assert(!is_a64(env));
> +    env->hflags = rebuild_hflags_a32(env, el);
> +}
> +
> +void HELPER(rebuild_hflags_a64)(CPUARMState *env, uint32_t el)
> +{
> +    tcg_debug_assert(is_a64(env));
> +    env->hflags = rebuild_hflags_a64(env, el);
> +}
> +
> +void cpu_get_tb_cpu_state(CPUARMState *env, target_ulong *pc,
> +                          target_ulong *cs_base, uint32_t *pflags)
> +{
> +    int current_el = arm_current_el(env);
> +    uint32_t flags;
> +    uint32_t pstate_for_ss;
> +
>      *cs_base = 0;
> +    if (is_a64(env)) {
> +        *pc = env->pc;
> +        flags = rebuild_hflags_a64(env, current_el);
> +        flags = FIELD_DP32(flags, TBFLAG_A64, BTYPE, env->btype);
> +        pstate_for_ss = env->pstate;
> +    } else {
> +        *pc = env->regs[15];
> +        flags = rebuild_hflags_a32(env, current_el);
> +        flags = FIELD_DP32(flags, TBFLAG_A32, THUMB, env->thumb);
> +        flags = FIELD_DP32(flags, TBFLAG_A32, CONDEXEC, env->condexec_bits);
> +        pstate_for_ss = env->uncached_cpsr;
> +    }
> +
> +    /* The SS_ACTIVE and PSTATE_SS bits correspond to the state machine
> +     * states defined in the ARM ARM for software singlestep:
> +     *  SS_ACTIVE   PSTATE.SS   State
> +     *     0            x       Inactive (the TB flag for SS is always 0)
> +     *     1            0       Active-pending
> +     *     1            1       Active-not-pending
> +     * SS_ACTIVE is set in hflags; PSTATE_SS is computed every TB.
> +     */
> +    if (FIELD_EX32(flags, TBFLAG_ANY, SS_ACTIVE)
> +        && (pstate_for_ss & PSTATE_SS)) {
> +        flags = FIELD_DP32(flags, TBFLAG_ANY, PSTATE_SS, 1);
> +    }
> +
> +    *pflags = flags;
>  }
>
>  #ifdef TARGET_AARCH64

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>

--
Alex Bennée

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [Qemu-devel] [PATCH 2/4] target/arm: Rebuild hflags at el changes and MSR writes
  2019-02-14  4:06 ` [Qemu-devel] [PATCH 2/4] target/arm: Rebuild hflags at el changes and MSR writes Richard Henderson
@ 2019-02-19 14:48   ` Alex Bennée
  2019-02-19 15:10     ` Richard Henderson
  2019-02-19 19:43   ` Alex Bennée
  1 sibling, 1 reply; 17+ messages in thread
From: Alex Bennée @ 2019-02-19 14:48 UTC (permalink / raw)
  To: Richard Henderson; +Cc: qemu-devel, peter.maydell, cota


Richard Henderson <richard.henderson@linaro.org> writes:

> Now setting, but not relying upon, env->hflags.
>
> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
> ---
>  target/arm/internals.h     |  1 +
>  linux-user/syscall.c       |  1 +
>  target/arm/cpu.c           |  1 +
>  target/arm/helper-a64.c    |  3 +++
>  target/arm/helper.c        |  2 ++
>  target/arm/machine.c       |  1 +
>  target/arm/op_helper.c     |  1 +
>  target/arm/translate-a64.c |  6 +++++-
>  target/arm/translate.c     | 14 ++++++++++++--
>  9 files changed, 27 insertions(+), 3 deletions(-)
>
> diff --git a/target/arm/internals.h b/target/arm/internals.h
> index 8c1b813364..235f4fafec 100644
> --- a/target/arm/internals.h
> +++ b/target/arm/internals.h
> @@ -970,5 +970,6 @@ ARMVAParameters aa64_va_parameters(CPUARMState *env, uint64_t va,
>
>  uint32_t rebuild_hflags_a32(CPUARMState *env, int el);
>  uint32_t rebuild_hflags_a64(CPUARMState *env, int el);
> +void rebuild_hflags_any(CPUARMState *env);
>
>  #endif
> diff --git a/linux-user/syscall.c b/linux-user/syscall.c
> index 5bbb72f3d5..123f342bdc 100644
> --- a/linux-user/syscall.c
> +++ b/linux-user/syscall.c
> @@ -9691,6 +9691,7 @@ static abi_long do_syscall1(void *cpu_env, int num, abi_long arg1,
>                      aarch64_sve_narrow_vq(env, vq);
>                  }
>                  env->vfp.zcr_el[1] = vq - 1;
> +                arm_rebuild_hflags(env);
>                  ret = vq * 16;
>              }
>              return ret;
> diff --git a/target/arm/cpu.c b/target/arm/cpu.c
> index edf6e0e1f1..e4da513eb3 100644
> --- a/target/arm/cpu.c
> +++ b/target/arm/cpu.c
> @@ -390,6 +390,7 @@ static void arm_cpu_reset(CPUState *s)
>
>      hw_breakpoint_update_all(cpu);
>      hw_watchpoint_update_all(cpu);
> +    arm_rebuild_hflags(env);
>  }
>
>  bool arm_cpu_exec_interrupt(CPUState *cs, int interrupt_request)
> diff --git a/target/arm/helper-a64.c b/target/arm/helper-a64.c
> index 101fa6d3ea..f51cb75444 100644
> --- a/target/arm/helper-a64.c
> +++ b/target/arm/helper-a64.c
> @@ -995,6 +995,7 @@ void HELPER(exception_return)(CPUARMState *env, uint64_t new_pc)
>          } else {
>              env->regs[15] = new_pc & ~0x3;
>          }
> +        rebuild_hflags_a32(env, new_el);
>          qemu_log_mask(CPU_LOG_INT, "Exception return from AArch64 EL%d to "
>                        "AArch32 EL%d PC 0x%" PRIx32 "\n",
>                        cur_el, new_el, env->regs[15]);
> @@ -1006,10 +1007,12 @@ void HELPER(exception_return)(CPUARMState *env, uint64_t new_pc)
>          }
>          aarch64_restore_sp(env, new_el);
>          env->pc = new_pc;
> +        rebuild_hflags_a64(env, new_el);
>          qemu_log_mask(CPU_LOG_INT, "Exception return from AArch64 EL%d to "
>                        "AArch64 EL%d PC 0x%" PRIx64 "\n",
>                        cur_el, new_el, env->pc);
>      }
> +
>      /*
>       * Note that cur_el can never be 0.  If new_el is 0, then
>       * el0_a64 is return_to_aa64, else el0_a64 is ignored.
> diff --git a/target/arm/helper.c b/target/arm/helper.c
> index 7a77f53ba8..d8249f0eae 100644
> --- a/target/arm/helper.c
> +++ b/target/arm/helper.c
> @@ -9081,6 +9081,7 @@ static void take_aarch32_exception(CPUARMState *env, int new_mode,
>          env->regs[14] = env->regs[15] + offset;
>      }
>      env->regs[15] = newpc;
> +    env->hflags = rebuild_hflags_a32(env, arm_current_el(env));
>  }
>
>  static void arm_cpu_do_interrupt_aarch32_hyp(CPUState *cs)
> @@ -9426,6 +9427,7 @@ static void arm_cpu_do_interrupt_aarch64(CPUState *cs)
>
>      pstate_write(env, PSTATE_DAIF | new_mode);
>      env->aarch64 = 1;
> +    env->hflags = rebuild_hflags_a64(env, new_el);
>      aarch64_restore_sp(env, new_el);
>
>      env->pc = addr;
> diff --git a/target/arm/machine.c b/target/arm/machine.c
> index b292549614..61889801dd 100644
> --- a/target/arm/machine.c
> +++ b/target/arm/machine.c
> @@ -743,6 +743,7 @@ static int cpu_post_load(void *opaque, int version_id)
>      if (!kvm_enabled()) {
>          pmu_op_finish(&cpu->env);
>      }
> +    arm_rebuild_hflags(&cpu->env);
>
>      return 0;
>  }
> diff --git a/target/arm/op_helper.c b/target/arm/op_helper.c
> index c998eadfaa..8d459d9a84 100644
> --- a/target/arm/op_helper.c
> +++ b/target/arm/op_helper.c
> @@ -571,6 +571,7 @@ void HELPER(cpsr_write_eret)(CPUARMState *env, uint32_t val)
>       */
>      env->regs[15] &= (env->thumb ? ~1 : ~3);
>
> +    rebuild_hflags_a32(env, arm_current_el(env));
>      qemu_mutex_lock_iothread();
>      arm_call_el_change_hook(arm_env_get_cpu(env));
>      qemu_mutex_unlock_iothread();
> diff --git a/target/arm/translate-a64.c b/target/arm/translate-a64.c
> index e002251ac6..ad665c1609 100644
> --- a/target/arm/translate-a64.c
> +++ b/target/arm/translate-a64.c
> @@ -1841,11 +1841,15 @@ static void handle_sys(DisasContext *s, uint32_t insn, bool isread,
>          /* I/O operations must end the TB here (whether read or write) */
>          gen_io_end();
>          s->base.is_jmp = DISAS_UPDATE;
> -    } else if (!isread && !(ri->type & ARM_CP_SUPPRESS_TB_END)) {
> +    }
> +    if (!isread && !(ri->type & ARM_CP_SUPPRESS_TB_END)) {

Does this potentially introduce a icount bug (or fix an existing bug)?

>          /* We default to ending the TB on a coprocessor register write,
>           * but allow this to be suppressed by the register definition
>           * (usually only necessary to work around guest bugs).
>           */
> +        TCGv_i32 tcg_el = tcg_const_i32(s->current_el);
> +        gen_helper_rebuild_hflags_a64(cpu_env, tcg_el);
> +        tcg_temp_free_i32(tcg_el);
>          s->base.is_jmp = DISAS_UPDATE;
>      }
>  }
> diff --git a/target/arm/translate.c b/target/arm/translate.c
> index 66cf28c8cb..36842a27b3 100644
> --- a/target/arm/translate.c
> +++ b/target/arm/translate.c
> @@ -8432,6 +8432,8 @@ static int disas_coproc_insn(DisasContext *s, uint32_t insn)
>      ri = get_arm_cp_reginfo(s->cp_regs,
>              ENCODE_CP_REG(cpnum, is64, s->ns, crn, crm, opc1, opc2));
>      if (ri) {
> +        bool need_exit_tb;
> +
>          /* Check access permissions */
>          if (!cp_access_ok(s->current_el, ri, isread)) {
>              return 1;
> @@ -8604,15 +8606,23 @@ static int disas_coproc_insn(DisasContext *s, uint32_t insn)
>              }
>          }
>
> +        need_exit_tb = false;

I'd of initialised this with false up above but I guess that's more a
stylistic choice.

>          if ((tb_cflags(s->base.tb) & CF_USE_ICOUNT) && (ri->type & ARM_CP_IO)) {
>              /* I/O operations must end the TB here (whether read or write) */
>              gen_io_end();
> -            gen_lookup_tb(s);
> -        } else if (!isread && !(ri->type & ARM_CP_SUPPRESS_TB_END)) {
> +            need_exit_tb = true;
> +        }
> +        if (!isread && !(ri->type & ARM_CP_SUPPRESS_TB_END)) {
>              /* We default to ending the TB on a coprocessor register write,
>               * but allow this to be suppressed by the register definition
>               * (usually only necessary to work around guest bugs).
>               */
> +            TCGv_i32 tcg_el = tcg_const_i32(s->current_el);
> +            gen_helper_rebuild_hflags_a32(cpu_env, tcg_el);
> +            tcg_temp_free_i32(tcg_el);
> +            need_exit_tb = true;
> +        }
> +        if (need_exit_tb) {
>              gen_lookup_tb(s);
>          }

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>

--
Alex Bennée

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [Qemu-devel] [PATCH 3/4] target/arm: Assert hflags is correct in cpu_get_tb_cpu_state
  2019-02-14  4:06 ` [Qemu-devel] [PATCH 3/4] target/arm: Assert hflags is correct in cpu_get_tb_cpu_state Richard Henderson
@ 2019-02-19 14:53   ` Alex Bennée
  2019-02-19 15:23   ` Alex Bennée
  1 sibling, 0 replies; 17+ messages in thread
From: Alex Bennée @ 2019-02-19 14:53 UTC (permalink / raw)
  To: Richard Henderson; +Cc: qemu-devel, peter.maydell, cota


Richard Henderson <richard.henderson@linaro.org> writes:

> Make sure that we are updating env->hflags everywhere required.
>
> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>

Arguably this is a debugging aid and we don't need it the git history,
nevertheless:

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>

> ---
>  target/arm/helper.c | 2 ++
>  1 file changed, 2 insertions(+)
>
> diff --git a/target/arm/helper.c b/target/arm/helper.c
> index d8249f0eae..3c8724883d 100644
> --- a/target/arm/helper.c
> +++ b/target/arm/helper.c
> @@ -13902,11 +13902,13 @@ void cpu_get_tb_cpu_state(CPUARMState *env, target_ulong *pc,
>      if (is_a64(env)) {
>          *pc = env->pc;
>          flags = rebuild_hflags_a64(env, current_el);
> +        assert(flags == env->hflags);
>          flags = FIELD_DP32(flags, TBFLAG_A64, BTYPE, env->btype);
>          pstate_for_ss = env->pstate;
>      } else {
>          *pc = env->regs[15];
>          flags = rebuild_hflags_a32(env, current_el);
> +        assert(flags == env->hflags);
>          flags = FIELD_DP32(flags, TBFLAG_A32, THUMB, env->thumb);
>          flags = FIELD_DP32(flags, TBFLAG_A32, CONDEXEC, env->condexec_bits);
>          pstate_for_ss = env->uncached_cpsr;


--
Alex Bennée

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [Qemu-devel] [PATCH 1/4] target/arm: Split out recompute_hflags et al
  2019-02-19 11:06   ` Alex Bennée
@ 2019-02-19 15:06     ` Richard Henderson
  0 siblings, 0 replies; 17+ messages in thread
From: Richard Henderson @ 2019-02-19 15:06 UTC (permalink / raw)
  To: Alex Bennée; +Cc: qemu-devel, peter.maydell, cota

On 2/19/19 3:06 AM, Alex Bennée wrote:
>> +    /* FIXME: ARMv8.1-VHE S2 translation regime.  */
> This is technically a TODO isn't it?
> 

Yeah, sure, but we aren't good about that categorization.
And anyway this is code movement.


r~

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [Qemu-devel] [PATCH 2/4] target/arm: Rebuild hflags at el changes and MSR writes
  2019-02-19 14:48   ` Alex Bennée
@ 2019-02-19 15:10     ` Richard Henderson
  0 siblings, 0 replies; 17+ messages in thread
From: Richard Henderson @ 2019-02-19 15:10 UTC (permalink / raw)
  To: Alex Bennée; +Cc: qemu-devel, peter.maydell, cota

On 2/19/19 6:48 AM, Alex Bennée wrote:
>> +++ b/target/arm/translate-a64.c
>> @@ -1841,11 +1841,15 @@ static void handle_sys(DisasContext *s, uint32_t insn, bool isread,
>>          /* I/O operations must end the TB here (whether read or write) */
>>          gen_io_end();
>>          s->base.is_jmp = DISAS_UPDATE;
>> -    } else if (!isread && !(ri->type & ARM_CP_SUPPRESS_TB_END)) {
>> +    }
>> +    if (!isread && !(ri->type & ARM_CP_SUPPRESS_TB_END)) {
> Does this potentially introduce a icount bug (or fix an existing bug)?
> 
>>          /* We default to ending the TB on a coprocessor register write,
>>           * but allow this to be suppressed by the register definition
>>           * (usually only necessary to work around guest bugs).
>>           */
>> +        TCGv_i32 tcg_el = tcg_const_i32(s->current_el);
>> +        gen_helper_rebuild_hflags_a64(cpu_env, tcg_el);
>> +        tcg_temp_free_i32(tcg_el);
>>          s->base.is_jmp = DISAS_UPDATE;
>>      }

Neither.

Previously, all that was required was that either icount or !SUPPRESS_TB_END
and a TB, and that was done simply by setting DISAS_UPDATE.  Which both if
blocks did.

Now, icount ends a TB (still by setting DISAS_UPDATE).  But if !SUPPRESS_TB_END
then we must also rebuild hflags (with a possible harmless double-set of
DISAS_UPDATE).


r~

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [Qemu-devel] [PATCH 3/4] target/arm: Assert hflags is correct in cpu_get_tb_cpu_state
  2019-02-14  4:06 ` [Qemu-devel] [PATCH 3/4] target/arm: Assert hflags is correct in cpu_get_tb_cpu_state Richard Henderson
  2019-02-19 14:53   ` Alex Bennée
@ 2019-02-19 15:23   ` Alex Bennée
  1 sibling, 0 replies; 17+ messages in thread
From: Alex Bennée @ 2019-02-19 15:23 UTC (permalink / raw)
  To: Richard Henderson; +Cc: qemu-devel, peter.maydell, cota


Richard Henderson <richard.henderson@linaro.org> writes:

> Make sure that we are updating env->hflags everywhere required.

It's a good job you added it thought because on this commit with:

  -machine virt,graphics=on,gic-version=3,virtualization=on

We get:

  qemu-system-aarch64: /home/alex.bennee/lsrc/qemu.git/target/arm/helper.c:14045: cpu_get_tb_cpu_state: Assertion `flags == env->hflags' failed.

  Thread 3 "qemu-system-aar" received signal SIGABRT, Aborted.
  [Switching to Thread 0x7fffde05e700 (LWP 20621)]
  __GI_raise (sig=sig@entry=6) at ../sysdeps/unix/sysv/linux/raise.c:51
  51      ../sysdeps/unix/sysv/linux/raise.c: No such file or directory.
  (gdb) bt
  #0  0x00007ffff278fe97 in __GI_raise (sig=sig@entry=6) at ../sysdeps/unix/sysv/linux/raise.c:51
  #1  0x00007ffff2791801 in __GI_abort () at abort.c:79
  #2  0x00007ffff278139a in __assert_fail_base (fmt=0x7ffff29087d8 "%s%s%s:%u: %s%sAssertion `%s' failed.\n%n", assertion=assertion@entry=0x555555dbb012 "flags == env->hflags", file=file@entry=0x555555db96e8 "/home/alex.bennee/lsrc/qemu.git/target/arm/helper.c", line=line@entry=14045, function=function@entry=0x555555dbd810 <__PRETTY_FUNCTION__.43682> "cpu_get_tb_cpu_state") at assert.c:92
  #3  0x00007ffff2781412 in __GI___assert_fail (assertion=assertion@entry=0x555555dbb012 "flags == env->hflags", file=file@entry=0x555555db96e8 "/home/alex.bennee/lsrc/qemu.git/target/arm/helper.c", line=line@entry=14045, function=function@entry=0x555555dbd810 <__PRETTY_FUNCTION__.43682> "cpu_get_tb_cpu_state") at assert.c:101
  #4  0x00005555559bf437 in cpu_get_tb_cpu_state (env=0x5555567150e0, pc=pc@entry=0x7fffde05b0b8, cs_base=cs_base@entry=0x7fffde05b0b0, pflags=pflags@entry=0x7fffde05b0a8) at
  /home/alex.bennee/lsrc/qemu.git/target/arm/helper.c:14045
  #5  0x00005555558ab09b in tb_lookup__cpu_state (cf_mask=524288, flags=0x7fffde05b0a8, cs_base=0x7fffde05b0b0, pc=0x7fffde05b0b8, cpu=0x0) at /home/alex.bennee/lsrc/qemu.git/include/exec/tb-lookup.h:28
  #6  0x00005555558ab09b in tb_find (cf_mask=524288, tb_exit=0, last_tb=0x0, cpu=0x0) at /home/alex.bennee/lsrc/qemu.git/accel/tcg/cpu-exec.c:404
  #7  0x00005555558ab09b in cpu_exec (cpu=cpu@entry=0x55555670ce30) at /home/alex.bennee/lsrc/qemu.git/accel/tcg/cpu-exec.c:728
  #8  0x000055555586963f in tcg_cpu_exec (cpu=0x55555670ce30) at /home/alex.bennee/lsrc/qemu.git/cpus.c:1429
  #9  0x000055555586b7c3 in qemu_tcg_cpu_thread_fn (arg=arg@entry=0x55555670ce30) at /home/alex.bennee/lsrc/qemu.git/cpus.c:1733
  #10 0x0000555555d44e06 in qemu_thread_start (args=<optimized out>) at util/qemu-thread-posix.c:502
  #11 0x00007ffff2b496db in start_thread (arg=0x7fffde05e700) at pthread_create.c:463
  #12 0x00007ffff287288f in clone () at ../sysdeps/unix/sysv/linux/x86_64/clone.S:95


>
> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
> ---
>  target/arm/helper.c | 2 ++
>  1 file changed, 2 insertions(+)
>
> diff --git a/target/arm/helper.c b/target/arm/helper.c
> index d8249f0eae..3c8724883d 100644
> --- a/target/arm/helper.c
> +++ b/target/arm/helper.c
> @@ -13902,11 +13902,13 @@ void cpu_get_tb_cpu_state(CPUARMState *env, target_ulong *pc,
>      if (is_a64(env)) {
>          *pc = env->pc;
>          flags = rebuild_hflags_a64(env, current_el);
> +        assert(flags == env->hflags);
>          flags = FIELD_DP32(flags, TBFLAG_A64, BTYPE, env->btype);
>          pstate_for_ss = env->pstate;
>      } else {
>          *pc = env->regs[15];
>          flags = rebuild_hflags_a32(env, current_el);
> +        assert(flags == env->hflags);
>          flags = FIELD_DP32(flags, TBFLAG_A32, THUMB, env->thumb);
>          flags = FIELD_DP32(flags, TBFLAG_A32, CONDEXEC, env->condexec_bits);
>          pstate_for_ss = env->uncached_cpsr;


--
Alex Bennée

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [Qemu-devel] [PATCH 2/4] target/arm: Rebuild hflags at el changes and MSR writes
  2019-02-14  4:06 ` [Qemu-devel] [PATCH 2/4] target/arm: Rebuild hflags at el changes and MSR writes Richard Henderson
  2019-02-19 14:48   ` Alex Bennée
@ 2019-02-19 19:43   ` Alex Bennée
  1 sibling, 0 replies; 17+ messages in thread
From: Alex Bennée @ 2019-02-19 19:43 UTC (permalink / raw)
  To: Richard Henderson; +Cc: qemu-devel, peter.maydell, cota


Richard Henderson <richard.henderson@linaro.org> writes:

> Now setting, but not relying upon, env->hflags.
>
> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
> ---
>  target/arm/internals.h     |  1 +
>  linux-user/syscall.c       |  1 +
>  target/arm/cpu.c           |  1 +
>  target/arm/helper-a64.c    |  3 +++
>  target/arm/helper.c        |  2 ++
>  target/arm/machine.c       |  1 +
>  target/arm/op_helper.c     |  1 +
>  target/arm/translate-a64.c |  6 +++++-
>  target/arm/translate.c     | 14 ++++++++++++--
>  9 files changed, 27 insertions(+), 3 deletions(-)
>
> diff --git a/target/arm/internals.h b/target/arm/internals.h
> index 8c1b813364..235f4fafec 100644
> --- a/target/arm/internals.h
> +++ b/target/arm/internals.h
> @@ -970,5 +970,6 @@ ARMVAParameters aa64_va_parameters(CPUARMState *env, uint64_t va,
>
>  uint32_t rebuild_hflags_a32(CPUARMState *env, int el);
>  uint32_t rebuild_hflags_a64(CPUARMState *env, int el);
> +void rebuild_hflags_any(CPUARMState *env);
>
>  #endif
> diff --git a/linux-user/syscall.c b/linux-user/syscall.c
> index 5bbb72f3d5..123f342bdc 100644
> --- a/linux-user/syscall.c
> +++ b/linux-user/syscall.c
> @@ -9691,6 +9691,7 @@ static abi_long do_syscall1(void *cpu_env, int num, abi_long arg1,
>                      aarch64_sve_narrow_vq(env, vq);
>                  }
>                  env->vfp.zcr_el[1] = vq - 1;
> +                arm_rebuild_hflags(env);
>                  ret = vq * 16;
>              }
>              return ret;
> diff --git a/target/arm/cpu.c b/target/arm/cpu.c
> index edf6e0e1f1..e4da513eb3 100644
> --- a/target/arm/cpu.c
> +++ b/target/arm/cpu.c
> @@ -390,6 +390,7 @@ static void arm_cpu_reset(CPUState *s)
>
>      hw_breakpoint_update_all(cpu);
>      hw_watchpoint_update_all(cpu);
> +    arm_rebuild_hflags(env);
>  }
>
>  bool arm_cpu_exec_interrupt(CPUState *cs, int interrupt_request)
> diff --git a/target/arm/helper-a64.c b/target/arm/helper-a64.c
> index 101fa6d3ea..f51cb75444 100644
> --- a/target/arm/helper-a64.c
> +++ b/target/arm/helper-a64.c
> @@ -995,6 +995,7 @@ void HELPER(exception_return)(CPUARMState *env, uint64_t new_pc)
>          } else {
>              env->regs[15] = new_pc & ~0x3;
>          }
> +        rebuild_hflags_a32(env, new_el);

env->hflags = rebuild_hflags_a32(env, new_el);

>          qemu_log_mask(CPU_LOG_INT, "Exception return from AArch64 EL%d to "
>                        "AArch32 EL%d PC 0x%" PRIx32 "\n",
>                        cur_el, new_el, env->regs[15]);
> @@ -1006,10 +1007,12 @@ void HELPER(exception_return)(CPUARMState *env, uint64_t new_pc)
>          }
>          aarch64_restore_sp(env, new_el);
>          env->pc = new_pc;
> +        rebuild_hflags_a64(env, new_el);

env->flags = rebuild_hflags_a64(env, new_el);

Or possibly call arm_rebuild_hflags() instead.

>          qemu_log_mask(CPU_LOG_INT, "Exception return from AArch64 EL%d to "
>                        "AArch64 EL%d PC 0x%" PRIx64 "\n",
>                        cur_el, new_el, env->pc);
>      }
> +
>      /*
>       * Note that cur_el can never be 0.  If new_el is 0, then
>       * el0_a64 is return_to_aa64, else el0_a64 is ignored.
> diff --git a/target/arm/helper.c b/target/arm/helper.c
> index 7a77f53ba8..d8249f0eae 100644
> --- a/target/arm/helper.c
> +++ b/target/arm/helper.c
> @@ -9081,6 +9081,7 @@ static void take_aarch32_exception(CPUARMState *env, int new_mode,
>          env->regs[14] = env->regs[15] + offset;
>      }
>      env->regs[15] = newpc;
> +    env->hflags = rebuild_hflags_a32(env, arm_current_el(env));
>  }
>
>  static void arm_cpu_do_interrupt_aarch32_hyp(CPUState *cs)
> @@ -9426,6 +9427,7 @@ static void arm_cpu_do_interrupt_aarch64(CPUState *cs)
>
>      pstate_write(env, PSTATE_DAIF | new_mode);
>      env->aarch64 = 1;
> +    env->hflags = rebuild_hflags_a64(env, new_el);
>      aarch64_restore_sp(env, new_el);
>
>      env->pc = addr;
> diff --git a/target/arm/machine.c b/target/arm/machine.c
> index b292549614..61889801dd 100644
> --- a/target/arm/machine.c
> +++ b/target/arm/machine.c
> @@ -743,6 +743,7 @@ static int cpu_post_load(void *opaque, int version_id)
>      if (!kvm_enabled()) {
>          pmu_op_finish(&cpu->env);
>      }
> +    arm_rebuild_hflags(&cpu->env);
>
>      return 0;
>  }
> diff --git a/target/arm/op_helper.c b/target/arm/op_helper.c
> index c998eadfaa..8d459d9a84 100644
> --- a/target/arm/op_helper.c
> +++ b/target/arm/op_helper.c
> @@ -571,6 +571,7 @@ void HELPER(cpsr_write_eret)(CPUARMState *env, uint32_t val)
>       */
>      env->regs[15] &= (env->thumb ? ~1 : ~3);
>
> +    rebuild_hflags_a32(env, arm_current_el(env));
>      qemu_mutex_lock_iothread();
>      arm_call_el_change_hook(arm_env_get_cpu(env));
>      qemu_mutex_unlock_iothread();
> diff --git a/target/arm/translate-a64.c b/target/arm/translate-a64.c
> index e002251ac6..ad665c1609 100644
> --- a/target/arm/translate-a64.c
> +++ b/target/arm/translate-a64.c
> @@ -1841,11 +1841,15 @@ static void handle_sys(DisasContext *s, uint32_t insn, bool isread,
>          /* I/O operations must end the TB here (whether read or write) */
>          gen_io_end();
>          s->base.is_jmp = DISAS_UPDATE;
> -    } else if (!isread && !(ri->type & ARM_CP_SUPPRESS_TB_END)) {
> +    }
> +    if (!isread && !(ri->type & ARM_CP_SUPPRESS_TB_END)) {
>          /* We default to ending the TB on a coprocessor register write,
>           * but allow this to be suppressed by the register definition
>           * (usually only necessary to work around guest bugs).
>           */
> +        TCGv_i32 tcg_el = tcg_const_i32(s->current_el);
> +        gen_helper_rebuild_hflags_a64(cpu_env, tcg_el);
> +        tcg_temp_free_i32(tcg_el);
>          s->base.is_jmp = DISAS_UPDATE;
>      }
>  }
> diff --git a/target/arm/translate.c b/target/arm/translate.c
> index 66cf28c8cb..36842a27b3 100644
> --- a/target/arm/translate.c
> +++ b/target/arm/translate.c
> @@ -8432,6 +8432,8 @@ static int disas_coproc_insn(DisasContext *s, uint32_t insn)
>      ri = get_arm_cp_reginfo(s->cp_regs,
>              ENCODE_CP_REG(cpnum, is64, s->ns, crn, crm, opc1, opc2));
>      if (ri) {
> +        bool need_exit_tb;
> +
>          /* Check access permissions */
>          if (!cp_access_ok(s->current_el, ri, isread)) {
>              return 1;
> @@ -8604,15 +8606,23 @@ static int disas_coproc_insn(DisasContext *s, uint32_t insn)
>              }
>          }
>
> +        need_exit_tb = false;
>          if ((tb_cflags(s->base.tb) & CF_USE_ICOUNT) && (ri->type & ARM_CP_IO)) {
>              /* I/O operations must end the TB here (whether read or write) */
>              gen_io_end();
> -            gen_lookup_tb(s);
> -        } else if (!isread && !(ri->type & ARM_CP_SUPPRESS_TB_END)) {
> +            need_exit_tb = true;
> +        }
> +        if (!isread && !(ri->type & ARM_CP_SUPPRESS_TB_END)) {
>              /* We default to ending the TB on a coprocessor register write,
>               * but allow this to be suppressed by the register definition
>               * (usually only necessary to work around guest bugs).
>               */
> +            TCGv_i32 tcg_el = tcg_const_i32(s->current_el);
> +            gen_helper_rebuild_hflags_a32(cpu_env, tcg_el);
> +            tcg_temp_free_i32(tcg_el);
> +            need_exit_tb = true;
> +        }
> +        if (need_exit_tb) {
>              gen_lookup_tb(s);
>          }


--
Alex Bennée

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [Qemu-devel] [PATCH 4/4] target/arm: Rely on hflags correct in cpu_get_tb_cpu_state
  2019-02-14  4:06 ` [Qemu-devel] [PATCH 4/4] target/arm: Rely on hflags " Richard Henderson
@ 2019-02-19 20:17   ` Alex Bennée
  2019-02-19 22:40     ` Richard Henderson
  0 siblings, 1 reply; 17+ messages in thread
From: Alex Bennée @ 2019-02-19 20:17 UTC (permalink / raw)
  To: Richard Henderson; +Cc: qemu-devel, peter.maydell, cota


Richard Henderson <richard.henderson@linaro.org> writes:

> This is the payoff.

\o/

Running my own silly pigz benchmark:

Before:
  Time (mean ± σ):     66.035 s ±  1.425 s

After:
  Time (mean ± σ):     57.191 s ±  0.675 s

>
> From perf record -g data of ubuntu 18 boot and shutdown:
>
> BEFORE:
>
> -   23.02%     2.82%  qemu-system-aar  [.] helper_lookup_tb_ptr
>    - 20.22% helper_lookup_tb_ptr
>       + 10.05% tb_htable_lookup
>       - 9.13% cpu_get_tb_cpu_state
>            3.20% aa64_va_parameters_both
>            0.55% fp_exception_el
>
> -   11.66%     4.74%  qemu-system-aar  [.] cpu_get_tb_cpu_state
>    - 6.96% cpu_get_tb_cpu_state
>         3.63% aa64_va_parameters_both
>         0.60% fp_exception_el
>         0.53% sve_exception_el
>
> AFTER:
>
> -   16.40%     3.40%  qemu-system-aar  [.] helper_lookup_tb_ptr
>    - 13.03% helper_lookup_tb_ptr
>       + 11.19% tb_htable_lookup
>         0.55% cpu_get_tb_cpu_state
>
>      0.98%     0.71%  qemu-system-aar  [.] cpu_get_tb_cpu_state
>
>      0.87%     0.24%  qemu-system-aar  [.] rebuild_hflags_a64
>
> Before, helper_lookup_tb_ptr is the second hottest function in the
> application, consuming almost a quarter of the runtime.  Within the
> entire execution, cpu_get_tb_cpu_state consumes about 12%.
>
> After, helper_lookup_tb_ptr has dropped to the fourth hottest function,
> with consumption dropping to a sixth of the runtime.  Within the
> entire execution, cpu_get_tb_cpu_state has dropped below 1%, and the
> supporting function to rebuild hflags also consumes about 1%.
>
> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
> ---
>  target/arm/helper.c | 9 ++-------
>  1 file changed, 2 insertions(+), 7 deletions(-)
>
> diff --git a/target/arm/helper.c b/target/arm/helper.c
> index 3c8724883d..1bdb87267e 100644
> --- a/target/arm/helper.c
> +++ b/target/arm/helper.c
> @@ -13894,21 +13894,16 @@ void HELPER(rebuild_hflags_a64)(CPUARMState *env, uint32_t el)
>  void cpu_get_tb_cpu_state(CPUARMState *env, target_ulong *pc,
>                            target_ulong *cs_base, uint32_t *pflags)
>  {
> -    int current_el = arm_current_el(env);
> -    uint32_t flags;
> +    uint32_t flags = env->hflags;
>      uint32_t pstate_for_ss;
>
>      *cs_base = 0;
> -    if (is_a64(env)) {
> +    if (FIELD_EX32(flags, TBFLAG_ANY, AARCH64_STATE)) {
>          *pc = env->pc;
> -        flags = rebuild_hflags_a64(env, current_el);
> -        assert(flags == env->hflags);

While debugging I came up with this monstrosity:

    if (FIELD_EX32(flags, TBFLAG_ANY, AARCH64_STATE)) {
#ifdef CONFIG_DEBUG_TCG
        static uint32_t tb_state = 0;
        uint32_t recalc_flags = rebuild_hflags_a64(env, arm_current_el(env));
        tb_state++;
        if (flags != recalc_flags) {
            fprintf(stderr, "%s: flags %#x, should be %#x (%#x/%d)\n", __func__,
                    flags, recalc_flags, flags ^ recalc_flags, tb_state);
            abort();
        }
#endif
        *pc = env->pc;
        flags = FIELD_DP32(flags, TBFLAG_A64, BTYPE, env->btype);
        pstate_for_ss = env->pstate;
    } else {

I suspect given cached flags are common about the translator it might be
worth coming up with a nicer macro to encapsulate the check for
DEBUG_TCG builds.

>          flags = FIELD_DP32(flags, TBFLAG_A64, BTYPE, env->btype);
>          pstate_for_ss = env->pstate;
>      } else {
>          *pc = env->regs[15];
> -        flags = rebuild_hflags_a32(env, current_el);
> -        assert(flags == env->hflags);
>          flags = FIELD_DP32(flags, TBFLAG_A32, THUMB, env->thumb);
>          flags = FIELD_DP32(flags, TBFLAG_A32, CONDEXEC, env->condexec_bits);
>          pstate_for_ss = env->uncached_cpsr;

--
Alex Bennée

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [Qemu-devel] [PATCH 4/4] target/arm: Rely on hflags correct in cpu_get_tb_cpu_state
  2019-02-19 20:17   ` Alex Bennée
@ 2019-02-19 22:40     ` Richard Henderson
  0 siblings, 0 replies; 17+ messages in thread
From: Richard Henderson @ 2019-02-19 22:40 UTC (permalink / raw)
  To: Alex Bennée; +Cc: qemu-devel, peter.maydell, cota

On 2/19/19 12:17 PM, Alex Bennée wrote:
> While debugging I came up with this monstrosity:
> 
>     if (FIELD_EX32(flags, TBFLAG_ANY, AARCH64_STATE)) {
> #ifdef CONFIG_DEBUG_TCG
>         static uint32_t tb_state = 0;
>         uint32_t recalc_flags = rebuild_hflags_a64(env, arm_current_el(env));
>         tb_state++;
>         if (flags != recalc_flags) {
>             fprintf(stderr, "%s: flags %#x, should be %#x (%#x/%d)\n", __func__,
>                     flags, recalc_flags, flags ^ recalc_flags, tb_state);
>             abort();
>         }
> #endif
>         *pc = env->pc;
>         flags = FIELD_DP32(flags, TBFLAG_A64, BTYPE, env->btype);
>         pstate_for_ss = env->pstate;
>     } else {

I have now included

+#ifdef CONFIG_DEBUG_TCG
+    {
+        int el = arm_current_el(env);
+        uint32_t check_flags;
+        if (is_a64(env)) {
+            check_flags = rebuild_hflags_a64(env, el);
+        } else {
+            check_flags = rebuild_hflags_a32(env, el);
+        }
+        g_assert_cmphex(flags, ==, check_flags);
+    }
+#endif


r~

^ permalink raw reply	[flat|nested] 17+ messages in thread

end of thread, other threads:[~2019-02-19 22:40 UTC | newest]

Thread overview: 17+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2019-02-14  4:06 [Qemu-devel] [PATCH 0/4] target/arm: Reduce overhead of cpu_get_tb_cpu_state Richard Henderson
2019-02-14  4:06 ` [Qemu-devel] [PATCH 1/4] target/arm: Split out recompute_hflags et al Richard Henderson
2019-02-19 11:06   ` Alex Bennée
2019-02-19 15:06     ` Richard Henderson
2019-02-14  4:06 ` [Qemu-devel] [PATCH 2/4] target/arm: Rebuild hflags at el changes and MSR writes Richard Henderson
2019-02-19 14:48   ` Alex Bennée
2019-02-19 15:10     ` Richard Henderson
2019-02-19 19:43   ` Alex Bennée
2019-02-14  4:06 ` [Qemu-devel] [PATCH 3/4] target/arm: Assert hflags is correct in cpu_get_tb_cpu_state Richard Henderson
2019-02-19 14:53   ` Alex Bennée
2019-02-19 15:23   ` Alex Bennée
2019-02-14  4:06 ` [Qemu-devel] [PATCH 4/4] target/arm: Rely on hflags " Richard Henderson
2019-02-19 20:17   ` Alex Bennée
2019-02-19 22:40     ` Richard Henderson
2019-02-14 10:28 ` [Qemu-devel] [PATCH 0/4] target/arm: Reduce overhead of cpu_get_tb_cpu_state Laurent Desnogues
2019-02-14 11:05 ` Alex Bennée
2019-02-14 17:05 ` Emilio G. Cota

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.