All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 0/3] Reorg ppc64 pmu insn counting
@ 2021-12-23  3:01 Richard Henderson
  2021-12-23  3:01 ` [PATCH 1/3] target/ppc: Cache per-pmc insn and cycle count settings Richard Henderson
                   ` (3 more replies)
  0 siblings, 4 replies; 14+ messages in thread
From: Richard Henderson @ 2021-12-23  3:01 UTC (permalink / raw)
  To: qemu-devel; +Cc: danielhb413, qemu-ppc, clg, david

In contrast to Daniel's version, the code stays in power8-pmu.c,
but is better organized to not take so much overhead.

Before:

    32.97%  qemu-system-ppc  qemu-system-ppc64   [.] pmc_get_event
    20.22%  qemu-system-ppc  qemu-system-ppc64   [.] helper_insns_inc
     4.52%  qemu-system-ppc  qemu-system-ppc64   [.] hreg_compute_hflags_value
     3.30%  qemu-system-ppc  qemu-system-ppc64   [.] helper_lookup_tb_ptr
     2.68%  qemu-system-ppc  qemu-system-ppc64   [.] tcg_gen_code
     2.28%  qemu-system-ppc  qemu-system-ppc64   [.] cpu_exec
     1.84%  qemu-system-ppc  qemu-system-ppc64   [.] pmu_insn_cnt_enabled

After:

     8.42%  qemu-system-ppc  qemu-system-ppc64   [.] hreg_compute_hflags_value
     6.65%  qemu-system-ppc  qemu-system-ppc64   [.] cpu_exec
     6.63%  qemu-system-ppc  qemu-system-ppc64   [.] helper_insns_inc


r~


Richard Henderson (3):
  target/ppc: Cache per-pmc insn and cycle count settings
  target/ppc: Rewrite pmu_increment_insns
  target/ppc: Use env->pmc_cyc_cnt

 target/ppc/cpu.h         |   3 +
 target/ppc/power8-pmu.h  |  14 +--
 target/ppc/cpu_init.c    |   1 +
 target/ppc/helper_regs.c |   2 +-
 target/ppc/machine.c     |   2 +
 target/ppc/power8-pmu.c  | 230 ++++++++++++++++-----------------------
 6 files changed, 108 insertions(+), 144 deletions(-)

-- 
2.25.1



^ permalink raw reply	[flat|nested] 14+ messages in thread

* [PATCH 1/3] target/ppc: Cache per-pmc insn and cycle count settings
  2021-12-23  3:01 [PATCH 0/3] Reorg ppc64 pmu insn counting Richard Henderson
@ 2021-12-23  3:01 ` Richard Henderson
  2021-12-23  3:01 ` [PATCH 2/3] target/ppc: Rewrite pmu_increment_insns Richard Henderson
                   ` (2 subsequent siblings)
  3 siblings, 0 replies; 14+ messages in thread
From: Richard Henderson @ 2021-12-23  3:01 UTC (permalink / raw)
  To: qemu-devel; +Cc: danielhb413, qemu-ppc, clg, david

This is the combination of frozen bit and counter type, on a per
counter basis.  So far this is only used by HFLAGS_INSN_CNT, but
will be used more later.

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 target/ppc/cpu.h         |  3 +++
 target/ppc/power8-pmu.h  | 14 +++++------
 target/ppc/cpu_init.c    |  1 +
 target/ppc/helper_regs.c |  2 +-
 target/ppc/machine.c     |  2 ++
 target/ppc/power8-pmu.c  | 51 +++++++++++++++++++++++++++++++---------
 6 files changed, 53 insertions(+), 20 deletions(-)

diff --git a/target/ppc/cpu.h b/target/ppc/cpu.h
index fc66c3561d..a297a52168 100644
--- a/target/ppc/cpu.h
+++ b/target/ppc/cpu.h
@@ -1144,6 +1144,9 @@ struct CPUPPCState {
     /* Other registers */
     target_ulong spr[1024]; /* special purpose registers */
     ppc_spr_t spr_cb[1024];
+    /* Composite status for PMC[1-5] enabled and counting insns or cycles. */
+    uint8_t pmc_ins_cnt;
+    uint8_t pmc_cyc_cnt;
     /* Vector status and control register, minus VSCR_SAT */
     uint32_t vscr;
     /* VSX registers (including FP and AVR) */
diff --git a/target/ppc/power8-pmu.h b/target/ppc/power8-pmu.h
index 3ee4b4cda5..a839199561 100644
--- a/target/ppc/power8-pmu.h
+++ b/target/ppc/power8-pmu.h
@@ -13,14 +13,12 @@
 #ifndef POWER8_PMU
 #define POWER8_PMU
 
-#include "qemu/osdep.h"
-#include "cpu.h"
-#include "exec/exec-all.h"
-#include "exec/helper-proto.h"
-#include "qemu/error-report.h"
-#include "qemu/main-loop.h"
-
 void cpu_ppc_pmu_init(CPUPPCState *env);
-bool pmu_insn_cnt_enabled(CPUPPCState *env);
+
+#if defined(TARGET_PPC64) && !defined(CONFIG_USER_ONLY)
+void pmu_update_summaries(CPUPPCState *env);
+#else
+static inline void pmu_update_summaries(CPUPPCState *env) { }
+#endif
 
 #endif
diff --git a/target/ppc/cpu_init.c b/target/ppc/cpu_init.c
index 06ef15cd9e..63f9babfee 100644
--- a/target/ppc/cpu_init.c
+++ b/target/ppc/cpu_init.c
@@ -8313,6 +8313,7 @@ static void ppc_cpu_reset(DeviceState *dev)
 #endif /* CONFIG_TCG */
 #endif
 
+    pmu_update_summaries(env);
     hreg_compute_hflags(env);
     env->reserve_addr = (target_ulong)-1ULL;
     /* Be sure no exception or interrupt is pending */
diff --git a/target/ppc/helper_regs.c b/target/ppc/helper_regs.c
index b847928842..8671b7bb69 100644
--- a/target/ppc/helper_regs.c
+++ b/target/ppc/helper_regs.c
@@ -123,7 +123,7 @@ static uint32_t hreg_compute_hflags_value(CPUPPCState *env)
     }
 
 #if defined(TARGET_PPC64)
-    if (pmu_insn_cnt_enabled(env)) {
+    if (env->pmc_ins_cnt) {
         hflags |= 1 << HFLAGS_INSN_CNT;
     }
 #endif
diff --git a/target/ppc/machine.c b/target/ppc/machine.c
index 93972df58e..756d8de5d8 100644
--- a/target/ppc/machine.c
+++ b/target/ppc/machine.c
@@ -8,6 +8,7 @@
 #include "qapi/error.h"
 #include "qemu/main-loop.h"
 #include "kvm_ppc.h"
+#include "power8-pmu.h"
 
 static void post_load_update_msr(CPUPPCState *env)
 {
@@ -19,6 +20,7 @@ static void post_load_update_msr(CPUPPCState *env)
      */
     env->msr ^= env->msr_mask & ~((1ULL << MSR_TGPR) | MSR_HVB);
     ppc_store_msr(env, msr);
+    pmu_update_summaries(env);
 }
 
 static int cpu_load_old(QEMUFile *f, void *opaque, int version_id)
diff --git a/target/ppc/power8-pmu.c b/target/ppc/power8-pmu.c
index 08d1902cd5..d547e4c99b 100644
--- a/target/ppc/power8-pmu.c
+++ b/target/ppc/power8-pmu.c
@@ -11,8 +11,6 @@
  */
 
 #include "qemu/osdep.h"
-
-#include "power8-pmu.h"
 #include "cpu.h"
 #include "helper_regs.h"
 #include "exec/exec-all.h"
@@ -20,6 +18,7 @@
 #include "qemu/error-report.h"
 #include "qemu/main-loop.h"
 #include "hw/ppc/ppc.h"
+#include "power8-pmu.h"
 
 #if defined(TARGET_PPC64) && !defined(CONFIG_USER_ONLY)
 
@@ -121,18 +120,47 @@ static PMUEventType pmc_get_event(CPUPPCState *env, int sprn)
     return evt_type;
 }
 
-bool pmu_insn_cnt_enabled(CPUPPCState *env)
+void pmu_update_summaries(CPUPPCState *env)
 {
-    int sprn;
+    target_ulong mmcr0 = env->spr[SPR_POWER_MMCR0];
+    target_ulong mmcr1 = env->spr[SPR_POWER_MMCR1];
+    int ins_cnt = 0;
+    int cyc_cnt = 0;
 
-    for (sprn = SPR_POWER_PMC1; sprn <= SPR_POWER_PMC5; sprn++) {
-        if (pmc_get_event(env, sprn) == PMU_EVENT_INSTRUCTIONS ||
-            pmc_get_event(env, sprn) == PMU_EVENT_INSN_RUN_LATCH) {
-            return true;
+    if (!(mmcr0 & MMCR0_FC14) && mmcr1 != 0) {
+        target_ulong sel;
+
+        sel = extract64(mmcr1, MMCR1_PMC1EVT_EXTR, MMCR1_EVT_SIZE);
+        switch (sel) {
+        case 0x02:
+        case 0xfe:
+            ins_cnt |= 1 << 1;
+            break;
+        case 0x1e:
+        case 0xf0:
+            cyc_cnt |= 1 << 1;
+            break;
         }
+
+        sel = extract64(mmcr1, MMCR1_PMC2EVT_EXTR, MMCR1_EVT_SIZE);
+        ins_cnt |= (sel == 0x02) << 2;
+        cyc_cnt |= (sel == 0x1e) << 2;
+
+        sel = extract64(mmcr1, MMCR1_PMC3EVT_EXTR, MMCR1_EVT_SIZE);
+        ins_cnt |= (sel == 0x02) << 3;
+        cyc_cnt |= (sel == 0x1e) << 3;
+
+        sel = extract64(mmcr1, MMCR1_PMC4EVT_EXTR, MMCR1_EVT_SIZE);
+        ins_cnt |= ((sel == 0xf4) || (sel == 0x2)) << 4;
+        cyc_cnt |= (sel == 0x1e) << 3;
     }
 
-    return false;
+    ins_cnt |= !(mmcr0 & MMCR0_FC56) << 5;
+    cyc_cnt |= !(mmcr0 & MMCR0_FC56) << 6;
+
+    env->pmc_ins_cnt = ins_cnt;
+    env->pmc_cyc_cnt = cyc_cnt;
+    env->hflags = deposit32(env->hflags, HFLAGS_INSN_CNT, 1, ins_cnt != 0);
 }
 
 static bool pmu_increment_insns(CPUPPCState *env, uint32_t num_insns)
@@ -264,8 +292,9 @@ void helper_store_mmcr0(CPUPPCState *env, target_ulong value)
 
     env->spr[SPR_POWER_MMCR0] = value;
 
-    /* MMCR0 writes can change HFLAGS_PMCCCLEAR and HFLAGS_INSN_CNT */
+    /* MMCR0 writes can change HFLAGS_PMCC[01] and HFLAGS_INSN_CNT */
     hreg_compute_hflags(env);
+    pmu_update_summaries(env);
 
     /* Update cycle overflow timers with the current MMCR0 state */
     pmu_update_overflow_timers(env);
@@ -278,7 +307,7 @@ void helper_store_mmcr1(CPUPPCState *env, uint64_t value)
     env->spr[SPR_POWER_MMCR1] = value;
 
     /* MMCR1 writes can change HFLAGS_INSN_CNT */
-    hreg_compute_hflags(env);
+    pmu_update_summaries(env);
 }
 
 target_ulong helper_read_pmc(CPUPPCState *env, uint32_t sprn)
-- 
2.25.1



^ permalink raw reply related	[flat|nested] 14+ messages in thread

* [PATCH 2/3] target/ppc: Rewrite pmu_increment_insns
  2021-12-23  3:01 [PATCH 0/3] Reorg ppc64 pmu insn counting Richard Henderson
  2021-12-23  3:01 ` [PATCH 1/3] target/ppc: Cache per-pmc insn and cycle count settings Richard Henderson
@ 2021-12-23  3:01 ` Richard Henderson
  2021-12-23  3:01 ` [PATCH 3/3] target/ppc: Use env->pmc_cyc_cnt Richard Henderson
  2021-12-23 20:36 ` [PATCH 0/3] Reorg ppc64 pmu insn counting Daniel Henrique Barboza
  3 siblings, 0 replies; 14+ messages in thread
From: Richard Henderson @ 2021-12-23  3:01 UTC (permalink / raw)
  To: qemu-devel; +Cc: danielhb413, qemu-ppc, clg, david

Use the cached pmc_ins_cnt value.  Unroll the loop over the
different PMC counters.  Treat the PMC4 run-latch specially.

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 target/ppc/power8-pmu.c | 76 ++++++++++++++++++++++++++---------------
 1 file changed, 48 insertions(+), 28 deletions(-)

diff --git a/target/ppc/power8-pmu.c b/target/ppc/power8-pmu.c
index d547e4c99b..c60afa56a5 100644
--- a/target/ppc/power8-pmu.c
+++ b/target/ppc/power8-pmu.c
@@ -165,45 +165,65 @@ void pmu_update_summaries(CPUPPCState *env)
 
 static bool pmu_increment_insns(CPUPPCState *env, uint32_t num_insns)
 {
+    target_ulong mmcr0 = env->spr[SPR_POWER_MMCR0];
+    unsigned ins_cnt = env->pmc_ins_cnt;
     bool overflow_triggered = false;
-    int sprn;
+    target_ulong tmp;
 
-    /* PMC6 never counts instructions */
-    for (sprn = SPR_POWER_PMC1; sprn <= SPR_POWER_PMC5; sprn++) {
-        PMUEventType evt_type = pmc_get_event(env, sprn);
-        bool insn_event = evt_type == PMU_EVENT_INSTRUCTIONS ||
-                          evt_type == PMU_EVENT_INSN_RUN_LATCH;
-
-        if (pmc_is_inactive(env, sprn) || !insn_event) {
-            continue;
+    if (unlikely(ins_cnt & 0x1e)) {
+        if (ins_cnt & (1 << 1)) {
+            tmp = env->spr[SPR_POWER_PMC1];
+            tmp += num_insns;
+            if (tmp >= PMC_COUNTER_NEGATIVE_VAL && (mmcr0 & MMCR0_PMC1CE)) {
+                tmp = PMC_COUNTER_NEGATIVE_VAL;
+                overflow_triggered = true;
+            }
+            env->spr[SPR_POWER_PMC1] = tmp;
         }
 
-        if (evt_type == PMU_EVENT_INSTRUCTIONS) {
-            env->spr[sprn] += num_insns;
+        if (ins_cnt & (1 << 2)) {
+            tmp = env->spr[SPR_POWER_PMC2];
+            tmp += num_insns;
+            if (tmp >= PMC_COUNTER_NEGATIVE_VAL && (mmcr0 & MMCR0_PMCjCE)) {
+                tmp = PMC_COUNTER_NEGATIVE_VAL;
+                overflow_triggered = true;
+            }
+            env->spr[SPR_POWER_PMC2] = tmp;
         }
 
-        if (evt_type == PMU_EVENT_INSN_RUN_LATCH &&
-            env->spr[SPR_CTRL] & CTRL_RUN) {
-            env->spr[sprn] += num_insns;
+        if (ins_cnt & (1 << 3)) {
+            tmp = env->spr[SPR_POWER_PMC3];
+            tmp += num_insns;
+            if (tmp >= PMC_COUNTER_NEGATIVE_VAL && (mmcr0 & MMCR0_PMCjCE)) {
+                tmp = PMC_COUNTER_NEGATIVE_VAL;
+                overflow_triggered = true;
+            }
+            env->spr[SPR_POWER_PMC3] = tmp;
         }
 
-        if (env->spr[sprn] >= PMC_COUNTER_NEGATIVE_VAL &&
-            pmc_has_overflow_enabled(env, sprn)) {
+        if (ins_cnt & (1 << 4)) {
+            target_ulong mmcr1 = env->spr[SPR_POWER_MMCR1];
+            int sel = extract64(mmcr1, MMCR1_PMC4EVT_EXTR, MMCR1_EVT_SIZE);
+            if (sel == 0x02 || (env->spr[SPR_CTRL] & CTRL_RUN)) {
+                tmp = env->spr[SPR_POWER_PMC4];
+                tmp += num_insns;
+                if (tmp >= PMC_COUNTER_NEGATIVE_VAL && (mmcr0 & MMCR0_PMCjCE)) {
+                    tmp = PMC_COUNTER_NEGATIVE_VAL;
+                    overflow_triggered = true;
+                }
+                env->spr[SPR_POWER_PMC4] = tmp;
+            }
+        }
+    }
 
+    if (ins_cnt & (1 << 5)) {
+        tmp = env->spr[SPR_POWER_PMC5];
+        tmp += num_insns;
+        if (tmp >= PMC_COUNTER_NEGATIVE_VAL && (mmcr0 & MMCR0_PMCjCE)) {
+            tmp = PMC_COUNTER_NEGATIVE_VAL;
             overflow_triggered = true;
-
-            /*
-             * The real PMU will always trigger a counter overflow with
-             * PMC_COUNTER_NEGATIVE_VAL. We don't have an easy way to
-             * do that since we're counting block of instructions at
-             * the end of each translation block, and we're probably
-             * passing this value at this point.
-             *
-             * Let's write PMC_COUNTER_NEGATIVE_VAL to the overflowed
-             * counter to simulate what the real hardware would do.
-             */
-            env->spr[sprn] = PMC_COUNTER_NEGATIVE_VAL;
         }
+        env->spr[SPR_POWER_PMC5] = tmp;
     }
 
     return overflow_triggered;
-- 
2.25.1



^ permalink raw reply related	[flat|nested] 14+ messages in thread

* [PATCH 3/3] target/ppc: Use env->pmc_cyc_cnt
  2021-12-23  3:01 [PATCH 0/3] Reorg ppc64 pmu insn counting Richard Henderson
  2021-12-23  3:01 ` [PATCH 1/3] target/ppc: Cache per-pmc insn and cycle count settings Richard Henderson
  2021-12-23  3:01 ` [PATCH 2/3] target/ppc: Rewrite pmu_increment_insns Richard Henderson
@ 2021-12-23  3:01 ` Richard Henderson
  2021-12-23 20:36 ` [PATCH 0/3] Reorg ppc64 pmu insn counting Daniel Henrique Barboza
  3 siblings, 0 replies; 14+ messages in thread
From: Richard Henderson @ 2021-12-23  3:01 UTC (permalink / raw)
  To: qemu-devel; +Cc: danielhb413, qemu-ppc, clg, david

Use the cached pmc_cyc_cnt value in pmu_update_cycles
and pmc_update_overflow_timer.  This leaves pmc_get_event
and pmc_is_inactive unused, so remove them.

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 target/ppc/power8-pmu.c | 107 ++++------------------------------------
 1 file changed, 9 insertions(+), 98 deletions(-)

diff --git a/target/ppc/power8-pmu.c b/target/ppc/power8-pmu.c
index c60afa56a5..7859ea24f5 100644
--- a/target/ppc/power8-pmu.c
+++ b/target/ppc/power8-pmu.c
@@ -24,19 +24,6 @@
 
 #define PMC_COUNTER_NEGATIVE_VAL 0x80000000UL
 
-static bool pmc_is_inactive(CPUPPCState *env, int sprn)
-{
-    if (env->spr[SPR_POWER_MMCR0] & MMCR0_FC) {
-        return true;
-    }
-
-    if (sprn < SPR_POWER_PMC5) {
-        return env->spr[SPR_POWER_MMCR0] & MMCR0_FC14;
-    }
-
-    return env->spr[SPR_POWER_MMCR0] & MMCR0_FC56;
-}
-
 static bool pmc_has_overflow_enabled(CPUPPCState *env, int sprn)
 {
     if (sprn == SPR_POWER_PMC1) {
@@ -46,80 +33,6 @@ static bool pmc_has_overflow_enabled(CPUPPCState *env, int sprn)
     return env->spr[SPR_POWER_MMCR0] & MMCR0_PMCjCE;
 }
 
-/*
- * For PMCs 1-4, IBM POWER chips has support for an implementation
- * dependent event, 0x1E, that enables cycle counting. The Linux kernel
- * makes extensive use of 0x1E, so let's also support it.
- *
- * Likewise, event 0x2 is an implementation-dependent event that IBM
- * POWER chips implement (at least since POWER8) that is equivalent to
- * PM_INST_CMPL. Let's support this event on PMCs 1-4 as well.
- */
-static PMUEventType pmc_get_event(CPUPPCState *env, int sprn)
-{
-    uint8_t mmcr1_evt_extr[] = { MMCR1_PMC1EVT_EXTR, MMCR1_PMC2EVT_EXTR,
-                                 MMCR1_PMC3EVT_EXTR, MMCR1_PMC4EVT_EXTR };
-    PMUEventType evt_type = PMU_EVENT_INVALID;
-    uint8_t pmcsel;
-    int i;
-
-    if (pmc_is_inactive(env, sprn)) {
-        return PMU_EVENT_INACTIVE;
-    }
-
-    if (sprn == SPR_POWER_PMC5) {
-        return PMU_EVENT_INSTRUCTIONS;
-    }
-
-    if (sprn == SPR_POWER_PMC6) {
-        return PMU_EVENT_CYCLES;
-    }
-
-    i = sprn - SPR_POWER_PMC1;
-    pmcsel = extract64(env->spr[SPR_POWER_MMCR1], mmcr1_evt_extr[i],
-                       MMCR1_EVT_SIZE);
-
-    switch (pmcsel) {
-    case 0x2:
-        evt_type = PMU_EVENT_INSTRUCTIONS;
-        break;
-    case 0x1E:
-        evt_type = PMU_EVENT_CYCLES;
-        break;
-    case 0xF0:
-        /*
-         * PMC1SEL = 0xF0 is the architected PowerISA v3.1
-         * event that counts cycles using PMC1.
-         */
-        if (sprn == SPR_POWER_PMC1) {
-            evt_type = PMU_EVENT_CYCLES;
-        }
-        break;
-    case 0xFA:
-        /*
-         * PMC4SEL = 0xFA is the "instructions completed
-         * with run latch set" event.
-         */
-        if (sprn == SPR_POWER_PMC4) {
-            evt_type = PMU_EVENT_INSN_RUN_LATCH;
-        }
-        break;
-    case 0xFE:
-        /*
-         * PMC1SEL = 0xFE is the architected PowerISA v3.1
-         * event to sample instructions using PMC1.
-         */
-        if (sprn == SPR_POWER_PMC1) {
-            evt_type = PMU_EVENT_INSTRUCTIONS;
-        }
-        break;
-    default:
-        break;
-    }
-
-    return evt_type;
-}
-
 void pmu_update_summaries(CPUPPCState *env)
 {
     target_ulong mmcr0 = env->spr[SPR_POWER_MMCR0];
@@ -233,18 +146,16 @@ static void pmu_update_cycles(CPUPPCState *env)
 {
     uint64_t now = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
     uint64_t time_delta = now - env->pmu_base_time;
-    int sprn;
+    int sprn, cyc_cnt = env->pmc_cyc_cnt;
 
     for (sprn = SPR_POWER_PMC1; sprn <= SPR_POWER_PMC6; sprn++) {
-        if (pmc_get_event(env, sprn) != PMU_EVENT_CYCLES) {
-            continue;
+        if (cyc_cnt & (1 << (sprn - SPR_POWER_PMC1 + 1))) {
+            /*
+             * The pseries and powernv clock runs at 1Ghz, meaning
+             * that 1 nanosec equals 1 cycle.
+             */
+            env->spr[sprn] += time_delta;
         }
-
-        /*
-         * The pseries and powernv clock runs at 1Ghz, meaning
-         * that 1 nanosec equals 1 cycle.
-         */
-        env->spr[sprn] += time_delta;
     }
 
     /* Update base_time for future calculations */
@@ -273,7 +184,7 @@ static void pmc_update_overflow_timer(CPUPPCState *env, int sprn)
         return;
     }
 
-    if (pmc_get_event(env, sprn) != PMU_EVENT_CYCLES ||
+    if (!(env->pmc_cyc_cnt & (1 << (sprn - SPR_POWER_PMC1 + 1))) ||
         !pmc_has_overflow_enabled(env, sprn)) {
         /* Overflow timer is not needed for this counter */
         timer_del(pmc_overflow_timer);
@@ -281,7 +192,7 @@ static void pmc_update_overflow_timer(CPUPPCState *env, int sprn)
     }
 
     if (env->spr[sprn] >= PMC_COUNTER_NEGATIVE_VAL) {
-        timeout =  0;
+        timeout = 0;
     } else {
         timeout = PMC_COUNTER_NEGATIVE_VAL - env->spr[sprn];
     }
-- 
2.25.1



^ permalink raw reply related	[flat|nested] 14+ messages in thread

* Re: [PATCH 0/3] Reorg ppc64 pmu insn counting
  2021-12-23  3:01 [PATCH 0/3] Reorg ppc64 pmu insn counting Richard Henderson
                   ` (2 preceding siblings ...)
  2021-12-23  3:01 ` [PATCH 3/3] target/ppc: Use env->pmc_cyc_cnt Richard Henderson
@ 2021-12-23 20:36 ` Daniel Henrique Barboza
  2021-12-23 21:19   ` Richard Henderson
  2022-01-03 15:07   ` Alex Bennée
  3 siblings, 2 replies; 14+ messages in thread
From: Daniel Henrique Barboza @ 2021-12-23 20:36 UTC (permalink / raw)
  To: Richard Henderson, qemu-devel; +Cc: qemu-ppc, clg, david



On 12/23/21 00:01, Richard Henderson wrote:
> In contrast to Daniel's version, the code stays in power8-pmu.c,
> but is better organized to not take so much overhead.
> 
> Before:
> 
>      32.97%  qemu-system-ppc  qemu-system-ppc64   [.] pmc_get_event
>      20.22%  qemu-system-ppc  qemu-system-ppc64   [.] helper_insns_inc
>       4.52%  qemu-system-ppc  qemu-system-ppc64   [.] hreg_compute_hflags_value
>       3.30%  qemu-system-ppc  qemu-system-ppc64   [.] helper_lookup_tb_ptr
>       2.68%  qemu-system-ppc  qemu-system-ppc64   [.] tcg_gen_code
>       2.28%  qemu-system-ppc  qemu-system-ppc64   [.] cpu_exec
>       1.84%  qemu-system-ppc  qemu-system-ppc64   [.] pmu_insn_cnt_enabled
> 
> After:
> 
>       8.42%  qemu-system-ppc  qemu-system-ppc64   [.] hreg_compute_hflags_value
>       6.65%  qemu-system-ppc  qemu-system-ppc64   [.] cpu_exec
>       6.63%  qemu-system-ppc  qemu-system-ppc64   [.] helper_insns_inc
> 

Thanks for looking this up. I had no idea the original C code was that slow.

This reorg is breaking PMU-EBB tests, unfortunately. These tests are run from the kernel
tree [1] and I test them inside a pSeries TCG guest. You'll need to apply patches 9 and
10 of [2] beforehand (they apply cleanly in current master) because they aren't upstream
yet and EBB needs it.

The tests that are breaking consistently with this reorg are:

back_to_back_ebbs_test.c
cpu_event_pinned_vs_ebb_test.c
cycles_test.c
task_event_pinned_vs_ebb_test.c


The issue here is that these tests exercise different Perf events and aspects of branching
(e.g. how fast we're detecting a counter overflow, how many times, etc) and I wasn't able to
find out a fix using your C reorg yet.

With that in mind I decided to post a new version of my TCG rework, with less repetition and
a bit more concise, to have an alternative that can be used upstream to fix the Avocado tests.
Meanwhile I'll see if I can get your reorg working with all EBB tests we need. All things
equal - similar performance, all EBB tests passing - I'd rather stay with your C code than my
TCG rework since yours doesn't rely on TCG Ops knowledge to maintain it.


Thanks,


Daniel


[1] https://github.com/torvalds/linux/tree/master/tools/testing/selftests/powerpc/pmu/ebb
[2] https://lists.gnu.org/archive/html/qemu-devel/2021-12/msg00073.html

> 
> r~
> 
> 
> Richard Henderson (3):
>    target/ppc: Cache per-pmc insn and cycle count settings
>    target/ppc: Rewrite pmu_increment_insns
>    target/ppc: Use env->pmc_cyc_cnt
> 
>   target/ppc/cpu.h         |   3 +
>   target/ppc/power8-pmu.h  |  14 +--
>   target/ppc/cpu_init.c    |   1 +
>   target/ppc/helper_regs.c |   2 +-
>   target/ppc/machine.c     |   2 +
>   target/ppc/power8-pmu.c  | 230 ++++++++++++++++-----------------------
>   6 files changed, 108 insertions(+), 144 deletions(-)
> 


^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH 0/3] Reorg ppc64 pmu insn counting
  2021-12-23 20:36 ` [PATCH 0/3] Reorg ppc64 pmu insn counting Daniel Henrique Barboza
@ 2021-12-23 21:19   ` Richard Henderson
  2021-12-23 22:47     ` Daniel Henrique Barboza
  2021-12-30 22:12     ` Daniel Henrique Barboza
  2022-01-03 15:07   ` Alex Bennée
  1 sibling, 2 replies; 14+ messages in thread
From: Richard Henderson @ 2021-12-23 21:19 UTC (permalink / raw)
  To: Daniel Henrique Barboza, qemu-devel; +Cc: qemu-ppc, clg, david

On 12/23/21 12:36 PM, Daniel Henrique Barboza wrote:
> This reorg is breaking PMU-EBB tests, unfortunately. These tests are run from the kernel
> tree [1] and I test them inside a pSeries TCG guest. You'll need to apply patches 9 and
> 10 of [2] beforehand (they apply cleanly in current master) because they aren't upstream
> yet and EBB needs it.
> 
> The tests that are breaking consistently with this reorg are:
> 
> back_to_back_ebbs_test.c
> cpu_event_pinned_vs_ebb_test.c
> cycles_test.c
> task_event_pinned_vs_ebb_test.c

In which case perhaps drop my first patch for now, and instead simply replicate your tcg 
algorithm in c exactly -- using none of the helpers that currently exist.

We can improve the code, and the use of pmc_get_event from hreg_compute_hregs_value second.


r~


^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH 0/3] Reorg ppc64 pmu insn counting
  2021-12-23 21:19   ` Richard Henderson
@ 2021-12-23 22:47     ` Daniel Henrique Barboza
  2021-12-30 22:12     ` Daniel Henrique Barboza
  1 sibling, 0 replies; 14+ messages in thread
From: Daniel Henrique Barboza @ 2021-12-23 22:47 UTC (permalink / raw)
  To: Richard Henderson, qemu-devel; +Cc: qemu-ppc, clg, david



On 12/23/21 18:19, Richard Henderson wrote:
> On 12/23/21 12:36 PM, Daniel Henrique Barboza wrote:
>> This reorg is breaking PMU-EBB tests, unfortunately. These tests are run from the kernel
>> tree [1] and I test them inside a pSeries TCG guest. You'll need to apply patches 9 and
>> 10 of [2] beforehand (they apply cleanly in current master) because they aren't upstream
>> yet and EBB needs it.
>>
>> The tests that are breaking consistently with this reorg are:
>>
>> back_to_back_ebbs_test.c
>> cpu_event_pinned_vs_ebb_test.c
>> cycles_test.c
>> task_event_pinned_vs_ebb_test.c
> 
> In which case perhaps drop my first patch for now, and instead simply replicate your tcg algorithm in c exactly -- using none of the helpers that currently exist.
> 
> We can improve the code, and the use of pmc_get_event from hreg_compute_hregs_value second.


I'll try this out. I'll let you know how that goes.


Thanks,


Daniel

> 
> 
> r~


^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH 0/3] Reorg ppc64 pmu insn counting
  2021-12-23 21:19   ` Richard Henderson
  2021-12-23 22:47     ` Daniel Henrique Barboza
@ 2021-12-30 22:12     ` Daniel Henrique Barboza
  2022-01-03  6:48       ` Cédric Le Goater
  1 sibling, 1 reply; 14+ messages in thread
From: Daniel Henrique Barboza @ 2021-12-30 22:12 UTC (permalink / raw)
  To: Richard Henderson, qemu-devel; +Cc: qemu-ppc, clg, david



On 12/23/21 18:19, Richard Henderson wrote:
> On 12/23/21 12:36 PM, Daniel Henrique Barboza wrote:
>> This reorg is breaking PMU-EBB tests, unfortunately. These tests are run from the kernel
>> tree [1] and I test them inside a pSeries TCG guest. You'll need to apply patches 9 and
>> 10 of [2] beforehand (they apply cleanly in current master) because they aren't upstream
>> yet and EBB needs it.
>>
>> The tests that are breaking consistently with this reorg are:
>>
>> back_to_back_ebbs_test.c
>> cpu_event_pinned_vs_ebb_test.c
>> cycles_test.c
>> task_event_pinned_vs_ebb_test.c
> 
> In which case perhaps drop my first patch for now, and instead simply replicate your tcg algorithm in c exactly -- using none of the helpers that currently exist.
> 
> We can improve the code, and the use of pmc_get_event from hreg_compute_hregs_value second.


While attempting to do that I figured what was off with this series and ended up
fixing it.

It's now working with the event-based branch interrupt tests and Avocado seems happy
as well. It took some small adjustments/fixes in patches 1/2 and an extra patch of mine
tuning the existing logic after the reorg.


I'll clean it up and re-send it next week/year.


Thanks


Daniel




> 
> 
> r~


^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH 0/3] Reorg ppc64 pmu insn counting
  2021-12-30 22:12     ` Daniel Henrique Barboza
@ 2022-01-03  6:48       ` Cédric Le Goater
  2022-01-03 11:10         ` Daniel Henrique Barboza
  0 siblings, 1 reply; 14+ messages in thread
From: Cédric Le Goater @ 2022-01-03  6:48 UTC (permalink / raw)
  To: Daniel Henrique Barboza, Richard Henderson, qemu-devel; +Cc: qemu-ppc, david

On 12/30/21 23:12, Daniel Henrique Barboza wrote:
> 
> 
> On 12/23/21 18:19, Richard Henderson wrote:
>> On 12/23/21 12:36 PM, Daniel Henrique Barboza wrote:
>>> This reorg is breaking PMU-EBB tests, unfortunately. These tests are run from the kernel
>>> tree [1] and I test them inside a pSeries TCG guest. You'll need to apply patches 9 and
>>> 10 of [2] beforehand (they apply cleanly in current master) because they aren't upstream
>>> yet and EBB needs it.
>>>
>>> The tests that are breaking consistently with this reorg are:
>>>
>>> back_to_back_ebbs_test.c
>>> cpu_event_pinned_vs_ebb_test.c
>>> cycles_test.c
>>> task_event_pinned_vs_ebb_test.c
>>
>> In which case perhaps drop my first patch for now, and instead simply replicate your tcg algorithm in c exactly -- using none of the helpers that currently exist.
>>
>> We can improve the code, and the use of pmc_get_event from hreg_compute_hregs_value second.
> 
> 
> While attempting to do that I figured what was off with this series and ended up
> fixing it.
> 
> It's now working with the event-based branch interrupt tests and Avocado seems happy
> as well. It took some small adjustments/fixes in patches 1/2 and an extra patch of mine
> tuning the existing logic after the reorg.
> 
> 
> I'll clean it up and re-send it next week/year.

Shouldn't we merge this series first ? It is really improving emulation
and keeps the check-avocado tests under the timeout limit (which I find
important).

Thanks,

C.


^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH 0/3] Reorg ppc64 pmu insn counting
  2022-01-03  6:48       ` Cédric Le Goater
@ 2022-01-03 11:10         ` Daniel Henrique Barboza
  0 siblings, 0 replies; 14+ messages in thread
From: Daniel Henrique Barboza @ 2022-01-03 11:10 UTC (permalink / raw)
  To: Cédric Le Goater, Richard Henderson, qemu-devel; +Cc: qemu-ppc, david



On 1/3/22 03:48, Cédric Le Goater wrote:
> On 12/30/21 23:12, Daniel Henrique Barboza wrote:
>>
>>
>> On 12/23/21 18:19, Richard Henderson wrote:
>>> On 12/23/21 12:36 PM, Daniel Henrique Barboza wrote:
>>>> This reorg is breaking PMU-EBB tests, unfortunately. These tests are run from the kernel
>>>> tree [1] and I test them inside a pSeries TCG guest. You'll need to apply patches 9 and
>>>> 10 of [2] beforehand (they apply cleanly in current master) because they aren't upstream
>>>> yet and EBB needs it.
>>>>
>>>> The tests that are breaking consistently with this reorg are:
>>>>
>>>> back_to_back_ebbs_test.c
>>>> cpu_event_pinned_vs_ebb_test.c
>>>> cycles_test.c
>>>> task_event_pinned_vs_ebb_test.c
>>>
>>> In which case perhaps drop my first patch for now, and instead simply replicate your tcg algorithm in c exactly -- using none of the helpers that currently exist.
>>>
>>> We can improve the code, and the use of pmc_get_event from hreg_compute_hregs_value second.
>>
>>
>> While attempting to do that I figured what was off with this series and ended up
>> fixing it.
>>
>> It's now working with the event-based branch interrupt tests and Avocado seems happy
>> as well. It took some small adjustments/fixes in patches 1/2 and an extra patch of mine
>> tuning the existing logic after the reorg.
>>
>>
>> I'll clean it up and re-send it next week/year.
> 
> Shouldn't we merge this series first ? It is really improving emulation
> and keeps the check-avocado tests under the timeout limit (which I find
> important).


As it is this series is breaking EBB tests if we apply the EBB interrupt patches
on top of it. If your concern is strictly with the Avocado tests then we'd better
off pushing my TCG version which fixes Avocado and don't break anything else.

Besides, I'll send the v2 of this series today, tomorrow at the latest. We're better
off pushing the fixed version.


Thanks,

Daniel



> 
> Thanks,
> 
> C.


^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH 0/3] Reorg ppc64 pmu insn counting
  2021-12-23 20:36 ` [PATCH 0/3] Reorg ppc64 pmu insn counting Daniel Henrique Barboza
  2021-12-23 21:19   ` Richard Henderson
@ 2022-01-03 15:07   ` Alex Bennée
  2022-01-03 18:06     ` Daniel Henrique Barboza
  1 sibling, 1 reply; 14+ messages in thread
From: Alex Bennée @ 2022-01-03 15:07 UTC (permalink / raw)
  To: Daniel Henrique Barboza
  Cc: qemu-devel, qemu-ppc, Richard Henderson, clg, david


Daniel Henrique Barboza <danielhb413@gmail.com> writes:

> On 12/23/21 00:01, Richard Henderson wrote:
>> In contrast to Daniel's version, the code stays in power8-pmu.c,
>> but is better organized to not take so much overhead.
>> Before:
>>      32.97%  qemu-system-ppc  qemu-system-ppc64   [.] pmc_get_event
>>      20.22%  qemu-system-ppc  qemu-system-ppc64   [.] helper_insns_inc
>>       4.52%  qemu-system-ppc  qemu-system-ppc64   [.] hreg_compute_hflags_value
>>       3.30%  qemu-system-ppc  qemu-system-ppc64   [.] helper_lookup_tb_ptr
>>       2.68%  qemu-system-ppc  qemu-system-ppc64   [.] tcg_gen_code
>>       2.28%  qemu-system-ppc  qemu-system-ppc64   [.] cpu_exec
>>       1.84%  qemu-system-ppc  qemu-system-ppc64   [.] pmu_insn_cnt_enabled
>> After:
>>       8.42%  qemu-system-ppc  qemu-system-ppc64   [.]
>> hreg_compute_hflags_value
>>       6.65%  qemu-system-ppc  qemu-system-ppc64   [.] cpu_exec
>>       6.63%  qemu-system-ppc  qemu-system-ppc64   [.] helper_insns_inc
>> 
>
> Thanks for looking this up. I had no idea the original C code was that slow.
>
<snip>
>
> With that in mind I decided to post a new version of my TCG rework, with less repetition and
> a bit more concise, to have an alternative that can be used upstream to fix the Avocado tests.
> Meanwhile I'll see if I can get your reorg working with all EBB tests we need. All things
> equal - similar performance, all EBB tests passing - I'd rather stay with your C code than my
> TCG rework since yours doesn't rely on TCG Ops knowledge to maintain
> it.

Reading this series did make me wonder if we need a more generic service
from the TCG for helping with "internal" instrumentation needed for
things like decent PMU emulation. We haven't gone as much for it in ARM
yet but it would be nice to. It would be even nicer if such a facility
could be used by stuff like icount as well so we don't end up doing the
same thing twice.

>
>
> Thanks,
>
>
> Daniel
>
>
> [1] https://github.com/torvalds/linux/tree/master/tools/testing/selftests/powerpc/pmu/ebb
> [2] https://lists.gnu.org/archive/html/qemu-devel/2021-12/msg00073.html
>
>> r~
>> Richard Henderson (3):
>>    target/ppc: Cache per-pmc insn and cycle count settings
>>    target/ppc: Rewrite pmu_increment_insns
>>    target/ppc: Use env->pnc_cyc_cnt
>>   target/ppc/cpu.h         |   3 +
>>   target/ppc/power8-pmu.h  |  14 +--
>>   target/ppc/cpu_init.c    |   1 +
>>   target/ppc/helper_regs.c |   2 +-
>>   target/ppc/machine.c     |   2 +
>>   target/ppc/power8-pmu.c  | 230 ++++++++++++++++-----------------------
>>   6 files changed, 108 insertions(+), 144 deletions(-)
>> 


-- 
Alex Bennée


^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH 0/3] Reorg ppc64 pmu insn counting
  2022-01-03 15:07   ` Alex Bennée
@ 2022-01-03 18:06     ` Daniel Henrique Barboza
  2022-01-04 10:32       ` Alex Bennée
  0 siblings, 1 reply; 14+ messages in thread
From: Daniel Henrique Barboza @ 2022-01-03 18:06 UTC (permalink / raw)
  To: Alex Bennée; +Cc: qemu-devel, qemu-ppc, Richard Henderson, clg, david



On 1/3/22 12:07, Alex Bennée wrote:
> 
> Daniel Henrique Barboza <danielhb413@gmail.com> writes:
> 
>> On 12/23/21 00:01, Richard Henderson wrote:
>>> In contrast to Daniel's version, the code stays in power8-pmu.c,
>>> but is better organized to not take so much overhead.
>>> Before:
>>>       32.97%  qemu-system-ppc  qemu-system-ppc64   [.] pmc_get_event
>>>       20.22%  qemu-system-ppc  qemu-system-ppc64   [.] helper_insns_inc
>>>        4.52%  qemu-system-ppc  qemu-system-ppc64   [.] hreg_compute_hflags_value
>>>        3.30%  qemu-system-ppc  qemu-system-ppc64   [.] helper_lookup_tb_ptr
>>>        2.68%  qemu-system-ppc  qemu-system-ppc64   [.] tcg_gen_code
>>>        2.28%  qemu-system-ppc  qemu-system-ppc64   [.] cpu_exec
>>>        1.84%  qemu-system-ppc  qemu-system-ppc64   [.] pmu_insn_cnt_enabled
>>> After:
>>>        8.42%  qemu-system-ppc  qemu-system-ppc64   [.]
>>> hreg_compute_hflags_value
>>>        6.65%  qemu-system-ppc  qemu-system-ppc64   [.] cpu_exec
>>>        6.63%  qemu-system-ppc  qemu-system-ppc64   [.] helper_insns_inc
>>>
>>
>> Thanks for looking this up. I had no idea the original C code was that slow.
>>
> <snip>
>>
>> With that in mind I decided to post a new version of my TCG rework, with less repetition and
>> a bit more concise, to have an alternative that can be used upstream to fix the Avocado tests.
>> Meanwhile I'll see if I can get your reorg working with all EBB tests we need. All things
>> equal - similar performance, all EBB tests passing - I'd rather stay with your C code than my
>> TCG rework since yours doesn't rely on TCG Ops knowledge to maintain
>> it.
> 
> Reading this series did make me wonder if we need a more generic service
> from the TCG for helping with "internal" instrumentation needed for
> things like decent PMU emulation. We haven't gone as much for it in ARM
> yet but it would be nice to. It would be even nicer if such a facility
> could be used by stuff like icount as well so we don't end up doing the
> same thing twice.

Back in May 2021, when I first started working on this code, I tried to base myself on the
ARM PMU code. In fact, the cycle and insn calculation done in the very first version of
this work was based on what ARM does in target/arm/helper.c, cycles_get_count() and
instructions_get_count(). The cycle calculation got simplified because our PPC64 CPU
has a 1Ghz clock so it's easier to just consider 1ns = 1 cycle.

For instruction count, aside from my 2-3 weeks of spectacular failures trying to count
instructions inside translate.c, I also looked into how TCG plugins work and tried to do
something similar to what plugin_gen_tb_end() does at the end of the translator_loop()
in accel/tcg/translator.c. For some reason I wasn't able to replicate the same behavior
that I would have if I used the TCG plugin framework in the 'canonical' way.

I ended up doing something similar to what instructions_get_count() from ARM does, which
relies on icount. Richard then aided me in figuring out that I could count instructions
directly by tapping into the end of each TB.

So, for a generic service of sorts I believe it would be nice to re-use the TCG plugins
API in the internal instrumentation (I tried it once, failed, not sure if I messed up
or it's not possible ATM). That would be a good start to try to get all this logic in a
generic code for internal translate code to use.



Thanks,


Daniel



> 
>>
>>
>> Thanks,
>>
>>
>> Daniel
>>
>>
>> [1] https://github.com/torvalds/linux/tree/master/tools/testing/selftests/powerpc/pmu/ebb
>> [2] https://lists.gnu.org/archive/html/qemu-devel/2021-12/msg00073.html
>>
>>> r~
>>> Richard Henderson (3):
>>>     target/ppc: Cache per-pmc insn and cycle count settings
>>>     target/ppc: Rewrite pmu_increment_insns
>>>     target/ppc: Use env->pnc_cyc_cnt
>>>    target/ppc/cpu.h         |   3 +
>>>    target/ppc/power8-pmu.h  |  14 +--
>>>    target/ppc/cpu_init.c    |   1 +
>>>    target/ppc/helper_regs.c |   2 +-
>>>    target/ppc/machine.c     |   2 +
>>>    target/ppc/power8-pmu.c  | 230 ++++++++++++++++-----------------------
>>>    6 files changed, 108 insertions(+), 144 deletions(-)
>>>
> 
> 


^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH 0/3] Reorg ppc64 pmu insn counting
  2022-01-03 18:06     ` Daniel Henrique Barboza
@ 2022-01-04 10:32       ` Alex Bennée
  2022-01-04 19:56         ` Daniel Henrique Barboza
  0 siblings, 1 reply; 14+ messages in thread
From: Alex Bennée @ 2022-01-04 10:32 UTC (permalink / raw)
  To: Daniel Henrique Barboza
  Cc: qemu-devel, qemu-ppc, Richard Henderson, clg, david


Daniel Henrique Barboza <danielhb413@gmail.com> writes:

> On 1/3/22 12:07, Alex Bennée wrote:
>> Daniel Henrique Barboza <danielhb413@gmail.com> writes:
>> 
>>> On 12/23/21 00:01, Richard Henderson wrote:
>>>> In contrast to Daniel's version, the code stays in power8-pmu.c,
>>>> but is better organized to not take so much overhead.
>>>> Before:
>>>>       32.97%  qemu-system-ppc  qemu-system-ppc64   [.] pmc_get_event
>>>>       20.22%  qemu-system-ppc  qemu-system-ppc64   [.] helper_insns_inc
>>>>        4.52%  qemu-system-ppc  qemu-system-ppc64   [.] hreg_compute_hflags_value
>>>>        3.30%  qemu-system-ppc  qemu-system-ppc64   [.] helper_lookup_tb_ptr
>>>>        2.68%  qemu-system-ppc  qemu-system-ppc64   [.] tcg_gen_code
>>>>        2.28%  qemu-system-ppc  qemu-system-ppc64   [.] cpu_exec
>>>>        1.84%  qemu-system-ppc  qemu-system-ppc64   [.] pmu_insn_cnt_enabled
>>>> After:
>>>>        8.42%  qemu-system-ppc  qemu-system-ppc64   [.]
>>>> hreg_compute_hflags_value
>>>>        6.65%  qemu-system-ppc  qemu-system-ppc64   [.] cpu_exec
>>>>        6.63%  qemu-system-ppc  qemu-system-ppc64   [.] helper_insns_inc
>>>>
>>>
>>> Thanks for looking this up. I had no idea the original C code was that slow.
>>>
>> <snip>
>>>
>>> With that in mind I decided to post a new version of my TCG rework, with less repetition and
>>> a bit more concise, to have an alternative that can be used upstream to fix the Avocado tests.
>>> Meanwhile I'll see if I can get your reorg working with all EBB tests we need. All things
>>> equal - similar performance, all EBB tests passing - I'd rather stay with your C code than my
>>> TCG rework since yours doesn't rely on TCG Ops knowledge to maintain
>>> it.
>> Reading this series did make me wonder if we need a more generic
>> service
>> from the TCG for helping with "internal" instrumentation needed for
>> things like decent PMU emulation. We haven't gone as much for it in ARM
>> yet but it would be nice to. It would be even nicer if such a facility
>> could be used by stuff like icount as well so we don't end up doing the
>> same thing twice.
>
> Back in May 2021 when I first starting working on this code I tried to base myself in the
> ARM PMU code. In fact, the cycle and insn calculation done in the very first version of
> this work was based on what ARM does in target/arm/helper.c, cycles_get_count() and
> instructions_get_count(). The cycle calculation got simplified because our PPC64 CPU
> has a 1Ghz clock so it's easier to just consider 1ns = 1 cycle.
>
> For instruction count, aside from my 2-3 weeks of spectacular failures trying to count
> instructions inside translate.c, I also looked into how TCG plugins work and tried to do
> something similar to what plugin_gen_tb_end() does at the end of the translator_loop()
> in accel/tcg/translator.c. For some reason I wasn't able to replicate the same behavior
> that I would have if I used the TCG plugin framework in the
> 'canonical' way.

plugin_gen_tb_end is probably overkill because we should already know
how many instructions there are in a translated block on account of the
insn_start and insn_end ops that mark them. In fact see gen_tb_end()
which is where icount updates the value used in the decrement at the
start of each block. Assuming no synchronous exceptions occur you could
just increment a counter at the end of the block as no async IRQs will
occur until we have executed all of those instructions.

Of course it's never quite so simple and when running in full icount
mode we have to take into account exceptions that can be triggered by IO
accesses. This involves doing a re-translation to ensures the IO
instruction is always the last we execute.

I'm guessing for PMU counters to be somewhat correct we would want to
ensure updates throughout the block (before each memory op and helper
call). This would hopefully avoid the cost of "full" icount support
which is only single threaded. However this is the opposite to icount's
budget and pre-decrement approach which feels messier than it could be.

> I ended up doing something similar to what instructions_get_count() from ARM does, which
> relies on icount. Richard then aided me in figuring out that I could count instructions
> directly by tapping into the end of each TB.

instructions_get_count will also work without icount but is affected by
wall clock time distortions in that case.

> So, for a generic service of sorts I believe it would be nice to re-use the TCG plugins
> API in the internal instrumentation (I tried it once, failed, not sure if I messed up
> or it's not possible ATM). That would be a good start to try to get all this logic in a
> generic code for internal translate code to use.

Agreed - although the plugin specific stuff is really just focused on
our limited visibility API. Unless you are referring to
accel/tcg/plugin-gen.c which are just helpers for manipulating the TCG
ops after the initial translation.

-- 
Alex Bennée


^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH 0/3] Reorg ppc64 pmu insn counting
  2022-01-04 10:32       ` Alex Bennée
@ 2022-01-04 19:56         ` Daniel Henrique Barboza
  0 siblings, 0 replies; 14+ messages in thread
From: Daniel Henrique Barboza @ 2022-01-04 19:56 UTC (permalink / raw)
  To: Alex Bennée; +Cc: qemu-devel, qemu-ppc, Richard Henderson, clg, david



On 1/4/22 07:32, Alex Bennée wrote:
> 
> Daniel Henrique Barboza <danielhb413@gmail.com> writes:
> 
>> On 1/3/22 12:07, Alex Bennée wrote:
>>> Daniel Henrique Barboza <danielhb413@gmail.com> writes:
>>>
>>>> On 12/23/21 00:01, Richard Henderson wrote:
>>>>> In contrast to Daniel's version, the code stays in power8-pmu.c,
>>>>> but is better organized to not take so much overhead.
>>>>> Before:
>>>>>        32.97%  qemu-system-ppc  qemu-system-ppc64   [.] pmc_get_event
>>>>>        20.22%  qemu-system-ppc  qemu-system-ppc64   [.] helper_insns_inc
>>>>>         4.52%  qemu-system-ppc  qemu-system-ppc64   [.] hreg_compute_hflags_value
>>>>>         3.30%  qemu-system-ppc  qemu-system-ppc64   [.] helper_lookup_tb_ptr
>>>>>         2.68%  qemu-system-ppc  qemu-system-ppc64   [.] tcg_gen_code
>>>>>         2.28%  qemu-system-ppc  qemu-system-ppc64   [.] cpu_exec
>>>>>         1.84%  qemu-system-ppc  qemu-system-ppc64   [.] pmu_insn_cnt_enabled
>>>>> After:
>>>>>         8.42%  qemu-system-ppc  qemu-system-ppc64   [.]
>>>>> hreg_compute_hflags_value
>>>>>         6.65%  qemu-system-ppc  qemu-system-ppc64   [.] cpu_exec
>>>>>         6.63%  qemu-system-ppc  qemu-system-ppc64   [.] helper_insns_inc
>>>>>
>>>>
>>>> Thanks for looking this up. I had no idea the original C code was that slow.
>>>>
>>> <snip>
>>>>
>>>> With that in mind I decided to post a new version of my TCG rework, with less repetition and
>>>> a bit more concise, to have an alternative that can be used upstream to fix the Avocado tests.
>>>> Meanwhile I'll see if I can get your reorg working with all EBB tests we need. All things
>>>> equal - similar performance, all EBB tests passing - I'd rather stay with your C code than my
>>>> TCG rework since yours doesn't rely on TCG Ops knowledge to maintain
>>>> it.
>>> Reading this series did make me wonder if we need a more generic
>>> service
>>> from the TCG for helping with "internal" instrumentation needed for
>>> things like decent PMU emulation. We haven't gone as much for it in ARM
>>> yet but it would be nice to. It would be even nicer if such a facility
>>> could be used by stuff like icount as well so we don't end up doing the
>>> same thing twice.
>>
>> Back in May 2021 when I first starting working on this code I tried to base myself in the
>> ARM PMU code. In fact, the cycle and insn calculation done in the very first version of
>> this work was based on what ARM does in target/arm/helper.c, cycles_get_count() and
>> instructions_get_count(). The cycle calculation got simplified because our PPC64 CPU
>> has a 1Ghz clock so it's easier to just consider 1ns = 1 cycle.
>>
>> For instruction count, aside from my 2-3 weeks of spectacular failures trying to count
>> instructions inside translate.c, I also looked into how TCG plugins work and tried to do
>> something similar to what plugin_gen_tb_end() does at the end of the translator_loop()
>> in accel/tcg/translator.c. For some reason I wasn't able to replicate the same behavior
>> that I would have if I used the TCG plugin framework in the
>> 'canonical' way.
> 
> plugin_gen_tb_end is probably overkill because we should already know
> how many instructions there are in a translated block on account of the
> insn_start and insn_end ops that mark them. In fact see gen_tb_end()
> which is where icount updates the value used in the decrement at the
> start of each block. Assuming no synchronous exceptions occur you could
> just increment a counter at the end of the block as no async IRQs will
> occur until we have executed all of those instructions.
> 
> Of course it's never quite so simple and when running in full icount
> mode we have to take into account exceptions that can be triggered by IO
> accesses. This involves doing a re-translation to ensures the IO
> instruction is always the last we execute.
> 
> I'm guessing for PMU counters to be somewhat correct we would want to
> ensure updates throughout the block (before each memory op and helper
> call). This would hopefully avoid the cost of "full" icount support
> which is only single threaded. However this is the opposite to icount's
> budget and pre-decrement approach which feels messier than it could be.


What about cycle counting without icount? With icount it is a rather simple matter
of making some assumptions about the CPU freq and relying on the shift parameter
to get somewhat good precision. Without icount the cycle count, at least in
the current implementation in the ppc64 PMU, is erratic.

The problem is that, at least as far as I've read pSeries and powernv code (guest
and bare metal IBM Power emulation), the CPU freq is a 1Ghz that we write in
the FDT and do nothing else with it. We do not enforce (or throttle) the CPU freq
in the emulation. A quick look into ARM code also seems to do similar assumptions:


static uint64_t cycles_get_count(CPUARMState *env)
{
#ifndef CONFIG_USER_ONLY
     return muldiv64(qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL),
                    ARM_CPU_FREQ, NANOSECONDS_PER_SECOND);
#else
     return cpu_get_host_ticks();
#endif
}

#ifndef CONFIG_USER_ONLY
static int64_t cycles_ns_per(uint64_t cycles)
{
     return (ARM_CPU_FREQ / NANOSECONDS_PER_SECOND) * cycles;
}


$ git grep 'ARM_CPU_FREQ'
target/arm/helper.c:#define ARM_CPU_FREQ 1000000000 /* FIXME: 1 GHz, should be configurable */
target/arm/helper.c:                   ARM_CPU_FREQ, NANOSECONDS_PER_SECOND);
target/arm/helper.c:    return (ARM_CPU_FREQ / NANOSECONDS_PER_SECOND) * cycles;


But I digress. Having a generic way of counting instruction across all the boards would
be a fine improvement. cycle calculation can wait.


> 
>> I ended up doing something similar to what instructions_get_count() from ARM does, which
>> relies on icount. Richard then aided me in figuring out that I could count instructions
>> directly by tapping into the end of each TB.
> 
> instructions_get_count will also work without icount but is affected by
> wall clock time distortions in that case.
> 
>> So, for a generic service of sorts I believe it would be nice to re-use the TCG plugins
>> API in the internal instrumentation (I tried it once, failed, not sure if I messed up
>> or it's not possible ATM). That would be a good start to try to get all this logic in a
>> generic code for internal translate code to use.
> 
> Agreed - although the plugin specific stuff is really just focused on
> our limited visibility API. Unless you are referring to
> accel/tcg/plugin-gen.c which are just helpers for manipulating the TCG
> ops after the initial translation.


TCG plug-ins came to mind because they operate like generic APIs that can be used across
multiple archs, but any way of putting generic instrumentation code that can be used internally
everywhere would do. TCG plug-ins seem to be a good candidate for that since the infrastructure
is already in place.


Thanks,


Daniel

> 


^ permalink raw reply	[flat|nested] 14+ messages in thread

end of thread, other threads:[~2022-01-04 19:57 UTC | newest]

Thread overview: 14+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-12-23  3:01 [PATCH 0/3] Reorg ppc64 pmu insn counting Richard Henderson
2021-12-23  3:01 ` [PATCH 1/3] target/ppc: Cache per-pmc insn and cycle count settings Richard Henderson
2021-12-23  3:01 ` [PATCH 2/3] target/ppc: Rewrite pmu_increment_insns Richard Henderson
2021-12-23  3:01 ` [PATCH 3/3] target/ppc: Use env->pnc_cyc_cnt Richard Henderson
2021-12-23 20:36 ` [PATCH 0/3] Reorg ppc64 pmu insn counting Daniel Henrique Barboza
2021-12-23 21:19   ` Richard Henderson
2021-12-23 22:47     ` Daniel Henrique Barboza
2021-12-30 22:12     ` Daniel Henrique Barboza
2022-01-03  6:48       ` Cédric Le Goater
2022-01-03 11:10         ` Daniel Henrique Barboza
2022-01-03 15:07   ` Alex Bennée
2022-01-03 18:06     ` Daniel Henrique Barboza
2022-01-04 10:32       ` Alex Bennée
2022-01-04 19:56         ` Daniel Henrique Barboza

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.