[PATCH 1/2] perf/x86/intel/lbr: Fix incomplete LBR call stack

* [PATCH 1/2] perf/x86/intel/lbr: Fix incomplete LBR call stack
@ 2018-06-05 15:38 kan.liang
  2018-06-05 15:38 ` [PATCH 2/2] perf/x86/intel/lbr: Optimizing context switch for " kan.liang
  0 siblings, 1 reply; 2+ messages in thread
From: kan.liang @ 2018-06-05 15:38 UTC (permalink / raw)
  To: peterz, tglx, acme, mingo, linux-kernel; +Cc: eranian, ak, Kan Liang

From: Kan Liang <kan.liang@linux.intel.com>

LBR has a limited stack size. If a task has a deeper call stack than
LBR's stack size, only the overflowed part is reported. A complete call
stack may not be reconstructed by perf tool.

Current code doesn't access all LBR registers. It only read the ones
below the TOS. The LBR registers above the TOS will be discarded
unconditionally.

When a CALL is captured, the TOS is incremented by 1 , modulo max LBR
stack size. The LBR HW only records the call stack information to the
register which the TOS points to. It will not touch other LBR
registers. So the registers above the TOS probably still store the valid
call stack information for an overflowed call stack, which need to be
reported.

To retrieve complete call stack information, we need to start from TOS,
read all LBR registers until an invalid entry is detected.
0s can be used to detect the invalid entry, because
- When a RET is captured, the HW zeros the LBR register which TOS points
  to, then decreases the TOS.
- The LBR registers are reset to 0 when adding a new LBR event or
  scheduling an existing LBR event.
- A taken branch at IP 0 is not expected

The context switch code is also modified to save/restore all valid LBR
registers. Furthermore, the LBR registers, which don't have valid call
stack information, need to be reset in restore, because they may be
polluted while swapped out.

Here is a small test program, tchain_deep.
Its call stack is deeper than 32.

noinline void f33(void)
{
        int i;
        for (i = 0; i < 10000000;) {

                if(i%2)
                        i++;
                else
                        i++;
        }
}
noinline void f32(void)
{
        f33();
}
noinline void f31(void)
{
        f32();
}

... ...

noinline void f1(void)
{
        f2();
}
int main()
{
        f1();
}

Here is the test result on SKX. The max stack size of SKX is 32.

Without the patch:
 $perf record -e cycles --call-graph lbr -- ./tchain_deep
 $perf report --stdio
 #
 # Children      Self  Command      Shared Object     Symbol
 # ........  ........  ...........  ................  .................
 #
   100.00%    99.99%  tchain_deep    tchain_deep       [.] f33
            |
             --99.99%--f30
                       f31
                       f32
                       f33

With the patch:
 $perf record -e cycles --call-graph lbr -- ./tchain_deep
 $perf report --stdio
 # Children      Self  Command      Shared Object     Symbol
 # ........  ........  ...........  ................  ..................
 #
    99.99%     0.00%  tchain_deep    tchain_deep       [.] f1
            |
            ---f1
               f2
               f3
               f4
               f5
               f6
               f7
               f8
               f9
               f10
               f11
               f12
               f13
               f14
               f15
               f16
               f17
               f18
               f19
               f20
               f21
               f22
               f23
               f24
               f25
               f26
               f27
               f28
               f29
               f30
               f31
               f32
               f33

Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
---
 arch/x86/events/intel/lbr.c  | 32 ++++++++++++++++++++++++++------
 arch/x86/events/perf_event.h |  1 +
 2 files changed, 27 insertions(+), 6 deletions(-)

diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c
index cf372b9..a417004 100644
--- a/arch/x86/events/intel/lbr.c
+++ b/arch/x86/events/intel/lbr.c
@@ -346,7 +346,7 @@ static void __intel_pmu_lbr_restore(struct x86_perf_task_context *task_ctx)
 
 	mask = x86_pmu.lbr_nr - 1;
 	tos = task_ctx->tos;
-	for (i = 0; i < tos; i++) {
+	for (i = 0; i < task_ctx->valid_lbrs; i++) {
 		lbr_idx = (tos - i) & mask;
 		wrlbr_from(lbr_idx, task_ctx->lbr_from[i]);
 		wrlbr_to  (lbr_idx, task_ctx->lbr_to[i]);
@@ -354,6 +354,15 @@ static void __intel_pmu_lbr_restore(struct x86_perf_task_context *task_ctx)
 		if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_INFO)
 			wrmsrl(MSR_LBR_INFO_0 + lbr_idx, task_ctx->lbr_info[i]);
 	}
+
+	for (; i < x86_pmu.lbr_nr; i++) {
+		lbr_idx = (tos - i) & mask;
+		wrlbr_from(lbr_idx, 0);
+		wrlbr_to(lbr_idx, 0);
+		if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_INFO)
+			wrmsrl(MSR_LBR_INFO_0 + lbr_idx, 0);
+	}
+
 	wrmsrl(x86_pmu.lbr_tos, tos);
 	task_ctx->lbr_stack_state = LBR_NONE;
 }
@@ -361,7 +370,7 @@ static void __intel_pmu_lbr_restore(struct x86_perf_task_context *task_ctx)
 static void __intel_pmu_lbr_save(struct x86_perf_task_context *task_ctx)
 {
 	unsigned lbr_idx, mask;
-	u64 tos;
+	u64 tos, from;
 	int i;
 
 	if (task_ctx->lbr_callstack_users == 0) {
@@ -371,13 +380,17 @@ static void __intel_pmu_lbr_save(struct x86_perf_task_context *task_ctx)
 
 	mask = x86_pmu.lbr_nr - 1;
 	tos = intel_pmu_lbr_tos();
-	for (i = 0; i < tos; i++) {
+	for (i = 0; i < x86_pmu.lbr_nr; i++) {
 		lbr_idx = (tos - i) & mask;
-		task_ctx->lbr_from[i] = rdlbr_from(lbr_idx);
+		from = rdlbr_from(lbr_idx);
+		if (!from)
+			break;
+		task_ctx->lbr_from[i] = from;
 		task_ctx->lbr_to[i]   = rdlbr_to(lbr_idx);
 		if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_INFO)
 			rdmsrl(MSR_LBR_INFO_0 + lbr_idx, task_ctx->lbr_info[i]);
 	}
+	task_ctx->valid_lbrs = i;
 	task_ctx->tos = tos;
 	task_ctx->lbr_stack_state = LBR_VALID;
 }
@@ -531,7 +544,7 @@ static void intel_pmu_lbr_read_32(struct cpu_hw_events *cpuc)
  */
 static void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc)
 {
-	bool need_info = false;
+	bool need_info = false, call_stack = false;
 	unsigned long mask = x86_pmu.lbr_nr - 1;
 	int lbr_format = x86_pmu.intel_cap.lbr_format;
 	u64 tos = intel_pmu_lbr_tos();
@@ -542,7 +555,7 @@ static void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc)
 	if (cpuc->lbr_sel) {
 		need_info = !(cpuc->lbr_sel->config & LBR_NO_INFO);
 		if (cpuc->lbr_sel->config & LBR_CALL_STACK)
-			num = tos;
+			call_stack = true;
 	}
 
 	for (i = 0; i < num; i++) {
@@ -555,6 +568,13 @@ static void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc)
 		from = rdlbr_from(lbr_idx);
 		to   = rdlbr_to(lbr_idx);
 
+		/*
+		 * Read LBR call stack entries
+		 * until invalid entry (0s) is detected.
+		 */
+		if (call_stack && !from)
+			break;
+
 		if (lbr_format == LBR_FORMAT_INFO && need_info) {
 			u64 info;
 
diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h
index 9f37114..6b72a92 100644
--- a/arch/x86/events/perf_event.h
+++ b/arch/x86/events/perf_event.h
@@ -648,6 +648,7 @@ struct x86_perf_task_context {
 	u64 lbr_to[MAX_LBR_ENTRIES];
 	u64 lbr_info[MAX_LBR_ENTRIES];
 	int tos;
+	int valid_lbrs;
 	int lbr_callstack_users;
 	int lbr_stack_state;
 };
-- 
2.7.4

^ permalink raw reply related	[flat|nested] 2+ messages in thread