From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1754635AbaBRGIr (ORCPT ); Tue, 18 Feb 2014 01:08:47 -0500 Received: from mga11.intel.com ([192.55.52.93]:43042 "EHLO mga11.intel.com" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1754558AbaBRGIC (ORCPT ); Tue, 18 Feb 2014 01:08:02 -0500 X-ExtLoop1: 1 X-IronPort-AV: E=Sophos;i="4.97,500,1389772800"; d="scan'208";a="483239709" From: "Yan, Zheng" To: linux-kernel@vger.kernel.org Cc: a.p.zijlstra@chello.nl, mingo@kernel.org, acme@infradead.org, eranian@google.com, andi@firstfloor.org, "Yan, Zheng" Subject: [PATCH v3 12/14] perf, x86: use LBR call stack to get user callchain Date: Tue, 18 Feb 2014 14:07:39 +0800 Message-Id: <1392703661-15104-13-git-send-email-zheng.z.yan@intel.com> X-Mailer: git-send-email 1.8.5.3 In-Reply-To: <1392703661-15104-1-git-send-email-zheng.z.yan@intel.com> References: <1392703661-15104-1-git-send-email-zheng.z.yan@intel.com> Sender: linux-kernel-owner@vger.kernel.org List-ID: X-Mailing-List: linux-kernel@vger.kernel.org Haswell has a new feature that utilizes the existing Last Branch Record facility to record call chains. When the feature is enabled, function call will be collected as normal, but as return instructions are executed the last captured branch record is popped from the on-chip LBR registers. The LBR call stack facility can help perf to get call chains of progam without frame pointer. This patch makes x86's perf_callchain_user() failback to LBR callstack when there is no frame pointer in the user program. Signed-off-by: Yan, Zheng --- arch/x86/kernel/cpu/perf_event.c | 33 ++++++++++++++++++++++++++---- arch/x86/kernel/cpu/perf_event_intel.c | 10 ++++++++- arch/x86/kernel/cpu/perf_event_intel_lbr.c | 2 ++ include/linux/perf_event.h | 1 + 4 files changed, 41 insertions(+), 5 deletions(-) diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 6094560..0d0fe2f3 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -1975,12 +1975,28 @@ static unsigned long get_segment_base(unsigned int segment) return get_desc_base(desc + idx); } +static inline void +perf_callchain_lbr_callstack(struct perf_callchain_entry *entry, + struct perf_sample_data *data) +{ + struct perf_branch_stack *br_stack = data->br_stack; + + if (br_stack && br_stack->user_callstack) { + int i = 0; + while (i < br_stack->nr && entry->nr < PERF_MAX_STACK_DEPTH) { + perf_callchain_store(entry, br_stack->entries[i].from); + i++; + } + } +} + #ifdef CONFIG_COMPAT #include static inline int -perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry) +perf_callchain_user32(struct perf_callchain_entry *entry, + struct pt_regs *regs, struct perf_sample_data *data) { /* 32-bit process in 64-bit kernel. */ unsigned long ss_base, cs_base; @@ -2009,11 +2025,16 @@ perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry) perf_callchain_store(entry, cs_base + frame.return_address); fp = compat_ptr(ss_base + frame.next_frame); } + + if (fp == compat_ptr(regs->bp)) + perf_callchain_lbr_callstack(entry, data); + return 1; } #else static inline int -perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry) +perf_callchain_user32(struct perf_callchain_entry *entry, + struct pt_regs *regs, struct perf_sample_data *data) { return 0; } @@ -2043,12 +2064,12 @@ void perf_callchain_user(struct perf_callchain_entry *entry, if (!current->mm) return; - if (perf_callchain_user32(regs, entry)) + if (perf_callchain_user32(entry, regs, data)) return; while (entry->nr < PERF_MAX_STACK_DEPTH) { unsigned long bytes; - frame.next_frame = NULL; + frame.next_frame = NULL; frame.return_address = 0; bytes = copy_from_user_nmi(&frame, fp, sizeof(frame)); @@ -2061,6 +2082,10 @@ void perf_callchain_user(struct perf_callchain_entry *entry, perf_callchain_store(entry, frame.return_address); fp = frame.next_frame; } + + /* try LBR callstack if there is no frame pointer */ + if (fp == (void __user *)regs->bp) + perf_callchain_lbr_callstack(entry, data); } /* diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index 722171c..9057a20 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -1030,6 +1030,14 @@ static __initconst const u64 slm_hw_cache_event_ids }, }; +static inline bool intel_pmu_needs_lbr_callstack(struct perf_event *event) +{ + if ((event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) && + (event->attr.branch_sample_type & PERF_SAMPLE_BRANCH_CALL_STACK)) + return true; + return false; +} + static void intel_pmu_disable_all(void) { struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); @@ -1398,7 +1406,7 @@ again: perf_sample_data_init(&data, 0, event->hw.last_period); - if (has_branch_stack(event)) + if (needs_branch_stack(event)) data.br_stack = &cpuc->lbr_stack; if (perf_event_overflow(event, &data, regs)) diff --git a/arch/x86/kernel/cpu/perf_event_intel_lbr.c b/arch/x86/kernel/cpu/perf_event_intel_lbr.c index aa726af..7e26367 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_lbr.c +++ b/arch/x86/kernel/cpu/perf_event_intel_lbr.c @@ -717,6 +717,8 @@ intel_pmu_lbr_filter(struct cpu_hw_events *cpuc) int i, j, type; bool compress = false; + cpuc->lbr_stack.user_callstack = branch_user_callstack(br_sel); + /* if sampling all branches, then nothing to filter */ if ((br_sel & X86_BR_ALL) == X86_BR_ALL) return; diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index b87974a..517c34a 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -74,6 +74,7 @@ struct perf_raw_record { * recent branch. */ struct perf_branch_stack { + bool user_callstack; __u64 nr; struct perf_branch_entry entries[0]; }; -- 1.8.5.3