linux-csky.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH V10 01/18] perf/core: Use static_call to optimize perf_guest_info_callbacks
       [not found] <20210806133802.3528-1-lingshan.zhu@intel.com>
@ 2021-08-06 13:37 ` Zhu Lingshan
  2021-08-26 19:59   ` Sean Christopherson
  2021-08-27 17:23   ` Sean Christopherson
  0 siblings, 2 replies; 6+ messages in thread
From: Zhu Lingshan @ 2021-08-06 13:37 UTC (permalink / raw)
  To: peterz, pbonzini
  Cc: bp, seanjc, vkuznets, wanpengli, jmattson, joro, kan.liang, ak,
	wei.w.wang, eranian, liuxiangdong5, linux-kernel, x86, kvm,
	like.xu.linux, boris.ostrvsky, Like Xu, Will Deacon,
	Marc Zyngier, Guo Ren, Nick Hu, Paul Walmsley, Boris Ostrovsky,
	linux-arm-kernel, kvmarm, linux-csky, linux-riscv, xen-devel,
	Zhu Lingshan

From: Like Xu <like.xu@linux.intel.com>

For "struct perf_guest_info_callbacks", the two fields "is_in_guest"
and "is_user_mode" are replaced with a new multiplexed member named
"state", and the "get_guest_ip" field will be renamed to "get_ip".

For arm64, xen and kvm/x86, the application of DEFINE_STATIC_CALL_RET0
could make all that perf_guest_cbs stuff suck less. For arm, csky, nds32,
and riscv, just applied some renamed refactoring.

Cc: Will Deacon <will@kernel.org>
Cc: Marc Zyngier <maz@kernel.org>
Cc: Guo Ren <guoren@kernel.org>
Cc: Nick Hu <nickhu@andestech.com>
Cc: Paul Walmsley <paul.walmsley@sifive.com>
Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Cc: linux-arm-kernel@lists.infradead.org
Cc: kvmarm@lists.cs.columbia.edu
Cc: linux-csky@vger.kernel.org
Cc: linux-riscv@lists.infradead.org
Cc: xen-devel@lists.xenproject.org
Suggested-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Original-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Like Xu <like.xu@linux.intel.com>
Signed-off-by: Zhu Lingshan <lingshan.zhu@intel.com>
Reviewed-by: Boris Ostrovsky <boris.ostrvsky@oracle.com>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 arch/arm/kernel/perf_callchain.c   | 16 +++++++-----
 arch/arm64/kernel/perf_callchain.c | 29 +++++++++++++++++-----
 arch/arm64/kvm/perf.c              | 22 ++++++++---------
 arch/csky/kernel/perf_callchain.c  |  4 +--
 arch/nds32/kernel/perf_event_cpu.c | 16 +++++++-----
 arch/riscv/kernel/perf_callchain.c |  4 +--
 arch/x86/events/core.c             | 39 ++++++++++++++++++++++++------
 arch/x86/events/intel/core.c       |  7 +++---
 arch/x86/include/asm/kvm_host.h    |  2 +-
 arch/x86/kvm/pmu.c                 |  2 +-
 arch/x86/kvm/x86.c                 | 37 +++++++++++++++-------------
 arch/x86/xen/pmu.c                 | 33 ++++++++++---------------
 include/linux/perf_event.h         | 12 ++++++---
 kernel/events/core.c               |  9 +++++++
 14 files changed, 144 insertions(+), 88 deletions(-)

diff --git a/arch/arm/kernel/perf_callchain.c b/arch/arm/kernel/perf_callchain.c
index 3b69a76d341e..1ce30f86d6c7 100644
--- a/arch/arm/kernel/perf_callchain.c
+++ b/arch/arm/kernel/perf_callchain.c
@@ -64,7 +64,7 @@ perf_callchain_user(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs
 {
 	struct frame_tail __user *tail;
 
-	if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) {
+	if (perf_guest_cbs && perf_guest_cbs->state()) {
 		/* We don't support guest os callchain now */
 		return;
 	}
@@ -100,7 +100,7 @@ perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *re
 {
 	struct stackframe fr;
 
-	if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) {
+	if (perf_guest_cbs && perf_guest_cbs->state()) {
 		/* We don't support guest os callchain now */
 		return;
 	}
@@ -111,8 +111,8 @@ perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *re
 
 unsigned long perf_instruction_pointer(struct pt_regs *regs)
 {
-	if (perf_guest_cbs && perf_guest_cbs->is_in_guest())
-		return perf_guest_cbs->get_guest_ip();
+	if (perf_guest_cbs && perf_guest_cbs->state())
+		return perf_guest_cbs->get_ip();
 
 	return instruction_pointer(regs);
 }
@@ -120,9 +120,13 @@ unsigned long perf_instruction_pointer(struct pt_regs *regs)
 unsigned long perf_misc_flags(struct pt_regs *regs)
 {
 	int misc = 0;
+	unsigned int state = 0;
 
-	if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) {
-		if (perf_guest_cbs->is_user_mode())
+	if (perf_guest_cbs)
+		state = perf_guest_cbs->state();
+
+	if (perf_guest_cbs && state) {
+		if (state & PERF_GUEST_USER)
 			misc |= PERF_RECORD_MISC_GUEST_USER;
 		else
 			misc |= PERF_RECORD_MISC_GUEST_KERNEL;
diff --git a/arch/arm64/kernel/perf_callchain.c b/arch/arm64/kernel/perf_callchain.c
index 4a72c2727309..1b344e23fd2f 100644
--- a/arch/arm64/kernel/perf_callchain.c
+++ b/arch/arm64/kernel/perf_callchain.c
@@ -5,6 +5,7 @@
  * Copyright (C) 2015 ARM Limited
  */
 #include <linux/perf_event.h>
+#include <linux/static_call.h>
 #include <linux/uaccess.h>
 
 #include <asm/pointer_auth.h>
@@ -99,10 +100,25 @@ compat_user_backtrace(struct compat_frame_tail __user *tail,
 }
 #endif /* CONFIG_COMPAT */
 
+DEFINE_STATIC_CALL_RET0(arm64_guest_state, *(perf_guest_cbs->state));
+DEFINE_STATIC_CALL_RET0(arm64_guest_get_ip, *(perf_guest_cbs->get_ip));
+
+void arch_perf_update_guest_cbs(void)
+{
+	static_call_update(arm64_guest_state, (void *)&__static_call_return0);
+	static_call_update(arm64_guest_get_ip, (void *)&__static_call_return0);
+
+	if (perf_guest_cbs && perf_guest_cbs->state)
+		static_call_update(arm64_guest_state, perf_guest_cbs->state);
+
+	if (perf_guest_cbs && perf_guest_cbs->get_ip)
+		static_call_update(arm64_guest_get_ip, perf_guest_cbs->get_ip);
+}
+
 void perf_callchain_user(struct perf_callchain_entry_ctx *entry,
 			 struct pt_regs *regs)
 {
-	if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) {
+	if (static_call(arm64_guest_state)()) {
 		/* We don't support guest os callchain now */
 		return;
 	}
@@ -149,7 +165,7 @@ void perf_callchain_kernel(struct perf_callchain_entry_ctx *entry,
 {
 	struct stackframe frame;
 
-	if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) {
+	if (static_call(arm64_guest_state)()) {
 		/* We don't support guest os callchain now */
 		return;
 	}
@@ -160,8 +176,8 @@ void perf_callchain_kernel(struct perf_callchain_entry_ctx *entry,
 
 unsigned long perf_instruction_pointer(struct pt_regs *regs)
 {
-	if (perf_guest_cbs && perf_guest_cbs->is_in_guest())
-		return perf_guest_cbs->get_guest_ip();
+	if (static_call(arm64_guest_state)())
+		return static_call(arm64_guest_get_ip)();
 
 	return instruction_pointer(regs);
 }
@@ -169,9 +185,10 @@ unsigned long perf_instruction_pointer(struct pt_regs *regs)
 unsigned long perf_misc_flags(struct pt_regs *regs)
 {
 	int misc = 0;
+	unsigned int guest = static_call(arm64_guest_state)();
 
-	if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) {
-		if (perf_guest_cbs->is_user_mode())
+	if (guest) {
+		if (guest & PERF_GUEST_USER)
 			misc |= PERF_RECORD_MISC_GUEST_USER;
 		else
 			misc |= PERF_RECORD_MISC_GUEST_KERNEL;
diff --git a/arch/arm64/kvm/perf.c b/arch/arm64/kvm/perf.c
index 151c31fb9860..8a3387e58f42 100644
--- a/arch/arm64/kvm/perf.c
+++ b/arch/arm64/kvm/perf.c
@@ -13,21 +13,20 @@
 
 DEFINE_STATIC_KEY_FALSE(kvm_arm_pmu_available);
 
-static int kvm_is_in_guest(void)
-{
-        return kvm_get_running_vcpu() != NULL;
-}
-
-static int kvm_is_user_mode(void)
+static unsigned int kvm_guest_state(void)
 {
 	struct kvm_vcpu *vcpu;
+	unsigned int state = 0;
+
+	if (kvm_get_running_vcpu())
+		state |= PERF_GUEST_ACTIVE;
 
 	vcpu = kvm_get_running_vcpu();
 
-	if (vcpu)
-		return !vcpu_mode_priv(vcpu);
+	if (vcpu && !vcpu_mode_priv(vcpu))
+		state |= PERF_GUEST_USER;
 
-	return 0;
+	return state;
 }
 
 static unsigned long kvm_get_guest_ip(void)
@@ -43,9 +42,8 @@ static unsigned long kvm_get_guest_ip(void)
 }
 
 static struct perf_guest_info_callbacks kvm_guest_cbs = {
-	.is_in_guest	= kvm_is_in_guest,
-	.is_user_mode	= kvm_is_user_mode,
-	.get_guest_ip	= kvm_get_guest_ip,
+	.state		= kvm_guest_state,
+	.get_ip		= kvm_get_guest_ip,
 };
 
 int kvm_perf_init(void)
diff --git a/arch/csky/kernel/perf_callchain.c b/arch/csky/kernel/perf_callchain.c
index ab55e98ee8f6..3e42239dd1b2 100644
--- a/arch/csky/kernel/perf_callchain.c
+++ b/arch/csky/kernel/perf_callchain.c
@@ -89,7 +89,7 @@ void perf_callchain_user(struct perf_callchain_entry_ctx *entry,
 	unsigned long fp = 0;
 
 	/* C-SKY does not support virtualization. */
-	if (perf_guest_cbs && perf_guest_cbs->is_in_guest())
+	if (perf_guest_cbs && perf_guest_cbs->state())
 		return;
 
 	fp = regs->regs[4];
@@ -113,7 +113,7 @@ void perf_callchain_kernel(struct perf_callchain_entry_ctx *entry,
 	struct stackframe fr;
 
 	/* C-SKY does not support virtualization. */
-	if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) {
+	if (perf_guest_cbs && perf_guest_cbs->state()) {
 		pr_warn("C-SKY does not support perf in guest mode!");
 		return;
 	}
diff --git a/arch/nds32/kernel/perf_event_cpu.c b/arch/nds32/kernel/perf_event_cpu.c
index 0ce6f9f307e6..1dc32ba842ce 100644
--- a/arch/nds32/kernel/perf_event_cpu.c
+++ b/arch/nds32/kernel/perf_event_cpu.c
@@ -1371,7 +1371,7 @@ perf_callchain_user(struct perf_callchain_entry_ctx *entry,
 
 	leaf_fp = 0;
 
-	if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) {
+	if (perf_guest_cbs && perf_guest_cbs->state()) {
 		/* We don't support guest os callchain now */
 		return;
 	}
@@ -1481,7 +1481,7 @@ perf_callchain_kernel(struct perf_callchain_entry_ctx *entry,
 {
 	struct stackframe fr;
 
-	if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) {
+	if (perf_guest_cbs && perf_guest_cbs->state()) {
 		/* We don't support guest os callchain now */
 		return;
 	}
@@ -1494,8 +1494,8 @@ perf_callchain_kernel(struct perf_callchain_entry_ctx *entry,
 unsigned long perf_instruction_pointer(struct pt_regs *regs)
 {
 	/* However, NDS32 does not support virtualization */
-	if (perf_guest_cbs && perf_guest_cbs->is_in_guest())
-		return perf_guest_cbs->get_guest_ip();
+	if (perf_guest_cbs && perf_guest_cbs->state())
+		return perf_guest_cbs->get_ip();
 
 	return instruction_pointer(regs);
 }
@@ -1503,10 +1503,14 @@ unsigned long perf_instruction_pointer(struct pt_regs *regs)
 unsigned long perf_misc_flags(struct pt_regs *regs)
 {
 	int misc = 0;
+	unsigned int state = 0;
+
+	if (perf_guest_cbs)
+		state = perf_guest_cbs->state();
 
 	/* However, NDS32 does not support virtualization */
-	if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) {
-		if (perf_guest_cbs->is_user_mode())
+	if (perf_guest_cbs && state) {
+		if (state & PERF_GUEST_USER)
 			misc |= PERF_RECORD_MISC_GUEST_USER;
 		else
 			misc |= PERF_RECORD_MISC_GUEST_KERNEL;
diff --git a/arch/riscv/kernel/perf_callchain.c b/arch/riscv/kernel/perf_callchain.c
index 0bb1854dce83..ea63f70cae5d 100644
--- a/arch/riscv/kernel/perf_callchain.c
+++ b/arch/riscv/kernel/perf_callchain.c
@@ -59,7 +59,7 @@ void perf_callchain_user(struct perf_callchain_entry_ctx *entry,
 	unsigned long fp = 0;
 
 	/* RISC-V does not support perf in guest mode. */
-	if (perf_guest_cbs && perf_guest_cbs->is_in_guest())
+	if (perf_guest_cbs && perf_guest_cbs->state())
 		return;
 
 	fp = regs->s0;
@@ -79,7 +79,7 @@ void perf_callchain_kernel(struct perf_callchain_entry_ctx *entry,
 			   struct pt_regs *regs)
 {
 	/* RISC-V does not support perf in guest mode. */
-	if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) {
+	if (perf_guest_cbs && perf_guest_cbs->state()) {
 		pr_warn("RISC-V does not support perf in guest mode!");
 		return;
 	}
diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index 1eb45139fcc6..9a908631f6cc 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -90,6 +90,28 @@ DEFINE_STATIC_CALL_NULL(x86_pmu_pebs_aliases, *x86_pmu.pebs_aliases);
  */
 DEFINE_STATIC_CALL_RET0(x86_pmu_guest_get_msrs, *x86_pmu.guest_get_msrs);
 
+DEFINE_STATIC_CALL_RET0(x86_guest_state, *(perf_guest_cbs->state));
+DEFINE_STATIC_CALL_RET0(x86_guest_get_ip, *(perf_guest_cbs->get_ip));
+DEFINE_STATIC_CALL_RET0(x86_guest_handle_intel_pt_intr, *(perf_guest_cbs->handle_intel_pt_intr));
+
+void arch_perf_update_guest_cbs(void)
+{
+	static_call_update(x86_guest_state, (void *)&__static_call_return0);
+	static_call_update(x86_guest_get_ip, (void *)&__static_call_return0);
+	static_call_update(x86_guest_handle_intel_pt_intr, (void *)&__static_call_return0);
+
+	if (perf_guest_cbs && perf_guest_cbs->state)
+		static_call_update(x86_guest_state, perf_guest_cbs->state);
+
+	if (perf_guest_cbs && perf_guest_cbs->get_ip)
+		static_call_update(x86_guest_get_ip, perf_guest_cbs->get_ip);
+
+	if (perf_guest_cbs && perf_guest_cbs->handle_intel_pt_intr) {
+		static_call_update(x86_guest_handle_intel_pt_intr,
+				   perf_guest_cbs->handle_intel_pt_intr);
+	}
+}
+
 u64 __read_mostly hw_cache_event_ids
 				[PERF_COUNT_HW_CACHE_MAX]
 				[PERF_COUNT_HW_CACHE_OP_MAX]
@@ -2764,7 +2786,7 @@ perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *re
 	struct unwind_state state;
 	unsigned long addr;
 
-	if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) {
+	if (static_call(x86_guest_state)()) {
 		/* TODO: We don't support guest os callchain now */
 		return;
 	}
@@ -2867,7 +2889,7 @@ perf_callchain_user(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs
 	struct stack_frame frame;
 	const struct stack_frame __user *fp;
 
-	if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) {
+	if (static_call(x86_guest_state)()) {
 		/* TODO: We don't support guest os callchain now */
 		return;
 	}
@@ -2944,18 +2966,21 @@ static unsigned long code_segment_base(struct pt_regs *regs)
 
 unsigned long perf_instruction_pointer(struct pt_regs *regs)
 {
-	if (perf_guest_cbs && perf_guest_cbs->is_in_guest())
-		return perf_guest_cbs->get_guest_ip();
+	unsigned long ip = static_call(x86_guest_get_ip)();
+
+	if (likely(!ip))
+		ip = regs->ip + code_segment_base(regs);
 
-	return regs->ip + code_segment_base(regs);
+	return ip;
 }
 
 unsigned long perf_misc_flags(struct pt_regs *regs)
 {
+	unsigned int guest = static_call(x86_guest_state)();
 	int misc = 0;
 
-	if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) {
-		if (perf_guest_cbs->is_user_mode())
+	if (guest) {
+		if (guest & PERF_GUEST_USER)
 			misc |= PERF_RECORD_MISC_GUEST_USER;
 		else
 			misc |= PERF_RECORD_MISC_GUEST_KERNEL;
diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index fca7a6e2242f..fb1bd7a0e1a6 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -2782,6 +2782,8 @@ static void intel_pmu_reset(void)
 	local_irq_restore(flags);
 }
 
+DECLARE_STATIC_CALL(x86_guest_handle_intel_pt_intr, *(perf_guest_cbs->handle_intel_pt_intr));
+
 static int handle_pmi_common(struct pt_regs *regs, u64 status)
 {
 	struct perf_sample_data data;
@@ -2852,10 +2854,7 @@ static int handle_pmi_common(struct pt_regs *regs, u64 status)
 	 */
 	if (__test_and_clear_bit(GLOBAL_STATUS_TRACE_TOPAPMI_BIT, (unsigned long *)&status)) {
 		handled++;
-		if (unlikely(perf_guest_cbs && perf_guest_cbs->is_in_guest() &&
-			perf_guest_cbs->handle_intel_pt_intr))
-			perf_guest_cbs->handle_intel_pt_intr();
-		else
+		if (!static_call(x86_guest_handle_intel_pt_intr)())
 			intel_pt_interrupt();
 	}
 
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 974cbfb1eefe..128e2dd9c944 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1884,7 +1884,7 @@ int kvm_skip_emulated_instruction(struct kvm_vcpu *vcpu);
 int kvm_complete_insn_gp(struct kvm_vcpu *vcpu, int err);
 void __kvm_request_immediate_exit(struct kvm_vcpu *vcpu);
 
-int kvm_is_in_guest(void);
+unsigned int kvm_guest_state(void);
 
 void __user *__x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa,
 				     u32 size);
diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c
index 827886c12c16..2dcbd1b30004 100644
--- a/arch/x86/kvm/pmu.c
+++ b/arch/x86/kvm/pmu.c
@@ -87,7 +87,7 @@ static void kvm_perf_overflow_intr(struct perf_event *perf_event,
 		 * woken up. So we should wake it, but this is impossible from
 		 * NMI context. Do it from irq work instead.
 		 */
-		if (!kvm_is_in_guest())
+		if (!kvm_guest_state())
 			irq_work_queue(&pmc_to_pmu(pmc)->irq_work);
 		else
 			kvm_make_request(KVM_REQ_PMI, pmc->vcpu);
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index e5d5c5ed7dd4..efd11702465c 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -8268,44 +8268,47 @@ static void kvm_timer_init(void)
 DEFINE_PER_CPU(struct kvm_vcpu *, current_vcpu);
 EXPORT_PER_CPU_SYMBOL_GPL(current_vcpu);
 
-int kvm_is_in_guest(void)
+unsigned int kvm_guest_state(void)
 {
-	return __this_cpu_read(current_vcpu) != NULL;
-}
-
-static int kvm_is_user_mode(void)
-{
-	int user_mode = 3;
+	struct kvm_vcpu *vcpu = __this_cpu_read(current_vcpu);
+	unsigned int state = 0;
 
-	if (__this_cpu_read(current_vcpu))
-		user_mode = static_call(kvm_x86_get_cpl)(__this_cpu_read(current_vcpu));
+	if (vcpu) {
+		state |= PERF_GUEST_ACTIVE;
+		if (static_call(kvm_x86_get_cpl)(vcpu))
+			state |= PERF_GUEST_USER;
+	}
 
-	return user_mode != 0;
+	return state;
 }
 
-static unsigned long kvm_get_guest_ip(void)
+static unsigned long kvm_guest_get_ip(void)
 {
+	struct kvm_vcpu *vcpu = __this_cpu_read(current_vcpu);
 	unsigned long ip = 0;
 
-	if (__this_cpu_read(current_vcpu))
-		ip = kvm_rip_read(__this_cpu_read(current_vcpu));
+	if (vcpu)
+		ip = kvm_rip_read(vcpu);
 
 	return ip;
 }
 
-static void kvm_handle_intel_pt_intr(void)
+static unsigned int kvm_handle_intel_pt_intr(void)
 {
 	struct kvm_vcpu *vcpu = __this_cpu_read(current_vcpu);
 
+	if (!vcpu)
+		return 0;
+
 	kvm_make_request(KVM_REQ_PMI, vcpu);
 	__set_bit(MSR_CORE_PERF_GLOBAL_OVF_CTRL_TRACE_TOPA_PMI_BIT,
 			(unsigned long *)&vcpu->arch.pmu.global_status);
+	return 1;
 }
 
 static struct perf_guest_info_callbacks kvm_guest_cbs = {
-	.is_in_guest		= kvm_is_in_guest,
-	.is_user_mode		= kvm_is_user_mode,
-	.get_guest_ip		= kvm_get_guest_ip,
+	.state			= kvm_guest_state,
+	.get_ip			= kvm_guest_get_ip,
 	.handle_intel_pt_intr	= kvm_handle_intel_pt_intr,
 };
 
diff --git a/arch/x86/xen/pmu.c b/arch/x86/xen/pmu.c
index e13b0b49fcdf..85c6e6f6f422 100644
--- a/arch/x86/xen/pmu.c
+++ b/arch/x86/xen/pmu.c
@@ -413,34 +413,28 @@ int pmu_apic_update(uint32_t val)
 }
 
 /* perf callbacks */
-static int xen_is_in_guest(void)
+static unsigned int xen_guest_state(void)
 {
 	const struct xen_pmu_data *xenpmu_data = get_xenpmu_data();
+	unsigned int state = 0;
 
 	if (!xenpmu_data) {
 		pr_warn_once("%s: pmudata not initialized\n", __func__);
-		return 0;
+		return state;
 	}
 
 	if (!xen_initial_domain() || (xenpmu_data->domain_id >= DOMID_SELF))
-		return 0;
+		return state;
 
-	return 1;
-}
+	state |= PERF_GUEST_ACTIVE;
 
-static int xen_is_user_mode(void)
-{
-	const struct xen_pmu_data *xenpmu_data = get_xenpmu_data();
+	if (xenpmu_data->pmu.pmu_flags & PMU_SAMPLE_PV) {
+		if (xenpmu_data->pmu.pmu_flags & PMU_SAMPLE_USER)
+			state |= PERF_GUEST_USER;
+	} else if (xenpmu_data->pmu.r.regs.cpl & 3)
+			state |= PERF_GUEST_USER;
 
-	if (!xenpmu_data) {
-		pr_warn_once("%s: pmudata not initialized\n", __func__);
-		return 0;
-	}
-
-	if (xenpmu_data->pmu.pmu_flags & PMU_SAMPLE_PV)
-		return (xenpmu_data->pmu.pmu_flags & PMU_SAMPLE_USER);
-	else
-		return !!(xenpmu_data->pmu.r.regs.cpl & 3);
+	return state;
 }
 
 static unsigned long xen_get_guest_ip(void)
@@ -456,9 +450,8 @@ static unsigned long xen_get_guest_ip(void)
 }
 
 static struct perf_guest_info_callbacks xen_guest_cbs = {
-	.is_in_guest            = xen_is_in_guest,
-	.is_user_mode           = xen_is_user_mode,
-	.get_guest_ip           = xen_get_guest_ip,
+	.state                  = xen_guest_state,
+	.get_ip			= xen_get_guest_ip,
 };
 
 /* Convert registers from Xen's format to Linux' */
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 2d510ad750ed..e823677a214c 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -26,11 +26,13 @@
 # include <asm/local64.h>
 #endif
 
+#define PERF_GUEST_ACTIVE	0x01
+#define PERF_GUEST_USER	0x02
+
 struct perf_guest_info_callbacks {
-	int				(*is_in_guest)(void);
-	int				(*is_user_mode)(void);
-	unsigned long			(*get_guest_ip)(void);
-	void				(*handle_intel_pt_intr)(void);
+	unsigned int			(*state)(void);
+	unsigned long			(*get_ip)(void);
+	unsigned int			(*handle_intel_pt_intr)(void);
 };
 
 #ifdef CONFIG_HAVE_HW_BREAKPOINT
@@ -1237,6 +1239,8 @@ extern void perf_event_bpf_event(struct bpf_prog *prog,
 				 u16 flags);
 
 extern struct perf_guest_info_callbacks *perf_guest_cbs;
+extern void __weak arch_perf_update_guest_cbs(void);
+
 extern int perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *callbacks);
 extern int perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *callbacks);
 
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 464917096e73..e466fc8176e1 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -6489,9 +6489,18 @@ static void perf_pending_event(struct irq_work *entry)
  */
 struct perf_guest_info_callbacks *perf_guest_cbs;
 
+/* explicitly use __weak to fix duplicate symbol error */
+void __weak arch_perf_update_guest_cbs(void)
+{
+}
+
 int perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
 {
+	if (WARN_ON_ONCE(perf_guest_cbs))
+		return -EBUSY;
+
 	perf_guest_cbs = cbs;
+	arch_perf_update_guest_cbs();
 	return 0;
 }
 EXPORT_SYMBOL_GPL(perf_register_guest_info_callbacks);
-- 
2.27.0


^ permalink raw reply related	[flat|nested] 6+ messages in thread

* Re: [PATCH V10 01/18] perf/core: Use static_call to optimize perf_guest_info_callbacks
  2021-08-06 13:37 ` [PATCH V10 01/18] perf/core: Use static_call to optimize perf_guest_info_callbacks Zhu Lingshan
@ 2021-08-26 19:59   ` Sean Christopherson
  2021-08-27  6:31     ` Like Xu
  2021-09-15  1:19     ` Zhu, Lingshan
  2021-08-27 17:23   ` Sean Christopherson
  1 sibling, 2 replies; 6+ messages in thread
From: Sean Christopherson @ 2021-08-26 19:59 UTC (permalink / raw)
  To: Zhu Lingshan
  Cc: peterz, pbonzini, bp, vkuznets, wanpengli, jmattson, joro,
	kan.liang, ak, wei.w.wang, eranian, liuxiangdong5, linux-kernel,
	x86, kvm, like.xu.linux, boris.ostrvsky, Like Xu, Will Deacon,
	Marc Zyngier, Guo Ren, Nick Hu, Paul Walmsley, Boris Ostrovsky,
	linux-arm-kernel, kvmarm, linux-csky, linux-riscv, xen-devel

TL;DR: Please don't merge this patch, it's broken and is also built on a shoddy
       foundation that I would like to fix.

On Fri, Aug 06, 2021, Zhu Lingshan wrote:
> diff --git a/kernel/events/core.c b/kernel/events/core.c
> index 464917096e73..e466fc8176e1 100644
> --- a/kernel/events/core.c
> +++ b/kernel/events/core.c
> @@ -6489,9 +6489,18 @@ static void perf_pending_event(struct irq_work *entry)
>   */
>  struct perf_guest_info_callbacks *perf_guest_cbs;
>  
> +/* explicitly use __weak to fix duplicate symbol error */
> +void __weak arch_perf_update_guest_cbs(void)
> +{
> +}
> +
>  int perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
>  {
> +	if (WARN_ON_ONCE(perf_guest_cbs))
> +		return -EBUSY;
> +
>  	perf_guest_cbs = cbs;
> +	arch_perf_update_guest_cbs();

This is horribly broken, it fails to cleanup the static calls when KVM unregisters
the callbacks, which happens when the vendor module, e.g. kvm_intel, is unloaded.
The explosion doesn't happen until 'kvm' is unloaded because the functions are
implemented in 'kvm', i.e. the use-after-free is deferred a bit.

  BUG: unable to handle page fault for address: ffffffffa011bb90
  #PF: supervisor instruction fetch in kernel mode
  #PF: error_code(0x0010) - not-present page
  PGD 6211067 P4D 6211067 PUD 6212063 PMD 102b99067 PTE 0
  Oops: 0010 [#1] PREEMPT SMP
  CPU: 0 PID: 1047 Comm: rmmod Not tainted 5.14.0-rc2+ #460
  Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 0.0.0 02/06/2015
  RIP: 0010:0xffffffffa011bb90
  Code: Unable to access opcode bytes at RIP 0xffffffffa011bb66.
  Call Trace:
   <NMI>
   ? perf_misc_flags+0xe/0x50
   ? perf_prepare_sample+0x53/0x6b0
   ? perf_event_output_forward+0x67/0x160
   ? kvm_clock_read+0x14/0x30
   ? kvm_sched_clock_read+0x5/0x10
   ? sched_clock_cpu+0xd/0xd0
   ? __perf_event_overflow+0x52/0xf0
   ? handle_pmi_common+0x1f2/0x2d0
   ? __flush_tlb_all+0x30/0x30
   ? intel_pmu_handle_irq+0xcf/0x410
   ? nmi_handle+0x5/0x260
   ? perf_event_nmi_handler+0x28/0x50
   ? nmi_handle+0xc7/0x260
   ? lock_release+0x2b0/0x2b0
   ? default_do_nmi+0x6b/0x170
   ? exc_nmi+0x103/0x130
   ? end_repeat_nmi+0x16/0x1f
   ? lock_release+0x2b0/0x2b0
   ? lock_release+0x2b0/0x2b0
   ? lock_release+0x2b0/0x2b0
   </NMI>
  Modules linked in: irqbypass [last unloaded: kvm]

Even more fun, the existing perf_guest_cbs framework is also broken, though it's
much harder to get it to fail, and probably impossible to get it to fail without
some help.  The issue is that perf_guest_cbs is global, which means that it can
be nullified by KVM (during module unload) while the callbacks are being accessed
by a PMI handler on a different CPU.

The bug has escaped notice because all dererfences of perf_guest_cbs follow the
same "perf_guest_cbs && perf_guest_cbs->is_in_guest()" pattern, and AFAICT the
compiler never reload perf_guest_cbs in this sequence.  The compiler does reload
perf_guest_cbs for any future dereferences, but the ->is_in_guest() guard all but
guarantees the PMI handler will win the race, e.g. to nullify perf_guest_cbs,
KVM has to completely exit the guest and teardown down all VMs before it can be
unloaded.

But with a help, e.g. RAED_ONCE(perf_guest_cbs), unloading kvm_intel can trigger
a NULL pointer derference, e.g. this tweak

diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index 1eb45139fcc6..202e5ad97f82 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -2954,7 +2954,7 @@ unsigned long perf_misc_flags(struct pt_regs *regs)
 {
        int misc = 0;

-       if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) {
+       if (READ_ONCE(perf_guest_cbs) && READ_ONCE(perf_guest_cbs)->is_in_guest()) {
                if (perf_guest_cbs->is_user_mode())
                        misc |= PERF_RECORD_MISC_GUEST_USER;
                else


while spamming module load/unload leads to:

  BUG: kernel NULL pointer dereference, address: 0000000000000000
  #PF: supervisor read access in kernel mode
  #PF: error_code(0x0000) - not-present page
  PGD 0 P4D 0
  Oops: 0000 [#1] PREEMPT SMP
  CPU: 6 PID: 1825 Comm: stress Not tainted 5.14.0-rc2+ #459
  Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 0.0.0 02/06/2015
  RIP: 0010:perf_misc_flags+0x1c/0x70
  Call Trace:
   perf_prepare_sample+0x53/0x6b0
   perf_event_output_forward+0x67/0x160
   __perf_event_overflow+0x52/0xf0
   handle_pmi_common+0x207/0x300
   intel_pmu_handle_irq+0xcf/0x410
   perf_event_nmi_handler+0x28/0x50
   nmi_handle+0xc7/0x260
   default_do_nmi+0x6b/0x170
   exc_nmi+0x103/0x130
   asm_exc_nmi+0x76/0xbf


The good news is that I have a series that should fix both the existing NULL pointer
bug and mostly obviate the need for static calls.  The bad news is that my approach,
making perf_guest_cbs per-CPU, likely complicates turning these into static calls,
though I'm guessing it's still a solvable problem.

Tangentially related, IMO we should make architectures opt-in to getting
perf_guest_cbs and nuke all of the code in the below files.  Except for arm,
which recently lost KVM support, it's all a bunch of useless copy-paste code that
serves no purpose and just complicates cleanups like this.

>  arch/arm/kernel/perf_callchain.c   | 16 +++++++-----
>  arch/csky/kernel/perf_callchain.c  |  4 +--
>  arch/nds32/kernel/perf_event_cpu.c | 16 +++++++-----
>  arch/riscv/kernel/perf_callchain.c |  4 +--

^ permalink raw reply related	[flat|nested] 6+ messages in thread

* Re: [PATCH V10 01/18] perf/core: Use static_call to optimize perf_guest_info_callbacks
  2021-08-26 19:59   ` Sean Christopherson
@ 2021-08-27  6:31     ` Like Xu
  2021-09-15  1:19     ` Zhu, Lingshan
  1 sibling, 0 replies; 6+ messages in thread
From: Like Xu @ 2021-08-27  6:31 UTC (permalink / raw)
  To: Sean Christopherson, Zhu Lingshan
  Cc: peterz, pbonzini, bp, vkuznets, wanpengli, jmattson, joro,
	kan.liang, ak, wei.w.wang, eranian, liuxiangdong5, linux-kernel,
	x86, kvm, boris.ostrvsky, Like Xu, Will Deacon, Marc Zyngier,
	Guo Ren, Nick Hu, Paul Walmsley, Boris Ostrovsky,
	linux-arm-kernel, kvmarm, linux-csky, linux-riscv, xen-devel

On 27/8/2021 3:59 am, Sean Christopherson wrote:
> TL;DR: Please don't merge this patch, it's broken and is also built on a shoddy
>         foundation that I would like to fix.

Obviously, this patch is not closely related to the guest PEBS feature enabling,
and we can certainly put this issue in another discussion thread [1].

[1] https://lore.kernel.org/kvm/20210827005718.585190-1-seanjc@google.com/

> 
> On Fri, Aug 06, 2021, Zhu Lingshan wrote:
>> diff --git a/kernel/events/core.c b/kernel/events/core.c
>> index 464917096e73..e466fc8176e1 100644
>> --- a/kernel/events/core.c
>> +++ b/kernel/events/core.c
>> @@ -6489,9 +6489,18 @@ static void perf_pending_event(struct irq_work *entry)
>>    */
>>   struct perf_guest_info_callbacks *perf_guest_cbs;
>>   
>> +/* explicitly use __weak to fix duplicate symbol error */
>> +void __weak arch_perf_update_guest_cbs(void)
>> +{
>> +}
>> +
>>   int perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
>>   {
>> +	if (WARN_ON_ONCE(perf_guest_cbs))
>> +		return -EBUSY;
>> +
>>   	perf_guest_cbs = cbs;
>> +	arch_perf_update_guest_cbs();
> 
> This is horribly broken, it fails to cleanup the static calls when KVM unregisters
> the callbacks, which happens when the vendor module, e.g. kvm_intel, is unloaded.
> The explosion doesn't happen until 'kvm' is unloaded because the functions are
> implemented in 'kvm', i.e. the use-after-free is deferred a bit.
> 
>    BUG: unable to handle page fault for address: ffffffffa011bb90
>    #PF: supervisor instruction fetch in kernel mode
>    #PF: error_code(0x0010) - not-present page
>    PGD 6211067 P4D 6211067 PUD 6212063 PMD 102b99067 PTE 0
>    Oops: 0010 [#1] PREEMPT SMP
>    CPU: 0 PID: 1047 Comm: rmmod Not tainted 5.14.0-rc2+ #460
>    Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 0.0.0 02/06/2015
>    RIP: 0010:0xffffffffa011bb90
>    Code: Unable to access opcode bytes at RIP 0xffffffffa011bb66.
>    Call Trace:
>     <NMI>
>     ? perf_misc_flags+0xe/0x50
>     ? perf_prepare_sample+0x53/0x6b0
>     ? perf_event_output_forward+0x67/0x160
>     ? kvm_clock_read+0x14/0x30
>     ? kvm_sched_clock_read+0x5/0x10
>     ? sched_clock_cpu+0xd/0xd0
>     ? __perf_event_overflow+0x52/0xf0
>     ? handle_pmi_common+0x1f2/0x2d0
>     ? __flush_tlb_all+0x30/0x30
>     ? intel_pmu_handle_irq+0xcf/0x410
>     ? nmi_handle+0x5/0x260
>     ? perf_event_nmi_handler+0x28/0x50
>     ? nmi_handle+0xc7/0x260
>     ? lock_release+0x2b0/0x2b0
>     ? default_do_nmi+0x6b/0x170
>     ? exc_nmi+0x103/0x130
>     ? end_repeat_nmi+0x16/0x1f
>     ? lock_release+0x2b0/0x2b0
>     ? lock_release+0x2b0/0x2b0
>     ? lock_release+0x2b0/0x2b0
>     </NMI>
>    Modules linked in: irqbypass [last unloaded: kvm]
> 
> Even more fun, the existing perf_guest_cbs framework is also broken, though it's
> much harder to get it to fail, and probably impossible to get it to fail without
> some help.  The issue is that perf_guest_cbs is global, which means that it can
> be nullified by KVM (during module unload) while the callbacks are being accessed
> by a PMI handler on a different CPU.
> 
> The bug has escaped notice because all dererfences of perf_guest_cbs follow the
> same "perf_guest_cbs && perf_guest_cbs->is_in_guest()" pattern, and AFAICT the
> compiler never reload perf_guest_cbs in this sequence.  The compiler does reload
> perf_guest_cbs for any future dereferences, but the ->is_in_guest() guard all but
> guarantees the PMI handler will win the race, e.g. to nullify perf_guest_cbs,
> KVM has to completely exit the guest and teardown down all VMs before it can be
> unloaded.
> 
> But with a help, e.g. RAED_ONCE(perf_guest_cbs), unloading kvm_intel can trigger
> a NULL pointer derference, e.g. this tweak
> 
> diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
> index 1eb45139fcc6..202e5ad97f82 100644
> --- a/arch/x86/events/core.c
> +++ b/arch/x86/events/core.c
> @@ -2954,7 +2954,7 @@ unsigned long perf_misc_flags(struct pt_regs *regs)
>   {
>          int misc = 0;
> 
> -       if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) {
> +       if (READ_ONCE(perf_guest_cbs) && READ_ONCE(perf_guest_cbs)->is_in_guest()) {
>                  if (perf_guest_cbs->is_user_mode())
>                          misc |= PERF_RECORD_MISC_GUEST_USER;
>                  else
> 
> 
> while spamming module load/unload leads to:
> 
>    BUG: kernel NULL pointer dereference, address: 0000000000000000
>    #PF: supervisor read access in kernel mode
>    #PF: error_code(0x0000) - not-present page
>    PGD 0 P4D 0
>    Oops: 0000 [#1] PREEMPT SMP
>    CPU: 6 PID: 1825 Comm: stress Not tainted 5.14.0-rc2+ #459
>    Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 0.0.0 02/06/2015
>    RIP: 0010:perf_misc_flags+0x1c/0x70
>    Call Trace:
>     perf_prepare_sample+0x53/0x6b0
>     perf_event_output_forward+0x67/0x160
>     __perf_event_overflow+0x52/0xf0
>     handle_pmi_common+0x207/0x300
>     intel_pmu_handle_irq+0xcf/0x410
>     perf_event_nmi_handler+0x28/0x50
>     nmi_handle+0xc7/0x260
>     default_do_nmi+0x6b/0x170
>     exc_nmi+0x103/0x130
>     asm_exc_nmi+0x76/0xbf
> 
> 
> The good news is that I have a series that should fix both the existing NULL pointer
> bug and mostly obviate the need for static calls.  The bad news is that my approach,
> making perf_guest_cbs per-CPU, likely complicates turning these into static calls,
> though I'm guessing it's still a solvable problem.
> 
> Tangentially related, IMO we should make architectures opt-in to getting
> perf_guest_cbs and nuke all of the code in the below files.  Except for arm,
> which recently lost KVM support, it's all a bunch of useless copy-paste code that
> serves no purpose and just complicates cleanups like this.
> 
>>   arch/arm/kernel/perf_callchain.c   | 16 +++++++-----
>>   arch/csky/kernel/perf_callchain.c  |  4 +--
>>   arch/nds32/kernel/perf_event_cpu.c | 16 +++++++-----
>>   arch/riscv/kernel/perf_callchain.c |  4 +--

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH V10 01/18] perf/core: Use static_call to optimize perf_guest_info_callbacks
  2021-08-06 13:37 ` [PATCH V10 01/18] perf/core: Use static_call to optimize perf_guest_info_callbacks Zhu Lingshan
  2021-08-26 19:59   ` Sean Christopherson
@ 2021-08-27 17:23   ` Sean Christopherson
  1 sibling, 0 replies; 6+ messages in thread
From: Sean Christopherson @ 2021-08-27 17:23 UTC (permalink / raw)
  To: Zhu Lingshan
  Cc: peterz, pbonzini, bp, vkuznets, wanpengli, jmattson, joro,
	kan.liang, ak, wei.w.wang, eranian, liuxiangdong5, linux-kernel,
	x86, kvm, like.xu.linux, boris.ostrvsky, Like Xu, Will Deacon,
	Marc Zyngier, Guo Ren, Nick Hu, Paul Walmsley, Boris Ostrovsky,
	linux-arm-kernel, kvmarm, linux-csky, linux-riscv, xen-devel

On Fri, Aug 06, 2021, Zhu Lingshan wrote:
> @@ -2944,18 +2966,21 @@ static unsigned long code_segment_base(struct pt_regs *regs)
>  
>  unsigned long perf_instruction_pointer(struct pt_regs *regs)
>  {
> -	if (perf_guest_cbs && perf_guest_cbs->is_in_guest())
> -		return perf_guest_cbs->get_guest_ip();
> +	unsigned long ip = static_call(x86_guest_get_ip)();
> +
> +	if (likely(!ip))

Pivoting on ip==0 isn't correct, it's perfectly legal for a guest to execute
from %rip=0.  Unless there's some static_call() magic that supports this with a
default function:
	
	if (unlikely(!static_call(x86_guest_get_ip)(&ip)))
		regs->ip + code_segment_base(regs)

	return ip;

The easiest thing is keep the existing:

	if (unlikely(static_call(x86_guest_state)()))
		return static_call(x86_guest_get_ip)();

	return regs->ip + code_segment_base(regs);

It's an extra call for PMIs in guest, but I don't think any of the KVM folks care
_that_ much about the performance in this case.

> +		ip = regs->ip + code_segment_base(regs);
>  
> -	return regs->ip + code_segment_base(regs);
> +	return ip;
>  }

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH V10 01/18] perf/core: Use static_call to optimize perf_guest_info_callbacks
  2021-08-26 19:59   ` Sean Christopherson
  2021-08-27  6:31     ` Like Xu
@ 2021-09-15  1:19     ` Zhu, Lingshan
  2021-09-21 23:22       ` Sean Christopherson
  1 sibling, 1 reply; 6+ messages in thread
From: Zhu, Lingshan @ 2021-09-15  1:19 UTC (permalink / raw)
  To: Sean Christopherson
  Cc: peterz, pbonzini, bp, vkuznets, wanpengli, jmattson, joro,
	kan.liang, ak, wei.w.wang, eranian, liuxiangdong5, linux-kernel,
	x86, kvm, like.xu.linux, boris.ostrvsky, Like Xu, Will Deacon,
	Marc Zyngier, Guo Ren, Nick Hu, Paul Walmsley, Boris Ostrovsky,
	linux-arm-kernel, kvmarm, linux-csky, linux-riscv, xen-devel



On 8/27/2021 3:59 AM, Sean Christopherson wrote:
> TL;DR: Please don't merge this patch, it's broken and is also built on a shoddy
>         foundation that I would like to fix.
Hi Sean,Peter, Paolo

I will send out an V11 which drops this patch since it's buggy, and Sean 
is working on fix this.
Does this sound good?

Thanks,
Zhu Lingshan
>
> On Fri, Aug 06, 2021, Zhu Lingshan wrote:
>> diff --git a/kernel/events/core.c b/kernel/events/core.c
>> index 464917096e73..e466fc8176e1 100644
>> --- a/kernel/events/core.c
>> +++ b/kernel/events/core.c
>> @@ -6489,9 +6489,18 @@ static void perf_pending_event(struct irq_work *entry)
>>    */
>>   struct perf_guest_info_callbacks *perf_guest_cbs;
>>   
>> +/* explicitly use __weak to fix duplicate symbol error */
>> +void __weak arch_perf_update_guest_cbs(void)
>> +{
>> +}
>> +
>>   int perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
>>   {
>> +	if (WARN_ON_ONCE(perf_guest_cbs))
>> +		return -EBUSY;
>> +
>>   	perf_guest_cbs = cbs;
>> +	arch_perf_update_guest_cbs();
> This is horribly broken, it fails to cleanup the static calls when KVM unregisters
> the callbacks, which happens when the vendor module, e.g. kvm_intel, is unloaded.
> The explosion doesn't happen until 'kvm' is unloaded because the functions are
> implemented in 'kvm', i.e. the use-after-free is deferred a bit.
>
>    BUG: unable to handle page fault for address: ffffffffa011bb90
>    #PF: supervisor instruction fetch in kernel mode
>    #PF: error_code(0x0010) - not-present page
>    PGD 6211067 P4D 6211067 PUD 6212063 PMD 102b99067 PTE 0
>    Oops: 0010 [#1] PREEMPT SMP
>    CPU: 0 PID: 1047 Comm: rmmod Not tainted 5.14.0-rc2+ #460
>    Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 0.0.0 02/06/2015
>    RIP: 0010:0xffffffffa011bb90
>    Code: Unable to access opcode bytes at RIP 0xffffffffa011bb66.
>    Call Trace:
>     <NMI>
>     ? perf_misc_flags+0xe/0x50
>     ? perf_prepare_sample+0x53/0x6b0
>     ? perf_event_output_forward+0x67/0x160
>     ? kvm_clock_read+0x14/0x30
>     ? kvm_sched_clock_read+0x5/0x10
>     ? sched_clock_cpu+0xd/0xd0
>     ? __perf_event_overflow+0x52/0xf0
>     ? handle_pmi_common+0x1f2/0x2d0
>     ? __flush_tlb_all+0x30/0x30
>     ? intel_pmu_handle_irq+0xcf/0x410
>     ? nmi_handle+0x5/0x260
>     ? perf_event_nmi_handler+0x28/0x50
>     ? nmi_handle+0xc7/0x260
>     ? lock_release+0x2b0/0x2b0
>     ? default_do_nmi+0x6b/0x170
>     ? exc_nmi+0x103/0x130
>     ? end_repeat_nmi+0x16/0x1f
>     ? lock_release+0x2b0/0x2b0
>     ? lock_release+0x2b0/0x2b0
>     ? lock_release+0x2b0/0x2b0
>     </NMI>
>    Modules linked in: irqbypass [last unloaded: kvm]
>
> Even more fun, the existing perf_guest_cbs framework is also broken, though it's
> much harder to get it to fail, and probably impossible to get it to fail without
> some help.  The issue is that perf_guest_cbs is global, which means that it can
> be nullified by KVM (during module unload) while the callbacks are being accessed
> by a PMI handler on a different CPU.
>
> The bug has escaped notice because all dererfences of perf_guest_cbs follow the
> same "perf_guest_cbs && perf_guest_cbs->is_in_guest()" pattern, and AFAICT the
> compiler never reload perf_guest_cbs in this sequence.  The compiler does reload
> perf_guest_cbs for any future dereferences, but the ->is_in_guest() guard all but
> guarantees the PMI handler will win the race, e.g. to nullify perf_guest_cbs,
> KVM has to completely exit the guest and teardown down all VMs before it can be
> unloaded.
>
> But with a help, e.g. RAED_ONCE(perf_guest_cbs), unloading kvm_intel can trigger
> a NULL pointer derference, e.g. this tweak
>
> diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
> index 1eb45139fcc6..202e5ad97f82 100644
> --- a/arch/x86/events/core.c
> +++ b/arch/x86/events/core.c
> @@ -2954,7 +2954,7 @@ unsigned long perf_misc_flags(struct pt_regs *regs)
>   {
>          int misc = 0;
>
> -       if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) {
> +       if (READ_ONCE(perf_guest_cbs) && READ_ONCE(perf_guest_cbs)->is_in_guest()) {
>                  if (perf_guest_cbs->is_user_mode())
>                          misc |= PERF_RECORD_MISC_GUEST_USER;
>                  else
>
>
> while spamming module load/unload leads to:
>
>    BUG: kernel NULL pointer dereference, address: 0000000000000000
>    #PF: supervisor read access in kernel mode
>    #PF: error_code(0x0000) - not-present page
>    PGD 0 P4D 0
>    Oops: 0000 [#1] PREEMPT SMP
>    CPU: 6 PID: 1825 Comm: stress Not tainted 5.14.0-rc2+ #459
>    Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 0.0.0 02/06/2015
>    RIP: 0010:perf_misc_flags+0x1c/0x70
>    Call Trace:
>     perf_prepare_sample+0x53/0x6b0
>     perf_event_output_forward+0x67/0x160
>     __perf_event_overflow+0x52/0xf0
>     handle_pmi_common+0x207/0x300
>     intel_pmu_handle_irq+0xcf/0x410
>     perf_event_nmi_handler+0x28/0x50
>     nmi_handle+0xc7/0x260
>     default_do_nmi+0x6b/0x170
>     exc_nmi+0x103/0x130
>     asm_exc_nmi+0x76/0xbf
>
>
> The good news is that I have a series that should fix both the existing NULL pointer
> bug and mostly obviate the need for static calls.  The bad news is that my approach,
> making perf_guest_cbs per-CPU, likely complicates turning these into static calls,
> though I'm guessing it's still a solvable problem.
>
> Tangentially related, IMO we should make architectures opt-in to getting
> perf_guest_cbs and nuke all of the code in the below files.  Except for arm,
> which recently lost KVM support, it's all a bunch of useless copy-paste code that
> serves no purpose and just complicates cleanups like this.
>
>>   arch/arm/kernel/perf_callchain.c   | 16 +++++++-----
>>   arch/csky/kernel/perf_callchain.c  |  4 +--
>>   arch/nds32/kernel/perf_event_cpu.c | 16 +++++++-----
>>   arch/riscv/kernel/perf_callchain.c |  4 +--


^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH V10 01/18] perf/core: Use static_call to optimize perf_guest_info_callbacks
  2021-09-15  1:19     ` Zhu, Lingshan
@ 2021-09-21 23:22       ` Sean Christopherson
  0 siblings, 0 replies; 6+ messages in thread
From: Sean Christopherson @ 2021-09-21 23:22 UTC (permalink / raw)
  To: Zhu, Lingshan
  Cc: peterz, pbonzini, bp, vkuznets, wanpengli, jmattson, joro,
	kan.liang, ak, wei.w.wang, eranian, liuxiangdong5, linux-kernel,
	x86, kvm, like.xu.linux, boris.ostrvsky, Like Xu, Will Deacon,
	Marc Zyngier, Guo Ren, Nick Hu, Paul Walmsley, Boris Ostrovsky,
	linux-arm-kernel, kvmarm, linux-csky, linux-riscv, xen-devel

On Wed, Sep 15, 2021, Zhu, Lingshan wrote:
> 
> 
> On 8/27/2021 3:59 AM, Sean Christopherson wrote:
> > TL;DR: Please don't merge this patch, it's broken and is also built on a shoddy
> >         foundation that I would like to fix.
> Hi Sean,Peter, Paolo
> 
> I will send out an V11 which drops this patch since it's buggy, and Sean is
> working on fix this.
> Does this sound good?

Works for me, thanks!

^ permalink raw reply	[flat|nested] 6+ messages in thread

end of thread, other threads:[~2021-09-21 23:22 UTC | newest]

Thread overview: 6+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
     [not found] <20210806133802.3528-1-lingshan.zhu@intel.com>
2021-08-06 13:37 ` [PATCH V10 01/18] perf/core: Use static_call to optimize perf_guest_info_callbacks Zhu Lingshan
2021-08-26 19:59   ` Sean Christopherson
2021-08-27  6:31     ` Like Xu
2021-09-15  1:19     ` Zhu, Lingshan
2021-09-21 23:22       ` Sean Christopherson
2021-08-27 17:23   ` Sean Christopherson

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).