Hi Peter,

On 2022-11-17 at 19:11:37 +0100, Peter Zijlstra wrote:
> On Wed, Nov 16, 2022 at 11:39:53AM +0800, Pengfei Xu wrote:
> > int main(void)
> > {
> >   syscall(__NR_mmap, 0x1ffff000ul, 0x1000ul, 0ul, 0x32ul, -1, 0ul);
> >   syscall(__NR_mmap, 0x20000000ul, 0x1000000ul, 7ul, 0x32ul, -1, 0ul);
> >   syscall(__NR_mmap, 0x21000000ul, 0x1000ul, 0ul, 0x32ul, -1, 0ul);
> >   setup_sysctl();
> >
> >   *(uint32_t*)0x20000200 = 0;
> >   *(uint32_t*)0x20000204 = 0x80;
> >   *(uint8_t*)0x20000208 = 0;
> >   *(uint8_t*)0x20000209 = 0;
> >   *(uint8_t*)0x2000020a = 0;
> >   *(uint8_t*)0x2000020b = 0;
> >   *(uint32_t*)0x2000020c = 0;
> >   *(uint64_t*)0x20000210 = 8;
> >   *(uint64_t*)0x20000218 = 0;
> >   *(uint64_t*)0x20000220 = 0;
> >   STORE_BY_BITMASK(uint64_t, , 0x20000228, 0, 0, 1);
> >   STORE_BY_BITMASK(uint64_t, , 0x20000228, 0, 1, 1);
> >   STORE_BY_BITMASK(uint64_t, , 0x20000228, 0, 2, 1);
> >   STORE_BY_BITMASK(uint64_t, , 0x20000228, 0, 3, 1);
> >   STORE_BY_BITMASK(uint64_t, , 0x20000228, 0, 4, 1);
> >   STORE_BY_BITMASK(uint64_t, , 0x20000228, 1, 5, 1);
> >   STORE_BY_BITMASK(uint64_t, , 0x20000228, 0, 6, 1);
> >   STORE_BY_BITMASK(uint64_t, , 0x20000228, 0, 7, 1);
> >   STORE_BY_BITMASK(uint64_t, , 0x20000228, 0, 8, 1);
> >   STORE_BY_BITMASK(uint64_t, , 0x20000228, 0, 9, 1);
> >   STORE_BY_BITMASK(uint64_t, , 0x20000228, 1, 10, 1);
> >   STORE_BY_BITMASK(uint64_t, , 0x20000228, 0, 11, 1);
> >   STORE_BY_BITMASK(uint64_t, , 0x20000228, 0, 12, 1);
> >   STORE_BY_BITMASK(uint64_t, , 0x20000228, 0, 13, 1);
> >   STORE_BY_BITMASK(uint64_t, , 0x20000228, 0, 14, 1);
> >   STORE_BY_BITMASK(uint64_t, , 0x20000228, 0, 15, 2);
> >   STORE_BY_BITMASK(uint64_t, , 0x20000228, 0, 17, 1);
> >   STORE_BY_BITMASK(uint64_t, , 0x20000228, 0, 18, 1);
> >   STORE_BY_BITMASK(uint64_t, , 0x20000228, 0, 19, 1);
> >   STORE_BY_BITMASK(uint64_t, , 0x20000228, 0, 20, 1);
> >   STORE_BY_BITMASK(uint64_t, , 0x20000228, 0, 21, 1);
> >   STORE_BY_BITMASK(uint64_t, , 0x20000228, 0, 22, 1);
> >   STORE_BY_BITMASK(uint64_t, , 0x20000228, 0, 23, 1);
> >   STORE_BY_BITMASK(uint64_t, , 0x20000228, 0, 24, 1);
> >   STORE_BY_BITMASK(uint64_t, , 0x20000228, 0, 25, 1);
> >   STORE_BY_BITMASK(uint64_t, , 0x20000228, 0, 26, 1);
> >   STORE_BY_BITMASK(uint64_t, , 0x20000228, 0, 27, 1);
> >   STORE_BY_BITMASK(uint64_t, , 0x20000228, 0, 28, 1);
> >   STORE_BY_BITMASK(uint64_t, , 0x20000228, 0, 29, 1);
> >   STORE_BY_BITMASK(uint64_t, , 0x20000228, 0, 30, 1);
> >   STORE_BY_BITMASK(uint64_t, , 0x20000228, 0, 31, 1);
> >   STORE_BY_BITMASK(uint64_t, , 0x20000228, 1, 32, 1);
> >   STORE_BY_BITMASK(uint64_t, , 0x20000228, 1, 33, 1);
> >   STORE_BY_BITMASK(uint64_t, , 0x20000228, 0, 34, 1);
> >   STORE_BY_BITMASK(uint64_t, , 0x20000228, 0, 35, 1);
> >   STORE_BY_BITMASK(uint64_t, , 0x20000228, 1, 36, 1);
> >   STORE_BY_BITMASK(uint64_t, , 0x20000228, 1, 37, 1);
> >   STORE_BY_BITMASK(uint64_t, , 0x20000228, 0, 38, 26);
> >   *(uint32_t*)0x20000230 = 0;
> >   *(uint32_t*)0x20000234 = 0;
> >   *(uint64_t*)0x20000238 = 0;
> >   *(uint64_t*)0x20000240 = 0;
> >   *(uint64_t*)0x20000248 = 0;
> >   *(uint64_t*)0x20000250 = 0;
> >   *(uint32_t*)0x20000258 = 0;
> >   *(uint32_t*)0x2000025c = 0;
> >   *(uint64_t*)0x20000260 = 0;
> >   *(uint32_t*)0x20000268 = 0;
> >   *(uint16_t*)0x2000026c = 0;
> >   *(uint16_t*)0x2000026e = 0;
> >   *(uint32_t*)0x20000270 = 0;
> >   *(uint32_t*)0x20000274 = 0;
> >   *(uint64_t*)0x20000278 = 0xd62;
> >   syscall(__NR_perf_event_open, 0x20000200ul, 0, 0ul, -1, 3ul);
> >   return 0;
> > }
>
> Putting that next to 'pahole --hex -C perf_event_attr' seems to suggest
> this is HW_CPU_CYCLES with a sampling on and exclude_kernel and sigtrap
> set.
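
(For readability: decoding the raw stores above by hand against struct
perf_event_attr, the reproducer seems to build roughly the attr below.
This is my own reading of the offsets and bit positions, not syzkaller
output; it omits the mmap/setup_sysctl() scaffolding and assumes uapi
headers new enough (v5.13+) to know about remove_on_exec/sigtrap/sig_data.)

#include <linux/perf_event.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

int main(void)
{
        struct perf_event_attr attr;
        int fd;

        memset(&attr, 0, sizeof(attr));
        attr.type           = PERF_TYPE_HARDWARE;        /* offset 0x00: 0 */
        attr.size           = 0x80;                       /* offset 0x04: PERF_ATTR_SIZE_VER7 */
        attr.config         = PERF_COUNT_HW_CPU_CYCLES;   /* offset 0x08: 0 */
        attr.sample_freq    = 8;                          /* offset 0x10 (freq mode) */
        attr.exclude_kernel = 1;                          /* bitfield bit 5 */
        attr.freq           = 1;                          /* bit 10 */
        attr.cgroup         = 1;                          /* bit 32 */
        attr.text_poke      = 1;                          /* bit 33 */
        attr.remove_on_exec = 1;                          /* bit 36 */
        attr.sigtrap        = 1;                          /* bit 37 */
        attr.sig_data       = 0xd62;                      /* offset 0x78 */

        /* pid = 0, cpu = 0, group_fd = -1,
         * flags = 3 = PERF_FLAG_FD_NO_GROUP | PERF_FLAG_FD_OUTPUT */
        fd = syscall(__NR_perf_event_open, &attr, 0, 0, -1, 0x3ul);
        return fd < 0 ? 1 : 0;
}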

> Now, for hardware events like this, you'd expect the PMU OS filter to
> respect exclude_kernel (specifically not set ARCH_PERFMON_EVENTSEL_OS in
> the relevant MSR), except there's a bunch of erratas and skid related
> 'funnies' that mean a user event can trigger across the kernel boundary.
>
> Does the below patch help -- it should filter out those sort of things.
>
> (still also including that previous patch)
>
> ---
> diff --git a/kernel/events/core.c b/kernel/events/core.c
> index 4ec3717003d5..cc86bf25f0c3 100644
> --- a/kernel/events/core.c
> +++ b/kernel/events/core.c
> @@ -9273,6 +9273,21 @@ int perf_event_account_interrupt(struct perf_event *event)
>  	return __perf_event_account_interrupt(event, 1);
>  }
>  
> +static inline bool sample_is_allowed(struct perf_event *event, struct pt_regs *regs)
> +{
> +	/*
> +	 * Due to interrupt latency (AKA "skid"), we may enter the
> +	 * kernel before taking an overflow, even if the PMU is only
> +	 * counting user events.
> +	 * To avoid leaking information to userspace, we must always
> +	 * reject kernel samples when exclude_kernel is set.
> +	 */
> +	if (event->attr.exclude_kernel && !user_mode(regs))
> +		return false;
> +
> +	return true;
> +}
> +
>  /*
>   * Generic event overflow handling, sampling.
>   */
> @@ -9305,15 +9320,28 @@ static int __perf_event_overflow(struct perf_event *event,
>  		perf_event_disable_inatomic(event);
>  	}
>  
> -	if (event->attr.sigtrap) {
> -		/*
> -		 * Should not be able to return to user space without processing
> -		 * pending_sigtrap (kernel events can overflow multiple times).
> -		 */
> -		WARN_ON_ONCE(event->pending_sigtrap && event->attr.exclude_kernel);
> +	if (event->attr.sigtrap && sample_is_allowed(event, regs)) {
> +		unsigned int pending_id = 1;
> +
> +		if (regs)
> +			pending_id = hash32_ptr((void *)instruction_pointer(regs)) ?: 1;
>  		if (!event->pending_sigtrap) {
> -			event->pending_sigtrap = 1;
> +			event->pending_sigtrap = pending_id;
>  			local_inc(&event->ctx->nr_pending);
> +		} else if (event->attr.exclude_kernel) {
> +			/*
> +			 * Should not be able to return to user space without
> +			 * consuming pending_sigtrap; with exceptions:
> +			 *
> +			 *  1. Where !exclude_kernel, events can overflow again
> +			 *     in the kernel without returning to user space.
> +			 *
> +			 *  2. Events that can overflow again before the IRQ-
> +			 *     work without user space progress (e.g. hrtimer).
> +			 *     To approximate progress (with false negatives),
> +			 *     check 32-bit hash of the current IP.
> +			 */
> +			WARN_ON_ONCE(event->pending_sigtrap != pending_id);
>  		}
>  		event->pending_addr = data->addr;
>  		irq_work_queue(&event->pending_irq);

Thanks a lot for your patch! You added the part below on top of the
previous patch at:
https://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git/commit/?h=perf/urgent&id=bb88f9695460bec25aa30ba9072595025cf6c8af

"
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 884871427a94..cc86bf25f0c3 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -9273,6 +9273,21 @@ int perf_event_account_interrupt(struct perf_event *event)
 	return __perf_event_account_interrupt(event, 1);
 }
 
+static inline bool sample_is_allowed(struct perf_event *event, struct pt_regs *regs)
+{
+	/*
+	 * Due to interrupt latency (AKA "skid"), we may enter the
+	 * kernel before taking an overflow, even if the PMU is only
+	 * counting user events.
+	 * To avoid leaking information to userspace, we must always
+	 * reject kernel samples when exclude_kernel is set.
+	 */
+	if (event->attr.exclude_kernel && !user_mode(regs))
+		return false;
+
+	return true;
+}
+
 /*
  * Generic event overflow handling, sampling.
  */
@@ -9305,7 +9320,7 @@ static int __perf_event_overflow(struct perf_event *event,
 		perf_event_disable_inatomic(event);
 	}
 
-	if (event->attr.sigtrap) {
+	if (event->attr.sigtrap && sample_is_allowed(event, regs)) {
 		unsigned int pending_id = 1;
 
 		if (regs)
"

After your patch, I could not reproduce this issue in 15465 runs of
"./repro". Attached is the dmesg of the guest; the issue could not be
reproduced there either. The result shows that your additional patch
fixes this issue!

If possible, could you add a Reported-and-tested-by tag from me? Thanks!

BR.
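
P.S. In case it is useful for retesting: the stress loop I used is
essentially equivalent to the small wrapper below. This is an illustrative
rewrite of my shell loop, not the exact script; the WARN shows up in dmesg
rather than in the reproducer's exit status, so dmesg still has to be
checked separately.

/*
 * Run ./repro many times, then check dmesg for the WARN by hand.
 */
#include <stdio.h>
#include <stdlib.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <unistd.h>

int main(int argc, char **argv)
{
        long n = argc > 1 ? atol(argv[1]) : 15465;   /* iteration count from my test */

        for (long i = 0; i < n; i++) {
                pid_t pid = fork();

                if (pid < 0) {
                        perror("fork");
                        return 1;
                }
                if (pid == 0) {
                        execl("./repro", "./repro", (char *)NULL);
                        _exit(127);                  /* exec failed */
                }
                waitpid(pid, NULL, 0);
        }
        printf("ran ./repro %ld times, now check dmesg for the WARN\n", n);
        return 0;
}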