From mboxrd@z Thu Jan  1 00:00:00 1970
Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand
        id S934689AbeAKP3u (ORCPT + 1 other);
        Thu, 11 Jan 2018 10:29:50 -0500
Received: from merlin.infradead.org ([205.233.59.134]:39710 "EHLO
        merlin.infradead.org" rhost-flags-OK-OK-OK-OK) by vger.kernel.org
        with ESMTP id S934278AbeAKP3m (ORCPT );
        Thu, 11 Jan 2018 10:29:42 -0500
Date: Thu, 11 Jan 2018 16:29:30 +0100
From: Peter Zijlstra
To: Vince Weaver
Cc: Ingo Molnar, linux-kernel@vger.kernel.org, Ingo Molnar,
        Arnaldo Carvalho de Melo, Thomas Gleixner
Subject: Re: perf: perf_fuzzer quickly locks up on 4.15-rc7
Message-ID: <20180111152930.GF2369@hirez.programming.kicks-ass.net>
References: <20180108173005.lkglqrixb2ota6g2@gmail.com>
 <20180109102507.GG6176@hirez.programming.kicks-ass.net>
 <20180109132602.GA2369@hirez.programming.kicks-ass.net>
 <20180111091353.GW6176@hirez.programming.kicks-ass.net>
MIME-Version: 1.0
Content-Type: text/plain; charset=us-ascii
Content-Disposition: inline
In-Reply-To: <20180111091353.GW6176@hirez.programming.kicks-ass.net>
User-Agent: Mutt/1.9.2 (2017-12-15)
Sender: linux-kernel-owner@vger.kernel.org
List-ID:
X-Mailing-List: linux-kernel@vger.kernel.org
Return-Path:

On Thu, Jan 11, 2018 at 10:13:53AM +0100, Peter Zijlstra wrote:
> On Tue, Jan 09, 2018 at 08:44:23AM -0500, Vince Weaver wrote:
> > On Tue, 9 Jan 2018, Peter Zijlstra wrote:
> >
> > > So remind me again, how are you running that fuzzer? I'm running
> > > ./fast_repro99.sh as root.
> >
> > I'm running ./fast_repro98.sh on a regular haswell machine with paranoid
> > set to "0".
>
> I'm seeing things like:
>
>   Cannot open /sys/kernel/tracing/kprobe_events
>
> this is likely caused by me not having anything mounted there. Rostedt
> provided the magic incantation to make that work, I'll go try now.
>
> At least I seem to have cured all lockdep splats.. of course, the moment
> I hit send on this something bad is bound to happen.

So while the IVB-EP (6/62/4) goes wobbly after a while, I also started
fuzzing on my HSW-EX (6/63/4) and there it runs without issue for over
30 minutes.

(I get an unchecked MSR splat from the RAPL pmu, but that is non-fatal)

But so far no weird hangs.. running tip/master + below.

Sadly the only other HSW I have is a laptop, and those are absolutely
useless for debugging (no serial :/) but I'll go see if I can reproduce.

FWIW these test-boxes are running debian/testing

---
 Documentation/admin-guide/kernel-parameters.txt |   5 +
 arch/x86/events/intel/ds.c                      |  33 ++++---
 kernel/events/core.c                            |  47 +++++++++-
 kernel/locking/lockdep.c                        |  10 ++
 kernel/printk/printk.c                          | 119 +++++++++++++++++-------
 kernel/smp.c                                    |  10 ++
 6 files changed, 171 insertions(+), 53 deletions(-)

diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 8bc512782bd2..7fa1a59461f7 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -1169,6 +1169,11 @@
                 parameter will force ia64_sal_cache_flush to call
                 ia64_pal_cache_flush instead of SAL_CACHE_FLUSH.
 
+        force_early_printk
+                Forcefully uses early_console (as per earlyprintk=)
+                usage for regular printk, bypassing everything,
+                including the syslog (dmesg will be empty).
+
         forcepae [X86-32]
                 Forcefully enable Physical Address Extension (PAE).
                 Many Pentium M systems disable PAE but may have a
diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c
index 8156e47da7ba..18c25ab28557 100644
--- a/arch/x86/events/intel/ds.c
+++ b/arch/x86/events/intel/ds.c
@@ -372,10 +372,9 @@ static int alloc_pebs_buffer(int cpu)
 static void release_pebs_buffer(int cpu)
 {
         struct cpu_hw_events *hwev = per_cpu_ptr(&cpu_hw_events, cpu);
-        struct debug_store *ds = hwev->ds;
         void *cea;
 
-        if (!ds || !x86_pmu.pebs)
+        if (!x86_pmu.pebs)
                 return;
 
         kfree(per_cpu(insn_buffer, cpu));
@@ -384,7 +383,6 @@ static void release_pebs_buffer(int cpu)
         /* Clear the fixmap */
         cea = &get_cpu_entry_area(cpu)->cpu_debug_buffers.pebs_buffer;
         ds_clear_cea(cea, x86_pmu.pebs_buffer_size);
-        ds->pebs_buffer_base = 0;
         dsfree_pages(hwev->ds_pebs_vaddr, x86_pmu.pebs_buffer_size);
         hwev->ds_pebs_vaddr = NULL;
 }
@@ -419,16 +417,14 @@ static int alloc_bts_buffer(int cpu)
 static void release_bts_buffer(int cpu)
 {
         struct cpu_hw_events *hwev = per_cpu_ptr(&cpu_hw_events, cpu);
-        struct debug_store *ds = hwev->ds;
         void *cea;
 
-        if (!ds || !x86_pmu.bts)
+        if (!x86_pmu.bts)
                 return;
 
         /* Clear the fixmap */
         cea = &get_cpu_entry_area(cpu)->cpu_debug_buffers.bts_buffer;
         ds_clear_cea(cea, BTS_BUFFER_SIZE);
-        ds->bts_buffer_base = 0;
         dsfree_pages(hwev->ds_bts_vaddr, BTS_BUFFER_SIZE);
         hwev->ds_bts_vaddr = NULL;
 }
@@ -454,16 +450,22 @@ void release_ds_buffers(void)
         if (!x86_pmu.bts && !x86_pmu.pebs)
                 return;
 
-        get_online_cpus();
-        for_each_online_cpu(cpu)
+        for_each_possible_cpu(cpu)
+                release_ds_buffer(cpu);
+
+        for_each_possible_cpu(cpu) {
+                /*
+                 * Again, ignore errors from offline CPUs, they will no longer
+                 * observe cpu_hw_events.ds and not program the DS_AREA when
+                 * they come up.
+                 */
                 fini_debug_store_on_cpu(cpu);
+        }
 
         for_each_possible_cpu(cpu) {
                 release_pebs_buffer(cpu);
                 release_bts_buffer(cpu);
-                release_ds_buffer(cpu);
         }
-        put_online_cpus();
 }
 
 void reserve_ds_buffers(void)
@@ -483,8 +485,6 @@ void reserve_ds_buffers(void)
         if (!x86_pmu.pebs)
                 pebs_err = 1;
 
-        get_online_cpus();
-
         for_each_possible_cpu(cpu) {
                 if (alloc_ds_buffer(cpu)) {
                         bts_err = 1;
@@ -521,11 +521,14 @@ void reserve_ds_buffers(void)
                 if (x86_pmu.pebs && !pebs_err)
                         x86_pmu.pebs_active = 1;
 
-                for_each_online_cpu(cpu)
+                for_each_possible_cpu(cpu) {
+                        /*
+                         * Ignores wrmsr_on_cpu() errors for offline CPUs they
+                         * will get this call through intel_pmu_cpu_starting().
+                         */
                         init_debug_store_on_cpu(cpu);
+                }
         }
-
-        put_online_cpus();
 }
 
 /*
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 4e1a1bf8d867..fb079be8971a 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -1231,6 +1231,10 @@ static void put_ctx(struct perf_event_context *ctx)
  *            perf_event_context::lock
  *          perf_event::mmap_mutex
  *          mmap_sem
+ *
+ *    cpu_hotplug_lock
+ *      pmus_lock
+ *        cpuctx->mutex / perf_event_context::mutex
  */
 static struct perf_event_context *
 perf_event_ctx_lock_nested(struct perf_event *event, int nesting)
@@ -4196,6 +4200,7 @@ int perf_event_release_kernel(struct perf_event *event)
 {
         struct perf_event_context *ctx = event->ctx;
         struct perf_event *child, *tmp;
+        LIST_HEAD(free_list);
 
         /*
          * If we got here through err_file: fput(event_file); we will not have
@@ -4268,8 +4273,7 @@ int perf_event_release_kernel(struct perf_event *event)
                                                struct perf_event, child_list);
                 if (tmp == child) {
                         perf_remove_from_context(child, DETACH_GROUP);
-                        list_del(&child->child_list);
-                        free_event(child);
+                        list_move(&child->child_list, &free_list);
                         /*
                          * This matches the refcount bump in inherit_event();
                          * this can't be the last reference.
@@ -4284,6 +4288,11 @@ int perf_event_release_kernel(struct perf_event *event)
         }
         mutex_unlock(&event->child_mutex);
 
+        list_for_each_entry_safe(child, tmp, &free_list, child_list) {
+                list_del(&child->child_list);
+                free_event(child);
+        }
+
 no_ctx:
         put_event(event); /* Must be the 'last' reference */
         return 0;
@@ -8526,6 +8535,29 @@ perf_event_set_addr_filter(struct perf_event *event, char *filter_str)
         return ret;
 }
 
+static int
+perf_tracepoint_set_filter(struct perf_event *event, char *filter_str)
+{
+        struct perf_event_context *ctx = event->ctx;
+        int ret;
+
+        /*
+         * Beware, here be dragons!!
+         *
+         * the tracepoint muck will deadlock against ctx->mutex, but the tracepoint
+         * stuff does not actually need it. So temporarily drop ctx->mutex. As per
+         * perf_event_ctx_lock() we already have a reference on ctx.
+         *
+         * This can result in event getting moved to a different ctx, but that
+         * does not affect the tracepoint state.
+         */
+        mutex_unlock(&ctx->mutex);
+        ret = ftrace_profile_set_filter(event, event->attr.config, filter_str);
+        mutex_lock(&ctx->mutex);
+
+        return ret;
+}
+
 static int perf_event_set_filter(struct perf_event *event, void __user *arg)
 {
         char *filter_str;
@@ -8542,8 +8574,7 @@ static int perf_event_set_filter(struct perf_event *event, void __user *arg)
 
         if (IS_ENABLED(CONFIG_EVENT_TRACING) &&
             event->attr.type == PERF_TYPE_TRACEPOINT)
-                ret = ftrace_profile_set_filter(event, event->attr.config,
-                                                filter_str);
+                ret = perf_tracepoint_set_filter(event, filter_str);
         else if (has_addr_filter(event))
                 ret = perf_event_set_addr_filter(event, filter_str);
 
@@ -9178,7 +9209,13 @@ static int perf_try_init_event(struct pmu *pmu, struct perf_event *event)
         if (!try_module_get(pmu->module))
                 return -ENODEV;
 
-        if (event->group_leader != event) {
+        /*
+         * A number of pmu->event_init() methods iterate the sibling_list to,
+         * for example, validate if the group fits on the PMU. Therefore,
+         * if this is a sibling event, acquire the ctx->mutex to protect
+         * the sibling_list.
+         */
+        if (event->group_leader != event && pmu->task_ctx_nr != perf_sw_context) {
                 /*
                  * This ctx->mutex can nest when we're called through
                  * inheritance. See the perf_event_ctx_lock_nested() comment.
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index 5fa1324a4f29..fb81dc92aaa6 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -2668,8 +2668,12 @@ check_usage_backwards(struct task_struct *curr, struct held_lock *this,
                                         this, 0, irqclass);
 }
 
+DECLARE_PER_CPU(struct cpumask *, smp_call_mask);
+
 void print_irqtrace_events(struct task_struct *curr)
 {
+        struct cpumask *__mask = this_cpu_read(smp_call_mask);
+
         printk("irq event stamp: %u\n", curr->irq_events);
         printk("hardirqs last enabled at (%u): [<%p>] %pS\n",
                 curr->hardirq_enable_event, (void *)curr->hardirq_enable_ip,
@@ -2683,6 +2687,12 @@ void print_irqtrace_events(struct task_struct *curr)
         printk("softirqs last disabled at (%u): [<%p>] %pS\n",
                 curr->softirq_disable_event, (void *)curr->softirq_disable_ip,
                 (void *)curr->softirq_disable_ip);
+
+        if (__mask) {
+                printk("smp_function_call mask: %*pbl\n", cpumask_pr_args(__mask));
+        } else {
+                printk("smp_function_call mask: (NULL)\n");
+        }
 }
 
 static int HARDIRQ_verbose(struct lock_class *class)
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index b9006617710f..c1a4ed17c22c 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -365,6 +365,75 @@ __packed __aligned(4)
 #endif
 ;
 
+#ifdef CONFIG_EARLY_PRINTK
+struct console *early_console;
+
+static bool __read_mostly force_early_printk;
+
+static int __init force_early_printk_setup(char *str)
+{
+        force_early_printk = true;
+        return 0;
+}
+early_param("force_early_printk", force_early_printk_setup);
+
+static int early_printk_cpu = -1;
+
+static int early_vprintk(const char *fmt, va_list args)
+{
+        int n, cpu, old;
+        char buf[512];
+
+        cpu = get_cpu();
+        /*
+         * Test-and-Set inter-cpu spinlock with recursion.
+         */
+        for (;;) {
+                /*
+                 * c-cas to avoid the exclusive bouncing on spin.
+                 * Depends on the memory barrier implied by cmpxchg
+                 * for ACQUIRE semantics.
+                 */
+                old = READ_ONCE(early_printk_cpu);
+                if (old == -1) {
+                        old = cmpxchg(&early_printk_cpu, -1, cpu);
+                        if (old == -1)
+                                break;
+                }
+                /*
+                 * Allow recursion for interrupts and the like.
+                 */
+                if (old == cpu)
+                        break;
+
+                cpu_relax();
+        }
+
+        n = vscnprintf(buf, sizeof(buf), fmt, args);
+        early_console->write(early_console, buf, n);
+
+        /*
+         * Unlock -- in case @old == @cpu, this is a no-op.
+         */
+        smp_store_release(&early_printk_cpu, old);
+        put_cpu();
+
+        return n;
+}
+
+asmlinkage __visible void early_printk(const char *fmt, ...)
+{
+        va_list ap;
+
+        if (!early_console)
+                return;
+
+        va_start(ap, fmt);
+        early_vprintk(fmt, ap);
+        va_end(ap);
+}
+#endif
+
 /*
  * The logbuf_lock protects kmsg buffer, indices, counters. This can be taken
  * within the scheduler's rq lock. It must be released before calling
@@ -1692,6 +1761,16 @@ asmlinkage int vprintk_emit(int facility, int level,
         int printed_len;
         bool in_sched = false;
 
+#ifdef CONFIG_KGDB_KDB
+        if (unlikely(kdb_trap_printk && kdb_printf_cpu < 0))
+                return vkdb_printf(KDB_MSGSRC_PRINTK, fmt, args);
+#endif
+
+#ifdef CONFIG_EARLY_PRINTK
+        if (force_early_printk && early_console)
+                return early_vprintk(fmt, args);
+#endif
+
         if (level == LOGLEVEL_SCHED) {
                 level = LOGLEVEL_DEFAULT;
                 in_sched = true;
@@ -1784,18 +1863,7 @@ EXPORT_SYMBOL(printk_emit);
 
 int vprintk_default(const char *fmt, va_list args)
 {
-        int r;
-
-#ifdef CONFIG_KGDB_KDB
-        /* Allow to pass printk() to kdb but avoid a recursion. */
-        if (unlikely(kdb_trap_printk && kdb_printf_cpu < 0)) {
-                r = vkdb_printf(KDB_MSGSRC_PRINTK, fmt, args);
-                return r;
-        }
-#endif
-        r = vprintk_emit(0, LOGLEVEL_DEFAULT, NULL, 0, fmt, args);
-
-        return r;
+        return vprintk_emit(0, LOGLEVEL_DEFAULT, NULL, 0, fmt, args);
 }
 EXPORT_SYMBOL_GPL(vprintk_default);
 
@@ -1826,7 +1894,12 @@ asmlinkage __visible int printk(const char *fmt, ...)
         int r;
 
         va_start(args, fmt);
-        r = vprintk_func(fmt, args);
+#ifdef CONFIG_EARLY_PRINTK
+        if (force_early_printk && early_console)
+                r = vprintk_default(fmt, args);
+        else
+#endif
+        r = vprintk_func(fmt, args);
         va_end(args);
 
         return r;
@@ -1863,26 +1936,6 @@ static bool suppress_message_printing(int level) { return false; }
 
 #endif /* CONFIG_PRINTK */
 
-#ifdef CONFIG_EARLY_PRINTK
-struct console *early_console;
-
-asmlinkage __visible void early_printk(const char *fmt, ...)
-{
-        va_list ap;
-        char buf[512];
-        int n;
-
-        if (!early_console)
-                return;
-
-        va_start(ap, fmt);
-        n = vscnprintf(buf, sizeof(buf), fmt, ap);
-        va_end(ap);
-
-        early_console->write(early_console, buf, n);
-}
-#endif
-
 static int __add_preferred_console(char *name, int idx, char *options,
                                    char *brl_options)
 {
diff --git a/kernel/smp.c b/kernel/smp.c
index 084c8b3a2681..d014602e13f7 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -257,6 +257,8 @@ static void flush_smp_call_function_queue(bool warn_cpu_offline)
         irq_work_run();
 }
 
+DEFINE_PER_CPU(const struct cpumask *, smp_call_mask);
+
 /*
  * smp_call_function_single - Run a function on a specific CPU
  * @func: The function to run. This must be fast and non-blocking.
@@ -296,11 +298,15 @@ int smp_call_function_single(int cpu, smp_call_func_t func, void *info,
                 csd_lock(csd);
         }
 
+        this_cpu_write(smp_call_mask, cpumask_of(cpu));
+
         err = generic_exec_single(cpu, csd, func, info);
 
         if (wait)
                 csd_lock_wait(csd);
 
+        this_cpu_write(smp_call_mask, NULL);
+
         put_cpu();
 
         return err;
@@ -457,6 +463,8 @@ void smp_call_function_many(const struct cpumask *mask,
                         __cpumask_set_cpu(cpu, cfd->cpumask_ipi);
         }
 
+        this_cpu_write(smp_call_mask, cfd->cpumask_ipi);
+
         /* Send a message to all CPUs in the map */
         arch_send_call_function_ipi_mask(cfd->cpumask_ipi);
 
@@ -468,6 +476,8 @@ void smp_call_function_many(const struct cpumask *mask,
                         csd_lock_wait(csd);
                 }
         }
+
+        this_cpu_write(smp_call_mask, NULL);
 }
 EXPORT_SYMBOL(smp_call_function_many);
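
For reference, a minimal user-space sketch of the "test-and-set inter-cpu
spinlock with recursion" scheme that the early_vprintk() hunk above uses.
This is illustrative only and not part of the patch: GCC/Clang __atomic
builtins stand in for the kernel's READ_ONCE()/cmpxchg()/smp_store_release(),
and a made-up "cpu id" argument replaces get_cpu().

/* Build with: cc -O2 -o recursive_lock recursive_lock.c */
#include <stdio.h>

static int lock_owner = -1;		/* -1 means unlocked */

/* Returns the previous owner; hand it back to unlock_owner(). */
static int lock_owner_recursive(int self)
{
	int old;

	for (;;) {
		old = __atomic_load_n(&lock_owner, __ATOMIC_RELAXED);
		if (old == -1) {
			/* check-then-cas, mirroring the "c-cas" comment in the patch */
			if (__atomic_compare_exchange_n(&lock_owner, &old, self,
							0, __ATOMIC_ACQUIRE,
							__ATOMIC_RELAXED))
				return -1;	/* took the lock */
			continue;		/* lost the race; 'old' now holds the owner */
		}
		if (old == self)
			return self;		/* recursion: already held by this "cpu" */
		/* held by someone else; spin (the kernel version does cpu_relax()) */
	}
}

static void unlock_owner(int old)
{
	/* when old == self this store is a no-op, as in the patch */
	__atomic_store_n(&lock_owner, old, __ATOMIC_RELEASE);
}

int main(void)
{
	int outer = lock_owner_recursive(0);	/* pretend we are CPU 0 */
	int inner = lock_owner_recursive(0);	/* re-entry on the same "cpu" */

	printf("owner while nested: %d\n", lock_owner);
	unlock_owner(inner);			/* no-op, owner stays 0 */
	unlock_owner(outer);			/* really releases */
	printf("owner after unlock: %d\n", lock_owner);
	return 0;
}

The point of returning the previous owner from the lock operation is that the
recursive unlock degenerates into storing back the value that is already
there, which is exactly what smp_store_release(&early_printk_cpu, old) does
in the patch when @old == @cpu.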