* [ANNOUNCE] v4.9-rt1
@ 2016-12-23 16:32 Sebastian Andrzej Siewior
  2016-12-26  6:54 ` [patch-rt] kvm: Convert pvclock_gtod_sync_lock to raw_spinlock_t Mike Galbraith
                   ` (4 more replies)
  0 siblings, 5 replies; 21+ messages in thread
From: Sebastian Andrzej Siewior @ 2016-12-23 16:32 UTC (permalink / raw)
  To: Thomas Gleixner; +Cc: LKML, linux-rt-users, Steven Rostedt

Dear RT folks!

I'm pleased to announce the v4.9-rt1 patch set. 

Please don't download and boot this before Christmas Eve.

Changes since v4.8.15-rt10

  - rebase to v4.9

Known issues
	- CPU hotplug got a little better but can deadlock.

You can get this release via the git tree at:

    git://git.kernel.org/pub/scm/linux/kernel/git/rt/linux-rt-devel.git v4.9-rt1

The RT patch against v4.9 can be found here:

    https://cdn.kernel.org/pub/linux/kernel/projects/rt/4.9/older/patch-4.9-rt1.patch.xz

The split quilt queue is available at:

    https://cdn.kernel.org/pub/linux/kernel/projects/rt/4.9/older/patches-4.9-rt1.tar.xz

Sebastian


* [patch-rt] kvm: Convert pvclock_gtod_sync_lock to raw_spinlock_t
  2016-12-23 16:32 [ANNOUNCE] v4.9-rt1 Sebastian Andrzej Siewior
@ 2016-12-26  6:54 ` Mike Galbraith
  2017-01-20 16:44   ` Sebastian Andrzej Siewior
  2016-12-26  7:00 ` [rfc patch-rt] posix_cpu_timers: Kill hotplug cpu notifier Mike Galbraith
                   ` (3 subsequent siblings)
  4 siblings, 1 reply; 21+ messages in thread
From: Mike Galbraith @ 2016-12-26  6:54 UTC (permalink / raw)
  To: Sebastian Andrzej Siewior, Thomas Gleixner
  Cc: LKML, linux-rt-users, Steven Rostedt

Fix the splat below.  The lock is only held briefly, so make it a raw spinlock.
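
For context: on PREEMPT_RT a spinlock_t is backed by an rtmutex and may sleep,
so taking it in a section running with interrupts disabled trips the
might_sleep() check seen in the splat below.  A raw_spinlock_t stays a true
spinning lock on RT and is fine for short, non-sleeping critical sections.
A minimal sketch of the pattern (the lock and function names here are made up,
not KVM code):

#include <linux/spinlock.h>

static DEFINE_RAW_SPINLOCK(example_lock);	/* hypothetical lock */

static void example_short_critical_section(void)
{
	unsigned long flags;

	/* Safe on RT: raw_spin_lock never sleeps, so irqs-off callers are ok. */
	raw_spin_lock_irqsave(&example_lock, flags);
	/* ... brief, non-sleeping work ... */
	raw_spin_unlock_irqrestore(&example_lock, flags);
}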

[15528.614216] BUG: sleeping function called from invalid context at kernel/locking/rtmutex.c:995
[15528.614218] in_atomic(): 0, irqs_disabled(): 1, pid: 19619, name: qemu-system-x86
[15528.614218] no locks held by qemu-system-x86/19619.
[15528.614219] irq event stamp: 321840
[15528.614224] hardirqs last  enabled at (321839): [<ffffffff816a8467>] entry_SYSCALL_64_fastpath+0x5/0xc2
[15528.614244] hardirqs last disabled at (321840): [<ffffffffa08578f4>] kvm_arch_vm_ioctl+0x234/0xda0 [kvm]
[15528.614246] softirqs last  enabled at (0): [<ffffffff81077e8a>] copy_process.part.36+0x5ba/0x20b0
[15528.614247] softirqs last disabled at (0): [<          (null)>]           (null)
[15528.614250] CPU: 7 PID: 19619 Comm: qemu-system-x86 Tainted: G            E   4.9.0-rt1-virgin #1
[15528.614250] Hardware name: MEDION MS-7848/MS-7848, BIOS M7848W08.20C 09/23/2013
[15528.614253]  ffffc9000b98bc30 ffffffff8136874d 0000000000000000 ffff8803e76db200
[15528.614255]  ffffc9000b98bc68 ffffffff810abe9d ffff8800353472d0 ffff8800353472d0
[15528.614257]  00007ffc53dbc2b0 000000000000000b 00007ffc53dbc2b0 ffffc9000b98bc88
[15528.614257] Call Trace:
[15528.614262]  [<ffffffff8136874d>] dump_stack+0x85/0xc8
[15528.614266]  [<ffffffff810abe9d>] ___might_sleep+0x15d/0x260
[15528.614268]  [<ffffffff816a7ca4>] rt_spin_lock+0x24/0x80
[15528.614283]  [<ffffffffa084b212>] __get_kvmclock_ns+0x22/0xf0 [kvm]
[15528.614297]  [<ffffffffa08578fc>] kvm_arch_vm_ioctl+0x23c/0xda0 [kvm]
[15528.614300]  [<ffffffff810dc5d5>] ? __lock_acquire+0x305/0x16a0
[15528.614301]  [<ffffffff8107d916>] ? unpin_current_cpu+0x16/0x70
[15528.614314]  [<ffffffffa08409dd>] kvm_vm_ioctl+0x9d/0x920 [kvm]
[15528.614316]  [<ffffffff8127b547>] ? __fget+0x107/0x220
[15528.614318]  [<ffffffff810da5f9>] ? __lock_is_held+0x49/0x70
[15528.614320]  [<ffffffff8126e9b6>] do_vfs_ioctl+0x96/0x6c0
[15528.614321]  [<ffffffff8127b564>] ? __fget+0x124/0x220
[15528.614322]  [<ffffffff8127b445>] ? __fget+0x5/0x220
[15528.614324]  [<ffffffff8126f021>] SyS_ioctl+0x41/0x70
[15528.614326]  [<ffffffff816a8481>] entry_SYSCALL_64_fastpath+0x1f/0xc2

Signed-off-by: Mike Galbraith <efault@gmx.de>
---
 arch/x86/include/asm/kvm_host.h |    2 +-
 arch/x86/kvm/x86.c              |   20 ++++++++++----------
 2 files changed, 11 insertions(+), 11 deletions(-)

--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -755,7 +755,7 @@ struct kvm_arch {
 	u64 cur_tsc_generation;
 	int nr_vcpus_matched_tsc;
 
-	spinlock_t pvclock_gtod_sync_lock;
+	raw_spinlock_t pvclock_gtod_sync_lock;
 	bool use_master_clock;
 	u64 master_kernel_ns;
 	cycle_t master_cycle_now;
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1540,7 +1540,7 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu
 	kvm_vcpu_write_tsc_offset(vcpu, offset);
 	raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags);
 
-	spin_lock(&kvm->arch.pvclock_gtod_sync_lock);
+	raw_spin_lock(&kvm->arch.pvclock_gtod_sync_lock);
 	if (!matched) {
 		kvm->arch.nr_vcpus_matched_tsc = 0;
 	} else if (!already_matched) {
@@ -1548,7 +1548,7 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu
 	}
 
 	kvm_track_tsc_matching(vcpu);
-	spin_unlock(&kvm->arch.pvclock_gtod_sync_lock);
+	raw_spin_unlock(&kvm->arch.pvclock_gtod_sync_lock);
 }
 
 EXPORT_SYMBOL_GPL(kvm_write_tsc);
@@ -1715,7 +1715,7 @@ static void kvm_gen_update_masterclock(s
 	struct kvm_vcpu *vcpu;
 	struct kvm_arch *ka = &kvm->arch;
 
-	spin_lock(&ka->pvclock_gtod_sync_lock);
+	raw_spin_lock(&ka->pvclock_gtod_sync_lock);
 	kvm_make_mclock_inprogress_request(kvm);
 	/* no guest entries from this point */
 	pvclock_update_vm_gtod_copy(kvm);
@@ -1727,7 +1727,7 @@ static void kvm_gen_update_masterclock(s
 	kvm_for_each_vcpu(i, vcpu, kvm)
 		clear_bit(KVM_REQ_MCLOCK_INPROGRESS, &vcpu->requests);
 
-	spin_unlock(&ka->pvclock_gtod_sync_lock);
+	raw_spin_unlock(&ka->pvclock_gtod_sync_lock);
 #endif
 }
 
@@ -1736,15 +1736,15 @@ static u64 __get_kvmclock_ns(struct kvm
 	struct kvm_arch *ka = &kvm->arch;
 	struct pvclock_vcpu_time_info hv_clock;
 
-	spin_lock(&ka->pvclock_gtod_sync_lock);
+	raw_spin_lock(&ka->pvclock_gtod_sync_lock);
 	if (!ka->use_master_clock) {
-		spin_unlock(&ka->pvclock_gtod_sync_lock);
+		raw_spin_unlock(&ka->pvclock_gtod_sync_lock);
 		return ktime_get_boot_ns() + ka->kvmclock_offset;
 	}
 
 	hv_clock.tsc_timestamp = ka->master_cycle_now;
 	hv_clock.system_time = ka->master_kernel_ns + ka->kvmclock_offset;
-	spin_unlock(&ka->pvclock_gtod_sync_lock);
+	raw_spin_unlock(&ka->pvclock_gtod_sync_lock);
 
 	kvm_get_time_scale(NSEC_PER_SEC, __this_cpu_read(cpu_tsc_khz) * 1000LL,
 			   &hv_clock.tsc_shift,
@@ -1835,13 +1835,13 @@ static int kvm_guest_time_update(struct
 	 * If the host uses TSC clock, then passthrough TSC as stable
 	 * to the guest.
 	 */
-	spin_lock(&ka->pvclock_gtod_sync_lock);
+	raw_spin_lock(&ka->pvclock_gtod_sync_lock);
 	use_master_clock = ka->use_master_clock;
 	if (use_master_clock) {
 		host_tsc = ka->master_cycle_now;
 		kernel_ns = ka->master_kernel_ns;
 	}
-	spin_unlock(&ka->pvclock_gtod_sync_lock);
+	raw_spin_unlock(&ka->pvclock_gtod_sync_lock);
 
 	/* Keep irq disabled to prevent changes to the clock */
 	local_irq_save(flags);
@@ -7831,7 +7831,7 @@ int kvm_arch_init_vm(struct kvm *kvm, un
 
 	raw_spin_lock_init(&kvm->arch.tsc_write_lock);
 	mutex_init(&kvm->arch.apic_map_lock);
-	spin_lock_init(&kvm->arch.pvclock_gtod_sync_lock);
+	raw_spin_lock_init(&kvm->arch.pvclock_gtod_sync_lock);
 
 	kvm->arch.kvmclock_offset = -ktime_get_boot_ns();
 	pvclock_update_vm_gtod_copy(kvm);


* [rfc patch-rt] posix_cpu_timers: Kill hotplug cpu notifier
  2016-12-23 16:32 [ANNOUNCE] v4.9-rt1 Sebastian Andrzej Siewior
  2016-12-26  6:54 ` [patch-rt] kvm: Convert pvclock_gtod_sync_lock to raw_spinlock_t Mike Galbraith
@ 2016-12-26  7:00 ` Mike Galbraith
  2017-01-20 16:46   ` Sebastian Andrzej Siewior
  2016-12-31  8:20 ` [patch-rt] softirq: Move ksoftirqd_running() under !CONFIG_PREEMPT_RT_FULL Mike Galbraith
                   ` (2 subsequent siblings)
  4 siblings, 1 reply; 21+ messages in thread
From: Mike Galbraith @ 2016-12-26  7:00 UTC (permalink / raw)
  To: Sebastian Andrzej Siewior, Thomas Gleixner
  Cc: LKML, linux-rt-users, Steven Rostedt

Shamelessly steal the softirq.c thread initialization method.
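
For reference, the smpboot per-CPU kthread machinery that softirq.c uses (and
which the patch below adopts) follows this general shape; all names in the
sketch are made up:

#include <linux/smpboot.h>
#include <linux/percpu.h>
#include <linux/init.h>

static DEFINE_PER_CPU(struct task_struct *, example_task);

/* Tell the framework whether there is per-CPU work pending. */
static int example_should_run(unsigned int cpu)
{
	return 0;	/* nonzero means: wake the thread and call thread_fn() */
}

/* Runs in the per-CPU kthread whenever example_should_run() said so. */
static void example_thread_fn(unsigned int cpu)
{
	/* process the per-CPU work */
}

static struct smp_hotplug_thread example_threads = {
	.store			= &example_task,
	.thread_should_run	= example_should_run,
	.thread_fn		= example_thread_fn,
	.thread_comm		= "example/%u",
};

/* One "example/N" thread per CPU; hotplug park/unpark is handled for us. */
static int __init example_init(void)
{
	return smpboot_register_percpu_thread(&example_threads);
}
early_initcall(example_init);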

Signed-off-by: Mike Galbraith <efault@gmx.de>
---
 include/linux/cpuhotplug.h     |    1 
 kernel/time/posix-cpu-timers.c |  158 ++++++++++++++---------------------------
 2 files changed, 56 insertions(+), 103 deletions(-)

--- a/include/linux/cpuhotplug.h
+++ b/include/linux/cpuhotplug.h
@@ -20,6 +20,7 @@ enum cpuhp_state {
 	CPUHP_SLUB_DEAD,
 	CPUHP_MM_WRITEBACK_DEAD,
 	CPUHP_SOFTIRQ_DEAD,
+	CPUHP_POSIXCPUTMR_DEAD,
 	CPUHP_NET_MVNETA_DEAD,
 	CPUHP_CPUIDLE_DEAD,
 	CPUHP_ARM64_FPSIMD_DEAD,
--- a/kernel/time/posix-cpu-timers.c
+++ b/kernel/time/posix-cpu-timers.c
@@ -13,6 +13,8 @@
 #include <linux/random.h>
 #include <linux/tick.h>
 #include <linux/workqueue.h>
+#include <linux/smpboot.h>
+#include <linux/cpuhotplug.h>
 
 /*
  * Called after updating RLIMIT_CPU to run cpu timer and update
@@ -1220,62 +1222,36 @@ static void __run_posix_cpu_timers(struc
 DEFINE_PER_CPU(struct task_struct *, posix_timer_task);
 DEFINE_PER_CPU(struct task_struct *, posix_timer_tasklist);
 
-static int posix_cpu_timers_thread(void *data)
+static void posix_cpu_timers_thread(unsigned int cpu)
 {
-	int cpu = (long)data;
+	struct task_struct *tsk = NULL;
+	struct task_struct *next = NULL;
 
-	BUG_ON(per_cpu(posix_timer_task,cpu) != current);
+	/* grab task list */
+	raw_local_irq_disable();
+	tsk = per_cpu(posix_timer_tasklist, cpu);
+	per_cpu(posix_timer_tasklist, cpu) = NULL;
+	raw_local_irq_enable();
+
+	/* Process task list */
+	while (tsk) {
+		/* save next */
+		next = tsk->posix_timer_list;
 
-	while (!kthread_should_stop()) {
-		struct task_struct *tsk = NULL;
-		struct task_struct *next = NULL;
-
-		if (cpu_is_offline(cpu))
-			goto wait_to_die;
-
-		/* grab task list */
-		raw_local_irq_disable();
-		tsk = per_cpu(posix_timer_tasklist, cpu);
-		per_cpu(posix_timer_tasklist, cpu) = NULL;
-		raw_local_irq_enable();
-
-		/* its possible the list is empty, just return */
-		if (!tsk) {
-			set_current_state(TASK_INTERRUPTIBLE);
-			schedule();
-			__set_current_state(TASK_RUNNING);
-			continue;
-		}
-
-		/* Process task list */
-		while (1) {
-			/* save next */
-			next = tsk->posix_timer_list;
+		/* run the task timers, clear its ptr and
+		 * unreference it
+		 */
+		__run_posix_cpu_timers(tsk);
+		tsk->posix_timer_list = NULL;
+		put_task_struct(tsk);
 
-			/* run the task timers, clear its ptr and
-			 * unreference it
-			 */
-			__run_posix_cpu_timers(tsk);
-			tsk->posix_timer_list = NULL;
-			put_task_struct(tsk);
-
-			/* check if this is the last on the list */
-			if (next == tsk)
-				break;
-			tsk = next;
-		}
+		/* check if this is the last on the list */
+		if (next == tsk)
+			break;
+		tsk = next;
 	}
-	return 0;
 
-wait_to_die:
-	/* Wait for kthread_stop */
-	set_current_state(TASK_INTERRUPTIBLE);
-	while (!kthread_should_stop()) {
-		schedule();
-		set_current_state(TASK_INTERRUPTIBLE);
-	}
-	__set_current_state(TASK_RUNNING);
-	return 0;
+	return;
 }
 
 static inline int __fastpath_timer_check(struct task_struct *tsk)
@@ -1322,72 +1298,48 @@ void run_posix_cpu_timers(struct task_st
 	}
 }
 
-/*
- * posix_cpu_thread_call - callback that gets triggered when a CPU is added.
- * Here we can start up the necessary migration thread for the new CPU.
- */
-static int posix_cpu_thread_call(struct notifier_block *nfb,
-				 unsigned long action, void *hcpu)
+static void posix_cpu_thread_setup(unsigned int cpu)
 {
-	int cpu = (long)hcpu;
-	struct task_struct *p;
-	struct sched_param param;
+	struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };
 
-	switch (action) {
-	case CPU_UP_PREPARE:
-		p = kthread_create(posix_cpu_timers_thread, hcpu,
-					"posixcputmr/%d",cpu);
-		if (IS_ERR(p))
-			return NOTIFY_BAD;
-		p->flags |= PF_NOFREEZE;
-		kthread_bind(p, cpu);
-		/* Must be high prio to avoid getting starved */
-		param.sched_priority = MAX_RT_PRIO-1;
-		sched_setscheduler(p, SCHED_FIFO, &param);
-		per_cpu(posix_timer_task,cpu) = p;
-		break;
-	case CPU_ONLINE:
-		/* Strictly unneccessary, as first user will wake it. */
-		wake_up_process(per_cpu(posix_timer_task,cpu));
-		break;
-#ifdef CONFIG_HOTPLUG_CPU
-	case CPU_UP_CANCELED:
-		/* Unbind it from offline cpu so it can run.  Fall thru. */
-		kthread_bind(per_cpu(posix_timer_task, cpu),
-			     cpumask_any(cpu_online_mask));
-		kthread_stop(per_cpu(posix_timer_task,cpu));
-		per_cpu(posix_timer_task,cpu) = NULL;
-		break;
-	case CPU_DEAD:
-		kthread_stop(per_cpu(posix_timer_task,cpu));
-		per_cpu(posix_timer_task,cpu) = NULL;
-		break;
-#endif
-	}
-	return NOTIFY_OK;
+	current->flags |= PF_NOFREEZE;
+	sched_setscheduler(current, SCHED_FIFO, &param);
 }
 
-/* Register at highest priority so that task migration (migrate_all_tasks)
- * happens before everything else.
- */
-static struct notifier_block posix_cpu_thread_notifier = {
-	.notifier_call = posix_cpu_thread_call,
-	.priority = 10
+static void posix_cpu_thread_cleanup(unsigned int cpu, bool online)
+{
+	struct sched_param param = { .sched_priority = 0 };
+
+	current->flags &= ~PF_NOFREEZE;
+	sched_setscheduler(current, SCHED_NORMAL, &param);
+}
+
+static int posix_cpu_thread_should_run(unsigned int cpu)
+{
+	return !!per_cpu(posix_timer_tasklist, cpu);
+}
+
+static struct smp_hotplug_thread posix_timer_threads = {
+	.store			= &posix_timer_task,
+	.setup			= posix_cpu_thread_setup,
+	.cleanup		= posix_cpu_thread_cleanup,
+	.thread_should_run	= posix_cpu_thread_should_run,
+	.thread_fn		= posix_cpu_timers_thread,
+	.thread_comm		= "posixcputmr/%u",
 };
 
 static int __init posix_cpu_thread_init(void)
 {
-	void *hcpu = (void *)(long)smp_processor_id();
-	/* Start one for boot CPU. */
+	struct smp_hotplug_thread *t = &posix_timer_threads;
 	unsigned long cpu;
 
-	/* init the per-cpu posix_timer_tasklets */
+	/* init the per-cpu posix_timer_tasklist */
 	for_each_possible_cpu(cpu)
 		per_cpu(posix_timer_tasklist, cpu) = NULL;
 
-	posix_cpu_thread_call(&posix_cpu_thread_notifier, CPU_UP_PREPARE, hcpu);
-	posix_cpu_thread_call(&posix_cpu_thread_notifier, CPU_ONLINE, hcpu);
-	register_cpu_notifier(&posix_cpu_thread_notifier);
+	cpuhp_setup_state_nocalls(CPUHP_POSIXCPUTMR_DEAD, "posixcputmr:dead", NULL, NULL);
+	BUG_ON(smpboot_register_percpu_thread(t));
+
 	return 0;
 }
 early_initcall(posix_cpu_thread_init);


* [patch-rt] softirq: Move ksoftirqd_running() under !CONFIG_PREEMPT_RT_FULL
  2016-12-23 16:32 [ANNOUNCE] v4.9-rt1 Sebastian Andrzej Siewior
  2016-12-26  6:54 ` [patch-rt] kvm: Convert pvclock_gtod_sync_lock to raw_spinlock_t Mike Galbraith
  2016-12-26  7:00 ` [rfc patch-rt] posix_cpu_timers: Kill hotplug cpu notifier Mike Galbraith
@ 2016-12-31  8:20 ` Mike Galbraith
  2017-01-20 17:21   ` Sebastian Andrzej Siewior
  2017-01-06  8:13 ` [rfc patch-rt] radix-tree: Partially disable memcg accounting in radix_tree_node_alloc() Mike Galbraith
  2017-01-08  8:32 ` [patch-rt] cpuset: Convert callback_lock to raw_spinlock_t Mike Galbraith
  4 siblings, 1 reply; 21+ messages in thread
From: Mike Galbraith @ 2016-12-31  8:20 UTC (permalink / raw)
  To: Sebastian Andrzej Siewior, Thomas Gleixner
  Cc: LKML, linux-rt-users, Steven Rostedt


Otherwise, ktimersoftd may not be awakened when it has work to do.

[   84.087571] NOHZ: local_softirq_pending 02
[   84.087593] NOHZ: local_softirq_pending 02
[   84.087598] NOHZ: local_softirq_pending 02
[   84.087904] NOHZ: local_softirq_pending 02
[   84.088526] NOHZ: local_softirq_pending 02
[   84.088899] NOHZ: local_softirq_pending 02
[   84.089463] NOHZ: local_softirq_pending 02
[  115.013470] NOHZ: local_softirq_pending 02
[  115.013601] NOHZ: local_softirq_pending 02
[  115.013709] NOHZ: local_softirq_pending 02

Signed-off-by: Mike Galbraith <efault@gmx.de>
---
 kernel/softirq.c |   10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -206,6 +206,7 @@ static void handle_softirq(unsigned int
 	}
 }
 
+#ifndef CONFIG_PREEMPT_RT_FULL
 /*
  * If ksoftirqd is scheduled, we do not want to process pending softirqs
  * right now. Let ksoftirqd handle this at its own rate, to get fairness.
@@ -217,7 +218,6 @@ static bool ksoftirqd_running(void)
 	return tsk && (tsk->state == TASK_RUNNING);
 }
 
-#ifndef CONFIG_PREEMPT_RT_FULL
 static inline int ksoftirqd_softirq_pending(void)
 {
 	return local_softirq_pending();
@@ -794,13 +794,10 @@ void irq_enter(void)
 
 static inline void invoke_softirq(void)
 {
-#ifdef CONFIG_PREEMPT_RT_FULL
-	unsigned long flags;
-#endif
-
+#ifndef CONFIG_PREEMPT_RT_FULL
 	if (ksoftirqd_running())
 		return;
-#ifndef CONFIG_PREEMPT_RT_FULL
+
 	if (!force_irqthreads) {
 #ifdef CONFIG_HAVE_IRQ_EXIT_ON_IRQ_STACK
 		/*
@@ -821,6 +818,7 @@ static inline void invoke_softirq(void)
 		wakeup_softirqd();
 	}
 #else /* PREEMPT_RT_FULL */
+	unsigned long flags;
 
 	local_irq_save(flags);
 	if (__this_cpu_read(ksoftirqd) &&


* [rfc patch-rt] radix-tree: Partially disable memcg accounting in radix_tree_node_alloc()
  2016-12-23 16:32 [ANNOUNCE] v4.9-rt1 Sebastian Andrzej Siewior
                   ` (2 preceding siblings ...)
  2016-12-31  8:20 ` [patch-rt] softirq: Move ksoftirqd_running() under !CONFIG_PREEMPT_RT_FULL Mike Galbraith
@ 2017-01-06  8:13 ` Mike Galbraith
  2017-01-06  8:28   ` Mike Galbraith
                     ` (2 more replies)
  2017-01-08  8:32 ` [patch-rt] cpuset: Convert callback_lock to raw_spinlock_t Mike Galbraith
  4 siblings, 3 replies; 21+ messages in thread
From: Mike Galbraith @ 2017-01-06  8:13 UTC (permalink / raw)
  To: Sebastian Andrzej Siewior, Thomas Gleixner
  Cc: LKML, linux-rt-users, Steven Rostedt

radix-tree: Partially disable memcg accounting in radix_tree_node_alloc()

Having no preload, which turns accounting off for non-rt kernels, trying to
allocate from shmem_fault() when the memcg is full sends us scurrying off
to pagefault_out_of_memory(), with dramatic (usually terminal) consequences.
LTP's madvise06 testcase triggers this quite well, and per gitk, the commit
below was the beginning of RT memcg woes.

58e698af4c63 radix-tree: account radix_tree_node to memory cgroup

Turn memcg accounting off for RT in the problematic path.

Signed-off-by: Mike Galbraith <efault@gmx.de>
Cc: stable-rt@vger.kernel.org # +v4.6-rt
---
 lib/radix-tree.c |    7 +++++++
 1 file changed, 7 insertions(+)

--- a/lib/radix-tree.c
+++ b/lib/radix-tree.c
@@ -303,6 +303,13 @@ radix_tree_node_alloc(struct radix_tree_
 	if (!gfpflags_allow_blocking(gfp_mask) && !in_interrupt()) {
 		struct radix_tree_preload *rtp;
 
+#if defined(CONFIG_PREEMPT_RT_FULL) && defined(CONFIG_MEMCG)
+		/*
+		 * Arriving here from shmem_fault() and meeting a full memcg
+		 * will send us to pagefault_out_of_memory(), and a dead box.
+		 */
+		gfp_mask &= ~__GFP_ACCOUNT;
+#endif
 		/*
 		 * Even if the caller has preloaded, try to allocate from the
 		 * cache first for the new node to get accounted to the memory


* Re: [rfc patch-rt] radix-tree: Partially disable memcg accounting in radix_tree_node_alloc()
  2017-01-06  8:13 ` [rfc patch-rt] radix-tree: Partially disable memcg accounting in radix_tree_node_alloc() Mike Galbraith
@ 2017-01-06  8:28   ` Mike Galbraith
  2017-01-06  8:55   ` Michal Hocko
  2017-01-25 15:06   ` Sebastian Andrzej Siewior
  2 siblings, 0 replies; 21+ messages in thread
From: Mike Galbraith @ 2017-01-06  8:28 UTC (permalink / raw)
  To: Sebastian Andrzej Siewior, Thomas Gleixner
  Cc: LKML, linux-rt-users, Steven Rostedt

Trace of the bad thing about to happen.

       madvise06-4719  [003] .......  1187.428766: handle_mm_fault <-__do_page_fault
       madvise06-4719  [003] .......  1187.428766: __rcu_read_lock <-handle_mm_fault
       madvise06-4719  [003] .......  1187.428766: mem_cgroup_from_task <-handle_mm_fault
       madvise06-4719  [003] .......  1187.428766: __rcu_read_unlock <-handle_mm_fault
       madvise06-4719  [003] .......  1187.428766: __do_fault <-handle_mm_fault
       madvise06-4719  [003] .......  1187.428766: shmem_fault <-__do_fault
       madvise06-4719  [003] .......  1187.428766: shmem_getpage_gfp <-shmem_fault
       madvise06-4719  [003] .......  1187.428766: find_lock_entry <-shmem_getpage_gfp
       madvise06-4719  [003] .......  1187.428766: find_get_entry <-find_lock_entry
       madvise06-4719  [003] .......  1187.428766: __rcu_read_lock <-find_get_entry
       madvise06-4719  [003] .......  1187.428766: __rcu_read_unlock <-find_get_entry
       madvise06-4719  [003] .......  1187.428766: shmem_alloc_page <-shmem_getpage_gfp
       madvise06-4719  [003] .......  1187.428766: mpol_shared_policy_lookup <-shmem_alloc_page
       madvise06-4719  [003] .......  1187.428766: alloc_pages_vma <-shmem_alloc_page
       madvise06-4719  [003] .......  1187.428766: get_vma_policy <-alloc_pages_vma
       madvise06-4719  [003] .......  1187.428766: __get_vma_policy <-get_vma_policy
       madvise06-4719  [003] .......  1187.428766: get_task_policy.part.40 <-alloc_pages_vma
       madvise06-4719  [003] .......  1187.428766: policy_nodemask <-alloc_pages_vma
       madvise06-4719  [003] .......  1187.428767: policy_zonelist <-alloc_pages_vma
       madvise06-4719  [003] .......  1187.428767: __alloc_pages_nodemask <-alloc_pages_vma
       madvise06-4719  [003] .......  1187.428767: get_page_from_freelist <-__alloc_pages_nodemask
       madvise06-4719  [003] .......  1187.428767: migrate_disable <-get_page_from_freelist
       madvise06-4719  [003] ....11.  1187.428767: pin_current_cpu <-migrate_disable
       madvise06-4719  [003] .....11  1187.428767: rt_spin_lock__no_mg <-get_page_from_freelist
       madvise06-4719  [003] .....11  1187.428767: __inc_zone_state <-get_page_from_freelist
       madvise06-4719  [003] .....11  1187.428767: __inc_zone_state <-get_page_from_freelist
       madvise06-4719  [003] .....11  1187.428767: rt_spin_unlock__no_mg <-get_page_from_freelist
       madvise06-4719  [003] .....11  1187.428767: migrate_enable <-get_page_from_freelist
       madvise06-4719  [003] ....11.  1187.428767: unpin_current_cpu <-migrate_enable
       madvise06-4719  [003] .......  1187.428767: mem_cgroup_try_charge <-shmem_getpage_gfp
       madvise06-4719  [003] .......  1187.428767: get_mem_cgroup_from_mm <-mem_cgroup_try_charge
       madvise06-4719  [003] .......  1187.428767: __rcu_read_lock <-get_mem_cgroup_from_mm
       madvise06-4719  [003] .......  1187.428767: __rcu_read_unlock <-get_mem_cgroup_from_mm
       madvise06-4719  [003] .......  1187.428767: try_charge <-mem_cgroup_try_charge
       madvise06-4719  [003] .......  1187.428767: migrate_disable <-try_charge
       madvise06-4719  [003] ....11.  1187.428768: pin_current_cpu <-migrate_disable
       madvise06-4719  [003] .....11  1187.428768: rt_spin_lock__no_mg <-try_charge
       madvise06-4719  [003] .....11  1187.428768: rt_spin_unlock__no_mg <-try_charge
       madvise06-4719  [003] .....11  1187.428768: migrate_enable <-try_charge
       madvise06-4719  [003] ....11.  1187.428768: unpin_current_cpu <-migrate_enable
       madvise06-4719  [003] .......  1187.428768: page_counter_try_charge <-try_charge
       madvise06-4719  [003] .......  1187.428768: migrate_disable <-try_charge
       madvise06-4719  [003] ....11.  1187.428768: pin_current_cpu <-migrate_disable
       madvise06-4719  [003] .....11  1187.428768: rt_spin_lock__no_mg <-try_charge
       madvise06-4719  [003] .....11  1187.428768: rt_spin_unlock__no_mg <-try_charge
       madvise06-4719  [003] .....11  1187.428768: migrate_enable <-try_charge
       madvise06-4719  [003] ....11.  1187.428768: unpin_current_cpu <-migrate_enable
       madvise06-4719  [003] .......  1187.428768: page_counter_try_charge <-try_charge
       madvise06-4719  [003] .......  1187.428768: shmem_add_to_page_cache <-shmem_getpage_gfp
       madvise06-4719  [003] .......  1187.428768: rt_spin_lock <-shmem_add_to_page_cache
       madvise06-4719  [003] .......  1187.428768: migrate_disable <-rt_spin_lock
       madvise06-4719  [003] ....11.  1187.428769: pin_current_cpu <-migrate_disable
       madvise06-4719  [003] .....11  1187.428769: kmem_cache_alloc <-radix_tree_node_alloc.constprop.25
       madvise06-4719  [003] .....11  1187.428769: memcg_kmem_get_cache <-kmem_cache_alloc
       madvise06-4719  [003] .....11  1187.428769: get_mem_cgroup_from_mm <-memcg_kmem_get_cache
       madvise06-4719  [003] .....11  1187.428769: __rcu_read_lock <-get_mem_cgroup_from_mm
       madvise06-4719  [003] .....11  1187.428769: __rcu_read_unlock <-get_mem_cgroup_from_mm
       madvise06-4719  [003] .....11  1187.428769: __rcu_read_lock <-memcg_kmem_get_cache
       madvise06-4719  [003] .....11  1187.428769: __rcu_read_unlock <-memcg_kmem_get_cache
       madvise06-4719  [003] .....11  1187.428769: __slab_alloc.isra.76 <-kmem_cache_alloc
       madvise06-4719  [003] d....11  1187.428769: ___slab_alloc <-__slab_alloc.isra.76
       madvise06-4719  [003] d....11  1187.428769: get_partial_node.isra.65 <-___slab_alloc
       madvise06-4719  [003] d....11  1187.428770: mempolicy_slab_node <-___slab_alloc
       madvise06-4719  [003] d....11  1187.428770: new_slab <-___slab_alloc
       madvise06-4719  [003] .....11  1187.428770: alloc_pages_current <-new_slab
       madvise06-4719  [003] .....11  1187.428770: get_task_policy.part.40 <-alloc_pages_current
       madvise06-4719  [003] .....11  1187.428770: policy_nodemask <-alloc_pages_current
       madvise06-4719  [003] .....11  1187.428770: policy_zonelist <-alloc_pages_current
       madvise06-4719  [003] .....11  1187.428770: __alloc_pages_nodemask <-alloc_pages_current
       madvise06-4719  [003] .....11  1187.428770: get_page_from_freelist <-__alloc_pages_nodemask
       madvise06-4719  [003] .....11  1187.428770: __zone_watermark_ok <-get_page_from_freelist
       madvise06-4719  [003] .....11  1187.428770: migrate_disable <-get_page_from_freelist
       madvise06-4719  [003] .....12  1187.428770: rt_spin_lock__no_mg <-get_page_from_freelist
       madvise06-4719  [003] .....12  1187.428770: rt_spin_lock <-get_page_from_freelist
       madvise06-4719  [003] .....12  1187.428770: migrate_disable <-rt_spin_lock
       madvise06-4719  [003] .....13  1187.428771: __rmqueue <-get_page_from_freelist
       madvise06-4719  [003] .....13  1187.428771: __mod_zone_page_state <-get_page_from_freelist
       madvise06-4719  [003] .....13  1187.428771: rt_spin_unlock <-get_page_from_freelist
       madvise06-4719  [003] .....13  1187.428771: migrate_enable <-get_page_from_freelist
       madvise06-4719  [003] .....12  1187.428771: __inc_zone_state <-get_page_from_freelist
       madvise06-4719  [003] .....12  1187.428771: __inc_zone_state <-get_page_from_freelist
       madvise06-4719  [003] .....12  1187.428771: rt_spin_unlock__no_mg <-get_page_from_freelist
       madvise06-4719  [003] .....12  1187.428771: migrate_enable <-get_page_from_freelist
       madvise06-4719  [003] .....11  1187.428771: memcg_kmem_charge_memcg <-new_slab
       madvise06-4719  [003] .....11  1187.428771: try_charge <-memcg_kmem_charge_memcg
       madvise06-4719  [003] .....11  1187.428771: migrate_disable <-try_charge
       madvise06-4719  [003] .....12  1187.428771: rt_spin_lock__no_mg <-try_charge
       madvise06-4719  [003] .....12  1187.428771: rt_spin_unlock__no_mg <-try_charge
       madvise06-4719  [003] .....12  1187.428772: migrate_enable <-try_charge
       madvise06-4719  [003] .....11  1187.428772: page_counter_try_charge <-try_charge
       madvise06-4719  [003] .....11  1187.428772: migrate_disable <-try_charge
       madvise06-4719  [003] .....12  1187.428772: rt_spin_lock__no_mg <-try_charge
       madvise06-4719  [003] .....12  1187.428772: rt_spin_unlock__no_mg <-try_charge
       madvise06-4719  [003] .....12  1187.428772: migrate_enable <-try_charge
       madvise06-4719  [003] .....11  1187.428772: page_counter_try_charge <-try_charge
       madvise06-4719  [003] .....11  1187.428772: __free_pages <-new_slab
       madvise06-4719  [003] .....11  1187.428772: __free_pages_ok <-new_slab
       madvise06-4719  [003] .....11  1187.428772: migrate_disable <-__free_pages_ok
       madvise06-4719  [003] .....12  1187.428772: rt_spin_lock__no_mg <-__free_pages_ok
       madvise06-4719  [003] .....12  1187.428772: free_one_page <-__free_pages_ok
       madvise06-4719  [003] .....12  1187.428772: rt_spin_lock <-free_one_page
       madvise06-4719  [003] .....12  1187.428773: migrate_disable <-rt_spin_lock
       madvise06-4719  [003] .....13  1187.428773: node_page_state <-free_one_page
       madvise06-4719  [003] .....13  1187.428773: __mod_zone_page_state <-free_one_page
       madvise06-4719  [003] .....13  1187.428773: rt_spin_unlock <-__free_pages_ok
       madvise06-4719  [003] .....13  1187.428773: migrate_enable <-__free_pages_ok
       madvise06-4719  [003] .....12  1187.428773: rt_spin_unlock__no_mg <-__free_pages_ok
       madvise06-4719  [003] .....12  1187.428773: migrate_enable <-new_slab
       madvise06-4719  [003] .....11  1187.428773: alloc_pages_current <-new_slab
       madvise06-4719  [003] .....11  1187.428773: get_task_policy.part.40 <-alloc_pages_current
       madvise06-4719  [003] .....11  1187.428773: policy_nodemask <-alloc_pages_current
       madvise06-4719  [003] .....11  1187.428773: policy_zonelist <-alloc_pages_current
       madvise06-4719  [003] .....11  1187.428773: __alloc_pages_nodemask <-alloc_pages_current
       madvise06-4719  [003] .....11  1187.428773: get_page_from_freelist <-__alloc_pages_nodemask
       madvise06-4719  [003] .....11  1187.428773: migrate_disable <-get_page_from_freelist
       madvise06-4719  [003] .....12  1187.428774: rt_spin_lock__no_mg <-get_page_from_freelist
       madvise06-4719  [003] .....12  1187.428774: __inc_zone_state <-get_page_from_freelist
       madvise06-4719  [003] .....12  1187.428774: __inc_zone_state <-get_page_from_freelist
       madvise06-4719  [003] .....12  1187.428774: rt_spin_unlock__no_mg <-get_page_from_freelist
       madvise06-4719  [003] .....12  1187.428774: migrate_enable <-get_page_from_freelist
       madvise06-4719  [003] .....11  1187.428774: memcg_kmem_charge_memcg <-new_slab
       madvise06-4719  [003] .....11  1187.428774: try_charge <-memcg_kmem_charge_memcg
       madvise06-4719  [003] .....11  1187.428774: migrate_disable <-try_charge
       madvise06-4719  [003] .....12  1187.428774: rt_spin_lock__no_mg <-try_charge
       madvise06-4719  [003] .....12  1187.428774: rt_spin_unlock__no_mg <-try_charge
       madvise06-4719  [003] .....12  1187.428774: migrate_enable <-try_charge
       madvise06-4719  [003] .....11  1187.428774: page_counter_try_charge <-try_charge
       madvise06-4719  [003] .....11  1187.428774: migrate_disable <-try_charge
       madvise06-4719  [003] .....12  1187.428774: rt_spin_lock__no_mg <-try_charge
       madvise06-4719  [003] .....12  1187.428774: rt_spin_unlock__no_mg <-try_charge
       madvise06-4719  [003] .....12  1187.428775: migrate_enable <-try_charge
       madvise06-4719  [003] .....11  1187.428775: page_counter_try_charge <-try_charge
       madvise06-4719  [003] .....11  1187.428775: __free_pages <-new_slab
       madvise06-4719  [003] .....11  1187.428775: free_hot_cold_page <-new_slab
       madvise06-4719  [003] .....11  1187.428775: migrate_disable <-free_hot_cold_page
       madvise06-4719  [003] .....12  1187.428775: rt_spin_lock__no_mg <-free_hot_cold_page
       madvise06-4719  [003] .....12  1187.428775: rt_spin_unlock__no_mg <-free_hot_cold_page
       madvise06-4719  [003] .....12  1187.428775: migrate_enable <-free_hot_cold_page
       madvise06-4719  [003] d....11  1187.428775: slab_out_of_memory <-___slab_alloc
       madvise06-4719  [003] .....11  1187.428775: free_delayed <-__slab_alloc.isra.76
       madvise06-4719  [003] .....11  1187.428775: memcg_kmem_put_cache <-kmem_cache_alloc
       madvise06-4719  [003] .....11  1187.428775: rt_spin_unlock <-shmem_add_to_page_cache
       madvise06-4719  [003] .....11  1187.428776: migrate_enable <-shmem_add_to_page_cache
       madvise06-4719  [003] ....11.  1187.428776: unpin_current_cpu <-migrate_enable
       madvise06-4719  [003] .......  1187.428776: mem_cgroup_cancel_charge <-shmem_getpage_gfp
       madvise06-4719  [003] .......  1187.428776: cancel_charge <-shmem_getpage_gfp
       madvise06-4719  [003] .......  1187.428776: page_counter_uncharge <-cancel_charge
       madvise06-4719  [003] .......  1187.428776: page_counter_cancel <-page_counter_uncharge
       madvise06-4719  [003] .......  1187.428776: page_counter_cancel <-page_counter_uncharge
       madvise06-4719  [003] .......  1187.428776: unlock_page <-shmem_getpage_gfp
       madvise06-4719  [003] .......  1187.428776: __put_page <-shmem_getpage_gfp
       madvise06-4719  [003] .......  1187.428776: __page_cache_release <-__put_page
       madvise06-4719  [003] .......  1187.428777: mem_cgroup_uncharge <-__put_page
       madvise06-4719  [003] .......  1187.428777: free_hot_cold_page <-shmem_getpage_gfp
       madvise06-4719  [003] .......  1187.428777: migrate_disable <-free_hot_cold_page
       madvise06-4719  [003] ....11.  1187.428777: pin_current_cpu <-migrate_disable
       madvise06-4719  [003] .....11  1187.428777: rt_spin_lock__no_mg <-free_hot_cold_page
       madvise06-4719  [003] .....11  1187.428777: rt_spin_unlock__no_mg <-free_hot_cold_page
       madvise06-4719  [003] .....11  1187.428777: migrate_enable <-free_hot_cold_page
       madvise06-4719  [003] ....11.  1187.428777: unpin_current_cpu <-migrate_enable
       madvise06-4719  [003] .......  1187.428777: rt_up_read <-__do_page_fault
       madvise06-4719  [003] .......  1187.428777: rt_mutex_unlock <-__do_page_fault
       madvise06-4719  [003] .......  1187.428778: mm_fault_error <-do_page_fault
       madvise06-4719  [003] .......  1187.428778: pagefault_out_of_memory <-do_page_fault


* Re: [rfc patch-rt] radix-tree: Partially disable memcg accounting in radix_tree_node_alloc()
  2017-01-06  8:13 ` [rfc patch-rt] radix-tree: Partially disable memcg accounting in radix_tree_node_alloc() Mike Galbraith
  2017-01-06  8:28   ` Mike Galbraith
@ 2017-01-06  8:55   ` Michal Hocko
  2017-01-06 10:52     ` Mike Galbraith
  2017-01-25 15:06   ` Sebastian Andrzej Siewior
  2 siblings, 1 reply; 21+ messages in thread
From: Michal Hocko @ 2017-01-06  8:55 UTC (permalink / raw)
  To: Mike Galbraith
  Cc: Sebastian Andrzej Siewior, Thomas Gleixner, LKML, linux-rt-users,
	Steven Rostedt

On Fri 06-01-17 09:13:23, Mike Galbraith wrote:
> radix-tree: Partially disable memcg accounting in radix_tree_node_alloc()
> 
> Having no preload, which turns accounting off for non-rt kernels, trying to
> allocate coming from shmem_fault() when memcg is full sends us scurrying off
> to pagefault_out_of_memory(), with dramatic (usually terminal) consequences.
> LTP's madvise06 testcase triggers this quite well, and per gitk, the below
> was the beginning of RT memcg woes.
> 
> 58e698af4c63 radix-tree: account radix_tree_node to memory cgroup
> 
> Turn memcg accounting off for RT in the problematic path.

I am really wondering why this is RT specific and non-RT kernels
don't have any problem.
 
> Signed-off-by: Mike Galbraith <efault@gmx.de>
> Cc: stable-rt@vger.kernel.org # +v4.6-rt
> ---
>  lib/radix-tree.c |    7 +++++++
>  1 file changed, 7 insertions(+)
> 
> --- a/lib/radix-tree.c
> +++ b/lib/radix-tree.c
> @@ -303,6 +303,13 @@ radix_tree_node_alloc(struct radix_tree_
>  	if (!gfpflags_allow_blocking(gfp_mask) && !in_interrupt()) {
>  		struct radix_tree_preload *rtp;
>  
> +#if defined(CONFIG_PREEMPT_RT_FULL) && defined(CONFIG_MEMCG)
> +		/*
> +		 * Arriving here from shmem_fault() and meeting a full memcg
> +		 * will send us to pagefault_out_of_memory(), and a dead box.
> +		 */
> +		gfp_mask &= ~__GFP_ACCOUNT;
> +#endif
>  		/*
>  		 * Even if the caller has preloaded, try to allocate from the
>  		 * cache first for the new node to get accounted to the memory

-- 
Michal Hocko
SUSE Labs


* Re: [rfc patch-rt] radix-tree: Partially disable memcg accounting in radix_tree_node_alloc()
  2017-01-06  8:55   ` Michal Hocko
@ 2017-01-06 10:52     ` Mike Galbraith
  2017-01-06 12:20       ` Mike Galbraith
  0 siblings, 1 reply; 21+ messages in thread
From: Mike Galbraith @ 2017-01-06 10:52 UTC (permalink / raw)
  To: Michal Hocko
  Cc: Sebastian Andrzej Siewior, Thomas Gleixner, LKML, linux-rt-users,
	Steven Rostedt

On Fri, 2017-01-06 at 09:55 +0100, Michal Hocko wrote:
> On Fri 06-01-17 09:13:23, Mike Galbraith wrote:
> > radix-tree: Partially disable memcg accounting in radix_tree_node_alloc()
> > 
> > Having no preload, which turns accounting off for non-rt kernels, trying to
> > allocate coming from shmem_fault() when memcg is full sends us scurrying off
> > to pagefault_out_of_memory(), with dramatic (usually terminal) consequences.
> > LTP's madvise06 testcase triggers this quite well, and per gitk, the below
> > was the beginning of RT memcg woes.
> > 
> > 58e698af4c63 radix-tree: account radix_tree_node to memory cgroup
> > 
> > Turn memcg accounting off for RT in the problematic path.
> 
> I am really wondering why this is RT specific and the non RT kernels
> doesn't have any problem.

For all I know, there may be a scenario for non-RT to explode, but the
madvise06 testcase that thoroughly nails RT ain't it.

	-Mike


* Re: [rfc patch-rt] radix-tree: Partially disable memcg accounting in radix_tree_node_alloc()
  2017-01-06 10:52     ` Mike Galbraith
@ 2017-01-06 12:20       ` Mike Galbraith
  2017-01-06 12:44         ` Mike Galbraith
  0 siblings, 1 reply; 21+ messages in thread
From: Mike Galbraith @ 2017-01-06 12:20 UTC (permalink / raw)
  To: Michal Hocko
  Cc: Sebastian Andrzej Siewior, Thomas Gleixner, LKML, linux-rt-users,
	Steven Rostedt

On Fri, 2017-01-06 at 11:52 +0100, Mike Galbraith wrote:
> On Fri, 2017-01-06 at 09:55 +0100, Michal Hocko wrote:
> > On Fri 06-01-17 09:13:23, Mike Galbraith wrote:
> > > radix-tree: Partially disable memcg accounting in radix_tree_node_alloc()
> > > 
> > > Having no preload, which turns accounting off for non-rt kernels, trying to
> > > allocate coming from shmem_fault() when memcg is full sends us scurrying off
> > > to pagefault_out_of_memory(), with dramatic (usually terminal) consequences.
> > > LTP's madvise06 testcase triggers this quite well, and per gitk, the below
> > > was the beginning of RT memcg woes.
> > > 
> > > 58e698af4c63 radix-tree: account radix_tree_node to memory cgroup
> > > 
> > > Turn memcg accounting off for RT in the problematic path.
> > 
> > I am really wondering why this is RT specific and the non RT kernels
> > doesn't have any problem.
> 
> For all I know, there may be a scenario for non-RT to explode, but the
> madvise06 testcase that thoroughly nails RT ain't it.

Unless you twiddle/apply the RT tree radix-tree patch.  So (as rashly
presumed), memcg woes are RT specific because RT disabled the preload
business.  madvise06 isn't as deadly to the twiddled PREEMPT kernel as
it is to PREEMPT_RT_FULL, but a very few runs attracted the oom beast.

('course there still may be a non-RT danger path lurking.. dunno)

[   81.376673] madvise06 invoked oom-killer: gfp_mask=0x0(), nodemask=0, order=0, oom_score_adj=-1000
[   81.376676] madvise06 cpuset=/ mems_allowed=0
[   81.376680] CPU: 5 PID: 4018 Comm: madvise06 Tainted: G            E   4.10.0-preempt #31
[   81.376681] Hardware name: MEDION MS-7848/MS-7848, BIOS M7848W08.20C 09/23/2013
[   81.376682] Call Trace:
[   81.376687]  ? dump_stack+0x5c/0x7e
[   81.376690]  ? dump_header+0x7f/0x241
[   81.376692]  ? __do_fault+0x1d/0x70
[   81.376693]  ? handle_mm_fault+0x3f5/0xfe0
[   81.376696]  ? oom_kill_process+0x225/0x3f0
[   81.376697]  ? oom_badness+0x70/0x180
[   81.376699]  ? out_of_memory+0x103/0x4a0
[   81.376700]  ? pagefault_out_of_memory+0x43/0x60
[   81.376703]  ? do_page_fault+0x2b/0x70
[   81.376705]  ? page_fault+0x28/0x30

From: Thomas Gleixner <tglx@linutronix.de>
Date: Sun, 17 Jul 2011 21:33:18 +0200
Subject: radix-tree: Make RT aware

Disable radix_tree_preload() on -RT. This function returns with
preemption disabled, which may cause high latencies and breaks if the
user tries to grab any locks after invoking it.
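
For reference, the usual preload pattern looks roughly like this (tree, lock
and function names are made up); the preempt-disabled window between preload
and preload_end is exactly what -RT cannot tolerate once spinlock_t may sleep:

#include <linux/radix-tree.h>
#include <linux/spinlock.h>

static RADIX_TREE(example_tree, GFP_KERNEL);	/* hypothetical tree */
static DEFINE_SPINLOCK(example_lock);		/* hypothetical lock */

static int example_insert(unsigned long index, void *item)
{
	/* May sleep to fill the per-CPU node pool, returns preempt-disabled. */
	int err = radix_tree_preload(GFP_KERNEL);

	if (err)
		return err;
	spin_lock(&example_lock);
	err = radix_tree_insert(&example_tree, index, item);
	spin_unlock(&example_lock);
	radix_tree_preload_end();	/* re-enables preemption */
	return err;
}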

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/radix-tree.h |   18 +++++++++++++++++-
 lib/radix-tree.c           |    5 ++++-
 2 files changed, 21 insertions(+), 2 deletions(-)

--- a/include/linux/radix-tree.h
+++ b/include/linux/radix-tree.h
@@ -318,9 +318,24 @@ unsigned int radix_tree_gang_lookup(stru
 unsigned int radix_tree_gang_lookup_slot(struct radix_tree_root *root,
 			void ***results, unsigned long *indices,
 			unsigned long first_index, unsigned int max_items);
+#ifdef CONFIG_PREEMPT
+static inline int radix_tree_preload(gfp_t gm) { return 0; }
+static inline int radix_tree_maybe_preload(gfp_t gfp_mask) { return 0; }
+static inline int radix_tree_maybe_preload_order(gfp_t gfp_mask, int order)
+{
+	return 0;
+}
+
+static inline int radix_tree_split_preload(unsigned old_order, unsigned new_order, gfp_t gfp_mask)
+{
+	return 0;
+}
+#else
 int radix_tree_preload(gfp_t gfp_mask);
 int radix_tree_maybe_preload(gfp_t gfp_mask);
 int radix_tree_maybe_preload_order(gfp_t gfp_mask, int order);
+int radix_tree_split_preload(unsigned old_order, unsigned new_order, gfp_t gfp_mask);
+#endif
 void radix_tree_init(void);
 void *radix_tree_tag_set(struct radix_tree_root *root,
 			unsigned long index, unsigned int tag);
@@ -342,10 +357,11 @@ int radix_tree_tagged(struct radix_tree_
 
 static inline void radix_tree_preload_end(void)
 {
+#ifndef CONFIG_PREEMPT
 	preempt_enable();
+#endif
 }
 
-int radix_tree_split_preload(unsigned old_order, unsigned new_order, gfp_t);
 int radix_tree_split(struct radix_tree_root *, unsigned long index,
 			unsigned new_order);
 int radix_tree_join(struct radix_tree_root *, unsigned long index,
--- a/lib/radix-tree.c
+++ b/lib/radix-tree.c
@@ -318,13 +318,14 @@ radix_tree_node_alloc(struct radix_tree_
 		 * succeed in getting a node here (and never reach
 		 * kmem_cache_alloc)
 		 */
-		rtp = this_cpu_ptr(&radix_tree_preloads);
+		rtp = &get_cpu_var(radix_tree_preloads);
 		if (rtp->nr) {
 			ret = rtp->nodes;
 			rtp->nodes = ret->private_data;
 			ret->private_data = NULL;
 			rtp->nr--;
 		}
+		put_cpu_var(radix_tree_preloads);
 		/*
 		 * Update the allocation stack trace as this is more useful
 		 * for debugging.
@@ -368,6 +369,7 @@ radix_tree_node_free(struct radix_tree_n
 	call_rcu(&node->rcu_head, radix_tree_node_rcu_free);
 }
 
+#ifndef CONFIG_PREEMPT
 /*
  * Load up this CPU's radix_tree_node buffer with sufficient objects to
  * ensure that the addition of a single element in the tree cannot fail.  On
@@ -509,6 +511,7 @@ int radix_tree_maybe_preload_order(gfp_t
 
 	return __radix_tree_preload(gfp_mask, nr_nodes);
 }
+#endif
 
 static unsigned radix_tree_load_root(struct radix_tree_root *root,
 		struct radix_tree_node **nodep, unsigned long *maxindex)


* Re: [rfc patch-rt] radix-tree: Partially disable memcg accounting in radix_tree_node_alloc()
  2017-01-06 12:20       ` Mike Galbraith
@ 2017-01-06 12:44         ` Mike Galbraith
  0 siblings, 0 replies; 21+ messages in thread
From: Mike Galbraith @ 2017-01-06 12:44 UTC (permalink / raw)
  To: Michal Hocko
  Cc: Sebastian Andrzej Siewior, Thomas Gleixner, LKML, linux-rt-users,
	Steven Rostedt

On Fri, 2017-01-06 at 13:20 +0100, Mike Galbraith wrote:

> madvise06 isn't as deadly to the twiddled PREEMPT kernel as
> it is to PREEMPT_RT_FULL, but a very few runs attracted the oom beast.

The very next run panicked the box... deadly enough for gvt. work :)


* [patch-rt] cpuset: Convert callback_lock to raw_spinlock_t
  2016-12-23 16:32 [ANNOUNCE] v4.9-rt1 Sebastian Andrzej Siewior
                   ` (3 preceding siblings ...)
  2017-01-06  8:13 ` [rfc patch-rt] radix-tree: Partially disable memcg accounting in radix_tree_node_alloc() Mike Galbraith
@ 2017-01-08  8:32 ` Mike Galbraith
  2017-01-25 15:45   ` Sebastian Andrzej Siewior
  4 siblings, 1 reply; 21+ messages in thread
From: Mike Galbraith @ 2017-01-08  8:32 UTC (permalink / raw)
  To: Sebastian Andrzej Siewior, Thomas Gleixner
  Cc: LKML, linux-rt-users, Steven Rostedt


The two commits below add up to a cpuset might_sleep() splat for RT:

8447a0fee974 cpuset: convert callback_mutex to a spinlock
344736f29b35 cpuset: simplify cpuset_node_allowed API

BUG: sleeping function called from invalid context at kernel/locking/rtmutex.c:995
in_atomic(): 0, irqs_disabled(): 1, pid: 11718, name: cset
CPU: 135 PID: 11718 Comm: cset Tainted: G            E   4.10.0-rt1-rt #4
Hardware name: Intel Corporation BRICKLAND/BRICKLAND, BIOS BRHSXSD1.86B.0056.R01.1409242327 09/24/2014
Call Trace:
 ? dump_stack+0x5c/0x81
 ? ___might_sleep+0xf4/0x170
 ? rt_spin_lock+0x1c/0x50
 ? __cpuset_node_allowed+0x66/0xc0
 ? ___slab_alloc+0x390/0x570
 ? anon_vma_fork+0x8f/0x140
 ? copy_page_range+0x6cf/0xb00
 ? anon_vma_fork+0x8f/0x140
 ? __slab_alloc.isra.74+0x5a/0x81
 ? anon_vma_fork+0x8f/0x140
 ? kmem_cache_alloc+0x1b5/0x1f0
 ? anon_vma_fork+0x8f/0x140
 ? copy_process.part.35+0x1670/0x1ee0
 ? _do_fork+0xdd/0x3f0
 ? _do_fork+0xdd/0x3f0
 ? do_syscall_64+0x61/0x170
 ? entry_SYSCALL64_slow_path+0x25/0x25

The latter ensured that a NUMA box WILL take callback_lock in atomic
context by removing the allocator and reclaim path __GFP_HARDWALL
usage which had previously prevented such contexts from taking callback_mutex.

One option would be to reinstate the __GFP_HARDWALL protections for
RT; however, as the 8447a0fee974 changelog states:

The callback_mutex is only used to synchronize reads/updates of cpusets'
flags and cpu/node masks. These operations should always proceed fast so
there's no reason why we can't use a spinlock instead of the mutex.

Signed-off-by: Mike Galbraith <efault@gmx.de>
Cc: stable-rt@vger.kernel.org
---
 kernel/cpuset.c |   66 ++++++++++++++++++++++++++++----------------------------
 1 file changed, 33 insertions(+), 33 deletions(-)

--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -284,7 +284,7 @@ static struct cpuset top_cpuset = {
  */
 
 static DEFINE_MUTEX(cpuset_mutex);
-static DEFINE_SPINLOCK(callback_lock);
+static DEFINE_RAW_SPINLOCK(callback_lock);
 
 static struct workqueue_struct *cpuset_migrate_mm_wq;
 
@@ -907,9 +907,9 @@ static void update_cpumasks_hier(struct
 			continue;
 		rcu_read_unlock();
 
-		spin_lock_irq(&callback_lock);
+		raw_spin_lock_irq(&callback_lock);
 		cpumask_copy(cp->effective_cpus, new_cpus);
-		spin_unlock_irq(&callback_lock);
+		raw_spin_unlock_irq(&callback_lock);
 
 		WARN_ON(!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
 			!cpumask_equal(cp->cpus_allowed, cp->effective_cpus));
@@ -974,9 +974,9 @@ static int update_cpumask(struct cpuset
 	if (retval < 0)
 		return retval;
 
-	spin_lock_irq(&callback_lock);
+	raw_spin_lock_irq(&callback_lock);
 	cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed);
-	spin_unlock_irq(&callback_lock);
+	raw_spin_unlock_irq(&callback_lock);
 
 	/* use trialcs->cpus_allowed as a temp variable */
 	update_cpumasks_hier(cs, trialcs->cpus_allowed);
@@ -1176,9 +1176,9 @@ static void update_nodemasks_hier(struct
 			continue;
 		rcu_read_unlock();
 
-		spin_lock_irq(&callback_lock);
+		raw_spin_lock_irq(&callback_lock);
 		cp->effective_mems = *new_mems;
-		spin_unlock_irq(&callback_lock);
+		raw_spin_unlock_irq(&callback_lock);
 
 		WARN_ON(!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
 			!nodes_equal(cp->mems_allowed, cp->effective_mems));
@@ -1246,9 +1246,9 @@ static int update_nodemask(struct cpuset
 	if (retval < 0)
 		goto done;
 
-	spin_lock_irq(&callback_lock);
+	raw_spin_lock_irq(&callback_lock);
 	cs->mems_allowed = trialcs->mems_allowed;
-	spin_unlock_irq(&callback_lock);
+	raw_spin_unlock_irq(&callback_lock);
 
 	/* use trialcs->mems_allowed as a temp variable */
 	update_nodemasks_hier(cs, &trialcs->mems_allowed);
@@ -1339,9 +1339,9 @@ static int update_flag(cpuset_flagbits_t
 	spread_flag_changed = ((is_spread_slab(cs) != is_spread_slab(trialcs))
 			|| (is_spread_page(cs) != is_spread_page(trialcs)));
 
-	spin_lock_irq(&callback_lock);
+	raw_spin_lock_irq(&callback_lock);
 	cs->flags = trialcs->flags;
-	spin_unlock_irq(&callback_lock);
+	raw_spin_unlock_irq(&callback_lock);
 
 	if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed)
 		rebuild_sched_domains_locked();
@@ -1756,7 +1756,7 @@ static int cpuset_common_seq_show(struct
 	cpuset_filetype_t type = seq_cft(sf)->private;
 	int ret = 0;
 
-	spin_lock_irq(&callback_lock);
+	raw_spin_lock_irq(&callback_lock);
 
 	switch (type) {
 	case FILE_CPULIST:
@@ -1775,7 +1775,7 @@ static int cpuset_common_seq_show(struct
 		ret = -EINVAL;
 	}
 
-	spin_unlock_irq(&callback_lock);
+	raw_spin_unlock_irq(&callback_lock);
 	return ret;
 }
 
@@ -1989,12 +1989,12 @@ static int cpuset_css_online(struct cgro
 
 	cpuset_inc();
 
-	spin_lock_irq(&callback_lock);
+	raw_spin_lock_irq(&callback_lock);
 	if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys)) {
 		cpumask_copy(cs->effective_cpus, parent->effective_cpus);
 		cs->effective_mems = parent->effective_mems;
 	}
-	spin_unlock_irq(&callback_lock);
+	raw_spin_unlock_irq(&callback_lock);
 
 	if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags))
 		goto out_unlock;
@@ -2021,12 +2021,12 @@ static int cpuset_css_online(struct cgro
 	}
 	rcu_read_unlock();
 
-	spin_lock_irq(&callback_lock);
+	raw_spin_lock_irq(&callback_lock);
 	cs->mems_allowed = parent->mems_allowed;
 	cs->effective_mems = parent->mems_allowed;
 	cpumask_copy(cs->cpus_allowed, parent->cpus_allowed);
 	cpumask_copy(cs->effective_cpus, parent->cpus_allowed);
-	spin_unlock_irq(&callback_lock);
+	raw_spin_unlock_irq(&callback_lock);
 out_unlock:
 	mutex_unlock(&cpuset_mutex);
 	return 0;
@@ -2065,7 +2065,7 @@ static void cpuset_css_free(struct cgrou
 static void cpuset_bind(struct cgroup_subsys_state *root_css)
 {
 	mutex_lock(&cpuset_mutex);
-	spin_lock_irq(&callback_lock);
+	raw_spin_lock_irq(&callback_lock);
 
 	if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys)) {
 		cpumask_copy(top_cpuset.cpus_allowed, cpu_possible_mask);
@@ -2076,7 +2076,7 @@ static void cpuset_bind(struct cgroup_su
 		top_cpuset.mems_allowed = top_cpuset.effective_mems;
 	}
 
-	spin_unlock_irq(&callback_lock);
+	raw_spin_unlock_irq(&callback_lock);
 	mutex_unlock(&cpuset_mutex);
 }
 
@@ -2177,12 +2177,12 @@ hotplug_update_tasks_legacy(struct cpuse
 {
 	bool is_empty;
 
-	spin_lock_irq(&callback_lock);
+	raw_spin_lock_irq(&callback_lock);
 	cpumask_copy(cs->cpus_allowed, new_cpus);
 	cpumask_copy(cs->effective_cpus, new_cpus);
 	cs->mems_allowed = *new_mems;
 	cs->effective_mems = *new_mems;
-	spin_unlock_irq(&callback_lock);
+	raw_spin_unlock_irq(&callback_lock);
 
 	/*
 	 * Don't call update_tasks_cpumask() if the cpuset becomes empty,
@@ -2219,10 +2219,10 @@ hotplug_update_tasks(struct cpuset *cs,
 	if (nodes_empty(*new_mems))
 		*new_mems = parent_cs(cs)->effective_mems;
 
-	spin_lock_irq(&callback_lock);
+	raw_spin_lock_irq(&callback_lock);
 	cpumask_copy(cs->effective_cpus, new_cpus);
 	cs->effective_mems = *new_mems;
-	spin_unlock_irq(&callback_lock);
+	raw_spin_unlock_irq(&callback_lock);
 
 	if (cpus_updated)
 		update_tasks_cpumask(cs);
@@ -2308,21 +2308,21 @@ static void cpuset_hotplug_workfn(struct
 
 	/* synchronize cpus_allowed to cpu_active_mask */
 	if (cpus_updated) {
-		spin_lock_irq(&callback_lock);
+		raw_spin_lock_irq(&callback_lock);
 		if (!on_dfl)
 			cpumask_copy(top_cpuset.cpus_allowed, &new_cpus);
 		cpumask_copy(top_cpuset.effective_cpus, &new_cpus);
-		spin_unlock_irq(&callback_lock);
+		raw_spin_unlock_irq(&callback_lock);
 		/* we don't mess with cpumasks of tasks in top_cpuset */
 	}
 
 	/* synchronize mems_allowed to N_MEMORY */
 	if (mems_updated) {
-		spin_lock_irq(&callback_lock);
+		raw_spin_lock_irq(&callback_lock);
 		if (!on_dfl)
 			top_cpuset.mems_allowed = new_mems;
 		top_cpuset.effective_mems = new_mems;
-		spin_unlock_irq(&callback_lock);
+		raw_spin_unlock_irq(&callback_lock);
 		update_tasks_nodemask(&top_cpuset);
 	}
 
@@ -2420,11 +2420,11 @@ void cpuset_cpus_allowed(struct task_str
 {
 	unsigned long flags;
 
-	spin_lock_irqsave(&callback_lock, flags);
+	raw_spin_lock_irqsave(&callback_lock, flags);
 	rcu_read_lock();
 	guarantee_online_cpus(task_cs(tsk), pmask);
 	rcu_read_unlock();
-	spin_unlock_irqrestore(&callback_lock, flags);
+	raw_spin_unlock_irqrestore(&callback_lock, flags);
 }
 
 void cpuset_cpus_allowed_fallback(struct task_struct *tsk)
@@ -2472,11 +2472,11 @@ nodemask_t cpuset_mems_allowed(struct ta
 	nodemask_t mask;
 	unsigned long flags;
 
-	spin_lock_irqsave(&callback_lock, flags);
+	raw_spin_lock_irqsave(&callback_lock, flags);
 	rcu_read_lock();
 	guarantee_online_mems(task_cs(tsk), &mask);
 	rcu_read_unlock();
-	spin_unlock_irqrestore(&callback_lock, flags);
+	raw_spin_unlock_irqrestore(&callback_lock, flags);
 
 	return mask;
 }
@@ -2568,14 +2568,14 @@ bool __cpuset_node_allowed(int node, gfp
 		return true;
 
 	/* Not hardwall and node outside mems_allowed: scan up cpusets */
-	spin_lock_irqsave(&callback_lock, flags);
+	raw_spin_lock_irqsave(&callback_lock, flags);
 
 	rcu_read_lock();
 	cs = nearest_hardwall_ancestor(task_cs(current));
 	allowed = node_isset(node, cs->mems_allowed);
 	rcu_read_unlock();
 
-	spin_unlock_irqrestore(&callback_lock, flags);
+	raw_spin_unlock_irqrestore(&callback_lock, flags);
 	return allowed;
 }
 


* Re: [patch-rt] kvm: Convert pvclock_gtod_sync_lock to raw_spinlock_t
  2016-12-26  6:54 ` [patch-rt] kvm: Convert pvclock_gtod_sync_lock to raw_spinlock_t Mike Galbraith
@ 2017-01-20 16:44   ` Sebastian Andrzej Siewior
  2017-01-20 17:32     ` Mike Galbraith
  0 siblings, 1 reply; 21+ messages in thread
From: Sebastian Andrzej Siewior @ 2017-01-20 16:44 UTC (permalink / raw)
  To: Mike Galbraith; +Cc: Thomas Gleixner, LKML, linux-rt-users, Steven Rostedt

On 2016-12-26 07:54:08 [+0100], Mike Galbraith wrote:

Not so sure about it.

> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -1540,7 +1540,7 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu
>  	kvm_vcpu_write_tsc_offset(vcpu, offset);
>  	raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags);
>  
> -	spin_lock(&kvm->arch.pvclock_gtod_sync_lock);
> +	raw_spin_lock(&kvm->arch.pvclock_gtod_sync_lock);
>  	if (!matched) {
>  		kvm->arch.nr_vcpus_matched_tsc = 0;
>  	} else if (!already_matched) {
> @@ -1548,7 +1548,7 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu
>  	}
>  
>  	kvm_track_tsc_matching(vcpu);
> -	spin_unlock(&kvm->arch.pvclock_gtod_sync_lock);
> +	raw_spin_unlock(&kvm->arch.pvclock_gtod_sync_lock);
>  }
>  
>  EXPORT_SYMBOL_GPL(kvm_write_tsc);
> @@ -1715,7 +1715,7 @@ static void kvm_gen_update_masterclock(s
>  	struct kvm_vcpu *vcpu;
>  	struct kvm_arch *ka = &kvm->arch;
>  
> -	spin_lock(&ka->pvclock_gtod_sync_lock);
> +	raw_spin_lock(&ka->pvclock_gtod_sync_lock);
>  	kvm_make_mclock_inprogress_request(kvm);

kvm_make_mclock_inprogress_request() will do zalloc_cpumask_var().
Off-stack zalloc is not yet working, but I would like to enable it. It
also does an SMP function call.

Couldn't we go the other way around and drop the local_irq_disable()?

Sebastian


* Re: [rfc patch-rt] posix_cpu_timers: Kill hotplug cpu notifier
  2016-12-26  7:00 ` [rfc patch-rt] posix_cpu_timers: Kill hotplug cpu notifier Mike Galbraith
@ 2017-01-20 16:46   ` Sebastian Andrzej Siewior
  2017-01-20 17:29     ` Mike Galbraith
  0 siblings, 1 reply; 21+ messages in thread
From: Sebastian Andrzej Siewior @ 2017-01-20 16:46 UTC (permalink / raw)
  To: Mike Galbraith; +Cc: Thomas Gleixner, LKML, linux-rt-users, Steven Rostedt

On 2016-12-26 08:00:54 [+0100], Mike Galbraith wrote:
> Shamelessly steal softirq.c thread initialization method.
What is the problem here?

> Signed-off-by: Mike Galbraith <efault@gmx.de>

Sebastian


* Re: [patch-rt] softirq: Move ksoftirqd_running() under !CONFIG_PREEMPT_RT_FULL
  2016-12-31  8:20 ` [patch-rt] softirq: Move ksoftirqd_running() under !CONFIG_PREEMPT_RT_FULL Mike Galbraith
@ 2017-01-20 17:21   ` Sebastian Andrzej Siewior
  0 siblings, 0 replies; 21+ messages in thread
From: Sebastian Andrzej Siewior @ 2017-01-20 17:21 UTC (permalink / raw)
  To: Mike Galbraith; +Cc: Thomas Gleixner, LKML, linux-rt-users, Steven Rostedt

On 2016-12-31 09:20:10 [+0100], Mike Galbraith wrote:
> 
> Otherwise, ktimersoftd may not be awakened when it has work to do.

So the problem is that we have two softirq threads, and we only check
the state of the "normal" one even when we would also have to schedule
the second one.
The approach looks good, applied.
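
A minimal sketch of the idea (the !RT helper as it exists in v4.9; the
RT side here is illustrative only):

#ifndef CONFIG_PREEMPT_RT_FULL
static bool ksoftirqd_running(void)
{
	struct task_struct *tsk = __this_cpu_read(ksoftirqd);

	return tsk && (tsk->state == TASK_RUNNING);
}
#else
/*
 * RT runs a second thread (ktimersoftd) for the timer softirqs, so
 * looking only at ksoftirqd could report "already running" while
 * ktimersoftd still needs its wakeup; the shortcut is therefore not
 * used on RT.
 */
static bool ksoftirqd_running(void) { return false; }
#endif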

> [   84.087571] NOHZ: local_softirq_pending 02
> [   84.087593] NOHZ: local_softirq_pending 02
> [   84.087598] NOHZ: local_softirq_pending 02
> [   84.087904] NOHZ: local_softirq_pending 02
> [   84.088526] NOHZ: local_softirq_pending 02
> [   84.088899] NOHZ: local_softirq_pending 02
> [   84.089463] NOHZ: local_softirq_pending 02
> [  115.013470] NOHZ: local_softirq_pending 02
> [  115.013601] NOHZ: local_softirq_pending 02
> [  115.013709] NOHZ: local_softirq_pending 02
> 
> Signed-off-by: Mike Galbraith <efault@gmx.de>

Sebastian

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [rfc patch-rt] posix_cpu_timers: Kill hotplug cpu notifier
  2017-01-20 16:46   ` Sebastian Andrzej Siewior
@ 2017-01-20 17:29     ` Mike Galbraith
  2017-01-20 17:34       ` Sebastian Andrzej Siewior
  0 siblings, 1 reply; 21+ messages in thread
From: Mike Galbraith @ 2017-01-20 17:29 UTC (permalink / raw)
  To: Sebastian Andrzej Siewior
  Cc: Thomas Gleixner, LKML, linux-rt-users, Steven Rostedt

On Fri, 2017-01-20 at 17:46 +0100, Sebastian Andrzej Siewior wrote:
> On 2016-12-26 08:00:54 [+0100], Mike Galbraith wrote:
> > Shamelessly steal softirq.c thread initialization method.
> What is the problem here?

There is no problem in 4.9.  I did that for upstream, thought it might 
save someone a minute or two.

	-Mike

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [patch-rt] kvm: Convert pvclock_gtod_sync_lock to raw_spinlock_t
  2017-01-20 16:44   ` Sebastian Andrzej Siewior
@ 2017-01-20 17:32     ` Mike Galbraith
  0 siblings, 0 replies; 21+ messages in thread
From: Mike Galbraith @ 2017-01-20 17:32 UTC (permalink / raw)
  To: Sebastian Andrzej Siewior
  Cc: Thomas Gleixner, LKML, linux-rt-users, Steven Rostedt

On Fri, 2017-01-20 at 17:44 +0100, Sebastian Andrzej Siewior wrote:

> kvm_make_mclock_inprogress_request() will do zalloc_cpumask_var().
> Off-stack zalloc is not yet working, but I would like to enable it. It
> also does an SMP function call.
> 
> Couldn't we go the other way around and drop the local_irq_disable()?

I suppose so.

	-Mike

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [rfc patch-rt] posix_cpu_timers: Kill hotplug cpu notifier
  2017-01-20 17:29     ` Mike Galbraith
@ 2017-01-20 17:34       ` Sebastian Andrzej Siewior
  2017-01-20 17:56         ` Mike Galbraith
  0 siblings, 1 reply; 21+ messages in thread
From: Sebastian Andrzej Siewior @ 2017-01-20 17:34 UTC (permalink / raw)
  To: Mike Galbraith; +Cc: Thomas Gleixner, LKML, linux-rt-users, Steven Rostedt

On 2017-01-20 18:29:54 [+0100], Mike Galbraith wrote:
> On Fri, 2017-01-20 at 17:46 +0100, Sebastian Andrzej Siewior wrote:
> > On 2016-12-26 08:00:54 [+0100], Mike Galbraith wrote:
> > > Shamelessly steal softirq.c thread initialization method.
> > What is the problem here?
> 
> There is no problem in 4.9.  I did that for upstream, thought it might 
> save someone a minute or two.

I see. The subject says RT so I assumed it is somehow RT related. But as
of now we should have this hotplug notifier removed, right?

> 	-Mike

Sebastian

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [rfc patch-rt] posix_cpu_timers: Kill hotplug cpu notifier
  2017-01-20 17:34       ` Sebastian Andrzej Siewior
@ 2017-01-20 17:56         ` Mike Galbraith
  0 siblings, 0 replies; 21+ messages in thread
From: Mike Galbraith @ 2017-01-20 17:56 UTC (permalink / raw)
  To: Sebastian Andrzej Siewior
  Cc: Thomas Gleixner, LKML, linux-rt-users, Steven Rostedt

On Fri, 2017-01-20 at 18:34 +0100, Sebastian Andrzej Siewior wrote:
> On 2017-01-20 18:29:54 [+0100], Mike Galbraith wrote:
> > On Fri, 2017-01-20 at 17:46 +0100, Sebastian Andrzej Siewior wrote:
> > > On 2016-12-26 08:00:54 [+0100], Mike Galbraith wrote:
> > > > Shamelessly steal softirq.c thread initialization method.
> > > What is the problem here?
> > 
> > There is no problem in 4.9.  I did that for upstream, thought it might 
> > save someone a minute or two.
> 
> I see. The subject says RT so I assumed it is somehow RT related. But as
> of now we should have this hotplug notifier removed, right?

Yeah, RT only; it adds the thread and the soon-to-be-obsolete hotplug bits.

patches/0101-posix-timers-Thread-posix-cpu-timers-on-rt.patch

	-Mike

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [rfc patch-rt] radix-tree: Partially disable memcg accounting in radix_tree_node_alloc()
  2017-01-06  8:13 ` [rfc patch-rt] radix-tree: Partially disable memcg accounting in radix_tree_node_alloc() Mike Galbraith
  2017-01-06  8:28   ` Mike Galbraith
  2017-01-06  8:55   ` Michal Hocko
@ 2017-01-25 15:06   ` Sebastian Andrzej Siewior
  2017-01-26  3:42     ` Mike Galbraith
  2 siblings, 1 reply; 21+ messages in thread
From: Sebastian Andrzej Siewior @ 2017-01-25 15:06 UTC (permalink / raw)
  To: Mike Galbraith; +Cc: Thomas Gleixner, LKML, linux-rt-users, Steven Rostedt

On 2017-01-06 09:13:23 [+0100], Mike Galbraith wrote:
> radix-tree: Partially disable memcg accounting in radix_tree_node_alloc()
> 
> Having no preload, which turns accounting off for non-rt kernels, trying to
> allocate coming from shmem_fault() when memcg is full sends us scurrying off
> to pagefault_out_of_memory(), with dramatic (usually terminal) consequences.
> LTP's madvise06 testcase triggers this quite well, and per gitk, the below
> was the beginning of RT memcg woes.
> 
> 58e698af4c63 radix-tree: account radix_tree_node to memory cgroup
> 
> Turn memcg accounting off for RT in the problematic path.

According to the description of radix_tree_preload(), a return code of
0 means that the following addition of a single element does not fail.
But in RT's case this requirement is not fulfilled. There is more than
just one user of that function. So instead of adding an exception here
and maybe another one someplace else later, what about the following
patch?
That testcase you mentioned passes now:

|testcases/kernel/syscalls/madvise/madvise06
|tst_test.c:760: INFO: Timeout per run is 0h 05m 00s
|madvise06.c:65: INFO: dropping caches
|madvise06.c:139: INFO: SwapCached (before madvise): 304
|madvise06.c:153: INFO: SwapCached (after madvise): 309988
|madvise06.c:155: PASS: Regression test pass
|
|Summary:
|passed   1
|failed   0
|skipped  0
|warnings 0
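
For reference, the caller-side contract referred to above is roughly
the following (an illustrative sketch modelled on the page-cache usage,
not part of the patch below):

	if (radix_tree_preload(GFP_KERNEL))	/* may sleep, fills per-CPU pool */
		return -ENOMEM;

	spin_lock_irq(&mapping->tree_lock);	/* atomic section begins */
	/* must not fail with -ENOMEM here - only consumes preloaded nodes */
	error = radix_tree_insert(&mapping->page_tree, index, item);
	spin_unlock_irq(&mapping->tree_lock);

	radix_tree_preload_end();		/* ends the preload section */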

diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h
index f87f87dec84c..277295039c8f 100644
--- a/include/linux/radix-tree.h
+++ b/include/linux/radix-tree.h
@@ -289,19 +289,11 @@ unsigned int radix_tree_gang_lookup(struct radix_tree_root *root,
 unsigned int radix_tree_gang_lookup_slot(struct radix_tree_root *root,
 			void ***results, unsigned long *indices,
 			unsigned long first_index, unsigned int max_items);
-#ifdef CONFIG_PREEMPT_RT_FULL
-static inline int radix_tree_preload(gfp_t gm) { return 0; }
-static inline int radix_tree_maybe_preload(gfp_t gfp_mask) { return 0; }
-static inline int radix_tree_maybe_preload_order(gfp_t gfp_mask, int order)
-{
-	return 0;
-};
-
-#else
 int radix_tree_preload(gfp_t gfp_mask);
 int radix_tree_maybe_preload(gfp_t gfp_mask);
 int radix_tree_maybe_preload_order(gfp_t gfp_mask, int order);
-#endif
+void radix_tree_preload_end(void);
+
 void radix_tree_init(void);
 void *radix_tree_tag_set(struct radix_tree_root *root,
 			unsigned long index, unsigned int tag);
@@ -324,11 +316,6 @@ unsigned long radix_tree_range_tag_if_tagged(struct radix_tree_root *root,
 int radix_tree_tagged(struct radix_tree_root *root, unsigned int tag);
 unsigned long radix_tree_locate_item(struct radix_tree_root *root, void *item);
 
-static inline void radix_tree_preload_end(void)
-{
-	preempt_enable_nort();
-}
-
 /**
  * struct radix_tree_iter - radix tree iterator state
  *
diff --git a/lib/radix-tree.c b/lib/radix-tree.c
index 881cc195d85f..e96c6a99f25c 100644
--- a/lib/radix-tree.c
+++ b/lib/radix-tree.c
@@ -36,7 +36,7 @@
 #include <linux/bitops.h>
 #include <linux/rcupdate.h>
 #include <linux/preempt.h>		/* in_interrupt() */
-
+#include <linux/locallock.h>
 
 /* Number of nodes in fully populated tree of given height */
 static unsigned long height_to_maxnodes[RADIX_TREE_MAX_PATH + 1] __read_mostly;
@@ -68,6 +68,7 @@ struct radix_tree_preload {
 	struct radix_tree_node *nodes;
 };
 static DEFINE_PER_CPU(struct radix_tree_preload, radix_tree_preloads) = { 0, };
+static DEFINE_LOCAL_IRQ_LOCK(radix_tree_preloads_lock);
 
 static inline void *node_to_entry(void *ptr)
 {
@@ -290,14 +291,14 @@ radix_tree_node_alloc(struct radix_tree_root *root)
 		 * succeed in getting a node here (and never reach
 		 * kmem_cache_alloc)
 		 */
-		rtp = &get_cpu_var(radix_tree_preloads);
+		rtp = &get_locked_var(radix_tree_preloads_lock, radix_tree_preloads);
 		if (rtp->nr) {
 			ret = rtp->nodes;
 			rtp->nodes = ret->private_data;
 			ret->private_data = NULL;
 			rtp->nr--;
 		}
-		put_cpu_var(radix_tree_preloads);
+		put_locked_var(radix_tree_preloads_lock, radix_tree_preloads);
 		/*
 		 * Update the allocation stack trace as this is more useful
 		 * for debugging.
@@ -337,7 +338,6 @@ radix_tree_node_free(struct radix_tree_node *node)
 	call_rcu(&node->rcu_head, radix_tree_node_rcu_free);
 }
 
-#ifndef CONFIG_PREEMPT_RT_FULL
 /*
  * Load up this CPU's radix_tree_node buffer with sufficient objects to
  * ensure that the addition of a single element in the tree cannot fail.  On
@@ -359,14 +359,14 @@ static int __radix_tree_preload(gfp_t gfp_mask, int nr)
 	 */
 	gfp_mask &= ~__GFP_ACCOUNT;
 
-	preempt_disable();
+	local_lock(radix_tree_preloads_lock);
 	rtp = this_cpu_ptr(&radix_tree_preloads);
 	while (rtp->nr < nr) {
-		preempt_enable();
+		local_unlock(radix_tree_preloads_lock);
 		node = kmem_cache_alloc(radix_tree_node_cachep, gfp_mask);
 		if (node == NULL)
 			goto out;
-		preempt_disable();
+		local_lock(radix_tree_preloads_lock);
 		rtp = this_cpu_ptr(&radix_tree_preloads);
 		if (rtp->nr < nr) {
 			node->private_data = rtp->nodes;
@@ -408,7 +408,7 @@ int radix_tree_maybe_preload(gfp_t gfp_mask)
 	if (gfpflags_allow_blocking(gfp_mask))
 		return __radix_tree_preload(gfp_mask, RADIX_TREE_PRELOAD_SIZE);
 	/* Preloading doesn't help anything with this gfp mask, skip it */
-	preempt_disable();
+	local_lock(radix_tree_preloads_lock);
 	return 0;
 }
 EXPORT_SYMBOL(radix_tree_maybe_preload);
@@ -424,7 +424,7 @@ int radix_tree_maybe_preload_order(gfp_t gfp_mask, int order)
 
 	/* Preloading doesn't help anything with this gfp mask, skip it */
 	if (!gfpflags_allow_blocking(gfp_mask)) {
-		preempt_disable();
+		local_lock(radix_tree_preloads_lock);
 		return 0;
 	}
 
@@ -457,7 +457,11 @@ int radix_tree_maybe_preload_order(gfp_t gfp_mask, int order)
 
 	return __radix_tree_preload(gfp_mask, nr_nodes);
 }
-#endif
+
+void radix_tree_preload_end(void)
+{
+	local_unlock(radix_tree_preloads_lock);
+}
 
 /*
  * The maximum index which can be stored in a radix tree

^ permalink raw reply related	[flat|nested] 21+ messages in thread

* Re: {patch-rt] cpuset: Convert callback_lock to raw_spinlock_t
  2017-01-08  8:32 ` {patch-rt] cpuset: Convert callback_lock to raw_spinlock_t Mike Galbraith
@ 2017-01-25 15:45   ` Sebastian Andrzej Siewior
  0 siblings, 0 replies; 21+ messages in thread
From: Sebastian Andrzej Siewior @ 2017-01-25 15:45 UTC (permalink / raw)
  To: Mike Galbraith; +Cc: Thomas Gleixner, LKML, linux-rt-users, Steven Rostedt

On 2017-01-08 09:32:25 [+0100], Mike Galbraith wrote:
> 
> The two commits below add up to a cpuset might_sleep() splat for RT:
> Signed-off-by: Mike Galbraith <efault@gmx.de>
> Cc: stable-rt@vger.kernel.org

applied.

Sebastian

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [rfc patch-rt] radix-tree: Partially disable memcg accounting in radix_tree_node_alloc()
  2017-01-25 15:06   ` Sebastian Andrzej Siewior
@ 2017-01-26  3:42     ` Mike Galbraith
  0 siblings, 0 replies; 21+ messages in thread
From: Mike Galbraith @ 2017-01-26  3:42 UTC (permalink / raw)
  To: Sebastian Andrzej Siewior
  Cc: Thomas Gleixner, LKML, linux-rt-users, Steven Rostedt

On Wed, 2017-01-25 at 16:06 +0100, Sebastian Andrzej Siewior wrote:

> According to the description of radix_tree_preload(), a return code of
> 0 means that the following addition of a single element does not fail.
> But in RT's case this requirement is not fulfilled. There is more than
> just one user of that function. So instead of adding an exception here
> and maybe another one someplace else later, what about the following
> patch?
> That testcase you mentioned passes now:

Modulo missing EXPORT_SYMBOL(), yup, works fine.
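
(Presumably the export for the now out-of-line radix_tree_preload_end(),
so modular callers still link - a sketch of the missing bit:)

	void radix_tree_preload_end(void)
	{
		local_unlock(radix_tree_preloads_lock);
	}
	EXPORT_SYMBOL(radix_tree_preload_end);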

> > testcases/kernel/syscalls/madvise/madvise06
> > tst_test.c:760: INFO: Timeout per run is 0h 05m 00s
> > madvise06.c:65: INFO: dropping caches
> > madvise06.c:139: INFO: SwapCached (before madvise): 304
> > madvise06.c:153: INFO: SwapCached (after madvise): 309988
> > madvise06.c:155: PASS: Regression test pass
> > 
> > Summary:
> > passed   1
> > failed   0
> > skipped  0
> > warnings 0
> 
> diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h
> index f87f87dec84c..277295039c8f 100644
> --- a/include/linux/radix-tree.h
> +++ b/include/linux/radix-tree.h
> @@ -289,19 +289,11 @@ unsigned int radix_tree_gang_lookup(struct radix_tree_root *root,
>  unsigned int radix_tree_gang_lookup_slot(struct radix_tree_root *root,
>  			void ***results, unsigned long *indices,
>  			unsigned long first_index, unsigned int max_items);
> -#ifdef CONFIG_PREEMPT_RT_FULL
> -static inline int radix_tree_preload(gfp_t gm) { return 0; }
> -static inline int radix_tree_maybe_preload(gfp_t gfp_mask) { return 0; }
> -static inline int radix_tree_maybe_preload_order(gfp_t gfp_mask, int order)
> -{
> -	return 0;
> -};
> -
> -#else
>  int radix_tree_preload(gfp_t gfp_mask);
>  int radix_tree_maybe_preload(gfp_t gfp_mask);
>  int radix_tree_maybe_preload_order(gfp_t gfp_mask, int order);
> -#endif
> +void radix_tree_preload_end(void);
> +
>  void radix_tree_init(void);
>  void *radix_tree_tag_set(struct radix_tree_root *root,
>  			unsigned long index, unsigned int tag);
> @@ -324,11 +316,6 @@ unsigned long radix_tree_range_tag_if_tagged(struct radix_tree_root *root,
>  int radix_tree_tagged(struct radix_tree_root *root, unsigned int tag);
>  unsigned long radix_tree_locate_item(struct radix_tree_root *root, void *item);
>  
> -static inline void radix_tree_preload_end(void)
> -{
> -	preempt_enable_nort();
> -}
> -
>  /**
>   * struct radix_tree_iter - radix tree iterator state
>   *
> diff --git a/lib/radix-tree.c b/lib/radix-tree.c
> index 881cc195d85f..e96c6a99f25c 100644
> --- a/lib/radix-tree.c
> +++ b/lib/radix-tree.c
> @@ -36,7 +36,7 @@
>  #include <linux/bitops.h>
>  #include <linux/rcupdate.h>
>  #include <linux/preempt.h>		/* in_interrupt() */
> -
> +#include <linux/locallock.h>
>  
>  /* Number of nodes in fully populated tree of given height */
>  static unsigned long height_to_maxnodes[RADIX_TREE_MAX_PATH + 1] __read_mostly;
> @@ -68,6 +68,7 @@ struct radix_tree_preload {
>  	struct radix_tree_node *nodes;
>  };
>  static DEFINE_PER_CPU(struct radix_tree_preload, radix_tree_preloads) = { 0, };
> +static DEFINE_LOCAL_IRQ_LOCK(radix_tree_preloads_lock);
>  
>  static inline void *node_to_entry(void *ptr)
>  {
> @@ -290,14 +291,14 @@ radix_tree_node_alloc(struct radix_tree_root *root)
>  		 * succeed in getting a node here (and never reach
>  		 * kmem_cache_alloc)
>  		 */
> -		rtp = &get_cpu_var(radix_tree_preloads);
> +		rtp = &get_locked_var(radix_tree_preloads_lock, radix_tree_preloads);
>  		if (rtp->nr) {
>  			ret = rtp->nodes;
>  			rtp->nodes = ret->private_data;
>  			ret->private_data = NULL;
>  			rtp->nr--;
>  		}
> -		put_cpu_var(radix_tree_preloads);
> +		put_locked_var(radix_tree_preloads_lock, radix_tree_preloads);
>  		/*
>  		 * Update the allocation stack trace as this is more useful
>  		 * for debugging.
> @@ -337,7 +338,6 @@ radix_tree_node_free(struct radix_tree_node *node)
>  	call_rcu(&node->rcu_head, radix_tree_node_rcu_free);
>  }
>  
> -#ifndef CONFIG_PREEMPT_RT_FULL
>  /*
>   * Load up this CPU's radix_tree_node buffer with sufficient objects to
>   * ensure that the addition of a single element in the tree cannot fail.  On
> @@ -359,14 +359,14 @@ static int __radix_tree_preload(gfp_t gfp_mask, int nr)
>  	 */
>  	gfp_mask &= ~__GFP_ACCOUNT;
>  
> -	preempt_disable();
> +	local_lock(radix_tree_preloads_lock);
>  	rtp = this_cpu_ptr(&radix_tree_preloads);
>  	while (rtp->nr < nr) {
> -		preempt_enable();
> +		local_unlock(radix_tree_preloads_lock);
>  		node = kmem_cache_alloc(radix_tree_node_cachep, gfp_mask);
>  		if (node == NULL)
>  			goto out;
> -		preempt_disable();
> +		local_lock(radix_tree_preloads_lock);
>  		rtp = this_cpu_ptr(&radix_tree_preloads);
>  		if (rtp->nr < nr) {
>  			node->private_data = rtp->nodes;
> @@ -408,7 +408,7 @@ int radix_tree_maybe_preload(gfp_t gfp_mask)
>  	if (gfpflags_allow_blocking(gfp_mask))
>  		return __radix_tree_preload(gfp_mask, RADIX_TREE_PRELOAD_SIZE);
>  	/* Preloading doesn't help anything with this gfp mask, skip it */
> -	preempt_disable();
> +	local_lock(radix_tree_preloads_lock);
>  	return 0;
>  }
>  EXPORT_SYMBOL(radix_tree_maybe_preload);
> @@ -424,7 +424,7 @@ int radix_tree_maybe_preload_order(gfp_t gfp_mask, int order)
>  
>  	/* Preloading doesn't help anything with this gfp mask, skip it */
>  	if (!gfpflags_allow_blocking(gfp_mask)) {
> -		preempt_disable();
> +		local_lock(radix_tree_preloads_lock);
>  		return 0;
>  	}
>  
> @@ -457,7 +457,11 @@ int radix_tree_maybe_preload_order(gfp_t gfp_mask, int order)
>  
>  	return __radix_tree_preload(gfp_mask, nr_nodes);
>  }
> -#endif
> +
> +void radix_tree_preload_end(void)
> +{
> +	local_unlock(radix_tree_preloads_lock);
> +}
>  
>  /*
>   * The maximum index which can be stored in a radix tree

^ permalink raw reply	[flat|nested] 21+ messages in thread

end of thread, other threads:[~2017-01-26  3:43 UTC | newest]

Thread overview: 21+ messages
2016-12-23 16:32 [ANNOUNCE] v4.9-rt1 Sebastian Andrzej Siewior
2016-12-26  6:54 ` [patch-rt] kvm: Convert pvclock_gtod_sync_lock to raw_spinlock_t Mike Galbraith
2017-01-20 16:44   ` Sebastian Andrzej Siewior
2017-01-20 17:32     ` Mike Galbraith
2016-12-26  7:00 ` [rfc patch-rt] posix_cpu_timers: Kill hotplug cpu notifier Mike Galbraith
2017-01-20 16:46   ` Sebastian Andrzej Siewior
2017-01-20 17:29     ` Mike Galbraith
2017-01-20 17:34       ` Sebastian Andrzej Siewior
2017-01-20 17:56         ` Mike Galbraith
2016-12-31  8:20 ` [patch-rt] softirq: Move ksoftirqd_running() under !CONFIG_PREEMPT_RT_FULL Mike Galbraith
2017-01-20 17:21   ` Sebastian Andrzej Siewior
2017-01-06  8:13 ` [rfc patch-rt] radix-tree: Partially disable memcg accounting in radix_tree_node_alloc() Mike Galbraith
2017-01-06  8:28   ` Mike Galbraith
2017-01-06  8:55   ` Michal Hocko
2017-01-06 10:52     ` Mike Galbraith
2017-01-06 12:20       ` Mike Galbraith
2017-01-06 12:44         ` Mike Galbraith
2017-01-25 15:06   ` Sebastian Andrzej Siewior
2017-01-26  3:42     ` Mike Galbraith
2017-01-08  8:32 ` {patch-rt] cpuset: Convert callback_lock to raw_spinlock_t Mike Galbraith
2017-01-25 15:45   ` Sebastian Andrzej Siewior
