* [PATCH v2 00/10] smp: irq_work / smp_call_function rework
@ 2020-08-18 10:51 Peter Zijlstra
  2020-08-18 10:51 ` [PATCH v2 01/10] irq_work: Cleanup Peter Zijlstra
                   ` (9 more replies)
  0 siblings, 10 replies; 22+ messages in thread
From: Peter Zijlstra @ 2020-08-18 10:51 UTC (permalink / raw)
  To: mingo, torvalds
  Cc: linux-kernel, will, paulmck, hch, axboe, chris, davem, kuba,
	fweisbec, oleg, vincent.guittot, peterz

Hi,

Here is a new version of the irq_work / smp_call_function integration / cleanup.

I'm thinking the first (5?) patches should be ready to go in, now that all
the dependents (i915, block) landed this merge window.

The rest attempts to 'fix' smp_call_function_single_async() while migrating a
number of performance-sensitive users over to a new (fairly crap by
requirements) interface.

But by getting 'rid' of those smp_call_function_single_async() users, we can
'fix' that interface.
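
For reference, the usage pattern being reworked looks roughly like the
sketch below (illustrative only; the my_func/my_csd/kick names are made
up). The caller both pokes at the csd layout and has to serialize re-use
of the csd itself:

  #include <linux/smp.h>
  #include <linux/percpu.h>

  static void my_func(void *info)
  {
  	/* runs in IPI context on the remote CPU */
  }

  static DEFINE_PER_CPU(call_single_data_t, my_csd);

  static void kick(int cpu)
  {
  	call_single_data_t *csd = &per_cpu(my_csd, cpu);

  	csd->func = my_func;	/* relies on the csd layout ... */
  	csd->info = NULL;
  	/* ... and on the csd not still being queued from a previous kick() */
  	smp_call_function_single_async(cpu, csd);
  }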

I'm still not entirely happy with the end result. Suggestions?





* [PATCH v2 01/10] irq_work: Cleanup
  2020-08-18 10:51 [PATCH v2 00/10] smp: irq_work / smp_call_function rework Peter Zijlstra
@ 2020-08-18 10:51 ` Peter Zijlstra
  2020-08-18 10:51 ` [PATCH v2 02/10] smp: Cleanup smp_call_function*() Peter Zijlstra
                   ` (8 subsequent siblings)
  9 siblings, 0 replies; 22+ messages in thread
From: Peter Zijlstra @ 2020-08-18 10:51 UTC (permalink / raw)
  To: mingo, torvalds
  Cc: linux-kernel, will, paulmck, hch, axboe, chris, davem, kuba,
	fweisbec, oleg, vincent.guittot, peterz

Get rid of the __call_single_node union and clean up the API a little
to avoid external code relying on the structure layout as much.
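
As an illustration only (not part of the patch; the my_handler/my_work
names are made up), external users are expected to switch to the new
initializers and accessors instead of reading work->flags directly:

  #include <linux/irq_work.h>

  static void my_handler(struct irq_work *work)
  {
  	/* runs from the irq_work interrupt */
  }

  static struct irq_work my_work = IRQ_WORK_INIT_HARD(my_handler);

  static void example(void)
  {
  	/* previously: atomic_read(&my_work.flags) & IRQ_WORK_BUSY */
  	if (!irq_work_is_busy(&my_work))
  		irq_work_queue(&my_work);
  }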

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 drivers/gpu/drm/i915/i915_request.c |    4 ++--
 include/linux/irq_work.h            |   33 +++++++++++++++++++++------------
 include/linux/irqflags.h            |    4 ++--
 kernel/bpf/stackmap.c               |    2 +-
 kernel/irq_work.c                   |   18 +++++++++---------
 kernel/printk/printk.c              |    6 ++----
 kernel/rcu/tree.c                   |    3 +--
 kernel/time/tick-sched.c            |    6 ++----
 kernel/trace/bpf_trace.c            |    2 +-
 9 files changed, 41 insertions(+), 37 deletions(-)

--- a/drivers/gpu/drm/i915/i915_request.c
+++ b/drivers/gpu/drm/i915/i915_request.c
@@ -196,7 +196,7 @@ static void __notify_execute_cb(struct i
 	if (llist_empty(&rq->execute_cb))
 		return;
 
-	llist_for_each_entry_safe(cb, cn, rq->execute_cb.first, work.llnode)
+	llist_for_each_entry_safe(cb, cn, rq->execute_cb.first, work.node.llist)
 		irq_work_queue(&cb->work);
 
 	/*
@@ -448,7 +448,7 @@ __await_execution(struct i915_request *r
 		i915_sw_fence_complete(cb->fence);
 		kmem_cache_free(global.slab_execute_cbs, cb);
 	} else {
-		__llist_add(&cb->work.llnode, &signal->execute_cb);
+		__llist_add(&cb->work.node.llist, &signal->execute_cb);
 	}
 	spin_unlock_irq(&signal->lock);
 
--- a/include/linux/irq_work.h
+++ b/include/linux/irq_work.h
@@ -14,28 +14,37 @@
  */
 
 struct irq_work {
-	union {
-		struct __call_single_node node;
-		struct {
-			struct llist_node llnode;
-			atomic_t flags;
-		};
-	};
+	struct __call_single_node node;
 	void (*func)(struct irq_work *);
 };
 
+#define __IRQ_WORK_INIT(_func, _flags) (struct irq_work){	\
+	.node = { .u_flags = (_flags), },			\
+	.func = (_func),					\
+}
+
+#define IRQ_WORK_INIT(_func) __IRQ_WORK_INIT(_func, 0)
+#define IRQ_WORK_INIT_LAZY(_func) __IRQ_WORK_INIT(_func, IRQ_WORK_LAZY)
+#define IRQ_WORK_INIT_HARD(_func) __IRQ_WORK_INIT(_func, IRQ_WORK_HARD_IRQ)
+
+#define DEFINE_IRQ_WORK(name, _f)				\
+	struct irq_work name = IRQ_WORK_INIT(_f)
+
 static inline
 void init_irq_work(struct irq_work *work, void (*func)(struct irq_work *))
 {
-	atomic_set(&work->flags, 0);
-	work->func = func;
+	*work = IRQ_WORK_INIT(func);
 }
 
-#define DEFINE_IRQ_WORK(name, _f) struct irq_work name = {	\
-		.flags = ATOMIC_INIT(0),			\
-		.func  = (_f)					\
+static inline bool irq_work_is_pending(struct irq_work *work)
+{
+	return atomic_read(&work->node.a_flags) & IRQ_WORK_PENDING;
 }
 
+static inline bool irq_work_is_busy(struct irq_work *work)
+{
+	return atomic_read(&work->node.a_flags) & IRQ_WORK_BUSY;
+}
 
 bool irq_work_queue(struct irq_work *work);
 bool irq_work_queue_on(struct irq_work *work, int cpu);
--- a/include/linux/irqflags.h
+++ b/include/linux/irqflags.h
@@ -108,12 +108,12 @@ do {						\
 
 # define lockdep_irq_work_enter(__work)					\
 	  do {								\
-		  if (!(atomic_read(&__work->flags) & IRQ_WORK_HARD_IRQ))\
+		  if (!(atomic_read(&__work->node.a_flags) & IRQ_WORK_HARD_IRQ))\
 			current->irq_config = 1;			\
 	  } while (0)
 # define lockdep_irq_work_exit(__work)					\
 	  do {								\
-		  if (!(atomic_read(&__work->flags) & IRQ_WORK_HARD_IRQ))\
+		  if (!(atomic_read(&__work->node.a_flags) & IRQ_WORK_HARD_IRQ))\
 			current->irq_config = 0;			\
 	  } while (0)
 
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -294,7 +294,7 @@ static void stack_map_get_build_id_offse
 	if (irqs_disabled()) {
 		if (!IS_ENABLED(CONFIG_PREEMPT_RT)) {
 			work = this_cpu_ptr(&up_read_work);
-			if (atomic_read(&work->irq_work.flags) & IRQ_WORK_BUSY) {
+			if (irq_work_is_busy(&work->irq_work)) {
 				/* cannot queue more up_read, fallback */
 				irq_work_busy = true;
 			}
--- a/kernel/irq_work.c
+++ b/kernel/irq_work.c
@@ -31,7 +31,7 @@ static bool irq_work_claim(struct irq_wo
 {
 	int oflags;
 
-	oflags = atomic_fetch_or(IRQ_WORK_CLAIMED | CSD_TYPE_IRQ_WORK, &work->flags);
+	oflags = atomic_fetch_or(IRQ_WORK_CLAIMED | CSD_TYPE_IRQ_WORK, &work->node.a_flags);
 	/*
 	 * If the work is already pending, no need to raise the IPI.
 	 * The pairing atomic_fetch_andnot() in irq_work_run() makes sure
@@ -53,12 +53,12 @@ void __weak arch_irq_work_raise(void)
 static void __irq_work_queue_local(struct irq_work *work)
 {
 	/* If the work is "lazy", handle it from next tick if any */
-	if (atomic_read(&work->flags) & IRQ_WORK_LAZY) {
-		if (llist_add(&work->llnode, this_cpu_ptr(&lazy_list)) &&
+	if (atomic_read(&work->node.a_flags) & IRQ_WORK_LAZY) {
+		if (llist_add(&work->node.llist, this_cpu_ptr(&lazy_list)) &&
 		    tick_nohz_tick_stopped())
 			arch_irq_work_raise();
 	} else {
-		if (llist_add(&work->llnode, this_cpu_ptr(&raised_list)))
+		if (llist_add(&work->node.llist, this_cpu_ptr(&raised_list)))
 			arch_irq_work_raise();
 	}
 }
@@ -102,7 +102,7 @@ bool irq_work_queue_on(struct irq_work *
 	if (cpu != smp_processor_id()) {
 		/* Arch remote IPI send/receive backend aren't NMI safe */
 		WARN_ON_ONCE(in_nmi());
-		__smp_call_single_queue(cpu, &work->llnode);
+		__smp_call_single_queue(cpu, &work->node.llist);
 	} else {
 		__irq_work_queue_local(work);
 	}
@@ -142,7 +142,7 @@ void irq_work_single(void *arg)
 	 * to claim that work don't rely on us to handle their data
 	 * while we are in the middle of the func.
 	 */
-	flags = atomic_fetch_andnot(IRQ_WORK_PENDING, &work->flags);
+	flags = atomic_fetch_andnot(IRQ_WORK_PENDING, &work->node.a_flags);
 
 	lockdep_irq_work_enter(work);
 	work->func(work);
@@ -152,7 +152,7 @@ void irq_work_single(void *arg)
 	 * no-one else claimed it meanwhile.
 	 */
 	flags &= ~IRQ_WORK_PENDING;
-	(void)atomic_cmpxchg(&work->flags, flags, flags & ~IRQ_WORK_BUSY);
+	(void)atomic_cmpxchg(&work->node.a_flags, flags, flags & ~IRQ_WORK_BUSY);
 }
 
 static void irq_work_run_list(struct llist_head *list)
@@ -166,7 +166,7 @@ static void irq_work_run_list(struct lli
 		return;
 
 	llnode = llist_del_all(list);
-	llist_for_each_entry_safe(work, tmp, llnode, llnode)
+	llist_for_each_entry_safe(work, tmp, llnode, node.llist)
 		irq_work_single(work);
 }
 
@@ -198,7 +198,7 @@ void irq_work_sync(struct irq_work *work
 {
 	lockdep_assert_irqs_enabled();
 
-	while (atomic_read(&work->flags) & IRQ_WORK_BUSY)
+	while (irq_work_is_busy(work))
 		cpu_relax();
 }
 EXPORT_SYMBOL_GPL(irq_work_sync);
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -3029,10 +3029,8 @@ static void wake_up_klogd_work_func(stru
 		wake_up_interruptible(&log_wait);
 }
 
-static DEFINE_PER_CPU(struct irq_work, wake_up_klogd_work) = {
-	.func = wake_up_klogd_work_func,
-	.flags = ATOMIC_INIT(IRQ_WORK_LAZY),
-};
+static DEFINE_PER_CPU(struct irq_work, wake_up_klogd_work) =
+	IRQ_WORK_INIT_LAZY(wake_up_klogd_work_func);
 
 void wake_up_klogd(void)
 {
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -1305,8 +1305,6 @@ static int rcu_implicit_dynticks_qs(stru
 		if (IS_ENABLED(CONFIG_IRQ_WORK) &&
 		    !rdp->rcu_iw_pending && rdp->rcu_iw_gp_seq != rnp->gp_seq &&
 		    (rnp->ffmask & rdp->grpmask)) {
-			init_irq_work(&rdp->rcu_iw, rcu_iw_handler);
-			atomic_set(&rdp->rcu_iw.flags, IRQ_WORK_HARD_IRQ);
 			rdp->rcu_iw_pending = true;
 			rdp->rcu_iw_gp_seq = rnp->gp_seq;
 			irq_work_queue_on(&rdp->rcu_iw, rdp->cpu);
@@ -3906,6 +3904,7 @@ int rcutree_prepare_cpu(unsigned int cpu
 	rdp->cpu_no_qs.b.norm = true;
 	rdp->core_needs_qs = false;
 	rdp->rcu_iw_pending = false;
+	rdp->rcu_iw = IRQ_WORK_INIT_HARD(rcu_iw_handler);
 	rdp->rcu_iw_gp_seq = rdp->gp_seq - 1;
 	trace_rcu_grace_period(rcu_state.name, rdp->gp_seq, TPS("cpuonl"));
 	raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -243,10 +243,8 @@ static void nohz_full_kick_func(struct i
 	/* Empty, the tick restart happens on tick_nohz_irq_exit() */
 }
 
-static DEFINE_PER_CPU(struct irq_work, nohz_full_kick_work) = {
-	.func = nohz_full_kick_func,
-	.flags = ATOMIC_INIT(IRQ_WORK_HARD_IRQ),
-};
+static DEFINE_PER_CPU(struct irq_work, nohz_full_kick_work) =
+	IRQ_WORK_INIT_HARD(nohz_full_kick_func);
 
 /*
  * Kick this CPU if it's full dynticks in order to force it to
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -1057,7 +1057,7 @@ static int bpf_send_signal_common(u32 si
 			return -EINVAL;
 
 		work = this_cpu_ptr(&send_signal_work);
-		if (atomic_read(&work->irq_work.flags) & IRQ_WORK_BUSY)
+		if (irq_work_is_busy(&work->irq_work))
 			return -EBUSY;
 
 		/* Add the current task, which is the target of sending signal,




* [PATCH v2 02/10] smp: Cleanup smp_call_function*()
  2020-08-18 10:51 [PATCH v2 00/10] smp: irq_work / smp_call_function rework Peter Zijlstra
  2020-08-18 10:51 ` [PATCH v2 01/10] irq_work: Cleanup Peter Zijlstra
@ 2020-08-18 10:51 ` Peter Zijlstra
  2020-08-18 10:51 ` [PATCH v2 03/10] irq_work: Optimize irq_work_single() Peter Zijlstra
                   ` (7 subsequent siblings)
  9 siblings, 0 replies; 22+ messages in thread
From: Peter Zijlstra @ 2020-08-18 10:51 UTC (permalink / raw)
  To: mingo, torvalds
  Cc: linux-kernel, will, paulmck, hch, axboe, chris, davem, kuba,
	fweisbec, oleg, vincent.guittot, peterz

Get rid of the __call_single_node union and clean up the API a little
to avoid external code relying on the structure layout as much.
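
For illustration only (not part of the patch; the my_func/my_csd names
are made up), callers now use CSD_INIT()/INIT_CSD() instead of assigning
the csd fields by hand:

  #include <linux/smp.h>
  #include <linux/percpu.h>

  static void my_func(void *info)
  {
  	/* runs on the remote CPU */
  }

  static DEFINE_PER_CPU(call_single_data_t, my_csd) =
  	CSD_INIT(my_func, NULL);

  static void kick(int cpu)
  {
  	call_single_data_t *csd = &per_cpu(my_csd, cpu);

  	/* for dynamically set up csds: INIT_CSD(csd, my_func, info) */
  	smp_call_function_single_async(cpu, csd);
  }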

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 arch/mips/kernel/process.c                      |    5 +--
 arch/mips/kernel/smp.c                          |   25 +++------------
 arch/s390/pci/pci_irq.c                         |    4 --
 arch/x86/kernel/cpuid.c                         |    7 +---
 arch/x86/lib/msr-smp.c                          |    7 +---
 block/blk-mq.c                                  |    4 --
 drivers/cpuidle/coupled.c                       |    3 -
 drivers/net/ethernet/cavium/liquidio/lio_core.c |    9 +----
 include/linux/smp.h                             |   16 +++++----
 kernel/debug/debug_core.c                       |    6 +--
 kernel/sched/core.c                             |   12 +------
 kernel/smp.c                                    |   40 ++++++++++++------------
 net/core/dev.c                                  |    3 -
 13 files changed, 55 insertions(+), 86 deletions(-)

--- a/arch/mips/kernel/process.c
+++ b/arch/mips/kernel/process.c
@@ -687,7 +687,6 @@ unsigned long arch_align_stack(unsigned
 	return sp & ALMASK;
 }
 
-static DEFINE_PER_CPU(call_single_data_t, backtrace_csd);
 static struct cpumask backtrace_csd_busy;
 
 static void handle_backtrace(void *info)
@@ -696,6 +695,9 @@ static void handle_backtrace(void *info)
 	cpumask_clear_cpu(smp_processor_id(), &backtrace_csd_busy);
 }
 
+static DEFINE_PER_CPU(call_single_data_t, backtrace_csd) =
+	CSD_INIT(handle_backtrace, NULL);
+
 static void raise_backtrace(cpumask_t *mask)
 {
 	call_single_data_t *csd;
@@ -715,7 +717,6 @@ static void raise_backtrace(cpumask_t *m
 		}
 
 		csd = &per_cpu(backtrace_csd, cpu);
-		csd->func = handle_backtrace;
 		smp_call_function_single_async(cpu, csd);
 	}
 }
--- a/arch/mips/kernel/smp.c
+++ b/arch/mips/kernel/smp.c
@@ -687,36 +687,23 @@ EXPORT_SYMBOL(flush_tlb_one);
 
 #ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST
 
-static DEFINE_PER_CPU(call_single_data_t, tick_broadcast_csd);
-
-void tick_broadcast(const struct cpumask *mask)
-{
-	call_single_data_t *csd;
-	int cpu;
-
-	for_each_cpu(cpu, mask) {
-		csd = &per_cpu(tick_broadcast_csd, cpu);
-		smp_call_function_single_async(cpu, csd);
-	}
-}
-
 static void tick_broadcast_callee(void *info)
 {
 	tick_receive_broadcast();
 }
 
-static int __init tick_broadcast_init(void)
+static DEFINE_PER_CPU(call_single_data_t, tick_broadcast_csd) =
+	CSD_INIT(tick_broadcast_callee, NULL);
+
+void tick_broadcast(const struct cpumask *mask)
 {
 	call_single_data_t *csd;
 	int cpu;
 
-	for (cpu = 0; cpu < NR_CPUS; cpu++) {
+	for_each_cpu(cpu, mask) {
 		csd = &per_cpu(tick_broadcast_csd, cpu);
-		csd->func = tick_broadcast_callee;
+		smp_call_function_single_async(cpu, csd);
 	}
-
-	return 0;
 }
-early_initcall(tick_broadcast_init);
 
 #endif /* CONFIG_GENERIC_CLOCKEVENTS_BROADCAST */
--- a/arch/s390/pci/pci_irq.c
+++ b/arch/s390/pci/pci_irq.c
@@ -178,9 +178,7 @@ static void zpci_handle_fallback_irq(voi
 		if (atomic_inc_return(&cpu_data->scheduled) > 1)
 			continue;
 
-		cpu_data->csd.func = zpci_handle_remote_irq;
-		cpu_data->csd.info = &cpu_data->scheduled;
-		cpu_data->csd.flags = 0;
+		INIT_CSD(&cpu_data->csd, zpci_handle_remote_irq, &cpu_data->scheduled);
 		smp_call_function_single_async(cpu, &cpu_data->csd);
 	}
 }
--- a/arch/x86/kernel/cpuid.c
+++ b/arch/x86/kernel/cpuid.c
@@ -74,10 +74,9 @@ static ssize_t cpuid_read(struct file *f
 
 	init_completion(&cmd.done);
 	for (; count; count -= 16) {
-		call_single_data_t csd = {
-			.func = cpuid_smp_cpuid,
-			.info = &cmd,
-		};
+		call_single_data_t csd;
+
+		INIT_CSD(&csd, cpuid_smp_cpuid, &cmd);
 
 		cmd.regs.eax = pos;
 		cmd.regs.ecx = pos >> 32;
--- a/arch/x86/lib/msr-smp.c
+++ b/arch/x86/lib/msr-smp.c
@@ -169,12 +169,11 @@ static void __wrmsr_safe_on_cpu(void *in
 int rdmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h)
 {
 	struct msr_info_completion rv;
-	call_single_data_t csd = {
-		.func	= __rdmsr_safe_on_cpu,
-		.info	= &rv,
-	};
+	call_single_data_t csd;
 	int err;
 
+	INIT_CSD(&csd, __rdmsr_safe_on_cpu, &rv);
+
 	memset(&rv, 0, sizeof(rv));
 	init_completion(&rv.done);
 	rv.msr.msr_no = msr_no;
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -672,9 +672,7 @@ bool blk_mq_complete_request_remote(stru
 		return false;
 
 	if (blk_mq_complete_need_ipi(rq)) {
-		rq->csd.func = __blk_mq_complete_request_remote;
-		rq->csd.info = rq;
-		rq->csd.flags = 0;
+		INIT_CSD(&rq->csd, __blk_mq_complete_request_remote, rq);
 		smp_call_function_single_async(rq->mq_ctx->cpu, &rq->csd);
 	} else {
 		if (rq->q->nr_hw_queues > 1)
--- a/drivers/cpuidle/coupled.c
+++ b/drivers/cpuidle/coupled.c
@@ -674,8 +674,7 @@ int cpuidle_coupled_register_device(stru
 	coupled->refcnt++;
 
 	csd = &per_cpu(cpuidle_coupled_poke_cb, dev->cpu);
-	csd->func = cpuidle_coupled_handle_poke;
-	csd->info = (void *)(unsigned long)dev->cpu;
+	INIT_CSD(csd, cpuidle_coupled_handle_poke, (void *)(unsigned long)dev->cpu);
 
 	return 0;
 }
--- a/drivers/net/ethernet/cavium/liquidio/lio_core.c
+++ b/drivers/net/ethernet/cavium/liquidio/lio_core.c
@@ -726,13 +726,8 @@ static void liquidio_napi_drv_callback(v
 	    droq->cpu_id == this_cpu) {
 		napi_schedule_irqoff(&droq->napi);
 	} else {
-		call_single_data_t *csd = &droq->csd;
-
-		csd->func = napi_schedule_wrapper;
-		csd->info = &droq->napi;
-		csd->flags = 0;
-
-		smp_call_function_single_async(droq->cpu_id, csd);
+		INIT_CSD(&droq->csd, napi_schedule_wrapper, &droq->napi);
+		smp_call_function_single_async(droq->cpu_id, &droq->csd);
 	}
 }
 
--- a/include/linux/smp.h
+++ b/include/linux/smp.h
@@ -21,21 +21,23 @@ typedef bool (*smp_cond_func_t)(int cpu,
  * structure shares (partial) layout with struct irq_work
  */
 struct __call_single_data {
-	union {
-		struct __call_single_node node;
-		struct {
-			struct llist_node llist;
-			unsigned int flags;
-		};
-	};
+	struct __call_single_node node;
 	smp_call_func_t func;
 	void *info;
 };
 
+#define CSD_INIT(_func, _info) \
+	(struct __call_single_data){ .func = (_func), .info = (_info), }
+
 /* Use __aligned() to avoid to use 2 cache lines for 1 csd */
 typedef struct __call_single_data call_single_data_t
 	__aligned(sizeof(struct __call_single_data));
 
+#define INIT_CSD(_csd, _func, _info)		\
+do {						\
+	*(_csd) = CSD_INIT((_func), (_info));	\
+} while (0)
+
 /*
  * Enqueue a llist_node on the call_single_queue; be very careful, read
  * flush_smp_call_function_queue() in detail.
--- a/kernel/debug/debug_core.c
+++ b/kernel/debug/debug_core.c
@@ -225,8 +225,6 @@ int __weak kgdb_skipexception(int except
  * Default (weak) implementation for kgdb_roundup_cpus
  */
 
-static DEFINE_PER_CPU(call_single_data_t, kgdb_roundup_csd);
-
 void __weak kgdb_call_nmi_hook(void *ignored)
 {
 	/*
@@ -240,6 +238,9 @@ void __weak kgdb_call_nmi_hook(void *ign
 	kgdb_nmicallback(raw_smp_processor_id(), get_irq_regs());
 }
 
+static DEFINE_PER_CPU(call_single_data_t, kgdb_roundup_csd) =
+	CSD_INIT(kgdb_call_nmi_hook, NULL);
+
 void __weak kgdb_roundup_cpus(void)
 {
 	call_single_data_t *csd;
@@ -266,7 +267,6 @@ void __weak kgdb_roundup_cpus(void)
 			continue;
 		kgdb_info[cpu].rounding_up = true;
 
-		csd->func = kgdb_call_nmi_hook;
 		ret = smp_call_function_single_async(cpu, csd);
 		if (ret)
 			kgdb_info[cpu].rounding_up = false;
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -223,14 +223,6 @@ void update_rq_clock(struct rq *rq)
 	update_rq_clock_task(rq, delta);
 }
 
-static inline void
-rq_csd_init(struct rq *rq, call_single_data_t *csd, smp_call_func_t func)
-{
-	csd->flags = 0;
-	csd->func = func;
-	csd->info = rq;
-}
-
 #ifdef CONFIG_SCHED_HRTICK
 /*
  * Use HR-timers to deliver accurate preemption points.
@@ -331,7 +323,7 @@ void hrtick_start(struct rq *rq, u64 del
 static void hrtick_rq_init(struct rq *rq)
 {
 #ifdef CONFIG_SMP
-	rq_csd_init(rq, &rq->hrtick_csd, __hrtick_start);
+	INIT_CSD(&rq->hrtick_csd, __hrtick_start, rq);
 #endif
 	hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);
 	rq->hrtick_timer.function = hrtick;
@@ -6835,7 +6827,7 @@ void __init sched_init(void)
 		rq->last_blocked_load_update_tick = jiffies;
 		atomic_set(&rq->nohz_flags, 0);
 
-		rq_csd_init(rq, &rq->nohz_csd, nohz_csd_func);
+		INIT_CSD(&rq->nohz_csd, nohz_csd_func, rq);
 #endif
 #endif /* CONFIG_SMP */
 		hrtick_rq_init(rq);
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -24,7 +24,7 @@
 #include "smpboot.h"
 #include "sched/smp.h"
 
-#define CSD_TYPE(_csd)	((_csd)->flags & CSD_FLAG_TYPE_MASK)
+#define CSD_TYPE(_csd)	((_csd)->node.u_flags & CSD_FLAG_TYPE_MASK)
 
 struct call_function_data {
 	call_single_data_t	__percpu *csd;
@@ -105,13 +105,13 @@ void __init call_function_init(void)
  */
 static __always_inline void csd_lock_wait(call_single_data_t *csd)
 {
-	smp_cond_load_acquire(&csd->flags, !(VAL & CSD_FLAG_LOCK));
+	smp_cond_load_acquire(&csd->node.u_flags, !(VAL & CSD_FLAG_LOCK));
 }
 
 static __always_inline void csd_lock(call_single_data_t *csd)
 {
 	csd_lock_wait(csd);
-	csd->flags |= CSD_FLAG_LOCK;
+	csd->node.u_flags |= CSD_FLAG_LOCK;
 
 	/*
 	 * prevent CPU from reordering the above assignment
@@ -123,12 +123,12 @@ static __always_inline void csd_lock(cal
 
 static __always_inline void csd_unlock(call_single_data_t *csd)
 {
-	WARN_ON(!(csd->flags & CSD_FLAG_LOCK));
+	WARN_ON(!(csd->node.u_flags & CSD_FLAG_LOCK));
 
 	/*
 	 * ensure we're all done before releasing data:
 	 */
-	smp_store_release(&csd->flags, 0);
+	smp_store_release(&csd->node.u_flags, 0);
 }
 
 static DEFINE_PER_CPU_SHARED_ALIGNED(call_single_data_t, csd_data);
@@ -178,7 +178,7 @@ static int generic_exec_single(int cpu,
 		return -ENXIO;
 	}
 
-	__smp_call_single_queue(cpu, &csd->llist);
+	__smp_call_single_queue(cpu, &csd->node.llist);
 
 	return 0;
 }
@@ -231,7 +231,7 @@ static void flush_smp_call_function_queu
 		 * We don't have to use the _safe() variant here
 		 * because we are not invoking the IPI handlers yet.
 		 */
-		llist_for_each_entry(csd, entry, llist) {
+		llist_for_each_entry(csd, entry, node.llist) {
 			switch (CSD_TYPE(csd)) {
 			case CSD_TYPE_ASYNC:
 			case CSD_TYPE_SYNC:
@@ -256,22 +256,22 @@ static void flush_smp_call_function_queu
 	 * First; run all SYNC callbacks, people are waiting for us.
 	 */
 	prev = NULL;
-	llist_for_each_entry_safe(csd, csd_next, entry, llist) {
+	llist_for_each_entry_safe(csd, csd_next, entry, node.llist) {
 		/* Do we wait until *after* callback? */
 		if (CSD_TYPE(csd) == CSD_TYPE_SYNC) {
 			smp_call_func_t func = csd->func;
 			void *info = csd->info;
 
 			if (prev) {
-				prev->next = &csd_next->llist;
+				prev->next = &csd_next->node.llist;
 			} else {
-				entry = &csd_next->llist;
+				entry = &csd_next->node.llist;
 			}
 
 			func(info);
 			csd_unlock(csd);
 		} else {
-			prev = &csd->llist;
+			prev = &csd->node.llist;
 		}
 	}
 
@@ -282,14 +282,14 @@ static void flush_smp_call_function_queu
 	 * Second; run all !SYNC callbacks.
 	 */
 	prev = NULL;
-	llist_for_each_entry_safe(csd, csd_next, entry, llist) {
+	llist_for_each_entry_safe(csd, csd_next, entry, node.llist) {
 		int type = CSD_TYPE(csd);
 
 		if (type != CSD_TYPE_TTWU) {
 			if (prev) {
-				prev->next = &csd_next->llist;
+				prev->next = &csd_next->node.llist;
 			} else {
-				entry = &csd_next->llist;
+				entry = &csd_next->node.llist;
 			}
 
 			if (type == CSD_TYPE_ASYNC) {
@@ -303,7 +303,7 @@ static void flush_smp_call_function_queu
 			}
 
 		} else {
-			prev = &csd->llist;
+			prev = &csd->node.llist;
 		}
 	}
 
@@ -339,7 +339,7 @@ int smp_call_function_single(int cpu, sm
 {
 	call_single_data_t *csd;
 	call_single_data_t csd_stack = {
-		.flags = CSD_FLAG_LOCK | CSD_TYPE_SYNC,
+		.node = { .u_flags = CSD_FLAG_LOCK | CSD_TYPE_SYNC, },
 	};
 	int this_cpu;
 	int err;
@@ -414,12 +414,12 @@ int smp_call_function_single_async(int c
 
 	preempt_disable();
 
-	if (csd->flags & CSD_FLAG_LOCK) {
+	if (csd->node.u_flags & CSD_FLAG_LOCK) {
 		err = -EBUSY;
 		goto out;
 	}
 
-	csd->flags = CSD_FLAG_LOCK;
+	csd->node.u_flags = CSD_FLAG_LOCK;
 	smp_wmb();
 
 	err = generic_exec_single(cpu, csd);
@@ -537,10 +537,10 @@ static void smp_call_function_many_cond(
 
 		csd_lock(csd);
 		if (wait)
-			csd->flags |= CSD_TYPE_SYNC;
+			csd->node.u_flags |= CSD_TYPE_SYNC;
 		csd->func = func;
 		csd->info = info;
-		if (llist_add(&csd->llist, &per_cpu(call_single_queue, cpu)))
+		if (llist_add(&csd->node.llist, &per_cpu(call_single_queue, cpu)))
 			__cpumask_set_cpu(cpu, cfd->cpumask_ipi);
 	}
 
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -10661,8 +10661,7 @@ static int __init net_dev_init(void)
 		INIT_LIST_HEAD(&sd->poll_list);
 		sd->output_queue_tailp = &sd->output_queue;
 #ifdef CONFIG_RPS
-		sd->csd.func = rps_trigger_softirq;
-		sd->csd.info = sd;
+		INIT_CSD(&sd->csd, rps_trigger_softirq, sd);
 		sd->cpu = i;
 #endif
 




* [PATCH v2 03/10] irq_work: Optimize irq_work_single()
  2020-08-18 10:51 [PATCH v2 00/10] smp: irq_work / smp_call_function rework Peter Zijlstra
  2020-08-18 10:51 ` [PATCH v2 01/10] irq_work: Cleanup Peter Zijlstra
  2020-08-18 10:51 ` [PATCH v2 02/10] smp: Cleanup smp_call_function*() Peter Zijlstra
@ 2020-08-18 10:51 ` Peter Zijlstra
  2020-08-18 10:51 ` [PATCH v2 04/10] irq_work: Unconditionally build on SMP Peter Zijlstra
                   ` (6 subsequent siblings)
  9 siblings, 0 replies; 22+ messages in thread
From: Peter Zijlstra @ 2020-08-18 10:51 UTC (permalink / raw)
  To: mingo, torvalds
  Cc: linux-kernel, will, paulmck, hch, axboe, chris, davem, kuba,
	fweisbec, oleg, vincent.guittot, peterz

Trade one atomic op (the atomic_fetch_andnot() in irq_work_single()) for a
full memory barrier.
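
In short, an illustrative restatement of the irq_work_single() hunk
below (no new code):

  /* before: one atomic RMW, which also provided the ordering */
  flags = atomic_fetch_andnot(IRQ_WORK_PENDING, &work->node.a_flags);

  /* after: plain read/modify/write -- we own the PENDING bit -- plus an
   * explicit full barrier that pairs with irq_work_claim() */
  flags = atomic_read(&work->node.a_flags);
  flags &= ~IRQ_WORK_PENDING;
  atomic_set(&work->node.a_flags, flags);
  smp_mb();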

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 include/linux/irqflags.h |    8 ++++----
 kernel/irq_work.c        |   29 +++++++++++++++++------------
 2 files changed, 21 insertions(+), 16 deletions(-)

--- a/include/linux/irqflags.h
+++ b/include/linux/irqflags.h
@@ -88,14 +88,14 @@ do {						\
 		  current->irq_config = 0;			\
 	  } while (0)
 
-# define lockdep_irq_work_enter(__work)					\
+# define lockdep_irq_work_enter(_flags)					\
 	  do {								\
-		  if (!(atomic_read(&__work->node.a_flags) & IRQ_WORK_HARD_IRQ))\
+		  if (!((_flags) & IRQ_WORK_HARD_IRQ))			\
 			current->irq_config = 1;			\
 	  } while (0)
-# define lockdep_irq_work_exit(__work)					\
+# define lockdep_irq_work_exit(_flags)					\
 	  do {								\
-		  if (!(atomic_read(&__work->node.a_flags) & IRQ_WORK_HARD_IRQ))\
+		  if (!((_flags) & IRQ_WORK_HARD_IRQ))			\
 			current->irq_config = 0;			\
 	  } while (0)
 
--- a/kernel/irq_work.c
+++ b/kernel/irq_work.c
@@ -34,7 +34,7 @@ static bool irq_work_claim(struct irq_wo
 	oflags = atomic_fetch_or(IRQ_WORK_CLAIMED | CSD_TYPE_IRQ_WORK, &work->node.a_flags);
 	/*
 	 * If the work is already pending, no need to raise the IPI.
-	 * The pairing atomic_fetch_andnot() in irq_work_run() makes sure
+	 * The pairing smp_mb() in irq_work_single() makes sure
 	 * everything we did before is visible.
 	 */
 	if (oflags & IRQ_WORK_PENDING)
@@ -136,22 +136,27 @@ void irq_work_single(void *arg)
 	int flags;
 
 	/*
-	 * Clear the PENDING bit, after this point the @work
-	 * can be re-used.
-	 * Make it immediately visible so that other CPUs trying
-	 * to claim that work don't rely on us to handle their data
-	 * while we are in the middle of the func.
+	 * Clear the PENDING bit, after this point the @work can be re-used.
+	 * The PENDING bit acts as a lock, and we own it, so we can clear it
+	 * without atomic ops.
 	 */
-	flags = atomic_fetch_andnot(IRQ_WORK_PENDING, &work->node.a_flags);
+	flags = atomic_read(&work->node.a_flags);
+	flags &= ~IRQ_WORK_PENDING;
+	atomic_set(&work->node.a_flags, flags);
+
+	/*
+	 * See irq_work_claim().
+	 */
+	smp_mb();
 
-	lockdep_irq_work_enter(work);
+	lockdep_irq_work_enter(flags);
 	work->func(work);
-	lockdep_irq_work_exit(work);
+	lockdep_irq_work_exit(flags);
+
 	/*
-	 * Clear the BUSY bit and return to the free state if
-	 * no-one else claimed it meanwhile.
+	 * Clear the BUSY bit, if set, and return to the free state if no-one
+	 * else claimed it meanwhile.
 	 */
-	flags &= ~IRQ_WORK_PENDING;
 	(void)atomic_cmpxchg(&work->node.a_flags, flags, flags & ~IRQ_WORK_BUSY);
 }
 




* [PATCH v2 04/10] irq_work: Unconditionally build on SMP
  2020-08-18 10:51 [PATCH v2 00/10] smp: irq_work / smp_call_function rework Peter Zijlstra
                   ` (2 preceding siblings ...)
  2020-08-18 10:51 ` [PATCH v2 03/10] irq_work: Optimize irq_work_single() Peter Zijlstra
@ 2020-08-18 10:51 ` Peter Zijlstra
  2020-08-18 10:51 ` [PATCH v2 05/10] irq_work: Provide irq_work_queue_remote() Peter Zijlstra
                   ` (5 subsequent siblings)
  9 siblings, 0 replies; 22+ messages in thread
From: Peter Zijlstra @ 2020-08-18 10:51 UTC (permalink / raw)
  To: mingo, torvalds
  Cc: linux-kernel, will, paulmck, hch, axboe, chris, davem, kuba,
	fweisbec, oleg, vincent.guittot, peterz

Build irq_work.c for all SMP kernels, not only when CONFIG_IRQ_WORK is set,
and wrap the existing code in #ifdef CONFIG_IRQ_WORK, so that SMP-only bits
can be added outside of it by later patches.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 kernel/Makefile   |    1 +
 kernel/irq_work.c |    3 +++
 2 files changed, 4 insertions(+)

--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -105,6 +105,7 @@ obj-$(CONFIG_TRACE_CLOCK) += trace/
 obj-$(CONFIG_RING_BUFFER) += trace/
 obj-$(CONFIG_TRACEPOINTS) += trace/
 obj-$(CONFIG_IRQ_WORK) += irq_work.o
+obj-$(CONFIG_SMP) += irq_work.o
 obj-$(CONFIG_CPU_PM) += cpu_pm.o
 obj-$(CONFIG_BPF) += bpf/
 obj-$(CONFIG_KCSAN) += kcsan/
--- a/kernel/irq_work.c
+++ b/kernel/irq_work.c
@@ -20,6 +20,7 @@
 #include <linux/smp.h>
 #include <asm/processor.h>
 
+#ifdef CONFIG_IRQ_WORK
 
 static DEFINE_PER_CPU(struct llist_head, raised_list);
 static DEFINE_PER_CPU(struct llist_head, lazy_list);
@@ -212,3 +213,5 @@ void irq_work_sync(struct irq_work *work
 		cpu_relax();
 }
 EXPORT_SYMBOL_GPL(irq_work_sync);
+
+#endif /* CONFIG_IRQ_WORK */




* [PATCH v2 05/10] irq_work: Provide irq_work_queue_remote()
  2020-08-18 10:51 [PATCH v2 00/10] smp: irq_work / smp_call_function rework Peter Zijlstra
                   ` (3 preceding siblings ...)
  2020-08-18 10:51 ` [PATCH v2 04/10] irq_work: Unconditionally build on SMP Peter Zijlstra
@ 2020-08-18 10:51 ` Peter Zijlstra
  2020-08-18 10:51 ` [RFC][PATCH v2 06/10] irq_work: Provide irq_work_queue_remote_static() Peter Zijlstra
                   ` (4 subsequent siblings)
  9 siblings, 0 replies; 22+ messages in thread
From: Peter Zijlstra @ 2020-08-18 10:51 UTC (permalink / raw)
  To: mingo, torvalds
  Cc: linux-kernel, will, paulmck, hch, axboe, chris, davem, kuba,
	fweisbec, oleg, vincent.guittot, peterz

While traditional irq_work relies on the ability to self-IPI, it makes
sense to provide an unconditional irq_work_queue_remote() interface, one
that is available on any SMP build.

This can be used to replace the plagued smp_call_function_single_async().
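
A minimal usage sketch (illustrative; the my_remote_func/my_work/poke
names are made up). The caller must have preemption disabled and must
not target the local CPU:

  #include <linux/irq_work.h>
  #include <linux/smp.h>

  static void my_remote_func(struct irq_work *work)
  {
  	/* runs in IRQ context on the target CPU */
  }

  static struct irq_work my_work = IRQ_WORK_INIT_HARD(my_remote_func);

  static void poke(int cpu)
  {
  	preempt_disable();
  	if (cpu != smp_processor_id())
  		irq_work_queue_remote(cpu, &my_work); /* -EBUSY if still pending */
  	preempt_enable();
  }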

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 include/linux/irq_work.h |   17 ++++--
 kernel/irq_work.c        |  129 ++++++++++++++++++++++++++++-------------------
 kernel/rcu/tree.c        |    6 +-
 3 files changed, 95 insertions(+), 57 deletions(-)

--- a/include/linux/irq_work.h
+++ b/include/linux/irq_work.h
@@ -46,22 +46,29 @@ static inline bool irq_work_is_busy(stru
 	return atomic_read(&work->node.a_flags) & IRQ_WORK_BUSY;
 }
 
+#ifdef CONFIG_IRQ_WORK
+
 bool irq_work_queue(struct irq_work *work);
 bool irq_work_queue_on(struct irq_work *work, int cpu);
 
 void irq_work_tick(void);
 void irq_work_sync(struct irq_work *work);
 
-#ifdef CONFIG_IRQ_WORK
 #include <asm/irq_work.h>
 
 void irq_work_run(void);
 bool irq_work_needs_cpu(void);
-void irq_work_single(void *arg);
-#else
-static inline bool irq_work_needs_cpu(void) { return false; }
+
+#else /* !CONFIG_IRQ_WORK */
+
 static inline void irq_work_run(void) { }
-static inline void irq_work_single(void *arg) { }
+static inline bool irq_work_needs_cpu(void) { return false; }
+
+#endif /* CONFIG_IRQ_WORK */
+
+#ifdef CONFIG_SMP
+extern int irq_work_queue_remote(int cpu, struct irq_work *work);
+extern void irq_work_single(void *arg);
 #endif
 
 #endif /* _LINUX_IRQ_WORK_H */
--- a/kernel/irq_work.c
+++ b/kernel/irq_work.c
@@ -20,10 +20,7 @@
 #include <linux/smp.h>
 #include <asm/processor.h>
 
-#ifdef CONFIG_IRQ_WORK
-
-static DEFINE_PER_CPU(struct llist_head, raised_list);
-static DEFINE_PER_CPU(struct llist_head, lazy_list);
+#if defined(CONFIG_IRQ_WORK) || defined(CONFIG_SMP)
 
 /*
  * Claim the entry so that no one else will poke at it.
@@ -43,6 +40,82 @@ static bool irq_work_claim(struct irq_wo
 	return true;
 }
 
+void irq_work_single(void *arg)
+{
+	struct irq_work *work = arg;
+	int flags;
+
+	/*
+	 * Clear the PENDING bit, after this point the @work can be re-used.
+	 * The PENDING bit acts as a lock, and we own it, so we can clear it
+	 * without atomic ops.
+	 */
+	flags = atomic_read(&work->node.a_flags);
+	flags &= ~IRQ_WORK_PENDING;
+	atomic_set(&work->node.a_flags, flags);
+
+	/*
+	 * See irq_work_claim().
+	 */
+	smp_mb();
+
+	lockdep_irq_work_enter(flags);
+	work->func(work);
+	lockdep_irq_work_exit(flags);
+
+	/*
+	 * Clear the BUSY bit, if set, and return to the free state if no-one
+	 * else claimed it meanwhile.
+	 */
+	(void)atomic_cmpxchg(&work->node.a_flags, flags, flags & ~IRQ_WORK_BUSY);
+}
+
+/*
+ * Synchronize against the irq_work @entry, ensures the entry is not
+ * currently in use.
+ */
+void irq_work_sync(struct irq_work *work)
+{
+	lockdep_assert_irqs_enabled();
+
+	while (irq_work_is_busy(work))
+		cpu_relax();
+}
+EXPORT_SYMBOL_GPL(irq_work_sync);
+
+#endif /* CONFIG_IRQ_WORK || CONFIG_SMP */
+
+#ifdef CONFIG_SMP
+
+static void __irq_work_queue_remote(int cpu, struct irq_work *work)
+{
+	/* Arch remote IPI send/receive backend aren't NMI safe */
+	WARN_ON_ONCE(in_nmi());
+	__smp_call_single_queue(cpu, &work->node.llist);
+}
+
+int irq_work_queue_remote(int cpu, struct irq_work *work)
+{
+	/*
+	 * Ensures preemption is disabled in the caller.
+	 */
+	WARN_ON_ONCE(cpu == smp_processor_id());
+
+	if (!irq_work_claim(work))
+		return -EBUSY;
+
+	__irq_work_queue_remote(cpu, work);
+
+	return 0;
+}
+
+#endif /* CONFIG_SMP */
+
+#ifdef CONFIG_IRQ_WORK
+
+static DEFINE_PER_CPU(struct llist_head, raised_list);
+static DEFINE_PER_CPU(struct llist_head, lazy_list);
+
 void __weak arch_irq_work_raise(void)
 {
 	/*
@@ -101,9 +174,7 @@ bool irq_work_queue_on(struct irq_work *
 
 	preempt_disable();
 	if (cpu != smp_processor_id()) {
-		/* Arch remote IPI send/receive backend aren't NMI safe */
-		WARN_ON_ONCE(in_nmi());
-		__smp_call_single_queue(cpu, &work->node.llist);
+		__irq_work_queue_remote(cpu, work);
 	} else {
 		__irq_work_queue_local(work);
 	}
@@ -131,36 +202,6 @@ bool irq_work_needs_cpu(void)
 	return true;
 }
 
-void irq_work_single(void *arg)
-{
-	struct irq_work *work = arg;
-	int flags;
-
-	/*
-	 * Clear the PENDING bit, after this point the @work can be re-used.
-	 * The PENDING bit acts as a lock, and we own it, so we can clear it
-	 * without atomic ops.
-	 */
-	flags = atomic_read(&work->node.a_flags);
-	flags &= ~IRQ_WORK_PENDING;
-	atomic_set(&work->node.a_flags, flags);
-
-	/*
-	 * See irq_work_claim().
-	 */
-	smp_mb();
-
-	lockdep_irq_work_enter(flags);
-	work->func(work);
-	lockdep_irq_work_exit(flags);
-
-	/*
-	 * Clear the BUSY bit, if set, and return to the free state if no-one
-	 * else claimed it meanwhile.
-	 */
-	(void)atomic_cmpxchg(&work->node.a_flags, flags, flags & ~IRQ_WORK_BUSY);
-}
-
 static void irq_work_run_list(struct llist_head *list)
 {
 	struct irq_work *work, *tmp;
@@ -196,17 +237,5 @@ void irq_work_tick(void)
 	irq_work_run_list(this_cpu_ptr(&lazy_list));
 }
 
-/*
- * Synchronize against the irq_work @entry, ensures the entry is not
- * currently in use.
- */
-void irq_work_sync(struct irq_work *work)
-{
-	lockdep_assert_irqs_enabled();
-
-	while (irq_work_is_busy(work))
-		cpu_relax();
-}
-EXPORT_SYMBOL_GPL(irq_work_sync);
-
 #endif /* CONFIG_IRQ_WORK */
+
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -1284,13 +1284,15 @@ static int rcu_implicit_dynticks_qs(stru
 			resched_cpu(rdp->cpu);
 			WRITE_ONCE(rdp->last_fqs_resched, jiffies);
 		}
-		if (IS_ENABLED(CONFIG_IRQ_WORK) &&
-		    !rdp->rcu_iw_pending && rdp->rcu_iw_gp_seq != rnp->gp_seq &&
+#ifdef CONFIG_IRQ_WORK
+		// XXX should we use irq_work_queue_remote() ?
+		if (!rdp->rcu_iw_pending && rdp->rcu_iw_gp_seq != rnp->gp_seq &&
 		    (rnp->ffmask & rdp->grpmask)) {
 			rdp->rcu_iw_pending = true;
 			rdp->rcu_iw_gp_seq = rnp->gp_seq;
 			irq_work_queue_on(&rdp->rcu_iw, rdp->cpu);
 		}
+#endif
 	}
 
 	return 0;




* [RFC][PATCH v2 06/10] irq_work: Provide irq_work_queue_remote_static()
  2020-08-18 10:51 [PATCH v2 00/10] smp: irq_work / smp_call_function rework Peter Zijlstra
                   ` (4 preceding siblings ...)
  2020-08-18 10:51 ` [PATCH v2 05/10] irq_work: Provide irq_work_queue_remote() Peter Zijlstra
@ 2020-08-18 10:51 ` Peter Zijlstra
  2020-08-18 10:51 ` [RFC][PATCH v2 07/10] sched/fair: Exclude the current CPU from find_new_ilb() Peter Zijlstra
                   ` (3 subsequent siblings)
  9 siblings, 0 replies; 22+ messages in thread
From: Peter Zijlstra @ 2020-08-18 10:51 UTC (permalink / raw)
  To: mingo, torvalds
  Cc: linux-kernel, will, paulmck, hch, axboe, chris, davem, kuba,
	fweisbec, oleg, vincent.guittot, peterz

Provide the same horrible semantics as smp_call_function_single_async();
doing so allows skipping a bunch of atomic ops.

API-wise this is horrible crap, as it relies on external serialization.
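
A sketch of the intended usage (illustrative; the my_func/my_work/poke
names are made up). The caller must guarantee, by a lock or some other
external means, that enqueues of the same irq_work never race, and that
the target CPU is not the local CPU:

  #include <linux/irq_work.h>

  static void my_func(struct irq_work *work)
  {
  	/* runs in IRQ context on the target CPU */
  }

  static struct irq_work my_work = IRQ_WORK_INIT_HARD(my_func);

  /* only ever called under a lock that serializes all poke() callers */
  static void poke(int cpu)
  {
  	if (irq_work_queue_remote_static(cpu, &my_work) == -EBUSY)
  		return;	/* still queued from a previous poke() */
  }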

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 include/linux/irq_work.h |    3 ++-
 kernel/irq_work.c        |   21 ++++++++++++++++++++-
 2 files changed, 22 insertions(+), 2 deletions(-)

--- a/include/linux/irq_work.h
+++ b/include/linux/irq_work.h
@@ -19,7 +19,7 @@ struct irq_work {
 };
 
 #define __IRQ_WORK_INIT(_func, _flags) (struct irq_work){	\
-	.node = { .u_flags = (_flags), },			\
+	.node = { .u_flags = CSD_TYPE_IRQ_WORK | (_flags), },	\
 	.func = (_func),					\
 }
 
@@ -68,6 +68,7 @@ static inline bool irq_work_needs_cpu(vo
 
 #ifdef CONFIG_SMP
 extern int irq_work_queue_remote(int cpu, struct irq_work *work);
+extern int irq_work_queue_remote_static(int cpu, struct irq_work *work);
 extern void irq_work_single(void *arg);
 #endif
 
--- a/kernel/irq_work.c
+++ b/kernel/irq_work.c
@@ -29,7 +29,7 @@ static bool irq_work_claim(struct irq_wo
 {
 	int oflags;
 
-	oflags = atomic_fetch_or(IRQ_WORK_CLAIMED | CSD_TYPE_IRQ_WORK, &work->node.a_flags);
+	oflags = atomic_fetch_or(IRQ_WORK_CLAIMED, &work->node.a_flags);
 	/*
 	 * If the work is already pending, no need to raise the IPI.
 	 * The pairing smp_mb() in irq_work_single() makes sure
@@ -63,6 +63,9 @@ void irq_work_single(void *arg)
 	work->func(work);
 	lockdep_irq_work_exit(flags);
 
+	if (!(flags & IRQ_WORK_BUSY))
+		return;
+
 	/*
 	 * Clear the BUSY bit, if set, and return to the free state if no-one
 	 * else claimed it meanwhile.
@@ -108,6 +111,22 @@ int irq_work_queue_remote(int cpu, struc
 
 	return 0;
 }
+
+int irq_work_queue_remote_static(int cpu, struct irq_work *work)
+{
+	/*
+	 * Ensures preemption is disabled in the caller.
+	 */
+	WARN_ON_ONCE(cpu == smp_processor_id());
+
+	if (work->node.u_flags & IRQ_WORK_PENDING)
+		return -EBUSY;
+
+	work->node.u_flags |= IRQ_WORK_PENDING;
+	__smp_call_single_queue(cpu, &work->node.llist);
+
+	return 0;
+}
 
 #endif /* CONFIG_SMP */
 




* [RFC][PATCH v2 07/10] sched/fair: Exclude the current CPU from find_new_ilb()
  2020-08-18 10:51 [PATCH v2 00/10] smp: irq_work / smp_call_function rework Peter Zijlstra
                   ` (5 preceding siblings ...)
  2020-08-18 10:51 ` [RFC][PATCH v2 06/10] irq_work: Provide irq_work_queue_remote_static() Peter Zijlstra
@ 2020-08-18 10:51 ` Peter Zijlstra
  2020-08-18 10:51 ` [RFC][PATCH v2 08/10] smp,irq_work: Use the new irq_work API Peter Zijlstra
                   ` (2 subsequent siblings)
  9 siblings, 0 replies; 22+ messages in thread
From: Peter Zijlstra @ 2020-08-18 10:51 UTC (permalink / raw)
  To: mingo, torvalds
  Cc: linux-kernel, will, paulmck, hch, axboe, chris, davem, kuba,
	fweisbec, oleg, vincent.guittot, peterz

It is possible for find_new_ilb() to select the current CPU; however, this
only happens from newidle balancing, in which case need_resched() will be
true and, consequently, nohz_csd_func() will not trigger the softirq.

Exclude the current CPU from becoming an ILB target.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 kernel/sched/fair.c |    4 ++++
 1 file changed, 4 insertions(+)

--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -10026,6 +10026,10 @@ static inline int find_new_ilb(void)
 
 	for_each_cpu_and(ilb, nohz.idle_cpus_mask,
 			      housekeeping_cpumask(HK_FLAG_MISC)) {
+
+		if (ilb == smp_processor_id())
+			continue;
+
 		if (idle_cpu(ilb))
 			return ilb;
 	}




* [RFC][PATCH v2 08/10] smp,irq_work: Use the new irq_work API
  2020-08-18 10:51 [PATCH v2 00/10] smp: irq_work / smp_call_function rework Peter Zijlstra
                   ` (6 preceding siblings ...)
  2020-08-18 10:51 ` [RFC][PATCH v2 07/10] sched/fair: Exclude the current CPU from find_new_ilb() Peter Zijlstra
@ 2020-08-18 10:51 ` Peter Zijlstra
  2020-08-18 16:25   ` Christoph Hellwig
  2020-08-20  6:19   ` Christoph Hellwig
  2020-08-18 10:51 ` [RFC][PATCH v2 09/10] smp: Make smp_call_function_single_async() safer Peter Zijlstra
  2020-08-18 10:51 ` [RFC][PATCH v2 10/10] irq_work: Add a few comments Peter Zijlstra
  9 siblings, 2 replies; 22+ messages in thread
From: Peter Zijlstra @ 2020-08-18 10:51 UTC (permalink / raw)
  To: mingo, torvalds
  Cc: linux-kernel, will, paulmck, hch, axboe, chris, davem, kuba,
	fweisbec, oleg, vincent.guittot, peterz

Convert the performance-sensitive users of
smp_call_function_single_async() over to the new
irq_work_queue_remote_static().

The new API is marginally less crap, but taking these users away allows
fixing up smp_call_function_single_async() without risk of performance
regressions.
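
The conversion follows the same pattern in each subsystem; a sketch with
made-up names (my_obj, my_obj_remote, my_obj_kick), the real hunks are
below:

  #include <linux/irq_work.h>

  struct my_obj {
  	struct irq_work work;	/* replaces the old call_single_data_t csd */
  	/* ... payload ... */
  };

  static void my_obj_remote(struct irq_work *work)
  {
  	struct my_obj *obj = container_of(work, struct my_obj, work);

  	/* complete @obj on this CPU */
  }

  static void my_obj_kick(struct my_obj *obj, int cpu)
  {
  	obj->work = IRQ_WORK_INIT_HARD(my_obj_remote);
  	irq_work_queue_remote_static(cpu, &obj->work);
  }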

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 block/blk-mq.c            |    8 ++++----
 include/linux/blkdev.h    |    4 ++--
 include/linux/netdevice.h |    3 ++-
 kernel/sched/core.c       |   16 ++++++++--------
 kernel/sched/fair.c       |    6 +++---
 kernel/sched/sched.h      |    4 ++--
 net/core/dev.c            |   15 ++++++++++-----
 7 files changed, 31 insertions(+), 25 deletions(-)

--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -622,9 +622,9 @@ static int blk_softirq_cpu_dead(unsigned
 }
 
 
-static void __blk_mq_complete_request_remote(void *data)
+static void __blk_mq_complete_request_remote(struct irq_work *work)
 {
-	struct request *rq = data;
+	struct request *rq = container_of(work, struct request, work);
 
 	/*
 	 * For most of single queue controllers, there is only one irq vector
@@ -671,8 +671,8 @@ bool blk_mq_complete_request_remote(stru
 		return false;
 
 	if (blk_mq_complete_need_ipi(rq)) {
-		INIT_CSD(&rq->csd, __blk_mq_complete_request_remote, rq);
-		smp_call_function_single_async(rq->mq_ctx->cpu, &rq->csd);
+		rq->work = IRQ_WORK_INIT_HARD(__blk_mq_complete_request_remote);
+		irq_work_queue_remote_static(rq->mq_ctx->cpu, &rq->work);
 	} else {
 		if (rq->q->nr_hw_queues > 1)
 			return false;
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -19,7 +19,7 @@
 #include <linux/stringify.h>
 #include <linux/gfp.h>
 #include <linux/bsg.h>
-#include <linux/smp.h>
+#include <linux/irq_work.h>
 #include <linux/rcupdate.h>
 #include <linux/percpu-refcount.h>
 #include <linux/scatterlist.h>
@@ -234,7 +234,7 @@ struct request {
 	unsigned long deadline;
 
 	union {
-		struct __call_single_data csd;
+		struct irq_work work;
 		u64 fifo_time;
 	};
 
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -26,6 +26,7 @@
 #include <linux/delay.h>
 #include <linux/atomic.h>
 #include <linux/prefetch.h>
+#include <linux/irq_work.h>
 #include <asm/cache.h>
 #include <asm/byteorder.h>
 
@@ -3144,7 +3145,7 @@ struct softnet_data {
 	unsigned int		input_queue_head ____cacheline_aligned_in_smp;
 
 	/* Elements below can be accessed between CPUs for RPS/RFS */
-	call_single_data_t	csd ____cacheline_aligned_in_smp;
+	struct irq_work		work ____cacheline_aligned_in_smp;
 	struct softnet_data	*rps_ipi_next;
 	unsigned int		cpu;
 	unsigned int		input_queue_tail;
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -361,9 +361,9 @@ static void __hrtick_restart(struct rq *
 /*
  * called from hardirq (IPI) context
  */
-static void __hrtick_start(void *arg)
+static void __hrtick_start(struct irq_work *work)
 {
-	struct rq *rq = arg;
+	struct rq *rq = container_of(work, struct rq, hrtick_work);
 	struct rq_flags rf;
 
 	rq_lock(rq, &rf);
@@ -394,7 +394,7 @@ void hrtick_start(struct rq *rq, u64 del
 	if (rq == this_rq())
 		__hrtick_restart(rq);
 	else
-		smp_call_function_single_async(cpu_of(rq), &rq->hrtick_csd);
+		irq_work_queue_remote_static(cpu_of(rq), &rq->hrtick_work);
 }
 
 #else
@@ -419,7 +419,7 @@ void hrtick_start(struct rq *rq, u64 del
 static void hrtick_rq_init(struct rq *rq)
 {
 #ifdef CONFIG_SMP
-	INIT_CSD(&rq->hrtick_csd, __hrtick_start, rq);
+	rq->hrtick_work = IRQ_WORK_INIT_HARD(__hrtick_start);
 #endif
 	hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);
 	rq->hrtick_timer.function = hrtick;
@@ -729,14 +729,14 @@ void wake_up_nohz_cpu(int cpu)
 		wake_up_idle_cpu(cpu);
 }
 
-static void nohz_csd_func(void *info)
+static void nohz_work_func(struct irq_work *work)
 {
-	struct rq *rq = info;
+	struct rq *rq = container_of(work, struct rq, nohz_work);
 	int cpu = cpu_of(rq);
 	unsigned int flags;
 
 	/*
-	 * Release the rq::nohz_csd.
+	 * Release rq::nohz_work.
 	 */
 	flags = atomic_fetch_andnot(NOHZ_KICK_MASK, nohz_flags(cpu));
 	WARN_ON(!(flags & NOHZ_KICK_MASK));
@@ -7181,7 +7181,7 @@ void __init sched_init(void)
 		rq->last_blocked_load_update_tick = jiffies;
 		atomic_set(&rq->nohz_flags, 0);
 
-		INIT_CSD(&rq->nohz_csd, nohz_csd_func, rq);
+		rq->nohz_work = IRQ_WORK_INIT_HARD(nohz_work_func);
 #endif
 #endif /* CONFIG_SMP */
 		hrtick_rq_init(rq);
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -10058,8 +10058,8 @@ static void kick_ilb(unsigned int flags)
 		return;
 
 	/*
-	 * Access to rq::nohz_csd is serialized by NOHZ_KICK_MASK; he who sets
-	 * the first flag owns it; cleared by nohz_csd_func().
+	 * Access to rq::nohz_work is serialized by NOHZ_KICK_MASK; he who sets
+	 * the first flag owns it; cleared by nohz_work_func().
 	 */
 	flags = atomic_fetch_or(flags, nohz_flags(ilb_cpu));
 	if (flags & NOHZ_KICK_MASK)
@@ -10070,7 +10070,7 @@ static void kick_ilb(unsigned int flags)
 	 * is idle. And the softirq performing nohz idle load balance
 	 * will be run before returning from the IPI.
 	 */
-	smp_call_function_single_async(ilb_cpu, &cpu_rq(ilb_cpu)->nohz_csd);
+	irq_work_queue_remote_static(ilb_cpu, &cpu_rq(ilb_cpu)->nohz_work);
 }
 
 /*
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -910,7 +910,7 @@ struct rq {
 #ifdef CONFIG_SMP
 	unsigned long		last_blocked_load_update_tick;
 	unsigned int		has_blocked_load;
-	call_single_data_t	nohz_csd;
+	struct irq_work		nohz_work;
 #endif /* CONFIG_SMP */
 	unsigned int		nohz_tick_stopped;
 	atomic_t		nohz_flags;
@@ -1021,7 +1021,7 @@ struct rq {
 
 #ifdef CONFIG_SCHED_HRTICK
 #ifdef CONFIG_SMP
-	call_single_data_t	hrtick_csd;
+	struct irq_work		hrtick_work;
 #endif
 	struct hrtimer		hrtick_timer;
 #endif
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4443,9 +4443,9 @@ EXPORT_SYMBOL(rps_may_expire_flow);
 #endif /* CONFIG_RFS_ACCEL */
 
 /* Called from hardirq (IPI) context */
-static void rps_trigger_softirq(void *data)
+static void rps_trigger_softirq(struct irq_work *work)
 {
-	struct softnet_data *sd = data;
+	struct softnet_data *sd = container_of(work, struct softnet_data, work);
 
 	____napi_schedule(sd, &sd->backlog);
 	sd->received_rps++;
@@ -6182,8 +6182,13 @@ static void net_rps_send_ipi(struct soft
 	while (remsd) {
 		struct softnet_data *next = remsd->rps_ipi_next;
 
-		if (cpu_online(remsd->cpu))
-			smp_call_function_single_async(remsd->cpu, &remsd->csd);
+		if (cpu_online(remsd->cpu)) {
+			/*
+			 * XXX can there be two CPUs calling into the same remsd?
+			 * XXX serialized by NAPI_STATE_SCHED ??
+			 */
+			irq_work_queue_remote_static(remsd->cpu, &remsd->work);
+		}
 		remsd = next;
 	}
 #endif
@@ -10965,7 +10970,7 @@ static int __init net_dev_init(void)
 		INIT_LIST_HEAD(&sd->poll_list);
 		sd->output_queue_tailp = &sd->output_queue;
 #ifdef CONFIG_RPS
-		INIT_CSD(&sd->csd, rps_trigger_softirq, sd);
+		init_irq_work(&sd->work, rps_trigger_softirq);
 		sd->cpu = i;
 #endif
 




* [RFC][PATCH v2 09/10] smp: Make smp_call_function_single_async() safer
  2020-08-18 10:51 [PATCH v2 00/10] smp: irq_work / smp_call_function rework Peter Zijlstra
                   ` (7 preceding siblings ...)
  2020-08-18 10:51 ` [RFC][PATCH v2 08/10] smp,irq_work: Use the new irq_work API Peter Zijlstra
@ 2020-08-18 10:51 ` Peter Zijlstra
  2020-08-18 10:51 ` [RFC][PATCH v2 10/10] irq_work: Add a few comments Peter Zijlstra
  9 siblings, 0 replies; 22+ messages in thread
From: Peter Zijlstra @ 2020-08-18 10:51 UTC (permalink / raw)
  To: mingo, torvalds
  Cc: linux-kernel, will, paulmck, hch, axboe, chris, davem, kuba,
	fweisbec, oleg, vincent.guittot, peterz

Make smp_call_function_single_async() a safer and more convenient
interface by using an atomic op (cmpxchg) to set CSD_FLAG_LOCK. This
allows safe concurrent use of the function, as the -EBUSY return value
would suggest.
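
Illustration of the now-supported concurrent use (the my_func/my_csd/kick
names are made up); racing callers on the same csd are resolved by the
cmpxchg() and the losers get -EBUSY:

  #include <linux/smp.h>

  static void my_func(void *info)
  {
  	/*
  	 * Runs on the remote CPU; guaranteed to observe any state set up
  	 * before a concurrent enqueue attempt that returned -EBUSY.
  	 */
  }

  static call_single_data_t my_csd = CSD_INIT(my_func, NULL);

  /* may now be called concurrently from several CPUs */
  static void kick(int cpu)
  {
  	if (smp_call_function_single_async(cpu, &my_csd) == -EBUSY)
  		return;	/* someone else already queued it */
  }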

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 kernel/smp.c |   28 ++++++++++++++++++++--------
 1 file changed, 20 insertions(+), 8 deletions(-)

--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -297,6 +297,13 @@ static void flush_smp_call_function_queu
 				void *info = csd->info;
 
 				csd_unlock(csd);
+				/*
+				 * Ensures any LOAD of func() will be after
+				 * the above UNLOCK, this guarantees that
+				 * func() will observe any state prior
+				 * to _async() returning -EBUSY.
+				 */
+				smp_mb();
 				func(info);
 			} else if (type == CSD_TYPE_IRQ_WORK) {
 				irq_work_single(csd);
@@ -397,16 +404,18 @@ EXPORT_SYMBOL(smp_call_function_single);
  * can thus be done from contexts with disabled interrupts.
  *
  * The caller passes his own pre-allocated data structure
- * (ie: embedded in an object) and is responsible for synchronizing it
- * such that the IPIs performed on the @csd are strictly serialized.
+ * and is responsible for it's life-time, it must not be re-used
+ * until csd->node.u_flags == 0.
  *
  * If the function is called with one csd which has not yet been
  * processed by previous call to smp_call_function_single_async(), the
  * function will return immediately with -EBUSY showing that the csd
  * object is still in progress.
  *
- * NOTE: Be careful, there is unfortunately no current debugging facility to
- * validate the correctness of this serialization.
+ * When -EBUSY is returned, any invocation of csd->func() is guaranteed to see
+ * the state prior to this call.
+ *
+ * Also, consider using irq_work_queue_remote() if at all possible.
  */
 int smp_call_function_single_async(int cpu, call_single_data_t *csd)
 {
@@ -414,13 +423,16 @@ int smp_call_function_single_async(int c
 
 	preempt_disable();
 
-	if (csd->node.u_flags & CSD_FLAG_LOCK) {
+	/*
+	 * We still need RELEASE like semantics, even when the cmpxchg() fails.
+	 * Pairs with the smp_mb() in flush_smp_call_function_queue().
+	 */
+	smp_mb__before_atomic();
+	if (cmpxchg_relaxed(&csd->node.u_flags, 0, CSD_FLAG_LOCK) != 0) {
 		err = -EBUSY;
 		goto out;
 	}
-
-	csd->node.u_flags = CSD_FLAG_LOCK;
-	smp_wmb();
+	/* ctrl-dep orders later stores */
 
 	err = generic_exec_single(cpu, csd);
 




* [RFC][PATCH v2 10/10] irq_work: Add a few comments
  2020-08-18 10:51 [PATCH v2 00/10] smp: irq_work / smp_call_function rework Peter Zijlstra
                   ` (8 preceding siblings ...)
  2020-08-18 10:51 ` [RFC][PATCH v2 09/10] smp: Make smp_call_function_single_async() safer Peter Zijlstra
@ 2020-08-18 10:51 ` Peter Zijlstra
  2020-08-18 15:52   ` Randy Dunlap
  9 siblings, 1 reply; 22+ messages in thread
From: Peter Zijlstra @ 2020-08-18 10:51 UTC (permalink / raw)
  To: mingo, torvalds
  Cc: linux-kernel, will, paulmck, hch, axboe, chris, davem, kuba,
	fweisbec, oleg, vincent.guittot, peterz


Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 include/linux/irq_work.h |   80 +++++++++++++++++++++++++++++++++++++++++------
 kernel/irq_work.c        |   10 +++++
 2 files changed, 80 insertions(+), 10 deletions(-)

--- a/include/linux/irq_work.h
+++ b/include/linux/irq_work.h
@@ -4,15 +4,6 @@
 
 #include <linux/smp_types.h>
 
-/*
- * An entry can be in one of four states:
- *
- * free	     NULL, 0 -> {claimed}       : free to be used
- * claimed   NULL, 3 -> {pending}       : claimed to be enqueued
- * pending   next, 3 -> {busy}          : queued, pending callback
- * busy      NULL, 2 -> {free, claimed} : callback in progress, can be claimed
- */
-
 struct irq_work {
 	struct __call_single_node node;
 	void (*func)(struct irq_work *);
@@ -36,11 +27,19 @@ void init_irq_work(struct irq_work *work
 	*work = IRQ_WORK_INIT(func);
 }
 
+/*
+ * irq_work_is_pending(): if @work is already queued
+ */
 static inline bool irq_work_is_pending(struct irq_work *work)
 {
 	return atomic_read(&work->node.a_flags) & IRQ_WORK_PENDING;
 }
 
+/*
+ * irq_work_is_busy(): true until after func() has run
+ *
+ * Does not work with irq_work_queue_remote_static().
+ */
 static inline bool irq_work_is_busy(struct irq_work *work)
 {
 	return atomic_read(&work->node.a_flags) & IRQ_WORK_BUSY;
@@ -48,12 +47,45 @@ static inline bool irq_work_is_busy(stru
 
 #ifdef CONFIG_IRQ_WORK
 
+/*
+ * irq_work_queue(): run @work in IRQ context on this CPU
+ * @work: work to run
+ *
+ * Self-IPI, NMI-safe
+ *
+ * When the function returns false; @work is already queued and
+ * any eventual execution of it's func() is guaranteed to see
+ * any state before the failing enqueue.
+ */
 bool irq_work_queue(struct irq_work *work);
+
+/*
+ * irq_work_queue_on(): run @work in IRQ context on @cpu
+ * @work: work to run
+ * @cpu: cpu to run @work on
+ *
+ * *NOT* NMI-safe
+ *
+ * When the function returns false; @work is already queued and
+ * any eventual execution of it's func() is guaranteed to see
+ * any state before the failing enqueue.
+ */
 bool irq_work_queue_on(struct irq_work *work, int cpu);
 
-void irq_work_tick(void);
+/*
+ * irq_work_sync(): wait for completion of @work
+ * @work:
+ *
+ * Expects no concurrent irq_work_queue()
+ *
+ * Will return once @work is no longer 'busy'.
+ *
+ * Does not work with irq_work_queue_remote_static().
+ */
 void irq_work_sync(struct irq_work *work);
 
+void irq_work_tick(void);
+
 #include <asm/irq_work.h>
 
 void irq_work_run(void);
@@ -67,8 +99,36 @@ static inline bool irq_work_needs_cpu(vo
 #endif /* CONFIG_IRQ_WORK */
 
 #ifdef CONFIG_SMP
+
+/*
+ * irq_work_queue_remote(): run @work in IRQ context on @cpu
+ * @cpu:
+ * @work:
+ *
+ * like irq_work_queue_on(), except it requires @cpu != smp_processor_id() and
+ * is available for any SMP build.
+ *
+ * Return -EBUSY when already queued, 0 on success.
+ */
 extern int irq_work_queue_remote(int cpu, struct irq_work *work);
+
+/*
+ * irq_work_queue_remote_state(): like irq_work_queue_remote() except dangerous
+ * @cpu:
+ * @work:
+ *
+ * DO NOT USE, this function is horrible/dangerous.
+ *
+ * The test-and-set-PENDING is not atomic, it also doesn't set
+ * the BUSY bit and with that breaks irq_work_sync().
+ *
+ * This means that the caller needs external serialization; life-time,
+ * where relevant, also needs to be externally orchestated.
+ *
+ * There is no validation/debugging to help you if you get it wrong.
+ */
 extern int irq_work_queue_remote_static(int cpu, struct irq_work *work);
+
 extern void irq_work_single(void *arg);
 #endif
 
--- a/kernel/irq_work.c
+++ b/kernel/irq_work.c
@@ -23,6 +23,15 @@
 #if defined(CONFIG_IRQ_WORK) || defined(CONFIG_SMP)
 
 /*
+ * An entry can be in one of four states:
+ *
+ * free	     NULL, 0 -> {claimed}       : free to be used
+ * claimed   NULL, 3 -> {pending}       : claimed to be enqueued
+ * pending   next, 3 -> {busy}          : queued, pending callback
+ * busy      NULL, 2 -> {free, claimed} : callback in progress, can be claimed
+ */
+
+/*
  * Claim the entry so that no one else will poke at it.
  */
 static bool irq_work_claim(struct irq_work *work)
@@ -37,6 +46,7 @@ static bool irq_work_claim(struct irq_wo
 	 */
 	if (oflags & IRQ_WORK_PENDING)
 		return false;
+
 	return true;
 }
 




* Re: [RFC][PATCH v2 10/10] irq_work: Add a few comments
  2020-08-18 10:51 ` [RFC][PATCH v2 10/10] irq_work: Add a few comments Peter Zijlstra
@ 2020-08-18 15:52   ` Randy Dunlap
  0 siblings, 0 replies; 22+ messages in thread
From: Randy Dunlap @ 2020-08-18 15:52 UTC (permalink / raw)
  To: Peter Zijlstra, mingo, torvalds
  Cc: linux-kernel, will, paulmck, hch, axboe, chris, davem, kuba,
	fweisbec, oleg, vincent.guittot

Hi Peter,

a few typos below...

On 8/18/20 3:51 AM, Peter Zijlstra wrote:
> 
> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> ---
>  include/linux/irq_work.h |   80 +++++++++++++++++++++++++++++++++++++++++------
>  kernel/irq_work.c        |   10 +++++
>  2 files changed, 80 insertions(+), 10 deletions(-)
> 
> --- a/include/linux/irq_work.h
> +++ b/include/linux/irq_work.h


> @@ -48,12 +47,45 @@ static inline bool irq_work_is_busy(stru
>  
>  #ifdef CONFIG_IRQ_WORK
>  
> +/*
> + * irq_work_queue(): run @work in IRQ context on this CPU
> + * @work: work to run
> + *
> + * Self-IPI, NMI-safe
> + *
> + * When the function returns false; @work is already queued and
> + * any eventual execution of it's func() is guaranteed to see

                                its

> + * any state before the failing enqueue.
> + */
>  bool irq_work_queue(struct irq_work *work);
> +
> +/*
> + * irq_work_queue_on(): run @work in IRQ context on @cpu
> + * @work: work to run
> + * @cpu: cpu to run @work on
> + *
> + * *NOT* NMI-safe
> + *
> + * When the function returns false; @work is already queued and

                                false,

> + * any eventual execution of it's func() is guaranteed to see

                                its

> + * any state before the failing enqueue.
> + */

> @@ -67,8 +99,36 @@ static inline bool irq_work_needs_cpu(vo

> +/*
> + * irq_work_queue_remote_static(): like irq_work_queue_remote() except dangerous
> + * @cpu:
> + * @work:
> + *
> + * DO NOT USE, this function is horrible/dangerous.
> + *
> + * The test-and-set-PENDING is not atomic, it also doesn't set
> + * the BUSY bit and with that breaks irq_work_sync().
> + *
> + * This means that the caller needs external serialization; life-time,
> + * where relevant, also needs to be externally orchestated.

                                                  orchestrated.

> + *
> + * There is no validation/debugging to help you if you get it wrong.
> + */
>  extern int irq_work_queue_remote_static(int cpu, struct irq_work *work);
> +
>  extern void irq_work_single(void *arg);
>  #endif
>  


-- 
~Randy


^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [RFC][PATCH v2 08/10] smp,irq_work: Use the new irq_work API
  2020-08-18 10:51 ` [RFC][PATCH v2 08/10] smp,irq_work: Use the new irq_work API Peter Zijlstra
@ 2020-08-18 16:25   ` Christoph Hellwig
  2020-08-19  7:22     ` peterz
  2020-08-20  6:19   ` Christoph Hellwig
  1 sibling, 1 reply; 22+ messages in thread
From: Christoph Hellwig @ 2020-08-18 16:25 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: mingo, torvalds, linux-kernel, will, paulmck, hch, axboe, chris,
	davem, kuba, fweisbec, oleg, vincent.guittot

On Tue, Aug 18, 2020 at 12:51:10PM +0200, Peter Zijlstra wrote:
> Convert the performance sensitive users of
> smp_call_single_function_async() over to the new
> irq_work_queue_remote_static().
> 
> The new API is marginally less crap but taking these users away allows
> fixing up smp_call_single_function_async() without risk of performance
> regressions.

You probably want a conversion patch per subsystem so that it sticks
out.  What is so crap about this API?  How could we as subsystem
maintainers help to make it less crappy?

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [RFC][PATCH v2 08/10] smp,irq_work: Use the new irq_work API
  2020-08-18 16:25   ` Christoph Hellwig
@ 2020-08-19  7:22     ` peterz
  2020-08-19 18:50       ` Linus Torvalds
  0 siblings, 1 reply; 22+ messages in thread
From: peterz @ 2020-08-19  7:22 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: mingo, torvalds, linux-kernel, will, paulmck, axboe, chris,
	davem, kuba, fweisbec, oleg, vincent.guittot

On Tue, Aug 18, 2020 at 06:25:42PM +0200, Christoph Hellwig wrote:
> On Tue, Aug 18, 2020 at 12:51:10PM +0200, Peter Zijlstra wrote:
> > Convert the performance sensitive users of
> > smp_call_single_function_async() over to the new
> > irq_work_queue_remote_static().
> > 
> > The new API is marginally less crap but taking these users away allows
> > fixing up smp_call_single_function_async() without risk of performance
> > regressions.
> 
> You probably want a conversion patch per subsystem so that it sticks
> out.  What is so crap about this API?  How could we as subsystem
> maintainers help to make it less crappy?

The problem with both the current smp_call_function_single_async() and
the proposed irq_work_queue_remote_static() is that they require
external serialization and lifetime management.

That is, the external serialization comes from the non-atomic
test-and-set they both have. This works nicely when there is external
state that already serializes things, but totally comes apart (and
causes trivial list corruption) when you get it wrong.

The life-time thing is because you can't tell when the IPI is done.


The newly introduced irq_work_queue_remote() suffers neither of these
problems, and patch 9 fixes the first for
smp_call_function_single_async(). The whole smp_call_function*() class
suffers the second issue for .wait=0; typically those calls get combined
with a completion or some other state when/where it matters.

Patch 9 also shows why I introduced irq_work_queue_remote_static(): the
additional atomic op on enqueue is of course not cheap, and I can
imagine a bunch of users that don't really need it not wanting to pay
that price.
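
To make both points concrete, the difference between the two claim
paths is roughly this (a sketch, not the actual kernel code; field and
flag names are approximate):

	/*
	 * Non-atomic claim, as used by the _async()/_static() variants.
	 * Two racing callers can both observe !PENDING, both think they
	 * own the entry and both llist_add() it -> list corruption.
	 * Hence the external serialization requirement.
	 */
	static bool claim_nonatomic(struct irq_work *work)
	{
		if (atomic_read(&work->node.a_flags) & IRQ_WORK_PENDING)
			return false;
		atomic_set(&work->node.a_flags, IRQ_WORK_PENDING);
		return true;
	}

	/*
	 * Atomic claim, as used by irq_work_queue_remote(); the
	 * atomic_fetch_or() closes that window, at the price of an
	 * extra atomic op on every enqueue.
	 */
	static bool claim_atomic(struct irq_work *work)
	{
		return !(atomic_fetch_or(IRQ_WORK_CLAIMED,
					 &work->node.a_flags) & IRQ_WORK_PENDING);
	}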


From a user pov (I'm one too), I'm not sure what we can do about this,
other than possibly accept the extra overhead :/

I do have a TODO item to see if I can come up with extra debugging
checks to catch abuse of these fragile things. One possible thing is to
have csd_unlock() also set csd->next = NULL, and have the llist_add()
users verify csd->next == NULL before doing the add.
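
Something like this, perhaps (sketch only, not tested; field names as
per the reworked csd):

	/* csd_unlock(): additionally clear the list linkage */
	static __always_inline void csd_unlock(call_single_data_t *csd)
	{
		WARN_ON(!(csd->node.u_flags & CSD_FLAG_LOCK));
		smp_store_release(&csd->node.u_flags, 0);
		WRITE_ONCE(csd->node.llist.next, NULL);		/* new */
	}

	/* and in the enqueue paths, right before the llist_add(): */
	WARN_ON_ONCE(READ_ONCE(csd->node.llist.next));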




^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [RFC][PATCH v2 08/10] smp,irq_work: Use the new irq_work API
  2020-08-19  7:22     ` peterz
@ 2020-08-19 18:50       ` Linus Torvalds
  2020-08-19 19:41         ` peterz
  2020-08-20  6:20         ` Christoph Hellwig
  0 siblings, 2 replies; 22+ messages in thread
From: Linus Torvalds @ 2020-08-19 18:50 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Christoph Hellwig, Ingo Molnar, Linux Kernel Mailing List,
	Will Deacon, Paul E. McKenney, Jens Axboe, Chris Wilson,
	David Miller, Jakub Kicinski, Frédéric Weisbecker,
	Oleg Nesterov, Vincent Guittot

On Wed, Aug 19, 2020 at 12:22 AM <peterz@infradead.org> wrote:
>
> That is, the external serialization comes from the non-atomic
> test-and-set they both have. This works nicely when there is external
> state that already serializes things, but totally comes apart (and
> causes trivial list corruption) when you get it wrong.

Quite often, there just isn't any *need* for serialization, because
there is only ever one op active.

That can be either because the csd is fundamentally a single thing ("I
will transfer this object to another CPU"), or it can be because the
CSD is already per-cpu (ie smp_call_function_single).

You seem to make that common situation much worse.

Not only do you add that expensive atomic op, you add that expensive
"use irq_work queues" for something that doesn't _need_ to use them.

I have to say, I'm not a fan. What are the real advantages? Your
listed disadvantages are very very questionable.

IOW, what are the actual examples of "totally comes apart" that justifies this?

If the example is theoretical ("if you use csd's wrong") then I think
they are worthless.

             Linus

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [RFC][PATCH v2 08/10] smp,irq_work: Use the new irq_work API
  2020-08-19 18:50       ` Linus Torvalds
@ 2020-08-19 19:41         ` peterz
  2020-08-19 22:04           ` Linus Torvalds
  2020-08-20  6:20         ` Christoph Hellwig
  1 sibling, 1 reply; 22+ messages in thread
From: peterz @ 2020-08-19 19:41 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Christoph Hellwig, Ingo Molnar, Linux Kernel Mailing List,
	Will Deacon, Paul E. McKenney, Jens Axboe, Chris Wilson,
	David Miller, Jakub Kicinski, Frédéric Weisbecker,
	Oleg Nesterov, Vincent Guittot

On Wed, Aug 19, 2020 at 11:50:55AM -0700, Linus Torvalds wrote:
> On Wed, Aug 19, 2020 at 12:22 AM <peterz@infradead.org> wrote:
> >
> > That is, the external serialization comes from the non-atomic
> > test-and-set they both have. This works nicely when there is external
> > state that already serializes things, but totally comes apart (and
> > causes trivial list corruption) when you get it wrong.
> 
> Quite often, there just isn't any *need* for serialization, because
> there is only ever one op active.
> 
> That can be either because the csd is fundamentally a single thing ("I
> will transfer this object to another CPU"), or it can be because the
> CSD is already per-cpu (ie smp_call_function_single).

Note that smp_call_function_single() doesn't use _async(), and while it
does use per-cpu CSDs, the regular .wait=false case then potentially
waits before (instead of after) in the unlikely case the local CSD was
already taken.

This means it cannot be used with IRQs disabled.
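
That is, for the .wait=false case it does roughly this (simplified):

	csd = this_cpu_ptr(&csd_data);
	csd_lock(csd);	/* may spin until the *previous* IPI has completed;
			 * doing that with IRQs disabled can deadlock */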

> You seem to make that common situation much worse.

Probably -- like I said, I'm not really happy with this either.

> Not only do you add that expensive atomic op, you add that expensive
> "use irq_work queues" for something that doesn't _need_ to use them.

I'm not sure I get the "expensive irq_work queues" argument; I fully
agree with you that adding the atomic op is fairly crap.

The remote irq_work uses exactly the same llist and IPI vector as remote
smp_call_function_single(). And the new irq_work_queue_remote_static()
is exactly as cheap as smp_call_function_single_async(): no atomic op,
it queues to the same llist and sends the same IPI, but it uses an
embedded data structure instead of the pass-a-data-pointer pattern.

Because most of the async users end up having another data structure to
pass arguments around anyway.
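
For comparison, the two patterns look roughly like so (simplified; the
'foo' structures here are made up):

	/* pass-a-data-pointer: a CSD plus an ->info pointer */
	struct foo_csd {
		call_single_data_t csd;
		int payload;
	};

	static void foo_csd_func(void *info)
	{
		struct foo_csd *f = info;	/* arguments via ->info */

		pr_info("payload %d\n", f->payload);
	}

	static void kick_csd(struct foo_csd *f, int cpu)
	{
		/* caller must guarantee the csd is not already in flight */
		INIT_CSD(&f->csd, foo_csd_func, f);
		smp_call_function_single_async(cpu, &f->csd);
	}

	/* embedded irq_work: arguments via container_of() */
	struct foo_irq {
		struct irq_work work;
		int payload;
	};

	static void foo_irq_func(struct irq_work *work)
	{
		struct foo_irq *f = container_of(work, struct foo_irq, work);

		pr_info("payload %d\n", f->payload);
	}

	static void kick_irq(struct foo_irq *f, int cpu)
	{
		/* same single-use requirement applies */
		f->work = IRQ_WORK_INIT_HARD(foo_irq_func);
		irq_work_queue_remote_static(cpu, &f->work);
	}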

> I have to say, I'm not a fan. What are the real advantages? Your
> listed disadvantages are very very questionable.
> 
> IOW, what are the actual examples of "totally comes apart" that justifies this?
> 
> If the example is theoretical ("if you use csd's wrong") then I think
> they are worthless.

Well, I did use the CSDs wrong. I forgot the gotcha and made a mess of
things and stuff crashed; the wreckage is in the git history :/ I fixed
it, but I got burned.

I then went and looked at a bunch of other _async users and it wasn't
immediately obvious that they were doing it right (they were).

I wanted to improve things, but I'm willing to admit to just making it
worse in these last few patches.

Anyway, the pattern I wanted is more easily expressed with the new
irq_work_queue_remote(), even though I don't need it anymore. I can
throw away all the patches after that without losing too much sleep.

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [RFC][PATCH v2 08/10] smp,irq_work: Use the new irq_work API
  2020-08-19 19:41         ` peterz
@ 2020-08-19 22:04           ` Linus Torvalds
  2020-08-20 13:08             ` peterz
  0 siblings, 1 reply; 22+ messages in thread
From: Linus Torvalds @ 2020-08-19 22:04 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Christoph Hellwig, Ingo Molnar, Linux Kernel Mailing List,
	Will Deacon, Paul E. McKenney, Jens Axboe, Chris Wilson,
	David Miller, Jakub Kicinski, Frédéric Weisbecker,
	Oleg Nesterov, Vincent Guittot

On Wed, Aug 19, 2020 at 12:41 PM <peterz@infradead.org> wrote:
>
> I'm not sure I get the "expensive irq_work queues" argument, I fully
> agree with you that adding the atomic op is fairly crap.

There's an atomic op on the actual running side too, because of the
whole IRQ_WORK_PENDING thing.

So you get that double hit.

Maybe it doesn't matter. I just remember us being very careful to
avoid any unnecessary atomics in the smp_call_function area, but
admittedly I haven't worked on that code for a few years, so ..

                Linus

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [RFC][PATCH v2 08/10] smp,irq_work: Use the new irq_work API
  2020-08-18 10:51 ` [RFC][PATCH v2 08/10] smp,irq_work: Use the new irq_work API Peter Zijlstra
  2020-08-18 16:25   ` Christoph Hellwig
@ 2020-08-20  6:19   ` Christoph Hellwig
  2020-08-20 13:40     ` peterz
  1 sibling, 1 reply; 22+ messages in thread
From: Christoph Hellwig @ 2020-08-20  6:19 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: mingo, torvalds, linux-kernel, will, paulmck, hch, axboe, chris,
	davem, kuba, fweisbec, oleg, vincent.guittot

On Tue, Aug 18, 2020 at 12:51:10PM +0200, Peter Zijlstra wrote:
>  	if (blk_mq_complete_need_ipi(rq)) {
> -		INIT_CSD(&rq->csd, __blk_mq_complete_request_remote, rq);
> -		smp_call_function_single_async(rq->mq_ctx->cpu, &rq->csd);
> +		rq->work = IRQ_WORK_INIT_HARD(__blk_mq_complete_request_remote);
> +		irq_work_queue_remote_static(rq->mq_ctx->cpu, &rq->work);

So given the caller synchronization / use once semantics does it even
make sense to split the init vs call part here?  What about:

		irq_work_queue_remote_static(&rq->work, rq->mq_ctx->cpu,
					    __blk_mq_complete_request_remote);

instead?  And btw, I'm not sure what the "static" stands for.  Maybe
irq_work_queue_remote_once?

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [RFC][PATCH v2 08/10] smp,irq_work: Use the new irq_work API
  2020-08-19 18:50       ` Linus Torvalds
  2020-08-19 19:41         ` peterz
@ 2020-08-20  6:20         ` Christoph Hellwig
  1 sibling, 0 replies; 22+ messages in thread
From: Christoph Hellwig @ 2020-08-20  6:20 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Peter Zijlstra, Christoph Hellwig, Ingo Molnar,
	Linux Kernel Mailing List, Will Deacon, Paul E. McKenney,
	Jens Axboe, Chris Wilson, David Miller, Jakub Kicinski,
	Frédéric Weisbecker, Oleg Nesterov, Vincent Guittot

On Wed, Aug 19, 2020 at 11:50:55AM -0700, Linus Torvalds wrote:
> On Wed, Aug 19, 2020 at 12:22 AM <peterz@infradead.org> wrote:
> >
> > That is, the external serialization comes from the non-atomic
> > test-and-set they both have. This works nicely when there is external
> > state that already serializes things, but totally comes apart (and
> > causes trivial list corruption) when you get it wrong.
> 
> Quite often, there just isn't any *need* for serialization, because
> there is only ever one op active.

Yes, that's pretty much the block use case.  The request gets completed
once, and the IPI is used to bounce it back to the issuing cpu.

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [RFC][PATCH v2 08/10] smp,irq_work: Use the new irq_work API
  2020-08-19 22:04           ` Linus Torvalds
@ 2020-08-20 13:08             ` peterz
  0 siblings, 0 replies; 22+ messages in thread
From: peterz @ 2020-08-20 13:08 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Christoph Hellwig, Ingo Molnar, Linux Kernel Mailing List,
	Will Deacon, Paul E. McKenney, Jens Axboe, Chris Wilson,
	David Miller, Jakub Kicinski, Frédéric Weisbecker,
	Oleg Nesterov, Vincent Guittot

On Wed, Aug 19, 2020 at 03:04:56PM -0700, Linus Torvalds wrote:
> On Wed, Aug 19, 2020 at 12:41 PM <peterz@infradead.org> wrote:
> >
> > I'm not sure I get the "expensive irq_work queues" argument, I fully
> > agree with you that adding the atomic op is fairly crap.
> 
> There's an atomic op on the actual runing side too, because of the
> whole IRQ_WORK_PENDING thing.
> 
> So you get that double hit.
> 
> Maybe it doesn't matter. I just remember us being very careful to
> avoid any unnecessary atomics in the smp_call_function area, but
> admittedly I haven't worked on that code for a few years, so ..

Patch #3 trades that atomic for a full barrier. Not a massive win on
x86, but still.


^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [RFC][PATCH v2 08/10] smp,irq_work: Use the new irq_work API
  2020-08-20  6:19   ` Christoph Hellwig
@ 2020-08-20 13:40     ` peterz
  2020-09-09  8:03       ` Christoph Hellwig
  0 siblings, 1 reply; 22+ messages in thread
From: peterz @ 2020-08-20 13:40 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: mingo, torvalds, linux-kernel, will, paulmck, axboe, chris,
	davem, kuba, fweisbec, oleg, vincent.guittot

On Thu, Aug 20, 2020 at 08:19:27AM +0200, Christoph Hellwig wrote:
> On Tue, Aug 18, 2020 at 12:51:10PM +0200, Peter Zijlstra wrote:
> >  	if (blk_mq_complete_need_ipi(rq)) {
> > -		INIT_CSD(&rq->csd, __blk_mq_complete_request_remote, rq);
> > -		smp_call_function_single_async(rq->mq_ctx->cpu, &rq->csd);
> > +		rq->work = IRQ_WORK_INIT_HARD(__blk_mq_complete_request_remote);
> > +		irq_work_queue_remote_static(rq->mq_ctx->cpu, &rq->work);
> 
> So given the caller synchronization / use once semantics does it even
> make sense to split the init vs call part here?  What about:
> 
> 		irq_work_queue_remote_static(&rq->work, rq->mq_ctx->cpu,
> 					    __blk_mq_complete_request_remote);
> 
> instead?  And btw, I'm not sure what the "static" stands for.  Maybe
> irq_work_queue_remote_once?

The 'static' came from static storage, but I agree that the naming is
pretty random/poor.

One argument against your proposal is that it makes it even harder to
add warnings that try and catch bad/broken usage.

Also, given Linus' email I now wonder if we still want the
irq_work_remote variant at all.


So the initial use-case was something like:

struct foo {
	struct irq_work work;
	...
};

void foo_func(struct irq_work *work)
{
	struct foo *f = container_of(work, struct foo, work);

	...
}

DEFINE_PER_CPU(struct foo, foo) = { .work = IRQ_WORK_INIT_HARD(foo_func) };

void raise_foo(int cpu)
{
	irq_work_queue_remote(cpu, &per_cpu_ptr(&foo, cpu)->work);
}

Where you can, with IRQs disabled, call raise_foo(cpu) for a remote CPU
and have the guarantee that foo_func() will observe whatever you did
before calling raise_foo().

Specifically, I needed this for what is now __ttwu_queue_wakelist(),
which used to rely on smp_send_reschedule() but needed to be pulled out
of the regular scheduler IPI.

While sorting through the wreckage of me getting this horribly wrong, I
noticed that generic_smp_call_function_single_interrupt() was
unconditionally loading _2_ cachelines, one for the regular CSD llist
and one for the remote irq_work llist.

I then realized we could merge those two lists, and regain the original
intent of that IPI to only touch one cacheline.

At that point I could build the above, but then I realized that since I
already had a mixed type list, I could put the ttwu entries on it as
well, which is cheaper than doing the above.
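
The resulting IPI handler then has the rough shape below (sketch, not
the exact code; the sync/unlock details are elided):

	static void flush_call_function_queue(void)
	{
		struct llist_head *head = this_cpu_ptr(&call_single_queue);
		struct llist_node *entry = llist_del_all(head);
		call_single_data_t *csd, *next;

		llist_for_each_entry_safe(csd, next, entry, node.llist) {
			switch (CSD_TYPE(csd)) {
			case CSD_TYPE_SYNC:
			case CSD_TYPE_ASYNC:
				csd->func(csd->info);	/* + csd_unlock() dance */
				break;
			case CSD_TYPE_IRQ_WORK:
				irq_work_single(csd);
				break;
			case CSD_TYPE_TTWU:
				/* collected and handed to the scheduler */
				break;
			}
		}
	}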


Anyway, tl;dr, what do we actually want? Do we favour the embedded
irq_work variant over smp_call_function_single_async()?

There are a few subtle differences: where smp_call_function_single_async()
will directly call @func when @cpu == smp_processor_id(),
irq_work_queue_remote() will give you a WARN -- since irq_work to the
local CPU is defined as a self-IPI, which isn't implemented on all
architectures and is a distinctly different behaviour.

That said, most (if not all) users seem to actually only care about
running things on another CPU, so that seems to not matter (much).




^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [RFC][PATCH v2 08/10] smp,irq_work: Use the new irq_work API
  2020-08-20 13:40     ` peterz
@ 2020-09-09  8:03       ` Christoph Hellwig
  0 siblings, 0 replies; 22+ messages in thread
From: Christoph Hellwig @ 2020-09-09  8:03 UTC (permalink / raw)
  To: peterz
  Cc: Christoph Hellwig, mingo, torvalds, linux-kernel, will, paulmck,
	axboe, chris, davem, kuba, fweisbec, oleg, vincent.guittot

On Thu, Aug 20, 2020 at 03:40:01PM +0200, peterz@infradead.org wrote:
> Anyway, tl;dr, what do we actually want? Do we favour the embedded
> irq_work variant over smp_call_function_single_async()?

For blk-mq we really like an embedded structure that allows to bounce
the work to another CPU using IPIs.

> There's a few subtle differences, where smp_call_function_single_async()
> will directly call @func when @cpu == smp_processor_id(),
> irq_work_remote will give you WARN -- since irq_work to the local CPU is
> defined as a self-IPI, which isn't implemented on all architectures and
> is a distinctly different behaviour.

For the block code we handle the self case explicitly as it allows us
to optimize further and avoid an indirect call.  That being said, we
don't do a preempt_disable for that any more, as completing on the
"wrong" CPU for a just-migrated completion handler is at worst a
performance degradation but not a functional issue.  So instead of the
warn we'd at least like an error return that allows us to directly call
the completion handler.
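
I.e., roughly what we'd like to be able to write (sketch, assuming such
an error return existed):

	if (blk_mq_complete_need_ipi(rq)) {
		rq->work = IRQ_WORK_INIT_HARD(__blk_mq_complete_request_remote);
		if (!irq_work_queue_remote(rq->mq_ctx->cpu, &rq->work))
			return;
		/* local CPU or raced with migration: fall through */
	}
	rq->q->mq_ops->complete(rq);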

^ permalink raw reply	[flat|nested] 22+ messages in thread

end of thread, other threads:[~2020-09-09  8:05 UTC | newest]

Thread overview: 22+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2020-08-18 10:51 [PATCH v2 00/10] smp: irq_work / smp_call_function rework Peter Zijlstra
2020-08-18 10:51 ` [PATCH v2 01/10] irq_work: Cleanup Peter Zijlstra
2020-08-18 10:51 ` [PATCH v2 02/10] smp: Cleanup smp_call_function*() Peter Zijlstra
2020-08-18 10:51 ` [PATCH v2 03/10] irq_work: Optimize irq_work_single() Peter Zijlstra
2020-08-18 10:51 ` [PATCH v2 04/10] irq_work: Unconditionally build on SMP Peter Zijlstra
2020-08-18 10:51 ` [PATCH v2 05/10] irq_work: Provide irq_work_queue_remote() Peter Zijlstra
2020-08-18 10:51 ` [RFC][PATCH v2 06/10] irq_work: Provide irq_work_queue_remote_static() Peter Zijlstra
2020-08-18 10:51 ` [RFC][PATCH v2 07/10] sched/fair: Exclude the current CPU from find_new_ilb() Peter Zijlstra
2020-08-18 10:51 ` [RFC][PATCH v2 08/10] smp,irq_work: Use the new irq_work API Peter Zijlstra
2020-08-18 16:25   ` Christoph Hellwig
2020-08-19  7:22     ` peterz
2020-08-19 18:50       ` Linus Torvalds
2020-08-19 19:41         ` peterz
2020-08-19 22:04           ` Linus Torvalds
2020-08-20 13:08             ` peterz
2020-08-20  6:20         ` Christoph Hellwig
2020-08-20  6:19   ` Christoph Hellwig
2020-08-20 13:40     ` peterz
2020-09-09  8:03       ` Christoph Hellwig
2020-08-18 10:51 ` [RFC][PATCH v2 09/10] smp: Make smp_call_function_single_async() safer Peter Zijlstra
2020-08-18 10:51 ` [RFC][PATCH v2 10/10] irq_work: Add a few comments Peter Zijlstra
2020-08-18 15:52   ` Randy Dunlap

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).