* [PATCH 1/1] kernel/smp: Split call_single_queue into 3 queues
@ 2021-01-28  6:55 Leonardo Bras
  2021-01-28 10:33 ` Sebastian Andrzej Siewior
From: Leonardo Bras @ 2021-01-28  6:55 UTC (permalink / raw)
  To: Ingo Molnar, Peter Zijlstra, Thomas Gleixner, Paul E. McKenney,
	Sebastian Andrzej Siewior, Wei Yongjun, Qais Yousef,
	Leonardo Bras
  Cc: linux-kernel

Currently, during flush_smp_call_function_queue():
- All items are traversed once, for reversing the list order.
- The SYNC items are traversed twice.
- The ASYNC & IRQ_WORK items are traversed three times.
- The TTWU items are traversed four times.

Also, a lot of extra work is done to keep track and remove the items
already processed in each step.

By using three queues, it's possible to avoid all this work, and
all items in the list are traversed only twice: once for reversing,
and once for processing.

In exchange, this requires two extra llist_del_all() calls at the
beginning of flush_smp_call_function_queue(), and some extra logic
to decide which queue the desired csd should be added to.

This is not supposed to change the order in which the items are
processed, but it will change the order of the warnings printed
during cpu offlining to match the order in which the items will
be processed.

(The traversal counts above ignore the cpu-offlining case, in which
all items would be traversed once more, in both the old and new code.)

Signed-off-by: Leonardo Bras <leobras.c@gmail.com>
---
 kernel/smp.c | 173 ++++++++++++++++++++++++++-------------------------
 1 file changed, 87 insertions(+), 86 deletions(-)

diff --git a/kernel/smp.c b/kernel/smp.c
index 1b6070bf97bb..67fb415873f9 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -37,7 +37,13 @@ struct call_function_data {
 
 static DEFINE_PER_CPU_ALIGNED(struct call_function_data, cfd_data);
 
-static DEFINE_PER_CPU_SHARED_ALIGNED(struct llist_head, call_single_queue);
+struct call_multi_queue {
+	struct llist_head sync;
+	struct llist_head async_n_irq_work;
+	struct llist_head ttwu;
+};
+
+static DEFINE_PER_CPU_SHARED_ALIGNED(struct call_multi_queue, call_mq);
 
 static void flush_smp_call_function_queue(bool warn_cpu_offline);
 
@@ -93,8 +99,13 @@ void __init call_function_init(void)
 {
 	int i;
 
-	for_each_possible_cpu(i)
-		init_llist_head(&per_cpu(call_single_queue, i));
+	for_each_possible_cpu(i) {
+		struct call_multi_queue *cmq = &per_cpu(call_mq, i);
+
+		init_llist_head(&cmq->sync);
+		init_llist_head(&cmq->async_n_irq_work);
+		init_llist_head(&cmq->ttwu);
+	}
 
 	smpcfd_prepare_cpu(smp_processor_id());
 }
@@ -253,6 +264,31 @@ static __always_inline void csd_unlock(call_single_data_t *csd)
 
 static DEFINE_PER_CPU_SHARED_ALIGNED(call_single_data_t, csd_data);
 
+static __always_inline bool smp_add_to_queue(int cpu, struct llist_node *node)
+{
+	struct llist_head *head;
+	call_single_data_t *csd = llist_entry(node, call_single_data_t, node.llist);
+	struct call_multi_queue *cmq = &per_cpu(call_mq, cpu);
+
+	switch (CSD_TYPE(csd)) {
+	case CSD_TYPE_SYNC:
+		head = &cmq->sync;
+		break;
+	case CSD_TYPE_ASYNC:
+	case CSD_TYPE_IRQ_WORK:
+		head = &cmq->async_n_irq_work;
+		break;
+	case CSD_TYPE_TTWU:
+		head = &cmq->ttwu;
+		break;
+	default:
+		pr_warn("IPI callback, unknown type %d blocked from %d\n", cpu, CSD_TYPE(csd));
+		return false;
+	}
+
+	return llist_add(node, head);
+}
+
 void __smp_call_single_queue(int cpu, struct llist_node *node)
 {
 	/*
@@ -266,7 +302,7 @@ void __smp_call_single_queue(int cpu, struct llist_node *node)
 	 * locking and barrier primitives. Generic code isn't really
 	 * equipped to do the right thing...
 	 */
-	if (llist_add(node, &per_cpu(call_single_queue, cpu)))
+	if (smp_add_to_queue(cpu, node))
 		send_call_function_single_ipi(cpu);
 }
 
@@ -327,124 +363,89 @@ void generic_smp_call_function_single_interrupt(void)
  * to ensure that all pending IPI callbacks are run before it goes completely
  * offline.
  *
- * Loop through the call_single_queue and run all the queued callbacks.
+ * Loop through the call_multi_queue->* and run all the queued callbacks.
  * Must be called with interrupts disabled.
  */
 static void flush_smp_call_function_queue(bool warn_cpu_offline)
 {
-	call_single_data_t *csd, *csd_next;
-	struct llist_node *entry, *prev;
-	struct llist_head *head;
+	call_single_data_t *csd;
+	struct llist_node *entry_sync, *entry_async_n_irq_work, *entry_ttwu;
+	struct call_multi_queue *cmq;
 	static bool warned;
 
 	lockdep_assert_irqs_disabled();
 
-	head = this_cpu_ptr(&call_single_queue);
-	entry = llist_del_all(head);
-	entry = llist_reverse_order(entry);
+	cmq = this_cpu_ptr(&call_mq);
+	entry_sync = llist_del_all(&cmq->sync);
+	entry_async_n_irq_work = llist_del_all(&cmq->async_n_irq_work);
+	entry_ttwu = llist_del_all(&cmq->ttwu);
+
+	entry_sync = llist_reverse_order(entry_sync);
+	entry_async_n_irq_work = llist_reverse_order(entry_async_n_irq_work);
+	entry_ttwu = llist_reverse_order(entry_ttwu);
 
 	/* There shouldn't be any pending callbacks on an offline CPU. */
-	if (unlikely(warn_cpu_offline && !cpu_online(smp_processor_id()) &&
-		     !warned && !llist_empty(head))) {
+	if (unlikely(warn_cpu_offline && !cpu_online(smp_processor_id()) && !warned &&
+		     (!llist_empty(&cmq->sync) || !llist_empty(&cmq->async_n_irq_work) ||
+		      !llist_empty(&cmq->ttwu)))) {
 		warned = true;
 		WARN(1, "IPI on offline CPU %d\n", smp_processor_id());
 
-		/*
-		 * We don't have to use the _safe() variant here
-		 * because we are not invoking the IPI handlers yet.
-		 */
-		llist_for_each_entry(csd, entry, node.llist) {
-			switch (CSD_TYPE(csd)) {
-			case CSD_TYPE_ASYNC:
-			case CSD_TYPE_SYNC:
-			case CSD_TYPE_IRQ_WORK:
-				pr_warn("IPI callback %pS sent to offline CPU\n",
-					csd->func);
-				break;
-
-			case CSD_TYPE_TTWU:
-				pr_warn("IPI task-wakeup sent to offline CPU\n");
-				break;
-
-			default:
-				pr_warn("IPI callback, unknown type %d, sent to offline CPU\n",
-					CSD_TYPE(csd));
-				break;
-			}
-		}
+		llist_for_each_entry(csd, entry_sync, node.llist)
+			pr_warn("IPI callback %pS sent to offline CPU\n", csd->func);
+
+		llist_for_each_entry(csd, entry_async_n_irq_work, node.llist)
+			pr_warn("IPI callback %pS sent to offline CPU\n", csd->func);
+
+		llist_for_each_entry(csd, entry_ttwu, node.llist)
+			pr_warn("IPI task-wakeup sent to offline CPU\n");
 	}
 
 	/*
 	 * First; run all SYNC callbacks, people are waiting for us.
 	 */
-	prev = NULL;
-	llist_for_each_entry_safe(csd, csd_next, entry, node.llist) {
-		/* Do we wait until *after* callback? */
-		if (CSD_TYPE(csd) == CSD_TYPE_SYNC) {
-			smp_call_func_t func = csd->func;
-			void *info = csd->info;
-
-			if (prev) {
-				prev->next = &csd_next->node.llist;
-			} else {
-				entry = &csd_next->node.llist;
-			}
+	llist_for_each_entry(csd, entry_sync, node.llist) {
+		smp_call_func_t func = csd->func;
+		void *info = csd->info;
 
-			csd_lock_record(csd);
-			func(info);
-			csd_unlock(csd);
-			csd_lock_record(NULL);
-		} else {
-			prev = &csd->node.llist;
-		}
+		csd_lock_record(csd);
+		func(info);
+		csd_unlock(csd);
+		csd_lock_record(NULL);
 	}
 
-	if (!entry)
-		return;
-
 	/*
-	 * Second; run all !SYNC callbacks.
+	 * Second; run all ASYNC & IRQ_WORK callbacks.
 	 */
-	prev = NULL;
-	llist_for_each_entry_safe(csd, csd_next, entry, node.llist) {
-		int type = CSD_TYPE(csd);
-
-		if (type != CSD_TYPE_TTWU) {
-			if (prev) {
-				prev->next = &csd_next->node.llist;
-			} else {
-				entry = &csd_next->node.llist;
-			}
-
-			if (type == CSD_TYPE_ASYNC) {
-				smp_call_func_t func = csd->func;
-				void *info = csd->info;
-
-				csd_lock_record(csd);
-				csd_unlock(csd);
-				func(info);
-				csd_lock_record(NULL);
-			} else if (type == CSD_TYPE_IRQ_WORK) {
-				irq_work_single(csd);
-			}
+	llist_for_each_entry(csd, entry_async_n_irq_work, node.llist) {
+		smp_call_func_t func = csd->func;
+		void *info = csd->info;
 
+		if (CSD_TYPE(csd) == CSD_TYPE_ASYNC) {
+			csd_lock_record(csd);
+			csd_unlock(csd);
+			func(info);
+			csd_lock_record(NULL);
 		} else {
-			prev = &csd->node.llist;
+			/* CSD_TYPE_IRQ_WORK */
+			irq_work_single(csd);
 		}
 	}
 
 	/*
 	 * Third; only CSD_TYPE_TTWU is left, issue those.
 	 */
-	if (entry)
-		sched_ttwu_pending(entry);
+	if (entry_ttwu)
+		sched_ttwu_pending(entry_ttwu);
 }
 
 void flush_smp_call_function_from_idle(void)
 {
 	unsigned long flags;
+	struct call_multi_queue *cmq = this_cpu_ptr(&call_mq);
 
-	if (llist_empty(this_cpu_ptr(&call_single_queue)))
+	if (llist_empty(&cmq->sync) && llist_empty(&cmq->async_n_irq_work) &&
+	    llist_empty(&cmq->ttwu))
 		return;
 
 	local_irq_save(flags);
@@ -674,7 +675,7 @@ static void smp_call_function_many_cond(const struct cpumask *mask,
 		csd->node.src = smp_processor_id();
 		csd->node.dst = cpu;
 #endif
-		if (llist_add(&csd->node.llist, &per_cpu(call_single_queue, cpu)))
+		if (smp_add_to_queue(cpu, &csd->node.llist))
 			__cpumask_set_cpu(cpu, cfd->cpumask_ipi);
 	}
 
-- 
2.29.2



* Re: [PATCH 1/1] kernel/smp: Split call_single_queue into 3 queues
  2021-01-28  6:55 [PATCH 1/1] kernel/smp: Split call_single_queue into 3 queues Leonardo Bras
@ 2021-01-28 10:33 ` Sebastian Andrzej Siewior
  2021-02-08 17:03   ` Leonardo Bras
From: Sebastian Andrzej Siewior @ 2021-01-28 10:33 UTC (permalink / raw)
  To: Leonardo Bras
  Cc: Ingo Molnar, Peter Zijlstra, Thomas Gleixner, Paul E. McKenney,
	Wei Yongjun, Qais Yousef, linux-kernel

On 2021-01-28 03:55:06 [-0300], Leonardo Bras wrote:
> Currently, during flush_smp_call_function_queue():
> - All items are traversed once, for reversing the list order.
> - The SYNC items are traversed twice.
> - The ASYNC & IRQ_WORK items are traversed three times.
> - The TTWU items are traversed four times.
> 
> Also, a lot of extra work is done to keep track and remove the items
> already processed in each step.
> 
> By using three queues, it's possible to avoid all this work, and
> all items in the list are traversed only twice: once for reversing,
> and once for processing.
> 
> In exchange, this requires two extra llist_del_all() calls at the
> beginning of flush_smp_call_function_queue(), and some extra logic
> to decide which queue the desired csd should be added to.
> 
> This is not supposed to change the order in which the items are
> processed, but it will change the order of the warnings printed
> during cpu offlining to match the order in which the items will
> be processed.
> 
> (The traversal counts above ignore the cpu-offlining case, in which
> all items would be traversed once more, in both the old and new code.)

Numbers would be good. Having three queues increases the memory
footprint from one pointer to three, but we still remain in one cache
line. One difference your patch makes is this hunk:

> +	if (smp_add_to_queue(cpu, node))
>  		send_call_function_single_ipi(cpu);

Previously only the first addition resulted in sending an IPI. With this
change you could send two IPIs, one for each addition to an independent queue.
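
To make that concrete, here is a minimal userspace model (not kernel code
and not from the patch) of the one property that matters: llist_add()
returns true only when the target list was empty, and that return value is
what gates send_call_function_single_ipi(). With a single list only the
first addition sees an empty list; with split lists, two additions of
different csd types can both see one:

/*
 * Illustrative userspace model only (not from the patch).  The real
 * llist_add() is lock-free; this single-threaded stand-in only models
 * its "was the list empty?" return value.
 */
#include <stdbool.h>
#include <stdio.h>

struct node { struct node *next; };
struct head { struct node *first; };

static bool model_llist_add(struct node *n, struct head *h)
{
	bool was_empty = (h->first == NULL);

	n->next = h->first;
	h->first = n;
	return was_empty;	/* this is what decides "send an IPI" */
}

int main(void)
{
	struct head single = { NULL };			/* old: one queue */
	struct head sync = { NULL }, async = { NULL };	/* new: split queues */
	struct node a, b, c, d;
	int ipis_single = 0, ipis_split = 0;

	/* One queue: only the first addition sees an empty list. */
	ipis_single += model_llist_add(&a, &single);
	ipis_single += model_llist_add(&b, &single);

	/* Split queues: a SYNC csd and an ASYNC csd each see an empty list. */
	ipis_split += model_llist_add(&c, &sync);
	ipis_split += model_llist_add(&d, &async);

	/* Prints: single queue: 1 IPI(s), split queues: 2 IPI(s) */
	printf("single queue: %d IPI(s), split queues: %d IPI(s)\n",
	       ipis_single, ipis_split);
	return 0;
}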

A quick smoke test with the patch at the bottom of the mail ended up with:
          <idle>-0       [005] d..h1..   146.255996: flush_smp_call_function_queue: A1 S2 I0 T0 X3

This shows that, in my smoke test at least, the number of items of each
type in the list is low.

diff --git a/kernel/smp.c b/kernel/smp.c
index 1b6070bf97bb0..3acce385b9f97 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -336,6 +336,11 @@ static void flush_smp_call_function_queue(bool warn_cpu_offline)
 	struct llist_node *entry, *prev;
 	struct llist_head *head;
 	static bool warned;
+	int num_async = 0;
+	int num_sync = 0;
+	int num_irqw = 0;
+	int num_twu = 0;
+	int total = 0;
 
 	lockdep_assert_irqs_disabled();
 
@@ -343,6 +348,33 @@ static void flush_smp_call_function_queue(bool warn_cpu_offline)
 	entry = llist_del_all(head);
 	entry = llist_reverse_order(entry);
 
+	llist_for_each_entry(csd, entry, node.llist) {
+		switch (CSD_TYPE(csd)) {
+			case CSD_TYPE_ASYNC:
+				num_async++;
+				break;
+			case CSD_TYPE_SYNC:
+				num_sync++;
+				break;
+
+			case CSD_TYPE_IRQ_WORK:
+				num_irqw++;
+				break;
+
+			case CSD_TYPE_TTWU:
+				num_twu++;
+				break;
+
+			default:
+				pr_warn("hmmmm\n");
+				break;
+		}
+	}
+	total = num_async + num_sync + num_irqw + num_twu;
+	if (total > 2)
+		trace_printk("A%d S%d I%d T%d X%d\n", num_async, num_sync, num_irqw, num_twu,
+			     total);
+
 	/* There shouldn't be any pending callbacks on an offline CPU. */
 	if (unlikely(warn_cpu_offline && !cpu_online(smp_processor_id()) &&
 		     !warned && !llist_empty(head))) {

Sebastian


* Re: [PATCH 1/1] kernel/smp: Split call_single_queue into 3 queues
  2021-01-28 10:33 ` Sebastian Andrzej Siewior
@ 2021-02-08 17:03   ` Leonardo Bras
  2021-02-09  7:53     ` Peter Zijlstra
From: Leonardo Bras @ 2021-02-08 17:03 UTC (permalink / raw)
  To: Sebastian Andrzej Siewior
  Cc: Ingo Molnar, Peter Zijlstra, Thomas Gleixner, Paul E. McKenney,
	Wei Yongjun, Qais Yousef, linux-kernel

Hello Sebastian, 
Thanks for the feedback!

On Thu, 2021-01-28 at 11:33 +0100, Sebastian Andrzej Siewior wrote:
> On 2021-01-28 03:55:06 [-0300], Leonardo Bras wrote:
> > Currently, during flush_smp_call_function_queue():
> > - All items are traversed once, for reversing the list order.
> > - The SYNC items are traversed twice.
> > - The ASYNC & IRQ_WORK items are traversed three times.
> > - The TTWU items are traversed four times.
> > 
> > Also, a lot of extra work is done to keep track and remove the items
> > already processed in each step.
> > 
> > By using three queues, it's possible to avoid all this work, and
> > all items in the list are traversed only twice: once for reversing,
> > and once for processing.
> > 
> > In exchange, this requires two extra llist_del_all() calls at the
> > beginning of flush_smp_call_function_queue(), and some extra logic
> > to decide which queue the desired csd should be added to.
> > 
> > This is not supposed to change the order in which the items are
> > processed, but it will change the order of the warnings printed
> > during cpu offlining to match the order in which the items will
> > be processed.
> > 
> > (The traversal counts above ignore the cpu-offlining case, in which
> > all items would be traversed once more, in both the old and new code.)
> 
> Numbers would be good.
> 

Sure, I will try to get some time to compare performance.


> Having three queues increases the memory footprint from one pointer
> to three, but we still remain in one cache line. One difference your
> patch makes is this hunk:
> 
> > +	if (smp_add_to_queue(cpu, node))
> >  		send_call_function_single_ipi(cpu);
> 
> Previously only the first addition resulted in sending an IPI. With this
> change you could send two IPIs, one for each addition to an independent queue.

Yes, you are correct. 
I need to change this to look into all queues, which should just add
a few compares, given all the llist_heads are in the same cacheline.
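
Something along the lines of the sketch below (hypothetical, not the v2
itself): keep the same call_multi_queue layout, but report "first addition"
only when all three queues were empty, so one batch of additions triggers a
single IPI. Whether the extra llist_empty() reads can race with a concurrent
flush_smp_call_function_queue() in a way that loses an IPI is exactly the
kind of thing the v2 will have to argue about:

/* Hypothetical sketch only, not the actual v2. */
static __always_inline bool smp_add_to_queue(int cpu, struct llist_node *node)
{
	call_single_data_t *csd = llist_entry(node, call_single_data_t, node.llist);
	struct call_multi_queue *cmq = &per_cpu(call_mq, cpu);
	struct llist_head *head, *other1, *other2;

	switch (CSD_TYPE(csd)) {
	case CSD_TYPE_SYNC:
		head = &cmq->sync;
		other1 = &cmq->async_n_irq_work;
		other2 = &cmq->ttwu;
		break;
	case CSD_TYPE_ASYNC:
	case CSD_TYPE_IRQ_WORK:
		head = &cmq->async_n_irq_work;
		other1 = &cmq->sync;
		other2 = &cmq->ttwu;
		break;
	case CSD_TYPE_TTWU:
		head = &cmq->ttwu;
		other1 = &cmq->sync;
		other2 = &cmq->async_n_irq_work;
		break;
	default:
		pr_warn("IPI callback, unknown type %d blocked from %d\n",
			CSD_TYPE(csd), cpu);
		return false;
	}

	/*
	 * All three heads sit in the same cacheline, so the two extra
	 * llist_empty() reads are just a couple of compares.
	 */
	return llist_add(node, head) && llist_empty(other1) && llist_empty(other2);
}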

> 
> A quick smoke test with the patch at the bottom of the mail ended up with:
>           <idle>-0       [005] d..h1..   146.255996: flush_smp_call_function_queue: A1 S2 I0 T0 X3
> 
> This shows that, in my smoke test at least, the number of items of each
> type in the list is low.

Yes, but depending on workload this list may get longer.

My patch also needs some other changes, so I will send a v2 with those
+ the proposed changes.

> Sebastian

Best regards,
Leonardo Bras



* Re: [PATCH 1/1] kernel/smp: Split call_single_queue into 3 queues
  2021-02-08 17:03   ` Leonardo Bras
@ 2021-02-09  7:53     ` Peter Zijlstra
From: Peter Zijlstra @ 2021-02-09  7:53 UTC (permalink / raw)
  To: Leonardo Bras
  Cc: Sebastian Andrzej Siewior, Ingo Molnar, Thomas Gleixner,
	Paul E. McKenney, Wei Yongjun, Qais Yousef, linux-kernel

On Mon, Feb 08, 2021 at 02:03:47PM -0300, Leonardo Bras wrote:

> > This shows that, in my smoke test at least, the number of items of each
> > type in the list is low.
> 
> Yes, but depending on workload this list may get longer.

Get a median number of entries on the list; if you can get your median
anywhere near large enough for any of this to matter, I'd be very
surprised.

This needs lots of numbers.
