linux-rt-users.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [patch 0/3] do not raise timer softirq unconditionally (spinlockless version)
@ 2019-04-15 20:12 Marcelo Tosatti
  2019-04-15 20:12 ` [patch 1/3] timers: raise timer softirq on __mod_timer/add_timer_on Marcelo Tosatti
                   ` (5 more replies)
  0 siblings, 6 replies; 18+ messages in thread
From: Marcelo Tosatti @ 2019-04-15 20:12 UTC (permalink / raw)
  To: linux-kernel, linux-rt-users
  Cc: Thomas Gleixner, Anna-Maria Gleixner, Daniel Bristot de Oliveira,
	Luiz Capitulino, Haris Okanovic

For isolated CPUs, we'd like to skip awakening ktimersoftd
(the switch to and then back from ktimersoftd takes 10us in
virtualized environments, in addition to other OS overhead,
which exceeds telco requirements for packet forwarding for
5G) from the sched tick.

The patch "timers: do not raise softirq unconditionally" from Thomas
attempts to address that by checking, in the sched tick, whether its
necessary to raise the timer softirq. Unfortunately, it attempts to grab
the tvec base spinlock which generates the issue described in the patch
"Revert "timers: do not raise softirq unconditionally"".

tvec_base->lock protects addition of timers to the wheel versus
timer interrupt execution.

This patch does not grab the tvec base spinlock from irq context,
but rather performs a lockless access to base->pending_map.

It handles the the race between timer addition and timer interrupt
execution by unconditionally (in case of isolated CPUs) raising the
timer softirq after making sure the updated bitmap is visible
on remote CPUs.

This patchset reduces cyclictest latency from 25us to 14us
on my testbox. 




^ permalink raw reply	[flat|nested] 18+ messages in thread

* [patch 1/3] timers: raise timer softirq on __mod_timer/add_timer_on
  2019-04-15 20:12 [patch 0/3] do not raise timer softirq unconditionally (spinlockless version) Marcelo Tosatti
@ 2019-04-15 20:12 ` Marcelo Tosatti
  2019-05-29 14:53   ` Anna-Maria Gleixner
  2019-04-15 20:12 ` [patch 2/3] timers: do not raise softirq unconditionally (spinlockless version) Marcelo Tosatti
                   ` (4 subsequent siblings)
  5 siblings, 1 reply; 18+ messages in thread
From: Marcelo Tosatti @ 2019-04-15 20:12 UTC (permalink / raw)
  To: linux-kernel, linux-rt-users
  Cc: Thomas Gleixner, Anna-Maria Gleixner, Daniel Bristot de Oliveira,
	Luiz Capitulino, Haris Okanovic, Marcelo Tosatti

[-- Attachment #1: 01-modtimer-remote-softirqraise --]
[-- Type: text/plain, Size: 3231 bytes --]

For isolated CPUs, we'd like to skip awakening ktimersoftd
(the switch to and then back from ktimersoftd takes 10us in
virtualized environments, in addition to other OS overhead,
which exceeds telco requirements for packet forwarding for
5G) from the sched tick.

The patch "timers: do not raise softirq unconditionally" from Thomas
attempts to address that by checking, in the sched tick, whether its
necessary to raise the timer softirq. Unfortunately, it attempts to grab
the tvec base spinlock which generates the issue described in the patch
"Revert "timers: do not raise softirq unconditionally"".

tvec_base->lock protects addition of timers to the wheel versus
timer interrupt execution.

This patch does not grab the tvec base spinlock from irq context,
but rather performs a lockless access to base->pending_map.

It handles the the race between timer addition and timer interrupt
execution by unconditionally (in case of isolated CPUs) raising the
timer softirq after making sure the updated bitmap is visible 
on remote CPUs.

This patch reduces cyclictest latency from 25us to 14us 
on my testbox.

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

---
 kernel/time/timer.c |   38 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 38 insertions(+)

Index: linux-rt-devel/kernel/time/timer.c
===================================================================
--- linux-rt-devel.orig/kernel/time/timer.c	2019-04-15 13:56:06.974210992 -0300
+++ linux-rt-devel/kernel/time/timer.c	2019-04-15 14:21:02.788704354 -0300
@@ -41,6 +41,7 @@
 #include <linux/sched/sysctl.h>
 #include <linux/sched/nohz.h>
 #include <linux/sched/debug.h>
+#include <linux/sched/isolation.h>
 #include <linux/slab.h>
 #include <linux/compat.h>
 #include <linux/swait.h>
@@ -907,6 +908,12 @@
 #endif
 }
 
+static DEFINE_PER_CPU(call_single_data_t, raise_timer_csd);
+
+static void raise_timer_softirq(void *arg)
+{
+	raise_softirq(TIMER_SOFTIRQ);
+}
 
 /*
  * We are using hashed locking: Holding per_cpu(timer_bases[x]).lock means
@@ -1056,6 +1063,17 @@
 		internal_add_timer(base, timer);
 	}
 
+	if (!housekeeping_cpu(base->cpu, HK_FLAG_TIMER) &&
+	    !(timer->flags & TIMER_DEFERRABLE)) {
+		call_single_data_t *c;
+
+		c = per_cpu_ptr(&raise_timer_csd, base->cpu);
+
+		/* Make sure bitmap updates are visible on remote CPUs */
+		smp_wmb();
+		smp_call_function_single_async(base->cpu, c);
+	}
+
 out_unlock:
 	raw_spin_unlock_irqrestore(&base->lock, flags);
 
@@ -1175,6 +1193,17 @@
 
 	debug_activate(timer, timer->expires);
 	internal_add_timer(base, timer);
+
+	if (!housekeeping_cpu(base->cpu, HK_FLAG_TIMER) &&
+	    !(timer->flags & TIMER_DEFERRABLE)) {
+		call_single_data_t *c;
+
+		c = per_cpu_ptr(&raise_timer_csd, base->cpu);
+
+		/* Make sure bitmap updates are visible on remote CPUs */
+		smp_wmb();
+		smp_call_function_single_async(base->cpu, c);
+	}
 	raw_spin_unlock_irqrestore(&base->lock, flags);
 }
 EXPORT_SYMBOL_GPL(add_timer_on);
@@ -1970,6 +1999,15 @@
 {
 	int cpu;
 
+	for_each_possible_cpu(cpu) {
+		call_single_data_t *c;
+
+		c = per_cpu_ptr(&raise_timer_csd, cpu);
+		c->func = raise_timer_softirq;
+		c->info = NULL;
+		c->flags = 0;
+	}
+
 	for_each_possible_cpu(cpu)
 		init_timer_cpu(cpu);
 }



^ permalink raw reply	[flat|nested] 18+ messages in thread

* [patch 2/3] timers: do not raise softirq unconditionally (spinlockless version)
  2019-04-15 20:12 [patch 0/3] do not raise timer softirq unconditionally (spinlockless version) Marcelo Tosatti
  2019-04-15 20:12 ` [patch 1/3] timers: raise timer softirq on __mod_timer/add_timer_on Marcelo Tosatti
@ 2019-04-15 20:12 ` Marcelo Tosatti
  2019-05-29 14:53   ` Anna-Maria Gleixner
  2019-06-04  6:29   ` Peter Xu
  2019-04-15 20:12 ` [patch 3/3] timers: condense pending bitmap information Marcelo Tosatti
                   ` (3 subsequent siblings)
  5 siblings, 2 replies; 18+ messages in thread
From: Marcelo Tosatti @ 2019-04-15 20:12 UTC (permalink / raw)
  To: linux-kernel, linux-rt-users
  Cc: Thomas Gleixner, Anna-Maria Gleixner, Daniel Bristot de Oliveira,
	Luiz Capitulino, Haris Okanovic, Marcelo Tosatti

[-- Attachment #1: 02-do-not-raise-softirq-unconditionally-second-attempt --]
[-- Type: text/plain, Size: 1162 bytes --]

Check base->pending_map locklessly and skip raising timer softirq 
if empty.

What allows the lockless (and potentially racy against mod_timer) 
check is that mod_timer will raise another timer softirq after
modifying base->pending_map.

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

---
 kernel/time/timer.c |   18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

Index: linux-rt-devel/kernel/time/timer.c
===================================================================
--- linux-rt-devel.orig/kernel/time/timer.c	2019-04-15 14:21:02.788704354 -0300
+++ linux-rt-devel/kernel/time/timer.c	2019-04-15 14:22:56.755047354 -0300
@@ -1776,6 +1776,24 @@
 		if (time_before(jiffies, base->clk))
 			return;
 	}
+
+#ifdef CONFIG_PREEMPT_RT_FULL
+/* On RT, irq work runs from softirq */
+	if (irq_work_needs_cpu())
+		goto raise;
+#endif
+	base = this_cpu_ptr(&timer_bases[BASE_STD]);
+	if (!housekeeping_cpu(base->cpu, HK_FLAG_TIMER)) {
+		if (!bitmap_empty(base->pending_map, WHEEL_SIZE))
+			goto raise;
+		base++;
+		if (!bitmap_empty(base->pending_map, WHEEL_SIZE))
+			goto raise;
+
+		return;
+	}
+
+raise:
 	raise_softirq(TIMER_SOFTIRQ);
 }
 



^ permalink raw reply	[flat|nested] 18+ messages in thread

* [patch 3/3] timers: condense pending bitmap information
  2019-04-15 20:12 [patch 0/3] do not raise timer softirq unconditionally (spinlockless version) Marcelo Tosatti
  2019-04-15 20:12 ` [patch 1/3] timers: raise timer softirq on __mod_timer/add_timer_on Marcelo Tosatti
  2019-04-15 20:12 ` [patch 2/3] timers: do not raise softirq unconditionally (spinlockless version) Marcelo Tosatti
@ 2019-04-15 20:12 ` Marcelo Tosatti
  2019-04-15 20:17 ` [patch 0/3] do not raise timer softirq unconditionally (spinlockless version) Marcelo Tosatti
                   ` (2 subsequent siblings)
  5 siblings, 0 replies; 18+ messages in thread
From: Marcelo Tosatti @ 2019-04-15 20:12 UTC (permalink / raw)
  To: linux-kernel, linux-rt-users
  Cc: Thomas Gleixner, Anna-Maria Gleixner, Daniel Bristot de Oliveira,
	Luiz Capitulino, Haris Okanovic, Marcelo Tosatti

[-- Attachment #1: 03-optimize-pending-map-check --]
[-- Type: text/plain, Size: 1564 bytes --]

Condense the pending bitmap information in a bool.

Improves cyclictest results by 4us.

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

---
 kernel/time/timer.c |    9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

Index: linux-rt-devel/kernel/time/timer.c
===================================================================
--- linux-rt-devel.orig/kernel/time/timer.c	2019-04-15 14:22:56.000000000 -0300
+++ linux-rt-devel/kernel/time/timer.c	2019-04-15 14:24:24.803312280 -0300
@@ -206,6 +206,7 @@
 	unsigned int		cpu;
 	bool			is_idle;
 	bool			must_forward_clk;
+	bool			pending_map_clear;
 	DECLARE_BITMAP(pending_map, WHEEL_SIZE);
 	struct hlist_head	vectors[WHEEL_SIZE];
 } ____cacheline_aligned;
@@ -538,6 +539,7 @@
 {
 	hlist_add_head(&timer->entry, base->vectors + idx);
 	__set_bit(idx, base->pending_map);
+	base->pending_map_clear = false;
 	timer_set_idx(timer, idx);
 }
 
@@ -1741,6 +1743,9 @@
 		while (levels--)
 			expire_timers(base, heads + levels);
 	}
+	if (!housekeeping_cpu(base->cpu, HK_FLAG_TIMER) &&
+	    bitmap_empty(base->pending_map, WHEEL_SIZE))
+		base->pending_map_clear = true;
 	raw_spin_unlock_irq(&base->lock);
 	wakeup_timer_waiters(base);
 }
@@ -1784,10 +1789,10 @@
 #endif
 	base = this_cpu_ptr(&timer_bases[BASE_STD]);
 	if (!housekeeping_cpu(base->cpu, HK_FLAG_TIMER)) {
-		if (!bitmap_empty(base->pending_map, WHEEL_SIZE))
+		if (!base->pending_map_clear)
 			goto raise;
 		base++;
-		if (!bitmap_empty(base->pending_map, WHEEL_SIZE))
+		if (!base->pending_map_clear)
 			goto raise;
 
 		return;



^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [patch 0/3] do not raise timer softirq unconditionally (spinlockless version)
  2019-04-15 20:12 [patch 0/3] do not raise timer softirq unconditionally (spinlockless version) Marcelo Tosatti
                   ` (2 preceding siblings ...)
  2019-04-15 20:12 ` [patch 3/3] timers: condense pending bitmap information Marcelo Tosatti
@ 2019-04-15 20:17 ` Marcelo Tosatti
  2019-05-06  3:22 ` Marcelo Tosatti
  2019-05-29 14:52 ` Anna-Maria Gleixner
  5 siblings, 0 replies; 18+ messages in thread
From: Marcelo Tosatti @ 2019-04-15 20:17 UTC (permalink / raw)
  To: linux-kernel, linux-rt-users
  Cc: Thomas Gleixner, Anna-Maria Gleixner, Daniel Bristot de Oliveira,
	Luiz Capitulino, Haris Okanovic

On Mon, Apr 15, 2019 at 05:12:13PM -0300, Marcelo Tosatti wrote:
> For isolated CPUs, we'd like to skip awakening ktimersoftd
> (the switch to and then back from ktimersoftd takes 10us in
> virtualized environments, in addition to other OS overhead,
> which exceeds telco requirements for packet forwarding for
> 5G) from the sched tick.
> 
> The patch "timers: do not raise softirq unconditionally" from Thomas
> attempts to address that by checking, in the sched tick, whether its
> necessary to raise the timer softirq. Unfortunately, it attempts to grab
> the tvec base spinlock which generates the issue described in the patch
> "Revert "timers: do not raise softirq unconditionally"".
> 
> tvec_base->lock protects addition of timers to the wheel versus
> timer interrupt execution.
> 
> This patch does not grab the tvec base spinlock from irq context,
> but rather performs a lockless access to base->pending_map.
> 
> It handles the the race between timer addition and timer interrupt
> execution by unconditionally (in case of isolated CPUs) raising the
> timer softirq after making sure the updated bitmap is visible
> on remote CPUs.
> 
> This patchset reduces cyclictest latency from 25us to 14us
> on my testbox. 

Patch against v5.0.5-rt3 branch of linux-rt-devel.


^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [patch 0/3] do not raise timer softirq unconditionally (spinlockless version)
  2019-04-15 20:12 [patch 0/3] do not raise timer softirq unconditionally (spinlockless version) Marcelo Tosatti
                   ` (3 preceding siblings ...)
  2019-04-15 20:17 ` [patch 0/3] do not raise timer softirq unconditionally (spinlockless version) Marcelo Tosatti
@ 2019-05-06  3:22 ` Marcelo Tosatti
  2019-05-06  7:17   ` Daniel Bristot de Oliveira
  2019-05-06  9:22   ` Thomas Gleixner
  2019-05-29 14:52 ` Anna-Maria Gleixner
  5 siblings, 2 replies; 18+ messages in thread
From: Marcelo Tosatti @ 2019-05-06  3:22 UTC (permalink / raw)
  To: linux-kernel, linux-rt-users
  Cc: Thomas Gleixner, Anna-Maria Gleixner, Daniel Bristot de Oliveira,
	Luiz Capitulino, Haris Okanovic

On Mon, Apr 15, 2019 at 05:12:13PM -0300, Marcelo Tosatti wrote:
> For isolated CPUs, we'd like to skip awakening ktimersoftd
> (the switch to and then back from ktimersoftd takes 10us in
> virtualized environments, in addition to other OS overhead,
> which exceeds telco requirements for packet forwarding for
> 5G) from the sched tick.
> 
> The patch "timers: do not raise softirq unconditionally" from Thomas
> attempts to address that by checking, in the sched tick, whether its
> necessary to raise the timer softirq. Unfortunately, it attempts to grab
> the tvec base spinlock which generates the issue described in the patch
> "Revert "timers: do not raise softirq unconditionally"".
> 
> tvec_base->lock protects addition of timers to the wheel versus
> timer interrupt execution.
> 
> This patch does not grab the tvec base spinlock from irq context,
> but rather performs a lockless access to base->pending_map.
> 
> It handles the the race between timer addition and timer interrupt
> execution by unconditionally (in case of isolated CPUs) raising the
> timer softirq after making sure the updated bitmap is visible
> on remote CPUs.
> 
> This patchset reduces cyclictest latency from 25us to 14us
> on my testbox. 
> 
> 

Ping?

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [patch 0/3] do not raise timer softirq unconditionally (spinlockless version)
  2019-05-06  3:22 ` Marcelo Tosatti
@ 2019-05-06  7:17   ` Daniel Bristot de Oliveira
  2019-05-06  9:22   ` Thomas Gleixner
  1 sibling, 0 replies; 18+ messages in thread
From: Daniel Bristot de Oliveira @ 2019-05-06  7:17 UTC (permalink / raw)
  To: Marcelo Tosatti, linux-kernel, linux-rt-users
  Cc: Thomas Gleixner, Anna-Maria Gleixner, Daniel Bristot de Oliveira,
	Luiz Capitulino, Haris Okanovic



On 5/6/19 5:22 AM, Marcelo Tosatti wrote:
> On Mon, Apr 15, 2019 at 05:12:13PM -0300, Marcelo Tosatti wrote:
>> For isolated CPUs, we'd like to skip awakening ktimersoftd
>> (the switch to and then back from ktimersoftd takes 10us in
>> virtualized environments, in addition to other OS overhead,
>> which exceeds telco requirements for packet forwarding for
>> 5G) from the sched tick.
>>
>> The patch "timers: do not raise softirq unconditionally" from Thomas
>> attempts to address that by checking, in the sched tick, whether its
>> necessary to raise the timer softirq. Unfortunately, it attempts to grab
>> the tvec base spinlock which generates the issue described in the patch
>> "Revert "timers: do not raise softirq unconditionally"".
>>
>> tvec_base->lock protects addition of timers to the wheel versus
>> timer interrupt execution.
>>
>> This patch does not grab the tvec base spinlock from irq context,
>> but rather performs a lockless access to base->pending_map.
>>
>> It handles the the race between timer addition and timer interrupt
>> execution by unconditionally (in case of isolated CPUs) raising the
>> timer softirq after making sure the updated bitmap is visible
>> on remote CPUs.
>>
>> This patchset reduces cyclictest latency from 25us to 14us
>> on my testbox. 
>>
>>
> 
> Ping?
> 

Hi Marcelo,

I've been running your patches with lockdep and other debug options and did not
find any problem of this kind. Also, I did not find any kind of timing
regressions in tests with the PREEMPT RT...

-- Daniel

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [patch 0/3] do not raise timer softirq unconditionally (spinlockless version)
  2019-05-06  3:22 ` Marcelo Tosatti
  2019-05-06  7:17   ` Daniel Bristot de Oliveira
@ 2019-05-06  9:22   ` Thomas Gleixner
  1 sibling, 0 replies; 18+ messages in thread
From: Thomas Gleixner @ 2019-05-06  9:22 UTC (permalink / raw)
  To: Marcelo Tosatti
  Cc: linux-kernel, linux-rt-users, Anna-Maria Gleixner,
	Daniel Bristot de Oliveira, Luiz Capitulino, Haris Okanovic

On Mon, 6 May 2019, Marcelo Tosatti wrote:
> On Mon, Apr 15, 2019 at 05:12:13PM -0300, Marcelo Tosatti wrote:
> > It handles the the race between timer addition and timer interrupt
> > execution by unconditionally (in case of isolated CPUs) raising the
> > timer softirq after making sure the updated bitmap is visible
> > on remote CPUs.
> > 
> > This patchset reduces cyclictest latency from 25us to 14us
> > on my testbox.
>
> Ping?

On my list ....


^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [patch 0/3] do not raise timer softirq unconditionally (spinlockless version)
  2019-04-15 20:12 [patch 0/3] do not raise timer softirq unconditionally (spinlockless version) Marcelo Tosatti
                   ` (4 preceding siblings ...)
  2019-05-06  3:22 ` Marcelo Tosatti
@ 2019-05-29 14:52 ` Anna-Maria Gleixner
  2019-05-30 19:38   ` Marcelo Tosatti
  5 siblings, 1 reply; 18+ messages in thread
From: Anna-Maria Gleixner @ 2019-05-29 14:52 UTC (permalink / raw)
  To: Marcelo Tosatti
  Cc: linux-kernel, linux-rt-users, Thomas Gleixner,
	Daniel Bristot de Oliveira, Luiz Capitulino, Haris Okanovic

Hi,

I had a look at the queue and have several questions about your
implementation. 

First of all, I had some troubles to understand your commit messages. So I
first had to read the code and then tried to understand the commit
messages. It is easier, if it works the other way round.

On Mon, 15 Apr 2019, Marcelo Tosatti wrote:

> For isolated CPUs, we'd like to skip awakening ktimersoftd
> (the switch to and then back from ktimersoftd takes 10us in
> virtualized environments, in addition to other OS overhead,
> which exceeds telco requirements for packet forwarding for
> 5G) from the sched tick.

You would like to prevent raising the timer softirq in general from the
sched tick for isolated CPUs? Or you would like to prevent raising the
timer softirq if no pending timer is available?

Nevertheless, this change is not PREEMPT_RT specific. It is a NOHZ
dependand change. So it would be nice, if the queue is against
mainline. But please correct me, if I'm wrong.

[...]

> This patchset reduces cyclictest latency from 25us to 14us
> on my testbox. 
> 

A lot of information is missing: How does your environment looks like for
this test, what is your workload,...?

Did you also run other tests?

Thanks,

	Anna-Maria

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [patch 1/3] timers: raise timer softirq on __mod_timer/add_timer_on
  2019-04-15 20:12 ` [patch 1/3] timers: raise timer softirq on __mod_timer/add_timer_on Marcelo Tosatti
@ 2019-05-29 14:53   ` Anna-Maria Gleixner
  2019-05-30 19:23     ` Marcelo Tosatti
  0 siblings, 1 reply; 18+ messages in thread
From: Anna-Maria Gleixner @ 2019-05-29 14:53 UTC (permalink / raw)
  To: Marcelo Tosatti
  Cc: linux-kernel, linux-rt-users, Thomas Gleixner,
	Daniel Bristot de Oliveira, Luiz Capitulino, Haris Okanovic

On Mon, 15 Apr 2019, Marcelo Tosatti wrote:

[...]

> The patch "timers: do not raise softirq unconditionally" from Thomas
> attempts to address that by checking, in the sched tick, whether its
> necessary to raise the timer softirq. Unfortunately, it attempts to grab
> the tvec base spinlock which generates the issue described in the patch
> "Revert "timers: do not raise softirq unconditionally"".

Both patches are not available in the version your patch set is based
on. Better pointers would be helpful.

> tvec_base->lock protects addition of timers to the wheel versus
> timer interrupt execution.

The timer_base->lock (formally known as tvec_base->lock), synchronizes all
accesses to timer_base and not only addition of timers versus timer
interrupt execution. Deletion of timers, getting the next timer interrupt,
forwarding the base clock and migration of timers are protected as well by
timer_base->lock.

> This patch does not grab the tvec base spinlock from irq context,
> but rather performs a lockless access to base->pending_map.

I cannot see where this patch performs a lockless access to
timer_base->pending_map.

> It handles the the race between timer addition and timer interrupt
> execution by unconditionally (in case of isolated CPUs) raising the
> timer softirq after making sure the updated bitmap is visible 
> on remote CPUs.

So after modifying a timer on a non housekeeping timer base, the timer
softirq is raised - even if there is no pending timer in the next
bucket. Only with this patch, this shouldn't be a problem - but it is an
additional raise of timer softirq and an overhead when adding a timer,
because the normal timer softirq is raised from sched tick anyway.

> Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
> 
> ---
>  kernel/time/timer.c |   38 ++++++++++++++++++++++++++++++++++++++
>  1 file changed, 38 insertions(+)
> 
> Index: linux-rt-devel/kernel/time/timer.c
> ===================================================================
> --- linux-rt-devel.orig/kernel/time/timer.c	2019-04-15 13:56:06.974210992 -0300
> +++ linux-rt-devel/kernel/time/timer.c	2019-04-15 14:21:02.788704354 -0300
> @@ -1056,6 +1063,17 @@
>  		internal_add_timer(base, timer);
>  	}
>  
> +	if (!housekeeping_cpu(base->cpu, HK_FLAG_TIMER) &&
> +	    !(timer->flags & TIMER_DEFERRABLE)) {
> +		call_single_data_t *c;
> +
> +		c = per_cpu_ptr(&raise_timer_csd, base->cpu);
> +
> +		/* Make sure bitmap updates are visible on remote CPUs */
> +		smp_wmb();
> +		smp_call_function_single_async(base->cpu, c);
> +	}
> +
>  out_unlock:
>  	raw_spin_unlock_irqrestore(&base->lock, flags);
>

Could you please explain me, why you decided to use the above
implementation for raising the timer softirq after modifying a timer?

Thanks,

	Anna-Maria


^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [patch 2/3] timers: do not raise softirq unconditionally (spinlockless version)
  2019-04-15 20:12 ` [patch 2/3] timers: do not raise softirq unconditionally (spinlockless version) Marcelo Tosatti
@ 2019-05-29 14:53   ` Anna-Maria Gleixner
  2019-05-30 20:14     ` Marcelo Tosatti
  2019-06-04  6:29   ` Peter Xu
  1 sibling, 1 reply; 18+ messages in thread
From: Anna-Maria Gleixner @ 2019-05-29 14:53 UTC (permalink / raw)
  To: Marcelo Tosatti
  Cc: linux-kernel, linux-rt-users, Thomas Gleixner,
	Daniel Bristot de Oliveira, Luiz Capitulino, Haris Okanovic

On Mon, 15 Apr 2019, Marcelo Tosatti wrote:

> Check base->pending_map locklessly and skip raising timer softirq 
> if empty.
> 
> What allows the lockless (and potentially racy against mod_timer) 
> check is that mod_timer will raise another timer softirq after
> modifying base->pending_map.

The raise of the timer softirq after adding the timer is done
unconditionally - so there are timer softirqs raised which are not required
at all, as mentioned before.

This check is for !CONFIG_PREEMPT_RT_FULL only implemented. The commit
message totally igonres that you are implementing something
CONFIG_PREEMPT_RT_FULL dependent as well.

> Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
> 
> ---
>  kernel/time/timer.c |   18 ++++++++++++++++++
>  1 file changed, 18 insertions(+)
> 
> Index: linux-rt-devel/kernel/time/timer.c
> ===================================================================
> --- linux-rt-devel.orig/kernel/time/timer.c	2019-04-15 14:21:02.788704354 -0300
> +++ linux-rt-devel/kernel/time/timer.c	2019-04-15 14:22:56.755047354 -0300
> @@ -1776,6 +1776,24 @@
>  		if (time_before(jiffies, base->clk))
>  			return;
>  	}
> +
> +#ifdef CONFIG_PREEMPT_RT_FULL
> +/* On RT, irq work runs from softirq */
> +	if (irq_work_needs_cpu())
> +		goto raise;

So with this patch and the change you made in the patch before, timers on
RT are expired only when there is pending irq work or after modifying a
timer on a non housekeeping cpu?

With your patches I could create the following problematic situation on RT
(if I understood everything properly): I add a timer which should expire in
50 jiffies to the wheel of a non housekeeping cpu. So it ends up 50 buckets
away form now in the first wheel. This timer is the only timer in the wheel
and the next timer softirq raise is required in 50 jiffies. After adding
the timer, the timer interrupt is raised, and no timer has to be expired,
because there is no timer pending. If there is no irq work required during
the next 51 jiffies and also no timer changed, the timer I added, will not
expire in time. The timer_base will come out of idle but will not forward
the base clk. This makes it even worse: When then adding a timer, the timer
base is forwarded - but without checking for the next pending timer, so the
first added timer will be delayed even more.

So your implementation lacks forwarding the timer_base->clk when timer_base
comes out of idle with respect to the next pending timer.


> +#endif
> +	base = this_cpu_ptr(&timer_bases[BASE_STD]);
> +	if (!housekeeping_cpu(base->cpu, HK_FLAG_TIMER)) {
> +		if (!bitmap_empty(base->pending_map, WHEEL_SIZE))
> +			goto raise;
> +		base++;
> +		if (!bitmap_empty(base->pending_map, WHEEL_SIZE))
> +			goto raise;
> +
> +		return;
> +	}
> +
> +raise:
>  	raise_softirq(TIMER_SOFTIRQ);
>  }
>  
>

Thanks,

	Anna-Maria


^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [patch 1/3] timers: raise timer softirq on __mod_timer/add_timer_on
  2019-05-29 14:53   ` Anna-Maria Gleixner
@ 2019-05-30 19:23     ` Marcelo Tosatti
  0 siblings, 0 replies; 18+ messages in thread
From: Marcelo Tosatti @ 2019-05-30 19:23 UTC (permalink / raw)
  To: Anna-Maria Gleixner
  Cc: linux-kernel, linux-rt-users, Thomas Gleixner,
	Daniel Bristot de Oliveira, Luiz Capitulino, Haris Okanovic

Hi Anna-Maria,

On Wed, May 29, 2019 at 04:53:05PM +0200, Anna-Maria Gleixner wrote:
> On Mon, 15 Apr 2019, Marcelo Tosatti wrote:
> 
> [...]
> 
> > The patch "timers: do not raise softirq unconditionally" from Thomas
> > attempts to address that by checking, in the sched tick, whether its
> > necessary to raise the timer softirq. 

https://lore.kernel.org/patchwork/patch/446045/

>> Unfortunately, it attempts to grab
> > the tvec base spinlock which generates the issue described in the patch
> > "Revert "timers: do not raise softirq unconditionally"".

https://lore.kernel.org/patchwork/patch/552474/

> Both patches are not available in the version your patch set is based
> on. Better pointers would be helpful.

See above.

> 
> > tvec_base->lock protects addition of timers to the wheel versus
> > timer interrupt execution.
> 
> The timer_base->lock (formally known as tvec_base->lock), synchronizes all
> accesses to timer_base and not only addition of timers versus timer
> interrupt execution. Deletion of timers, getting the next timer interrupt,
> forwarding the base clock and migration of timers are protected as well by
> timer_base->lock.

Right.

> > This patch does not grab the tvec base spinlock from irq context,
> > but rather performs a lockless access to base->pending_map.
> 
> I cannot see where this patch performs a lockless access to
> timer_base->pending_map.

[patch 2/3] timers: do not raise softirq unconditionally (spinlockless
version)

> > It handles the the race between timer addition and timer interrupt
> > execution by unconditionally (in case of isolated CPUs) raising the
> > timer softirq after making sure the updated bitmap is visible 
> > on remote CPUs.
> 
> So after modifying a timer on a non housekeeping timer base, the timer
> softirq is raised - even if there is no pending timer in the next
> bucket. Only with this patch, this shouldn't be a problem - but it is an
> additional raise of timer softirq and an overhead when adding a timer,
> because the normal timer softirq is raised from sched tick anyway.

It should be clear why this is necessary when reading

[patch 2/3] timers: do not raise softirq unconditionally (spinlockless
version)

> 
> > Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
> > 
> > ---
> >  kernel/time/timer.c |   38 ++++++++++++++++++++++++++++++++++++++
> >  1 file changed, 38 insertions(+)
> > 
> > Index: linux-rt-devel/kernel/time/timer.c
> > ===================================================================
> > --- linux-rt-devel.orig/kernel/time/timer.c	2019-04-15 13:56:06.974210992 -0300
> > +++ linux-rt-devel/kernel/time/timer.c	2019-04-15 14:21:02.788704354 -0300
> > @@ -1056,6 +1063,17 @@
> >  		internal_add_timer(base, timer);
> >  	}
> >  
> > +	if (!housekeeping_cpu(base->cpu, HK_FLAG_TIMER) &&
> > +	    !(timer->flags & TIMER_DEFERRABLE)) {
> > +		call_single_data_t *c;
> > +
> > +		c = per_cpu_ptr(&raise_timer_csd, base->cpu);
> > +
> > +		/* Make sure bitmap updates are visible on remote CPUs */
> > +		smp_wmb();
> > +		smp_call_function_single_async(base->cpu, c);
> > +	}
> > +
> >  out_unlock:
> >  	raw_spin_unlock_irqrestore(&base->lock, flags);
> >
> 
> Could you please explain me, why you decided to use the above
> implementation for raising the timer softirq after modifying a timer?

Because of the following race condition which is open after

"[patch 2/3] timers: do not raise softirq unconditionally (spinlockless
version)": 


CPU-0				CPU-1

				jiffies=99
runs
add_timer_on, with 
timer->expires=100
				jiffies=100
				run_softirq(), sees pending bitmap clear

add_timer_on 
returns and 
timer was not executed

P)


This race did not exist before. 

So by raising a softirq on the remote CPU 
at point P), its ensured the timer will 
be executed ASAP.



^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [patch 0/3] do not raise timer softirq unconditionally (spinlockless version)
  2019-05-29 14:52 ` Anna-Maria Gleixner
@ 2019-05-30 19:38   ` Marcelo Tosatti
  0 siblings, 0 replies; 18+ messages in thread
From: Marcelo Tosatti @ 2019-05-30 19:38 UTC (permalink / raw)
  To: Anna-Maria Gleixner
  Cc: linux-kernel, linux-rt-users, Thomas Gleixner,
	Daniel Bristot de Oliveira, Luiz Capitulino, Haris Okanovic

On Wed, May 29, 2019 at 04:52:37PM +0200, Anna-Maria Gleixner wrote:
> Hi,
> 
> I had a look at the queue and have several questions about your
> implementation. 
> 
> First of all, I had some troubles to understand your commit messages. So I
> first had to read the code and then tried to understand the commit
> messages. It is easier, if it works the other way round.

Right. Commit message seemed descriptive to me, but i should probably
try to improve the explanation in the commit message.

> On Mon, 15 Apr 2019, Marcelo Tosatti wrote:
> 
> > For isolated CPUs, we'd like to skip awakening ktimersoftd
> > (the switch to and then back from ktimersoftd takes 10us in
> > virtualized environments, in addition to other OS overhead,
> > which exceeds telco requirements for packet forwarding for
> > 5G) from the sched tick.
> 
> You would like to prevent raising the timer softirq in general from the
> sched tick for isolated CPUs? Or you would like to prevent raising the
> timer softirq if no pending timer is available?

With DPDK and CPU isolation, there are no low resolution timers running on
the isolated CPUs. So prevent raising timer softirq if no pending timer 
is available is enough.

> Nevertheless, this change is not PREEMPT_RT specific. It is a NOHZ
> dependand change. So it would be nice, if the queue is against
> mainline. But please correct me, if I'm wrong.

Since it was based on the idea of 

https://lore.kernel.org/patchwork/patch/446045/

Which was an -RT patch, i decided to submit it against the -RT kernel.

But it can be merged through the mainline kernel queue as well, 
i believe.

> [...]
> 
> > This patchset reduces cyclictest latency from 25us to 14us
> > on my testbox. 
> > 
> 
> A lot of information is missing: How does your environment looks like for
> this test, what is your workload,...?

x86 hardware, with KVM virtualized, running -RT kernel on both host
and guest. Relevant host kernel command line:

isolcpus=1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31,14
intel_pstate=disable nosoftlockup nohz=on
nohz_full=1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31,14
rcu_nocbs=1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31,14

Relevant guest kernel command line:
isolcpus=1 nosoftlockup nohz=on nohz_full=1
rcu_nocbs=1

And workload in question is cyclictest (the production
software is DPDK, but cyclictest is used to gather 
latency data).

> Did you also run other tests?

Yes, have a custom module stress test to attempt to hit the race.
Daniel (CC'ed) ran it under a kernel with debugging enabled, 
with no apparent problems.

So one downside of this patch is that raises the softirq 
unnecessarily sometimes. However, its impact is reduced to
isolated CPUs.






^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [patch 2/3] timers: do not raise softirq unconditionally (spinlockless version)
  2019-05-29 14:53   ` Anna-Maria Gleixner
@ 2019-05-30 20:14     ` Marcelo Tosatti
  2019-05-31 11:55       ` Anna-Maria Gleixner
  0 siblings, 1 reply; 18+ messages in thread
From: Marcelo Tosatti @ 2019-05-30 20:14 UTC (permalink / raw)
  To: Anna-Maria Gleixner
  Cc: linux-kernel, linux-rt-users, Thomas Gleixner,
	Daniel Bristot de Oliveira, Luiz Capitulino, Haris Okanovic

On Wed, May 29, 2019 at 04:53:26PM +0200, Anna-Maria Gleixner wrote:
> On Mon, 15 Apr 2019, Marcelo Tosatti wrote:
> 
> > Check base->pending_map locklessly and skip raising timer softirq 
> > if empty.
> > 
> > What allows the lockless (and potentially racy against mod_timer) 
> > check is that mod_timer will raise another timer softirq after
> > modifying base->pending_map.
> 
> The raise of the timer softirq after adding the timer is done
> unconditionally - so there are timer softirqs raised which are not required
> at all, as mentioned before.

Yes. However i can't see a way to avoid that: its not possible to know
if the race described earlier happened or not. 

Do you have a suggestion on how to avoid this or a way to avoid
the IPI+raise softirq ?

> This check is for !CONFIG_PREEMPT_RT_FULL only implemented. The commit
> message totally igonres that you are implementing something
> CONFIG_PREEMPT_RT_FULL dependent as well.
> 
> > Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
> > 
> > ---
> >  kernel/time/timer.c |   18 ++++++++++++++++++
> >  1 file changed, 18 insertions(+)
> > 
> > Index: linux-rt-devel/kernel/time/timer.c
> > ===================================================================
> > --- linux-rt-devel.orig/kernel/time/timer.c	2019-04-15 14:21:02.788704354 -0300
> > +++ linux-rt-devel/kernel/time/timer.c	2019-04-15 14:22:56.755047354 -0300
> > @@ -1776,6 +1776,24 @@
> >  		if (time_before(jiffies, base->clk))
> >  			return;
> >  	}
> > +
> > +#ifdef CONFIG_PREEMPT_RT_FULL
> > +/* On RT, irq work runs from softirq */
> > +	if (irq_work_needs_cpu())
> > +		goto raise;
> 
> So with this patch and the change you made in the patch before, timers on
> RT are expired only when there is pending irq work or after modifying a
> timer on a non housekeeping cpu?

Well, run_timer_softirq execute only if pending_map contains a bit set.

> With your patches I could create the following problematic situation on RT
> (if I understood everything properly): I add a timer which should expire in
> 50 jiffies to the wheel of a non housekeeping cpu. So it ends up 50 buckets
> away form now in the first wheel. This timer is the only timer in the wheel
> and the next timer softirq raise is required in 50 jiffies. After adding
> the timer, the timer interrupt is raised, and no timer has to be expired,
> because there is no timer pending.

But the softirq will be raised, because pending_map will be set:

+               if (!bitmap_empty(base->pending_map, WHEEL_SIZE))
+                       goto raise;

No?

>  If there is no irq work required during
> the next 51 jiffies and also no timer changed, the timer I added, will not
> expire in time. The timer_base will come out of idle but will not forward
> the base clk. 

> This makes it even worse: When then adding a timer, the timer
> base is forwarded - but without checking for the next pending timer, so the
> first added timer will be delayed even more.
> 
> So your implementation lacks forwarding the timer_base->clk when timer_base
> comes out of idle with respect to the next pending timer.

> > +#endif
> > +	base = this_cpu_ptr(&timer_bases[BASE_STD]);
> > +	if (!housekeeping_cpu(base->cpu, HK_FLAG_TIMER)) {
> > +		if (!bitmap_empty(base->pending_map, WHEEL_SIZE))
> > +			goto raise;
> > +		base++;
> > +		if (!bitmap_empty(base->pending_map, WHEEL_SIZE))
> > +			goto raise;
> > +
> > +		return;
> > +	}
> > +
> > +raise:
> >  	raise_softirq(TIMER_SOFTIRQ);
> >  }
> >  
> >
> 
> Thanks,
> 
> 	Anna-Maria

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [patch 2/3] timers: do not raise softirq unconditionally (spinlockless version)
  2019-05-30 20:14     ` Marcelo Tosatti
@ 2019-05-31 11:55       ` Anna-Maria Gleixner
  2019-06-11 11:45         ` Anna-Maria Gleixner
  0 siblings, 1 reply; 18+ messages in thread
From: Anna-Maria Gleixner @ 2019-05-31 11:55 UTC (permalink / raw)
  To: Marcelo Tosatti
  Cc: linux-kernel, linux-rt-users, Thomas Gleixner,
	Daniel Bristot de Oliveira, Luiz Capitulino, Haris Okanovic

On Thu, 30 May 2019, Marcelo Tosatti wrote:

> On Wed, May 29, 2019 at 04:53:26PM +0200, Anna-Maria Gleixner wrote:
> > On Mon, 15 Apr 2019, Marcelo Tosatti wrote:
> > 
> > > --- linux-rt-devel.orig/kernel/time/timer.c	2019-04-15 14:21:02.788704354 -0300
> > > +++ linux-rt-devel/kernel/time/timer.c	2019-04-15 14:22:56.755047354 -0300
> > > @@ -1776,6 +1776,24 @@
> > >  		if (time_before(jiffies, base->clk))
> > >  			return;
> > >  	}
> > > +
> > > +#ifdef CONFIG_PREEMPT_RT_FULL
> > > +/* On RT, irq work runs from softirq */
> > > +	if (irq_work_needs_cpu())
> > > +		goto raise;
> > 
> > So with this patch and the change you made in the patch before, timers on
> > RT are expired only when there is pending irq work or after modifying a
> > timer on a non housekeeping cpu?
> 
> Well, run_timer_softirq execute only if pending_map contains a bit set.
> 
> > With your patches I could create the following problematic situation on RT
> > (if I understood everything properly): I add a timer which should expire in
> > 50 jiffies to the wheel of a non housekeeping cpu. So it ends up 50 buckets
> > away form now in the first wheel. This timer is the only timer in the wheel
> > and the next timer softirq raise is required in 50 jiffies. After adding
> > the timer, the timer interrupt is raised, and no timer has to be expired,
> > because there is no timer pending.
> 
> But the softirq will be raised, because pending_map will be set:
> 
> +               if (!bitmap_empty(base->pending_map, WHEEL_SIZE))
> +                       goto raise;
> 
> No?

I'm sorry! I read the #endif of the CONFIG_PREEMPT_RT_FULL section as an
#else... This is where my confusion comes from. I will think about the
problem and your solution a little bit more and give you feedback hopefully
on monday.

Thanks,
	Anna-Maria


^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [patch 2/3] timers: do not raise softirq unconditionally (spinlockless version)
  2019-04-15 20:12 ` [patch 2/3] timers: do not raise softirq unconditionally (spinlockless version) Marcelo Tosatti
  2019-05-29 14:53   ` Anna-Maria Gleixner
@ 2019-06-04  6:29   ` Peter Xu
  2019-06-06 15:14     ` Marcelo Tosatti
  1 sibling, 1 reply; 18+ messages in thread
From: Peter Xu @ 2019-06-04  6:29 UTC (permalink / raw)
  To: Marcelo Tosatti
  Cc: linux-kernel, linux-rt-users, Thomas Gleixner,
	Anna-Maria Gleixner, Daniel Bristot de Oliveira, Luiz Capitulino,
	Haris Okanovic

On Mon, Apr 15, 2019 at 05:12:15PM -0300, Marcelo Tosatti wrote:
> Check base->pending_map locklessly and skip raising timer softirq 
> if empty.
> 
> What allows the lockless (and potentially racy against mod_timer) 
> check is that mod_timer will raise another timer softirq after
> modifying base->pending_map.
> 
> Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
> 
> ---
>  kernel/time/timer.c |   18 ++++++++++++++++++
>  1 file changed, 18 insertions(+)
> 
> Index: linux-rt-devel/kernel/time/timer.c
> ===================================================================
> --- linux-rt-devel.orig/kernel/time/timer.c	2019-04-15 14:21:02.788704354 -0300
> +++ linux-rt-devel/kernel/time/timer.c	2019-04-15 14:22:56.755047354 -0300
> @@ -1776,6 +1776,24 @@
>  		if (time_before(jiffies, base->clk))
>  			return;
>  	}
> +
> +#ifdef CONFIG_PREEMPT_RT_FULL
> +/* On RT, irq work runs from softirq */
> +	if (irq_work_needs_cpu())
> +		goto raise;
> +#endif
> +	base = this_cpu_ptr(&timer_bases[BASE_STD]);
> +	if (!housekeeping_cpu(base->cpu, HK_FLAG_TIMER)) {
> +		if (!bitmap_empty(base->pending_map, WHEEL_SIZE))
> +			goto raise;
> +		base++;

Shall we check against CONFIG_NO_HZ_COMMON?  Otherwise the base could
point to something else rather tha the deferred base (NR_BASES==1 if
without nohz-common).

I see that run_local_timers() has similar pattern, actually I'm
thinking whether we can put things like "base++" to be inside some
"if"s of CONFIG_NO_HZ_COMMON to be clear.

Thanks,

-- 
Peter Xu

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [patch 2/3] timers: do not raise softirq unconditionally (spinlockless version)
  2019-06-04  6:29   ` Peter Xu
@ 2019-06-06 15:14     ` Marcelo Tosatti
  0 siblings, 0 replies; 18+ messages in thread
From: Marcelo Tosatti @ 2019-06-06 15:14 UTC (permalink / raw)
  To: Peter Xu
  Cc: linux-kernel, linux-rt-users, Thomas Gleixner,
	Anna-Maria Gleixner, Daniel Bristot de Oliveira, Luiz Capitulino,
	Haris Okanovic

On Tue, Jun 04, 2019 at 02:29:31PM +0800, Peter Xu wrote:
> On Mon, Apr 15, 2019 at 05:12:15PM -0300, Marcelo Tosatti wrote:
> > Check base->pending_map locklessly and skip raising timer softirq 
> > if empty.
> > 
> > What allows the lockless (and potentially racy against mod_timer) 
> > check is that mod_timer will raise another timer softirq after
> > modifying base->pending_map.
> > 
> > Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
> > 
> > ---
> >  kernel/time/timer.c |   18 ++++++++++++++++++
> >  1 file changed, 18 insertions(+)
> > 
> > Index: linux-rt-devel/kernel/time/timer.c
> > ===================================================================
> > --- linux-rt-devel.orig/kernel/time/timer.c	2019-04-15 14:21:02.788704354 -0300
> > +++ linux-rt-devel/kernel/time/timer.c	2019-04-15 14:22:56.755047354 -0300
> > @@ -1776,6 +1776,24 @@
> >  		if (time_before(jiffies, base->clk))
> >  			return;
> >  	}
> > +
> > +#ifdef CONFIG_PREEMPT_RT_FULL
> > +/* On RT, irq work runs from softirq */
> > +	if (irq_work_needs_cpu())
> > +		goto raise;
> > +#endif
> > +	base = this_cpu_ptr(&timer_bases[BASE_STD]);
> > +	if (!housekeeping_cpu(base->cpu, HK_FLAG_TIMER)) {
> > +		if (!bitmap_empty(base->pending_map, WHEEL_SIZE))
> > +			goto raise;
> > +		base++;
> 
> Shall we check against CONFIG_NO_HZ_COMMON?  Otherwise the base could
> point to something else rather tha the deferred base (NR_BASES==1 if
> without nohz-common).
> 
> I see that run_local_timers() has similar pattern, actually I'm
> thinking whether we can put things like "base++" to be inside some
> "if"s of CONFIG_NO_HZ_COMMON to be clear.
> 
> Thanks,
> 
> -- 
> Peter Xu

Hi Peter,

Yes, that is a good point and needs fixing. 

I'll wait for Anna's comments before posting -v2.


^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [patch 2/3] timers: do not raise softirq unconditionally (spinlockless version)
  2019-05-31 11:55       ` Anna-Maria Gleixner
@ 2019-06-11 11:45         ` Anna-Maria Gleixner
  0 siblings, 0 replies; 18+ messages in thread
From: Anna-Maria Gleixner @ 2019-06-11 11:45 UTC (permalink / raw)
  To: Marcelo Tosatti
  Cc: linux-kernel, linux-rt-users, Thomas Gleixner,
	Daniel Bristot de Oliveira, Luiz Capitulino, Haris Okanovic

On Fri, 31 May 2019, Anna-Maria Gleixner wrote:

[...]
> I will think about the problem and your solution a little bit more and
> give you feedback hopefully on monday.

I'm sorry for the delay. But now I'm able to give you a detailed feedback:

The general problem is, that your solution is customized to a single
use-case: preventing softirq raise but only if there is _no_ timer
pending. To reach this goal without using locks, overhead is added to the
formerly optimized add/mod path of a timer. With your code the timer
softirq is raised even when there is a pending timer which does not has to
be expired right now. But there have been requests in the past for this use
case already.

I discussed with Thomas several approaches during the last week how to
solve the unconditional softirq timer raise in a more general way without
loosing the fast add/mod path of a timer. The approach which seems to be
the best has a dependency on a timer code change from a push to pull model
which is still under developement (see v2 patchset:
http://lkml.kernel.org/r/20170418111102.490432548@linutronix.de). The
patchset v2 has several problems but we are working on a solution for those
problems right now. When the timer pull model is in place the approach to
solve the unconditional timer softirq raise could look like the following:

---8<---
The next_expiry value of timer_base struct is used to store the next expiry
value even if timer_base is not idle. Therefore it is udpated after adding
or modifying a timer and also at the end of timer softirq. In case timer
softirq does not has to be raised, the timer_base->clk is incremented to
prevent stale clocks. Checking whether timer softirq has to be raised
cannot be done lockless.

This code is not compile tested nor boot tested.

---
 kernel/time/timer.c |   60 +++++++++++++++++++++++++++++++---------------------
 1 file changed, 36 insertions(+), 24 deletions(-)

--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -552,37 +552,32 @@ static void
 static void
 trigger_dyntick_cpu(struct timer_base *base, struct timer_list *timer)
 {
-	if (!is_timers_nohz_active())
-		return;
-
-	/*
-	 * TODO: This wants some optimizing similar to the code below, but we
-	 * will do that when we switch from push to pull for deferrable timers.
-	 */
-	if (timer->flags & TIMER_DEFERRABLE) {
-		if (tick_nohz_full_cpu(base->cpu))
-			wake_up_nohz_cpu(base->cpu);
-		return;
+	if (is_timers_nohz_active()) {
+		/*
+		 * TODO: This wants some optimizing similar to the code
+		 * below, but we will do that when we switch from push to
+		 * pull for deferrable timers.
+		 */
+		if (timer->flags & TIMER_DEFERRABLE) {
+			if (tick_nohz_full_cpu(base->cpu))
+				wake_up_nohz_cpu(base->cpu);
+			return;
+		}
 	}
 
-	/*
-	 * We might have to IPI the remote CPU if the base is idle and the
-	 * timer is not deferrable. If the other CPU is on the way to idle
-	 * then it can't set base->is_idle as we hold the base lock:
-	 */
-	if (!base->is_idle)
-		return;
-
 	/* Check whether this is the new first expiring timer: */
 	if (time_after_eq(timer->expires, base->next_expiry))
 		return;
+	/* Update next expiry time */
+	base->next_expiry = timer->expires;
 
 	/*
-	 * Set the next expiry time and kick the CPU so it can reevaluate the
-	 * wheel:
+	 * We might have to IPI the remote CPU if the base is idle and the
+	 * timer is not deferrable. If the other CPU is on the way to idle
+	 * then it can't set base->is_idle as we hold the base lock:
 	 */
-	base->next_expiry = timer->expires;
-	wake_up_nohz_cpu(base->cpu);
+	if (is_timers_nohz_active() && base->is_idle)
+		wake_up_nohz_cpu(base->cpu);
 }
 
 static void
@@ -1684,6 +1679,7 @@ static inline void __run_timers(struct t
 		while (levels--)
 			expire_timers(base, heads + levels);
 	}
+	base->next_expiry = __next_timer_interrupt(base);
 	base->running_timer = NULL;
 	raw_spin_unlock_irq(&base->lock);
 }
@@ -1716,8 +1712,24 @@ void run_local_timers(void)
 		base++;
 		if (time_before(jiffies, base->clk))
 			return;
+		base--;
+	}
+
+	/*
+	 * check for next expiry
+	 *
+	 * deferrable base is igonred here - it is only usable when
+	 * switching from push to pull model for deferrable timers
+	 */
+	raw_spin_lock_irq(&base->lock);
+	if (base->clk == base->next_expiry) {
+		raw_spin_unlock_irq(&base->lock);
+		raise_softirq(TIMER_SOFTIRQ);
+	} else {
+		base->clk++;
+		raw_spin_unlock_irq(&base->lock);
+		return;
 	}
-	raise_softirq(TIMER_SOFTIRQ);
 }
 
 /*
---8<---


Thanks,

	Anna-Maria

^ permalink raw reply	[flat|nested] 18+ messages in thread

end of thread, other threads:[~2019-06-11 11:45 UTC | newest]

Thread overview: 18+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2019-04-15 20:12 [patch 0/3] do not raise timer softirq unconditionally (spinlockless version) Marcelo Tosatti
2019-04-15 20:12 ` [patch 1/3] timers: raise timer softirq on __mod_timer/add_timer_on Marcelo Tosatti
2019-05-29 14:53   ` Anna-Maria Gleixner
2019-05-30 19:23     ` Marcelo Tosatti
2019-04-15 20:12 ` [patch 2/3] timers: do not raise softirq unconditionally (spinlockless version) Marcelo Tosatti
2019-05-29 14:53   ` Anna-Maria Gleixner
2019-05-30 20:14     ` Marcelo Tosatti
2019-05-31 11:55       ` Anna-Maria Gleixner
2019-06-11 11:45         ` Anna-Maria Gleixner
2019-06-04  6:29   ` Peter Xu
2019-06-06 15:14     ` Marcelo Tosatti
2019-04-15 20:12 ` [patch 3/3] timers: condense pending bitmap information Marcelo Tosatti
2019-04-15 20:17 ` [patch 0/3] do not raise timer softirq unconditionally (spinlockless version) Marcelo Tosatti
2019-05-06  3:22 ` Marcelo Tosatti
2019-05-06  7:17   ` Daniel Bristot de Oliveira
2019-05-06  9:22   ` Thomas Gleixner
2019-05-29 14:52 ` Anna-Maria Gleixner
2019-05-30 19:38   ` Marcelo Tosatti

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).