linux-kernel.vger.kernel.org archive mirror
* [patch 01/20] timer: Make pinned a timer property
  2016-06-13  8:40 [patch 00/20] timer: Refactor the timer wheel Thomas Gleixner
@ 2016-06-13  8:40 ` Thomas Gleixner
  2016-06-13  8:40 ` [patch 02/20] x86/apic/uv: Initialize timer as pinned Thomas Gleixner
                   ` (20 subsequent siblings)
  21 siblings, 0 replies; 62+ messages in thread
From: Thomas Gleixner @ 2016-06-13  8:40 UTC (permalink / raw)
  To: LKML
  Cc: Ingo Molnar, Peter Zijlstra, Paul E. McKenney, Eric Dumazet,
	Frederic Weisbecker, Chris Mason, Arjan van de Ven, rt


We want to move the timer migration from a push to a pull model. This
requires storing the pinned attribute of a timer in the timer itself, and it
must happen at initialization time.

Add the helper macros for this.
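
As an illustration of the intended usage (not part of the patch; the timer
and callback names below are made up), a caller that previously relied on
mod_timer_pinned() declares the pinned attribute once at init time and then
uses plain mod_timer():

	/* Illustrative sketch: my_timer/my_timer_fn are hypothetical */
	static void my_timer_fn(unsigned long data)
	{
		/* timer work */
	}

	static struct timer_list my_timer;

	/* The TIMER_PINNED flag is stored in the timer itself ... */
	setup_pinned_timer(&my_timer, my_timer_fn, 0);

	/* ... so a plain mod_timer() keeps it on the queueing CPU */
	mod_timer(&my_timer, jiffies + HZ);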

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/timer.h |   25 ++++++++++++++++++++++---
 kernel/time/timer.c   |   10 +++++-----
 2 files changed, 27 insertions(+), 8 deletions(-)

Index: b/include/linux/timer.h
===================================================================
--- a/include/linux/timer.h
+++ b/include/linux/timer.h
@@ -62,7 +62,8 @@ struct timer_list {
 #define TIMER_MIGRATING		0x00080000
 #define TIMER_BASEMASK		(TIMER_CPUMASK | TIMER_MIGRATING)
 #define TIMER_DEFERRABLE	0x00100000
-#define TIMER_IRQSAFE		0x00200000
+#define TIMER_PINNED		0x00200000
+#define TIMER_IRQSAFE		0x00400000
 
 #define __TIMER_INITIALIZER(_function, _expires, _data, _flags) { \
 		.entry = { .next = TIMER_ENTRY_STATIC },	\
@@ -78,9 +79,15 @@ struct timer_list {
 #define TIMER_INITIALIZER(_function, _expires, _data)		\
 	__TIMER_INITIALIZER((_function), (_expires), (_data), 0)
 
+#define TIMER_PINNED_INITIALIZER(_function, _expires, _data)	\
+	__TIMER_INITIALIZER((_function), (_expires), (_data), TIMER_PINNED)
+
 #define TIMER_DEFERRED_INITIALIZER(_function, _expires, _data)	\
 	__TIMER_INITIALIZER((_function), (_expires), (_data), TIMER_DEFERRABLE)
 
+#define TIMER_PINNED_DEFERRED_INITIALIZER(_function, _expires, _data)	\
+	__TIMER_INITIALIZER((_function), (_expires), (_data), TIMER_DEFERRABLE | TIMER_PINNED)
+
 #define DEFINE_TIMER(_name, _function, _expires, _data)		\
 	struct timer_list _name =				\
 		TIMER_INITIALIZER(_function, _expires, _data)
@@ -124,8 +131,12 @@ static inline void init_timer_on_stack_k
 
 #define init_timer(timer)						\
 	__init_timer((timer), 0)
+#define init_timer_pinned(timer)					\
+	__init_timer((timer), TIMER_PINNED)
 #define init_timer_deferrable(timer)					\
 	__init_timer((timer), TIMER_DEFERRABLE)
+#define init_timer_pinned_deferrable(timer)				\
+	__init_timer((timer), TIMER_DEFERRABLE | TIMER_PINNED)
 #define init_timer_on_stack(timer)					\
 	__init_timer_on_stack((timer), 0)
 
@@ -145,12 +156,20 @@ static inline void init_timer_on_stack_k
 
 #define setup_timer(timer, fn, data)					\
 	__setup_timer((timer), (fn), (data), 0)
+#define setup_pinned_timer(timer, fn, data)				\
+	__setup_timer((timer), (fn), (data), TIMER_PINNED)
 #define setup_deferrable_timer(timer, fn, data)				\
 	__setup_timer((timer), (fn), (data), TIMER_DEFERRABLE)
+#define setup_pinned_deferrable_timer(timer, fn, data)			\
+	__setup_timer((timer), (fn), (data), TIMER_DEFERRABLE | TIMER_PINNED)
 #define setup_timer_on_stack(timer, fn, data)				\
 	__setup_timer_on_stack((timer), (fn), (data), 0)
+#define setup_pinned_timer_on_stack(timer, fn, data)			\
+	__setup_timer_on_stack((timer), (fn), (data), TIMER_PINNED)
 #define setup_deferrable_timer_on_stack(timer, fn, data)		\
 	__setup_timer_on_stack((timer), (fn), (data), TIMER_DEFERRABLE)
+#define setup_pinned_deferrable_timer_on_stack(timer, fn, data)		\
+	__setup_timer_on_stack((timer), (fn), (data), TIMER_DEFERRABLE | TIMER_PINNED)
 
 /**
  * timer_pending - is a timer pending?
@@ -175,8 +194,8 @@ extern int mod_timer_pinned(struct timer
 
 extern void set_timer_slack(struct timer_list *time, int slack_hz);
 
-#define TIMER_NOT_PINNED	0
-#define TIMER_PINNED		1
+#define MOD_TIMER_NOT_PINNED	0
+#define MOD_TIMER_PINNED	1
 /*
  * The jiffies value which is added to now, when there is no timer
  * in the timer wheel:
Index: b/kernel/time/timer.c
===================================================================
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -782,7 +782,7 @@ static inline int
 
 	debug_activate(timer, expires);
 
-	new_base = get_target_base(base, pinned);
+	new_base = get_target_base(base, pinned || timer->flags & TIMER_PINNED);
 
 	if (base != new_base) {
 		/*
@@ -825,7 +825,7 @@ static inline int
  */
 int mod_timer_pending(struct timer_list *timer, unsigned long expires)
 {
-	return __mod_timer(timer, expires, true, TIMER_NOT_PINNED);
+	return __mod_timer(timer, expires, true, MOD_TIMER_NOT_PINNED);
 }
 EXPORT_SYMBOL(mod_timer_pending);
 
@@ -900,7 +900,7 @@ int mod_timer(struct timer_list *timer,
 	if (timer_pending(timer) && timer->expires == expires)
 		return 1;
 
-	return __mod_timer(timer, expires, false, TIMER_NOT_PINNED);
+	return __mod_timer(timer, expires, false, MOD_TIMER_NOT_PINNED);
 }
 EXPORT_SYMBOL(mod_timer);
 
@@ -928,7 +928,7 @@ int mod_timer_pinned(struct timer_list *
 	if (timer->expires == expires && timer_pending(timer))
 		return 1;
 
-	return __mod_timer(timer, expires, false, TIMER_PINNED);
+	return __mod_timer(timer, expires, false, MOD_TIMER_PINNED);
 }
 EXPORT_SYMBOL(mod_timer_pinned);
 
@@ -1512,7 +1512,7 @@ signed long __sched schedule_timeout(sig
 	expire = timeout + jiffies;
 
 	setup_timer_on_stack(&timer, process_timeout, (unsigned long)current);
-	__mod_timer(&timer, expire, false, TIMER_NOT_PINNED);
+	__mod_timer(&timer, expire, false, MOD_TIMER_NOT_PINNED);
 	schedule();
 	del_singleshot_timer_sync(&timer);
 


* [patch 00/20] timer: Refactor the timer wheel
@ 2016-06-13  8:40 Thomas Gleixner
  2016-06-13  8:40 ` [patch 01/20] timer: Make pinned a timer property Thomas Gleixner
                   ` (21 more replies)
  0 siblings, 22 replies; 62+ messages in thread
From: Thomas Gleixner @ 2016-06-13  8:40 UTC (permalink / raw)
  To: LKML
  Cc: Ingo Molnar, Peter Zijlstra, Paul E. McKenney, Eric Dumazet,
	Frederic Weisbecker, Chris Mason, Arjan van de Ven, rt

The current timer wheel has some drawbacks:

1) Cascading

   Cascading can be an unbounded operation and is completely pointless in most
   cases because the vast majority of the timer wheel timers are canceled or
   rearmed before expiration.

2) No fast lookup of the next expiring timer

   In NOHZ scenarios the first timer soft interrupt after a long NOHZ period
   must fast-forward the base time to the current jiffies value. As we have no
   fast way to find the next expiring timer, the code loops, incrementing the
   base time by one and checking for expired timers in each step. I've
   observed loops lasting 1 ms!

There are some other issues caused by the above, but they are minor compared
to those.

After a thorough analysis of real world data gathered on laptops,
workstations, webservers and other machines (thanks Chris!) I came to the
conclusion that the current 'classic' timer wheel implementation can be
modified to address the above issues.

The vast majority of timer wheel timers are canceled or rearmed before
expiry. Most of them are timeouts for networking and other I/O tasks. The
nature of timeouts is to catch the exception from normal operation (TCP ack
timed out, disk does not respond, etc.). For these kinds of timeouts the
accuracy is not really a concern. In case the timeout fires, performance is
down the drain already.

The few timers which actually expire can be split into two categories:

 1) Short expiry times which expect halfway accurate expiry

 2) Long term expiry times, which are already inaccurate today due to the
    batching which is done for NOHZ.

So for long term expiry timers we can avoid cascading entirely and just
leave them in the less granular outer wheels until expiry or cancelation.

Contrary to the classic wheel, where the granularity of the next wheel
equals the capacity of the previous one, the granularities here are, in the
currently chosen setting, 8 times the granularity of the previous level. So for
HZ=250 we end up with the following granularity levels:

Level Offset  Granularity            Range
  0   0          4 ms                 0 ms -        252 ms
  1  64         32 ms               256 ms -       2044 ms (256ms - ~2s)
  2 128        256 ms              2048 ms -      16380 ms (~2s - ~16s)
  3 192       2048 ms (~2s)       16384 ms -     131068 ms (~16s - ~2m)
  4 256      16384 ms (~16s)     131072 ms -    1048572 ms (~2m - ~17m)
  5 320     131072 ms (~2m)     1048576 ms -    8388604 ms (~17m - ~2h)

That's a worst case inaccuracy of 12.5% for the timers which are queued at the
beginning of a level, since each level's granularity is 1/8 of the smallest
expiry time that level handles.
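
As a sketch of where these numbers come from (illustrative only; the
constants mirror the LVL_SHIFT=6/LVL_CLK_SHIFT=3 choice used in the series):
each level has 64 buckets and each level's clock ticks 8 times slower than
the previous one, so a relative expiry 'delta' in jiffies selects the first
level whose capacity covers it:

	/* Illustrative helper, not taken from the series */
	static int wheel_level(unsigned long delta)
	{
		int lvl;

		/* Level lvl covers deltas up to 64 * 8^lvl jiffies. At
		 * HZ=250 (4 ms per jiffy) level 0 thus covers 0-252 ms,
		 * matching the table above. */
		for (lvl = 0; lvl < 6; lvl++) {
			if (delta < 64UL << (3 * lvl))
				return lvl;
		}
		return 5;	/* clamp to the last level */
	}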

So the new wheel concept addresses the old issues:

1) Cascading is avoided (except for timers with extremely long expiry times)

2) By keeping the timers in the bucket until expiry/cancelation we can track
   the buckets which have timers enqueued in a bucket bitmap and can therefore
   look up the next expiring timer in a fast and time-bounded manner (see the
   sketch below).
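
A minimal sketch of that lookup idea (assuming the pending_map bitmap and
LVL_SIZE constant which patch 13 introduces; the actual fast-lookup code
lands in follow-up patches):

	/* Illustrative only: with one bit per bucket, finding the first
	 * level-0 bucket with a pending timer is a bounded bitmap scan
	 * instead of advancing the base clock one jiffy at a time. */
	static int first_pending_level0_bucket(struct timer_base *base)
	{
		int pos = find_first_bit(base->pending_map, LVL_SIZE);

		return pos < LVL_SIZE ? pos : -1;
	}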

A further benefit of the concept is that the slack calculation which is done
on every timer start is no longer necessary, because the granularity levels
already provide natural batching.

Our extensive testing with various loads did not show any performance
degradation vs. the current wheel implementation.

This series also contains preparatory patches for changing the NOHZ timer
handling from the current push to a pull model. Currently we decide at timer
enqueue time on which cpu we queue the timer. This is exceptionally silly
because there is no way to predict at enqueue time which cpu will be idle
when the timer expires. Given the fact that most timers are canceled or
rearmed before expiry this is even more silly. We trade an expensive decision
and cross-cpu access for a very doubtful benefit.

The solution to this is to store the migratable timers in a separate storage
space on the local cpu; when the cpu goes idle it tells the others about its
next expiring timer and lets another cpu pull the timer in case of expiry. We
have a proof of concept implementation for this, but it's not yet ready for
posting.

Thanks,

	tglx
---
 arch/x86/kernel/apic/x2apic_uv_x.c  |    4 
 arch/x86/kernel/cpu/mcheck/mce.c    |    4 
 block/genhd.c                       |    5 
 drivers/cpufreq/powernv-cpufreq.c   |    5 
 drivers/mmc/host/jz4740_mmc.c       |    2 
 drivers/net/ethernet/tile/tilepro.c |    4 
 drivers/power/bq27xxx_battery.c     |    5 
 drivers/tty/metag_da.c              |    4 
 drivers/tty/mips_ejtag_fdc.c        |    4 
 drivers/usb/host/ohci-hcd.c         |    1 
 drivers/usb/host/xhci.c             |    2 
 include/linux/list.h                |   10 
 include/linux/timer.h               |   30 
 include/trace/events/timer.h        |   11 
 kernel/time/tick-internal.h         |    1 
 kernel/time/tick-sched.c            |   46 -
 kernel/time/timer.c                 | 1090 +++++++++++++++++++++---------------
 lib/random32.c                      |    1 
 net/ipv4/inet_connection_sock.c     |    7 
 net/ipv4/inet_timewait_sock.c       |    5 
 20 files changed, 731 insertions(+), 510 deletions(-)


* [patch 02/20] x86/apic/uv: Initialize timer as pinned
  2016-06-13  8:40 [patch 00/20] timer: Refactor the timer wheel Thomas Gleixner
  2016-06-13  8:40 ` [patch 01/20] timer: Make pinned a timer property Thomas Gleixner
@ 2016-06-13  8:40 ` Thomas Gleixner
  2016-06-13  8:40 ` [patch 03/20] x86/mce: " Thomas Gleixner
                   ` (19 subsequent siblings)
  21 siblings, 0 replies; 62+ messages in thread
From: Thomas Gleixner @ 2016-06-13  8:40 UTC (permalink / raw)
  To: LKML
  Cc: Ingo Molnar, Peter Zijlstra, Paul E. McKenney, Eric Dumazet,
	Frederic Weisbecker, Chris Mason, Arjan van de Ven, rt


Pinned timers must carry that attribute in the timer itself. No functional
change.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/kernel/apic/x2apic_uv_x.c |    4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

Index: b/arch/x86/kernel/apic/x2apic_uv_x.c
===================================================================
--- a/arch/x86/kernel/apic/x2apic_uv_x.c
+++ b/arch/x86/kernel/apic/x2apic_uv_x.c
@@ -919,7 +919,7 @@ static void uv_heartbeat(unsigned long i
 	uv_set_scir_bits(bits);
 
 	/* enable next timer period */
-	mod_timer_pinned(timer, jiffies + SCIR_CPU_HB_INTERVAL);
+	mod_timer(timer, jiffies + SCIR_CPU_HB_INTERVAL);
 }
 
 static void uv_heartbeat_enable(int cpu)
@@ -928,7 +928,7 @@ static void uv_heartbeat_enable(int cpu)
 		struct timer_list *timer = &uv_cpu_scir_info(cpu)->timer;
 
 		uv_set_cpu_scir_bits(cpu, SCIR_CPU_HEARTBEAT|SCIR_CPU_ACTIVITY);
-		setup_timer(timer, uv_heartbeat, cpu);
+		setup_pinned_timer(timer, uv_heartbeat, cpu);
 		timer->expires = jiffies + SCIR_CPU_HB_INTERVAL;
 		add_timer_on(timer, cpu);
 		uv_cpu_scir_info(cpu)->enabled = 1;


* [patch 03/20] x86/mce: Initialize timer as pinned
  2016-06-13  8:40 [patch 00/20] timer: Refactor the timer wheel Thomas Gleixner
  2016-06-13  8:40 ` [patch 01/20] timer: Make pinned a timer property Thomas Gleixner
  2016-06-13  8:40 ` [patch 02/20] x86/apic/uv: Initialize timer as pinned Thomas Gleixner
@ 2016-06-13  8:40 ` Thomas Gleixner
  2016-06-13  8:40 ` [patch 04/20] cpufreq/powernv: " Thomas Gleixner
                   ` (18 subsequent siblings)
  21 siblings, 0 replies; 62+ messages in thread
From: Thomas Gleixner @ 2016-06-13  8:40 UTC (permalink / raw)
  To: LKML
  Cc: Ingo Molnar, Peter Zijlstra, Paul E. McKenney, Eric Dumazet,
	Frederic Weisbecker, Chris Mason, Arjan van de Ven, rt


Pinned timers must carry that attribute in the timer itself. No functional
change.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/kernel/cpu/mcheck/mce.c |    4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

Index: b/arch/x86/kernel/cpu/mcheck/mce.c
===================================================================
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -1309,7 +1309,7 @@ static void __restart_timer(struct timer
 
 	if (timer_pending(t)) {
 		if (time_before(when, t->expires))
-			mod_timer_pinned(t, when);
+			mod_timer(t, when);
 	} else {
 		t->expires = round_jiffies(when);
 		add_timer_on(t, smp_processor_id());
@@ -1735,7 +1735,7 @@ static void __mcheck_cpu_init_timer(void
 	struct timer_list *t = this_cpu_ptr(&mce_timer);
 	unsigned int cpu = smp_processor_id();
 
-	setup_timer(t, mce_timer_fn, cpu);
+	setup_pinned_timer(t, mce_timer_fn, cpu);
 	mce_start_timer(cpu, t);
 }
 


* [patch 05/20] driver/net/ethernet/tile: Initialize timer as pinned
  2016-06-13  8:40 [patch 00/20] timer: Refactor the timer wheel Thomas Gleixner
                   ` (3 preceding siblings ...)
  2016-06-13  8:40 ` [patch 04/20] cpufreq/powernv: " Thomas Gleixner
@ 2016-06-13  8:40 ` Thomas Gleixner
  2016-06-13  8:40 ` [patch 06/20] drivers/tty/metag_da: " Thomas Gleixner
                   ` (16 subsequent siblings)
  21 siblings, 0 replies; 62+ messages in thread
From: Thomas Gleixner @ 2016-06-13  8:40 UTC (permalink / raw)
  To: LKML
  Cc: Ingo Molnar, Peter Zijlstra, Paul E. McKenney, Eric Dumazet,
	Frederic Weisbecker, Chris Mason, Arjan van de Ven, rt


Pinned timers must carry that attribute in the timer itself. No functional
change.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 drivers/net/ethernet/tile/tilepro.c |    4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

--- a/drivers/net/ethernet/tile/tilepro.c
+++ b/drivers/net/ethernet/tile/tilepro.c
@@ -588,7 +588,7 @@ static bool tile_net_lepp_free_comps(str
 static void tile_net_schedule_egress_timer(struct tile_net_cpu *info)
 {
 	if (!info->egress_timer_scheduled) {
-		mod_timer_pinned(&info->egress_timer, jiffies + 1);
+		mod_timer(&info->egress_timer, jiffies + 1);
 		info->egress_timer_scheduled = true;
 	}
 }
@@ -1004,7 +1004,7 @@ static void tile_net_register(void *dev_
 		BUG();
 
 	/* Initialize the egress timer. */
-	init_timer(&info->egress_timer);
+	init_timer_pinned(&info->egress_timer);
 	info->egress_timer.data = (long)info;
 	info->egress_timer.function = tile_net_handle_egress_timer;
 


* [patch 04/20] cpufreq/powernv: Initialize timer as pinned
  2016-06-13  8:40 [patch 00/20] timer: Refactor the timer wheel Thomas Gleixner
                   ` (2 preceding siblings ...)
  2016-06-13  8:40 ` [patch 03/20] x86/mce: " Thomas Gleixner
@ 2016-06-13  8:40 ` Thomas Gleixner
  2016-06-13 13:18   ` Arjan van de Ven
  2016-06-13  8:40 ` [patch 05/20] driver/net/ethernet/tile: " Thomas Gleixner
                   ` (17 subsequent siblings)
  21 siblings, 1 reply; 62+ messages in thread
From: Thomas Gleixner @ 2016-06-13  8:40 UTC (permalink / raw)
  To: LKML
  Cc: Ingo Molnar, Peter Zijlstra, Paul E. McKenney, Eric Dumazet,
	Frederic Weisbecker, Chris Mason, Arjan van de Ven, rt


Pinned timers must carry that attribute in the timer itself. No functional
change.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 drivers/cpufreq/powernv-cpufreq.c |    5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

Index: b/drivers/cpufreq/powernv-cpufreq.c
===================================================================
--- a/drivers/cpufreq/powernv-cpufreq.c
+++ b/drivers/cpufreq/powernv-cpufreq.c
@@ -530,8 +530,7 @@ static inline void  queue_gpstate_timer(
 	else
 		timer_interval = GPSTATE_TIMER_INTERVAL;
 
-	mod_timer_pinned(&gpstates->timer, jiffies +
-			msecs_to_jiffies(timer_interval));
+	mod_timer(&gpstates->timer, jiffies + msecs_to_jiffies(timer_interval));
 }
 
 /**
@@ -699,7 +698,7 @@ static int powernv_cpufreq_cpu_init(stru
 	policy->driver_data = gpstates;
 
 	/* initialize timer */
-	init_timer_deferrable(&gpstates->timer);
+	init_timer_pinned_deferrable(&gpstates->timer);
 	gpstates->timer.data = (unsigned long)policy;
 	gpstates->timer.function = gpstate_timer_handler;
 	gpstates->timer.expires = jiffies +


* [patch 06/20] drivers/tty/metag_da: Initialize timer as pinned
  2016-06-13  8:40 [patch 00/20] timer: Refactor the timer wheel Thomas Gleixner
                   ` (4 preceding siblings ...)
  2016-06-13  8:40 ` [patch 05/20] driver/net/ethernet/tile: " Thomas Gleixner
@ 2016-06-13  8:40 ` Thomas Gleixner
  2016-06-13 13:13   ` Arjan van de Ven
  2016-06-13  8:40 ` [patch 07/20] drivers/tty/mips_ejtag: " Thomas Gleixner
                   ` (15 subsequent siblings)
  21 siblings, 1 reply; 62+ messages in thread
From: Thomas Gleixner @ 2016-06-13  8:40 UTC (permalink / raw)
  To: LKML
  Cc: Ingo Molnar, Peter Zijlstra, Paul E. McKenney, Eric Dumazet,
	Frederic Weisbecker, Chris Mason, Arjan van de Ven, rt


Pinned timers must carry that attribute in the timer itself. No functional
change.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 drivers/tty/metag_da.c |    4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

Index: b/drivers/tty/metag_da.c
===================================================================
--- a/drivers/tty/metag_da.c
+++ b/drivers/tty/metag_da.c
@@ -323,12 +323,12 @@ static void dashtty_timer(unsigned long
 	if (channel >= 0)
 		fetch_data(channel);
 
-	mod_timer_pinned(&poll_timer, jiffies + DA_TTY_POLL);
+	mod_timer(&poll_timer, jiffies + DA_TTY_POLL);
 }
 
 static void add_poll_timer(struct timer_list *poll_timer)
 {
-	setup_timer(poll_timer, dashtty_timer, 0);
+	setup_pinned_timer(poll_timer, dashtty_timer, 0);
 	poll_timer->expires = jiffies + DA_TTY_POLL;
 
 	/*


* [patch 07/20] drivers/tty/mips_ejtag: Initialize timer as pinned
  2016-06-13  8:40 [patch 00/20] timer: Refactor the timer wheel Thomas Gleixner
                   ` (5 preceding siblings ...)
  2016-06-13  8:40 ` [patch 06/20] drivers/tty/metag_da: " Thomas Gleixner
@ 2016-06-13  8:40 ` Thomas Gleixner
  2016-06-13  8:40 ` [patch 08/20] net/ipv4/inet: Initialize timers " Thomas Gleixner
                   ` (14 subsequent siblings)
  21 siblings, 0 replies; 62+ messages in thread
From: Thomas Gleixner @ 2016-06-13  8:40 UTC (permalink / raw)
  To: LKML
  Cc: Ingo Molnar, Peter Zijlstra, Paul E. McKenney, Eric Dumazet,
	Frederic Weisbecker, Chris Mason, Arjan van de Ven, rt


Pinned timers must carry that attribute in the timer itself. No functional
change.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 drivers/tty/mips_ejtag_fdc.c |    4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

Index: b/drivers/tty/mips_ejtag_fdc.c
===================================================================
--- a/drivers/tty/mips_ejtag_fdc.c
+++ b/drivers/tty/mips_ejtag_fdc.c
@@ -689,7 +689,7 @@ static void mips_ejtag_fdc_tty_timer(uns
 
 	mips_ejtag_fdc_handle(priv);
 	if (!priv->removing)
-		mod_timer_pinned(&priv->poll_timer, jiffies + FDC_TTY_POLL);
+		mod_timer(&priv->poll_timer, jiffies + FDC_TTY_POLL);
 }
 
 /* TTY Port operations */
@@ -1002,7 +1002,7 @@ static int mips_ejtag_fdc_tty_probe(stru
 		raw_spin_unlock_irq(&priv->lock);
 	} else {
 		/* If we didn't get an usable IRQ, poll instead */
-		setup_timer(&priv->poll_timer, mips_ejtag_fdc_tty_timer,
+		setup_pinned_timer(&priv->poll_timer, mips_ejtag_fdc_tty_timer,
 			    (unsigned long)priv);
 		priv->poll_timer.expires = jiffies + FDC_TTY_POLL;
 		/*


* [patch 09/20] timer: Remove mod_timer_pinned
  2016-06-13  8:40 [patch 00/20] timer: Refactor the timer wheel Thomas Gleixner
                   ` (7 preceding siblings ...)
  2016-06-13  8:40 ` [patch 08/20] net/ipv4/inet: Initialize timers " Thomas Gleixner
@ 2016-06-13  8:40 ` Thomas Gleixner
  2016-06-13  8:40 ` [patch 10/20] timer: Add a cascading tracepoint Thomas Gleixner
                   ` (12 subsequent siblings)
  21 siblings, 0 replies; 62+ messages in thread
From: Thomas Gleixner @ 2016-06-13  8:40 UTC (permalink / raw)
  To: LKML
  Cc: Ingo Molnar, Peter Zijlstra, Paul E. McKenney, Eric Dumazet,
	Frederic Weisbecker, Chris Mason, Arjan van de Ven, rt


We switched all users to initialize the timers as pinned and call
mod_timer(). Remove the now unused function.
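
The conversion pattern used across the preceding patches, shown side by side
(hypothetical timer/callback names):

	/* Before: pinning was decided on every rearm */
	setup_timer(&t, my_fn, 0);
	mod_timer_pinned(&t, jiffies + HZ);

	/* After: pinning is a property of the timer itself */
	setup_pinned_timer(&t, my_fn, 0);
	mod_timer(&t, jiffies + HZ);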

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/timer.h |    3 ---
 kernel/time/timer.c   |   39 +++++----------------------------------
 2 files changed, 5 insertions(+), 37 deletions(-)

Index: b/include/linux/timer.h
===================================================================
--- a/include/linux/timer.h
+++ b/include/linux/timer.h
@@ -190,12 +190,9 @@ extern void add_timer_on(struct timer_li
 extern int del_timer(struct timer_list * timer);
 extern int mod_timer(struct timer_list *timer, unsigned long expires);
 extern int mod_timer_pending(struct timer_list *timer, unsigned long expires);
-extern int mod_timer_pinned(struct timer_list *timer, unsigned long expires);
 
 extern void set_timer_slack(struct timer_list *time, int slack_hz);
 
-#define MOD_TIMER_NOT_PINNED	0
-#define MOD_TIMER_PINNED	1
 /*
  * The jiffies value which is added to now, when there is no timer
  * in the timer wheel:
Index: b/kernel/time/timer.c
===================================================================
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -764,8 +764,7 @@ static struct tvec_base *lock_timer_base
 }
 
 static inline int
-__mod_timer(struct timer_list *timer, unsigned long expires,
-	    bool pending_only, int pinned)
+__mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only)
 {
 	struct tvec_base *base, *new_base;
 	unsigned long flags;
@@ -782,7 +781,7 @@ static inline int
 
 	debug_activate(timer, expires);
 
-	new_base = get_target_base(base, pinned || timer->flags & TIMER_PINNED);
+	new_base = get_target_base(base, timer->flags & TIMER_PINNED);
 
 	if (base != new_base) {
 		/*
@@ -825,7 +824,7 @@ static inline int
  */
 int mod_timer_pending(struct timer_list *timer, unsigned long expires)
 {
-	return __mod_timer(timer, expires, true, MOD_TIMER_NOT_PINNED);
+	return __mod_timer(timer, expires, true);
 }
 EXPORT_SYMBOL(mod_timer_pending);
 
@@ -900,39 +899,11 @@ int mod_timer(struct timer_list *timer,
 	if (timer_pending(timer) && timer->expires == expires)
 		return 1;
 
-	return __mod_timer(timer, expires, false, MOD_TIMER_NOT_PINNED);
+	return __mod_timer(timer, expires, false);
 }
 EXPORT_SYMBOL(mod_timer);
 
 /**
- * mod_timer_pinned - modify a timer's timeout
- * @timer: the timer to be modified
- * @expires: new timeout in jiffies
- *
- * mod_timer_pinned() is a way to update the expire field of an
- * active timer (if the timer is inactive it will be activated)
- * and to ensure that the timer is scheduled on the current CPU.
- *
- * Note that this does not prevent the timer from being migrated
- * when the current CPU goes offline.  If this is a problem for
- * you, use CPU-hotplug notifiers to handle it correctly, for
- * example, cancelling the timer when the corresponding CPU goes
- * offline.
- *
- * mod_timer_pinned(timer, expires) is equivalent to:
- *
- *     del_timer(timer); timer->expires = expires; add_timer(timer);
- */
-int mod_timer_pinned(struct timer_list *timer, unsigned long expires)
-{
-	if (timer->expires == expires && timer_pending(timer))
-		return 1;
-
-	return __mod_timer(timer, expires, false, MOD_TIMER_PINNED);
-}
-EXPORT_SYMBOL(mod_timer_pinned);
-
-/**
  * add_timer - start a timer
  * @timer: the timer to be added
  *
@@ -1512,7 +1483,7 @@ signed long __sched schedule_timeout(sig
 	expire = timeout + jiffies;
 
 	setup_timer_on_stack(&timer, process_timeout, (unsigned long)current);
-	__mod_timer(&timer, expire, false, MOD_TIMER_NOT_PINNED);
+	__mod_timer(&timer, expire, false);
 	schedule();
 	del_singleshot_timer_sync(&timer);
 


* [patch 08/20] net/ipv4/inet: Initialize timers as pinned
  2016-06-13  8:40 [patch 00/20] timer: Refactor the timer wheel Thomas Gleixner
                   ` (6 preceding siblings ...)
  2016-06-13  8:40 ` [patch 07/20] drivers/tty/mips_ejtag: " Thomas Gleixner
@ 2016-06-13  8:40 ` Thomas Gleixner
  2016-06-13  8:40 ` [patch 09/20] timer: Remove mod_timer_pinned Thomas Gleixner
                   ` (13 subsequent siblings)
  21 siblings, 0 replies; 62+ messages in thread
From: Thomas Gleixner @ 2016-06-13  8:40 UTC (permalink / raw)
  To: LKML
  Cc: Ingo Molnar, Peter Zijlstra, Paul E. McKenney, Eric Dumazet,
	Frederic Weisbecker, Chris Mason, Arjan van de Ven, rt


Pinned timers must carry that attribute in the timer itself. No functional
change.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 net/ipv4/inet_connection_sock.c |    7 ++++---
 net/ipv4/inet_timewait_sock.c   |    5 +++--
 2 files changed, 7 insertions(+), 5 deletions(-)

Index: b/net/ipv4/inet_connection_sock.c
===================================================================
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -603,7 +603,7 @@ static void reqsk_timer_handler(unsigned
 		if (req->num_timeout++ == 0)
 			atomic_dec(&queue->young);
 		timeo = min(TCP_TIMEOUT_INIT << req->num_timeout, TCP_RTO_MAX);
-		mod_timer_pinned(&req->rsk_timer, jiffies + timeo);
+		mod_timer(&req->rsk_timer, jiffies + timeo);
 		return;
 	}
 drop:
@@ -617,8 +617,9 @@ static void reqsk_queue_hash_req(struct
 	req->num_timeout = 0;
 	req->sk = NULL;
 
-	setup_timer(&req->rsk_timer, reqsk_timer_handler, (unsigned long)req);
-	mod_timer_pinned(&req->rsk_timer, jiffies + timeout);
+	setup_pinned_timer(&req->rsk_timer, reqsk_timer_handler,
+			    (unsigned long)req);
+	mod_timer(&req->rsk_timer, jiffies + timeout);
 
 	inet_ehash_insert(req_to_sk(req), NULL);
 	/* before letting lookups find us, make sure all req fields
Index: b/net/ipv4/inet_timewait_sock.c
===================================================================
--- a/net/ipv4/inet_timewait_sock.c
+++ b/net/ipv4/inet_timewait_sock.c
@@ -188,7 +188,8 @@ struct inet_timewait_sock *inet_twsk_all
 		tw->tw_prot	    = sk->sk_prot_creator;
 		atomic64_set(&tw->tw_cookie, atomic64_read(&sk->sk_cookie));
 		twsk_net_set(tw, sock_net(sk));
-		setup_timer(&tw->tw_timer, tw_timer_handler, (unsigned long)tw);
+		setup_pinned_timer(&tw->tw_timer, tw_timer_handler,
+				   (unsigned long)tw);
 		/*
 		 * Because we use RCU lookups, we should not set tw_refcnt
 		 * to a non null value before everything is setup for this
@@ -248,7 +249,7 @@ void __inet_twsk_schedule(struct inet_ti
 
 	tw->tw_kill = timeo <= 4*HZ;
 	if (!rearm) {
-		BUG_ON(mod_timer_pinned(&tw->tw_timer, jiffies + timeo));
+		BUG_ON(mod_timer(&tw->tw_timer, jiffies + timeo));
 		atomic_inc(&tw->tw_dr->tw_count);
 	} else {
 		mod_timer_pending(&tw->tw_timer, jiffies + timeo);


* [patch 10/20] timer: Add a cascading tracepoint
  2016-06-13  8:40 [patch 00/20] timer: Refactor the timer wheel Thomas Gleixner
                   ` (8 preceding siblings ...)
  2016-06-13  8:40 ` [patch 09/20] timer: Remove mod_timer_pinned Thomas Gleixner
@ 2016-06-13  8:40 ` Thomas Gleixner
  2016-06-13  8:40 ` [patch 11/20] hlist: Add hlist_is_last_node() helper Thomas Gleixner
                   ` (11 subsequent siblings)
  21 siblings, 0 replies; 62+ messages in thread
From: Thomas Gleixner @ 2016-06-13  8:40 UTC (permalink / raw)
  To: LKML
  Cc: Ingo Molnar, Peter Zijlstra, Paul E. McKenney, Eric Dumazet,
	Frederic Weisbecker, Chris Mason, Arjan van de Ven, rt


For analysis of the timer wheel it's necessary to trace the cascading. Add a
tracepoint for it.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: rt@linutronix.de
---
 include/trace/events/timer.h |   11 +++++++++++
 kernel/time/timer.c          |    1 +
 2 files changed, 12 insertions(+)

--- a/include/trace/events/timer.h
+++ b/include/trace/events/timer.h
@@ -126,6 +126,17 @@ DEFINE_EVENT(timer_class, timer_cancel,
 );
 
 /**
+ * timer_cascade - called when the timer is cascaded
+ * @timer:	pointer to struct timer_list
+ */
+DEFINE_EVENT(timer_class, timer_cascade,
+
+	TP_PROTO(struct timer_list *timer),
+
+	TP_ARGS(timer)
+);
+
+/**
  * hrtimer_init - called when the hrtimer is initialized
  * @hrtimer:	pointer to struct hrtimer
  * @clockid:	the hrtimers clock
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -1099,6 +1099,7 @@ static int cascade(struct tvec_base *bas
 	 * don't have to detach them individually.
 	 */
 	hlist_for_each_entry_safe(timer, tmp, &tv_list, entry) {
+		trace_timer_cascade(timer);
 		/* No accounting, while moving them */
 		__internal_add_timer(base, timer);
 	}


* [patch 11/20] hlist: Add hlist_is_last_node() helper
  2016-06-13  8:40 [patch 00/20] timer: Refactor the timer wheel Thomas Gleixner
                   ` (9 preceding siblings ...)
  2016-06-13  8:40 ` [patch 10/20] timer: Add a cascading tracepoint Thomas Gleixner
@ 2016-06-13  8:40 ` Thomas Gleixner
  2016-06-13 10:27   ` Paolo Bonzini
  2016-06-13  8:40 ` [patch 12/20] timer: Give a few structs and members proper names Thomas Gleixner
                   ` (10 subsequent siblings)
  21 siblings, 1 reply; 62+ messages in thread
From: Thomas Gleixner @ 2016-06-13  8:40 UTC (permalink / raw)
  To: LKML
  Cc: Ingo Molnar, Peter Zijlstra, Paul E. McKenney, Eric Dumazet,
	Frederic Weisbecker, Chris Mason, Arjan van de Ven, rt


A helper is required to figure out whether an entry is the last one in an
hlist without accessing the list head.
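
A hypothetical usage sketch (the node/bucket/idx/pending_map names are
assumptions in the spirit of the wheel rework, not taken from a later patch):

	/* If 'node' is the only entry left, removing it empties the
	 * bucket, so the corresponding pending bit can be cleared
	 * without re-reading the list head after the removal. */
	if (hlist_is_last_node(node, bucket))
		__clear_bit(idx, base->pending_map);
	__hlist_del(node);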

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/list.h |   10 ++++++++++
 1 file changed, 10 insertions(+)

Index: b/include/linux/list.h
===================================================================
--- a/include/linux/list.h
+++ b/include/linux/list.h
@@ -679,6 +679,16 @@ static inline bool hlist_fake(struct hli
 }
 
 /*
+ * Check whether the node is the last node of the head without
+ * accessing head.
+ */
+static inline bool hlist_is_last_node(struct hlist_node *n,
+				      struct hlist_head *h)
+{
+	return !n->next && n->pprev == &h->first;
+}
+
+/*
  * Move a list from one list head to another. Fixup the pprev
  * reference of the first entry if it exists.
  */


* [patch 12/20] timer: Give a few structs and members proper names
  2016-06-13  8:40 [patch 00/20] timer: Refactor the timer wheel Thomas Gleixner
                   ` (10 preceding siblings ...)
  2016-06-13  8:40 ` [patch 11/20] hlist: Add hlist_is_last_node() helper Thomas Gleixner
@ 2016-06-13  8:40 ` Thomas Gleixner
  2016-06-13  8:41 ` [patch 13/20] timer: Switch to a non cascading wheel Thomas Gleixner
                   ` (9 subsequent siblings)
  21 siblings, 0 replies; 62+ messages in thread
From: Thomas Gleixner @ 2016-06-13  8:40 UTC (permalink / raw)
  To: LKML
  Cc: Ingo Molnar, Peter Zijlstra, Paul E. McKenney, Eric Dumazet,
	Frederic Weisbecker, Chris Mason, Arjan van de Ven, rt


Some of the names are no longer correct and others are simply too long to
type. Clean them up before we switch the wheel implementation over to the new
scheme.

No functional change.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/time/timer.c |  118 ++++++++++++++++++++++++++--------------------------
 1 file changed, 59 insertions(+), 59 deletions(-)

Index: b/kernel/time/timer.c
===================================================================
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -77,10 +77,10 @@ struct tvec_root {
 	struct hlist_head vec[TVR_SIZE];
 };
 
-struct tvec_base {
+struct timer_base {
 	spinlock_t lock;
 	struct timer_list *running_timer;
-	unsigned long timer_jiffies;
+	unsigned long clk;
 	unsigned long next_timer;
 	unsigned long active_timers;
 	unsigned long all_timers;
@@ -95,7 +95,7 @@ struct tvec_base {
 } ____cacheline_aligned;
 
 
-static DEFINE_PER_CPU(struct tvec_base, tvec_bases);
+static DEFINE_PER_CPU(struct timer_base, timer_bases);
 
 #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
 unsigned int sysctl_timer_migration = 1;
@@ -106,15 +106,15 @@ void timers_update_migration(bool update
 	unsigned int cpu;
 
 	/* Avoid the loop, if nothing to update */
-	if (this_cpu_read(tvec_bases.migration_enabled) == on)
+	if (this_cpu_read(timer_bases.migration_enabled) == on)
 		return;
 
 	for_each_possible_cpu(cpu) {
-		per_cpu(tvec_bases.migration_enabled, cpu) = on;
+		per_cpu(timer_bases.migration_enabled, cpu) = on;
 		per_cpu(hrtimer_bases.migration_enabled, cpu) = on;
 		if (!update_nohz)
 			continue;
-		per_cpu(tvec_bases.nohz_active, cpu) = true;
+		per_cpu(timer_bases.nohz_active, cpu) = true;
 		per_cpu(hrtimer_bases.nohz_active, cpu) = true;
 	}
 }
@@ -134,18 +134,18 @@ int timer_migration_handler(struct ctl_t
 	return ret;
 }
 
-static inline struct tvec_base *get_target_base(struct tvec_base *base,
+static inline struct timer_base *get_target_base(struct timer_base *base,
 						int pinned)
 {
 	if (pinned || !base->migration_enabled)
-		return this_cpu_ptr(&tvec_bases);
-	return per_cpu_ptr(&tvec_bases, get_nohz_timer_target());
+		return this_cpu_ptr(&timer_bases);
+	return per_cpu_ptr(&timer_bases, get_nohz_timer_target());
 }
 #else
-static inline struct tvec_base *get_target_base(struct tvec_base *base,
+static inline struct timer_base *get_target_base(struct timer_base *base,
 						int pinned)
 {
-	return this_cpu_ptr(&tvec_bases);
+	return this_cpu_ptr(&timer_bases);
 }
 #endif
 
@@ -371,10 +371,10 @@ void set_timer_slack(struct timer_list *
 EXPORT_SYMBOL_GPL(set_timer_slack);
 
 static void
-__internal_add_timer(struct tvec_base *base, struct timer_list *timer)
+__internal_add_timer(struct timer_base *base, struct timer_list *timer)
 {
 	unsigned long expires = timer->expires;
-	unsigned long idx = expires - base->timer_jiffies;
+	unsigned long idx = expires - base->clk;
 	struct hlist_head *vec;
 
 	if (idx < TVR_SIZE) {
@@ -394,7 +394,7 @@ static void
 		 * Can happen if you add a timer with expires == jiffies,
 		 * or you set a timer to go off in the past
 		 */
-		vec = base->tv1.vec + (base->timer_jiffies & TVR_MASK);
+		vec = base->tv1.vec + (base->clk & TVR_MASK);
 	} else {
 		int i;
 		/* If the timeout is larger than MAX_TVAL (on 64-bit
@@ -403,7 +403,7 @@ static void
 		 */
 		if (idx > MAX_TVAL) {
 			idx = MAX_TVAL;
-			expires = idx + base->timer_jiffies;
+			expires = idx + base->clk;
 		}
 		i = (expires >> (TVR_BITS + 3 * TVN_BITS)) & TVN_MASK;
 		vec = base->tv5.vec + i;
@@ -412,11 +412,11 @@ static void
 	hlist_add_head(&timer->entry, vec);
 }
 
-static void internal_add_timer(struct tvec_base *base, struct timer_list *timer)
+static void internal_add_timer(struct timer_base *base, struct timer_list *timer)
 {
 	/* Advance base->jiffies, if the base is empty */
 	if (!base->all_timers++)
-		base->timer_jiffies = jiffies;
+		base->clk = jiffies;
 
 	__internal_add_timer(base, timer);
 	/*
@@ -707,7 +707,7 @@ static inline void detach_timer(struct t
 }
 
 static inline void
-detach_expired_timer(struct timer_list *timer, struct tvec_base *base)
+detach_expired_timer(struct timer_list *timer, struct timer_base *base)
 {
 	detach_timer(timer, true);
 	if (!(timer->flags & TIMER_DEFERRABLE))
@@ -715,7 +715,7 @@ detach_expired_timer(struct timer_list *
 	base->all_timers--;
 }
 
-static int detach_if_pending(struct timer_list *timer, struct tvec_base *base,
+static int detach_if_pending(struct timer_list *timer, struct timer_base *base,
 			     bool clear_pending)
 {
 	if (!timer_pending(timer))
@@ -725,16 +725,16 @@ static int detach_if_pending(struct time
 	if (!(timer->flags & TIMER_DEFERRABLE)) {
 		base->active_timers--;
 		if (timer->expires == base->next_timer)
-			base->next_timer = base->timer_jiffies;
+			base->next_timer = base->clk;
 	}
 	/* If this was the last timer, advance base->jiffies */
 	if (!--base->all_timers)
-		base->timer_jiffies = jiffies;
+		base->clk = jiffies;
 	return 1;
 }
 
 /*
- * We are using hashed locking: holding per_cpu(tvec_bases).lock
+ * We are using hashed locking: holding per_cpu(timer_bases).lock
  * means that all timers which are tied to this base via timer->base are
  * locked, and the base itself is locked too.
  *
@@ -744,16 +744,16 @@ static int detach_if_pending(struct time
  * When the timer's base is locked and removed from the list, the
  * TIMER_MIGRATING flag is set, FIXME
  */
-static struct tvec_base *lock_timer_base(struct timer_list *timer,
+static struct timer_base *lock_timer_base(struct timer_list *timer,
 					unsigned long *flags)
 	__acquires(timer->base->lock)
 {
 	for (;;) {
 		u32 tf = timer->flags;
-		struct tvec_base *base;
+		struct timer_base *base;
 
 		if (!(tf & TIMER_MIGRATING)) {
-			base = per_cpu_ptr(&tvec_bases, tf & TIMER_CPUMASK);
+			base = per_cpu_ptr(&timer_bases, tf & TIMER_CPUMASK);
 			spin_lock_irqsave(&base->lock, *flags);
 			if (timer->flags == tf)
 				return base;
@@ -766,7 +766,7 @@ static struct tvec_base *lock_timer_base
 static inline int
 __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only)
 {
-	struct tvec_base *base, *new_base;
+	struct timer_base *base, *new_base;
 	unsigned long flags;
 	int ret = 0;
 
@@ -933,8 +933,8 @@ EXPORT_SYMBOL(add_timer);
  */
 void add_timer_on(struct timer_list *timer, int cpu)
 {
-	struct tvec_base *new_base = per_cpu_ptr(&tvec_bases, cpu);
-	struct tvec_base *base;
+	struct timer_base *new_base = per_cpu_ptr(&timer_bases, cpu);
+	struct timer_base *base;
 	unsigned long flags;
 
 	timer_stats_timer_set_start_info(timer);
@@ -975,7 +975,7 @@ EXPORT_SYMBOL_GPL(add_timer_on);
  */
 int del_timer(struct timer_list *timer)
 {
-	struct tvec_base *base;
+	struct timer_base *base;
 	unsigned long flags;
 	int ret = 0;
 
@@ -1001,7 +1001,7 @@ EXPORT_SYMBOL(del_timer);
  */
 int try_to_del_timer_sync(struct timer_list *timer)
 {
-	struct tvec_base *base;
+	struct timer_base *base;
 	unsigned long flags;
 	int ret = -1;
 
@@ -1085,7 +1085,7 @@ int del_timer_sync(struct timer_list *ti
 EXPORT_SYMBOL(del_timer_sync);
 #endif
 
-static int cascade(struct tvec_base *base, struct tvec *tv, int index)
+static int cascade(struct timer_base *base, struct tvec *tv, int index)
 {
 	/* cascade all the timers from tv up one level */
 	struct timer_list *timer;
@@ -1150,7 +1150,7 @@ static void call_timer_fn(struct timer_l
 	}
 }
 
-#define INDEX(N) ((base->timer_jiffies >> (TVR_BITS + (N) * TVN_BITS)) & TVN_MASK)
+#define INDEX(N) ((base->clk >> (TVR_BITS + (N) * TVN_BITS)) & TVN_MASK)
 
 /**
  * __run_timers - run all expired timers (if any) on this CPU.
@@ -1159,23 +1159,23 @@ static void call_timer_fn(struct timer_l
  * This function cascades all vectors and executes all expired timer
  * vectors.
  */
-static inline void __run_timers(struct tvec_base *base)
+static inline void __run_timers(struct timer_base *base)
 {
 	struct timer_list *timer;
 
 	spin_lock_irq(&base->lock);
 
-	while (time_after_eq(jiffies, base->timer_jiffies)) {
+	while (time_after_eq(jiffies, base->clk)) {
 		struct hlist_head work_list;
 		struct hlist_head *head = &work_list;
 		int index;
 
 		if (!base->all_timers) {
-			base->timer_jiffies = jiffies;
+			base->clk = jiffies;
 			break;
 		}
 
-		index = base->timer_jiffies & TVR_MASK;
+		index = base->clk & TVR_MASK;
 
 		/*
 		 * Cascade timers:
@@ -1185,7 +1185,7 @@ static inline void __run_timers(struct t
 				(!cascade(base, &base->tv3, INDEX(1))) &&
 					!cascade(base, &base->tv4, INDEX(2)))
 			cascade(base, &base->tv5, INDEX(3));
-		++base->timer_jiffies;
+		++base->clk;
 		hlist_move_list(base->tv1.vec + index, head);
 		while (!hlist_empty(head)) {
 			void (*fn)(unsigned long);
@@ -1223,16 +1223,16 @@ static inline void __run_timers(struct t
  * is used on S/390 to stop all activity when a CPU is idle.
  * This function needs to be called with interrupts disabled.
  */
-static unsigned long __next_timer_interrupt(struct tvec_base *base)
+static unsigned long __next_timer_interrupt(struct timer_base *base)
 {
-	unsigned long timer_jiffies = base->timer_jiffies;
-	unsigned long expires = timer_jiffies + NEXT_TIMER_MAX_DELTA;
+	unsigned long clk = base->clk;
+	unsigned long expires = clk + NEXT_TIMER_MAX_DELTA;
 	int index, slot, array, found = 0;
 	struct timer_list *nte;
 	struct tvec *varray[4];
 
 	/* Look for timer events in tv1. */
-	index = slot = timer_jiffies & TVR_MASK;
+	index = slot = clk & TVR_MASK;
 	do {
 		hlist_for_each_entry(nte, base->tv1.vec + slot, entry) {
 			if (nte->flags & TIMER_DEFERRABLE)
@@ -1251,8 +1251,8 @@ static unsigned long __next_timer_interr
 cascade:
 	/* Calculate the next cascade event */
 	if (index)
-		timer_jiffies += TVR_SIZE - index;
-	timer_jiffies >>= TVR_BITS;
+		clk += TVR_SIZE - index;
+	clk >>= TVR_BITS;
 
 	/* Check tv2-tv5. */
 	varray[0] = &base->tv2;
@@ -1263,7 +1263,7 @@ static unsigned long __next_timer_interr
 	for (array = 0; array < 4; array++) {
 		struct tvec *varp = varray[array];
 
-		index = slot = timer_jiffies & TVN_MASK;
+		index = slot = clk & TVN_MASK;
 		do {
 			hlist_for_each_entry(nte, varp->vec + slot, entry) {
 				if (nte->flags & TIMER_DEFERRABLE)
@@ -1287,8 +1287,8 @@ static unsigned long __next_timer_interr
 		} while (slot != index);
 
 		if (index)
-			timer_jiffies += TVN_SIZE - index;
-		timer_jiffies >>= TVN_BITS;
+			clk += TVN_SIZE - index;
+		clk >>= TVN_BITS;
 	}
 	return expires;
 }
@@ -1336,7 +1336,7 @@ static u64 cmp_next_hrtimer_event(u64 ba
  */
 u64 get_next_timer_interrupt(unsigned long basej, u64 basem)
 {
-	struct tvec_base *base = this_cpu_ptr(&tvec_bases);
+	struct timer_base *base = this_cpu_ptr(&timer_bases);
 	u64 expires = KTIME_MAX;
 	unsigned long nextevt;
 
@@ -1349,7 +1349,7 @@ u64 get_next_timer_interrupt(unsigned lo
 
 	spin_lock(&base->lock);
 	if (base->active_timers) {
-		if (time_before_eq(base->next_timer, base->timer_jiffies))
+		if (time_before_eq(base->next_timer, base->clk))
 			base->next_timer = __next_timer_interrupt(base);
 		nextevt = base->next_timer;
 		if (time_before_eq(nextevt, basej))
@@ -1388,9 +1388,9 @@ void update_process_times(int user_tick)
  */
 static void run_timer_softirq(struct softirq_action *h)
 {
-	struct tvec_base *base = this_cpu_ptr(&tvec_bases);
+	struct timer_base *base = this_cpu_ptr(&timer_bases);
 
-	if (time_after_eq(jiffies, base->timer_jiffies))
+	if (time_after_eq(jiffies, base->clk))
 		__run_timers(base);
 }
 
@@ -1535,7 +1535,7 @@ signed long __sched schedule_timeout_idl
 EXPORT_SYMBOL(schedule_timeout_idle);
 
 #ifdef CONFIG_HOTPLUG_CPU
-static void migrate_timer_list(struct tvec_base *new_base, struct hlist_head *head)
+static void migrate_timer_list(struct timer_base *new_base, struct hlist_head *head)
 {
 	struct timer_list *timer;
 	int cpu = new_base->cpu;
@@ -1551,13 +1551,13 @@ static void migrate_timer_list(struct tv
 
 static void migrate_timers(int cpu)
 {
-	struct tvec_base *old_base;
-	struct tvec_base *new_base;
+	struct timer_base *old_base;
+	struct timer_base *new_base;
 	int i;
 
 	BUG_ON(cpu_online(cpu));
-	old_base = per_cpu_ptr(&tvec_bases, cpu);
-	new_base = get_cpu_ptr(&tvec_bases);
+	old_base = per_cpu_ptr(&timer_bases, cpu);
+	new_base = get_cpu_ptr(&timer_bases);
 	/*
 	 * The caller is globally serialized and nobody else
 	 * takes two locks at once, deadlock is not possible.
@@ -1581,7 +1581,7 @@ static void migrate_timers(int cpu)
 
 	spin_unlock(&old_base->lock);
 	spin_unlock_irq(&new_base->lock);
-	put_cpu_ptr(&tvec_bases);
+	put_cpu_ptr(&timer_bases);
 }
 
 static int timer_cpu_notify(struct notifier_block *self,
@@ -1609,13 +1609,13 @@ static inline void timer_register_cpu_no
 
 static void __init init_timer_cpu(int cpu)
 {
-	struct tvec_base *base = per_cpu_ptr(&tvec_bases, cpu);
+	struct timer_base *base = per_cpu_ptr(&timer_bases, cpu);
 
 	base->cpu = cpu;
 	spin_lock_init(&base->lock);
 
-	base->timer_jiffies = jiffies;
-	base->next_timer = base->timer_jiffies;
+	base->clk = jiffies;
+	base->next_timer = base->clk;
 }
 
 static void __init init_timer_cpus(void)


* [patch 13/20] timer: Switch to a non cascading wheel
  2016-06-13  8:40 [patch 00/20] timer: Refactor the timer wheel Thomas Gleixner
                   ` (11 preceding siblings ...)
  2016-06-13  8:40 ` [patch 12/20] timer: Give a few structs and members proper names Thomas Gleixner
@ 2016-06-13  8:41 ` Thomas Gleixner
  2016-06-13 11:40   ` Peter Zijlstra
  2016-06-13 14:36   ` Richard Cochran
  2016-06-13  8:41 ` [patch 15/20] timer: Move __run_timers() function Thomas Gleixner
                   ` (8 subsequent siblings)
  21 siblings, 2 replies; 62+ messages in thread
From: Thomas Gleixner @ 2016-06-13  8:41 UTC (permalink / raw)
  To: LKML
  Cc: Ingo Molnar, Peter Zijlstra, Paul E. McKenney, Eric Dumazet,
	Frederic Weisbecker, Chris Mason, Arjan van de Ven, rt


The current timer wheel has some drawbacks:

1) Cascading

   Cascading can be an unbounded operation and is completely pointless in most
   cases because the vast majority of the timer wheel timers are canceled or
   rearmed before expiration.

2) No fast lookup of the next expiring timer

   In NOHZ scenarios the first timer soft interrupt after a long NOHZ period
   must fast-forward the base time to the current jiffies value. As we have no
   fast way to find the next expiring timer, the code loops, incrementing the
   base time by one and checking for expired timers in each step.

After a thorough analysis of real world data gathered on laptops,
workstations, webservers and other machines (thanks Chris!) I came to the
conclusion that the current 'classic' timer wheel implementation can be
modified to address the above issues.

The vast majority of timer wheel timers are canceled or rearmed before
expiry. Most of them are timeouts for networking and other I/O tasks. The
nature of timeouts is to catch the exception from normal operation (TCP ack
timed out, disk does not respond, etc.). For these kinds of timeouts the
accuracy is not really a concern. In case the timeout fires, performance is
down the drain already.

The few timers which actually expire can be split into two categories:

 1) Short expiry times which expect halfway accurate expiry

 2) Long term expiry times, which are already inaccurate today due to the
    batching which is done for NOHZ.

So for long term expiry timers we can avoid cascading entirely and just
leave them in the less granular outer wheels until expiry or cancelation.

Contrary to the classic wheel, where the granularity of the next wheel
equals the capacity of the previous one, the granularities here are, in the
currently chosen setting, 8 times the granularity of the previous level. So for
HZ=250 we end up with the following granularity levels:

Level Offset  Granularity            Range
  0   0          4 ms                 0 ms -        252 ms
  1  64         32 ms               256 ms -       2044 ms (256ms - ~2s)
  2 128        256 ms              2048 ms -      16380 ms (~2s - ~16s)
  3 192       2048 ms (~2s)       16384 ms -     131068 ms (~16s - ~2m)
  4 256      16384 ms (~16s)     131072 ms -    1048572 ms (~2m - ~17m)
  5 320     131072 ms (~2m)     1048576 ms -    8388604 ms (~17m - ~2h)

That's a worst case inaccuracy of 12.5% for the timers which are queued at the
beginning of a level. 

So the new wheel concept addresses the old issues:

1) Cascading is avoided (except for timers with extremely long expiry times)

2) By keeping the timers in the bucket until expiry/cancelation we can track
   the buckets which have timers enqueued in a bucket bitmap and can therefore
   look up the next expiring timer in a fast and time-bounded manner.

A further benefit of the concept is that the slack calculation which is done
on every timer start is no longer necessary, because the granularity levels
already provide natural batching.

Our extensive testing with various loads did not show any performance
degradation vs. the current wheel implementation.

This patch does not address the 'fast lookup' issue as we wanted to make sure
that there is no regression introduced by the wheel redesign. The
optimizations are in follow up patches.

[ Contains fixes from Anna-Maria Gleixner and Richard Cochran ]

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: rt@linutronix.de

---
v3: fix return value of __next_timer_interrupt()
v2: change HASH_SIZE to TOT_HASH_SIZE (as Richard mentioned)

 include/linux/timer.h |    2 
 kernel/time/timer.c   |  825 ++++++++++++++++++++++++++++----------------------
 2 files changed, 471 insertions(+), 356 deletions(-)

--- a/include/linux/timer.h
+++ b/include/linux/timer.h
@@ -64,6 +64,8 @@ struct timer_list {
 #define TIMER_DEFERRABLE	0x00100000
 #define TIMER_PINNED		0x00200000
 #define TIMER_IRQSAFE		0x00400000
+#define TIMER_ARRAYSHIFT	23
+#define TIMER_ARRAYMASK		0xFF800000
 
 #define __TIMER_INITIALIZER(_function, _expires, _data, _flags) { \
 		.entry = { .next = TIMER_ENTRY_STATIC },	\
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -59,43 +59,136 @@
 EXPORT_SYMBOL(jiffies_64);
 
 /*
- * per-CPU timer vector definitions:
+ * The timer wheel has LVL_DEPTH array levels. Each level provides an array of
+ * LVL_SIZE buckets. Each level is driven by its own clock and therefore each
+ * level has a different granularity.
+ *
+ * The level granularity is:		LVL_CLK_DIV ^ lvl
+ * The level clock frequency is:	HZ / (LVL_CLK_DIV ^ level)
+ *
+ * The array level of a newly armed timer depends on the relative expiry
+ * time. The farther away the expiry time is, the higher the array level and
+ * therefore the coarser the granularity becomes.
+ *
+ * Contrary to the original timer wheel implementation, which aims for 'exact'
+ * expiry of the timers, this implementation mostly removes the need for
+ * recascading the timers into the lower array levels. The previous 'classic'
+ * timer wheel implementation of the kernel already violated the 'exact'
+ * expiry by adding slack to the expiry time to provide batched
+ * expiration. The granularity levels provide implicit batching.
+ *
+ * This is an optimization of the original timer wheel implementation for the
+ * majority of the timer wheel use cases: timeouts. The vast majority of
+ * timeout timers (networking, disk I/O ...) are canceled before expiry. If
+ * the timeout expires it indicates that normal operation is disturbed, so it
+ * does not matter much whether the timeout comes with a slight delay.
+ *
+ * The currently chosen array constant values are a good compromise between
+ * array size and granularity. This results in the following granularity and
+ * range levels:
+ *
+ * HZ 1000
+ * Level Offset  Granularity            Range
+ *  0   0          1 ms                 0 ms -         63 ms
+ *  1  64          8 ms                64 ms -        511 ms
+ *  2 128         64 ms               512 ms -       4095 ms (512ms - ~4s)
+ *  3 192        512 ms              4096 ms -      32767 ms (~4s - ~32s)
+ *  4 256       4096 ms (~4s)       32768 ms -     262143 ms (~32s - ~4m)
+ *  5 320      32768 ms (~32s)     262144 ms -    2097151 ms (~4m - ~34m)
+ *
+ * HZ  250
+ * Level Offset  Granularity            Range
+ *  0   0          4 ms                 0 ms -        252 ms
+ *  1  64         32 ms               256 ms -       2044 ms (256ms - ~2s)
+ *  2 128        256 ms              2048 ms -      16380 ms (~2s - ~16s)
+ *  3 192       2048 ms (~2s)       16384 ms -     131068 ms (~16s - ~2m)
+ *  4 256      16384 ms (~16s)     131072 ms -    1048572 ms (~2m - ~17m)
+ *  5 320     131072 ms (~2m)     1048576 ms -    8388604 ms (~17m - ~2h)
+ *
+ * HZ  100
+ * Level Offset  Granularity            Range
+ *  0   0         10 ms                 0 ms -        630 ms
+ *  1  64         80 ms               640 ms -       5110 ms (640ms - ~5s)
+ *  2 128        640 ms              5120 ms -      40950 ms (~5s - ~40s)
+ *  3 192       5120 ms (~5s)       40960 ms -     327670 ms (~40s - ~5m)
+ *  4 256      40960 ms (~40s)     327680 ms -    2621430 ms (~5m - ~43m)
+ *  5 320     327680 ms (~5m)     2621440 ms -   20971510 ms (~43m - ~5h)
+ */
+
+/* Size of each clock level */
+#define LVL_SHIFT	6
+#define LVL_SIZE	(1 << LVL_SHIFT)
+#define LVL_MASK	(LVL_SIZE - 1)
+
+/* Level depth */
+#define LVL_DEPTH	6
+
+/*
+ * The resulting wheel size. If NOHZ is configured we allocate two
+ * wheels so we have a separate storage for the deferrable timers.
  */
-#define TVN_BITS (CONFIG_BASE_SMALL ? 4 : 6)
-#define TVR_BITS (CONFIG_BASE_SMALL ? 6 : 8)
-#define TVN_SIZE (1 << TVN_BITS)
-#define TVR_SIZE (1 << TVR_BITS)
-#define TVN_MASK (TVN_SIZE - 1)
-#define TVR_MASK (TVR_SIZE - 1)
-#define MAX_TVAL ((unsigned long)((1ULL << (TVR_BITS + 4*TVN_BITS)) - 1))
+#define WHEEL_SIZE	(LVL_SIZE * LVL_DEPTH)
 
-struct tvec {
-	struct hlist_head vec[TVN_SIZE];
-};
+/* Level offsets in the wheel */
+#define LVL0_OFFS	(0)
+#define LVL1_OFFS	(LVL_SIZE)
+#define LVL2_OFFS	(LVL1_OFFS + LVL_SIZE)
+#define LVL3_OFFS	(LVL2_OFFS + LVL_SIZE)
+#define LVL4_OFFS	(LVL3_OFFS + LVL_SIZE)
+#define LVL5_OFFS	(LVL4_OFFS + LVL_SIZE)
+
+/* Clock divisor for the next level */
+#define LVL_CLK_SHIFT	3
+#define LVL_CLK_DIV	(1 << LVL_CLK_SHIFT)
+#define LVL_CLK_MASK	(LVL_CLK_DIV - 1)
+
+/* The shift constants for selecting the bucket at the levels */
+#define LVL1_SHIFT	(1 * LVL_CLK_SHIFT)
+#define LVL2_SHIFT	(2 * LVL_CLK_SHIFT)
+#define LVL3_SHIFT	(3 * LVL_CLK_SHIFT)
+#define LVL4_SHIFT	(4 * LVL_CLK_SHIFT)
+#define LVL5_SHIFT	(5 * LVL_CLK_SHIFT)
+
+/* The granularity of each level */
+#define LVL0_GRAN	0x00000001
+#define LVL1_GRAN	(LVL0_GRAN << LVL_CLK_SHIFT)
+#define LVL2_GRAN	(LVL1_GRAN << LVL_CLK_SHIFT)
+#define LVL3_GRAN	(LVL2_GRAN << LVL_CLK_SHIFT)
+#define LVL4_GRAN	(LVL3_GRAN << LVL_CLK_SHIFT)
+#define LVL5_GRAN	(LVL4_GRAN << LVL_CLK_SHIFT)
 
-struct tvec_root {
-	struct hlist_head vec[TVR_SIZE];
-};
+/*
+ * The time start value for each level to select the bucket at enqueue
+ * time.
+ */
+#define LVL1_TSTART	(LVL_SIZE - 1)
+#define LVL2_TSTART	(LVL1_TSTART << LVL_CLK_SHIFT)
+#define LVL3_TSTART	(LVL2_TSTART << LVL_CLK_SHIFT)
+#define LVL4_TSTART	(LVL3_TSTART << LVL_CLK_SHIFT)
+#define LVL5_TSTART	(LVL4_TSTART << LVL_CLK_SHIFT)
+
+#ifdef CONFIG_NO_HZ_COMMON
+# define NR_BASES	2
+# define BASE_STD	0
+# define BASE_DEF	1
+#else
+# define NR_BASES	1
+# define BASE_STD	0
+# define BASE_DEF	0
+#endif
 
 struct timer_base {
-	spinlock_t lock;
-	struct timer_list *running_timer;
-	unsigned long clk;
-	unsigned long next_timer;
-	unsigned long active_timers;
-	unsigned long all_timers;
-	int cpu;
-	bool migration_enabled;
-	bool nohz_active;
-	struct tvec_root tv1;
-	struct tvec tv2;
-	struct tvec tv3;
-	struct tvec tv4;
-	struct tvec tv5;
+	spinlock_t		lock;
+	struct timer_list	*running_timer;
+	unsigned long		clk;
+	unsigned int		cpu;
+	bool			migration_enabled;
+	bool			nohz_active;
+	DECLARE_BITMAP(pending_map, WHEEL_SIZE);
+	struct hlist_head	vectors[WHEEL_SIZE];
 } ____cacheline_aligned;
 
-
-static DEFINE_PER_CPU(struct timer_base, timer_bases);
+static DEFINE_PER_CPU(struct timer_base, timer_bases[NR_BASES]);
 
 #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
 unsigned int sysctl_timer_migration = 1;
@@ -106,15 +199,17 @@ void timers_update_migration(bool update
 	unsigned int cpu;
 
 	/* Avoid the loop, if nothing to update */
-	if (this_cpu_read(timer_bases.migration_enabled) == on)
+	if (this_cpu_read(timer_bases[BASE_STD].migration_enabled) == on)
 		return;
 
 	for_each_possible_cpu(cpu) {
-		per_cpu(timer_bases.migration_enabled, cpu) = on;
+		per_cpu(timer_bases[BASE_STD].migration_enabled, cpu) = on;
+		per_cpu(timer_bases[BASE_DEF].migration_enabled, cpu) = on;
 		per_cpu(hrtimer_bases.migration_enabled, cpu) = on;
 		if (!update_nohz)
 			continue;
-		per_cpu(timer_bases.nohz_active, cpu) = true;
+		per_cpu(timer_bases[BASE_STD].nohz_active, cpu) = true;
+		per_cpu(timer_bases[BASE_DEF].nohz_active, cpu) = true;
 		per_cpu(hrtimer_bases.nohz_active, cpu) = true;
 	}
 }
@@ -133,20 +228,6 @@ int timer_migration_handler(struct ctl_t
 	mutex_unlock(&mutex);
 	return ret;
 }
-
-static inline struct timer_base *get_target_base(struct timer_base *base,
-						int pinned)
-{
-	if (pinned || !base->migration_enabled)
-		return this_cpu_ptr(&timer_bases);
-	return per_cpu_ptr(&timer_bases, get_nohz_timer_target());
-}
-#else
-static inline struct timer_base *get_target_base(struct timer_base *base,
-						int pinned)
-{
-	return this_cpu_ptr(&timer_bases);
-}
 #endif
 
 static unsigned long round_jiffies_common(unsigned long j, int cpu,
@@ -370,78 +451,84 @@ void set_timer_slack(struct timer_list *
 }
 EXPORT_SYMBOL_GPL(set_timer_slack);
 
+static inline unsigned int timer_get_idx(struct timer_list *timer)
+{
+	return (timer->flags & TIMER_ARRAYMASK) >> TIMER_ARRAYSHIFT;
+}
+
+static inline void timer_set_idx(struct timer_list *timer, unsigned int idx)
+{
+	timer->flags = (timer->flags & ~TIMER_ARRAYMASK) |
+			idx << TIMER_ARRAYSHIFT;
+}
+
+/*
+ * Helper function to calculate the array index for a given expiry
+ * time.
+ */
+static inline unsigned calc_index(unsigned expires, unsigned gran,
+				  unsigned sft, unsigned base)
+{
+	return base + (((expires + gran) >> sft) & LVL_MASK);
+}
+
 static void
 __internal_add_timer(struct timer_base *base, struct timer_list *timer)
 {
 	unsigned long expires = timer->expires;
-	unsigned long idx = expires - base->clk;
+	unsigned long delta = expires - base->clk;
 	struct hlist_head *vec;
+	unsigned int idx;
 
-	if (idx < TVR_SIZE) {
-		int i = expires & TVR_MASK;
-		vec = base->tv1.vec + i;
-	} else if (idx < 1 << (TVR_BITS + TVN_BITS)) {
-		int i = (expires >> TVR_BITS) & TVN_MASK;
-		vec = base->tv2.vec + i;
-	} else if (idx < 1 << (TVR_BITS + 2 * TVN_BITS)) {
-		int i = (expires >> (TVR_BITS + TVN_BITS)) & TVN_MASK;
-		vec = base->tv3.vec + i;
-	} else if (idx < 1 << (TVR_BITS + 3 * TVN_BITS)) {
-		int i = (expires >> (TVR_BITS + 2 * TVN_BITS)) & TVN_MASK;
-		vec = base->tv4.vec + i;
-	} else if ((signed long) idx < 0) {
-		/*
-		 * Can happen if you add a timer with expires == jiffies,
-		 * or you set a timer to go off in the past
-		 */
-		vec = base->tv1.vec + (base->clk & TVR_MASK);
+	if (delta < LVL1_TSTART) {
+		idx = (expires + LVL0_GRAN) & LVL_MASK;
+	} else if (delta < LVL2_TSTART) {
+		idx = calc_index(expires, LVL1_GRAN, LVL1_SHIFT, LVL1_OFFS);
+	} else if (delta < LVL3_TSTART) {
+		idx = calc_index(expires, LVL2_GRAN, LVL2_SHIFT, LVL2_OFFS);
+	} else if (delta < LVL4_TSTART) {
+		idx = calc_index(expires, LVL3_GRAN, LVL3_SHIFT, LVL3_OFFS);
+	} else if (delta < LVL5_TSTART) {
+		idx = calc_index(expires, LVL4_GRAN, LVL4_SHIFT, LVL4_OFFS);
+	} else if ((long) delta < 0) {
+		idx = base->clk & LVL_MASK;
 	} else {
-		int i;
-		/* If the timeout is larger than MAX_TVAL (on 64-bit
-		 * architectures or with CONFIG_BASE_SMALL=1) then we
-		 * use the maximum timeout.
+		/*
+		 * The long timeouts go into the last array level. They
+		 * have to be cascaded eventually, but most of them
+		 * are removed way before cascading takes place.
 		 */
-		if (idx > MAX_TVAL) {
-			idx = MAX_TVAL;
-			expires = idx + base->clk;
-		}
-		i = (expires >> (TVR_BITS + 3 * TVN_BITS)) & TVN_MASK;
-		vec = base->tv5.vec + i;
+		idx = calc_index(expires, LVL5_GRAN, LVL5_SHIFT, LVL5_OFFS);
 	}
 
+	/*
+	 * Enqueue the timer into the array bucket, mark it pending in
+	 * the bitmap and store the index in the timer flags.
+	 */
+	vec = base->vectors + idx;
 	hlist_add_head(&timer->entry, vec);
+	__set_bit(idx, base->pending_map);
+	timer_set_idx(timer, idx);
 }
 
 static void internal_add_timer(struct timer_base *base, struct timer_list *timer)
 {
-	/* Advance base->jiffies, if the base is empty */
-	if (!base->all_timers++)
-		base->clk = jiffies;
-
 	__internal_add_timer(base, timer);
-	/*
-	 * Update base->active_timers and base->next_timer
-	 */
-	if (!(timer->flags & TIMER_DEFERRABLE)) {
-		if (!base->active_timers++ ||
-		    time_before(timer->expires, base->next_timer))
-			base->next_timer = timer->expires;
-	}
 
 	/*
 	 * Check whether the other CPU is in dynticks mode and needs
-	 * to be triggered to reevaluate the timer wheel.
-	 * We are protected against the other CPU fiddling
-	 * with the timer by holding the timer base lock. This also
-	 * makes sure that a CPU on the way to stop its tick can not
-	 * evaluate the timer wheel.
+	 * to be triggered to reevaluate the timer wheel.  We are
+	 * protected against the other CPU fiddling with the timer by
+	 * holding the timer base lock. This also makes sure that a
+	 * CPU on the way to stop its tick can not evaluate the timer
+	 * wheel.
 	 *
 	 * Spare the IPI for deferrable timers on idle targets though.
 	 * The next busy ticks will take care of it. Except full dynticks
 	 * require special care against races with idle_cpu(), lets deal
 	 * with that later.
 	 */
-	if (base->nohz_active) {
+	if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && base->nohz_active) {
 		if (!(timer->flags & TIMER_DEFERRABLE) ||
 		    tick_nohz_full_cpu(base->cpu))
 			wake_up_nohz_cpu(base->cpu);
@@ -706,54 +793,92 @@ static inline void detach_timer(struct t
 	entry->next = LIST_POISON2;
 }
 
-static inline void
-detach_expired_timer(struct timer_list *timer, struct timer_base *base)
+static inline void detach_expired_timer(struct timer_list *timer)
 {
 	detach_timer(timer, true);
-	if (!(timer->flags & TIMER_DEFERRABLE))
-		base->active_timers--;
-	base->all_timers--;
 }
 
 static int detach_if_pending(struct timer_list *timer, struct timer_base *base,
 			     bool clear_pending)
 {
+	unsigned idx = timer_get_idx(timer);
+
 	if (!timer_pending(timer))
 		return 0;
 
+	if (hlist_is_last_node(&timer->entry, base->vectors + idx))
+		__clear_bit(idx, base->pending_map);
+
 	detach_timer(timer, clear_pending);
-	if (!(timer->flags & TIMER_DEFERRABLE)) {
-		base->active_timers--;
-		if (timer->expires == base->next_timer)
-			base->next_timer = base->clk;
-	}
-	/* If this was the last timer, advance base->jiffies */
-	if (!--base->all_timers)
-		base->clk = jiffies;
 	return 1;
 }
 
+static inline struct timer_base *get_timer_cpu_base(u32 tflags, u32 cpu)
+{
+	struct timer_base *base = per_cpu_ptr(&timer_bases[BASE_STD], cpu);
+
+	/*
+	 * If the timer is deferrable and nohz is active then we need to use
+	 * the deferrable base.
+	 */
+	if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && base->nohz_active &&
+	    (tflags & TIMER_DEFERRABLE))
+		base = per_cpu_ptr(&timer_bases[BASE_DEF], cpu);
+	return base;
+}
+
+static inline struct timer_base *get_timer_this_cpu_base(u32 tflags)
+{
+	struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]);
+
+	/*
+	 * If the timer is deferrable and nohz is active then we need to use
+	 * the deferrable base.
+	 */
+	if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && base->nohz_active &&
+	    (tflags & TIMER_DEFERRABLE))
+		base = this_cpu_ptr(&timer_bases[BASE_DEF]);
+	return base;
+}
+
+static inline struct timer_base *get_timer_base(u32 tflags)
+{
+	return get_timer_cpu_base(tflags, tflags & TIMER_BASEMASK);
+}
+
+static inline struct timer_base *get_target_base(struct timer_base *base,
+						 unsigned tflags)
+{
+#ifdef CONFIG_NO_HZ_COMMON
+	if ((tflags & TIMER_PINNED) || !base->migration_enabled)
+		return get_timer_this_cpu_base(tflags);
+	return get_timer_cpu_base(tflags, get_nohz_timer_target());
+#else
+	return get_timer_this_cpu_base(tflags);
+#endif
+}
+
 /*
- * We are using hashed locking: holding per_cpu(timer_bases).lock
- * means that all timers which are tied to this base via timer->base are
- * locked, and the base itself is locked too.
+ * We are using hashed locking: Holding per_cpu(timer_bases[x]).lock means
+ * that all timers which are tied to this base are locked, and the base itself
+ * is locked too.
  *
  * So __run_timers/migrate_timers can safely modify all timers which could
- * be found on ->tvX lists.
+ * be found in the base->vectors array.
  *
- * When the timer's base is locked and removed from the list, the
- * TIMER_MIGRATING flag is set, FIXME
+ * When a timer is migrating then the TIMER_MIGRATING flag is set and we need
+ * to wait until the migration is done.
  */
 static struct timer_base *lock_timer_base(struct timer_list *timer,
-					unsigned long *flags)
+					  unsigned long *flags)
 	__acquires(timer->base->lock)
 {
 	for (;;) {
-		u32 tf = timer->flags;
 		struct timer_base *base;
+		u32 tf = timer->flags;
 
 		if (!(tf & TIMER_MIGRATING)) {
-			base = per_cpu_ptr(&timer_bases, tf & TIMER_CPUMASK);
+			base = get_timer_base(tf);
 			spin_lock_irqsave(&base->lock, *flags);
 			if (timer->flags == tf)
 				return base;
@@ -770,6 +895,27 @@ static inline int
 	unsigned long flags;
 	int ret = 0;
 
+	/*
+	 * TODO: Calculate the array bucket of the timer right here w/o
+	 * holding the base lock. This allows to check not only
+	 * timer->expires == expires below, but also whether the timer
+	 * ends up in the same bucket. If we really need to requeue
+	 * the timer then we check whether base->clk has
+	 * advanced between here and locking the timer base. If
+	 * jiffies advanced we have to recalc the array bucket with the
+	 * lock held.
+	 */
+
+	/*
+	 * This is a common optimization triggered by the
+	 * networking code - if the timer is re-modified
+	 * to be the same thing then just return:
+	 */
+	if (timer_pending(timer)) {
+		if (timer->expires == expires)
+			return 1;
+	}
+
 	timer_stats_timer_set_start_info(timer);
 	BUG_ON(!timer->function);
 
@@ -781,15 +927,15 @@ static inline int
 
 	debug_activate(timer, expires);
 
-	new_base = get_target_base(base, timer->flags & TIMER_PINNED);
+	new_base = get_target_base(base, timer->flags);
 
 	if (base != new_base) {
 		/*
-		 * We are trying to schedule the timer on the local CPU.
+		 * We are trying to schedule the timer on the new base.
 		 * However we can't change timer's base while it is running,
 		 * otherwise del_timer_sync() can't detect that the timer's
-		 * handler yet has not finished. This also guarantees that
-		 * the timer is serialized wrt itself.
+		 * handler yet has not finished. This also guarantees that the
+		 * timer is serialized wrt itself.
 		 */
 		if (likely(base->running_timer != timer)) {
 			/* See the comment in lock_timer_base() */
@@ -828,45 +974,6 @@ int mod_timer_pending(struct timer_list
 }
 EXPORT_SYMBOL(mod_timer_pending);
 
-/*
- * Decide where to put the timer while taking the slack into account
- *
- * Algorithm:
- *   1) calculate the maximum (absolute) time
- *   2) calculate the highest bit where the expires and new max are different
- *   3) use this bit to make a mask
- *   4) use the bitmask to round down the maximum time, so that all last
- *      bits are zeros
- */
-static inline
-unsigned long apply_slack(struct timer_list *timer, unsigned long expires)
-{
-	unsigned long expires_limit, mask;
-	int bit;
-
-	if (timer->slack >= 0) {
-		expires_limit = expires + timer->slack;
-	} else {
-		long delta = expires - jiffies;
-
-		if (delta < 256)
-			return expires;
-
-		expires_limit = expires + delta / 256;
-	}
-	mask = expires ^ expires_limit;
-	if (mask == 0)
-		return expires;
-
-	bit = __fls(mask);
-
-	mask = (1UL << bit) - 1;
-
-	expires_limit = expires_limit & ~(mask);
-
-	return expires_limit;
-}
-
 /**
  * mod_timer - modify a timer's timeout
  * @timer: the timer to be modified
@@ -889,16 +996,6 @@ unsigned long apply_slack(struct timer_l
  */
 int mod_timer(struct timer_list *timer, unsigned long expires)
 {
-	expires = apply_slack(timer, expires);
-
-	/*
-	 * This is a common optimization triggered by the
-	 * networking code - if the timer is re-modified
-	 * to be the same thing then just return:
-	 */
-	if (timer_pending(timer) && timer->expires == expires)
-		return 1;
-
 	return __mod_timer(timer, expires, false);
 }
 EXPORT_SYMBOL(mod_timer);
@@ -933,13 +1030,14 @@ EXPORT_SYMBOL(add_timer);
  */
 void add_timer_on(struct timer_list *timer, int cpu)
 {
-	struct timer_base *new_base = per_cpu_ptr(&timer_bases, cpu);
-	struct timer_base *base;
+	struct timer_base *new_base, *base;
 	unsigned long flags;
 
 	timer_stats_timer_set_start_info(timer);
 	BUG_ON(timer_pending(timer) || !timer->function);
 
+	new_base = get_timer_cpu_base(timer->flags, cpu);
+
 	/*
 	 * If @timer was on a different CPU, it should be migrated with the
 	 * old base locked to prevent other operations proceeding with the
@@ -1085,28 +1183,6 @@ int del_timer_sync(struct timer_list *ti
 EXPORT_SYMBOL(del_timer_sync);
 #endif
 
-static int cascade(struct timer_base *base, struct tvec *tv, int index)
-{
-	/* cascade all the timers from tv up one level */
-	struct timer_list *timer;
-	struct hlist_node *tmp;
-	struct hlist_head tv_list;
-
-	hlist_move_list(tv->vec + index, &tv_list);
-
-	/*
-	 * We are removing _all_ timers from the list, so we
-	 * don't have to detach them individually.
-	 */
-	hlist_for_each_entry_safe(timer, tmp, &tv_list, entry) {
-		trace_timer_cascade(timer);
-		/* No accounting, while moving them */
-		__internal_add_timer(base, timer);
-	}
-
-	return index;
-}
-
 static void call_timer_fn(struct timer_list *timer, void (*fn)(unsigned long),
 			  unsigned long data)
 {
@@ -1150,68 +1226,97 @@ static void call_timer_fn(struct timer_l
 	}
 }
 
-#define INDEX(N) ((base->clk >> (TVR_BITS + (N) * TVN_BITS)) & TVN_MASK)
+static void expire_timers(struct timer_base *base, struct hlist_head *head)
+{
+	while (!hlist_empty(head)) {
+		struct timer_list *timer;
+		void (*fn)(unsigned long);
+		unsigned long data;
+
+		timer = hlist_entry(head->first, struct timer_list, entry);
+		fn = timer->function;
+		data = timer->data;
+
+		timer_stats_account_timer(timer);
+
+		base->running_timer = timer;
+		detach_expired_timer(timer);
+
+		if (timer->flags & TIMER_IRQSAFE) {
+			spin_unlock(&base->lock);
+			call_timer_fn(timer, fn, data);
+			spin_lock(&base->lock);
+		} else {
+			spin_unlock_irq(&base->lock);
+			call_timer_fn(timer, fn, data);
+			spin_lock_irq(&base->lock);
+		}
+	}
+}
+
+static int collect_expired_timers(struct timer_base *base,
+				  struct hlist_head *heads)
+{
+	unsigned long clock = base->clk;
+	struct hlist_head tmp, *vec;
+	int i, levels = 0;
+	unsigned int idx;
+
+	/* Expire the regular buckets */
+	for (i = 0; i < LVL_DEPTH - 1; i++) {
+		idx = (clock & LVL_MASK) + i * LVL_SIZE;
+
+		if (__test_and_clear_bit(idx, base->pending_map)) {
+			vec = base->vectors + idx;
+			hlist_move_list(vec, heads++);
+			levels++;
+		}
+		/* Is it time to look at the next level? */
+		if (clock & LVL_CLK_MASK)
+			return levels;
+		/* Shift clock for the next level granularity */
+		clock >>= LVL_CLK_SHIFT;
+	}
+
+	/* Cascading, sigh... */
+	idx = (clock & LVL_MASK) + i * LVL_SIZE;
+
+	if (__test_and_clear_bit(idx, base->pending_map)) {
+		vec = base->vectors + idx;
+		hlist_move_list(vec, &tmp);
+		/* Make sure we queue them in the future */
+		base->clk++;
+		while (!hlist_empty(&tmp)) {
+			struct timer_list *timer;
+
+			timer = hlist_entry(tmp.first, struct timer_list, entry);
+			trace_timer_cascade(timer);
+			__hlist_del(&timer->entry);
+			__internal_add_timer(base, timer);
+		}
+		base->clk--;
+	}
+	return levels;
+}
 
 /**
  * __run_timers - run all expired timers (if any) on this CPU.
  * @base: the timer vector to be processed.
- *
- * This function cascades all vectors and executes all expired timer
- * vectors.
  */
 static inline void __run_timers(struct timer_base *base)
 {
-	struct timer_list *timer;
+	struct hlist_head heads[LVL_DEPTH];
+	int levels;
 
 	spin_lock_irq(&base->lock);
 
 	while (time_after_eq(jiffies, base->clk)) {
-		struct hlist_head work_list;
-		struct hlist_head *head = &work_list;
-		int index;
-
-		if (!base->all_timers) {
-			base->clk = jiffies;
-			break;
-		}
 
-		index = base->clk & TVR_MASK;
+		levels = collect_expired_timers(base, heads);
+		base->clk++;
 
-		/*
-		 * Cascade timers:
-		 */
-		if (!index &&
-			(!cascade(base, &base->tv2, INDEX(0))) &&
-				(!cascade(base, &base->tv3, INDEX(1))) &&
-					!cascade(base, &base->tv4, INDEX(2)))
-			cascade(base, &base->tv5, INDEX(3));
-		++base->clk;
-		hlist_move_list(base->tv1.vec + index, head);
-		while (!hlist_empty(head)) {
-			void (*fn)(unsigned long);
-			unsigned long data;
-			bool irqsafe;
-
-			timer = hlist_entry(head->first, struct timer_list, entry);
-			fn = timer->function;
-			data = timer->data;
-			irqsafe = timer->flags & TIMER_IRQSAFE;
-
-			timer_stats_account_timer(timer);
-
-			base->running_timer = timer;
-			detach_expired_timer(timer, base);
-
-			if (irqsafe) {
-				spin_unlock(&base->lock);
-				call_timer_fn(timer, fn, data);
-				spin_lock(&base->lock);
-			} else {
-				spin_unlock_irq(&base->lock);
-				call_timer_fn(timer, fn, data);
-				spin_lock_irq(&base->lock);
-			}
-		}
+		while (levels--)
+			expire_timers(base, heads + levels);
 	}
 	base->running_timer = NULL;
 	spin_unlock_irq(&base->lock);
@@ -1219,78 +1324,93 @@ static inline void __run_timers(struct t
 
 #ifdef CONFIG_NO_HZ_COMMON
 /*
- * Find out when the next timer event is due to happen. This
- * is used on S/390 to stop all activity when a CPU is idle.
- * This function needs to be called with interrupts disabled.
+ * Find the next pending bucket of a level. Search from @offset + @clk upwards
+ * and if nothing there, search from start of the level (@offset) up to
+ * @offset + clk.
+ */
+static int next_pending_bucket(struct timer_base *base, unsigned offset,
+			       unsigned clk)
+{
+	unsigned pos, start = offset + clk;
+	unsigned end = offset + LVL_SIZE;
+
+	pos = find_next_bit(base->pending_map, end, start);
+	if (pos < end)
+		return pos - start;
+
+	pos = find_next_bit(base->pending_map, start, offset);
+	return pos < start ? pos + LVL_SIZE - start : -1;
+}
+
+/*
+ * Search the first expiring timer in the various clock levels.
+ *
+ * Note: This implementation might be suboptimal vs. timers enqueued in the
+ *	 cascade level because we do not look at the timers to figure out when
+ *	 they really expire. So for now, we just treat the cascading timers
+ *	 like any other timer. If each cascading bucket has a timer, we wake
+ *	 up with the granularity of the last level.
  */
 static unsigned long __next_timer_interrupt(struct timer_base *base)
 {
-	unsigned long clk = base->clk;
-	unsigned long expires = clk + NEXT_TIMER_MAX_DELTA;
-	int index, slot, array, found = 0;
-	struct timer_list *nte;
-	struct tvec *varray[4];
-
-	/* Look for timer events in tv1. */
-	index = slot = clk & TVR_MASK;
-	do {
-		hlist_for_each_entry(nte, base->tv1.vec + slot, entry) {
-			if (nte->flags & TIMER_DEFERRABLE)
-				continue;
-
-			found = 1;
-			expires = nte->expires;
-			/* Look at the cascade bucket(s)? */
-			if (!index || slot < index)
-				goto cascade;
-			return expires;
-		}
-		slot = (slot + 1) & TVR_MASK;
-	} while (slot != index);
+	unsigned long clk, next, adj;
+	unsigned lvl, offset = 0;
 
-cascade:
-	/* Calculate the next cascade event */
-	if (index)
-		clk += TVR_SIZE - index;
-	clk >>= TVR_BITS;
-
-	/* Check tv2-tv5. */
-	varray[0] = &base->tv2;
-	varray[1] = &base->tv3;
-	varray[2] = &base->tv4;
-	varray[3] = &base->tv5;
-
-	for (array = 0; array < 4; array++) {
-		struct tvec *varp = varray[array];
-
-		index = slot = clk & TVN_MASK;
-		do {
-			hlist_for_each_entry(nte, varp->vec + slot, entry) {
-				if (nte->flags & TIMER_DEFERRABLE)
-					continue;
-
-				found = 1;
-				if (time_before(nte->expires, expires))
-					expires = nte->expires;
-			}
-			/*
-			 * Do we still search for the first timer or are
-			 * we looking up the cascade buckets ?
-			 */
-			if (found) {
-				/* Look at the cascade bucket(s)? */
-				if (!index || slot < index)
-					break;
-				return expires;
-			}
-			slot = (slot + 1) & TVN_MASK;
-		} while (slot != index);
-
-		if (index)
-			clk += TVN_SIZE - index;
-		clk >>= TVN_BITS;
+	spin_lock(&base->lock);
+	clk = base->clk;
+	next = clk + NEXT_TIMER_MAX_DELTA;
+	for (lvl = 0; lvl < LVL_DEPTH; lvl++, offset += LVL_SIZE) {
+		int pos = next_pending_bucket(base, offset, clk & LVL_MASK);
+
+		if (pos >= 0) {
+			unsigned long tmp = clk + (unsigned long) pos;
+
+			tmp <<= lvl * LVL_CLK_SHIFT;
+			if (time_before(tmp, next))
+				next = tmp;
+		}
+		/*
+		 * Clock for the next level. If the current level clock lower
+		 * bits are zero, we look at the next level as is. If not we
+		 * need to advance it by one because that's going to be the
+		 * next expiring bucket in that level. base->clk is the next
+		 * expiring jiffie. So in case of:
+		 *
+		 * LVL5 LVL4 LVL3 LVL2 LVL1 LVL0
+		 *  0    0    0    0    0    0
+		 *
+		 * we have to look at all levels @index 0. With
+		 *
+		 * LVL5 LVL4 LVL3 LVL2 LVL1 LVL0
+		 *  0    0    0    0    0    2
+		 *
+		 * LVL0 has the next expiring bucket @index 2. The upper
+		 * levels have the next expiring bucket @index 1.
+		 *
+		 * In case that the propagation wraps the next level the same
+		 * rules apply:
+		 *
+		 * LVL5 LVL4 LVL3 LVL2 LVL1 LVL0
+		 *  0    0    0    0    F    2
+		 *
+		 * So after looking at LVL0 we get:
+		 *
+		 * LVL5 LVL4 LVL3 LVL2 LVL1
+		 *  0    0    0    1    0
+		 *
+		 * So no propagation from LVL1 to LVL2 because that happened
+		 * with the add already, but then we need to propagate further
+		 * from LVL2 to LVL3.
+		 *
+		 * So the simple check whether the lower bits of the current
+		 * level are 0 or not is sufficient for all cases.
+		 */
+		adj = clk & LVL_CLK_MASK ? 1 : 0;
+		clk >>= LVL_CLK_SHIFT;
+		clk += adj;
 	}
-	return expires;
+	spin_unlock(&base->lock);
+	return next;
 }
 
 /*
@@ -1336,7 +1456,7 @@ static u64 cmp_next_hrtimer_event(u64 ba
  */
 u64 get_next_timer_interrupt(unsigned long basej, u64 basem)
 {
-	struct timer_base *base = this_cpu_ptr(&timer_bases);
+	struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]);
 	u64 expires = KTIME_MAX;
 	unsigned long nextevt;
 
@@ -1347,17 +1467,11 @@ u64 get_next_timer_interrupt(unsigned lo
 	if (cpu_is_offline(smp_processor_id()))
 		return expires;
 
-	spin_lock(&base->lock);
-	if (base->active_timers) {
-		if (time_before_eq(base->next_timer, base->clk))
-			base->next_timer = __next_timer_interrupt(base);
-		nextevt = base->next_timer;
-		if (time_before_eq(nextevt, basej))
-			expires = basem;
-		else
-			expires = basem + (nextevt - basej) * TICK_NSEC;
-	}
-	spin_unlock(&base->lock);
+	nextevt = __next_timer_interrupt(base);
+	if (time_before_eq(nextevt, basej))
+		expires = basem;
+	else
+		expires = basem + (nextevt - basej) * TICK_NSEC;
 
 	return cmp_next_hrtimer_event(basem, expires);
 }
@@ -1388,10 +1502,14 @@ void update_process_times(int user_tick)
  */
 static void run_timer_softirq(struct softirq_action *h)
 {
-	struct timer_base *base = this_cpu_ptr(&timer_bases);
+	struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]);
 
-	if (time_after_eq(jiffies, base->clk))
-		__run_timers(base);
+	if (!time_after_eq(jiffies, base->clk))
+		return;
+
+	__run_timers(base);
+	if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && base->nohz_active)
+		__run_timers(this_cpu_ptr(&timer_bases[BASE_DEF]));
 }
 
 /*
@@ -1542,7 +1660,6 @@ static void migrate_timer_list(struct ti
 
 	while (!hlist_empty(head)) {
 		timer = hlist_entry(head->first, struct timer_list, entry);
-		/* We ignore the accounting on the dying cpu */
 		detach_timer(timer, false);
 		timer->flags = (timer->flags & ~TIMER_BASEMASK) | cpu;
 		internal_add_timer(new_base, timer);
@@ -1553,35 +1670,29 @@ static void migrate_timers(int cpu)
 {
 	struct timer_base *old_base;
 	struct timer_base *new_base;
-	int i;
+	int b, i;
 
 	BUG_ON(cpu_online(cpu));
-	old_base = per_cpu_ptr(&timer_bases, cpu);
-	new_base = get_cpu_ptr(&timer_bases);
-	/*
-	 * The caller is globally serialized and nobody else
-	 * takes two locks at once, deadlock is not possible.
-	 */
-	spin_lock_irq(&new_base->lock);
-	spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING);
 
-	BUG_ON(old_base->running_timer);
+	for (b = 0; b < NR_BASES; b++) {
+		old_base = per_cpu_ptr(&timer_bases[b], cpu);
+		new_base = get_cpu_ptr(&timer_bases[b]);
+		/*
+		 * The caller is globally serialized and nobody else
+		 * takes two locks at once, deadlock is not possible.
+		 */
+		spin_lock_irq(&new_base->lock);
+		spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING);
+
+		BUG_ON(old_base->running_timer);
 
-	for (i = 0; i < TVR_SIZE; i++)
-		migrate_timer_list(new_base, old_base->tv1.vec + i);
-	for (i = 0; i < TVN_SIZE; i++) {
-		migrate_timer_list(new_base, old_base->tv2.vec + i);
-		migrate_timer_list(new_base, old_base->tv3.vec + i);
-		migrate_timer_list(new_base, old_base->tv4.vec + i);
-		migrate_timer_list(new_base, old_base->tv5.vec + i);
-	}
-
-	old_base->active_timers = 0;
-	old_base->all_timers = 0;
-
-	spin_unlock(&old_base->lock);
-	spin_unlock_irq(&new_base->lock);
-	put_cpu_ptr(&timer_bases);
+		for (i = 0; i < WHEEL_SIZE; i++)
+			migrate_timer_list(new_base, old_base->vectors + i);
+
+		spin_unlock(&old_base->lock);
+		spin_unlock_irq(&new_base->lock);
+		put_cpu_ptr(&timer_bases);
+	}
 }
 
 static int timer_cpu_notify(struct notifier_block *self,
@@ -1609,13 +1720,15 @@ static inline void timer_register_cpu_no
 
 static void __init init_timer_cpu(int cpu)
 {
-	struct timer_base *base = per_cpu_ptr(&timer_bases, cpu);
-
-	base->cpu = cpu;
-	spin_lock_init(&base->lock);
+	struct timer_base *base;
+	int i;
 
-	base->clk = jiffies;
-	base->next_timer = base->clk;
+	for (i = 0; i < NR_BASES; i++) {
+		base = per_cpu_ptr(&timer_bases[i], cpu);
+		base->cpu = cpu;
+		spin_lock_init(&base->lock);
+		base->clk = jiffies;
+	}
 }
 
 static void __init init_timer_cpus(void)

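As an illustration of the bucket math above, here is a minimal user-space
sketch (not part of the patch; the constants and calc_index() mirror the
hunks above, the sample values are made up):

	#include <stdio.h>

	#define LVL_CLK_SHIFT	3
	#define LVL_SHIFT	6
	#define LVL_SIZE	(1 << LVL_SHIFT)
	#define LVL_MASK	(LVL_SIZE - 1)

	static unsigned calc_index(unsigned expires, unsigned gran,
				   unsigned sft, unsigned base)
	{
		return base + (((expires + gran) >> sft) & LVL_MASK);
	}

	int main(void)
	{
		/*
		 * base->clk = 1000, expires = 1600: delta = 600, which at
		 * HZ=1000 falls into level 2 (512ms - ~4s, 64ms granularity).
		 */
		unsigned idx = calc_index(1600, 1 << (2 * LVL_CLK_SHIFT),
					  2 * LVL_CLK_SHIFT, 2 * LVL_SIZE);

		/*
		 * Prints "bucket 154": level 2 (offset 128), slot 26. That
		 * bucket is serviced when the wheel clock reaches 26 << 6 =
		 * 1664, so the +gran term makes the timer fire up to 64ms
		 * late, never early.
		 */
		printf("bucket %u\n", idx);
		return 0;
	}
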
^ permalink raw reply	[flat|nested] 62+ messages in thread

* [patch 15/20] timer: Move __run_timers() function
  2016-06-13  8:40 [patch 00/20] timer: Refactor the timer wheel Thomas Gleixner
                   ` (12 preceding siblings ...)
  2016-06-13  8:41 ` [patch 13/20] timer: Switch to a non cascading wheel Thomas Gleixner
@ 2016-06-13  8:41 ` Thomas Gleixner
  2016-06-13  8:41 ` [patch 14/20] timer: Remove slack leftovers Thomas Gleixner
                   ` (7 subsequent siblings)
  21 siblings, 0 replies; 62+ messages in thread
From: Thomas Gleixner @ 2016-06-13  8:41 UTC (permalink / raw)
  To: LKML
  Cc: Ingo Molnar, Peter Zijlstra, Paul E. McKenney, Eric Dumazet,
	Frederic Weisbecker, Chris Mason, Arjan van de Ven, rt,
	Anna-Maria Gleixner

[-- Attachment #1: timer-move-functions-around.patch --]
[-- Type: text/plain, Size: 1926 bytes --]

From: Anna-Maria Gleixner <anna-maria@linutronix.de>

Move __run_timers() below __next_timer_interrupt() and next_pending_bucket()
in preparation for the __run_timers() NOHZ optimization.

No functional change.

Signed-off-by: Anna-Maria Gleixner <anna-maria@linutronix.de>
Cc: rt@linutronix.de
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/time/timer.c |   46 +++++++++++++++++++++++-----------------------
 1 file changed, 23 insertions(+), 23 deletions(-)

--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -1280,29 +1280,6 @@ static int collect_expired_timers(struct
 	return levels;
 }
 
-/**
- * __run_timers - run all expired timers (if any) on this CPU.
- * @base: the timer vector to be processed.
- */
-static inline void __run_timers(struct timer_base *base)
-{
-	struct hlist_head heads[LVL_DEPTH];
-	int levels;
-
-	spin_lock_irq(&base->lock);
-
-	while (time_after_eq(jiffies, base->clk)) {
-
-		levels = collect_expired_timers(base, heads);
-		base->clk++;
-
-		while (levels--)
-			expire_timers(base, heads + levels);
-	}
-	base->running_timer = NULL;
-	spin_unlock_irq(&base->lock);
-}
-
 #ifdef CONFIG_NO_HZ_COMMON
 /*
  * Find the next pending bucket of a level. Search from @offset + @clk upwards
@@ -1478,6 +1455,29 @@ void update_process_times(int user_tick)
 	run_posix_cpu_timers(p);
 }
 
+/**
+ * __run_timers - run all expired timers (if any) on this CPU.
+ * @base: the timer vector to be processed.
+ */
+static inline void __run_timers(struct timer_base *base)
+{
+	struct hlist_head heads[LVL_DEPTH];
+	int levels;
+
+	spin_lock_irq(&base->lock);
+
+	while (time_after_eq(jiffies, base->clk)) {
+
+		levels = collect_expired_timers(base, heads);
+		base->clk++;
+
+		while (levels--)
+			expire_timers(base, heads + levels);
+	}
+	base->running_timer = NULL;
+	spin_unlock_irq(&base->lock);
+}
+
 /*
  * This function runs timers and the timer-tq in bottom half context.
  */

^ permalink raw reply	[flat|nested] 62+ messages in thread

* [patch 14/20] timer: Remove slack leftovers
  2016-06-13  8:40 [patch 00/20] timer: Refactor the timer wheel Thomas Gleixner
                   ` (13 preceding siblings ...)
  2016-06-13  8:41 ` [patch 15/20] timer: Move __run_timers() function Thomas Gleixner
@ 2016-06-13  8:41 ` Thomas Gleixner
  2016-06-13  8:41 ` [patch 16/20] timer: Optimize collect timers for NOHZ Thomas Gleixner
                   ` (6 subsequent siblings)
  21 siblings, 0 replies; 62+ messages in thread
From: Thomas Gleixner @ 2016-06-13  8:41 UTC (permalink / raw)
  To: LKML
  Cc: Ingo Molnar, Peter Zijlstra, Paul E. McKenney, Eric Dumazet,
	Frederic Weisbecker, Chris Mason, Arjan van de Ven, rt

[-- Attachment #1: timer--Remove-slack-leftovers.patch --]
[-- Type: text/plain, Size: 4979 bytes --]

We now have implicit batching in the timer wheel. The slack is no longer
used. Remove it.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 block/genhd.c                   |    5 -----
 drivers/mmc/host/jz4740_mmc.c   |    2 --
 drivers/power/bq27xxx_battery.c |    5 +----
 drivers/usb/host/ohci-hcd.c     |    1 -
 drivers/usb/host/xhci.c         |    2 --
 include/linux/timer.h           |    4 ----
 kernel/time/timer.c             |   19 -------------------
 lib/random32.c                  |    1 -
 8 files changed, 1 insertion(+), 38 deletions(-)

--- a/block/genhd.c
+++ b/block/genhd.c
@@ -1523,12 +1523,7 @@ static void __disk_unblock_events(struct
 	if (--ev->block)
 		goto out_unlock;
 
-	/*
-	 * Not exactly a latency critical operation, set poll timer
-	 * slack to 25% and kick event check.
-	 */
 	intv = disk_events_poll_jiffies(disk);
-	set_timer_slack(&ev->dwork.timer, intv / 4);
 	if (check_now)
 		queue_delayed_work(system_freezable_power_efficient_wq,
 				&ev->dwork, 0);
--- a/drivers/mmc/host/jz4740_mmc.c
+++ b/drivers/mmc/host/jz4740_mmc.c
@@ -1068,8 +1068,6 @@ static int jz4740_mmc_probe(struct platf
 	jz4740_mmc_clock_disable(host);
 	setup_timer(&host->timeout_timer, jz4740_mmc_timeout,
 			(unsigned long)host);
-	/* It is not important when it times out, it just needs to timeout. */
-	set_timer_slack(&host->timeout_timer, HZ);
 
 	host->use_dma = true;
 	if (host->use_dma && jz4740_mmc_acquire_dma_channels(host) != 0)
--- a/drivers/power/bq27xxx_battery.c
+++ b/drivers/power/bq27xxx_battery.c
@@ -735,11 +735,8 @@ static void bq27xxx_battery_poll(struct
 
 	bq27xxx_battery_update(di);
 
-	if (poll_interval > 0) {
-		/* The timer does not have to be accurate. */
-		set_timer_slack(&di->work.timer, poll_interval * HZ / 4);
+	if (poll_interval > 0)
 		schedule_delayed_work(&di->work, poll_interval * HZ);
-	}
 }
 
 /*
--- a/drivers/usb/host/ohci-hcd.c
+++ b/drivers/usb/host/ohci-hcd.c
@@ -500,7 +500,6 @@ static int ohci_init (struct ohci_hcd *o
 
 	setup_timer(&ohci->io_watchdog, io_watchdog_func,
 			(unsigned long) ohci);
-	set_timer_slack(&ohci->io_watchdog, msecs_to_jiffies(20));
 
 	ohci->hcca = dma_alloc_coherent (hcd->self.controller,
 			sizeof(*ohci->hcca), &ohci->hcca_dma, GFP_KERNEL);
--- a/drivers/usb/host/xhci.c
+++ b/drivers/usb/host/xhci.c
@@ -490,8 +490,6 @@ static void compliance_mode_recovery_tim
 	xhci->comp_mode_recovery_timer.expires = jiffies +
 			msecs_to_jiffies(COMP_MODE_RCVRY_MSECS);
 
-	set_timer_slack(&xhci->comp_mode_recovery_timer,
-			msecs_to_jiffies(COMP_MODE_RCVRY_MSECS));
 	add_timer(&xhci->comp_mode_recovery_timer);
 	xhci_dbg_trace(xhci, trace_xhci_dbg_quirks,
 			"Compliance mode recovery timer initialized");
--- a/include/linux/timer.h
+++ b/include/linux/timer.h
@@ -19,7 +19,6 @@ struct timer_list {
 	void			(*function)(unsigned long);
 	unsigned long		data;
 	u32			flags;
-	int			slack;
 
 #ifdef CONFIG_TIMER_STATS
 	int			start_pid;
@@ -73,7 +72,6 @@ struct timer_list {
 		.expires = (_expires),				\
 		.data = (_data),				\
 		.flags = (_flags),				\
-		.slack = -1,					\
 		__TIMER_LOCKDEP_MAP_INITIALIZER(		\
 			__FILE__ ":" __stringify(__LINE__))	\
 	}
@@ -193,8 +191,6 @@ extern int del_timer(struct timer_list *
 extern int mod_timer(struct timer_list *timer, unsigned long expires);
 extern int mod_timer_pending(struct timer_list *timer, unsigned long expires);
 
-extern void set_timer_slack(struct timer_list *time, int slack_hz);
-
 /*
  * The jiffies value which is added to now, when there is no timer
  * in the timer wheel:
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -432,24 +432,6 @@ unsigned long round_jiffies_up_relative(
 }
 EXPORT_SYMBOL_GPL(round_jiffies_up_relative);
 
-/**
- * set_timer_slack - set the allowed slack for a timer
- * @timer: the timer to be modified
- * @slack_hz: the amount of time (in jiffies) allowed for rounding
- *
- * Set the amount of time, in jiffies, that a certain timer has
- * in terms of slack. By setting this value, the timer subsystem
- * will schedule the actual timer somewhere between
- * the time mod_timer() asks for, and that time plus the slack.
- *
- * By setting the slack to -1, a percentage of the delay is used
- * instead.
- */
-void set_timer_slack(struct timer_list *timer, int slack_hz)
-{
-	timer->slack = slack_hz;
-}
-EXPORT_SYMBOL_GPL(set_timer_slack);
 
 static inline unsigned int timer_get_idx(struct timer_list *timer)
 {
@@ -753,7 +735,6 @@ static void do_init_timer(struct timer_l
 {
 	timer->entry.pprev = NULL;
 	timer->flags = flags | raw_smp_processor_id();
-	timer->slack = -1;
 #ifdef CONFIG_TIMER_STATS
 	timer->start_site = NULL;
 	timer->start_pid = -1;
--- a/lib/random32.c
+++ b/lib/random32.c
@@ -233,7 +233,6 @@ static void __prandom_timer(unsigned lon
 
 static void __init __prandom_start_seed_timer(void)
 {
-	set_timer_slack(&seed_timer, HZ);
 	seed_timer.expires = jiffies + msecs_to_jiffies(40 * MSEC_PER_SEC);
 	add_timer(&seed_timer);
 }

^ permalink raw reply	[flat|nested] 62+ messages in thread

* [patch 16/20] timer: Optimize collect timers for NOHZ
  2016-06-13  8:40 [patch 00/20] timer: Refactor the timer wheel Thomas Gleixner
                   ` (14 preceding siblings ...)
  2016-06-13  8:41 ` [patch 14/20] timer: Remove slack leftovers Thomas Gleixner
@ 2016-06-13  8:41 ` Thomas Gleixner
  2016-06-13  8:41 ` [patch 17/20] tick/sched: Remove pointless empty function Thomas Gleixner
                   ` (5 subsequent siblings)
  21 siblings, 0 replies; 62+ messages in thread
From: Thomas Gleixner @ 2016-06-13  8:41 UTC (permalink / raw)
  To: LKML
  Cc: Ingo Molnar, Peter Zijlstra, Paul E. McKenney, Eric Dumazet,
	Frederic Weisbecker, Chris Mason, Arjan van de Ven, rt,
	Anna-Maria Gleixner

[-- Attachment #1: timer-Optimize-collect-timers.patch --]
[-- Type: text/plain, Size: 3817 bytes --]

From: Anna-Maria Gleixner <anna-maria@linutronix.de>

After a NOHZ idle sleep the wheel must be forwarded to current jiffies. There
might be expired timers so the current code loops and checks the expired
buckets for timers. This can take quite some time for long NOHZ idle periods.

The pending bitmask in the timer base allows us to do a quick search for the
next expiring timer and therefore a fast forward of the base time which
prevents pointless long-lasting loops.

For a 3 second idle sleep this reduces the catchup time from ~1ms to 5us.
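
Condensed into a standalone sketch, the idea looks like this (plain C,
simplified from the collect_expired_timers() hunk below; base_clk and
next_timer() stand in for base->clk and __next_timer_interrupt()):

	/*
	 * Instead of stepping base_clk tick by tick through a long idle
	 * period, query the pending bitmap once for the first expiring
	 * timer and jump directly to it (or to now if nothing expired).
	 */
	if (jiffies - base_clk > 1) {
		unsigned long next = next_timer(base);

		if (next > jiffies) {		/* time_after() in the patch */
			base_clk = jiffies;	/* nothing expired, jump to now */
			return 0;		/* no buckets to collect */
		}
		base_clk = next;		/* jump to the first expiry */
	}
	/* ...then collect the expired buckets starting at base_clk. */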

Signed-off-by: Anna-Maria Gleixner <anna-maria@linutronix.de>
Cc: rt@linutronix.de
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

---
 kernel/time/timer.c |   48 ++++++++++++++++++++++++++++++++++++++++--------
 1 file changed, 40 insertions(+), 8 deletions(-)

--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -1235,8 +1235,8 @@ static void expire_timers(struct timer_b
 	}
 }
 
-static int collect_expired_timers(struct timer_base *base,
-				  struct hlist_head *heads)
+static int __collect_expired_timers(struct timer_base *base,
+				    struct hlist_head *heads)
 {
 	unsigned long clock = base->clk;
 	struct hlist_head tmp, *vec;
@@ -1282,9 +1282,9 @@ static int collect_expired_timers(struct
 
 #ifdef CONFIG_NO_HZ_COMMON
 /*
- * Find the next pending bucket of a level. Search from @offset + @clk upwards
- * and if nothing there, search from start of the level (@offset) up to
- * @offset + clk.
+ * Find the next pending bucket of a level. Search from level start (@offset)
+ * + @clk upwards and if nothing there, search from start of the level
+ * (@offset) up to @offset + clk.
  */
 static int next_pending_bucket(struct timer_base *base, unsigned offset,
 			       unsigned clk)
@@ -1301,7 +1301,8 @@ static int next_pending_bucket(struct ti
 }
 
 /*
- * Search the first expiring timer in the various clock levels.
+ * Search the first expiring timer in the various clock levels. Caller must
+ * hold base->lock.
  *
  * Note: This implementation might be suboptimal vs. timers enqueued in the
  *	 cascade level because we do not look at the timers to figure out when
@@ -1314,7 +1315,6 @@ static unsigned long __next_timer_interr
 	unsigned long clk, next, adj;
 	unsigned lvl, offset = 0;
 
-	spin_lock(&base->lock);
 	clk = base->clk;
 	next = clk + NEXT_TIMER_MAX_DELTA;
 	for (lvl = 0; lvl < LVL_DEPTH; lvl++, offset += LVL_SIZE) {
@@ -1367,7 +1367,6 @@ static unsigned long __next_timer_interr
 		clk >>= LVL_CLK_SHIFT;
 		clk += adj;
 	}
-	spin_unlock(&base->lock);
 	return next;
 }
 
@@ -1425,7 +1424,10 @@ u64 get_next_timer_interrupt(unsigned lo
 	if (cpu_is_offline(smp_processor_id()))
 		return expires;
 
+	spin_lock(&base->lock);
 	nextevt = __next_timer_interrupt(base);
+	spin_unlock(&base->lock);
+
 	if (time_before_eq(nextevt, basej))
 		expires = basem;
 	else
@@ -1433,6 +1435,36 @@ u64 get_next_timer_interrupt(unsigned lo
 
 	return cmp_next_hrtimer_event(basem, expires);
 }
+
+static int collect_expired_timers(struct timer_base *base,
+				  struct hlist_head *heads)
+{
+	/*
+	 * NOHZ optimization. After a long idle sleep we need to forward the
+	 * base to current jiffies. Avoid a loop by searching the bitfield for
+	 * the next expiring timer.
+	 */
+	if ((jiffies - base->clk) > 1) {
+		unsigned long next = __next_timer_interrupt(base);
+
+		/*
+		 * If the next timer is ahead of time, forward to current
+		 * jiffies, otherwise forward to the next expiry time.
+		 */
+		if (time_after(next, jiffies)) {
+			base->clk = jiffies;
+			return 0;
+		}
+		base->clk = next;
+	}
+	return __collect_expired_timers(base, heads);
+}
+#else
+static inline int collect_expired_timers(struct timer_base *base,
+					 struct hlist_head *heads)
+{
+	return __collect_expired_timers(base, heads);
+}
 #endif
 
 /*

^ permalink raw reply	[flat|nested] 62+ messages in thread

* [patch 17/20] tick/sched: Remove pointless empty function
  2016-06-13  8:40 [patch 00/20] timer: Refactor the timer wheel Thomas Gleixner
                   ` (15 preceding siblings ...)
  2016-06-13  8:41 ` [patch 16/20] timer: Optimize collect timers for NOHZ Thomas Gleixner
@ 2016-06-13  8:41 ` Thomas Gleixner
  2016-06-13  8:41 ` [patch 18/20] timer: Forward wheel clock whenever possible Thomas Gleixner
                   ` (4 subsequent siblings)
  21 siblings, 0 replies; 62+ messages in thread
From: Thomas Gleixner @ 2016-06-13  8:41 UTC (permalink / raw)
  To: LKML
  Cc: Ingo Molnar, Peter Zijlstra, Paul E. McKenney, Eric Dumazet,
	Frederic Weisbecker, Chris Mason, Arjan van de Ven, rt

[-- Attachment #1: tick-sched--Remove-pointless-empty-function.patch --]
[-- Type: text/plain, Size: 1851 bytes --]

This was a failed attempt to optimize the timer expiry in idle, which was
disabled and never revisited. Remove the cruft.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/time/tick-sched.c |   33 +--------------------------------
 1 file changed, 1 insertion(+), 32 deletions(-)

Index: b/kernel/time/tick-sched.c
===================================================================
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -1092,35 +1092,6 @@ static void tick_nohz_switch_to_nohz(voi
 	tick_nohz_activate(ts, NOHZ_MODE_LOWRES);
 }
 
-/*
- * When NOHZ is enabled and the tick is stopped, we need to kick the
- * tick timer from irq_enter() so that the jiffies update is kept
- * alive during long running softirqs. That's ugly as hell, but
- * correctness is key even if we need to fix the offending softirq in
- * the first place.
- *
- * Note, this is different to tick_nohz_restart. We just kick the
- * timer and do not touch the other magic bits which need to be done
- * when idle is left.
- */
-static void tick_nohz_kick_tick(struct tick_sched *ts, ktime_t now)
-{
-#if 0
-	/* Switch back to 2.6.27 behaviour */
-	ktime_t delta;
-
-	/*
-	 * Do not touch the tick device, when the next expiry is either
-	 * already reached or less/equal than the tick period.
-	 */
-	delta =	ktime_sub(hrtimer_get_expires(&ts->sched_timer), now);
-	if (delta.tv64 <= tick_period.tv64)
-		return;
-
-	tick_nohz_restart(ts, now);
-#endif
-}
-
 static inline void tick_nohz_irq_enter(void)
 {
 	struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
@@ -1131,10 +1102,8 @@ static inline void tick_nohz_irq_enter(v
 	now = ktime_get();
 	if (ts->idle_active)
 		tick_nohz_stop_idle(ts, now);
-	if (ts->tick_stopped) {
+	if (ts->tick_stopped)
 		tick_nohz_update_jiffies(now);
-		tick_nohz_kick_tick(ts, now);
-	}
 }
 
 #else

^ permalink raw reply	[flat|nested] 62+ messages in thread

* [patch 18/20] timer: Forward wheel clock whenever possible
  2016-06-13  8:40 [patch 00/20] timer: Refactor the timer wheel Thomas Gleixner
                   ` (16 preceding siblings ...)
  2016-06-13  8:41 ` [patch 17/20] tick/sched: Remove pointless empty function Thomas Gleixner
@ 2016-06-13  8:41 ` Thomas Gleixner
  2016-06-13 15:14   ` Richard Cochran
  2016-06-13  8:41 ` [patch 19/20] timer: Split out index calculation Thomas Gleixner
                   ` (3 subsequent siblings)
  21 siblings, 1 reply; 62+ messages in thread
From: Thomas Gleixner @ 2016-06-13  8:41 UTC (permalink / raw)
  To: LKML
  Cc: Ingo Molnar, Peter Zijlstra, Paul E. McKenney, Eric Dumazet,
	Frederic Weisbecker, Chris Mason, Arjan van de Ven, rt

[-- Attachment #1: timers--Forward-wheel-clock-on-enqueue.patch --]
[-- Type: text/plain, Size: 7159 bytes --]

The wheel clock is stale when a cpu goes into a long idle sleep. This has the
side effect that timers which are queued end up in the outer wheel
levels. That results in coarser granularity.

To solve this, we keep track of the idle state and forward the wheel clock
whenever possible.
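
As an example, at HZ=1000 with base->clk stale at 1000 while jiffies is at
4000, arming a timer for jiffies + 100 yields a delta of 3100 relative to
base->clk, so the timer lands in level 2 with 64ms granularity. After
forwarding base->clk to 4000 the delta is 100 and the same timer lands in
level 1 with 8ms granularity. A minimal sketch of the forwarding rule
(condensed from the forward_timer_base() hunk below, with plain comparisons
instead of time_after_eq()):

	static void forward_clk(unsigned long *clk, bool is_idle,
				unsigned long next_expiry, unsigned long now)
	{
		/* Only an idle base which lags by at least two ticks. */
		if (!is_idle || now - *clk < 2)
			return;
		/* Never forward the clock past the first pending expiry. */
		*clk = next_expiry >= now ? now : next_expiry;
	}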

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/time/tick-internal.h |    1 
 kernel/time/tick-sched.c    |   13 ++++
 kernel/time/timer.c         |  120 +++++++++++++++++++++++++++++++++++---------
 3 files changed, 111 insertions(+), 23 deletions(-)

--- a/kernel/time/tick-internal.h
+++ b/kernel/time/tick-internal.h
@@ -164,3 +164,4 @@ static inline void timers_update_migrati
 DECLARE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases);
 
 extern u64 get_next_timer_interrupt(unsigned long basej, u64 basem);
+void timer_clear_idle(void);
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -700,6 +700,12 @@ static ktime_t tick_nohz_stop_sched_tick
 	delta = next_tick - basemono;
 	if (delta <= (u64)TICK_NSEC) {
 		tick.tv64 = 0;
+
+		/*
+		 * Tell the timer code that the base is not idle, i.e. undo
+		 * the effect of get_next_timer_interrupt().
+		 */
+		timer_clear_idle();
 		/*
 		 * We've not stopped the tick yet, and there's a timer in the
 		 * next period, so no point in stopping it either, bail.
@@ -809,6 +815,12 @@ static void tick_nohz_restart_sched_tick
 	tick_do_update_jiffies64(now);
 	cpu_load_update_nohz_stop();
 
+	/*
+	 * Clear the timer idle flag, so we avoid IPIs on remote queueing and
+	 * the clock forward checks in the enqueue path.
+	 */
+	timer_clear_idle();
+
 	calc_load_exit_idle();
 	touch_softlockup_watchdog_sched();
 	/*
@@ -1025,6 +1037,7 @@ void tick_nohz_idle_exit(void)
 		tick_nohz_stop_idle(ts, now);
 
 	if (ts->tick_stopped) {
+		timer_clear_idle();
 		tick_nohz_restart_sched_tick(ts, now);
 		tick_nohz_account_idle_ticks(ts);
 	}
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -181,9 +181,11 @@ struct timer_base {
 	spinlock_t		lock;
 	struct timer_list	*running_timer;
 	unsigned long		clk;
+	unsigned long		next_expiry;
 	unsigned int		cpu;
 	bool			migration_enabled;
 	bool			nohz_active;
+	bool			is_idle;
 	DECLARE_BITMAP(pending_map, WHEEL_SIZE);
 	struct hlist_head	vectors[WHEEL_SIZE];
 } ____cacheline_aligned;
@@ -498,23 +500,27 @@ static void internal_add_timer(struct ti
 	__internal_add_timer(base, timer);
 
 	/*
-	 * Check whether the other CPU is in dynticks mode and needs
-	 * to be triggered to reevaluate the timer wheel.  We are
-	 * protected against the other CPU fiddling with the timer by
-	 * holding the timer base lock. This also makes sure that a
-	 * CPU on the way to stop its tick can not evaluate the timer
-	 * wheel.
-	 *
-	 * Spare the IPI for deferrable timers on idle targets though.
-	 * The next busy ticks will take care of it. Except full dynticks
-	 * require special care against races with idle_cpu(), lets deal
-	 * with that later.
-	 */
-	if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && base->nohz_active) {
-		if (!(timer->flags & TIMER_DEFERRABLE) ||
-		    tick_nohz_full_cpu(base->cpu))
-			wake_up_nohz_cpu(base->cpu);
-	}
+	 * We might have to IPI the remote CPU if the base is idle and the
+	 * timer is not deferrable. If the other cpu is on the way to idle
+	 * then it can't set base->is_idle as we hold base lock.
+	 */
+	if (!IS_ENABLED(CONFIG_NO_HZ_COMMON) || !base->is_idle ||
+	    (timer->flags & TIMER_DEFERRABLE))
+		return;
+
+	/* Check whether this is the new first expiring timer */
+	if (time_after_eq(timer->expires, base->next_expiry))
+		return;
+	base->next_expiry = timer->expires;
+
+	/*
+	 * Check whether the other CPU is in dynticks mode and needs to be
+	 * triggered to reevaluate the timer wheel.  We are protected against
+	 * the other CPU fiddling with the timer by holding the timer base
+	 * lock.
+	 */
+	if (tick_nohz_full_cpu(base->cpu))
+		wake_up_nohz_cpu(base->cpu);
 }
 
 #ifdef CONFIG_TIMER_STATS
@@ -827,16 +833,50 @@ static inline struct timer_base *get_tim
 	return get_timer_cpu_base(tflags, tflags & TIMER_BASEMASK);
 }
 
-static inline struct timer_base *get_target_base(struct timer_base *base,
-						 unsigned tflags)
-{
 #ifdef CONFIG_NO_HZ_COMMON
+static inline struct timer_base *__get_target_base(struct timer_base *base,
+						   unsigned tflags)
+{
 	if ((tflags & TIMER_PINNED) || !base->migration_enabled)
 		return get_timer_this_cpu_base(tflags);
 	return get_timer_cpu_base(tflags, get_nohz_timer_target());
+}
+
+static inline void forward_timer_base(struct timer_base *base)
+{
+	/*
+	 * We only forward the base when it's idle and we have a delta between
+	 * base clock and jiffies.
+	 */
+	if (!base->is_idle || (jiffies - base->clk) < 2)
+		return;
+
+	/*
+	 * If the next expiry value is >= jiffies, then we fast forward to
+	 * jiffies otherwise we forward to the next expiry value.
+	 */
+	if (time_after_eq(base->next_expiry, jiffies))
+		base->clk = jiffies;
+	else
+		base->clk = base->next_expiry;
+}
 #else
+static inline struct timer_base *__get_target_base(struct timer_base *base,
+						   unsigned tflags)
+{
 	return get_timer_this_cpu_base(tflags);
+}
+
+static inline void forward_timer_base(struct timer_base *base) { }
 #endif
+
+static inline struct timer_base *get_target_base(struct timer_base *base,
+						 unsigned tflags)
+{
+	struct timer_base *target = __get_target_base(base, tflags);
+
+	forward_timer_base(target);
+	return target;
 }
 
 /*
@@ -1255,6 +1295,7 @@ static int __collect_expired_timers(stru
 		/* Is it time to look at the next level? */
 		if (clock & LVL_CLK_MASK)
 			return levels;
+
 		/* Shift clock for the next level granularity */
 		clock >>= LVL_CLK_SHIFT;
 	}
@@ -1426,16 +1467,49 @@ u64 get_next_timer_interrupt(unsigned lo
 
 	spin_lock(&base->lock);
 	nextevt = __next_timer_interrupt(base);
-	spin_unlock(&base->lock);
+	base->next_expiry = nextevt;
+	/*
+	 * We have a fresh next event. Check whether we can forward the base.
+	 */
+	if (time_after_eq(nextevt, jiffies))
+		base->clk = jiffies;
+	else if (time_after(nextevt, base->clk))
+		base->clk = nextevt;
 
-	if (time_before_eq(nextevt, basej))
+	if (time_before_eq(nextevt, basej)) {
 		expires = basem;
-	else
+		base->is_idle = false;
+	} else {
 		expires = basem + (nextevt - basej) * TICK_NSEC;
+		/*
+		 * If we expect to sleep more than a tick, mark the base idle.
+		 */
+		if ((expires - basem) > TICK_NSEC)
+			base->is_idle = true;
+	}
+	spin_unlock(&base->lock);
 
 	return cmp_next_hrtimer_event(basem, expires);
 }
 
+/**
+ * timer_clear_idle - Clear the idle state of the timer base
+ *
+ * Called with interrupts disabled
+ */
+void timer_clear_idle(void)
+{
+	struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]);
+
+	/*
+	 * We do this unlocked. The worst outcome is a remote enqueue sending
+	 * a pointless IPI, but taking the lock would just make the window for
+	 * sending the IPI a few instructions smaller for the cost of taking
+	 * the lock in the exit from idle path.
+	 */
+	base->is_idle = false;
+}
+
 static int collect_expired_timers(struct timer_base *base,
 				  struct hlist_head *heads)
 {

^ permalink raw reply	[flat|nested] 62+ messages in thread

* [patch 19/20] timer: Split out index calculation
  2016-06-13  8:40 [patch 00/20] timer: Refactor the timer wheel Thomas Gleixner
                   ` (17 preceding siblings ...)
  2016-06-13  8:41 ` [patch 18/20] timer: Forward wheel clock whenever possible Thomas Gleixner
@ 2016-06-13  8:41 ` Thomas Gleixner
  2016-06-13  8:41 ` [patch 20/20] timer: Optimization for same expiry time in mod_timer() Thomas Gleixner
                   ` (2 subsequent siblings)
  21 siblings, 0 replies; 62+ messages in thread
From: Thomas Gleixner @ 2016-06-13  8:41 UTC (permalink / raw)
  To: LKML
  Cc: Ingo Molnar, Peter Zijlstra, Paul E. McKenney, Eric Dumazet,
	Frederic Weisbecker, Chris Mason, Arjan van de Ven, rt,
	Anna-Maria Gleixner

[-- Attachment #1: timers--Split-out-index-calculation.patch --]
[-- Type: text/plain, Size: 2768 bytes --]

From: Anna-Maria Gleixner <anna-maria@linutronix.de>

For further optimizations we need to separate index calculation and
queueing. No functional change.

Signed-off-by: Anna-Maria Gleixner <anna-maria@linutronix.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/time/timer.c |   41 +++++++++++++++++++++++++++++++++--------
 1 file changed, 33 insertions(+), 8 deletions(-)

--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -456,12 +456,9 @@ static inline unsigned calc_index(unsign
 	return base + (((expires + gran) >> sft) & LVL_MASK);
 }
 
-static void
-__internal_add_timer(struct timer_base *base, struct timer_list *timer)
+static int calc_wheel_index(unsigned long expires, unsigned long clk)
 {
-	unsigned long expires = timer->expires;
-	unsigned long delta = expires - base->clk;
-	struct hlist_head *vec;
+	unsigned long delta = expires - clk;
 	unsigned int idx;
 
 	if (delta < LVL1_TSTART) {
@@ -475,7 +472,7 @@ static void
 	} else if (delta < LVL5_TSTART) {
 		idx = calc_index(expires, LVL4_GRAN, LVL4_SHIFT, LVL4_OFFS);
 	} else if ((long) delta < 0) {
-		idx = base->clk & LVL_MASK;
+		idx = clk & LVL_MASK;
 	} else {
 		/*
 		 * The long timeouts go into the last array level. They
@@ -485,6 +482,18 @@ static void
 		idx = calc_index(expires, LVL5_GRAN, LVL5_SHIFT, LVL5_OFFS);
 	}
 
+	return idx;
+}
+
+/*
+ * Enqueue the timer into the hash bucket, mark it pending in
+ * the bitmap and store the index in the timer flags.
+ */
+static void enqueue_timer(struct timer_base *base, struct timer_list *timer,
+			  unsigned int idx)
+{
+	struct hlist_head *vec;
+
 	/*
 	 * Enqueue the timer into the array bucket, mark it pending in
 	 * the bitmap and store the index in the timer flags.
@@ -495,10 +504,19 @@ static void
 	timer_set_idx(timer, idx);
 }
 
-static void internal_add_timer(struct timer_base *base, struct timer_list *timer)
+static void
+__internal_add_timer(struct timer_base *base, struct timer_list *timer)
 {
-	__internal_add_timer(base, timer);
+	unsigned long expires = timer->expires;
+	unsigned int idx;
+
+	idx = calc_wheel_index(expires, base->clk);
+	enqueue_timer(base, timer, idx);
+}
 
+static void
+trigger_dyntick_cpu(struct timer_base *base, struct timer_list *timer)
+{
 	/*
 	 * We might have to IPI the remote CPU if the base is idle and the
 	 * timer is not deferrable. If the other cpu is on the way to idle
@@ -523,6 +541,13 @@ static void internal_add_timer(struct ti
 		wake_up_nohz_cpu(base->cpu);
 }
 
+static void
+internal_add_timer(struct timer_base *base, struct timer_list *timer)
+{
+	__internal_add_timer(base, timer);
+	trigger_dyntick_cpu(base, timer);
+}
+
 #ifdef CONFIG_TIMER_STATS
 void __timer_stats_timer_set_start_info(struct timer_list *timer, void *addr)
 {

^ permalink raw reply	[flat|nested] 62+ messages in thread

* [patch 20/20] timer: Optimization for same expiry time in mod_timer()
  2016-06-13  8:40 [patch 00/20] timer: Refactor the timer wheel Thomas Gleixner
                   ` (18 preceding siblings ...)
  2016-06-13  8:41 ` [patch 19/20] timer: Split out index calculation Thomas Gleixner
@ 2016-06-13  8:41 ` Thomas Gleixner
  2016-06-13 14:10 ` [patch 00/20] timer: Refactor the timer wheel Eric Dumazet
  2016-06-13 16:15 ` Paul E. McKenney
  21 siblings, 0 replies; 62+ messages in thread
From: Thomas Gleixner @ 2016-06-13  8:41 UTC (permalink / raw)
  To: LKML
  Cc: Ingo Molnar, Peter Zijlstra, Paul E. McKenney, Eric Dumazet,
	Frederic Weisbecker, Chris Mason, Arjan van de Ven, rt,
	Anna-Maria Gleixner

[-- Attachment #1: timers-Optimize-same-expiry-time.patch --]
[-- Type: text/plain, Size: 4262 bytes --]

From: Anna-Maria Gleixner <anna-maria@linutronix.de>

The existing optimization for same expiry time in mod_timer() checks whether
the timer expiry time is the same as the new requested expiry time. In the old
timer wheel implementation this did not take the slack batching into account;
the new implementation so far does not evaluate whether the new expiry time
would requeue the timer to the same bucket either.

To optimize that, we can calculate the bucket which the new expiry time would
be queued into and compare it against the bucket of the pending timer. This
calculation happens outside the base lock held region. If the resulting bucket
is the same, we can avoid taking the base lock and requeueing the timer.

If the timer needs to be requeued then we have to check under the base lock
whether the base time has changed between the lockless calculation and taking
the lock. If it has changed we need to recalculate under the lock.
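
Condensed, the resulting __mod_timer() flow looks roughly like this (an
editor's sketch distilled from the patch below, not the verbatim code):

	if (timer_pending(timer)) {
		if (timer->expires == expires)
			return 1;

		/* Lockless: snapshot base->clk and compute the bucket */
		base = get_timer_base(timer->flags);
		clk = base->clk;
		idx = calc_wheel_index(expires, clk);

		/* Same bucket? Only the expiry value needs an update */
		if (idx == timer_get_idx(timer)) {
			timer->expires = expires;
			return 1;
		}
	}

	/* ... lock the base, handle migration etc. ... */

	timer->expires = expires;
	if (idx != UINT_MAX && clk == base->clk) {
		/* Snapshot still valid: enqueue with the precomputed index */
		enqueue_timer(base, timer, idx);
		trigger_dyntick_cpu(base, timer);
	} else {
		/* base->clk advanced meanwhile: recalculate under the lock */
		internal_add_timer(base, timer);
	}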

This optimization takes effect for timers which are enqueued into the less
granular wheel levels (1 and above). The functionality has been verified with
a simple test case:

    	    Before	After
Match:	     5.5%	86.6%
Requeue:    94.5%	13.4%
Recalc:  		<0.01%

In the non-optimized case the timer is requeued in 94.5% of the cases. With
the index optimization in place the requeue rate drops to 13.4%. The case
where the lockless index calculation has to be redone is less than 0.01%.

With a real world test case (networking) we observed the following changes:

    	    Before	After
Match:	    97.8%	99.7%
Requeue:     2.2%	 0.3%
Recalc:  		<0.001%

That means two percent fewer lock/requeue/unlock operations in one of the hot
path use cases of timers.

Signed-off-by: Anna-Maria Gleixner <anna-maria@linutronix.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

---
 kernel/time/timer.c |   51 +++++++++++++++++++++++++++++++++++----------------
 1 file changed, 35 insertions(+), 16 deletions(-)

--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -938,28 +938,36 @@ static inline int
 __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only)
 {
 	struct timer_base *base, *new_base;
-	unsigned long flags;
+	unsigned int idx = UINT_MAX;
+	unsigned long clk = 0, flags;
 	int ret = 0;
 
 	/*
-	 * TODO: Calculate the array bucket of the timer right here w/o
-	 * holding the base lock. This allows to check not only
-	 * timer->expires == expires below, but also whether the timer
-	 * ends up in the same bucket. If we really need to requeue
-	 * the timer then we check whether base->clk have
-	 * advanced between here and locking the timer base. If
-	 * jiffies advanced we have to recalc the array bucket with the
-	 * lock held.
-	 */
-
-	/*
-	 * This is a common optimization triggered by the
-	 * networking code - if the timer is re-modified
-	 * to be the same thing then just return:
+	 * This is a common optimization triggered by the networking code - if
+	 * the timer is re-modified to be the same thing or ends up in the
+	 * same array bucket then just return:
 	 */
 	if (timer_pending(timer)) {
 		if (timer->expires == expires)
 			return 1;
+		/*
+		 * Take the current timer_jiffies of base, but without holding
+		 * the lock!
+		 */
+		base = get_timer_base(timer->flags);
+		clk = base->clk;
+
+		idx = calc_wheel_index(expires, clk);
+
+		/*
+		 * Retrieve and compare the array index of the pending
+		 * timer. If it matches set the expiry to the new value so a
+		 * subsequent call will exit in the expires check above.
+		 */
+		if (idx == timer_get_idx(timer)) {
+			timer->expires = expires;
+			return 1;
+		}
 	}
 
 	timer_stats_timer_set_start_info(timer);
@@ -996,7 +1004,18 @@ static inline int
 	}
 
 	timer->expires = expires;
-	internal_add_timer(base, timer);
+	/*
+	 * If idx was calculated above and the base time did not advance
+	 * between calculating idx and taking the lock, only enqueue_timer()
+	 * and trigger_dyntick_cpu() is required. Otherwise we need to
+	 * (re)calculate the wheel index via internal_add_timer().
+	 */
+	if (idx != UINT_MAX && clk == base->clk) {
+		enqueue_timer(base, timer, idx);
+		trigger_dyntick_cpu(base, timer);
+	} else {
+		internal_add_timer(base, timer);
+	}
 
 out_unlock:
 	spin_unlock_irqrestore(&base->lock, flags);

^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [patch 11/20] hlist: Add hlist_is_last_node() helper
  2016-06-13  8:40 ` [patch 11/20] hlist: Add hlist_is_last_node() helper Thomas Gleixner
@ 2016-06-13 10:27   ` Paolo Bonzini
  0 siblings, 0 replies; 62+ messages in thread
From: Paolo Bonzini @ 2016-06-13 10:27 UTC (permalink / raw)
  To: Thomas Gleixner, LKML
  Cc: Ingo Molnar, Peter Zijlstra, Paul E. McKenney, Eric Dumazet,
	Frederic Weisbecker, Chris Mason, Arjan van de Ven, rt



On 13/06/2016 10:40, Thomas Gleixner wrote:
>  /*
> + * Check whether the node is the last node of the head without
> + * accessing head.
> + */
> +static inline bool hlist_is_last_node(struct hlist_node *n,
> +				      struct hlist_head *h)
> +{
> +	return !n->next && n->pprev == &h->first;
> +}

Perhaps hlist_is_only_node, or hlist_is_singular_node (which is ugly but
matches list_is_singular)?  The name "last_node" suggests a check only for
!n->next.
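
To illustrate the naming issue (an editor's sketch using the proposed
helper; not part of the patch):

	struct hlist_head h;
	struct hlist_node a, b;

	INIT_HLIST_HEAD(&h);
	hlist_add_head(&b, &h);
	hlist_add_head(&a, &h);		/* list is now: h -> a -> b */

	hlist_is_last_node(&a, &h);	/* false: a.next == &b */
	hlist_is_last_node(&b, &h);	/* false: b is the last node, but
					   b.pprev == &a.next, not &h.first */
	hlist_del(&b);
	hlist_is_last_node(&a, &h);	/* true: a is now the only node */

So the helper returns true only for a node which is the sole entry of the
list, which is what the _only_/_singular_ naming would convey.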

Thanks,

Paolo

^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [patch 13/20] timer: Switch to a non cascading wheel
  2016-06-13  8:41 ` [patch 13/20] timer: Switch to a non cascading wheel Thomas Gleixner
@ 2016-06-13 11:40   ` Peter Zijlstra
  2016-06-13 12:30     ` Thomas Gleixner
  2016-06-13 14:36   ` Richard Cochran
  1 sibling, 1 reply; 62+ messages in thread
From: Peter Zijlstra @ 2016-06-13 11:40 UTC (permalink / raw)
  To: Thomas Gleixner
  Cc: LKML, Ingo Molnar, Paul E. McKenney, Eric Dumazet,
	Frederic Weisbecker, Chris Mason, Arjan van de Ven, rt

On Mon, Jun 13, 2016 at 08:41:00AM -0000, Thomas Gleixner wrote:

> + * HZ 1000
> + * Level Offset  Granularity            Range
> + *  0   0          1 ms                 0 ms -         63 ms
> + *  1  64          8 ms                64 ms -        511 ms
> + *  2 128         64 ms               512 ms -       4095 ms (512ms - ~4s)
> + *  3 192        512 ms              4096 ms -      32767 ms (~4s - ~32s)
> + *  4 256       4096 ms (~4s)       32768 ms -     262143 ms (~32s - ~4m)
> + *  5 320      32768 ms (~32s)     262144 ms -    2097151 ms (~4m - ~34m)


> +static int collect_expired_timers(struct timer_base *base,
> +				  struct hlist_head *heads)
> +{
> +	unsigned long clock = base->clk;
> +	struct hlist_head tmp, *vec;
> +	int i, levels = 0;
> +	unsigned int idx;
> +
> +	/* Expire the regular buckets */
> +	for (i = 0; i < LVL_DEPTH - 1; i++) {
> +		idx = (clock & LVL_MASK) + i * LVL_SIZE;
> +
> +		if (__test_and_clear_bit(idx, base->pending_map)) {
> +			vec = base->vectors + idx;
> +			hlist_move_list(vec, heads++);
> +			levels++;
> +		}
> +		/* Is it time to look at the next level? */
> +		if (clock & LVL_CLK_MASK)
> +			return levels;
> +		/* Shift clock for the next level granularity */
> +		clock >>= LVL_CLK_SHIFT;
> +	}
> +
> +	/* Cascading, sigh... */

So given that userspace has no influence on timer period; can't we
simply fail to support timers longer than 30 minutes?

Is anything really arming timers _that_ long?

> +	idx = (clock & LVL_MASK) + i * LVL_SIZE;
> +
> +	if (__test_and_clear_bit(idx, base->pending_map)) {
> +		vec = base->vectors + idx;
> +		hlist_move_list(vec, &tmp);
> +		/* Make sure we queue them in the future */
> +		base->clk++;
> +		while (!hlist_empty(&tmp)) {
> +			struct timer_list *timer;
> +
> +			timer = hlist_entry(tmp.first, struct timer_list, entry);
> +			trace_timer_cascade(timer);
> +			__hlist_del(&timer->entry);
> +			__internal_add_timer(base, timer);
> +		}
> +		base->clk--;
> +	}
> +	return levels;
> +}

^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [patch 13/20] timer: Switch to a non cascading wheel
  2016-06-13 11:40   ` Peter Zijlstra
@ 2016-06-13 12:30     ` Thomas Gleixner
  2016-06-13 12:46       ` Eric Dumazet
  2016-06-14 10:11       ` Ingo Molnar
  0 siblings, 2 replies; 62+ messages in thread
From: Thomas Gleixner @ 2016-06-13 12:30 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: LKML, Ingo Molnar, Paul E. McKenney, Eric Dumazet,
	Frederic Weisbecker, Chris Mason, Arjan van de Ven, rt

On Mon, 13 Jun 2016, Peter Zijlstra wrote:
> On Mon, Jun 13, 2016 at 08:41:00AM -0000, Thomas Gleixner wrote:
> > +
> > +	/* Cascading, sigh... */
> 
> So given that userspace has no influence on timer period; can't we
> simply fail to support timers longer than 30 minutes?
> 
> Is anything really arming timers _that_ long?

Unfortunately yes. Networking being one of those. Real cascading happens once
in a blue moon, but it happens.

Thanks,

	tglx

^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [patch 13/20] timer: Switch to a non cascading wheel
  2016-06-13 12:30     ` Thomas Gleixner
@ 2016-06-13 12:46       ` Eric Dumazet
  2016-06-13 14:30         ` Thomas Gleixner
  2016-06-14 10:11       ` Ingo Molnar
  1 sibling, 1 reply; 62+ messages in thread
From: Eric Dumazet @ 2016-06-13 12:46 UTC (permalink / raw)
  To: Thomas Gleixner
  Cc: Peter Zijlstra, LKML, Ingo Molnar, Paul E. McKenney,
	Frederic Weisbecker, Chris Mason, Arjan van de Ven, rt

On Mon, Jun 13, 2016 at 5:30 AM, Thomas Gleixner <tglx@linutronix.de> wrote:
> On Mon, 13 Jun 2016, Peter Zijlstra wrote:
>> On Mon, Jun 13, 2016 at 08:41:00AM -0000, Thomas Gleixner wrote:
>> > +
>> > +   /* Cascading, sigh... */
>>
>> So given that userspace has no influence on timer period; can't we
>> simply fail to support timers longer than 30 minutes?
>>
>> Is anything really arming timers _that_ long?
>
> Unfortunately yes. Networking being one of those. Real cascading happens once
> in a blue moon, but it happens.
>

Anyway, for the more common HZ=1000 case, what would the 'limit' be?

^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [patch 06/20] drivers/tty/metag_da: Initialize timer as pinned
  2016-06-13  8:40 ` [patch 06/20] drivers/tty/metag_da: " Thomas Gleixner
@ 2016-06-13 13:13   ` Arjan van de Ven
  0 siblings, 0 replies; 62+ messages in thread
From: Arjan van de Ven @ 2016-06-13 13:13 UTC (permalink / raw)
  To: Thomas Gleixner
  Cc: LKML, Ingo Molnar, Peter Zijlstra, Paul E. McKenney,
	Eric Dumazet, Frederic Weisbecker, Chris Mason, Arjan van de Ven,
	rt

I know it's not related to this patch, but since you're changing the API
name anyway, it'd be nice to add a mod_pinned_relative() so that more
direct users of jiffies can go away...
or even better, a mod_pinned_relative_ms() so that these drivers also do
not need to care about HZ.
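
Hypothetical sketch of what such helpers could look like, built on the
mod_pinned() introduced earlier in this series (neither variant exists in
the posted patches):

	static inline void mod_pinned_relative(struct timer_list *timer,
					       unsigned long delta)
	{
		mod_pinned(timer, jiffies + delta);
	}

	static inline void mod_pinned_relative_ms(struct timer_list *timer,
						  unsigned int ms)
	{
		mod_pinned(timer, jiffies + msecs_to_jiffies(ms));
	}

The call below would then read mod_pinned_relative(&poll_timer, DA_TTY_POLL)
with no direct jiffies arithmetic in the driver.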

On Mon, Jun 13, 2016 at 1:40 AM, Thomas Gleixner <tglx@linutronix.de> wrote:
> Pinned timers must carry that attribute in the timer itself. No functional
> change.
>
> Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
> ---
>  drivers/tty/metag_da.c |    4 ++--
>  1 file changed, 2 insertions(+), 2 deletions(-)
>
> Index: b/drivers/tty/metag_da.c
> ===================================================================
> --- a/drivers/tty/metag_da.c
> +++ b/drivers/tty/metag_da.c
> @@ -323,12 +323,12 @@ static void dashtty_timer(unsigned long
>         if (channel >= 0)
>                 fetch_data(channel);
>
> -       mod_timer_pinned(&poll_timer, jiffies + DA_TTY_POLL);
> +       mod_pinned(&poll_timer, jiffies + DA_TTY_POLL);
>  }
>
>  static void add_poll_timer(struct timer_list *poll_timer)
>  {
> -       setup_timer(poll_timer, dashtty_timer, 0);
> +       setup_pinned_timer(poll_timer, dashtty_timer, 0);
>         poll_timer->expires = jiffies + DA_TTY_POLL;
>
>         /*
>
>

^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [patch 04/20] cpufreq/powernv: Initialize timer as pinned
  2016-06-13  8:40 ` [patch 04/20] cpufreq/powernv: " Thomas Gleixner
@ 2016-06-13 13:18   ` Arjan van de Ven
  0 siblings, 0 replies; 62+ messages in thread
From: Arjan van de Ven @ 2016-06-13 13:18 UTC (permalink / raw)
  To: Thomas Gleixner
  Cc: LKML, Ingo Molnar, Peter Zijlstra, Paul E. McKenney,
	Eric Dumazet, Frederic Weisbecker, Chris Mason, Arjan van de Ven,
	rt

On Mon, Jun 13, 2016 at 1:40 AM, Thomas Gleixner <tglx@linutronix.de> wrote:
> mod_timer(&gpstates->timer, jiffies + msecs_to_jiffies(timer_interval));

Are you sure this is right? The others did not get replaced by mod_timer()...
(And this is more evidence that a relative API in msecs is what drivers
really want.)

^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [patch 00/20] timer: Refactor the timer wheel
  2016-06-13  8:40 [patch 00/20] timer: Refactor the timer wheel Thomas Gleixner
                   ` (19 preceding siblings ...)
  2016-06-13  8:41 ` [patch 20/20] timer: Optimization for same expiry time in mod_timer() Thomas Gleixner
@ 2016-06-13 14:10 ` Eric Dumazet
  2016-06-13 16:15 ` Paul E. McKenney
  21 siblings, 0 replies; 62+ messages in thread
From: Eric Dumazet @ 2016-06-13 14:10 UTC (permalink / raw)
  To: Thomas Gleixner
  Cc: LKML, Ingo Molnar, Peter Zijlstra, Paul E. McKenney,
	Frederic Weisbecker, Chris Mason, Arjan van de Ven, rt

On Mon, Jun 13, 2016 at 1:40 AM, Thomas Gleixner <tglx@linutronix.de> wrote:
> The current timer wheel has some drawbacks:

...

> This series has also preparatory patches for changing the NOHZ timer handling
> from the current push to a pull model. Currently we decide at timer enqueue
> time on which cpu we queue the timer. This is exceptionally silly because
> there is no way to predict at enqueue time which cpu will be idle when the
> timer expires. Given the fact that most timers are canceled or rearmed before
> expiry this is even more silly. We trade a expensive decision and cross cpu
> access for a very doubtful benefit.
>

Also, many TCP timers are armed from the softirq handler interrupting the
idle task, so the scheduler believes the current cpu is idle.

This patch series is really nice, Thomas.

^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [patch 13/20] timer: Switch to a non cascading wheel
  2016-06-13 12:46       ` Eric Dumazet
@ 2016-06-13 14:30         ` Thomas Gleixner
  0 siblings, 0 replies; 62+ messages in thread
From: Thomas Gleixner @ 2016-06-13 14:30 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Peter Zijlstra, LKML, Ingo Molnar, Paul E. McKenney,
	Frederic Weisbecker, Chris Mason, Arjan van de Ven, rt

On Mon, 13 Jun 2016, Eric Dumazet wrote:
> On Mon, Jun 13, 2016 at 5:30 AM, Thomas Gleixner <tglx@linutronix.de> wrote:
> > On Mon, 13 Jun 2016, Peter Zijlstra wrote:
> >> On Mon, Jun 13, 2016 at 08:41:00AM -0000, Thomas Gleixner wrote:
> >> > +
> >> > +   /* Cascading, sigh... */
> >>
> >> So given that userspace has no influence on timer period; can't we
> >> simply fail to support timers longer than 30 minutes?
> >>
> >> Is anything really arming timers _that_ long?
> >
> > Unfortunately yes. Networking being one of those. Real cascading happens once
> > in a blue moon, but it happens.
> >
> Anyway, for the more common HZ=1000 case, what would the 'limit' be?

34 minutes with the currently chosen parameters.

 

^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [patch 13/20] timer: Switch to a non cascading wheel
  2016-06-13  8:41 ` [patch 13/20] timer: Switch to a non cascading wheel Thomas Gleixner
  2016-06-13 11:40   ` Peter Zijlstra
@ 2016-06-13 14:36   ` Richard Cochran
  2016-06-13 14:39     ` Thomas Gleixner
  1 sibling, 1 reply; 62+ messages in thread
From: Richard Cochran @ 2016-06-13 14:36 UTC (permalink / raw)
  To: Thomas Gleixner
  Cc: LKML, Ingo Molnar, Peter Zijlstra, Paul E. McKenney,
	Eric Dumazet, Frederic Weisbecker, Chris Mason, Arjan van de Ven,
	rt

On Mon, Jun 13, 2016 at 08:41:00AM -0000, Thomas Gleixner wrote:
> +static inline struct timer_base *get_timer_base(u32 tflags)
> +{
> +	return get_timer_cpu_base(tflags, tflags & TIMER_BASEMASK);
> +}

This should rather be (tflags & TIMER_CPUMASK) to avoid using
per_cpu_ptr() with the TIMER_MIGRATING bit set in the CPU index.
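
I.e., a sketch of the corrected helper:

	static inline struct timer_base *get_timer_base(u32 tflags)
	{
		/* Use TIMER_CPUMASK so TIMER_MIGRATING is masked out */
		return get_timer_cpu_base(tflags, tflags & TIMER_CPUMASK);
	}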

The one caller in this patch is okay, since it already checks that
TIMER_MIGRATING is clear:

>  static struct timer_base *lock_timer_base(struct timer_list *timer,
> -					unsigned long *flags)
> +					  unsigned long *flags)
>  	__acquires(timer->base->lock)
>  {
>  	for (;;) {
> -		u32 tf = timer->flags;
>  		struct timer_base *base;
> +		u32 tf = timer->flags;
>  
>  		if (!(tf & TIMER_MIGRATING)) {
> -			base = per_cpu_ptr(&timer_bases, tf & TIMER_CPUMASK);
> +			base = get_timer_base(tf);

However, in patch #20, we'll have this in __mod_timer():

		/*
		 * Take the current timer_jiffies of base, but without holding
		 * the lock!
		 */
		base = get_timer_base(timer->flags);

Thanks,
Richard

^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [patch 13/20] timer: Switch to a non cascading wheel
  2016-06-13 14:36   ` Richard Cochran
@ 2016-06-13 14:39     ` Thomas Gleixner
  0 siblings, 0 replies; 62+ messages in thread
From: Thomas Gleixner @ 2016-06-13 14:39 UTC (permalink / raw)
  To: Richard Cochran
  Cc: LKML, Ingo Molnar, Peter Zijlstra, Paul E. McKenney,
	Eric Dumazet, Frederic Weisbecker, Chris Mason, Arjan van de Ven,
	rt

On Mon, 13 Jun 2016, Richard Cochran wrote:
> On Mon, Jun 13, 2016 at 08:41:00AM -0000, Thomas Gleixner wrote:
> > +static inline struct timer_base *get_timer_base(u32 tflags)
> > +{
> > +	return get_timer_cpu_base(tflags, tflags & TIMER_BASEMASK);
> > +}
> 
> This should rather be (tflags & TIMER_CPUMASK) to avoid using
> per_cpu_ptr() with the TIMER_MIGRATING bit set in the CPU index.
> 
> The one caller in this patch is okay, since it already checks that
> TIMER_MIGRATING is clear:

Good catch!
 
Thanks,

	tglx

^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [patch 18/20] timer: Forward wheel clock whenever possible
  2016-06-13  8:41 ` [patch 18/20] timer: Forward wheel clock whenever possible Thomas Gleixner
@ 2016-06-13 15:14   ` Richard Cochran
  2016-06-13 15:18     ` Thomas Gleixner
  0 siblings, 1 reply; 62+ messages in thread
From: Richard Cochran @ 2016-06-13 15:14 UTC (permalink / raw)
  To: Thomas Gleixner
  Cc: LKML, Ingo Molnar, Peter Zijlstra, Paul E. McKenney,
	Eric Dumazet, Frederic Weisbecker, Chris Mason, Arjan van de Ven,
	rt

Thomas,

On Mon, Jun 13, 2016 at 08:41:04AM -0000, Thomas Gleixner wrote:
> @@ -498,23 +500,27 @@ static void internal_add_timer(struct ti
>  	__internal_add_timer(base, timer);
>  
>  	/*
> -	 * Check whether the other CPU is in dynticks mode and needs
> -	 * to be triggered to reevaluate the timer wheel.  We are
> -	 * protected against the other CPU fiddling with the timer by
> -	 * holding the timer base lock. This also makes sure that a
> -	 * CPU on the way to stop its tick can not evaluate the timer
> -	 * wheel.
> -	 *
> -	 * Spare the IPI for deferrable timers on idle targets though.
> -	 * The next busy ticks will take care of it. Except full dynticks
> -	 * require special care against races with idle_cpu(), lets deal
> -	 * with that later.
> -	 */
> -	if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && base->nohz_active) {
> -		if (!(timer->flags & TIMER_DEFERRABLE) ||
> -		    tick_nohz_full_cpu(base->cpu))
> -			wake_up_nohz_cpu(base->cpu);
> -	}
> +	 * We might have to IPI the remote CPU if the base is idle and the
> +	 * timer is not deferrable. If the other cpu is on the way to idle
> +	 * then it can't set base->is_idle as we hold base lock.
> +	 */
> +	if (!IS_ENABLED(CONFIG_NO_HZ_COMMON) || !base->is_idle ||
> +	    (timer->flags & TIMER_DEFERRABLE))
> +		return;

The tests for is_idle and TIMER_DEFERRABLE are actually checking the
same thing.  I was looking at the usage of base.is_idle, and it is
only ever set for BASE_STD.

So the TIMER_DEFERRABLE test is redundant, but maybe this is on
purpose?  Anyhow, it did leave me scratching my head.

Thanks,
Richard

^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [patch 18/20] timer: Forward wheel clock whenever possible
  2016-06-13 15:14   ` Richard Cochran
@ 2016-06-13 15:18     ` Thomas Gleixner
  0 siblings, 0 replies; 62+ messages in thread
From: Thomas Gleixner @ 2016-06-13 15:18 UTC (permalink / raw)
  To: Richard Cochran
  Cc: LKML, Ingo Molnar, Peter Zijlstra, Paul E. McKenney,
	Eric Dumazet, Frederic Weisbecker, Chris Mason, Arjan van de Ven,
	rt

On Mon, 13 Jun 2016, Richard Cochran wrote:
> On Mon, Jun 13, 2016 at 08:41:04AM -0000, Thomas Gleixner wrote:
> > @@ -498,23 +500,27 @@ static void internal_add_timer(struct ti
> >  	__internal_add_timer(base, timer);
> >  
> >  	/*
> > -	 * Check whether the other CPU is in dynticks mode and needs
> > -	 * to be triggered to reevaluate the timer wheel.  We are
> > -	 * protected against the other CPU fiddling with the timer by
> > -	 * holding the timer base lock. This also makes sure that a
> > -	 * CPU on the way to stop its tick can not evaluate the timer
> > -	 * wheel.
> > -	 *
> > -	 * Spare the IPI for deferrable timers on idle targets though.
> > -	 * The next busy ticks will take care of it. Except full dynticks
> > -	 * require special care against races with idle_cpu(), lets deal
> > -	 * with that later.
> > -	 */
> > -	if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && base->nohz_active) {
> > -		if (!(timer->flags & TIMER_DEFERRABLE) ||
> > -		    tick_nohz_full_cpu(base->cpu))
> > -			wake_up_nohz_cpu(base->cpu);
> > -	}
> > +	 * We might have to IPI the remote CPU if the base is idle and the
> > +	 * timer is not deferrable. If the other cpu is on the way to idle
> > +	 * then it can't set base->is_idle as we hold base lock.
> > +	 */
> > +	if (!IS_ENABLED(CONFIG_NO_HZ_COMMON) || !base->is_idle ||
> > +	    (timer->flags & TIMER_DEFERRABLE))
> > +		return;
> 
> The tests for is_idle and TIMER_DEFERRABLE are actually checking the
> same thing.  I was looking at the usage of base.is_idle, and it is
> only ever set for BASE_STD.
> 
> So the TIMER_DEFERRABLE test is redundant, but maybe this is on
> purpose?  Anyhow, it did leave me scratching my head.

You're right. is_idle makes the deferrable check obsolete. We can remove it,
but it needs a big fat comment ....

Thanks

	tglx

^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [patch 00/20] timer: Refactor the timer wheel
  2016-06-13  8:40 [patch 00/20] timer: Refactor the timer wheel Thomas Gleixner
                   ` (20 preceding siblings ...)
  2016-06-13 14:10 ` [patch 00/20] timer: Refactor the timer wheel Eric Dumazet
@ 2016-06-13 16:15 ` Paul E. McKenney
  2016-06-15 15:15   ` Paul E. McKenney
  21 siblings, 1 reply; 62+ messages in thread
From: Paul E. McKenney @ 2016-06-13 16:15 UTC (permalink / raw)
  To: Thomas Gleixner
  Cc: LKML, Ingo Molnar, Peter Zijlstra, Eric Dumazet,
	Frederic Weisbecker, Chris Mason, Arjan van de Ven, rt

On Mon, Jun 13, 2016 at 08:40:50AM -0000, Thomas Gleixner wrote:
> The current timer wheel has some drawbacks:
> 
> 1) Cascading
> 
>    Cascading can be an unbound operation and is completely pointless in most
>    cases because the vast majority of the timer wheel timers are canceled or
>    rearmed before expiration.
> 
> 2) No fast lookup of the next expiring timer
> 
>    In NOHZ scenarios the first timer soft interrupt after a long NOHZ period
>    must fast forward the base time to current jiffies. As we have no way to
>    find the next expiring timer fast, the code loops and increments the base
>    time by one and checks for expired timers in each step. I've observed loops
>    lasting 1 ms!
> 
> There are some other issues caused by the above, but they are minor compared to
> those.

For SMP configurations, this passes light rcutorture testing.  For UP
builds, it complains about undefined symbols.  Builds succeed with
the following kneejerk patch.  Am retesting rcutorture.

							Thanx, Paul

------------------------------------------------------------------------

commit 87dbd35cf4034f2b664b6e9d60decd2bdbfc416f
Author: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Date:   Mon Jun 13 09:11:41 2016 -0700

    Fix undefined get_nohz_timer_target() in non-SMP builds
    
    Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 6e42ada26345..322e52415a29 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -348,6 +348,10 @@ extern int get_nohz_timer_target(void);
 #else
 static inline void nohz_balance_enter_idle(int cpu) { }
 static inline void set_cpu_sd_state_idle(void) { }
+static inline int get_nohz_timer_target(void)
+{
+	return raw_smp_processor_id();
+}
 #endif
 
 /*

^ permalink raw reply related	[flat|nested] 62+ messages in thread

* Re: [patch 13/20] timer: Switch to a non cascading wheel
  2016-06-13 12:30     ` Thomas Gleixner
  2016-06-13 12:46       ` Eric Dumazet
@ 2016-06-14 10:11       ` Ingo Molnar
  2016-06-14 16:28         ` Thomas Gleixner
  1 sibling, 1 reply; 62+ messages in thread
From: Ingo Molnar @ 2016-06-14 10:11 UTC (permalink / raw)
  To: Thomas Gleixner
  Cc: Peter Zijlstra, LKML, Paul E. McKenney, Eric Dumazet,
	Frederic Weisbecker, Chris Mason, Arjan van de Ven, rt,
	Linus Torvalds


* Thomas Gleixner <tglx@linutronix.de> wrote:

> On Mon, 13 Jun 2016, Peter Zijlstra wrote:
> > On Mon, Jun 13, 2016 at 08:41:00AM -0000, Thomas Gleixner wrote:
> > > +
> > > +	/* Cascading, sigh... */
> > 
> > So given that userspace has no influence on timer period; can't we
> > simply fail to support timers longer than 30 minutes?
> > 
> > Is anything really arming timers _that_ long?
> 
> Unfortunately yes. Networking being one of those. Real cascading happens once
> in a blue moon, but it happens.

So I'd really prefer it if we added a few more levels, a hard limit and got rid of 
the cascading once and for all!

IMHO 'once in a blue moon' code is much worse than a bit more data overhead.

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [patch 13/20] timer: Switch to a non cascading wheel
  2016-06-14 10:11       ` Ingo Molnar
@ 2016-06-14 16:28         ` Thomas Gleixner
  2016-06-14 17:14           ` Arjan van de Ven
  0 siblings, 1 reply; 62+ messages in thread
From: Thomas Gleixner @ 2016-06-14 16:28 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Peter Zijlstra, LKML, Paul E. McKenney, Eric Dumazet,
	Frederic Weisbecker, Chris Mason, Arjan van de Ven, rt,
	Linus Torvalds

On Tue, 14 Jun 2016, Ingo Molnar wrote:
> * Thomas Gleixner <tglx@linutronix.de> wrote:
> > On Mon, 13 Jun 2016, Peter Zijlstra wrote:
> > > On Mon, Jun 13, 2016 at 08:41:00AM -0000, Thomas Gleixner wrote:
> > > > +
> > > > +	/* Cascading, sigh... */
> > > 
> > > So given that userspace has no influence on timer period; can't we
> > > simply fail to support timers longer than 30 minutes?
> > > 
> > > Is anything really arming timers _that_ long?
> > 
> > Unfortunately yes. Networking being one of those. Real cascading happens once
> > in a blue moon, but it happens.
> 
> So I'd really prefer it if we added a few more levels, a hard limit and got rid of 
> the cascading once and for all!
> 
> IMHO 'once in a blue moon' code is much worse than a bit more data overhead.

I agree. If we add two wheel levels then we end up with:

  HZ 1000:  134217727 ms ~=  37 hours
  HZ  250:  536870908 ms ~= 149 hours
  HZ  100: 1342177270 ms ~= 372 hours 
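
(Worked out: each additional level extends the range by LVL_CLK_SHIFT = 3
bits, so two extra levels scale the capacity by a factor of 64. At HZ=1000
the ~34m cutoff of 2097151 ms becomes 2097152 ms * 64 - 1 = 134217727 ms;
the HZ=250 and HZ=100 figures are the same jiffy capacity scaled by the
4 ms and 10 ms tick periods.)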

Looking through all my data I found exactly one timeout which is insanely
large: 120 hours!

That's net/netfilter/nf_conntrack_core.c:
      setup_timer(&ct->timeout, death_by_timeout, (unsigned long)ct);

Anything else is way below 37 hours.

Thanks,

	tglx

^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [patch 13/20] timer: Switch to a non cascading wheel
  2016-06-14 16:28         ` Thomas Gleixner
@ 2016-06-14 17:14           ` Arjan van de Ven
  2016-06-14 18:05             ` Thomas Gleixner
  0 siblings, 1 reply; 62+ messages in thread
From: Arjan van de Ven @ 2016-06-14 17:14 UTC (permalink / raw)
  To: Thomas Gleixner
  Cc: Ingo Molnar, Peter Zijlstra, LKML, Paul E. McKenney,
	Eric Dumazet, Frederic Weisbecker, Chris Mason, Arjan van de Ven,
	rt, Linus Torvalds

evaluating a 120 hour timer every 37 hours to see if it should fire...
not too horrid.

On Tue, Jun 14, 2016 at 9:28 AM, Thomas Gleixner <tglx@linutronix.de> wrote:
> On Tue, 14 Jun 2016, Ingo Molnar wrote:
>> * Thomas Gleixner <tglx@linutronix.de> wrote:
>> > On Mon, 13 Jun 2016, Peter Zijlstra wrote:
>> > > On Mon, Jun 13, 2016 at 08:41:00AM -0000, Thomas Gleixner wrote:
>> > > > +
>> > > > +       /* Cascading, sigh... */
>> > >
>> > > So given that userspace has no influence on timer period; can't we
>> > > simply fail to support timers longer than 30 minutes?
>> > >
>> > > Is anything really arming timers _that_ long?
>> >
>> > Unfortunately yes. Networking being one of those. Real cascading happens once
>> > in a blue moon, but it happens.
>>
>> So I'd really prefer it if we added a few more levels, a hard limit and got rid of
>> the cascading once and for all!
>>
>> IMHO 'once in a blue moon' code is much worse than a bit more data overhead.
>
> I agree. If we add two wheel levels then we end up with:
>
>   HZ 1000:  134217727 ms ~=  37 hours
>   HZ  250:  536870908 ms ~= 149 hours
>   HZ  100: 1342177270 ms ~= 372 hours
>
> Looking through all my data I found exactly one timeout which is insanely
> large: 120 hours!
>
> That's net/netfilter/nf_conntrack_core.c:
>       setup_timer(&ct->timeout, death_by_timeout, (unsigned long)ct);
>
> Anything else is way below 37 hours.
>
> Thanks,
>
>         tglx

^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [patch 13/20] timer: Switch to a non cascading wheel
  2016-06-14 17:14           ` Arjan van de Ven
@ 2016-06-14 18:05             ` Thomas Gleixner
  2016-06-14 20:34               ` Peter Zijlstra
  2016-06-14 20:42               ` Peter Zijlstra
  0 siblings, 2 replies; 62+ messages in thread
From: Thomas Gleixner @ 2016-06-14 18:05 UTC (permalink / raw)
  To: Arjan van de Ven
  Cc: Ingo Molnar, Peter Zijlstra, LKML, Paul E. McKenney,
	Eric Dumazet, Frederic Weisbecker, Chris Mason, Arjan van de Ven,
	rt, Linus Torvalds

On Tue, 14 Jun 2016, Arjan van de Ven wrote:

> evaluating a 120 hour timer every 37 hours to see if it should fire...
> not too horrid.

Well that thing is doing weird stuff anyway:

   swapper     0 [001] 1789995.305532: timer:timer_start: timer=0xffff8800c8346920 function=death_by_timeout expires=4850639994 [timeout=108000000]
             ssh  3870 [001] 1790025.284704: timer:timer_cancel: timer=0xffff8800c8346920
             ssh  3870 [001] 1790025.284707: timer:timer_start: timer=0xffff8800c8346920 function=death_by_timeout expires=4742722493 [timeout=75000]
         swapper     0 [001] 1790025.330514: timer:timer_cancel: timer=0xffff8800c8346920
         swapper     0 [001] 1790025.330515: timer:timer_start: timer=0xffff8800c8346920 function=death_by_timeout expires=4850647504 [timeout=108000000]
             ssh  3870 [001] 1790055.307058: timer:timer_cancel: timer=0xffff8800c8346920
             ssh  3870 [001] 1790055.307060: timer:timer_start: timer=0xffff8800c8346920 function=death_by_timeout expires=4742730003 [timeout=75000]
         swapper     0 [001] 1790055.352146: timer:timer_cancel: timer=0xffff8800c8346920

And that goes on forever. 2834 such sequences for this particular timer
instance in 4.5 hours. 90000 sequences total for all timers related to
death_by_timeout in 4.5 hours

No idea what this is doing and why the heck it needs a 120 hour timeout ....

Thanks,

	tglx

^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [patch 13/20] timer: Switch to a non cascading wheel
  2016-06-14 18:05             ` Thomas Gleixner
@ 2016-06-14 20:34               ` Peter Zijlstra
  2016-06-14 20:42               ` Peter Zijlstra
  1 sibling, 0 replies; 62+ messages in thread
From: Peter Zijlstra @ 2016-06-14 20:34 UTC (permalink / raw)
  To: Thomas Gleixner
  Cc: Arjan van de Ven, Ingo Molnar, LKML, Paul E. McKenney,
	Eric Dumazet, Frederic Weisbecker, Chris Mason, Arjan van de Ven,
	rt, Linus Torvalds

On Tue, Jun 14, 2016 at 08:05:49PM +0200, Thomas Gleixner wrote:
> On Tue, 14 Jun 2016, Arjan van de Ven wrote:
> 
> > evaluating a 120 hour timer every 37 hours to see if it should fire...
> > not too horrid.
> 
> Well that thing is doing weird stuff anyway:
> 
>    swapper     0 [001] 1789995.305532: timer:timer_start: timer=0xffff8800c8346920 function=death_by_timeout expires=4850639994 [timeout=108000000]

You're running a HZ=250 kernel, right?

net/netfilter/nf_conntrack_proto_tcp.c:

#define SECS * HZ
#define MINS * 60 SECS
#define HOURS * 60 MINS
#define DAYS * 24 HOURS

static unsigned int tcp_timeouts[TCP_CONNTRACK_TIMEOUT_MAX] __read_mostly = {
	[TCP_CONNTRACK_SYN_SENT]	= 2 MINS,
	[TCP_CONNTRACK_SYN_RECV]	= 60 SECS,
	[TCP_CONNTRACK_ESTABLISHED]	= 5 DAYS,

					  ^^^^^^ that
ends up being 108000000 for HZ == 250.
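
(5 days * 86400 s/day * 250 jiffies/s = 108000000 jiffies, matching the
timeout= value in the trace above.)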


	[TCP_CONNTRACK_FIN_WAIT]	= 2 MINS,
	[TCP_CONNTRACK_CLOSE_WAIT]	= 60 SECS,
	[TCP_CONNTRACK_LAST_ACK]	= 30 SECS,
	[TCP_CONNTRACK_TIME_WAIT]	= 2 MINS,
	[TCP_CONNTRACK_CLOSE]		= 10 SECS,
	[TCP_CONNTRACK_SYN_SENT2]	= 2 MINS,
/* RFC1122 says the R2 limit should be at least 100 seconds.
   Linux uses 15 packets as limit, which corresponds
   to ~13-30min depending on RTO. */
	[TCP_CONNTRACK_RETRANS]		= 5 MINS,
	[TCP_CONNTRACK_UNACK]		= 5 MINS,
};


>              ssh  3870 [001] 1790025.284704: timer:timer_cancel: timer=0xffff8800c8346920
>              ssh  3870 [001] 1790025.284707: timer:timer_start: timer=0xffff8800c8346920 function=death_by_timeout expires=4742722493 [timeout=75000]

And that one would then be one of the 5 MINS ones.

^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [patch 13/20] timer: Switch to a non cascading wheel
  2016-06-14 18:05             ` Thomas Gleixner
  2016-06-14 20:34               ` Peter Zijlstra
@ 2016-06-14 20:42               ` Peter Zijlstra
  2016-06-14 21:17                 ` Eric Dumazet
  1 sibling, 1 reply; 62+ messages in thread
From: Peter Zijlstra @ 2016-06-14 20:42 UTC (permalink / raw)
  To: Thomas Gleixner
  Cc: Arjan van de Ven, Ingo Molnar, LKML, Paul E. McKenney,
	Eric Dumazet, Frederic Weisbecker, Chris Mason, Arjan van de Ven,
	rt, Linus Torvalds

On Tue, Jun 14, 2016 at 08:05:49PM +0200, Thomas Gleixner wrote:
> On Tue, 14 Jun 2016, Arjan van de Ven wrote:
> 
> > evaluating a 120 hour timer every 37 hours to see if it should fire...
> > not too horrid.
> 
> Well that thing is doing weird stuff anyway:
> 
>    swapper     0 [001] 1789995.305532: timer:timer_start: timer=0xffff8800c8346920 function=death_by_timeout expires=4850639994 [timeout=108000000]
>              ssh  3870 [001] 1790025.284704: timer:timer_cancel: timer=0xffff8800c8346920
>              ssh  3870 [001] 1790025.284707: timer:timer_start: timer=0xffff8800c8346920 function=death_by_timeout expires=4742722493 [timeout=75000]
>          swapper     0 [001] 1790025.330514: timer:timer_cancel: timer=0xffff8800c8346920
>          swapper     0 [001] 1790025.330515: timer:timer_start: timer=0xffff8800c8346920 function=death_by_timeout expires=4850647504 [timeout=108000000]
>              ssh  3870 [001] 1790055.307058: timer:timer_cancel: timer=0xffff8800c8346920
>              ssh  3870 [001] 1790055.307060: timer:timer_start: timer=0xffff8800c8346920 function=death_by_timeout expires=4742730003 [timeout=75000]
>          swapper     0 [001] 1790055.352146: timer:timer_cancel: timer=0xffff8800c8346920
> 
> And that goes on forever. 2834 such sequences for this particular timer
> instance in 4.5 hours. 90000 sequences total for all timers related to
> death_by_timeout in 4.5 hours
> 
> No idea what this is doing and why the heck it needs a 120 hour timeout ....

So it moves that timer on every packet for that TCP connection stream,
provided the expiration is at least 1 second behind.

If the stream hasn't had a packet in 5 days (see previous email), then
the connection state is destroyed.

It's been too long since I've read the TCP RFCs, but I can imagine
changing this will upset people.

^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [patch 13/20] timer: Switch to a non cascading wheel
  2016-06-14 20:42               ` Peter Zijlstra
@ 2016-06-14 21:17                 ` Eric Dumazet
  2016-06-15 14:53                   ` Thomas Gleixner
  0 siblings, 1 reply; 62+ messages in thread
From: Eric Dumazet @ 2016-06-14 21:17 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Thomas Gleixner, Arjan van de Ven, Ingo Molnar, LKML,
	Paul E. McKenney, Frederic Weisbecker, Chris Mason,
	Arjan van de Ven, rt, Linus Torvalds

On Tue, Jun 14, 2016 at 1:42 PM, Peter Zijlstra <peterz@infradead.org> wrote:
> On Tue, Jun 14, 2016 at 08:05:49PM +0200, Thomas Gleixner wrote:
>> On Tue, 14 Jun 2016, Arjan van de Ven wrote:
>>
>> > evaluating a 120 hour timer every 37 hours to see if it should fire...
>> > not too horrid.
>>
>> Well that thing is doing weird stuff anyway:
>>
>>    swapper     0 [001] 1789995.305532: timer:timer_start: timer=0xffff8800c8346920 function=death_by_timeout expires=4850639994 [timeout=108000000]
>>              ssh  3870 [001] 1790025.284704: timer:timer_cancel: timer=0xffff8800c8346920
>>              ssh  3870 [001] 1790025.284707: timer:timer_start: timer=0xffff8800c8346920 function=death_by_timeout expires=4742722493 [timeout=75000]
>>          swapper     0 [001] 1790025.330514: timer:timer_cancel: timer=0xffff8800c8346920
>>          swapper     0 [001] 1790025.330515: timer:timer_start: timer=0xffff8800c8346920 function=death_by_timeout expires=4850647504 [timeout=108000000]
>>              ssh  3870 [001] 1790055.307058: timer:timer_cancel: timer=0xffff8800c8346920
>>              ssh  3870 [001] 1790055.307060: timer:timer_start: timer=0xffff8800c8346920 function=death_by_timeout expires=4742730003 [timeout=75000]
>>          swapper     0 [001] 1790055.352146: timer:timer_cancel: timer=0xffff8800c8346920
>>
>> And that goes on forever. 2834 such sequences for this particular timer
>> instance in 4.5 hours. 90000 sequences total for all timers related to
>> death_by_timeout in 4.5 hours
>>
>> No idea what this is doing and why the heck it needs a 120 hour timeout ....
>
> So it moves that timer on every packet for that TCP connection stream,
> provided the expiration is at least 1 second behind.
>
> If the stream hasn't had a packet in 5 days (see previous email), then
> the connection state is destroyed.
>
> Its been too long since I've read the TCP RFCs, but I can imagine
> changing this will upset people.
>

The original TCP RFCs say the timeout is infinite ;)

Practically, conntrack has a 5 day timeout, but I really doubt anyone
expects an idle TCP flow to stay 'alive' when nothing is sent for 5
days.

^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [patch 13/20] timer: Switch to a non cascading wheel
  2016-06-14 21:17                 ` Eric Dumazet
@ 2016-06-15 14:53                   ` Thomas Gleixner
  2016-06-15 14:55                     ` Arjan van de Ven
  2016-06-15 15:05                     ` Eric Dumazet
  0 siblings, 2 replies; 62+ messages in thread
From: Thomas Gleixner @ 2016-06-15 14:53 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Peter Zijlstra, Arjan van de Ven, Ingo Molnar, LKML,
	Paul E. McKenney, Frederic Weisbecker, Chris Mason,
	Arjan van de Ven, Linus Torvalds

On Tue, 14 Jun 2016, Eric Dumazet wrote:
> The original TCP RFCs say the timeout is infinite ;)
> 
> Practically, conntrack has a 5 day timeout, but I really doubt anyone
> expects an idle TCP flow to stay 'alive' when nothing is sent for 5
> days.

So would 37hrs ~= 1.5 days be a reasonable cutoff or will stuff fall apart and
people be surprised?

Thanks,

	tglx

^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [patch 13/20] timer: Switch to a non cascading wheel
  2016-06-15 14:53                   ` Thomas Gleixner
@ 2016-06-15 14:55                     ` Arjan van de Ven
  2016-06-15 16:43                       ` Thomas Gleixner
  2016-06-15 15:05                     ` Eric Dumazet
  1 sibling, 1 reply; 62+ messages in thread
From: Arjan van de Ven @ 2016-06-15 14:55 UTC (permalink / raw)
  To: Thomas Gleixner
  Cc: Eric Dumazet, Peter Zijlstra, Ingo Molnar, LKML,
	Paul E. McKenney, Frederic Weisbecker, Chris Mason,
	Arjan van de Ven, Linus Torvalds

what would 1 more timer wheel do?

On Wed, Jun 15, 2016 at 7:53 AM, Thomas Gleixner <tglx@linutronix.de> wrote:
> On Tue, 14 Jun 2016, Eric Dumazet wrote:
>> The original TCP RFCs say the timeout is infinite ;)
>>
>> Practically, conntrack has a 5 day timeout, but I really doubt anyone
>> expects an idle TCP flow to stay 'alive' when nothing is sent for 5
>> days.
>
> So would 37hrs ~= 1.5 days be a reasonable cutoff or will stuff fall apart and
> people be surprised?
>
> Thanks,
>
>         tglx
>

^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [patch 13/20] timer: Switch to a non cascading wheel
  2016-06-15 14:53                   ` Thomas Gleixner
  2016-06-15 14:55                     ` Arjan van de Ven
@ 2016-06-15 15:05                     ` Eric Dumazet
  1 sibling, 0 replies; 62+ messages in thread
From: Eric Dumazet @ 2016-06-15 15:05 UTC (permalink / raw)
  To: Thomas Gleixner
  Cc: Peter Zijlstra, Arjan van de Ven, Ingo Molnar, LKML,
	Paul E. McKenney, Frederic Weisbecker, Chris Mason,
	Arjan van de Ven, Linus Torvalds

On Wed, Jun 15, 2016 at 7:53 AM, Thomas Gleixner <tglx@linutronix.de> wrote:
> On Tue, 14 Jun 2016, Eric Dumazet wrote:
>> The original TCP RFCs say the timeout is infinite ;)
>>
>> Practically, conntrack has a 5 day timeout, but I really doubt anyone
>> expects an idle TCP flow to stay 'alive' when nothing is sent for 5
>> days.
>
> So would 37hrs ~= 1.5 days be a reasonable cutoff or will stuff fall apart and
> people be surprised?
>

It seems very reasonable to me at least.

^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [patch 00/20] timer: Refactor the timer wheel
  2016-06-13 16:15 ` Paul E. McKenney
@ 2016-06-15 15:15   ` Paul E. McKenney
  2016-06-15 17:02     ` Thomas Gleixner
  0 siblings, 1 reply; 62+ messages in thread
From: Paul E. McKenney @ 2016-06-15 15:15 UTC (permalink / raw)
  To: Thomas Gleixner
  Cc: LKML, Ingo Molnar, Peter Zijlstra, Eric Dumazet,
	Frederic Weisbecker, Chris Mason, Arjan van de Ven, rt

On Mon, Jun 13, 2016 at 09:15:14AM -0700, Paul E. McKenney wrote:
> On Mon, Jun 13, 2016 at 08:40:50AM -0000, Thomas Gleixner wrote:
> > The current timer wheel has some drawbacks:
> > 
> > 1) Cascading
> > 
> >    Cascading can be an unbound operation and is completely pointless in most
> >    cases because the vast majority of the timer wheel timers are canceled or
> >    rearmed before expiration.
> > 
> > 2) No fast lookup of the next expiring timer
> > 
> >    In NOHZ scenarios the first timer soft interrupt after a long NOHZ period
> >    must fast forward the base time to current jiffies. As we have no way to
> >    find the next expiring timer fast, the code loops and increments the base
> >    time by one and checks for expired timers in each step. I've observed loops
> >    lasting 1 ms!
> > 
> > There are some other issues caused by the above, but they are minor compared to
> > those.
> 
> For SMP configurations, this passes light rcutorture testing.  For UP
> builds, it complains about undefined symbols.  Builds succeed with
> the following kneejerk patch.  Am retesting rcutorture.

And with the patch below, testing goes as well with your patch stack as
it does without it.  So, with that patch (or equivalent):

Tested-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>

There were some complaints about increasing the size of the tiny
configuration, FYI.

So, just out of curiosity, does anyone still run -rt on single-CPU systems?

							Thanx, Paul

> ------------------------------------------------------------------------
> 
> commit 87dbd35cf4034f2b664b6e9d60decd2bdbfc416f
> Author: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
> Date:   Mon Jun 13 09:11:41 2016 -0700
> 
>     Fix undefined get_nohz_timer_target() in non-SMP builds
>     
>     Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
> 
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index 6e42ada26345..322e52415a29 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -348,6 +348,10 @@ extern int get_nohz_timer_target(void);
>  #else
>  static inline void nohz_balance_enter_idle(int cpu) { }
>  static inline void set_cpu_sd_state_idle(void) { }
> +static inline int get_nohz_timer_target(void)
> +{
> +	return raw_smp_processor_id();
> +}
>  #endif
>  
>  /*

^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [patch 13/20] timer: Switch to a non cascading wheel
  2016-06-15 14:55                     ` Arjan van de Ven
@ 2016-06-15 16:43                       ` Thomas Gleixner
  2016-06-16 15:43                         ` Thomas Gleixner
  0 siblings, 1 reply; 62+ messages in thread
From: Thomas Gleixner @ 2016-06-15 16:43 UTC (permalink / raw)
  To: Arjan van de Ven
  Cc: Eric Dumazet, Peter Zijlstra, Ingo Molnar, LKML,
	Paul E. McKenney, Frederic Weisbecker, Chris Mason,
	Arjan van de Ven, Linus Torvalds

On Wed, 15 Jun 2016, Arjan van de Ven wrote:
> what would 1 more timer wheel do?

Waste storage space and make the collection of expired timers more expensive.

The selection of the timer wheel properties is a combination of:

    1) Granularity 

    2) Storage space

    3) Number of levels to collect

#1 Influences the accuracy of the wheel levels. The worst case error is the
   ratio of the granularity of a level to the shortest timeout in that level,
   e.g. 8 ms granularity against a 64 ms shortest timeout in level 1 at
   HZ=1000, i.e. 12.5%.

   Currently I've chosen 12.5%. If we go to 25% worst case, then we increase
   the per level capacity by a factor of (1 << lvl).

#2 It strikes me as a bit silly to waste storage space for esoteric
   outliers. According to the data I collected we have no timers at all
   between 2hrs and 5 days.

#3 The more levels we have, the more steps we have when collecting expired
   timers or searching the next expiring timer. It's all bound, but it still
   adds up.

Thanks,

	tglx




   

^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [patch 00/20] timer: Refactor the timer wheel
  2016-06-15 15:15   ` Paul E. McKenney
@ 2016-06-15 17:02     ` Thomas Gleixner
  2016-06-15 20:26       ` Paul E. McKenney
  0 siblings, 1 reply; 62+ messages in thread
From: Thomas Gleixner @ 2016-06-15 17:02 UTC (permalink / raw)
  To: Paul E. McKenney
  Cc: LKML, Ingo Molnar, Peter Zijlstra, Eric Dumazet,
	Frederic Weisbecker, Chris Mason, Arjan van de Ven, rt

On Wed, 15 Jun 2016, Paul E. McKenney wrote:
> On Mon, Jun 13, 2016 at 09:15:14AM -0700, Paul E. McKenney wrote:
> > On Mon, Jun 13, 2016 at 08:40:50AM -0000, Thomas Gleixner wrote:
> > > The current timer wheel has some drawbacks:
> > > 
> > > 1) Cascading
> > > 
> > >    Cascading can be an unbound operation and is completely pointless in most
> > >    cases because the vast majority of the timer wheel timers are canceled or
> > >    rearmed before expiration.
> > > 
> > > 2) No fast lookup of the next expiring timer
> > > 
> > >    In NOHZ scenarios the first timer soft interrupt after a long NOHZ period
> > >    must fast forward the base time to current jiffies. As we have no way to
> > >    find the next expiring timer fast, the code loops and increments the base
> > >    time by one and checks for expired timers in each step. I've observed loops
> > >    lasting 1 ms!
> > > 
> > > There are some other issues caused by the above, but they are minor compared to
> > > those.
> > 
> > For SMP configurations, this passes light rcutorture testing.  For UP
> > builds, it complains about undefined symbols.  Builds succeed with
> > the following kneejerk patch.  Am retesting rcutorture.
> 
> And with the patch below, testing goes as well with your patch stack as
> it does without it.  So, with that patch (or equivalent):
> 
> Tested-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
> 
> There were some complaints about increasing the size of the tiny
> configuration, FYI.

I know. The extra storage space for the deferrable stuff makes it larger along
with the extra code for avoiding all the crap which the current wheel suffers
from :) Do the tiny people need NOHZ?

> So, just out of curiosity, does anyone still run -rt on single-CPU systems?

Of course :)

Thanks,

	tglx

^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [patch 00/20] timer: Refactor the timer wheel
  2016-06-15 17:02     ` Thomas Gleixner
@ 2016-06-15 20:26       ` Paul E. McKenney
  0 siblings, 0 replies; 62+ messages in thread
From: Paul E. McKenney @ 2016-06-15 20:26 UTC (permalink / raw)
  To: Thomas Gleixner
  Cc: LKML, Ingo Molnar, Peter Zijlstra, Eric Dumazet,
	Frederic Weisbecker, Chris Mason, Arjan van de Ven, rt

On Wed, Jun 15, 2016 at 07:02:38PM +0200, Thomas Gleixner wrote:
> On Wed, 15 Jun 2016, Paul E. McKenney wrote:
> > On Mon, Jun 13, 2016 at 09:15:14AM -0700, Paul E. McKenney wrote:
> > > On Mon, Jun 13, 2016 at 08:40:50AM -0000, Thomas Gleixner wrote:
> > > > The current timer wheel has some drawbacks:
> > > > 
> > > > 1) Cascading
> > > > 
> > > >    Cascading can be an unbound operation and is completely pointless in most
> > > >    cases because the vast majority of the timer wheel timers are canceled or
> > > >    rearmed before expiration.
> > > > 
> > > > 2) No fast lookup of the next expiring timer
> > > > 
> > > >    In NOHZ scenarios the first timer soft interrupt after a long NOHZ period
> > > >    must fast forward the base time to current jiffies. As we have no way to
> > > >    find the next expiring timer fast, the code loops and increments the base
> > > >    time by one and checks for expired timers in each step. I've observed loops
> > > >    lasting 1 ms!
> > > > 
> > > > There are some other issues caused by the above, but they are minor compared to
> > > > those.
> > > 
> > > For SMP configurations, this passes light rcutorture testing.  For UP
> > > builds, it complains about undefined symbols.  Builds succeed with
> > > the following kneejerk patch.  Am retesting rcutorture.
> > 
> > And with the patch below, testing goes as well with your patch stack as
> > it does without it.  So, with that patch (or equivalent):
> > 
> > Tested-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
> > 
> > There were some complaints about increasing the size of the tiny
> > configuration, FYI.
> 
> I know. The extra storage space for the deferrable stuff makes it larger along
> with the extra code for avoiding all the crap which the current wheel suffers
> from :) Do the tiny people need NOHZ?

If it makes the code bigger, I would hope that they don't need it.

> > So, just out of curiosity, does anyone still run -rt on single-CPU systems?
> 
> Of course :)

Hey, had to ask!  ;-)

								Thanx, Paul

^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [patch 13/20] timer: Switch to a non cascading wheel
  2016-06-15 16:43                       ` Thomas Gleixner
@ 2016-06-16 15:43                         ` Thomas Gleixner
  2016-06-16 16:02                           ` Paul E. McKenney
  2016-06-16 16:04                           ` Arjan van de Ven
  0 siblings, 2 replies; 62+ messages in thread
From: Thomas Gleixner @ 2016-06-16 15:43 UTC (permalink / raw)
  To: Arjan van de Ven
  Cc: Eric Dumazet, Peter Zijlstra, Ingo Molnar, LKML,
	Paul E. McKenney, Frederic Weisbecker, Chris Mason,
	Arjan van de Ven, Linus Torvalds, George Spelvin

On Wed, 15 Jun 2016, Thomas Gleixner wrote:
> On Wed, 15 Jun 2016, Arjan van de Ven wrote:
> > what would 1 more timer wheel do?
> 
> Waste storage space and make the collection of expired timers more expensive.
> 
> The selection of the timer wheel properties is a combination of:
> 
>     1) Granularity 
> 
>     2) Storage space
> 
>     3) Number of levels to collect

So I came up with a slightly different solution for this. The problem case is
HZ=1000 and again looking at the data, there is no reason why we need actual
1ms granularity for timer wheel timers. That's independent of the desired ms
based interfaces.

We can simply run the wheel internally with 4ms base level resolution and
degrade from there. That gives us 6 days+ and a simple cutoff at the capacity
of the 7th level wheel.

 Level Offset  Granularity            Range
 0     0        4 ms               0 ms -        255 ms
 1    64       32 ms             256 ms -       2047 ms (256ms - ~2s)
 2   128      256 ms            2048 ms -      16383 ms (~2s - ~16s) 
 3   192     2048 ms (~2s)     16384 ms -     131071 ms (~16s - ~2m)
 4   256    16384 ms (~16s)   131072 ms -    1048575 ms (~2m - ~17m)
 5   320   131072 ms (~2m)   1048576 ms -    8388607 ms (~17m - ~2h)
 6   384  1048576 ms (~17m)  8388608 ms -   67108863 ms (~2h - ~18h)
 7   448  8388608 ms (~2h)  67108864 ms -  536870911 ms (~18h - ~6d)
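
(Worked check: 64 buckets * 4 ms = 256 ms of range at level 0; every further
level multiplies granularity and range by 8, so level 7 ends at 2^29 - 1 ms
= 536870911 ms, a bit over 6 days.)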

That works really nicely and has the interesting side effect that we batch in
the first level wheel, which helps networking. I'll repost the series with the
other review points addressed later tonight.

Btw, I also thought a bit more about the milliseconds interfaces. I think we
shouldn't invent new interfaces. The correct solution IMHO is to disentangle the
scheduler tick frequency and jiffies. If we have that completely separated
then we can do the following:

1) Force HZ=1000. That means jiffies and timer wheel units are 1ms. If the
   tick frequency is != 1000 we simply increment jiffies in the tick by the
   proper amount (4 @250 ticks/sec, 10 @100 ticks/sec).

   So all msecs_to_jiffies() invocations compile out into nothing magically and
   we can remove them gradually over time.

2) When we do that right, we can make the tick frequency a command line option
   and just have a compiled in default.
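
A minimal sketch of (1), with JIFFIES_PER_TICK as a hypothetical constant;
do_timer() is the existing jiffies update path:

	/* jiffies stay in 1 ms units, the tick fires at 1000/JIFFIES_PER_TICK Hz */
	#define JIFFIES_PER_TICK	4	/* 250 ticks/sec */

	static void periodic_tick(void)
	{
		/* Advance the ms based jiffies by one full tick period */
		do_timer(JIFFIES_PER_TICK);
	}

With that, msecs_to_jiffies() degenerates to an identity mapping and can be
removed gradually.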

Thoughts?

Thanks,

	tglx

^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [patch 13/20] timer: Switch to a non cascading wheel
  2016-06-16 15:43                         ` Thomas Gleixner
@ 2016-06-16 16:02                           ` Paul E. McKenney
  2016-06-16 18:14                             ` Peter Zijlstra
  2016-06-16 16:04                           ` Arjan van de Ven
  1 sibling, 1 reply; 62+ messages in thread
From: Paul E. McKenney @ 2016-06-16 16:02 UTC (permalink / raw)
  To: Thomas Gleixner
  Cc: Arjan van de Ven, Eric Dumazet, Peter Zijlstra, Ingo Molnar,
	LKML, Frederic Weisbecker, Chris Mason, Arjan van de Ven,
	Linus Torvalds, George Spelvin

On Thu, Jun 16, 2016 at 05:43:36PM +0200, Thomas Gleixner wrote:
> On Wed, 15 Jun 2016, Thomas Gleixner wrote:
> > On Wed, 15 Jun 2016, Arjan van de Ven wrote:
> > > what would 1 more timer wheel do?
> > 
> > Waste storage space and make the collection of expired timers more expensive.
> > 
> > The selection of the timer wheel properties is a combination of:
> > 
> >     1) Granularity 
> > 
> >     2) Storage space
> > 
> >     3) Number of levels to collect
> 
> So I came up with a slightly different solution for this. The problem case is
> HZ=1000 and again looking at the data, there is no reason why we need actual
> 1ms granularity for timer wheel timers. That's independent of the desired ms
> based interfaces.
> 
> We can simply run the wheel internally with 4ms base level resolution and
> degrade from there. That gives us 6 days+ and a simple cutoff at the capacity
> of the 7th level wheel.
> 
>  0     0        4 ms               0 ms -        255 ms		    
>  1    64       32 ms             256 ms -       2047 ms (256ms - ~2s)
>  2   128      256 ms            2048 ms -      16383 ms (~2s - ~16s) 
>  3   192     2048 ms (~2s)     16384 ms -     131071 ms (~16s - ~2m)
>  4   256    16384 ms (~16s)   131072 ms -    1048575 ms (~2m - ~17m)
>  5   320   131072 ms (~2m)   1048576 ms -    8388607 ms (~17m - ~2h)
>  6   384  1048576 ms (~17m)  8388608 ms -   67108863 ms (~2h - ~18h)
>  7   448  8388608 ms (~2h)  67108864 ms -  536870911 ms (~18h - ~6d)
> 
> That works really nice and has the interesting side effect that we batch in
> the first level wheel which helps networking. I'll repost the series with the
> other review points addressed later tonight.
> 
> Btw, I also thought a bit more about the milliseconds interfaces. I think we
> shouldn't invent new interfaces. The correct solution IMHO is to disentangle the
> scheduler tick frequency and jiffies. If we have that completely separated
> then we can do the following:
> 
> 1) Force HZ=1000. That means jiffies and timer wheel units are 1ms. If the
>    tick frequency is != 1000 we simply increment jiffies in the tick by the
>    proper amount (4 @250 ticks/sec, 10 @100 ticks/sec).
> 
>    So all msecs_to_jiffies() invocations magically compile out to nothing and
>    we can remove them gradually over time.

Some of RCU's heuristics assume that if scheduling-clock ticks happen,
they happen once per jiffy.  These would need to be adjusted, which would
not be a big deal, just a bit more use of HZ.

> 2) When we do that right, we can make the tick frequency a command line option
>    and just have a compiled in default.

As long as there is something that tells RCU what the tick frequency
actually is at runtime, this should not be a problem.  For example,
in rcu_implicit_dynticks_qs(), the following:

	rdp->rsp->jiffies_resched += 5;

Would instead need to be something like:

	rdp->rsp->jiffies_resched += 5 * jiffies_per_tick;

Changing tick frequency at runtime would be a bit more tricky, as it would
be tough to avoid some oddball false positives during the transition.
But setting it at boot time would be fine.  ;-)

							Thanx, Paul

> Thoughts?
> 
> Thanks,
> 
> 	tglx
> 

* Re: [patch 13/20] timer: Switch to a non cascading wheel
  2016-06-16 15:43                         ` Thomas Gleixner
  2016-06-16 16:02                           ` Paul E. McKenney
@ 2016-06-16 16:04                           ` Arjan van de Ven
  2016-06-16 16:09                             ` Thomas Gleixner
  1 sibling, 1 reply; 62+ messages in thread
From: Arjan van de Ven @ 2016-06-16 16:04 UTC (permalink / raw)
  To: Thomas Gleixner
  Cc: Eric Dumazet, Peter Zijlstra, Ingo Molnar, LKML,
	Paul E. McKenney, Frederic Weisbecker, Chris Mason,
	Arjan van de Ven, Linus Torvalds, George Spelvin

I think there are two elements to the interface.

1) having a relative interface to the current time (avoid use of
absolute jiffies in drivers)

2) having wallclock units. Making HZ always be 1000 is effectively
doing that as well (1 msec after all)



On Thu, Jun 16, 2016 at 8:43 AM, Thomas Gleixner <tglx@linutronix.de> wrote:
> On Wed, 15 Jun 2016, Thomas Gleixner wrote:
>> On Wed, 15 Jun 2016, Arjan van de Ven wrote:
>> > what would 1 more timer wheel do?
>>
>> Waste storage space and make the collection of expired timers more expensive.
>>
>> The selection of the timer wheel properties is a combination of:
>>
>>     1) Granularity
>>
>>     2) Storage space
>>
>>     3) Number of levels to collect
>
> So I came up with a slightly different solution for this. The problem case is
> HZ=1000 and again looking at the data, there is no reason why we need actual
> 1ms granularity for timer wheel timers. That's independent of the desired ms
> based interfaces.
>
> We can simply run the wheel internally with 4ms base level resolution and
> degrade from there. That gives us 6 days+ and a simple cutoff at the capacity
> of the 7th level wheel.
>
>  0     0        4 ms               0 ms -        255 ms
>  1    64       32 ms             256 ms -       2047 ms (256ms - ~2s)
>  2   128      256 ms            2048 ms -      16383 ms (~2s - ~16s)
>  3   192     2048 ms (~2s)     16384 ms -     131071 ms (~16s - ~2m)
>  4   256    16384 ms (~16s)   131072 ms -    1048575 ms (~2m - ~17m)
>  5   320   131072 ms (~2m)   1048576 ms -    8388607 ms (~17m - ~2h)
>  6   384  1048576 ms (~17m)  8388608 ms -   67108863 ms (~2h - ~18h)
>  7   448  8388608 ms (~2h)  67108864 ms -  536870911 ms (~18h - ~6d)
>
> That works really nicely and has the interesting side effect that we batch in
> the first-level wheel, which helps networking. I'll repost the series with the
> other review points addressed later tonight.
>
> Btw, I also thought a bit more about the milliseconds interfaces. I think we
> shouldn't invent new interfaces. The correct solution IMHO is to disentangle the
> scheduler tick frequency and jiffies. If we have that completely separated
> then we can do the following:
>
> 1) Force HZ=1000. That means jiffies and timer wheel units are 1ms. If the
>    tick frequency is != 1000 we simply increment jiffies in the tick by the
>    proper amount (4 @250 ticks/sec, 10 @100 ticks/sec).
>
>    So all msecs_to_jiffies() invocations magically compile out to nothing and
>    we can remove them gradually over time.
>
> 2) When we do that right, we can make the tick frequency a command line option
>    and just have a compiled in default.
>
> Thoughts?
>
> Thanks,
>
>         tglx

* Re: [patch 13/20] timer: Switch to a non cascading wheel
  2016-06-16 16:04                           ` Arjan van de Ven
@ 2016-06-16 16:09                             ` Thomas Gleixner
  0 siblings, 0 replies; 62+ messages in thread
From: Thomas Gleixner @ 2016-06-16 16:09 UTC (permalink / raw)
  To: Arjan van de Ven
  Cc: Eric Dumazet, Peter Zijlstra, Ingo Molnar, LKML,
	Paul E. McKenney, Frederic Weisbecker, Chris Mason,
	Arjan van de Ven, Linus Torvalds, George Spelvin

On Thu, 16 Jun 2016, Arjan van de Ven wrote:

> I think there are two elements to the interface.
> 
> 1) having a relative interface to the current time (avoid use of
> absolute jiffies in drivers)

That's the easy part :)
 
> 2) having wallclock units. Making HZ always be 1000 is effectively
> doing that as well (1 msec after all)

Right. And it allows us to switch over the whole code base in one go without
fiddling with it. Cleanups should obviously take place after that.

Thanks,

	tglx

* Re: [patch 13/20] timer: Switch to a non cascading wheel
  2016-06-16 16:02                           ` Paul E. McKenney
@ 2016-06-16 18:14                             ` Peter Zijlstra
  2016-06-17  0:40                               ` Paul E. McKenney
  0 siblings, 1 reply; 62+ messages in thread
From: Peter Zijlstra @ 2016-06-16 18:14 UTC (permalink / raw)
  To: Paul E. McKenney
  Cc: Thomas Gleixner, Arjan van de Ven, Eric Dumazet, Ingo Molnar,
	LKML, Frederic Weisbecker, Chris Mason, Arjan van de Ven,
	Linus Torvalds, George Spelvin

On Thu, Jun 16, 2016 at 09:02:15AM -0700, Paul E. McKenney wrote:
> > 2) When we do that right, we can make the tick frequency a command line option
> >    and just have a compiled in default.
> 
> As long as there is something that tells RCU what the tick frequency
> actually is at runtime, this should not be a problem.  For example,
> in rcu_implicit_dynticks_qs(), the following:
> 
> 	rdp->rsp->jiffies_resched += 5;
> 
> Would instead need to be something like:
> 
> 	rdp->rsp->jiffies_resched += 5 * jiffies_per_tick;
> 
> Changing tick frequency at runtime would be a bit more tricky, as it would
> be tough to avoid some oddball false positives during the transition.

So the 'fun' part will be tick frequencies that are not integer factors of
1000, like say HZ=300. For that we'll have to keep a jiffies_remainder and
add an extra jiffy every time it rolls over.

That would make your case slightly more interesting than you really
want, I suspect.
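
A runnable toy model of that remainder scheme at HZ=300 (all names here are
invented for illustration, not proposed kernel code):

#include <stdio.h>

int main(void)
{
	const unsigned int tick_hz = 300;	/* 1000/300 = 3ms per tick, remainder 100 */
	unsigned long jiffies_ms = 0, remainder = 0;

	for (int tick = 0; tick < 300; tick++) {	/* simulate one second */
		jiffies_ms += 1000 / tick_hz;
		remainder += 1000 % tick_hz;
		if (remainder >= tick_hz) {	/* the remainder rolled over... */
			remainder -= tick_hz;
			jiffies_ms++;		/* ...so add the extra jiffy */
		}
	}
	printf("300 ticks advanced jiffies by %lu ms\n", jiffies_ms);	/* 1000 */
	return 0;
}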

* Re: [patch 13/20] timer: Switch to a non cascading wheel
  2016-06-16 18:14                             ` Peter Zijlstra
@ 2016-06-17  0:40                               ` Paul E. McKenney
  2016-06-17  4:04                                 ` Paul E. McKenney
  0 siblings, 1 reply; 62+ messages in thread
From: Paul E. McKenney @ 2016-06-17  0:40 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Thomas Gleixner, Arjan van de Ven, Eric Dumazet, Ingo Molnar,
	LKML, Frederic Weisbecker, Chris Mason, Arjan van de Ven,
	Linus Torvalds, George Spelvin

On Thu, Jun 16, 2016 at 08:14:02PM +0200, Peter Zijlstra wrote:
> On Thu, Jun 16, 2016 at 09:02:15AM -0700, Paul E. McKenney wrote:
> > > 2) When we do that right, we can make the tick frequency a command line option
> > >    and just have a compiled in default.
> > 
> > As long as there is something that tells RCU what the tick frequency
> > actually is at runtime, this should not be a problem.  For example,
> > in rcu_implicit_dynticks_qs(), the following:
> > 
> > 	rdp->rsp->jiffies_resched += 5;
> > 
> > Would instead need to be something like:
> > 
> > 	rdp->rsp->jiffies_resched += 5 * jiffies_per_tick;
> > 
> > Changing tick frequency at runtime would be a bit more tricky, as it would
> > be tough to avoid some oddball false positives during the transition.
> 
> So the 'fun' part will be tick frequencies that are not integer factors of
> 1000, like say HZ=300. For that we'll have to keep a jiffies_remainder and
> add an extra jiffy every time it rolls over.
> 
> That would make your case slightly more interesting than you really
> want, I suspect.

My particular case is not that sensitive, so 1000/300 would be plenty
accurate.

							Thanx, Paul

* Re: [patch 13/20] timer: Switch to a non cascading wheel
  2016-06-17  0:40                               ` Paul E. McKenney
@ 2016-06-17  4:04                                 ` Paul E. McKenney
  0 siblings, 0 replies; 62+ messages in thread
From: Paul E. McKenney @ 2016-06-17  4:04 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Thomas Gleixner, Arjan van de Ven, Eric Dumazet, Ingo Molnar,
	LKML, Frederic Weisbecker, Chris Mason, Arjan van de Ven,
	Linus Torvalds, George Spelvin

On Thu, Jun 16, 2016 at 05:40:51PM -0700, Paul E. McKenney wrote:
> On Thu, Jun 16, 2016 at 08:14:02PM +0200, Peter Zijlstra wrote:
> > On Thu, Jun 16, 2016 at 09:02:15AM -0700, Paul E. McKenney wrote:
> > > > 2) When we do that right, we can make the tick frequency a command line option
> > > >    and just have a compiled in default.
> > > 
> > > As long as there is something that tells RCU what the tick frequency
> > > actually is at runtime, this should not be a problem.  For example,
> > > in rcu_implicit_dynticks_qs(), the following:
> > > 
> > > 	rdp->rsp->jiffies_resched += 5;
> > > 
> > > Would instead need to be something like:
> > > 
> > > 	rdp->rsp->jiffies_resched += 5 * jiffies_per_tick;
> > > 
> > > Changing tick frequency at runtime would be a bit more tricky, as it would
> > > be tough to avoid some oddball false positives during the transition.
> > 
> > So the 'fun' part will be tick frequencies that are not integer factors of
> > 1000, like say HZ=300. For that we'll have to keep a jiffies_remainder and
> > add an extra jiffy every time it rolls over.
> > 
> > That would make your case slightly more interesting than you really
> > want, I suspect.
> 
> My particular case is not that sensitive, so 1000/300 would be plenty
> accurate.

That said...  If someone were to set HZ=501, now -that- could be a
bit problematic.  :-/

							Thanx, Paul

* Re: [patch 13/20] timer: Switch to a non cascading wheel
  2016-06-14 16:48     ` Thomas Gleixner
@ 2016-06-14 19:56       ` George Spelvin
  0 siblings, 0 replies; 62+ messages in thread
From: George Spelvin @ 2016-06-14 19:56 UTC (permalink / raw)
  To: linux, tglx; +Cc: edumazet, linux-kernel, peterz, richardcochran

Thomas Gleixner wrote:
> On Tue, 14 Jun 2016, George Spelvin wrote:
>> If you need to shrink TIMER_ARRAYMASK to fit another flag bit,
> 
> We can accommodate a wheel with 512 buckets with the current ARRAYMASK, and that
> really should be enough.

You're absolutely correct, but I was referring to the possibility that another
flag bit might be needed in the future for some purpose *other* than encoding
a bucket number.  There's no need now, but if next year someone finds an
urgent need for another flag bit, there's a way to proceed.

(Although you could just enlarge "flags"; the removal of "slack" has left
a 32-bit alignment hole.)
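
To illustrate the layout point, a rough sketch of the relevant members on a
64-bit build (member list abbreviated and approximate; this is not the real
struct):

struct timer_list_sketch {
	struct hlist_node	entry;				/* 16 bytes */
	unsigned long		expires;			/*  8 bytes */
	void			(*function)(unsigned long);	/*  8 bytes */
	unsigned long		data;				/*  8 bytes */
	unsigned int		flags;				/*  4 bytes, followed by a
								 * 4-byte hole where "slack"
								 * sat; widening flags to 64
								 * bits would cost nothing */
};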

> The expiry code uses the pending_map already in patch 13 to avoid looking at
> the bucket if it's empty.

My bad, I'm sorry!  I was quickly re-reading it and missed that.
Given the quality of the patch series, I should have expected that and
looked harder.

> Thanks,

Thank *you*.  It really is a pleasure to read.  I can't find anything but the most
insignificant issues to complain about.

* Re: [patch 13/20] timer: Switch to a non cascading wheel
  2016-06-14 12:58   ` George Spelvin
@ 2016-06-14 16:48     ` Thomas Gleixner
  2016-06-14 19:56       ` George Spelvin
  0 siblings, 1 reply; 62+ messages in thread
From: Thomas Gleixner @ 2016-06-14 16:48 UTC (permalink / raw)
  To: George Spelvin; +Cc: peterz, edumazet, linux-kernel, richardcochran

On Tue, 14 Jun 2016, George Spelvin wrote:
> Not enough to be interesting, and the extra levels increase processing
> time.  If you need to shrink TIMER_ARRAYMASK to fit another flag bit,
> the easier way would be to encode only the level rather than the index,
> since you can derive the latter from level and expiry time trivially.
 
We can accommodate a wheel with 512 buckets with the current ARRAYMASK, and that
really should be enough.
 
> A couple of really minor tweaks that could be folded in, if Thomas feels
> like it:
> 
> * It would make sense to move all the TIMER_ARRAYSHIFT/TIMER_ARRAYMASK
>   stuff out of patch 13 and into patch 20.

The expiry code uses the pending_map already in patch 13 to avoid looking at
the bucket if it's empty.

> * It would make sense to change the return type of mod_timer (& Co.),
>   detach_if_pending, and del_timer to bool.
>   ({try_to_,}del_timer_sync return 3 values.)

We can do that as a separate patch. Makes sense.

Thanks,

	tglx

* Re: [patch 13/20] timer: Switch to a non cascading wheel
  2016-06-14 10:20 ` Peter Zijlstra
@ 2016-06-14 12:58   ` George Spelvin
  2016-06-14 16:48     ` Thomas Gleixner
  0 siblings, 1 reply; 62+ messages in thread
From: George Spelvin @ 2016-06-14 12:58 UTC (permalink / raw)
  To: linux, peterz; +Cc: edumazet, linux-kernel, richardcochran, tglx

Peter Zijlstra wrote:
> That did occur to me as well; however I think it would be best to
> eradicate all forms of cascading entirely -- if at all possible.

Agreed.

>> to be replaced with __builtin_clz or similar:

> Problem is the archs that don't have that: the 5 layer branch is
> trivial for all arches, while software clz/fls is far more expensive.

And there's no way to tell if an architecture has a good one, so bleah.

I was thinking about the floating-point number representation and what
the effect of a finer table spacing would be.

The maximum error is determined by the difference between LVL_BITS (=6)
and LVL_CLK_SHIFT (=3).

If you dropped those both by 1, each level would cover 2/3 as many bits of
timer range in 1/2 the space, which would be a slight space saving.  The
current 6 levels (64 * 6 = 384 lists) covering 6 + 3*5 = 21 bits could be
done with 32 * 9 = 288 lists (5 + 2*8 = 21 bits).

Not enough to be interesting, and the extra levels increase processing
time.  If you need to shrink TIMER_ARRAYMASK to fit another flag bit,
the easier way would be to encode only the level rather than the index,
since you can derive the latter from level and expiry time trivially.
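
A sketch of that level-only encoding, using the LVL_* helpers suggested
further down in this thread (illustrative; it also ignores the round-up to
the next bin that calc_index() applies when the timer is enqueued):

#define LVL_CLK_SHIFT	3
#define LVL_BITS	6
#define LVL_SIZE	(1 << LVL_BITS)
#define LVL_MASK	(LVL_SIZE - 1)
#define LVL_SHIFT(n)	((n) * LVL_CLK_SHIFT)

/* 3 bits of level are enough for 8 levels; the full bucket index
 * follows from the level plus the expiry time. */
static inline unsigned int index_from_level(unsigned int level,
					    unsigned long expires)
{
	return level * LVL_SIZE + ((expires >> LVL_SHIFT(level)) & LVL_MASK);
}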


A couple of really minor tweaks that could be folded in, if Thomas feels
like it:

* It would make sense to move all the TIMER_ARRAYSHIFT/TIMER_ARRAYMASK
  stuff out of patch 13 and into patch 20.
* It would make sense to change the return type of mod_timer (& Co.),
  detach_if_pending, and del_timer to bool.
  ({try_to_,}del_timer_sync return 3 values.)

* Re: [patch 13/20] timer: Switch to a non cascading wheel
  2016-06-14  8:16 [patch 13/20] timer: Switch to a non cascading wheel George Spelvin
  2016-06-14  8:50 ` Thomas Gleixner
@ 2016-06-14 10:20 ` Peter Zijlstra
  2016-06-14 12:58   ` George Spelvin
  1 sibling, 1 reply; 62+ messages in thread
From: Peter Zijlstra @ 2016-06-14 10:20 UTC (permalink / raw)
  To: George Spelvin; +Cc: tglx, edumazet, linux-kernel, richardcochran

On Tue, Jun 14, 2016 at 04:16:02AM -0400, George Spelvin wrote:
> While I like the cleanup of just limiting long-term resolution, if
> it turns out to be necessary, it's not too hard to add exact timers
> back in if a need is found in future.  All you need is a second
> __internal_add_timer function that rounds down rather than up, and to
> teach expire_timers() to cascade in the unlikely situation that a timer
> does have an expiry time in the future.

That did occur to me as well; however I think it would be best to
eradicate all forms of cascading entirely -- if at all possible.

If not; then I agree, that would clean things up.

> Wouldn't this all be so much simpler as
> 
> #define LVL_BITS	6	/* Renamed previous LVL_SHIFT */
> #define LVL_SIZE	(1 << LVL_BITS)
> #define LVL_MASK	(LVL_SIZE - 1)
> #define LVL_OFFS(n)	((n) * LVL_SIZE)
> #define LVL_SHIFT(n)	((n) * LVL_CLK_SHIFT)
> #define LVL_GRAN(n)	(1 << LVL_SHIFT(n))
> 
> Then you could do
> +static inline unsigned calc_index(unsigned expires, unsigned level)
> +{
> +	/* Round up to the next bin */
> +	expires = ((expires - 1) >> LVL_SHIFT(level)) + 1;
> +	return LVL_OFFS(level) + (expires & LVL_MASK);
> +}

I like.

> to be replaced with __builtin_clz or similar:

Problem is the archs that don't have that: the 5 layer branch is
trivial for all arches, while software clz/fls is far more expensive.

> > +		timer = hlist_entry(head->first, struct timer_list, entry);
> > +		fn = timer->function;
> > +		data = timer->data;
> > +
> > +		timer_stats_account_timer(timer);
> > +
> > +		base->running_timer = timer;
> > +		detach_expired_timer(timer);
> 
> Is there some non-obvious reason that you have to fetch fn and data
> so early?  It seems like a register pressure pessimization, if the
> compiler can't figure out that timer_stats code can't change them.
> 
> The cache line containing this timer was already prefetched when you
> updated its entry.pprev as part of removing the previous entry from
> the list.
> 
> I see why you want to fetch them with the lock held in case there's some
> freaky race, but I'd do it all after detach_timer().

Good point; ideally the compiler can move those loads around inside the
lock, but it's unlikely to be _that_ clever. We could indeed lower those
loads manually to just before the unlock.
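
As a sketch, the reordering being agreed on here might look like this
(a fragment reusing the identifiers from the quoted hunk, not a complete
function):

	timer = hlist_entry(head->first, struct timer_list, entry);

	timer_stats_account_timer(timer);
	base->running_timer = timer;
	detach_expired_timer(timer);

	/* loads moved to after the detach, but still under base->lock */
	fn = timer->function;
	data = timer->data;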

* Re: [patch 13/20] timer: Switch to a non cascading wheel
  2016-06-14  8:50 ` Thomas Gleixner
@ 2016-06-14 10:15   ` George Spelvin
  0 siblings, 0 replies; 62+ messages in thread
From: George Spelvin @ 2016-06-14 10:15 UTC (permalink / raw)
  To: linux, tglx; +Cc: edumazet, linux-kernel, peterz, richardcochran

On Tue, 14 Jun 2016, Thomas Gleixner wrote:
> I thought about that and when looking at those long timeout thingies
> I came to the conclusion that it's simply not worth the trouble.

Okay.  A comment might be nice, just to stop someone else wasting
brain power on it.  E.g.

/*
 * If the timer happens to expire exactly now, this will cascade it to
 * vectors[0] which we just cleared and won't check again for 64 jiffies.
 * This is an acceptable error on a timeout this long.
 */

>> to be replaced with __builtin_clz or similar:
> 
> Except that __fls() is noticeably slower than the if chain.

Fair enough.  I wasn't sure about the distribution; if it's biased low,
then the if chain would win.

That's not new code. We kept the ordering, but yes, we definitely can turn
that around. The only restriction is that we fetch them before releasing the lock.

Thanks!

* Re: [patch 13/20] timer: Switch to a non cascading wheel
  2016-06-14  8:16 [patch 13/20] timer: Switch to a non cascading wheel George Spelvin
@ 2016-06-14  8:50 ` Thomas Gleixner
  2016-06-14 10:15   ` George Spelvin
  2016-06-14 10:20 ` Peter Zijlstra
  1 sibling, 1 reply; 62+ messages in thread
From: Thomas Gleixner @ 2016-06-14  8:50 UTC (permalink / raw)
  To: George Spelvin; +Cc: edumazet, linux-kernel, peterz, richardcochran

On Tue, 14 Jun 2016, George Spelvin wrote:
> I think I see a buglet in your level-5 cascading.
> 
> Suppose a timer is requested far in the future for a time
> that is an exact multiple of 32768 jiffies.
> 
> collect_expired_timers() scans level 5 after all the previous ones,
> and will cascade it to level 0, in a level-0 bucket which has already
> been scanned, and won't be scanned again for 64 jiffies.
> 
> I agree that 64 jiffies is well within your allowed rounding accuracy,
> and order of timer firing is not guaranteed when they're for the same
> time, but it is a bit odd when a timer fires 32 jiffies *before* another
> timer scheduled for 32 jiffies later.  That's the sort of peculiarity
> that could lead to a subtle bug.

I thought about that and when looking at those long timeout thingies I came to
the conclusion that it's simply not worth the trouble.
 
> Wouldn't this all be so much simpler as
> 
> #define LVL_BITS	6	/* Renamed previous LVL_SHIFT */
> #define LVL_SIZE	(1 << LVL_BITS)
> #define LVL_MASK	(LVL_SIZE - 1)
> #define LVL_OFFS(n)	((n) * LVL_SIZE)
> #define LVL_SHIFT(n)	((n) * LVL_CLK_SHIFT)
> #define LVL_GRAN(n)	(1 << LVL_SHIFT(n))

Indeed.
 
> Ideally, you'd like all of that
> 
> +	if (delta < LVL1_TSTART) {
> +		idx = (expires + LVL0_GRAN) & LVL_MASK;
> +	} else if (delta < LVL2_TSTART) {
> +		idx = calc_index(expires, LVL1_GRAN, LVL1_SHIFT, LVL1_OFFS);
> +	} else if (delta < LVL3_TSTART) {
> +		idx = calc_index(expires, LVL2_GRAN, LVL2_SHIFT, LVL2_OFFS);
> +	} else if (delta < LVL4_TSTART) {
> +		idx = calc_index(expires, LVL3_GRAN, LVL3_SHIFT, LVL3_OFFS);
> +	} else if (delta < LVL5_TSTART) {
> +		idx = calc_index(expires, LVL4_GRAN, LVL4_SHIFT, LVL4_OFFS);
> 
> to be replaced with __builtin_clz or similar:

Except that __fls() is noticeably slower than the if chain.

> > +static inline void detach_expired_timer(struct timer_list *timer)
> >  {
> >  	detach_timer(timer, true);
> > -	if (!(timer->flags & TIMER_DEFERRABLE))
> > -		base->active_timers--;
> > -	base->all_timers--;
> >  }
> 
> Is there even a reason to have this wrapper any more?  Why not
> just replace all calls to it in the source?

That just happened to stay there for no particular reason.

> > +		timer = hlist_entry(head->first, struct timer_list, entry);
> > +		fn = timer->function;
> > +		data = timer->data;
> > +
> > +		timer_stats_account_timer(timer);
> > +
> > +		base->running_timer = timer;
> > +		detach_expired_timer(timer);
> 
> Is there some non-obvious reason that you have to fetch fn and data
> so early?  It seems like a register pressure pessimization, if the
> compiler can't figure out that timer_stats code can't change them.
> 
> The cache line containing this timer was already prefetched when you
> updated its entry.pprev as part of removing the previous entry from
> the list.
> 
> I see why you want to fetch them with the lock held in case there's some
> freaky race, but I'd do it all after detach_timer().

That's not new code. We kept the ordering, but yes, we definitely can turn
that around. The only restriction is that we fetch them before releasing the lock.

Thanks,

	tglx

* Re: [patch 13/20] timer: Switch to a non cascading wheel
@ 2016-06-14  8:16 George Spelvin
  2016-06-14  8:50 ` Thomas Gleixner
  2016-06-14 10:20 ` Peter Zijlstra
  0 siblings, 2 replies; 62+ messages in thread
From: George Spelvin @ 2016-06-14  8:16 UTC (permalink / raw)
  To: tglx; +Cc: edumazet, linux, linux-kernel, peterz, richardcochran

Nice cleanup!


I think I see a buglet in your level-5 cascading.

Suppose a timer is requested far in the future for a time
that is an exact multiple of 32768 jiffies.

collect_expired_timers() scans level 5 after all the previous ones,
and will cascade it to level 0, in a level-0 bucket which has already
been scanned, and won't be scanned again for 64 jiffies.

I agree that 64 jiffies is well within your allowed rounding accuracy,
and order of timer firing is not guaranteed when they're for the same
time, but it is a bit odd when a timer fires 32 jiffies *before* another
timer scheduled for 32 jiffies later.  That's the sort of peculiarity
that could lead to a subtle bug.


While I like the cleanup of just limiting long-term resolution, if
it turns out to be necessary, it's not too hard to add exact timers
back in if a need is found in future.  All you need is a second
__internal_add_timer function that rounds down rather than up, and to
teach expire_timers() to cascade in the unlikely situation that a timer
does have an expiry time in the future.

(It also gets rid of the special case for level 5.)


Other, mostly minor, code comments:

> +/* Level offsets in the wheel */
> +#define LVL0_OFFS	(0)
> +#define LVL1_OFFS	(LVL_SIZE)
> +#define LVL2_OFFS	(LVL1_OFFS + LVL_SIZE)
> +#define LVL3_OFFS	(LVL2_OFFS + LVL_SIZE)
> +#define LVL4_OFFS	(LVL3_OFFS + LVL_SIZE)
> +#define LVL5_OFFS	(LVL4_OFFS + LVL_SIZE)
> +
> +/* Clock divisor for the next level */
> +#define LVL_CLK_SHIFT	3
> +#define LVL_CLK_DIV	(1 << LVL_CLK_SHIFT)
> +#define LVL_CLK_MASK	(LVL_CLK_DIV - 1)
> +
> +/* The shift constants for selecting the bucket at the levels */
> +#define LVL1_SHIFT	(1 * LVL_CLK_SHIFT)
> +#define LVL2_SHIFT	(2 * LVL_CLK_SHIFT)
> +#define LVL3_SHIFT	(3 * LVL_CLK_SHIFT)
> +#define LVL4_SHIFT	(4 * LVL_CLK_SHIFT)
> +#define LVL5_SHIFT	(5 * LVL_CLK_SHIFT)
> +
> +/* The granularity of each level */
> +#define LVL0_GRAN	0x00000001
> +#define LVL1_GRAN	(LVL0_GRAN << LVL_CLK_SHIFT)
> +#define LVL2_GRAN	(LVL1_GRAN << LVL_CLK_SHIFT)
> +#define LVL3_GRAN	(LVL2_GRAN << LVL_CLK_SHIFT)
> +#define LVL4_GRAN	(LVL3_GRAN << LVL_CLK_SHIFT)
> +#define LVL5_GRAN	(LVL4_GRAN << LVL_CLK_SHIFT)

Wouldn't this all be so much simpler as

#define LVL_BITS	6	/* Renamed previous LVL_SHIFT */
#define LVL_SIZE	(1 << LVL_BITS)
#define LVL_MASK	(LVL_SIZE - 1)
#define LVL_OFFS(n)	((n) * LVL_SIZE)
#define LVL_SHIFT(n)	((n) * LVL_CLK_SHIFT)
#define LVL_GRAN(n)	(1 << LVL_SHIFT(n))

Then you could do
+static inline unsigned calc_index(unsigned expires, unsigned level)
+{
+	/* Round up to the next bin */
+	expires = ((expires - 1) >> LVL_SHIFT(level)) + 1;
+	return LVL_OFFS(level) + (expires & LVL_MASK);
+}


> +#define LVL1_TSTART	(LVL_SIZE - 1)

Er... isn't that LVL_SIZE, as documented in the table above?
Then it could be
#define LVL_TSTART(n) (LVL_SIZE << LVL_SHIFT(n))

Ideally, you'd like all of that

+	if (delta < LVL1_TSTART) {
+		idx = (expires + LVL0_GRAN) & LVL_MASK;
+	} else if (delta < LVL2_TSTART) {
+		idx = calc_index(expires, LVL1_GRAN, LVL1_SHIFT, LVL1_OFFS);
+	} else if (delta < LVL3_TSTART) {
+		idx = calc_index(expires, LVL2_GRAN, LVL2_SHIFT, LVL2_OFFS);
+	} else if (delta < LVL4_TSTART) {
+		idx = calc_index(expires, LVL3_GRAN, LVL3_SHIFT, LVL3_OFFS);
+	} else if (delta < LVL5_TSTART) {
+		idx = calc_index(expires, LVL4_GRAN, LVL4_SHIFT, LVL4_OFFS);

to be replaced with __builtin_clz or similar:

	level = __fls(delta | LVL_MASK);
	if (level < LVL_BITS + LVL_SHIFT(LVL_DEPTH-1)) {	/* or LVL_DEPTH-2, no difference */
		level = (level + LVL_CLK_SHIFT - LVL_BITS) / LVL_CLK_SHIFT;
	} else if ((long)delta < 0) {
		expires = base->clk;
		level = 0;
	} else {
		level = LVL_DEPTH - 1;
	}
	index = calc_index(expires, level);
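
A small userspace harness to sanity-check that mapping against the if chain
above, with GCC's __builtin_clzl standing in for the kernel's __fls (purely
illustrative, and LVL_DEPTH=6 is assumed):

#include <stdio.h>

#define LVL_CLK_SHIFT	3
#define LVL_BITS	6
#define LVL_SIZE	(1 << LVL_BITS)
#define LVL_MASK	(LVL_SIZE - 1)
#define LVL_DEPTH	6

static unsigned int fls_based_level(unsigned long delta)
{
	/* __fls(): bit number of the highest set bit */
	unsigned int bit = 8 * sizeof(long) - 1 - __builtin_clzl(delta | LVL_MASK);

	return (bit + LVL_CLK_SHIFT - LVL_BITS) / LVL_CLK_SHIFT;
}

static unsigned int if_chain_level(unsigned long delta)
{
	unsigned int level = 0;

	while (level < LVL_DEPTH - 1 &&
	       delta >= (unsigned long)LVL_SIZE << (LVL_CLK_SHIFT * level))
		level++;
	return level;
}

int main(void)
{
	/* the first branch of the chain covers deltas below 2^21 jiffies */
	for (unsigned long delta = 0; delta < 1UL << 21; delta++)
		if (fls_based_level(delta) != if_chain_level(delta))
			printf("mismatch at %lu\n", delta);
	return 0;
}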


> +static inline void detach_expired_timer(struct timer_list *timer)
>  {
>  	detach_timer(timer, true);
> -	if (!(timer->flags & TIMER_DEFERRABLE))
> -		base->active_timers--;
> -	base->all_timers--;
>  }

Is there even a reason to have this wrapper any more?  Why not
just replace all calls to it in the source?


> +		timer = hlist_entry(head->first, struct timer_list, entry);
> +		fn = timer->function;
> +		data = timer->data;
> +
> +		timer_stats_account_timer(timer);
> +
> +		base->running_timer = timer;
> +		detach_expired_timer(timer);

Is there some non-obvious reason that you have to fetch fn and data
so early?  It seems like a register pressure pessimization, if the
compiler can't figure out that timer_stats code can't change them.

The cache line containing this timer was already prefetched when you
updated its entry.pprev as part of removing the previous entry from
the list.

I see why you want to fetch them with the lock held in case there's some
freaky race, but I'd do it all after detach_timer().

end of thread

Thread overview: 62+ messages
2016-06-13  8:40 [patch 00/20] timer: Refactor the timer wheel Thomas Gleixner
2016-06-13  8:40 ` [patch 01/20] timer: Make pinned a timer property Thomas Gleixner
2016-06-13  8:40 ` [patch 02/20] x86/apic/uv: Initialize timer as pinned Thomas Gleixner
2016-06-13  8:40 ` [patch 03/20] x86/mce: " Thomas Gleixner
2016-06-13  8:40 ` [patch 04/20] cpufreq/powernv: " Thomas Gleixner
2016-06-13 13:18   ` Arjan van de Ven
2016-06-13  8:40 ` [patch 05/20] driver/net/ethernet/tile: " Thomas Gleixner
2016-06-13  8:40 ` [patch 06/20] drivers/tty/metag_da: " Thomas Gleixner
2016-06-13 13:13   ` Arjan van de Ven
2016-06-13  8:40 ` [patch 07/20] drivers/tty/mips_ejtag: " Thomas Gleixner
2016-06-13  8:40 ` [patch 08/20] net/ipv4/inet: Initialize timers " Thomas Gleixner
2016-06-13  8:40 ` [patch 09/20] timer: Remove mod_timer_pinned Thomas Gleixner
2016-06-13  8:40 ` [patch 10/20] timer: Add a cascading tracepoint Thomas Gleixner
2016-06-13  8:40 ` [patch 11/20] hlist: Add hlist_is_last_node() helper Thomas Gleixner
2016-06-13 10:27   ` Paolo Bonzini
2016-06-13  8:40 ` [patch 12/20] timer: Give a few structs and members proper names Thomas Gleixner
2016-06-13  8:41 ` [patch 13/20] timer: Switch to a non cascading wheel Thomas Gleixner
2016-06-13 11:40   ` Peter Zijlstra
2016-06-13 12:30     ` Thomas Gleixner
2016-06-13 12:46       ` Eric Dumazet
2016-06-13 14:30         ` Thomas Gleixner
2016-06-14 10:11       ` Ingo Molnar
2016-06-14 16:28         ` Thomas Gleixner
2016-06-14 17:14           ` Arjan van de Ven
2016-06-14 18:05             ` Thomas Gleixner
2016-06-14 20:34               ` Peter Zijlstra
2016-06-14 20:42               ` Peter Zijlstra
2016-06-14 21:17                 ` Eric Dumazet
2016-06-15 14:53                   ` Thomas Gleixner
2016-06-15 14:55                     ` Arjan van de Ven
2016-06-15 16:43                       ` Thomas Gleixner
2016-06-16 15:43                         ` Thomas Gleixner
2016-06-16 16:02                           ` Paul E. McKenney
2016-06-16 18:14                             ` Peter Zijlstra
2016-06-17  0:40                               ` Paul E. McKenney
2016-06-17  4:04                                 ` Paul E. McKenney
2016-06-16 16:04                           ` Arjan van de Ven
2016-06-16 16:09                             ` Thomas Gleixner
2016-06-15 15:05                     ` Eric Dumazet
2016-06-13 14:36   ` Richard Cochran
2016-06-13 14:39     ` Thomas Gleixner
2016-06-13  8:41 ` [patch 15/20] timer: Move __run_timers() function Thomas Gleixner
2016-06-13  8:41 ` [patch 14/20] timer: Remove slack leftovers Thomas Gleixner
2016-06-13  8:41 ` [patch 16/20] timer: Optimize collect timers for NOHZ Thomas Gleixner
2016-06-13  8:41 ` [patch 17/20] tick/sched: Remove pointless empty function Thomas Gleixner
2016-06-13  8:41 ` [patch 18/20] timer: Forward wheel clock whenever possible Thomas Gleixner
2016-06-13 15:14   ` Richard Cochran
2016-06-13 15:18     ` Thomas Gleixner
2016-06-13  8:41 ` [patch 19/20] timer: Split out index calculation Thomas Gleixner
2016-06-13  8:41 ` [patch 20/20] timer: Optimization for same expiry time in mod_timer() Thomas Gleixner
2016-06-13 14:10 ` [patch 00/20] timer: Refactor the timer wheel Eric Dumazet
2016-06-13 16:15 ` Paul E. McKenney
2016-06-15 15:15   ` Paul E. McKenney
2016-06-15 17:02     ` Thomas Gleixner
2016-06-15 20:26       ` Paul E. McKenney
2016-06-14  8:16 [patch 13/20] timer: Switch to a non cascading wheel George Spelvin
2016-06-14  8:50 ` Thomas Gleixner
2016-06-14 10:15   ` George Spelvin
2016-06-14 10:20 ` Peter Zijlstra
2016-06-14 12:58   ` George Spelvin
2016-06-14 16:48     ` Thomas Gleixner
2016-06-14 19:56       ` George Spelvin
