All of lore.kernel.org
 help / color / mirror / Atom feed
* [patch V3 00/22] timer: Refactor the timer wheel
@ 2016-06-24 14:32 Thomas Gleixner
  2016-06-24 14:32 ` [patch V3 01/22] timer: Make pinned a timer property Thomas Gleixner
                   ` (23 more replies)
  0 siblings, 24 replies; 30+ messages in thread
From: Thomas Gleixner @ 2016-06-24 14:32 UTC (permalink / raw)
  To: LKML
  Cc: Ingo Molnar, Peter Zijlstra, Paul E. McKenney, Eric Dumazet,
	Frederic Weisbecker, Chris Mason, Arjan van de Ven, rt,
	Rik van Riel, Linus Torvalds, George Spelvin, Len Brown,
	Josh Triplett

This is the third version of the timer wheel rework series. The previous
versions can be found here:

V1:   http://lkml.kernel.org/r/20160613070440.950649741@linutronix.de
V2:   http://lkml.kernel.org/r/20160617121134.417319325@linutronix.de

The series is also available in git:

   git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git WIP.timers

Changes vs. V2:

 - Removed the 1000Hz granularity reduction to 4ms. Eric explained that
   datacenter workloads require the granularity in the first level wheel.

 - Fixed the typo in tilepro. 

 - Converted sigtimedwait() to hrtimers

 - To avoid the cascading I extended the wheel by another level. This removes
   the rarely executed cascading code path, but increases the storage size for
   HZ>100 slightly. If the tiny folks care, there is an simple option to cut
   the storage size in half for the price of reduced granularity.

Thanks,

	tglx

----
 arch/x86/kernel/apic/x2apic_uv_x.c  |    4 
 arch/x86/kernel/cpu/mcheck/mce.c    |    4 
 block/genhd.c                       |    5 
 drivers/cpufreq/powernv-cpufreq.c   |    5 
 drivers/mmc/host/jz4740_mmc.c       |    2 
 drivers/net/ethernet/tile/tilepro.c |    4 
 drivers/power/bq27xxx_battery.c     |    5 
 drivers/tty/metag_da.c              |    4 
 drivers/tty/mips_ejtag_fdc.c        |    4 
 drivers/usb/host/ohci-hcd.c         |    1 
 drivers/usb/host/xhci.c             |    2 
 include/linux/list.h                |   10 
 include/linux/timer.h               |   34 -
 kernel/signal.c                     |   24 
 kernel/time/tick-internal.h         |    1 
 kernel/time/tick-sched.c            |   46 -
 kernel/time/timer.c                 | 1096 +++++++++++++++++++++---------------
 lib/random32.c                      |    1 
 net/ipv4/inet_connection_sock.c     |    7 
 net/ipv4/inet_timewait_sock.c       |    5 
 20 files changed, 734 insertions(+), 530 deletions(-)

^ permalink raw reply	[flat|nested] 30+ messages in thread

* [patch V3 01/22] timer: Make pinned a timer property
  2016-06-24 14:32 [patch V3 00/22] timer: Refactor the timer wheel Thomas Gleixner
@ 2016-06-24 14:32 ` Thomas Gleixner
  2016-07-01 13:03   ` Frederic Weisbecker
  2016-06-24 14:32 ` [patch V3 02/22] x86/apic/uv: Initialize timer as pinned Thomas Gleixner
                   ` (22 subsequent siblings)
  23 siblings, 1 reply; 30+ messages in thread
From: Thomas Gleixner @ 2016-06-24 14:32 UTC (permalink / raw)
  To: LKML
  Cc: Ingo Molnar, Peter Zijlstra, Paul E. McKenney, Eric Dumazet,
	Frederic Weisbecker, Chris Mason, Arjan van de Ven, rt,
	Rik van Riel, Linus Torvalds, George Spelvin, Len Brown,
	Josh Triplett

[-- Attachment #1: timer_Make_pinned_a_timer_property.patch --]
[-- Type: text/plain, Size: 5238 bytes --]

We want to move the timer migration from a push to a pull model. This requires
to store the pinned attribute of a timer in the timer itself. This must happen
at initialization time.

Add the helper macros for this.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Chris Mason <clm@fb.com>
Cc: Eric Dumazet <edumazet@google.com>
Cc: rt@linutronix.de
Cc: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Cc: Arjan van de Ven <arjan@infradead.org>


---
 include/linux/timer.h |   25 ++++++++++++++++++++++---
 kernel/time/timer.c   |   10 +++++-----
 2 files changed, 27 insertions(+), 8 deletions(-)

--- a/include/linux/timer.h
+++ b/include/linux/timer.h
@@ -62,7 +62,8 @@ struct timer_list {
 #define TIMER_MIGRATING		0x00080000
 #define TIMER_BASEMASK		(TIMER_CPUMASK | TIMER_MIGRATING)
 #define TIMER_DEFERRABLE	0x00100000
-#define TIMER_IRQSAFE		0x00200000
+#define TIMER_PINNED		0x00200000
+#define TIMER_IRQSAFE		0x00400000
 
 #define __TIMER_INITIALIZER(_function, _expires, _data, _flags) { \
 		.entry = { .next = TIMER_ENTRY_STATIC },	\
@@ -78,9 +79,15 @@ struct timer_list {
 #define TIMER_INITIALIZER(_function, _expires, _data)		\
 	__TIMER_INITIALIZER((_function), (_expires), (_data), 0)
 
+#define TIMER_PINNED_INITIALIZER(_function, _expires, _data)	\
+	__TIMER_INITIALIZER((_function), (_expires), (_data), TIMER_PINNED)
+
 #define TIMER_DEFERRED_INITIALIZER(_function, _expires, _data)	\
 	__TIMER_INITIALIZER((_function), (_expires), (_data), TIMER_DEFERRABLE)
 
+#define TIMER_PINNED_DEFERRED_INITIALIZER(_function, _expires, _data)	\
+	__TIMER_INITIALIZER((_function), (_expires), (_data), TIMER_DEFERRABLE | TIMER_PINNED)
+
 #define DEFINE_TIMER(_name, _function, _expires, _data)		\
 	struct timer_list _name =				\
 		TIMER_INITIALIZER(_function, _expires, _data)
@@ -124,8 +131,12 @@ static inline void init_timer_on_stack_k
 
 #define init_timer(timer)						\
 	__init_timer((timer), 0)
+#define init_timer_pinned(timer)					\
+	__init_timer((timer), TIMER_PINNED)
 #define init_timer_deferrable(timer)					\
 	__init_timer((timer), TIMER_DEFERRABLE)
+#define init_timer_pinned_deferrable(timer)				\
+	__init_timer((timer), TIMER_DEFERRABLE | TIMER_PINNED)
 #define init_timer_on_stack(timer)					\
 	__init_timer_on_stack((timer), 0)
 
@@ -145,12 +156,20 @@ static inline void init_timer_on_stack_k
 
 #define setup_timer(timer, fn, data)					\
 	__setup_timer((timer), (fn), (data), 0)
+#define setup_pinned_timer(timer, fn, data)				\
+	__setup_timer((timer), (fn), (data), TIMER_PINNED)
 #define setup_deferrable_timer(timer, fn, data)				\
 	__setup_timer((timer), (fn), (data), TIMER_DEFERRABLE)
+#define setup_pinned_deferrable_timer(timer, fn, data)			\
+	__setup_timer((timer), (fn), (data), TIMER_DEFERRABLE | TIMER_PINNED)
 #define setup_timer_on_stack(timer, fn, data)				\
 	__setup_timer_on_stack((timer), (fn), (data), 0)
+#define setup_pinned_timer_on_stack(timer, fn, data)			\
+	__setup_timer_on_stack((timer), (fn), (data), TIMER_PINNED)
 #define setup_deferrable_timer_on_stack(timer, fn, data)		\
 	__setup_timer_on_stack((timer), (fn), (data), TIMER_DEFERRABLE)
+#define setup_pinned_deferrable_timer_on_stack(timer, fn, data)		\
+	__setup_timer_on_stack((timer), (fn), (data), TIMER_DEFERRABLE | TIMER_PINNED)
 
 /**
  * timer_pending - is a timer pending?
@@ -175,8 +194,8 @@ extern int mod_timer_pinned(struct timer
 
 extern void set_timer_slack(struct timer_list *time, int slack_hz);
 
-#define TIMER_NOT_PINNED	0
-#define TIMER_PINNED		1
+#define MOD_TIMER_NOT_PINNED	0
+#define MOD_TIMER_PINNED	1
 /*
  * The jiffies value which is added to now, when there is no timer
  * in the timer wheel:
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -782,7 +782,7 @@ static inline int
 
 	debug_activate(timer, expires);
 
-	new_base = get_target_base(base, pinned);
+	new_base = get_target_base(base, pinned || timer->flags & TIMER_PINNED);
 
 	if (base != new_base) {
 		/*
@@ -825,7 +825,7 @@ static inline int
  */
 int mod_timer_pending(struct timer_list *timer, unsigned long expires)
 {
-	return __mod_timer(timer, expires, true, TIMER_NOT_PINNED);
+	return __mod_timer(timer, expires, true, MOD_TIMER_NOT_PINNED);
 }
 EXPORT_SYMBOL(mod_timer_pending);
 
@@ -900,7 +900,7 @@ int mod_timer(struct timer_list *timer,
 	if (timer_pending(timer) && timer->expires == expires)
 		return 1;
 
-	return __mod_timer(timer, expires, false, TIMER_NOT_PINNED);
+	return __mod_timer(timer, expires, false, MOD_TIMER_NOT_PINNED);
 }
 EXPORT_SYMBOL(mod_timer);
 
@@ -928,7 +928,7 @@ int mod_timer_pinned(struct timer_list *
 	if (timer->expires == expires && timer_pending(timer))
 		return 1;
 
-	return __mod_timer(timer, expires, false, TIMER_PINNED);
+	return __mod_timer(timer, expires, false, MOD_TIMER_PINNED);
 }
 EXPORT_SYMBOL(mod_timer_pinned);
 
@@ -1512,7 +1512,7 @@ signed long __sched schedule_timeout(sig
 	expire = timeout + jiffies;
 
 	setup_timer_on_stack(&timer, process_timeout, (unsigned long)current);
-	__mod_timer(&timer, expire, false, TIMER_NOT_PINNED);
+	__mod_timer(&timer, expire, false, MOD_TIMER_NOT_PINNED);
 	schedule();
 	del_singleshot_timer_sync(&timer);
 

^ permalink raw reply	[flat|nested] 30+ messages in thread

* [patch V3 02/22] x86/apic/uv: Initialize timer as pinned
  2016-06-24 14:32 [patch V3 00/22] timer: Refactor the timer wheel Thomas Gleixner
  2016-06-24 14:32 ` [patch V3 01/22] timer: Make pinned a timer property Thomas Gleixner
@ 2016-06-24 14:32 ` Thomas Gleixner
  2016-06-24 14:32 ` [patch V3 03/22] x86/mce: " Thomas Gleixner
                   ` (21 subsequent siblings)
  23 siblings, 0 replies; 30+ messages in thread
From: Thomas Gleixner @ 2016-06-24 14:32 UTC (permalink / raw)
  To: LKML
  Cc: Ingo Molnar, Peter Zijlstra, Paul E. McKenney, Eric Dumazet,
	Frederic Weisbecker, Chris Mason, Arjan van de Ven, rt,
	Rik van Riel, Linus Torvalds, George Spelvin, Len Brown,
	Josh Triplett

[-- Attachment #1: x86apicuv_Initialize_timer_as_pinned.patch --]
[-- Type: text/plain, Size: 1276 bytes --]

Pinned timers must carry that attribute in the timer itself. No functional
change.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Chris Mason <clm@fb.com>
Cc: Eric Dumazet <edumazet@google.com>
Cc: rt@linutronix.de
Cc: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Cc: Arjan van de Ven <arjan@infradead.org>


---
 arch/x86/kernel/apic/x2apic_uv_x.c |    4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

--- a/arch/x86/kernel/apic/x2apic_uv_x.c
+++ b/arch/x86/kernel/apic/x2apic_uv_x.c
@@ -919,7 +919,7 @@ static void uv_heartbeat(unsigned long i
 	uv_set_scir_bits(bits);
 
 	/* enable next timer period */
-	mod_timer_pinned(timer, jiffies + SCIR_CPU_HB_INTERVAL);
+	mod_timer(timer, jiffies + SCIR_CPU_HB_INTERVAL);
 }
 
 static void uv_heartbeat_enable(int cpu)
@@ -928,7 +928,7 @@ static void uv_heartbeat_enable(int cpu)
 		struct timer_list *timer = &uv_cpu_scir_info(cpu)->timer;
 
 		uv_set_cpu_scir_bits(cpu, SCIR_CPU_HEARTBEAT|SCIR_CPU_ACTIVITY);
-		setup_timer(timer, uv_heartbeat, cpu);
+		setup_pinned_timer(timer, uv_heartbeat, cpu);
 		timer->expires = jiffies + SCIR_CPU_HB_INTERVAL;
 		add_timer_on(timer, cpu);
 		uv_cpu_scir_info(cpu)->enabled = 1;

^ permalink raw reply	[flat|nested] 30+ messages in thread

* [patch V3 03/22] x86/mce: Initialize timer as pinned
  2016-06-24 14:32 [patch V3 00/22] timer: Refactor the timer wheel Thomas Gleixner
  2016-06-24 14:32 ` [patch V3 01/22] timer: Make pinned a timer property Thomas Gleixner
  2016-06-24 14:32 ` [patch V3 02/22] x86/apic/uv: Initialize timer as pinned Thomas Gleixner
@ 2016-06-24 14:32 ` Thomas Gleixner
  2016-06-24 14:32 ` [patch V3 04/22] cpufreq/powernv: " Thomas Gleixner
                   ` (20 subsequent siblings)
  23 siblings, 0 replies; 30+ messages in thread
From: Thomas Gleixner @ 2016-06-24 14:32 UTC (permalink / raw)
  To: LKML
  Cc: Ingo Molnar, Peter Zijlstra, Paul E. McKenney, Eric Dumazet,
	Frederic Weisbecker, Chris Mason, Arjan van de Ven, rt,
	Rik van Riel, Linus Torvalds, George Spelvin, Len Brown,
	Josh Triplett

[-- Attachment #1: x86mce_Initialize_timer_as_pinned.patch --]
[-- Type: text/plain, Size: 1128 bytes --]

Pinned timers must carry that attribute in the timer itself. No functional
change.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Chris Mason <clm@fb.com>
Cc: Eric Dumazet <edumazet@google.com>
Cc: rt@linutronix.de
Cc: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Cc: Arjan van de Ven <arjan@infradead.org>

---
 arch/x86/kernel/cpu/mcheck/mce.c |    4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -1309,7 +1309,7 @@ static void __restart_timer(struct timer
 
 	if (timer_pending(t)) {
 		if (time_before(when, t->expires))
-			mod_timer_pinned(t, when);
+			mod_timer(t, when);
 	} else {
 		t->expires = round_jiffies(when);
 		add_timer_on(t, smp_processor_id());
@@ -1735,7 +1735,7 @@ static void __mcheck_cpu_init_timer(void
 	struct timer_list *t = this_cpu_ptr(&mce_timer);
 	unsigned int cpu = smp_processor_id();
 
-	setup_timer(t, mce_timer_fn, cpu);
+	setup_pinned_timer(t, mce_timer_fn, cpu);
 	mce_start_timer(cpu, t);
 }
 

^ permalink raw reply	[flat|nested] 30+ messages in thread

* [patch V3 04/22] cpufreq/powernv: Initialize timer as pinned
  2016-06-24 14:32 [patch V3 00/22] timer: Refactor the timer wheel Thomas Gleixner
                   ` (2 preceding siblings ...)
  2016-06-24 14:32 ` [patch V3 03/22] x86/mce: " Thomas Gleixner
@ 2016-06-24 14:32 ` Thomas Gleixner
  2016-06-24 14:32 ` [patch V3 05/22] driver/net/ethernet/tile: " Thomas Gleixner
                   ` (19 subsequent siblings)
  23 siblings, 0 replies; 30+ messages in thread
From: Thomas Gleixner @ 2016-06-24 14:32 UTC (permalink / raw)
  To: LKML
  Cc: Ingo Molnar, Peter Zijlstra, Paul E. McKenney, Eric Dumazet,
	Frederic Weisbecker, Chris Mason, Arjan van de Ven, rt,
	Rik van Riel, Linus Torvalds, George Spelvin, Len Brown,
	Josh Triplett

[-- Attachment #1: cpufreqpowernv_Initialize_timer_as_pinned.patch --]
[-- Type: text/plain, Size: 1228 bytes --]

Pinned timers must carry that attribute in the timer itself. No functional
change.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Chris Mason <clm@fb.com>
Cc: Eric Dumazet <edumazet@google.com>
Cc: rt@linutronix.de
Cc: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Cc: Arjan van de Ven <arjan@infradead.org>

---
 drivers/cpufreq/powernv-cpufreq.c |    5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

--- a/drivers/cpufreq/powernv-cpufreq.c
+++ b/drivers/cpufreq/powernv-cpufreq.c
@@ -530,8 +530,7 @@ static inline void  queue_gpstate_timer(
 	else
 		timer_interval = GPSTATE_TIMER_INTERVAL;
 
-	mod_timer_pinned(&gpstates->timer, jiffies +
-			msecs_to_jiffies(timer_interval));
+	mod_timer(&gpstates->timer, jiffies + msecs_to_jiffies(timer_interval));
 }
 
 /**
@@ -699,7 +698,7 @@ static int powernv_cpufreq_cpu_init(stru
 	policy->driver_data = gpstates;
 
 	/* initialize timer */
-	init_timer_deferrable(&gpstates->timer);
+	init_timer_pinned_deferrable(&gpstates->timer);
 	gpstates->timer.data = (unsigned long)policy;
 	gpstates->timer.function = gpstate_timer_handler;
 	gpstates->timer.expires = jiffies +

^ permalink raw reply	[flat|nested] 30+ messages in thread

* [patch V3 05/22] driver/net/ethernet/tile: Initialize timer as pinned
  2016-06-24 14:32 [patch V3 00/22] timer: Refactor the timer wheel Thomas Gleixner
                   ` (3 preceding siblings ...)
  2016-06-24 14:32 ` [patch V3 04/22] cpufreq/powernv: " Thomas Gleixner
@ 2016-06-24 14:32 ` Thomas Gleixner
  2016-06-24 14:32 ` [patch V3 06/22] drivers/tty/metag_da: " Thomas Gleixner
                   ` (18 subsequent siblings)
  23 siblings, 0 replies; 30+ messages in thread
From: Thomas Gleixner @ 2016-06-24 14:32 UTC (permalink / raw)
  To: LKML
  Cc: Ingo Molnar, Peter Zijlstra, Paul E. McKenney, Eric Dumazet,
	Frederic Weisbecker, Chris Mason, Arjan van de Ven, rt,
	Rik van Riel, Linus Torvalds, George Spelvin, Len Brown,
	Josh Triplett

[-- Attachment #1: drivernetethernettile_Initialize_timer_as_pinned.patch --]
[-- Type: text/plain, Size: 1212 bytes --]

Pinned timers must carry that attribute in the timer itself. No functional
change.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Chris Mason <clm@fb.com>
Cc: Eric Dumazet <edumazet@google.com>
Cc: rt@linutronix.de
Cc: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Cc: Arjan van de Ven <arjan@infradead.org>

---
 drivers/net/ethernet/tile/tilepro.c |    4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

--- a/drivers/net/ethernet/tile/tilepro.c
+++ b/drivers/net/ethernet/tile/tilepro.c
@@ -588,7 +588,7 @@ static bool tile_net_lepp_free_comps(str
 static void tile_net_schedule_egress_timer(struct tile_net_cpu *info)
 {
 	if (!info->egress_timer_scheduled) {
-		mod_timer_pinned(&info->egress_timer, jiffies + 1);
+		mod_timer(&info->egress_timer, jiffies + 1);
 		info->egress_timer_scheduled = true;
 	}
 }
@@ -1004,7 +1004,7 @@ static void tile_net_register(void *dev_
 		BUG();
 
 	/* Initialize the egress timer. */
-	init_timer(&info->egress_timer);
+	init_timer_pinned(&info->egress_timer);
 	info->egress_timer.data = (long)info;
 	info->egress_timer.function = tile_net_handle_egress_timer;
 

^ permalink raw reply	[flat|nested] 30+ messages in thread

* [patch V3 06/22] drivers/tty/metag_da: Initialize timer as pinned
  2016-06-24 14:32 [patch V3 00/22] timer: Refactor the timer wheel Thomas Gleixner
                   ` (4 preceding siblings ...)
  2016-06-24 14:32 ` [patch V3 05/22] driver/net/ethernet/tile: " Thomas Gleixner
@ 2016-06-24 14:32 ` Thomas Gleixner
  2016-06-24 14:32 ` [patch V3 07/22] drivers/tty/mips_ejtag: " Thomas Gleixner
                   ` (17 subsequent siblings)
  23 siblings, 0 replies; 30+ messages in thread
From: Thomas Gleixner @ 2016-06-24 14:32 UTC (permalink / raw)
  To: LKML
  Cc: Ingo Molnar, Peter Zijlstra, Paul E. McKenney, Eric Dumazet,
	Frederic Weisbecker, Chris Mason, Arjan van de Ven, rt,
	Rik van Riel, Linus Torvalds, George Spelvin, Len Brown,
	Josh Triplett

[-- Attachment #1: driversttymetag_da_Initialize_timer_as_pinned.patch --]
[-- Type: text/plain, Size: 987 bytes --]

Pinned timers must carry that attribute in the timer itself. No functional
change.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Chris Mason <clm@fb.com>
Cc: Eric Dumazet <edumazet@google.com>
Cc: rt@linutronix.de
Cc: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Cc: Arjan van de Ven <arjan@infradead.org>

---
 drivers/tty/metag_da.c |    4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

--- a/drivers/tty/metag_da.c
+++ b/drivers/tty/metag_da.c
@@ -323,12 +323,12 @@ static void dashtty_timer(unsigned long
 	if (channel >= 0)
 		fetch_data(channel);
 
-	mod_timer_pinned(&poll_timer, jiffies + DA_TTY_POLL);
+	mod_timer(&poll_timer, jiffies + DA_TTY_POLL);
 }
 
 static void add_poll_timer(struct timer_list *poll_timer)
 {
-	setup_timer(poll_timer, dashtty_timer, 0);
+	setup_pinned_timer(poll_timer, dashtty_timer, 0);
 	poll_timer->expires = jiffies + DA_TTY_POLL;
 
 	/*

^ permalink raw reply	[flat|nested] 30+ messages in thread

* [patch V3 07/22] drivers/tty/mips_ejtag: Initialize timer as pinned
  2016-06-24 14:32 [patch V3 00/22] timer: Refactor the timer wheel Thomas Gleixner
                   ` (5 preceding siblings ...)
  2016-06-24 14:32 ` [patch V3 06/22] drivers/tty/metag_da: " Thomas Gleixner
@ 2016-06-24 14:32 ` Thomas Gleixner
  2016-06-24 14:32 ` [patch V3 08/22] net/ipv4/inet: Initialize timers " Thomas Gleixner
                   ` (16 subsequent siblings)
  23 siblings, 0 replies; 30+ messages in thread
From: Thomas Gleixner @ 2016-06-24 14:32 UTC (permalink / raw)
  To: LKML
  Cc: Ingo Molnar, Peter Zijlstra, Paul E. McKenney, Eric Dumazet,
	Frederic Weisbecker, Chris Mason, Arjan van de Ven, rt,
	Rik van Riel, Linus Torvalds, George Spelvin, Len Brown,
	Josh Triplett

[-- Attachment #1: driversttymips_ejtag_Initialize_timer_as_pinned.patch --]
[-- Type: text/plain, Size: 1226 bytes --]

Pinned timers must carry that attribute in the timer itself. No functional
change.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Chris Mason <clm@fb.com>
Cc: Eric Dumazet <edumazet@google.com>
Cc: rt@linutronix.de
Cc: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Cc: Arjan van de Ven <arjan@infradead.org>

---
 drivers/tty/mips_ejtag_fdc.c |    4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

--- a/drivers/tty/mips_ejtag_fdc.c
+++ b/drivers/tty/mips_ejtag_fdc.c
@@ -689,7 +689,7 @@ static void mips_ejtag_fdc_tty_timer(uns
 
 	mips_ejtag_fdc_handle(priv);
 	if (!priv->removing)
-		mod_timer_pinned(&priv->poll_timer, jiffies + FDC_TTY_POLL);
+		mod_timer(&priv->poll_timer, jiffies + FDC_TTY_POLL);
 }
 
 /* TTY Port operations */
@@ -1002,7 +1002,7 @@ static int mips_ejtag_fdc_tty_probe(stru
 		raw_spin_unlock_irq(&priv->lock);
 	} else {
 		/* If we didn't get an usable IRQ, poll instead */
-		setup_timer(&priv->poll_timer, mips_ejtag_fdc_tty_timer,
+		setup_pinned_timer(&priv->poll_timer, mips_ejtag_fdc_tty_timer,
 			    (unsigned long)priv);
 		priv->poll_timer.expires = jiffies + FDC_TTY_POLL;
 		/*

^ permalink raw reply	[flat|nested] 30+ messages in thread

* [patch V3 08/22] net/ipv4/inet: Initialize timers as pinned
  2016-06-24 14:32 [patch V3 00/22] timer: Refactor the timer wheel Thomas Gleixner
                   ` (6 preceding siblings ...)
  2016-06-24 14:32 ` [patch V3 07/22] drivers/tty/mips_ejtag: " Thomas Gleixner
@ 2016-06-24 14:32 ` Thomas Gleixner
  2016-06-24 14:32 ` [patch V3 09/22] timer: Remove mod_timer_pinned Thomas Gleixner
                   ` (15 subsequent siblings)
  23 siblings, 0 replies; 30+ messages in thread
From: Thomas Gleixner @ 2016-06-24 14:32 UTC (permalink / raw)
  To: LKML
  Cc: Ingo Molnar, Peter Zijlstra, Paul E. McKenney, Eric Dumazet,
	Frederic Weisbecker, Chris Mason, Arjan van de Ven, rt,
	Rik van Riel, Linus Torvalds, George Spelvin, Len Brown,
	Josh Triplett

[-- Attachment #1: netipv4inet_Initialize_timers_as_pinned.patch --]
[-- Type: text/plain, Size: 2308 bytes --]

Pinned timers must carry that attribute in the timer itself. No functional
change.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Chris Mason <clm@fb.com>
Cc: Eric Dumazet <edumazet@google.com>
Cc: rt@linutronix.de
Cc: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Cc: Arjan van de Ven <arjan@infradead.org>

---
 net/ipv4/inet_connection_sock.c |    7 ++++---
 net/ipv4/inet_timewait_sock.c   |    5 +++--
 2 files changed, 7 insertions(+), 5 deletions(-)

--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -603,7 +603,7 @@ static void reqsk_timer_handler(unsigned
 		if (req->num_timeout++ == 0)
 			atomic_dec(&queue->young);
 		timeo = min(TCP_TIMEOUT_INIT << req->num_timeout, TCP_RTO_MAX);
-		mod_timer_pinned(&req->rsk_timer, jiffies + timeo);
+		mod_timer(&req->rsk_timer, jiffies + timeo);
 		return;
 	}
 drop:
@@ -617,8 +617,9 @@ static void reqsk_queue_hash_req(struct
 	req->num_timeout = 0;
 	req->sk = NULL;
 
-	setup_timer(&req->rsk_timer, reqsk_timer_handler, (unsigned long)req);
-	mod_timer_pinned(&req->rsk_timer, jiffies + timeout);
+	setup_pinned_timer(&req->rsk_timer, reqsk_timer_handler,
+			    (unsigned long)req);
+	mod_timer(&req->rsk_timer, jiffies + timeout);
 
 	inet_ehash_insert(req_to_sk(req), NULL);
 	/* before letting lookups find us, make sure all req fields
--- a/net/ipv4/inet_timewait_sock.c
+++ b/net/ipv4/inet_timewait_sock.c
@@ -188,7 +188,8 @@ struct inet_timewait_sock *inet_twsk_all
 		tw->tw_prot	    = sk->sk_prot_creator;
 		atomic64_set(&tw->tw_cookie, atomic64_read(&sk->sk_cookie));
 		twsk_net_set(tw, sock_net(sk));
-		setup_timer(&tw->tw_timer, tw_timer_handler, (unsigned long)tw);
+		setup_pinned_timer(&tw->tw_timer, tw_timer_handler,
+				   (unsigned long)tw);
 		/*
 		 * Because we use RCU lookups, we should not set tw_refcnt
 		 * to a non null value before everything is setup for this
@@ -248,7 +249,7 @@ void __inet_twsk_schedule(struct inet_ti
 
 	tw->tw_kill = timeo <= 4*HZ;
 	if (!rearm) {
-		BUG_ON(mod_timer_pinned(&tw->tw_timer, jiffies + timeo));
+		BUG_ON(mod_timer(&tw->tw_timer, jiffies + timeo));
 		atomic_inc(&tw->tw_dr->tw_count);
 	} else {
 		mod_timer_pending(&tw->tw_timer, jiffies + timeo);

^ permalink raw reply	[flat|nested] 30+ messages in thread

* [patch V3 09/22] timer: Remove mod_timer_pinned
  2016-06-24 14:32 [patch V3 00/22] timer: Refactor the timer wheel Thomas Gleixner
                   ` (7 preceding siblings ...)
  2016-06-24 14:32 ` [patch V3 08/22] net/ipv4/inet: Initialize timers " Thomas Gleixner
@ 2016-06-24 14:32 ` Thomas Gleixner
  2016-06-24 14:32 ` [patch V3 10/22] signal: Use hrtimer for sigtimedwait Thomas Gleixner
                   ` (14 subsequent siblings)
  23 siblings, 0 replies; 30+ messages in thread
From: Thomas Gleixner @ 2016-06-24 14:32 UTC (permalink / raw)
  To: LKML
  Cc: Ingo Molnar, Peter Zijlstra, Paul E. McKenney, Eric Dumazet,
	Frederic Weisbecker, Chris Mason, Arjan van de Ven, rt,
	Rik van Riel, Linus Torvalds, George Spelvin, Len Brown,
	Josh Triplett

[-- Attachment #1: timer_Remove_mod_timer_pinned.patch --]
[-- Type: text/plain, Size: 3781 bytes --]

We switched all users to initialize the timers as pinned and call
mod_timer(). Remove the now unused function.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Chris Mason <clm@fb.com>
Cc: Eric Dumazet <edumazet@google.com>
Cc: rt@linutronix.de
Cc: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Cc: Arjan van de Ven <arjan@infradead.org>
---
 include/linux/timer.h |    3 ---
 kernel/time/timer.c   |   39 +++++----------------------------------
 2 files changed, 5 insertions(+), 37 deletions(-)

--- a/include/linux/timer.h
+++ b/include/linux/timer.h
@@ -190,12 +190,9 @@ extern void add_timer_on(struct timer_li
 extern int del_timer(struct timer_list * timer);
 extern int mod_timer(struct timer_list *timer, unsigned long expires);
 extern int mod_timer_pending(struct timer_list *timer, unsigned long expires);
-extern int mod_timer_pinned(struct timer_list *timer, unsigned long expires);
 
 extern void set_timer_slack(struct timer_list *time, int slack_hz);
 
-#define MOD_TIMER_NOT_PINNED	0
-#define MOD_TIMER_PINNED	1
 /*
  * The jiffies value which is added to now, when there is no timer
  * in the timer wheel:
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -764,8 +764,7 @@ static struct tvec_base *lock_timer_base
 }
 
 static inline int
-__mod_timer(struct timer_list *timer, unsigned long expires,
-	    bool pending_only, int pinned)
+__mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only)
 {
 	struct tvec_base *base, *new_base;
 	unsigned long flags;
@@ -782,7 +781,7 @@ static inline int
 
 	debug_activate(timer, expires);
 
-	new_base = get_target_base(base, pinned || timer->flags & TIMER_PINNED);
+	new_base = get_target_base(base, timer->flags & TIMER_PINNED);
 
 	if (base != new_base) {
 		/*
@@ -825,7 +824,7 @@ static inline int
  */
 int mod_timer_pending(struct timer_list *timer, unsigned long expires)
 {
-	return __mod_timer(timer, expires, true, MOD_TIMER_NOT_PINNED);
+	return __mod_timer(timer, expires, true);
 }
 EXPORT_SYMBOL(mod_timer_pending);
 
@@ -900,39 +899,11 @@ int mod_timer(struct timer_list *timer,
 	if (timer_pending(timer) && timer->expires == expires)
 		return 1;
 
-	return __mod_timer(timer, expires, false, MOD_TIMER_NOT_PINNED);
+	return __mod_timer(timer, expires, false);
 }
 EXPORT_SYMBOL(mod_timer);
 
 /**
- * mod_timer_pinned - modify a timer's timeout
- * @timer: the timer to be modified
- * @expires: new timeout in jiffies
- *
- * mod_timer_pinned() is a way to update the expire field of an
- * active timer (if the timer is inactive it will be activated)
- * and to ensure that the timer is scheduled on the current CPU.
- *
- * Note that this does not prevent the timer from being migrated
- * when the current CPU goes offline.  If this is a problem for
- * you, use CPU-hotplug notifiers to handle it correctly, for
- * example, cancelling the timer when the corresponding CPU goes
- * offline.
- *
- * mod_timer_pinned(timer, expires) is equivalent to:
- *
- *     del_timer(timer); timer->expires = expires; add_timer(timer);
- */
-int mod_timer_pinned(struct timer_list *timer, unsigned long expires)
-{
-	if (timer->expires == expires && timer_pending(timer))
-		return 1;
-
-	return __mod_timer(timer, expires, false, MOD_TIMER_PINNED);
-}
-EXPORT_SYMBOL(mod_timer_pinned);
-
-/**
  * add_timer - start a timer
  * @timer: the timer to be added
  *
@@ -1512,7 +1483,7 @@ signed long __sched schedule_timeout(sig
 	expire = timeout + jiffies;
 
 	setup_timer_on_stack(&timer, process_timeout, (unsigned long)current);
-	__mod_timer(&timer, expire, false, MOD_TIMER_NOT_PINNED);
+	__mod_timer(&timer, expire, false);
 	schedule();
 	del_singleshot_timer_sync(&timer);
 

^ permalink raw reply	[flat|nested] 30+ messages in thread

* [patch V3 10/22] signal: Use hrtimer for sigtimedwait
  2016-06-24 14:32 [patch V3 00/22] timer: Refactor the timer wheel Thomas Gleixner
                   ` (8 preceding siblings ...)
  2016-06-24 14:32 ` [patch V3 09/22] timer: Remove mod_timer_pinned Thomas Gleixner
@ 2016-06-24 14:32 ` Thomas Gleixner
  2016-06-24 14:32 ` [patch V3 11/22] hlist: Add hlist_is_singular_node() helper Thomas Gleixner
                   ` (13 subsequent siblings)
  23 siblings, 0 replies; 30+ messages in thread
From: Thomas Gleixner @ 2016-06-24 14:32 UTC (permalink / raw)
  To: LKML
  Cc: Ingo Molnar, Peter Zijlstra, Paul E. McKenney, Eric Dumazet,
	Frederic Weisbecker, Chris Mason, Arjan van de Ven, rt,
	Rik van Riel, Linus Torvalds, George Spelvin, Len Brown,
	Josh Triplett, Cyril Hrubis

[-- Attachment #1: signal--Use-hrtimer-for-sigtimedwait.patch --]
[-- Type: text/plain, Size: 2173 bytes --]

We've converted most timeout related syscalls to hrtimers. sigtimedwait() did
not get this treatment. Convert it so we get a reasonable accuracy and remove
the user space exposure to the timer wheel properties.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Cyril Hrubis <chrubis@suse.cz>
---
 kernel/signal.c |   24 ++++++++++--------------
 1 file changed, 10 insertions(+), 14 deletions(-)

--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -2751,23 +2751,17 @@ int copy_siginfo_to_user(siginfo_t __use
  *  @ts: upper bound on process time suspension
  */
 int do_sigtimedwait(const sigset_t *which, siginfo_t *info,
-			const struct timespec *ts)
+		    const struct timespec *ts)
 {
 	struct task_struct *tsk = current;
-	long timeout = MAX_SCHEDULE_TIMEOUT;
+	ktime_t timeout = { .tv64 = 0 };
 	sigset_t mask = *which;
-	int sig;
+	int sig, ret = 0;
 
 	if (ts) {
 		if (!timespec_valid(ts))
 			return -EINVAL;
-		timeout = timespec_to_jiffies(ts);
-		/*
-		 * We can be close to the next tick, add another one
-		 * to ensure we will wait at least the time asked for.
-		 */
-		if (ts->tv_sec || ts->tv_nsec)
-			timeout++;
+		timeout = timespec_to_ktime(*ts);
 	}
 
 	/*
@@ -2778,7 +2772,7 @@ int do_sigtimedwait(const sigset_t *whic
 
 	spin_lock_irq(&tsk->sighand->siglock);
 	sig = dequeue_signal(tsk, &mask, info);
-	if (!sig && timeout) {
+	if (!sig && timeout.tv64) {
 		/*
 		 * None ready, temporarily unblock those we're interested
 		 * while we are sleeping in so that we'll be awakened when
@@ -2790,8 +2784,10 @@ int do_sigtimedwait(const sigset_t *whic
 		recalc_sigpending();
 		spin_unlock_irq(&tsk->sighand->siglock);
 
-		timeout = freezable_schedule_timeout_interruptible(timeout);
-
+		__set_current_state(TASK_INTERRUPTIBLE);
+		ret = freezable_schedule_hrtimeout_range(&timeout,
+							 tsk->timer_slack_ns,
+							 HRTIMER_MODE_REL);
 		spin_lock_irq(&tsk->sighand->siglock);
 		__set_task_blocked(tsk, &tsk->real_blocked);
 		sigemptyset(&tsk->real_blocked);
@@ -2801,7 +2797,7 @@ int do_sigtimedwait(const sigset_t *whic
 
 	if (sig)
 		return sig;
-	return timeout ? -EINTR : -EAGAIN;
+	return ret ? -EINTR : -EAGAIN;
 }
 
 /**

^ permalink raw reply	[flat|nested] 30+ messages in thread

* [patch V3 11/22] hlist: Add hlist_is_singular_node() helper
  2016-06-24 14:32 [patch V3 00/22] timer: Refactor the timer wheel Thomas Gleixner
                   ` (9 preceding siblings ...)
  2016-06-24 14:32 ` [patch V3 10/22] signal: Use hrtimer for sigtimedwait Thomas Gleixner
@ 2016-06-24 14:32 ` Thomas Gleixner
  2016-06-24 14:32 ` [patch V3 12/22] timer: Give a few structs and members proper names Thomas Gleixner
                   ` (12 subsequent siblings)
  23 siblings, 0 replies; 30+ messages in thread
From: Thomas Gleixner @ 2016-06-24 14:32 UTC (permalink / raw)
  To: LKML
  Cc: Ingo Molnar, Peter Zijlstra, Paul E. McKenney, Eric Dumazet,
	Frederic Weisbecker, Chris Mason, Arjan van de Ven, rt,
	Rik van Riel, Linus Torvalds, George Spelvin, Len Brown,
	Josh Triplett

[-- Attachment #1: hlist_Add_hlist_is_last_node_helper.patch --]
[-- Type: text/plain, Size: 955 bytes --]

Required to figure out whether the entry is the only one in the hlist.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Chris Mason <clm@fb.com>
Cc: Eric Dumazet <edumazet@google.com>
Cc: rt@linutronix.de
Cc: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Cc: Arjan van de Ven <arjan@infradead.org>

---
 include/linux/list.h |   10 ++++++++++
 1 file changed, 10 insertions(+)

--- a/include/linux/list.h
+++ b/include/linux/list.h
@@ -679,6 +679,16 @@ static inline bool hlist_fake(struct hli
 }
 
 /*
+ * Check whether the node is the only node of the head without
+ * accessing head.
+ */
+static inline bool hlist_is_singular_node(struct hlist_node *n,
+					  struct hlist_head *h)
+{
+	return !n->next && n->pprev == &h->first;
+}
+
+/*
  * Move a list from one list head to another. Fixup the pprev
  * reference of the first entry if it exists.
  */

^ permalink raw reply	[flat|nested] 30+ messages in thread

* [patch V3 12/22] timer: Give a few structs and members proper names
  2016-06-24 14:32 [patch V3 00/22] timer: Refactor the timer wheel Thomas Gleixner
                   ` (10 preceding siblings ...)
  2016-06-24 14:32 ` [patch V3 11/22] hlist: Add hlist_is_singular_node() helper Thomas Gleixner
@ 2016-06-24 14:32 ` Thomas Gleixner
  2016-06-24 14:32 ` [patch V3 13/22] timer: Reduce the CPU index space to 256k Thomas Gleixner
                   ` (11 subsequent siblings)
  23 siblings, 0 replies; 30+ messages in thread
From: Thomas Gleixner @ 2016-06-24 14:32 UTC (permalink / raw)
  To: LKML
  Cc: Ingo Molnar, Peter Zijlstra, Paul E. McKenney, Eric Dumazet,
	Frederic Weisbecker, Chris Mason, Arjan van de Ven, rt,
	Rik van Riel, Linus Torvalds, George Spelvin, Len Brown,
	Josh Triplett

[-- Attachment #1: timer_Give_a_few_structs_and_members_proper_names.patch --]
[-- Type: text/plain, Size: 13258 bytes --]

Some of the names are not longer correct and others are simply too long to
type. Clean it up before we switch the wheel implementation over to the new
scheme.

No functional change.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Chris Mason <clm@fb.com>
Cc: Eric Dumazet <edumazet@google.com>
Cc: rt@linutronix.de
Cc: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Cc: Arjan van de Ven <arjan@infradead.org>


---
 kernel/time/timer.c |  118 ++++++++++++++++++++++++++--------------------------
 1 file changed, 59 insertions(+), 59 deletions(-)

--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -77,10 +77,10 @@ struct tvec_root {
 	struct hlist_head vec[TVR_SIZE];
 };
 
-struct tvec_base {
+struct timer_base {
 	spinlock_t lock;
 	struct timer_list *running_timer;
-	unsigned long timer_jiffies;
+	unsigned long clk;
 	unsigned long next_timer;
 	unsigned long active_timers;
 	unsigned long all_timers;
@@ -95,7 +95,7 @@ struct tvec_base {
 } ____cacheline_aligned;
 
 
-static DEFINE_PER_CPU(struct tvec_base, tvec_bases);
+static DEFINE_PER_CPU(struct timer_base, timer_bases);
 
 #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
 unsigned int sysctl_timer_migration = 1;
@@ -106,15 +106,15 @@ void timers_update_migration(bool update
 	unsigned int cpu;
 
 	/* Avoid the loop, if nothing to update */
-	if (this_cpu_read(tvec_bases.migration_enabled) == on)
+	if (this_cpu_read(timer_bases.migration_enabled) == on)
 		return;
 
 	for_each_possible_cpu(cpu) {
-		per_cpu(tvec_bases.migration_enabled, cpu) = on;
+		per_cpu(timer_bases.migration_enabled, cpu) = on;
 		per_cpu(hrtimer_bases.migration_enabled, cpu) = on;
 		if (!update_nohz)
 			continue;
-		per_cpu(tvec_bases.nohz_active, cpu) = true;
+		per_cpu(timer_bases.nohz_active, cpu) = true;
 		per_cpu(hrtimer_bases.nohz_active, cpu) = true;
 	}
 }
@@ -134,18 +134,18 @@ int timer_migration_handler(struct ctl_t
 	return ret;
 }
 
-static inline struct tvec_base *get_target_base(struct tvec_base *base,
+static inline struct timer_base *get_target_base(struct timer_base *base,
 						int pinned)
 {
 	if (pinned || !base->migration_enabled)
-		return this_cpu_ptr(&tvec_bases);
-	return per_cpu_ptr(&tvec_bases, get_nohz_timer_target());
+		return this_cpu_ptr(&timer_bases);
+	return per_cpu_ptr(&timer_bases, get_nohz_timer_target());
 }
 #else
-static inline struct tvec_base *get_target_base(struct tvec_base *base,
+static inline struct timer_base *get_target_base(struct timer_base *base,
 						int pinned)
 {
-	return this_cpu_ptr(&tvec_bases);
+	return this_cpu_ptr(&timer_bases);
 }
 #endif
 
@@ -371,10 +371,10 @@ void set_timer_slack(struct timer_list *
 EXPORT_SYMBOL_GPL(set_timer_slack);
 
 static void
-__internal_add_timer(struct tvec_base *base, struct timer_list *timer)
+__internal_add_timer(struct timer_base *base, struct timer_list *timer)
 {
 	unsigned long expires = timer->expires;
-	unsigned long idx = expires - base->timer_jiffies;
+	unsigned long idx = expires - base->clk;
 	struct hlist_head *vec;
 
 	if (idx < TVR_SIZE) {
@@ -394,7 +394,7 @@ static void
 		 * Can happen if you add a timer with expires == jiffies,
 		 * or you set a timer to go off in the past
 		 */
-		vec = base->tv1.vec + (base->timer_jiffies & TVR_MASK);
+		vec = base->tv1.vec + (base->clk & TVR_MASK);
 	} else {
 		int i;
 		/* If the timeout is larger than MAX_TVAL (on 64-bit
@@ -403,7 +403,7 @@ static void
 		 */
 		if (idx > MAX_TVAL) {
 			idx = MAX_TVAL;
-			expires = idx + base->timer_jiffies;
+			expires = idx + base->clk;
 		}
 		i = (expires >> (TVR_BITS + 3 * TVN_BITS)) & TVN_MASK;
 		vec = base->tv5.vec + i;
@@ -412,11 +412,11 @@ static void
 	hlist_add_head(&timer->entry, vec);
 }
 
-static void internal_add_timer(struct tvec_base *base, struct timer_list *timer)
+static void internal_add_timer(struct timer_base *base, struct timer_list *timer)
 {
 	/* Advance base->jiffies, if the base is empty */
 	if (!base->all_timers++)
-		base->timer_jiffies = jiffies;
+		base->clk = jiffies;
 
 	__internal_add_timer(base, timer);
 	/*
@@ -707,7 +707,7 @@ static inline void detach_timer(struct t
 }
 
 static inline void
-detach_expired_timer(struct timer_list *timer, struct tvec_base *base)
+detach_expired_timer(struct timer_list *timer, struct timer_base *base)
 {
 	detach_timer(timer, true);
 	if (!(timer->flags & TIMER_DEFERRABLE))
@@ -715,7 +715,7 @@ detach_expired_timer(struct timer_list *
 	base->all_timers--;
 }
 
-static int detach_if_pending(struct timer_list *timer, struct tvec_base *base,
+static int detach_if_pending(struct timer_list *timer, struct timer_base *base,
 			     bool clear_pending)
 {
 	if (!timer_pending(timer))
@@ -725,16 +725,16 @@ static int detach_if_pending(struct time
 	if (!(timer->flags & TIMER_DEFERRABLE)) {
 		base->active_timers--;
 		if (timer->expires == base->next_timer)
-			base->next_timer = base->timer_jiffies;
+			base->next_timer = base->clk;
 	}
 	/* If this was the last timer, advance base->jiffies */
 	if (!--base->all_timers)
-		base->timer_jiffies = jiffies;
+		base->clk = jiffies;
 	return 1;
 }
 
 /*
- * We are using hashed locking: holding per_cpu(tvec_bases).lock
+ * We are using hashed locking: holding per_cpu(timer_bases).lock
  * means that all timers which are tied to this base via timer->base are
  * locked, and the base itself is locked too.
  *
@@ -744,16 +744,16 @@ static int detach_if_pending(struct time
  * When the timer's base is locked and removed from the list, the
  * TIMER_MIGRATING flag is set, FIXME
  */
-static struct tvec_base *lock_timer_base(struct timer_list *timer,
+static struct timer_base *lock_timer_base(struct timer_list *timer,
 					unsigned long *flags)
 	__acquires(timer->base->lock)
 {
 	for (;;) {
 		u32 tf = timer->flags;
-		struct tvec_base *base;
+		struct timer_base *base;
 
 		if (!(tf & TIMER_MIGRATING)) {
-			base = per_cpu_ptr(&tvec_bases, tf & TIMER_CPUMASK);
+			base = per_cpu_ptr(&timer_bases, tf & TIMER_CPUMASK);
 			spin_lock_irqsave(&base->lock, *flags);
 			if (timer->flags == tf)
 				return base;
@@ -766,7 +766,7 @@ static struct tvec_base *lock_timer_base
 static inline int
 __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only)
 {
-	struct tvec_base *base, *new_base;
+	struct timer_base *base, *new_base;
 	unsigned long flags;
 	int ret = 0;
 
@@ -933,8 +933,8 @@ EXPORT_SYMBOL(add_timer);
  */
 void add_timer_on(struct timer_list *timer, int cpu)
 {
-	struct tvec_base *new_base = per_cpu_ptr(&tvec_bases, cpu);
-	struct tvec_base *base;
+	struct timer_base *new_base = per_cpu_ptr(&timer_bases, cpu);
+	struct timer_base *base;
 	unsigned long flags;
 
 	timer_stats_timer_set_start_info(timer);
@@ -975,7 +975,7 @@ EXPORT_SYMBOL_GPL(add_timer_on);
  */
 int del_timer(struct timer_list *timer)
 {
-	struct tvec_base *base;
+	struct timer_base *base;
 	unsigned long flags;
 	int ret = 0;
 
@@ -1001,7 +1001,7 @@ EXPORT_SYMBOL(del_timer);
  */
 int try_to_del_timer_sync(struct timer_list *timer)
 {
-	struct tvec_base *base;
+	struct timer_base *base;
 	unsigned long flags;
 	int ret = -1;
 
@@ -1085,7 +1085,7 @@ int del_timer_sync(struct timer_list *ti
 EXPORT_SYMBOL(del_timer_sync);
 #endif
 
-static int cascade(struct tvec_base *base, struct tvec *tv, int index)
+static int cascade(struct timer_base *base, struct tvec *tv, int index)
 {
 	/* cascade all the timers from tv up one level */
 	struct timer_list *timer;
@@ -1149,7 +1149,7 @@ static void call_timer_fn(struct timer_l
 	}
 }
 
-#define INDEX(N) ((base->timer_jiffies >> (TVR_BITS + (N) * TVN_BITS)) & TVN_MASK)
+#define INDEX(N) ((base->clk >> (TVR_BITS + (N) * TVN_BITS)) & TVN_MASK)
 
 /**
  * __run_timers - run all expired timers (if any) on this CPU.
@@ -1158,23 +1158,23 @@ static void call_timer_fn(struct timer_l
  * This function cascades all vectors and executes all expired timer
  * vectors.
  */
-static inline void __run_timers(struct tvec_base *base)
+static inline void __run_timers(struct timer_base *base)
 {
 	struct timer_list *timer;
 
 	spin_lock_irq(&base->lock);
 
-	while (time_after_eq(jiffies, base->timer_jiffies)) {
+	while (time_after_eq(jiffies, base->clk)) {
 		struct hlist_head work_list;
 		struct hlist_head *head = &work_list;
 		int index;
 
 		if (!base->all_timers) {
-			base->timer_jiffies = jiffies;
+			base->clk = jiffies;
 			break;
 		}
 
-		index = base->timer_jiffies & TVR_MASK;
+		index = base->clk & TVR_MASK;
 
 		/*
 		 * Cascade timers:
@@ -1184,7 +1184,7 @@ static inline void __run_timers(struct t
 				(!cascade(base, &base->tv3, INDEX(1))) &&
 					!cascade(base, &base->tv4, INDEX(2)))
 			cascade(base, &base->tv5, INDEX(3));
-		++base->timer_jiffies;
+		++base->clk;
 		hlist_move_list(base->tv1.vec + index, head);
 		while (!hlist_empty(head)) {
 			void (*fn)(unsigned long);
@@ -1222,16 +1222,16 @@ static inline void __run_timers(struct t
  * is used on S/390 to stop all activity when a CPU is idle.
  * This function needs to be called with interrupts disabled.
  */
-static unsigned long __next_timer_interrupt(struct tvec_base *base)
+static unsigned long __next_timer_interrupt(struct timer_base *base)
 {
-	unsigned long timer_jiffies = base->timer_jiffies;
-	unsigned long expires = timer_jiffies + NEXT_TIMER_MAX_DELTA;
+	unsigned long clk = base->clk;
+	unsigned long expires = clk + NEXT_TIMER_MAX_DELTA;
 	int index, slot, array, found = 0;
 	struct timer_list *nte;
 	struct tvec *varray[4];
 
 	/* Look for timer events in tv1. */
-	index = slot = timer_jiffies & TVR_MASK;
+	index = slot = clk & TVR_MASK;
 	do {
 		hlist_for_each_entry(nte, base->tv1.vec + slot, entry) {
 			if (nte->flags & TIMER_DEFERRABLE)
@@ -1250,8 +1250,8 @@ static unsigned long __next_timer_interr
 cascade:
 	/* Calculate the next cascade event */
 	if (index)
-		timer_jiffies += TVR_SIZE - index;
-	timer_jiffies >>= TVR_BITS;
+		clk += TVR_SIZE - index;
+	clk >>= TVR_BITS;
 
 	/* Check tv2-tv5. */
 	varray[0] = &base->tv2;
@@ -1262,7 +1262,7 @@ static unsigned long __next_timer_interr
 	for (array = 0; array < 4; array++) {
 		struct tvec *varp = varray[array];
 
-		index = slot = timer_jiffies & TVN_MASK;
+		index = slot = clk & TVN_MASK;
 		do {
 			hlist_for_each_entry(nte, varp->vec + slot, entry) {
 				if (nte->flags & TIMER_DEFERRABLE)
@@ -1286,8 +1286,8 @@ static unsigned long __next_timer_interr
 		} while (slot != index);
 
 		if (index)
-			timer_jiffies += TVN_SIZE - index;
-		timer_jiffies >>= TVN_BITS;
+			clk += TVN_SIZE - index;
+		clk >>= TVN_BITS;
 	}
 	return expires;
 }
@@ -1335,7 +1335,7 @@ static u64 cmp_next_hrtimer_event(u64 ba
  */
 u64 get_next_timer_interrupt(unsigned long basej, u64 basem)
 {
-	struct tvec_base *base = this_cpu_ptr(&tvec_bases);
+	struct timer_base *base = this_cpu_ptr(&timer_bases);
 	u64 expires = KTIME_MAX;
 	unsigned long nextevt;
 
@@ -1348,7 +1348,7 @@ u64 get_next_timer_interrupt(unsigned lo
 
 	spin_lock(&base->lock);
 	if (base->active_timers) {
-		if (time_before_eq(base->next_timer, base->timer_jiffies))
+		if (time_before_eq(base->next_timer, base->clk))
 			base->next_timer = __next_timer_interrupt(base);
 		nextevt = base->next_timer;
 		if (time_before_eq(nextevt, basej))
@@ -1387,9 +1387,9 @@ void update_process_times(int user_tick)
  */
 static void run_timer_softirq(struct softirq_action *h)
 {
-	struct tvec_base *base = this_cpu_ptr(&tvec_bases);
+	struct timer_base *base = this_cpu_ptr(&timer_bases);
 
-	if (time_after_eq(jiffies, base->timer_jiffies))
+	if (time_after_eq(jiffies, base->clk))
 		__run_timers(base);
 }
 
@@ -1534,7 +1534,7 @@ signed long __sched schedule_timeout_idl
 EXPORT_SYMBOL(schedule_timeout_idle);
 
 #ifdef CONFIG_HOTPLUG_CPU
-static void migrate_timer_list(struct tvec_base *new_base, struct hlist_head *head)
+static void migrate_timer_list(struct timer_base *new_base, struct hlist_head *head)
 {
 	struct timer_list *timer;
 	int cpu = new_base->cpu;
@@ -1550,13 +1550,13 @@ static void migrate_timer_list(struct tv
 
 static void migrate_timers(int cpu)
 {
-	struct tvec_base *old_base;
-	struct tvec_base *new_base;
+	struct timer_base *old_base;
+	struct timer_base *new_base;
 	int i;
 
 	BUG_ON(cpu_online(cpu));
-	old_base = per_cpu_ptr(&tvec_bases, cpu);
-	new_base = get_cpu_ptr(&tvec_bases);
+	old_base = per_cpu_ptr(&timer_bases, cpu);
+	new_base = get_cpu_ptr(&timer_bases);
 	/*
 	 * The caller is globally serialized and nobody else
 	 * takes two locks at once, deadlock is not possible.
@@ -1580,7 +1580,7 @@ static void migrate_timers(int cpu)
 
 	spin_unlock(&old_base->lock);
 	spin_unlock_irq(&new_base->lock);
-	put_cpu_ptr(&tvec_bases);
+	put_cpu_ptr(&timer_bases);
 }
 
 static int timer_cpu_notify(struct notifier_block *self,
@@ -1608,13 +1608,13 @@ static inline void timer_register_cpu_no
 
 static void __init init_timer_cpu(int cpu)
 {
-	struct tvec_base *base = per_cpu_ptr(&tvec_bases, cpu);
+	struct timer_base *base = per_cpu_ptr(&timer_bases, cpu);
 
 	base->cpu = cpu;
 	spin_lock_init(&base->lock);
 
-	base->timer_jiffies = jiffies;
-	base->next_timer = base->timer_jiffies;
+	base->clk = jiffies;
+	base->next_timer = base->clk;
 }
 
 static void __init init_timer_cpus(void)

^ permalink raw reply	[flat|nested] 30+ messages in thread

* [patch V3 13/22] timer: Reduce the CPU index space to 256k
  2016-06-24 14:32 [patch V3 00/22] timer: Refactor the timer wheel Thomas Gleixner
                   ` (11 preceding siblings ...)
  2016-06-24 14:32 ` [patch V3 12/22] timer: Give a few structs and members proper names Thomas Gleixner
@ 2016-06-24 14:32 ` Thomas Gleixner
  2016-07-01 14:41   ` Frederic Weisbecker
  2016-06-24 14:32 ` [patch V3 14/22] timer: Switch to a non cascading wheel Thomas Gleixner
                   ` (10 subsequent siblings)
  23 siblings, 1 reply; 30+ messages in thread
From: Thomas Gleixner @ 2016-06-24 14:32 UTC (permalink / raw)
  To: LKML
  Cc: Ingo Molnar, Peter Zijlstra, Paul E. McKenney, Eric Dumazet,
	Frederic Weisbecker, Chris Mason, Arjan van de Ven, rt,
	Rik van Riel, Linus Torvalds, George Spelvin, Len Brown,
	Josh Triplett

[-- Attachment #1: timer--Reduce-the-CPU-index-space-to-256k.patch --]
[-- Type: text/plain, Size: 995 bytes --]

We want to store the array index in the flags space. 256k CPUs should be
enough for a while.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/timer.h |   10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

--- a/include/linux/timer.h
+++ b/include/linux/timer.h
@@ -58,12 +58,12 @@ struct timer_list {
  * workqueue locking issues. It's not meant for executing random crap
  * with interrupts disabled. Abuse is monitored!
  */
-#define TIMER_CPUMASK		0x0007FFFF
-#define TIMER_MIGRATING		0x00080000
+#define TIMER_CPUMASK		0x0003FFFF
+#define TIMER_MIGRATING		0x00040000
 #define TIMER_BASEMASK		(TIMER_CPUMASK | TIMER_MIGRATING)
-#define TIMER_DEFERRABLE	0x00100000
-#define TIMER_PINNED		0x00200000
-#define TIMER_IRQSAFE		0x00400000
+#define TIMER_DEFERRABLE	0x00080000
+#define TIMER_PINNED		0x00100000
+#define TIMER_IRQSAFE		0x00200000
 
 #define __TIMER_INITIALIZER(_function, _expires, _data, _flags) { \
 		.entry = { .next = TIMER_ENTRY_STATIC },	\

^ permalink raw reply	[flat|nested] 30+ messages in thread

* [patch V3 14/22] timer: Switch to a non cascading wheel
  2016-06-24 14:32 [patch V3 00/22] timer: Refactor the timer wheel Thomas Gleixner
                   ` (12 preceding siblings ...)
  2016-06-24 14:32 ` [patch V3 13/22] timer: Reduce the CPU index space to 256k Thomas Gleixner
@ 2016-06-24 14:32 ` Thomas Gleixner
  2016-06-24 14:32 ` [patch V3 15/22] timer: Remove slack leftovers Thomas Gleixner
                   ` (9 subsequent siblings)
  23 siblings, 0 replies; 30+ messages in thread
From: Thomas Gleixner @ 2016-06-24 14:32 UTC (permalink / raw)
  To: LKML
  Cc: Ingo Molnar, Peter Zijlstra, Paul E. McKenney, Eric Dumazet,
	Frederic Weisbecker, Chris Mason, Arjan van de Ven, rt,
	Rik van Riel, Linus Torvalds, George Spelvin, Len Brown,
	Josh Triplett

[-- Attachment #1: timer_Switch_to_a_non_cascading_wheel.patch --]
[-- Type: text/plain, Size: 37246 bytes --]

The current timer wheel has some drawbacks:

1) Cascading

   Cascading can be an unbound operation and is completely pointless in most
   cases because the vast majority of the timer wheel timers are canceled or
   rearmed before expiration.

2) No fast lookup of the next expiring timer

   In NOHZ scenarios the first timer soft interrupt after a long NOHZ period
   must fast forward the base time to current jiffies. As we have no way to
   find the next expiring timer fast, the code loops and increments the base
   time by one and checks for expired timers in each step.

After a thorough analysis of real world data gathered on laptops,
workstations, webservers and other machines (thanks Chris!) I came to the
conclusion that the current 'classic' timer wheel implementation can be
modified to address the above issues.

The vast majority of timer wheel timers is canceled or rearmed before
expiry. Most of them are timeouts for networking and other I/O tasks. The
nature of timeouts is to catch the exception from normal operation (TCP ack
timed out, disk does not respond, etc.). For these kind of timeouts the
accuracy is not really a concern. In case the timeout fires, performance is
down the drain already.

The few timers which actually expire can be split into two categories:

 1) Short expiry times which expect halfways accurate expiry

 2) Long term expiry times are inaccurate today already due to the batching
    which is done for NOHZ.

So for long term expiry timers we can avoid the cascading property and
just leave them in the less granular outer wheels until expiry or
cancelation. Timers which are armed with a timeout larger than the
wheel capacity are not longer cascaded. We expire them with the
longest possible timeout. We have not observed such timeouts in our
data collection, but at least we handle them with the least surprising
effect.

Contrary to the classic wheel the granularity of the next wheel is not
the capacity of the first wheel. The granularities of the wheels are
in the currently chosen setting 8 times the granularity of the
previous wheel.  That's a worst case inaccuracy of 12.5% for the
timers which are queued at the beginning of a level. 

So the new wheel concept addresses the old issues:

1) Cascading is avoided (except for extreme long time timers)

2) By keeping the timers in the bucket until expiry/cancelation we can track
   the buckets which have timers enqueued in a bucket bitmap and therefor can
   lookup the next expiring timer fast and time bound.

A further benefit of the concept is, that the slack calculation which is done
on every timer start is not longer necessary because the granularity levels
provide natural batching already.

Our extensive testing with various loads did not show any performance
degradation vs. the current wheel implementation.

This patch does not address the 'fast lookup' issue as we wanted to make sure
that there is no regression introduced by the wheel redesign. The
optimizations are in follow up patches.

[ Contains fixes from Anna-Maria Gleixner and Richard Cochran ]

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Chris Mason <clm@fb.com>
Cc: Eric Dumazet <edumazet@google.com>
Cc: rt@linutronix.de
Cc: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Cc: Arjan van de Ven <arjan@infradead.org>

---
v4: Simplify wheel constants handling as pointed out by George Spelvin. Switch
    to a non cascading wheel and let HZ=1000 have reduced granularity to
    accomodate the 5days timeouts of the networking code.

v3: fix return value of __next_timer_interrupt()
v2: change HASH_SIZE to TOT_HASH_SIZE (as Richard mentioned)

 include/linux/timer.h |    2 
 kernel/time/timer.c   |  825 ++++++++++++++++++++++++++++----------------------
 2 files changed, 467 insertions(+), 360 deletions(-)

--- a/include/linux/timer.h
+++ b/include/linux/timer.h
@@ -64,6 +64,8 @@ struct timer_list {
 #define TIMER_DEFERRABLE	0x00080000
 #define TIMER_PINNED		0x00100000
 #define TIMER_IRQSAFE		0x00200000
+#define TIMER_ARRAYSHIFT	22
+#define TIMER_ARRAYMASK		0xFFC00000
 
 #define __TIMER_INITIALIZER(_function, _expires, _data, _flags) { \
 		.entry = { .next = TIMER_ENTRY_STATIC },	\
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -59,43 +59,151 @@
 EXPORT_SYMBOL(jiffies_64);
 
 /*
- * per-CPU timer vector definitions:
+ * The timer wheel has LVL_DEPTH array levels. Each level provides an array of
+ * LVL_SIZE buckets. Each level is driven by its own clock and therefor each
+ * level has a different granularity.
+ *
+ * The level granularity is:		LVL_CLK_DIV ^ lvl
+ * The level clock frequency is:	HZ / (LVL_CLK_DIV ^ level)
+ *
+ * The array level of a newly armed timer depends on the relative expiry
+ * time. The farther the expiry time is away the higher the array level and
+ * therefor the granularity becomes.
+ *
+ * Contrary to the original timer wheel implementation, which aims for 'exact'
+ * expiry of the timers, this implementation removes the need for recascading
+ * the timers into the lower array levels. The previous 'classic' timer wheel
+ * implementation of the kernel already violated the 'exact' expiry by adding
+ * slack to the expiry time to provide batched expiration. The granularity
+ * levels provide implicit batching.
+ *
+ * This is an optimization of the original timer wheel implementation for the
+ * majority of the timer wheel use cases: timeouts. The vast majority of
+ * timeout timers (networking, disk I/O ...) are canceled before expiry. If
+ * the timeout expires it indicates that normal operation is disturbed, so it
+ * does not matter much whether the timeout comes with a slight delay.
+ *
+ * The only exception to this are networking timers with a small expiry
+ * time. They rely on the granularity. Those fit into the first wheel level,
+ * which has HZ granularity.
+ *
+ * We don't have cascading anymore. timers with a expiry time above the
+ * capacity of the last wheel level are force expired at the maximum timeout
+ * value of the last wheel level. From data sampling we know that the maximum
+ * value observed is 5 days (network connection tracking), so this should not
+ * be an issue.
+ *
+ * The currently chosen array constants values are a good compromise between
+ * array size and granularity.
+ *
+ * This results in the following granularity and range levels:
+ *
+ * HZ 1000 steps
+ * Level Offset  Granularity            Range
+ *  0      0         1 ms                0 ms -         63 ms
+ *  1     64         8 ms               64 ms -        511 ms
+ *  2    128        64 ms              512 ms -       4095 ms (512ms - ~4s)
+ *  3    192       512 ms             4096 ms -      32767 ms (~4s - ~32s)
+ *  4    256      4096 ms (~4s)      32768 ms -     262143 ms (~32s - ~4m)
+ *  5    320     32768 ms (~32s)    262144 ms -    2097151 ms (~4m - ~34m)
+ *  6    384    262144 ms (~4m)    2097152 ms -   16777215 ms (~34m - ~4h)
+ *  7    448   2097152 ms (~34m)  16777216 ms -  134217727 ms (~4h - ~1d)
+ *  8    512  16777216 ms (~4h)  134217728 ms - 1073741822 ms (~1d - ~12d)
+ *
+ * HZ  300
+ * Level Offset  Granularity            Range
+ *  0	   0         3 ms                0 ms -        210 ms
+ *  1	  64        26 ms              213 ms -       1703 ms (213ms - ~1s)
+ *  2	 128       213 ms             1706 ms -      13650 ms (~1s - ~13s)
+ *  3	 192      1706 ms (~1s)      13653 ms -     109223 ms (~13s - ~1m)
+ *  4	 256     13653 ms (~13s)    109226 ms -     873810 ms (~1m - ~14m)
+ *  5	 320    109226 ms (~1m)     873813 ms -    6990503 ms (~14m - ~1h)
+ *  6	 384    873813 ms (~14m)   6990506 ms -   55924050 ms (~1h - ~15h)
+ *  7	 448   6990506 ms (~1h)   55924053 ms -  447392423 ms (~15h - ~5d)
+ *  8    512  55924053 ms (~15h) 447392426 ms - 3579139406 ms (~5d - ~41d)
+ *
+ * HZ  250
+ * Level Offset  Granularity            Range
+ *  0	   0         4 ms                0 ms -        255 ms
+ *  1	  64        32 ms              256 ms -       2047 ms (256ms - ~2s)
+ *  2	 128       256 ms             2048 ms -      16383 ms (~2s - ~16s)
+ *  3	 192      2048 ms (~2s)      16384 ms -     131071 ms (~16s - ~2m)
+ *  4	 256     16384 ms (~16s)    131072 ms -    1048575 ms (~2m - ~17m)
+ *  5	 320    131072 ms (~2m)    1048576 ms -    8388607 ms (~17m - ~2h)
+ *  6	 384   1048576 ms (~17m)   8388608 ms -   67108863 ms (~2h - ~18h)
+ *  7	 448   8388608 ms (~2h)   67108864 ms -  536870911 ms (~18h - ~6d)
+ *  8    512  67108864 ms (~18h) 536870912 ms - 4294967288 ms (~6d - ~49d)
+ *
+ * HZ  100
+ * Level Offset  Granularity            Range
+ *  0	   0         10 ms               0 ms -        630 ms
+ *  1	  64         80 ms             640 ms -       5110 ms (640ms - ~5s)
+ *  2	 128        640 ms            5120 ms -      40950 ms (~5s - ~40s)
+ *  3	 192       5120 ms (~5s)     40960 ms -     327670 ms (~40s - ~5m)
+ *  4	 256      40960 ms (~40s)   327680 ms -    2621430 ms (~5m - ~43m)
+ *  5	 320     327680 ms (~5m)   2621440 ms -   20971510 ms (~43m - ~5h)
+ *  6	 384    2621440 ms (~43m) 20971520 ms -  167772150 ms (~5h - ~1d)
+ *  7	 448   20971520 ms (~5h) 167772160 ms - 1342177270 ms (~1d - ~15d)
+ */
+
+/* Clock divisor for the next level */
+#define LVL_CLK_SHIFT	3
+#define LVL_CLK_DIV	(1UL << LVL_CLK_SHIFT)
+#define LVL_CLK_MASK	(LVL_CLK_DIV - 1)
+#define LVL_SHIFT(n)	((n) * LVL_CLK_SHIFT)
+#define LVL_GRAN(n)	(1UL << LVL_SHIFT(n))
+
+/*
+ * The time start value for each level to select the bucket at enqueue
+ * time.
  */
-#define TVN_BITS (CONFIG_BASE_SMALL ? 4 : 6)
-#define TVR_BITS (CONFIG_BASE_SMALL ? 6 : 8)
-#define TVN_SIZE (1 << TVN_BITS)
-#define TVR_SIZE (1 << TVR_BITS)
-#define TVN_MASK (TVN_SIZE - 1)
-#define TVR_MASK (TVR_SIZE - 1)
-#define MAX_TVAL ((unsigned long)((1ULL << (TVR_BITS + 4*TVN_BITS)) - 1))
+#define LVL_START(n)	((LVL_SIZE - 1) << (((n) - 1) * LVL_CLK_SHIFT))
 
-struct tvec {
-	struct hlist_head vec[TVN_SIZE];
-};
+/* Size of each clock level */
+#define LVL_BITS	6
+#define LVL_SIZE	(1UL << LVL_BITS)
+#define LVL_MASK	(LVL_SIZE - 1)
+#define LVL_OFFS(n)	((n) * LVL_SIZE)
+
+/* Level depth */
+#if HZ > 100
+# define LVL_DEPTH	9
+# else
+# define LVL_DEPTH	8
+#endif
 
-struct tvec_root {
-	struct hlist_head vec[TVR_SIZE];
-};
+/* The cutoff (max. capacity of the wheel) */
+#define WHEEL_TIMEOUT_CUTOFF	(LVL_START(LVL_DEPTH))
+#define WHEEL_TIMEOUT_MAX	(WHEEL_TIMEOUT_CUTOFF - LVL_GRAN(LVL_DEPTH - 1))
+
+/*
+ * The resulting wheel size. If NOHZ is configured we allocate two
+ * wheels so we have a separate storage for the deferrable timers.
+ */
+#define WHEEL_SIZE	(LVL_SIZE * LVL_DEPTH)
+
+#ifdef CONFIG_NO_HZ_COMMON
+# define NR_BASES	2
+# define BASE_STD	0
+# define BASE_DEF	1
+#else
+# define NR_BASES	1
+# define BASE_STD	0
+# define BASE_DEF	0
+#endif
 
 struct timer_base {
-	spinlock_t lock;
-	struct timer_list *running_timer;
-	unsigned long clk;
-	unsigned long next_timer;
-	unsigned long active_timers;
-	unsigned long all_timers;
-	int cpu;
-	bool migration_enabled;
-	bool nohz_active;
-	struct tvec_root tv1;
-	struct tvec tv2;
-	struct tvec tv3;
-	struct tvec tv4;
-	struct tvec tv5;
+	spinlock_t		lock;
+	struct timer_list	*running_timer;
+	unsigned long		clk;
+	unsigned int		cpu;
+	bool			migration_enabled;
+	bool			nohz_active;
+	DECLARE_BITMAP(pending_map, WHEEL_SIZE);
+	struct hlist_head	vectors[WHEEL_SIZE];
 } ____cacheline_aligned;
 
-
-static DEFINE_PER_CPU(struct timer_base, timer_bases);
+static DEFINE_PER_CPU(struct timer_base, timer_bases[NR_BASES]);
 
 #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
 unsigned int sysctl_timer_migration = 1;
@@ -106,15 +214,17 @@ void timers_update_migration(bool update
 	unsigned int cpu;
 
 	/* Avoid the loop, if nothing to update */
-	if (this_cpu_read(timer_bases.migration_enabled) == on)
+	if (this_cpu_read(timer_bases[BASE_STD].migration_enabled) == on)
 		return;
 
 	for_each_possible_cpu(cpu) {
-		per_cpu(timer_bases.migration_enabled, cpu) = on;
+		per_cpu(timer_bases[BASE_STD].migration_enabled, cpu) = on;
+		per_cpu(timer_bases[BASE_DEF].migration_enabled, cpu) = on;
 		per_cpu(hrtimer_bases.migration_enabled, cpu) = on;
 		if (!update_nohz)
 			continue;
-		per_cpu(timer_bases.nohz_active, cpu) = true;
+		per_cpu(timer_bases[BASE_STD].nohz_active, cpu) = true;
+		per_cpu(timer_bases[BASE_DEF].nohz_active, cpu) = true;
 		per_cpu(hrtimer_bases.nohz_active, cpu) = true;
 	}
 }
@@ -133,20 +243,6 @@ int timer_migration_handler(struct ctl_t
 	mutex_unlock(&mutex);
 	return ret;
 }
-
-static inline struct timer_base *get_target_base(struct timer_base *base,
-						int pinned)
-{
-	if (pinned || !base->migration_enabled)
-		return this_cpu_ptr(&timer_bases);
-	return per_cpu_ptr(&timer_bases, get_nohz_timer_target());
-}
-#else
-static inline struct timer_base *get_target_base(struct timer_base *base,
-						int pinned)
-{
-	return this_cpu_ptr(&timer_bases);
-}
 #endif
 
 static unsigned long round_jiffies_common(unsigned long j, int cpu,
@@ -370,78 +466,91 @@ void set_timer_slack(struct timer_list *
 }
 EXPORT_SYMBOL_GPL(set_timer_slack);
 
+static inline unsigned int timer_get_idx(struct timer_list *timer)
+{
+	return (timer->flags & TIMER_ARRAYMASK) >> TIMER_ARRAYSHIFT;
+}
+
+static inline void timer_set_idx(struct timer_list *timer, unsigned int idx)
+{
+	timer->flags = (timer->flags & ~TIMER_ARRAYMASK) |
+			idx << TIMER_ARRAYSHIFT;
+}
+
+/*
+ * Helper function to calculate the array index for a given expiry
+ * time.
+ */
+static inline unsigned calc_index(unsigned expires, unsigned lvl)
+{
+	expires = (expires + LVL_GRAN(lvl)) >> LVL_SHIFT(lvl);
+	return LVL_OFFS(lvl) + (expires & LVL_MASK);
+}
+
 static void
 __internal_add_timer(struct timer_base *base, struct timer_list *timer)
 {
 	unsigned long expires = timer->expires;
-	unsigned long idx = expires - base->clk;
+	unsigned long delta = expires - base->clk;
 	struct hlist_head *vec;
+	unsigned int idx;
 
-	if (idx < TVR_SIZE) {
-		int i = expires & TVR_MASK;
-		vec = base->tv1.vec + i;
-	} else if (idx < 1 << (TVR_BITS + TVN_BITS)) {
-		int i = (expires >> TVR_BITS) & TVN_MASK;
-		vec = base->tv2.vec + i;
-	} else if (idx < 1 << (TVR_BITS + 2 * TVN_BITS)) {
-		int i = (expires >> (TVR_BITS + TVN_BITS)) & TVN_MASK;
-		vec = base->tv3.vec + i;
-	} else if (idx < 1 << (TVR_BITS + 3 * TVN_BITS)) {
-		int i = (expires >> (TVR_BITS + 2 * TVN_BITS)) & TVN_MASK;
-		vec = base->tv4.vec + i;
-	} else if ((signed long) idx < 0) {
-		/*
-		 * Can happen if you add a timer with expires == jiffies,
-		 * or you set a timer to go off in the past
-		 */
-		vec = base->tv1.vec + (base->clk & TVR_MASK);
+	if (delta < LVL_START(1)) {
+		idx = calc_index(expires, 0);
+	} else if (delta < LVL_START(2)) {
+		idx = calc_index(expires, 1);
+	} else if (delta < LVL_START(3)) {
+		idx = calc_index(expires, 2);
+	} else if (delta < LVL_START(4)) {
+		idx = calc_index(expires, 3);
+	} else if (delta < LVL_START(5)) {
+		idx = calc_index(expires, 4);
+	} else if (delta < LVL_START(6)) {
+		idx = calc_index(expires, 5);
+	} else if (delta < LVL_START(7)) {
+		idx = calc_index(expires, 6);
+	} else if (LVL_DEPTH > 8 && delta < LVL_START(8)) {
+		idx = calc_index(expires, 7);
+	} else if ((long) delta < 0) {
+		idx = base->clk & LVL_MASK;
 	} else {
-		int i;
-		/* If the timeout is larger than MAX_TVAL (on 64-bit
-		 * architectures or with CONFIG_BASE_SMALL=1) then we
-		 * use the maximum timeout.
+		/*
+		 * Force expire obscene large timeouts to expire at the
+		 * capacity limit of the wheel.
 		 */
-		if (idx > MAX_TVAL) {
-			idx = MAX_TVAL;
-			expires = idx + base->clk;
-		}
-		i = (expires >> (TVR_BITS + 3 * TVN_BITS)) & TVN_MASK;
-		vec = base->tv5.vec + i;
-	}
+		if (expires >= WHEEL_TIMEOUT_CUTOFF)
+			expires = WHEEL_TIMEOUT_MAX;
 
+		idx = calc_index(expires, LVL_DEPTH - 1);
+	}
+	/*
+	 * Enqueue the timer into the array bucket, mark it pending in
+	 * the bitmap and store the index in the timer flags.
+	 */
+	vec = base->vectors + idx;
 	hlist_add_head(&timer->entry, vec);
+	__set_bit(idx, base->pending_map);
+	timer_set_idx(timer, idx);
 }
 
 static void internal_add_timer(struct timer_base *base, struct timer_list *timer)
 {
-	/* Advance base->jiffies, if the base is empty */
-	if (!base->all_timers++)
-		base->clk = jiffies;
-
 	__internal_add_timer(base, timer);
-	/*
-	 * Update base->active_timers and base->next_timer
-	 */
-	if (!(timer->flags & TIMER_DEFERRABLE)) {
-		if (!base->active_timers++ ||
-		    time_before(timer->expires, base->next_timer))
-			base->next_timer = timer->expires;
-	}
 
 	/*
 	 * Check whether the other CPU is in dynticks mode and needs
-	 * to be triggered to reevaluate the timer wheel.
-	 * We are protected against the other CPU fiddling
-	 * with the timer by holding the timer base lock. This also
-	 * makes sure that a CPU on the way to stop its tick can not
-	 * evaluate the timer wheel.
+	 * to be triggered to reevaluate the timer wheel.  We are
+	 * protected against the other CPU fiddling with the timer by
+	 * holding the timer base lock. This also makes sure that a
+	 * CPU on the way to stop its tick can not evaluate the timer
+	 * wheel.
 	 *
 	 * Spare the IPI for deferrable timers on idle targets though.
 	 * The next busy ticks will take care of it. Except full dynticks
 	 * require special care against races with idle_cpu(), lets deal
 	 * with that later.
 	 */
-	if (base->nohz_active) {
+	if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && base->nohz_active) {
 		if (!(timer->flags & TIMER_DEFERRABLE) ||
 		    tick_nohz_full_cpu(base->cpu))
 			wake_up_nohz_cpu(base->cpu);
@@ -706,54 +815,87 @@ static inline void detach_timer(struct t
 	entry->next = LIST_POISON2;
 }
 
-static inline void
-detach_expired_timer(struct timer_list *timer, struct timer_base *base)
-{
-	detach_timer(timer, true);
-	if (!(timer->flags & TIMER_DEFERRABLE))
-		base->active_timers--;
-	base->all_timers--;
-}
-
 static int detach_if_pending(struct timer_list *timer, struct timer_base *base,
 			     bool clear_pending)
 {
+	unsigned idx = timer_get_idx(timer);
+
 	if (!timer_pending(timer))
 		return 0;
 
+	if (hlist_is_singular_node(&timer->entry, base->vectors + idx))
+		__clear_bit(idx, base->pending_map);
+
 	detach_timer(timer, clear_pending);
-	if (!(timer->flags & TIMER_DEFERRABLE)) {
-		base->active_timers--;
-		if (timer->expires == base->next_timer)
-			base->next_timer = base->clk;
-	}
-	/* If this was the last timer, advance base->jiffies */
-	if (!--base->all_timers)
-		base->clk = jiffies;
 	return 1;
 }
 
+static inline struct timer_base *get_timer_cpu_base(u32 tflags, u32 cpu)
+{
+	struct timer_base *base = per_cpu_ptr(&timer_bases[BASE_STD], cpu);
+
+	/*
+	 * If the timer is deferrable and nohz is active then we need to use
+	 * the deferrable base.
+	 */
+	if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && base->nohz_active &&
+	    (tflags & TIMER_DEFERRABLE))
+		base = per_cpu_ptr(&timer_bases[BASE_DEF], cpu);
+	return base;
+}
+
+static inline struct timer_base *get_timer_this_cpu_base(u32 tflags)
+{
+	struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]);
+
+	/*
+	 * If the timer is deferrable and nohz is active then we need to use
+	 * the deferrable base.
+	 */
+	if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && base->nohz_active &&
+	    (tflags & TIMER_DEFERRABLE))
+		base = this_cpu_ptr(&timer_bases[BASE_DEF]);
+	return base;
+}
+
+static inline struct timer_base *get_timer_base(u32 tflags)
+{
+	return get_timer_cpu_base(tflags, tflags & TIMER_CPUMASK);
+}
+
+static inline struct timer_base *get_target_base(struct timer_base *base,
+						 unsigned tflags)
+{
+#if defined(CONFIG_NO_HZ_COMMON) && defined(CONFIG_SMP)
+	if ((tflags & TIMER_PINNED) || !base->migration_enabled)
+		return get_timer_this_cpu_base(tflags);
+	return get_timer_cpu_base(tflags, get_nohz_timer_target());
+#else
+	return get_timer_this_cpu_base(tflags);
+#endif
+}
+
 /*
- * We are using hashed locking: holding per_cpu(timer_bases).lock
- * means that all timers which are tied to this base via timer->base are
- * locked, and the base itself is locked too.
+ * We are using hashed locking: Holding per_cpu(timer_bases[x]).lock means
+ * that all timers which are tied to this base are locked, and the base itself
+ * is locked too.
  *
  * So __run_timers/migrate_timers can safely modify all timers which could
- * be found on ->tvX lists.
+ * be found in the base->vectors array.
  *
- * When the timer's base is locked and removed from the list, the
- * TIMER_MIGRATING flag is set, FIXME
+ * When a timer is migrating then the TIMER_MIGRATING flag is set and we need
+ * to wait until the migration is done.
  */
 static struct timer_base *lock_timer_base(struct timer_list *timer,
-					unsigned long *flags)
+					  unsigned long *flags)
 	__acquires(timer->base->lock)
 {
 	for (;;) {
-		u32 tf = timer->flags;
 		struct timer_base *base;
+		u32 tf = timer->flags;
 
 		if (!(tf & TIMER_MIGRATING)) {
-			base = per_cpu_ptr(&timer_bases, tf & TIMER_CPUMASK);
+			base = get_timer_base(tf);
 			spin_lock_irqsave(&base->lock, *flags);
 			if (timer->flags == tf)
 				return base;
@@ -770,6 +912,27 @@ static inline int
 	unsigned long flags;
 	int ret = 0;
 
+	/*
+	 * TODO: Calculate the array bucket of the timer right here w/o
+	 * holding the base lock. This allows to check not only
+	 * timer->expires == expires below, but also whether the timer
+	 * ends up in the same bucket. If we really need to requeue
+	 * the timer then we check whether base->clk have
+	 * advanced between here and locking the timer base. If
+	 * jiffies advanced we have to recalc the array bucket with the
+	 * lock held.
+	 */
+
+	/*
+	 * This is a common optimization triggered by the
+	 * networking code - if the timer is re-modified
+	 * to be the same thing then just return:
+	 */
+	if (timer_pending(timer)) {
+		if (timer->expires == expires)
+			return 1;
+	}
+
 	timer_stats_timer_set_start_info(timer);
 	BUG_ON(!timer->function);
 
@@ -781,15 +944,15 @@ static inline int
 
 	debug_activate(timer, expires);
 
-	new_base = get_target_base(base, timer->flags & TIMER_PINNED);
+	new_base = get_target_base(base, timer->flags);
 
 	if (base != new_base) {
 		/*
-		 * We are trying to schedule the timer on the local CPU.
+		 * We are trying to schedule the timer on the new base.
 		 * However we can't change timer's base while it is running,
 		 * otherwise del_timer_sync() can't detect that the timer's
-		 * handler yet has not finished. This also guarantees that
-		 * the timer is serialized wrt itself.
+		 * handler yet has not finished. This also guarantees that the
+		 * timer is serialized wrt itself.
 		 */
 		if (likely(base->running_timer != timer)) {
 			/* See the comment in lock_timer_base() */
@@ -828,45 +991,6 @@ int mod_timer_pending(struct timer_list
 }
 EXPORT_SYMBOL(mod_timer_pending);
 
-/*
- * Decide where to put the timer while taking the slack into account
- *
- * Algorithm:
- *   1) calculate the maximum (absolute) time
- *   2) calculate the highest bit where the expires and new max are different
- *   3) use this bit to make a mask
- *   4) use the bitmask to round down the maximum time, so that all last
- *      bits are zeros
- */
-static inline
-unsigned long apply_slack(struct timer_list *timer, unsigned long expires)
-{
-	unsigned long expires_limit, mask;
-	int bit;
-
-	if (timer->slack >= 0) {
-		expires_limit = expires + timer->slack;
-	} else {
-		long delta = expires - jiffies;
-
-		if (delta < 256)
-			return expires;
-
-		expires_limit = expires + delta / 256;
-	}
-	mask = expires ^ expires_limit;
-	if (mask == 0)
-		return expires;
-
-	bit = __fls(mask);
-
-	mask = (1UL << bit) - 1;
-
-	expires_limit = expires_limit & ~(mask);
-
-	return expires_limit;
-}
-
 /**
  * mod_timer - modify a timer's timeout
  * @timer: the timer to be modified
@@ -889,16 +1013,6 @@ unsigned long apply_slack(struct timer_l
  */
 int mod_timer(struct timer_list *timer, unsigned long expires)
 {
-	expires = apply_slack(timer, expires);
-
-	/*
-	 * This is a common optimization triggered by the
-	 * networking code - if the timer is re-modified
-	 * to be the same thing then just return:
-	 */
-	if (timer_pending(timer) && timer->expires == expires)
-		return 1;
-
 	return __mod_timer(timer, expires, false);
 }
 EXPORT_SYMBOL(mod_timer);
@@ -933,13 +1047,14 @@ EXPORT_SYMBOL(add_timer);
  */
 void add_timer_on(struct timer_list *timer, int cpu)
 {
-	struct timer_base *new_base = per_cpu_ptr(&timer_bases, cpu);
-	struct timer_base *base;
+	struct timer_base *new_base, *base;
 	unsigned long flags;
 
 	timer_stats_timer_set_start_info(timer);
 	BUG_ON(timer_pending(timer) || !timer->function);
 
+	new_base = get_timer_cpu_base(timer->flags, cpu);
+
 	/*
 	 * If @timer was on a different CPU, it should be migrated with the
 	 * old base locked to prevent other operations proceeding with the
@@ -1085,27 +1200,6 @@ int del_timer_sync(struct timer_list *ti
 EXPORT_SYMBOL(del_timer_sync);
 #endif
 
-static int cascade(struct timer_base *base, struct tvec *tv, int index)
-{
-	/* cascade all the timers from tv up one level */
-	struct timer_list *timer;
-	struct hlist_node *tmp;
-	struct hlist_head tv_list;
-
-	hlist_move_list(tv->vec + index, &tv_list);
-
-	/*
-	 * We are removing _all_ timers from the list, so we
-	 * don't have to detach them individually.
-	 */
-	hlist_for_each_entry_safe(timer, tmp, &tv_list, entry) {
-		/* No accounting, while moving them */
-		__internal_add_timer(base, timer);
-	}
-
-	return index;
-}
-
 static void call_timer_fn(struct timer_list *timer, void (*fn)(unsigned long),
 			  unsigned long data)
 {
@@ -1149,68 +1243,80 @@ static void call_timer_fn(struct timer_l
 	}
 }
 
-#define INDEX(N) ((base->clk >> (TVR_BITS + (N) * TVN_BITS)) & TVN_MASK)
+static void expire_timers(struct timer_base *base, struct hlist_head *head)
+{
+	while (!hlist_empty(head)) {
+		struct timer_list *timer;
+		void (*fn)(unsigned long);
+		unsigned long data;
+
+		timer = hlist_entry(head->first, struct timer_list, entry);
+		timer_stats_account_timer(timer);
+
+		base->running_timer = timer;
+		detach_timer(timer, true);
+
+		fn = timer->function;
+		data = timer->data;
+
+		if (timer->flags & TIMER_IRQSAFE) {
+			spin_unlock(&base->lock);
+			call_timer_fn(timer, fn, data);
+			spin_lock(&base->lock);
+		} else {
+			spin_unlock_irq(&base->lock);
+			call_timer_fn(timer, fn, data);
+			spin_lock_irq(&base->lock);
+		}
+	}
+}
+
+static int collect_expired_timers(struct timer_base *base,
+				  struct hlist_head *heads)
+{
+	unsigned long clk = base->clk;
+	struct hlist_head *vec;
+	int i, levels = 0;
+	unsigned int idx;
+
+	for (i = 0; i < LVL_DEPTH; i++) {
+		idx = (clk & LVL_MASK) + i * LVL_SIZE;
+
+		if (__test_and_clear_bit(idx, base->pending_map)) {
+			vec = base->vectors + idx;
+			hlist_move_list(vec, heads++);
+			levels++;
+		}
+		/* Is it time to look at the next level? */
+		if (clk & LVL_CLK_MASK)
+			break;
+		/* Shift clock for the next level granularity */
+		clk >>= LVL_CLK_SHIFT;
+	}
+	return levels;
+}
 
 /**
  * __run_timers - run all expired timers (if any) on this CPU.
  * @base: the timer vector to be processed.
- *
- * This function cascades all vectors and executes all expired timer
- * vectors.
  */
 static inline void __run_timers(struct timer_base *base)
 {
-	struct timer_list *timer;
+	struct hlist_head heads[LVL_DEPTH];
+	int levels;
+
+	if (!time_after_eq(jiffies, base->clk))
+		return;
 
 	spin_lock_irq(&base->lock);
 
 	while (time_after_eq(jiffies, base->clk)) {
-		struct hlist_head work_list;
-		struct hlist_head *head = &work_list;
-		int index;
 
-		if (!base->all_timers) {
-			base->clk = jiffies;
-			break;
-		}
-
-		index = base->clk & TVR_MASK;
+		levels = collect_expired_timers(base, heads);
+		base->clk++;
 
-		/*
-		 * Cascade timers:
-		 */
-		if (!index &&
-			(!cascade(base, &base->tv2, INDEX(0))) &&
-				(!cascade(base, &base->tv3, INDEX(1))) &&
-					!cascade(base, &base->tv4, INDEX(2)))
-			cascade(base, &base->tv5, INDEX(3));
-		++base->clk;
-		hlist_move_list(base->tv1.vec + index, head);
-		while (!hlist_empty(head)) {
-			void (*fn)(unsigned long);
-			unsigned long data;
-			bool irqsafe;
-
-			timer = hlist_entry(head->first, struct timer_list, entry);
-			fn = timer->function;
-			data = timer->data;
-			irqsafe = timer->flags & TIMER_IRQSAFE;
-
-			timer_stats_account_timer(timer);
-
-			base->running_timer = timer;
-			detach_expired_timer(timer, base);
-
-			if (irqsafe) {
-				spin_unlock(&base->lock);
-				call_timer_fn(timer, fn, data);
-				spin_lock(&base->lock);
-			} else {
-				spin_unlock_irq(&base->lock);
-				call_timer_fn(timer, fn, data);
-				spin_lock_irq(&base->lock);
-			}
-		}
+		while (levels--)
+			expire_timers(base, heads + levels);
 	}
 	base->running_timer = NULL;
 	spin_unlock_irq(&base->lock);
@@ -1218,78 +1324,87 @@ static inline void __run_timers(struct t
 
 #ifdef CONFIG_NO_HZ_COMMON
 /*
- * Find out when the next timer event is due to happen. This
- * is used on S/390 to stop all activity when a CPU is idle.
- * This function needs to be called with interrupts disabled.
+ * Find the next pending bucket of a level. Search from @offset + @clk upwards
+ * and if nothing there, search from start of the level (@offset) up to
+ * @offset + clk.
+ */
+static int next_pending_bucket(struct timer_base *base, unsigned offset,
+			       unsigned clk)
+{
+	unsigned pos, start = offset + clk;
+	unsigned end = offset + LVL_SIZE;
+
+	pos = find_next_bit(base->pending_map, end, start);
+	if (pos < end)
+		return pos - start;
+
+	pos = find_next_bit(base->pending_map, start, offset);
+	return pos < start ? pos + LVL_SIZE - start : -1;
+}
+
+/*
+ * Search the first expiring timer in the various clock levels.
  */
 static unsigned long __next_timer_interrupt(struct timer_base *base)
 {
-	unsigned long clk = base->clk;
-	unsigned long expires = clk + NEXT_TIMER_MAX_DELTA;
-	int index, slot, array, found = 0;
-	struct timer_list *nte;
-	struct tvec *varray[4];
-
-	/* Look for timer events in tv1. */
-	index = slot = clk & TVR_MASK;
-	do {
-		hlist_for_each_entry(nte, base->tv1.vec + slot, entry) {
-			if (nte->flags & TIMER_DEFERRABLE)
-				continue;
-
-			found = 1;
-			expires = nte->expires;
-			/* Look at the cascade bucket(s)? */
-			if (!index || slot < index)
-				goto cascade;
-			return expires;
-		}
-		slot = (slot + 1) & TVR_MASK;
-	} while (slot != index);
+	unsigned long clk, next, adj;
+	unsigned lvl, offset = 0;
 
-cascade:
-	/* Calculate the next cascade event */
-	if (index)
-		clk += TVR_SIZE - index;
-	clk >>= TVR_BITS;
-
-	/* Check tv2-tv5. */
-	varray[0] = &base->tv2;
-	varray[1] = &base->tv3;
-	varray[2] = &base->tv4;
-	varray[3] = &base->tv5;
-
-	for (array = 0; array < 4; array++) {
-		struct tvec *varp = varray[array];
-
-		index = slot = clk & TVN_MASK;
-		do {
-			hlist_for_each_entry(nte, varp->vec + slot, entry) {
-				if (nte->flags & TIMER_DEFERRABLE)
-					continue;
-
-				found = 1;
-				if (time_before(nte->expires, expires))
-					expires = nte->expires;
-			}
-			/*
-			 * Do we still search for the first timer or are
-			 * we looking up the cascade buckets ?
-			 */
-			if (found) {
-				/* Look at the cascade bucket(s)? */
-				if (!index || slot < index)
-					break;
-				return expires;
-			}
-			slot = (slot + 1) & TVN_MASK;
-		} while (slot != index);
-
-		if (index)
-			clk += TVN_SIZE - index;
-		clk >>= TVN_BITS;
+	spin_lock(&base->lock);
+	next = base->clk + NEXT_TIMER_MAX_DELTA;
+	clk = base->clk;
+	for (lvl = 0; lvl < LVL_DEPTH; lvl++, offset += LVL_SIZE) {
+		int pos = next_pending_bucket(base, offset, clk & LVL_MASK);
+
+		if (pos >= 0) {
+			unsigned long tmp = clk + (unsigned long) pos;
+
+			tmp <<= LVL_SHIFT(lvl);
+			if (time_before(tmp, next))
+				next = tmp;
+		}
+		/*
+		 * Clock for the next level. If the current level clock lower
+		 * bits are zero, we look at the next level as is. If not we
+		 * need to advance it by one because that's going to be the
+		 * next expiring bucket in that level. base->clk is the next
+		 * expiring jiffie. So in case of:
+		 *
+		 * LVL5 LVL4 LVL3 LVL2 LVL1 LVL0
+		 *  0    0    0    0    0    0
+		 *
+		 * we have to look at all levels @index 0. With
+		 *
+		 * LVL5 LVL4 LVL3 LVL2 LVL1 LVL0
+		 *  0    0    0    0    0    2
+		 *
+		 * LVL0 has the next expiring bucket @index 2. The upper
+		 * levels have the next expiring bucket @index 1.
+		 *
+		 * In case that the propagation wraps the next level the same
+		 * rules apply:
+		 *
+		 * LVL5 LVL4 LVL3 LVL2 LVL1 LVL0
+		 *  0    0    0    0    F    2
+		 *
+		 * So after looking at LVL0 we get:
+		 *
+		 * LVL5 LVL4 LVL3 LVL2 LVL1
+		 *  0    0    0    1    0
+		 *
+		 * So no propagation from LVL1 to LVL2 because that happened
+		 * with the add already, but then we need to propagate further
+		 * from LVL2 to LVL3.
+		 *
+		 * So the simple check whether the lower bits of the current
+		 * level are 0 or not is sufficient for all cases.
+		 */
+		adj = clk & LVL_CLK_MASK ? 1 : 0;
+		clk >>= LVL_CLK_SHIFT;
+		clk += adj;
 	}
-	return expires;
+	spin_unlock(&base->lock);
+	return next;
 }
 
 /*
@@ -1335,7 +1450,7 @@ static u64 cmp_next_hrtimer_event(u64 ba
  */
 u64 get_next_timer_interrupt(unsigned long basej, u64 basem)
 {
-	struct timer_base *base = this_cpu_ptr(&timer_bases);
+	struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]);
 	u64 expires = KTIME_MAX;
 	unsigned long nextevt;
 
@@ -1346,17 +1461,11 @@ u64 get_next_timer_interrupt(unsigned lo
 	if (cpu_is_offline(smp_processor_id()))
 		return expires;
 
-	spin_lock(&base->lock);
-	if (base->active_timers) {
-		if (time_before_eq(base->next_timer, base->clk))
-			base->next_timer = __next_timer_interrupt(base);
-		nextevt = base->next_timer;
-		if (time_before_eq(nextevt, basej))
-			expires = basem;
-		else
-			expires = basem + (nextevt - basej) * TICK_NSEC;
-	}
-	spin_unlock(&base->lock);
+	nextevt = __next_timer_interrupt(base);
+	if (time_before_eq(nextevt, basej))
+		expires = basem;
+	else
+		expires = basem + (nextevt - basej) * TICK_NSEC;
 
 	return cmp_next_hrtimer_event(basem, expires);
 }
@@ -1387,10 +1496,11 @@ void update_process_times(int user_tick)
  */
 static void run_timer_softirq(struct softirq_action *h)
 {
-	struct timer_base *base = this_cpu_ptr(&timer_bases);
+	struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]);
 
-	if (time_after_eq(jiffies, base->clk))
-		__run_timers(base);
+	__run_timers(base);
+	if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && base->nohz_active)
+		__run_timers(this_cpu_ptr(&timer_bases[BASE_DEF]));
 }
 
 /*
@@ -1541,7 +1651,6 @@ static void migrate_timer_list(struct ti
 
 	while (!hlist_empty(head)) {
 		timer = hlist_entry(head->first, struct timer_list, entry);
-		/* We ignore the accounting on the dying cpu */
 		detach_timer(timer, false);
 		timer->flags = (timer->flags & ~TIMER_BASEMASK) | cpu;
 		internal_add_timer(new_base, timer);
@@ -1552,35 +1661,29 @@ static void migrate_timers(int cpu)
 {
 	struct timer_base *old_base;
 	struct timer_base *new_base;
-	int i;
+	int b, i;
 
 	BUG_ON(cpu_online(cpu));
-	old_base = per_cpu_ptr(&timer_bases, cpu);
-	new_base = get_cpu_ptr(&timer_bases);
-	/*
-	 * The caller is globally serialized and nobody else
-	 * takes two locks at once, deadlock is not possible.
-	 */
-	spin_lock_irq(&new_base->lock);
-	spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING);
 
-	BUG_ON(old_base->running_timer);
+	for (b = 0; b < NR_BASES; b++) {
+		old_base = per_cpu_ptr(&timer_bases[b], cpu);
+		new_base = get_cpu_ptr(&timer_bases[b]);
+		/*
+		 * The caller is globally serialized and nobody else
+		 * takes two locks at once, deadlock is not possible.
+		 */
+		spin_lock_irq(&new_base->lock);
+		spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING);
+
+		BUG_ON(old_base->running_timer);
+
+		for (i = 0; i < WHEEL_SIZE; i++)
+			migrate_timer_list(new_base, old_base->vectors + i);
 
-	for (i = 0; i < TVR_SIZE; i++)
-		migrate_timer_list(new_base, old_base->tv1.vec + i);
-	for (i = 0; i < TVN_SIZE; i++) {
-		migrate_timer_list(new_base, old_base->tv2.vec + i);
-		migrate_timer_list(new_base, old_base->tv3.vec + i);
-		migrate_timer_list(new_base, old_base->tv4.vec + i);
-		migrate_timer_list(new_base, old_base->tv5.vec + i);
-	}
-
-	old_base->active_timers = 0;
-	old_base->all_timers = 0;
-
-	spin_unlock(&old_base->lock);
-	spin_unlock_irq(&new_base->lock);
-	put_cpu_ptr(&timer_bases);
+		spin_unlock(&old_base->lock);
+		spin_unlock_irq(&new_base->lock);
+		put_cpu_ptr(&timer_bases);
+	}
 }
 
 static int timer_cpu_notify(struct notifier_block *self,
@@ -1608,13 +1711,15 @@ static inline void timer_register_cpu_no
 
 static void __init init_timer_cpu(int cpu)
 {
-	struct timer_base *base = per_cpu_ptr(&timer_bases, cpu);
-
-	base->cpu = cpu;
-	spin_lock_init(&base->lock);
+	struct timer_base *base;
+	int i;
 
-	base->clk = jiffies;
-	base->next_timer = base->clk;
+	for (i = 0; i < NR_BASES; i++) {
+		base = per_cpu_ptr(&timer_bases[i], cpu);
+		base->cpu = cpu;
+		spin_lock_init(&base->lock);
+		base->clk = jiffies;
+	}
 }
 
 static void __init init_timer_cpus(void)

^ permalink raw reply	[flat|nested] 30+ messages in thread

* [patch V3 15/22] timer: Remove slack leftovers
  2016-06-24 14:32 [patch V3 00/22] timer: Refactor the timer wheel Thomas Gleixner
                   ` (13 preceding siblings ...)
  2016-06-24 14:32 ` [patch V3 14/22] timer: Switch to a non cascading wheel Thomas Gleixner
@ 2016-06-24 14:32 ` Thomas Gleixner
  2016-06-24 14:32 ` [patch V3 16/22] timer: Move __run_timers() function Thomas Gleixner
                   ` (8 subsequent siblings)
  23 siblings, 0 replies; 30+ messages in thread
From: Thomas Gleixner @ 2016-06-24 14:32 UTC (permalink / raw)
  To: LKML
  Cc: Ingo Molnar, Peter Zijlstra, Paul E. McKenney, Eric Dumazet,
	Frederic Weisbecker, Chris Mason, Arjan van de Ven, rt,
	Rik van Riel, Linus Torvalds, George Spelvin, Len Brown,
	Josh Triplett

[-- Attachment #1: timer_Remove_slack_leftovers.patch --]
[-- Type: text/plain, Size: 5252 bytes --]

We now have implicit batching in the timer wheel. The slack is not longer
used. Remove it.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Chris Mason <clm@fb.com>
Cc: Eric Dumazet <edumazet@google.com>
Cc: rt@linutronix.de
Cc: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Cc: Arjan van de Ven <arjan@infradead.org>


---
 block/genhd.c                   |    5 -----
 drivers/mmc/host/jz4740_mmc.c   |    2 --
 drivers/power/bq27xxx_battery.c |    5 +----
 drivers/usb/host/ohci-hcd.c     |    1 -
 drivers/usb/host/xhci.c         |    2 --
 include/linux/timer.h           |    4 ----
 kernel/time/timer.c             |   19 -------------------
 lib/random32.c                  |    1 -
 8 files changed, 1 insertion(+), 38 deletions(-)

--- a/block/genhd.c
+++ b/block/genhd.c
@@ -1523,12 +1523,7 @@ static void __disk_unblock_events(struct
 	if (--ev->block)
 		goto out_unlock;
 
-	/*
-	 * Not exactly a latency critical operation, set poll timer
-	 * slack to 25% and kick event check.
-	 */
 	intv = disk_events_poll_jiffies(disk);
-	set_timer_slack(&ev->dwork.timer, intv / 4);
 	if (check_now)
 		queue_delayed_work(system_freezable_power_efficient_wq,
 				&ev->dwork, 0);
--- a/drivers/mmc/host/jz4740_mmc.c
+++ b/drivers/mmc/host/jz4740_mmc.c
@@ -1068,8 +1068,6 @@ static int jz4740_mmc_probe(struct platf
 	jz4740_mmc_clock_disable(host);
 	setup_timer(&host->timeout_timer, jz4740_mmc_timeout,
 			(unsigned long)host);
-	/* It is not important when it times out, it just needs to timeout. */
-	set_timer_slack(&host->timeout_timer, HZ);
 
 	host->use_dma = true;
 	if (host->use_dma && jz4740_mmc_acquire_dma_channels(host) != 0)
--- a/drivers/power/bq27xxx_battery.c
+++ b/drivers/power/bq27xxx_battery.c
@@ -735,11 +735,8 @@ static void bq27xxx_battery_poll(struct
 
 	bq27xxx_battery_update(di);
 
-	if (poll_interval > 0) {
-		/* The timer does not have to be accurate. */
-		set_timer_slack(&di->work.timer, poll_interval * HZ / 4);
+	if (poll_interval > 0)
 		schedule_delayed_work(&di->work, poll_interval * HZ);
-	}
 }
 
 /*
--- a/drivers/usb/host/ohci-hcd.c
+++ b/drivers/usb/host/ohci-hcd.c
@@ -500,7 +500,6 @@ static int ohci_init (struct ohci_hcd *o
 
 	setup_timer(&ohci->io_watchdog, io_watchdog_func,
 			(unsigned long) ohci);
-	set_timer_slack(&ohci->io_watchdog, msecs_to_jiffies(20));
 
 	ohci->hcca = dma_alloc_coherent (hcd->self.controller,
 			sizeof(*ohci->hcca), &ohci->hcca_dma, GFP_KERNEL);
--- a/drivers/usb/host/xhci.c
+++ b/drivers/usb/host/xhci.c
@@ -490,8 +490,6 @@ static void compliance_mode_recovery_tim
 	xhci->comp_mode_recovery_timer.expires = jiffies +
 			msecs_to_jiffies(COMP_MODE_RCVRY_MSECS);
 
-	set_timer_slack(&xhci->comp_mode_recovery_timer,
-			msecs_to_jiffies(COMP_MODE_RCVRY_MSECS));
 	add_timer(&xhci->comp_mode_recovery_timer);
 	xhci_dbg_trace(xhci, trace_xhci_dbg_quirks,
 			"Compliance mode recovery timer initialized");
--- a/include/linux/timer.h
+++ b/include/linux/timer.h
@@ -19,7 +19,6 @@ struct timer_list {
 	void			(*function)(unsigned long);
 	unsigned long		data;
 	u32			flags;
-	int			slack;
 
 #ifdef CONFIG_TIMER_STATS
 	int			start_pid;
@@ -73,7 +72,6 @@ struct timer_list {
 		.expires = (_expires),				\
 		.data = (_data),				\
 		.flags = (_flags),				\
-		.slack = -1,					\
 		__TIMER_LOCKDEP_MAP_INITIALIZER(		\
 			__FILE__ ":" __stringify(__LINE__))	\
 	}
@@ -193,8 +191,6 @@ extern int del_timer(struct timer_list *
 extern int mod_timer(struct timer_list *timer, unsigned long expires);
 extern int mod_timer_pending(struct timer_list *timer, unsigned long expires);
 
-extern void set_timer_slack(struct timer_list *time, int slack_hz);
-
 /*
  * The jiffies value which is added to now, when there is no timer
  * in the timer wheel:
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -447,24 +447,6 @@ unsigned long round_jiffies_up_relative(
 }
 EXPORT_SYMBOL_GPL(round_jiffies_up_relative);
 
-/**
- * set_timer_slack - set the allowed slack for a timer
- * @timer: the timer to be modified
- * @slack_hz: the amount of time (in jiffies) allowed for rounding
- *
- * Set the amount of time, in jiffies, that a certain timer has
- * in terms of slack. By setting this value, the timer subsystem
- * will schedule the actual timer somewhere between
- * the time mod_timer() asks for, and that time plus the slack.
- *
- * By setting the slack to -1, a percentage of the delay is used
- * instead.
- */
-void set_timer_slack(struct timer_list *timer, int slack_hz)
-{
-	timer->slack = slack_hz;
-}
-EXPORT_SYMBOL_GPL(set_timer_slack);
 
 static inline unsigned int timer_get_idx(struct timer_list *timer)
 {
@@ -775,7 +757,6 @@ static void do_init_timer(struct timer_l
 {
 	timer->entry.pprev = NULL;
 	timer->flags = flags | raw_smp_processor_id();
-	timer->slack = -1;
 #ifdef CONFIG_TIMER_STATS
 	timer->start_site = NULL;
 	timer->start_pid = -1;
--- a/lib/random32.c
+++ b/lib/random32.c
@@ -233,7 +233,6 @@ static void __prandom_timer(unsigned lon
 
 static void __init __prandom_start_seed_timer(void)
 {
-	set_timer_slack(&seed_timer, HZ);
 	seed_timer.expires = jiffies + msecs_to_jiffies(40 * MSEC_PER_SEC);
 	add_timer(&seed_timer);
 }

^ permalink raw reply	[flat|nested] 30+ messages in thread

* [patch V3 16/22] timer: Move __run_timers() function
  2016-06-24 14:32 [patch V3 00/22] timer: Refactor the timer wheel Thomas Gleixner
                   ` (14 preceding siblings ...)
  2016-06-24 14:32 ` [patch V3 15/22] timer: Remove slack leftovers Thomas Gleixner
@ 2016-06-24 14:32 ` Thomas Gleixner
  2016-06-24 14:32 ` [patch V3 17/22] timer: Optimize collect timers for NOHZ Thomas Gleixner
                   ` (7 subsequent siblings)
  23 siblings, 0 replies; 30+ messages in thread
From: Thomas Gleixner @ 2016-06-24 14:32 UTC (permalink / raw)
  To: LKML
  Cc: Ingo Molnar, Peter Zijlstra, Paul E. McKenney, Eric Dumazet,
	Frederic Weisbecker, Chris Mason, Arjan van de Ven, rt,
	Rik van Riel, Linus Torvalds, George Spelvin, Len Brown,
	Josh Triplett, Anna-Maria Gleixner

[-- Attachment #1: timer_Move___run_timers_function.patch --]
[-- Type: text/plain, Size: 2293 bytes --]

From: Anna-Maria Gleixner <anna-maria@linutronix.de>

Move __run_timers() below __next_timer_interrupt() and next_pending_bucket()
in preparation for __run_timers() NOHZ optimization.

No functional change.

Signed-off-by: Anna-Maria Gleixner <anna-maria@linutronix.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Chris Mason <clm@fb.com>
Cc: Eric Dumazet <edumazet@google.com>
Cc: rt@linutronix.de
Cc: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Cc: Arjan van de Ven <arjan@infradead.org>

---
 kernel/time/timer.c |   52 ++++++++++++++++++++++++++--------------------------
 1 file changed, 26 insertions(+), 26 deletions(-)

--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -1277,32 +1277,6 @@ static int collect_expired_timers(struct
 	return levels;
 }
 
-/**
- * __run_timers - run all expired timers (if any) on this CPU.
- * @base: the timer vector to be processed.
- */
-static inline void __run_timers(struct timer_base *base)
-{
-	struct hlist_head heads[LVL_DEPTH];
-	int levels;
-
-	if (!time_after_eq(jiffies, base->clk))
-		return;
-
-	spin_lock_irq(&base->lock);
-
-	while (time_after_eq(jiffies, base->clk)) {
-
-		levels = collect_expired_timers(base, heads);
-		base->clk++;
-
-		while (levels--)
-			expire_timers(base, heads + levels);
-	}
-	base->running_timer = NULL;
-	spin_unlock_irq(&base->lock);
-}
-
 #ifdef CONFIG_NO_HZ_COMMON
 /*
  * Find the next pending bucket of a level. Search from @offset + @clk upwards
@@ -1472,6 +1446,32 @@ void update_process_times(int user_tick)
 	run_posix_cpu_timers(p);
 }
 
+/**
+ * __run_timers - run all expired timers (if any) on this CPU.
+ * @base: the timer vector to be processed.
+ */
+static inline void __run_timers(struct timer_base *base)
+{
+	struct hlist_head heads[LVL_DEPTH];
+	int levels;
+
+	if (!time_after_eq(jiffies, base->clk))
+		return;
+
+	spin_lock_irq(&base->lock);
+
+	while (time_after_eq(jiffies, base->clk)) {
+
+		levels = collect_expired_timers(base, heads);
+		base->clk++;
+
+		while (levels--)
+			expire_timers(base, heads + levels);
+	}
+	base->running_timer = NULL;
+	spin_unlock_irq(&base->lock);
+}
+
 /*
  * This function runs timers and the timer-tq in bottom half context.
  */

^ permalink raw reply	[flat|nested] 30+ messages in thread

* [patch V3 17/22] timer: Optimize collect timers for NOHZ
  2016-06-24 14:32 [patch V3 00/22] timer: Refactor the timer wheel Thomas Gleixner
                   ` (15 preceding siblings ...)
  2016-06-24 14:32 ` [patch V3 16/22] timer: Move __run_timers() function Thomas Gleixner
@ 2016-06-24 14:32 ` Thomas Gleixner
  2016-06-24 14:32 ` [patch V3 18/22] tick/sched: Remove pointless empty function Thomas Gleixner
                   ` (6 subsequent siblings)
  23 siblings, 0 replies; 30+ messages in thread
From: Thomas Gleixner @ 2016-06-24 14:32 UTC (permalink / raw)
  To: LKML
  Cc: Ingo Molnar, Peter Zijlstra, Paul E. McKenney, Eric Dumazet,
	Frederic Weisbecker, Chris Mason, Arjan van de Ven, rt,
	Rik van Riel, Linus Torvalds, George Spelvin, Len Brown,
	Josh Triplett, Anna-Maria Gleixner

[-- Attachment #1: timer_Optimize_collect_timers_for_NOHZ.patch --]
[-- Type: text/plain, Size: 3984 bytes --]

From: Anna-Maria Gleixner <anna-maria@linutronix.de>

After a NOHZ idle sleep the wheel must be forwarded to current jiffies. There
might be expired timers so the current code loops and checks the epxired
buckets for timers. This can take quite some time for long NOHZ idle periods.

The pending bitmask in the timer base allows us to do a quick search for the
next expiring timer and therefor a fast forward of the base time which
prevents pointless long lasting loops.

For a 3 second idle sleep this reduces the catchup time from ~1ms to 5us.

Signed-off-by: Anna-Maria Gleixner <anna-maria@linutronix.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Chris Mason <clm@fb.com>
Cc: Eric Dumazet <edumazet@google.com>
Cc: rt@linutronix.de
Cc: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Cc: Arjan van de Ven <arjan@infradead.org>

---
 kernel/time/timer.c |   49 +++++++++++++++++++++++++++++++++++++++++--------
 1 file changed, 41 insertions(+), 8 deletions(-)

--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -1252,8 +1252,8 @@ static void expire_timers(struct timer_b
 	}
 }
 
-static int collect_expired_timers(struct timer_base *base,
-				  struct hlist_head *heads)
+static int __collect_expired_timers(struct timer_base *base,
+				    struct hlist_head *heads)
 {
 	unsigned long clk = base->clk;
 	struct hlist_head *vec;
@@ -1279,9 +1279,9 @@ static int collect_expired_timers(struct
 
 #ifdef CONFIG_NO_HZ_COMMON
 /*
- * Find the next pending bucket of a level. Search from @offset + @clk upwards
- * and if nothing there, search from start of the level (@offset) up to
- * @offset + clk.
+ * Find the next pending bucket of a level. Search from level start (@offset)
+ * + @clk upwards and if nothing there, search from start of the level
+ * (@offset) up to @offset + clk.
  */
 static int next_pending_bucket(struct timer_base *base, unsigned offset,
 			       unsigned clk)
@@ -1298,14 +1298,14 @@ static int next_pending_bucket(struct ti
 }
 
 /*
- * Search the first expiring timer in the various clock levels.
+ * Search the first expiring timer in the various clock levels. Caller must
+ * hold base->lock.
  */
 static unsigned long __next_timer_interrupt(struct timer_base *base)
 {
 	unsigned long clk, next, adj;
 	unsigned lvl, offset = 0;
 
-	spin_lock(&base->lock);
 	next = base->clk + NEXT_TIMER_MAX_DELTA;
 	clk = base->clk;
 	for (lvl = 0; lvl < LVL_DEPTH; lvl++, offset += LVL_SIZE) {
@@ -1358,7 +1358,6 @@ static unsigned long __next_timer_interr
 		clk >>= LVL_CLK_SHIFT;
 		clk += adj;
 	}
-	spin_unlock(&base->lock);
 	return next;
 }
 
@@ -1416,7 +1415,10 @@ u64 get_next_timer_interrupt(unsigned lo
 	if (cpu_is_offline(smp_processor_id()))
 		return expires;
 
+	spin_lock(&base->lock);
 	nextevt = __next_timer_interrupt(base);
+	spin_unlock(&base->lock);
+
 	if (time_before_eq(nextevt, basej))
 		expires = basem;
 	else
@@ -1424,6 +1426,37 @@ u64 get_next_timer_interrupt(unsigned lo
 
 	return cmp_next_hrtimer_event(basem, expires);
 }
+
+static int collect_expired_timers(struct timer_base *base,
+				  struct hlist_head *heads)
+{
+	/*
+	 * NOHZ optimization. After a long idle sleep we need to forward the
+	 * base to current jiffies. Avoid a loop by searching the bitfield for
+	 * the next expiring timer.
+	 */
+	if ((long)(jiffies - base->clk) > 2) {
+		unsigned long next = __next_timer_interrupt(base);
+
+		/*
+		 * If the next timer is ahead of time forward to current
+		 * jiffies, otherwise forward to the next expiry time.
+		 */
+		if (time_after(next, jiffies)) {
+			/* The call site will increment clock! */
+			base->clk = jiffies - 1;
+			return 0;
+		}
+		base->clk = next;
+	}
+	return __collect_expired_timers(base, heads);
+}
+#else
+static inline int collect_expired_timers(struct timer_base *base,
+					 struct hlist_head *heads)
+{
+	return __collect_expired_timers(base, heads);
+}
 #endif
 
 /*

^ permalink raw reply	[flat|nested] 30+ messages in thread

* [patch V3 18/22] tick/sched: Remove pointless empty function
  2016-06-24 14:32 [patch V3 00/22] timer: Refactor the timer wheel Thomas Gleixner
                   ` (16 preceding siblings ...)
  2016-06-24 14:32 ` [patch V3 17/22] timer: Optimize collect timers for NOHZ Thomas Gleixner
@ 2016-06-24 14:32 ` Thomas Gleixner
  2016-06-24 14:32 ` [patch V3 19/22] timer: Forward wheel clock whenever possible Thomas Gleixner
                   ` (5 subsequent siblings)
  23 siblings, 0 replies; 30+ messages in thread
From: Thomas Gleixner @ 2016-06-24 14:32 UTC (permalink / raw)
  To: LKML
  Cc: Ingo Molnar, Peter Zijlstra, Paul E. McKenney, Eric Dumazet,
	Frederic Weisbecker, Chris Mason, Arjan van de Ven, rt,
	Rik van Riel, Linus Torvalds, George Spelvin, Len Brown,
	Josh Triplett

[-- Attachment #1: ticksched_Remove_pointless_empty_function.patch --]
[-- Type: text/plain, Size: 2021 bytes --]

This was a failed attempt to optimize the timer expiry in idle, which was
disabled and never revisited. Remove the cruft.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Chris Mason <clm@fb.com>
Cc: Eric Dumazet <edumazet@google.com>
Cc: rt@linutronix.de
Cc: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Cc: Arjan van de Ven <arjan@infradead.org>

---
 kernel/time/tick-sched.c |   33 +--------------------------------
 1 file changed, 1 insertion(+), 32 deletions(-)

--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -1092,35 +1092,6 @@ static void tick_nohz_switch_to_nohz(voi
 	tick_nohz_activate(ts, NOHZ_MODE_LOWRES);
 }
 
-/*
- * When NOHZ is enabled and the tick is stopped, we need to kick the
- * tick timer from irq_enter() so that the jiffies update is kept
- * alive during long running softirqs. That's ugly as hell, but
- * correctness is key even if we need to fix the offending softirq in
- * the first place.
- *
- * Note, this is different to tick_nohz_restart. We just kick the
- * timer and do not touch the other magic bits which need to be done
- * when idle is left.
- */
-static void tick_nohz_kick_tick(struct tick_sched *ts, ktime_t now)
-{
-#if 0
-	/* Switch back to 2.6.27 behaviour */
-	ktime_t delta;
-
-	/*
-	 * Do not touch the tick device, when the next expiry is either
-	 * already reached or less/equal than the tick period.
-	 */
-	delta =	ktime_sub(hrtimer_get_expires(&ts->sched_timer), now);
-	if (delta.tv64 <= tick_period.tv64)
-		return;
-
-	tick_nohz_restart(ts, now);
-#endif
-}
-
 static inline void tick_nohz_irq_enter(void)
 {
 	struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
@@ -1131,10 +1102,8 @@ static inline void tick_nohz_irq_enter(v
 	now = ktime_get();
 	if (ts->idle_active)
 		tick_nohz_stop_idle(ts, now);
-	if (ts->tick_stopped) {
+	if (ts->tick_stopped)
 		tick_nohz_update_jiffies(now);
-		tick_nohz_kick_tick(ts, now);
-	}
 }
 
 #else

^ permalink raw reply	[flat|nested] 30+ messages in thread

* [patch V3 19/22] timer: Forward wheel clock whenever possible
  2016-06-24 14:32 [patch V3 00/22] timer: Refactor the timer wheel Thomas Gleixner
                   ` (17 preceding siblings ...)
  2016-06-24 14:32 ` [patch V3 18/22] tick/sched: Remove pointless empty function Thomas Gleixner
@ 2016-06-24 14:32 ` Thomas Gleixner
  2016-06-24 20:45   ` Richard Cochran
  2016-06-24 14:32 ` [patch V3 20/22] timer: Only wake softirq if necessary Thomas Gleixner
                   ` (4 subsequent siblings)
  23 siblings, 1 reply; 30+ messages in thread
From: Thomas Gleixner @ 2016-06-24 14:32 UTC (permalink / raw)
  To: LKML
  Cc: Ingo Molnar, Peter Zijlstra, Paul E. McKenney, Eric Dumazet,
	Frederic Weisbecker, Chris Mason, Arjan van de Ven, rt,
	Rik van Riel, Linus Torvalds, George Spelvin, Len Brown,
	Josh Triplett

[-- Attachment #1: timer_Forward_wheel_clock_whenever_possible.patch --]
[-- Type: text/plain, Size: 7485 bytes --]

The wheel clock is stale when a cpu goes into a long idle sleep. This has the
side effect, that timers which are queued end up in the outer wheel
levels. That results in coarser granularity.

To solve this, we keep track of the idle state and forward the wheel clock
whenever it's possible.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Chris Mason <clm@fb.com>
Cc: Eric Dumazet <edumazet@google.com>
Cc: rt@linutronix.de
Cc: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Cc: Arjan van de Ven <arjan@infradead.org>

---
 kernel/time/tick-internal.h |    1 
 kernel/time/tick-sched.c    |   13 ++++
 kernel/time/timer.c         |  123 +++++++++++++++++++++++++++++++++++---------
 3 files changed, 114 insertions(+), 23 deletions(-)

--- a/kernel/time/tick-internal.h
+++ b/kernel/time/tick-internal.h
@@ -164,3 +164,4 @@ static inline void timers_update_migrati
 DECLARE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases);
 
 extern u64 get_next_timer_interrupt(unsigned long basej, u64 basem);
+void timer_clear_idle(void);
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -700,6 +700,12 @@ static ktime_t tick_nohz_stop_sched_tick
 	delta = next_tick - basemono;
 	if (delta <= (u64)TICK_NSEC) {
 		tick.tv64 = 0;
+
+		/*
+		 * Tell the timer code that the base is not idle, i.e. undo
+		 * the effect of get_next_timer_interrupt().
+		 */
+		timer_clear_idle();
 		/*
 		 * We've not stopped the tick yet, and there's a timer in the
 		 * next period, so no point in stopping it either, bail.
@@ -809,6 +815,12 @@ static void tick_nohz_restart_sched_tick
 	tick_do_update_jiffies64(now);
 	cpu_load_update_nohz_stop();
 
+	/*
+	 * Clear the timer idle flag, so we avoid IPIs on remote queueing and
+	 * the clock forward checks in the enqueue path.
+	 */
+	timer_clear_idle();
+
 	calc_load_exit_idle();
 	touch_softlockup_watchdog_sched();
 	/*
@@ -1025,6 +1037,7 @@ void tick_nohz_idle_exit(void)
 		tick_nohz_stop_idle(ts, now);
 
 	if (ts->tick_stopped) {
+		timer_clear_idle();
 		tick_nohz_restart_sched_tick(ts, now);
 		tick_nohz_account_idle_ticks(ts);
 	}
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -196,9 +196,11 @@ struct timer_base {
 	spinlock_t		lock;
 	struct timer_list	*running_timer;
 	unsigned long		clk;
+	unsigned long		next_expiry;
 	unsigned int		cpu;
 	bool			migration_enabled;
 	bool			nohz_active;
+	bool			is_idle;
 	DECLARE_BITMAP(pending_map, WHEEL_SIZE);
 	struct hlist_head	vectors[WHEEL_SIZE];
 } ____cacheline_aligned;
@@ -520,23 +522,27 @@ static void internal_add_timer(struct ti
 	__internal_add_timer(base, timer);
 
 	/*
-	 * Check whether the other CPU is in dynticks mode and needs
-	 * to be triggered to reevaluate the timer wheel.  We are
-	 * protected against the other CPU fiddling with the timer by
-	 * holding the timer base lock. This also makes sure that a
-	 * CPU on the way to stop its tick can not evaluate the timer
-	 * wheel.
-	 *
-	 * Spare the IPI for deferrable timers on idle targets though.
-	 * The next busy ticks will take care of it. Except full dynticks
-	 * require special care against races with idle_cpu(), lets deal
-	 * with that later.
-	 */
-	if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && base->nohz_active) {
-		if (!(timer->flags & TIMER_DEFERRABLE) ||
-		    tick_nohz_full_cpu(base->cpu))
-			wake_up_nohz_cpu(base->cpu);
-	}
+	 * We might have to IPI the remote CPU if the base is idle and the
+	 * timer is not deferrable. If the other cpu is on the way to idle
+	 * then it can't set base->is_idle as we hold base lock.
+	 */
+	if (!IS_ENABLED(CONFIG_NO_HZ_COMMON) || !base->is_idle ||
+	    (timer->flags & TIMER_DEFERRABLE))
+		return;
+
+	/* Check whether this is the new first expiring timer */
+	if (time_after_eq(timer->expires, base->next_expiry))
+		return;
+	base->next_expiry = timer->expires;
+
+	/*
+	 * Check whether the other CPU is in dynticks mode and needs to be
+	 * triggered to reevaluate the timer wheel.  We are protected against
+	 * the other CPU fiddling with the timer by holding the timer base
+	 * lock.
+	 */
+	if (tick_nohz_full_cpu(base->cpu))
+		wake_up_nohz_cpu(base->cpu);
 }
 
 #ifdef CONFIG_TIMER_STATS
@@ -844,10 +850,11 @@ static inline struct timer_base *get_tim
 	return get_timer_cpu_base(tflags, tflags & TIMER_CPUMASK);
 }
 
-static inline struct timer_base *get_target_base(struct timer_base *base,
-						 unsigned tflags)
+#ifdef CONFIG_NO_HZ_COMMON
+static inline struct timer_base *__get_target_base(struct timer_base *base,
+						   unsigned tflags)
 {
-#if defined(CONFIG_NO_HZ_COMMON) && defined(CONFIG_SMP)
+#ifdef CONFIG_SMP
 	if ((tflags & TIMER_PINNED) || !base->migration_enabled)
 		return get_timer_this_cpu_base(tflags);
 	return get_timer_cpu_base(tflags, get_nohz_timer_target());
@@ -856,6 +863,43 @@ static inline struct timer_base *get_tar
 #endif
 }
 
+static inline void forward_timer_base(struct timer_base *base)
+{
+	/*
+	 * We only forward the base when it's idle and we have a delta between
+	 * base clock and jiffies.
+	 */
+	if (!base->is_idle || (long) (jiffies - base->clk) < 2)
+		return;
+
+	/*
+	 * If the next expiry value is > jiffies, then we fast forward to
+	 * jiffies otherwise we forward to the next expiry value.
+	 */
+	if (time_after(base->next_expiry, jiffies))
+		base->clk = jiffies;
+	else
+		base->clk = base->next_expiry;
+}
+#else
+static inline struct timer_base *__get_target_base(struct timer_base *base,
+						   unsigned tflags)
+{
+	return get_timer_this_cpu_base(tflags);
+}
+
+static inline void forward_timer_base(struct timer_base *base) { }
+#endif
+
+static inline struct timer_base *get_target_base(struct timer_base *base,
+						 unsigned tflags)
+{
+	struct timer_base *target = __get_target_base(base, tflags);
+
+	forward_timer_base(target);
+	return target;
+}
+
 /*
  * We are using hashed locking: Holding per_cpu(timer_bases[x]).lock means
  * that all timers which are tied to this base are locked, and the base itself
@@ -1417,16 +1461,49 @@ u64 get_next_timer_interrupt(unsigned lo
 
 	spin_lock(&base->lock);
 	nextevt = __next_timer_interrupt(base);
-	spin_unlock(&base->lock);
+	base->next_expiry = nextevt;
+	/*
+	 * We have a fresh next event. Check whether we can forward the base.
+	 */
+	if (time_after(nextevt, jiffies))
+		base->clk = jiffies;
+	else if (time_after(nextevt, base->clk))
+		base->clk = nextevt;
 
-	if (time_before_eq(nextevt, basej))
+	if (time_before_eq(nextevt, basej)) {
 		expires = basem;
-	else
+		base->is_idle = false;
+	} else {
 		expires = basem + (nextevt - basej) * TICK_NSEC;
+		/*
+		 * If we expect to sleep more than a tick, mark the base idle.
+		 */
+		if ((expires - basem) > TICK_NSEC)
+			base->is_idle = true;
+	}
+	spin_unlock(&base->lock);
 
 	return cmp_next_hrtimer_event(basem, expires);
 }
 
+/**
+ * timer_clear_idle - Clear the idle state of the timer base
+ *
+ * Called with interrupts disabled
+ */
+void timer_clear_idle(void)
+{
+	struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]);
+
+	/*
+	 * We do this unlocked. The worst outcome is a remote enqueue sending
+	 * a pointless IPI, but taking the lock would just make the window for
+	 * sending the IPI a few instructions smaller for the cost of taking
+	 * the lock in the exit from idle path.
+	 */
+	base->is_idle = false;
+}
+
 static int collect_expired_timers(struct timer_base *base,
 				  struct hlist_head *heads)
 {

^ permalink raw reply	[flat|nested] 30+ messages in thread

* [patch V3 20/22] timer: Only wake softirq if necessary
  2016-06-24 14:32 [patch V3 00/22] timer: Refactor the timer wheel Thomas Gleixner
                   ` (18 preceding siblings ...)
  2016-06-24 14:32 ` [patch V3 19/22] timer: Forward wheel clock whenever possible Thomas Gleixner
@ 2016-06-24 14:32 ` Thomas Gleixner
  2016-06-24 14:32 ` [patch V3 21/22] timer: Split out index calculation Thomas Gleixner
                   ` (3 subsequent siblings)
  23 siblings, 0 replies; 30+ messages in thread
From: Thomas Gleixner @ 2016-06-24 14:32 UTC (permalink / raw)
  To: LKML
  Cc: Ingo Molnar, Peter Zijlstra, Paul E. McKenney, Eric Dumazet,
	Frederic Weisbecker, Chris Mason, Arjan van de Ven, rt,
	Rik van Riel, Linus Torvalds, George Spelvin, Len Brown,
	Josh Triplett

[-- Attachment #1: timer--Only-wake-softirq-if-necessary.patch --]
[-- Type: text/plain, Size: 802 bytes --]

With the wheel forwading in place and with the HZ=1000 4ms folding we can
avoid running the softirq at all.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/time/timer.c |   11 +++++++++++
 1 file changed, 11 insertions(+)

--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -1599,7 +1599,18 @@ static void run_timer_softirq(struct sof
  */
 void run_local_timers(void)
 {
+	struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]);
+
 	hrtimer_run_queues();
+	/* Raise the softirq only if required. */
+	if (time_before(jiffies, base->clk)) {
+		if (!IS_ENABLED(CONFIG_NO_HZ_COMMON) || !base->nohz_active)
+			return;
+		/* CPU is awake, so check the deferrable base. */
+		base++;
+		if (time_before(jiffies, base->clk))
+			return;
+	}
 	raise_softirq(TIMER_SOFTIRQ);
 }
 

^ permalink raw reply	[flat|nested] 30+ messages in thread

* [patch V3 21/22] timer: Split out index calculation
  2016-06-24 14:32 [patch V3 00/22] timer: Refactor the timer wheel Thomas Gleixner
                   ` (19 preceding siblings ...)
  2016-06-24 14:32 ` [patch V3 20/22] timer: Only wake softirq if necessary Thomas Gleixner
@ 2016-06-24 14:32 ` Thomas Gleixner
  2016-06-24 14:32 ` [patch V3 22/22] timer: Optimization for same expiry time in mod_timer() Thomas Gleixner
                   ` (2 subsequent siblings)
  23 siblings, 0 replies; 30+ messages in thread
From: Thomas Gleixner @ 2016-06-24 14:32 UTC (permalink / raw)
  To: LKML
  Cc: Ingo Molnar, Peter Zijlstra, Paul E. McKenney, Eric Dumazet,
	Frederic Weisbecker, Chris Mason, Arjan van de Ven, rt,
	Rik van Riel, Linus Torvalds, George Spelvin, Len Brown,
	Josh Triplett, Anna-Maria Gleixner

[-- Attachment #1: timer_Split_out_index_calculation.patch --]
[-- Type: text/plain, Size: 3073 bytes --]

From: Anna-Maria Gleixner <anna-maria@linutronix.de>

For further optimizations we need to seperate index calculation and
queueing. No functional change.

Signed-off-by: Anna-Maria Gleixner <anna-maria@linutronix.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Chris Mason <clm@fb.com>
Cc: Eric Dumazet <edumazet@google.com>
Cc: rt@linutronix.de
Cc: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Cc: Arjan van de Ven <arjan@infradead.org>

---
 kernel/time/timer.c |   45 +++++++++++++++++++++++++++++++--------------
 1 file changed, 31 insertions(+), 14 deletions(-)

--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -471,12 +471,9 @@ static inline unsigned calc_index(unsign
 	return LVL_OFFS(lvl) + (expires & LVL_MASK);
 }
 
-static void
-__internal_add_timer(struct timer_base *base, struct timer_list *timer)
+static int calc_wheel_index(unsigned long expires, unsigned long clk)
 {
-	unsigned long expires = timer->expires;
-	unsigned long delta = expires - base->clk;
-	struct hlist_head *vec;
+	unsigned long delta = expires - clk;
 	unsigned int idx;
 
 	if (delta < LVL_START(1)) {
@@ -496,7 +493,7 @@ static void
 	} else if (LVL_DEPTH > 8 && delta < LVL_START(8)) {
 		idx = calc_index(expires, 7);
 	} else if ((long) delta < 0) {
-		idx = base->clk & LVL_MASK;
+		idx = clk & LVL_MASK;
 	} else {
 		/*
 		 * Force expire obscene large timeouts to expire at the
@@ -507,20 +504,33 @@ static void
 
 		idx = calc_index(expires, LVL_DEPTH - 1);
 	}
-	/*
-	 * Enqueue the timer into the array bucket, mark it pending in
-	 * the bitmap and store the index in the timer flags.
-	 */
-	vec = base->vectors + idx;
-	hlist_add_head(&timer->entry, vec);
+	return idx;
+}
+
+/*
+ * Enqueue the timer into the hash bucket, mark it pending in
+ * the bitmap and store the index in the timer flags.
+ */
+static void enqueue_timer(struct timer_base *base, struct timer_list *timer,
+			  unsigned int idx)
+{
+	hlist_add_head(&timer->entry, base->vectors + idx);
 	__set_bit(idx, base->pending_map);
 	timer_set_idx(timer, idx);
 }
 
-static void internal_add_timer(struct timer_base *base, struct timer_list *timer)
+static void
+__internal_add_timer(struct timer_base *base, struct timer_list *timer)
 {
-	__internal_add_timer(base, timer);
+	unsigned int idx;
+
+	idx = calc_wheel_index(timer->expires, base->clk);
+	enqueue_timer(base, timer, idx);
+}
 
+static void
+trigger_dyntick_cpu(struct timer_base *base, struct timer_list *timer)
+{
 	/*
 	 * We might have to IPI the remote CPU if the base is idle and the
 	 * timer is not deferrable. If the other cpu is on the way to idle
@@ -545,6 +555,13 @@ static void internal_add_timer(struct ti
 		wake_up_nohz_cpu(base->cpu);
 }
 
+static void
+internal_add_timer(struct timer_base *base, struct timer_list *timer)
+{
+	__internal_add_timer(base, timer);
+	trigger_dyntick_cpu(base, timer);
+}
+
 #ifdef CONFIG_TIMER_STATS
 void __timer_stats_timer_set_start_info(struct timer_list *timer, void *addr)
 {

^ permalink raw reply	[flat|nested] 30+ messages in thread

* [patch V3 22/22] timer: Optimization for same expiry time in mod_timer()
  2016-06-24 14:32 [patch V3 00/22] timer: Refactor the timer wheel Thomas Gleixner
                   ` (20 preceding siblings ...)
  2016-06-24 14:32 ` [patch V3 21/22] timer: Split out index calculation Thomas Gleixner
@ 2016-06-24 14:32 ` Thomas Gleixner
  2016-06-24 17:04 ` [patch V3 00/22] timer: Refactor the timer wheel Josh Triplett
  2016-06-27  4:22 ` Paul E. McKenney
  23 siblings, 0 replies; 30+ messages in thread
From: Thomas Gleixner @ 2016-06-24 14:32 UTC (permalink / raw)
  To: LKML
  Cc: Ingo Molnar, Peter Zijlstra, Paul E. McKenney, Eric Dumazet,
	Frederic Weisbecker, Chris Mason, Arjan van de Ven, rt,
	Rik van Riel, Linus Torvalds, George Spelvin, Len Brown,
	Josh Triplett, Anna-Maria Gleixner

[-- Attachment #1: timer_Optimization_for_same_expiry_time_in_mod_timer.patch --]
[-- Type: text/plain, Size: 4535 bytes --]

From: Anna-Maria Gleixner <anna-maria@linutronix.de>

The existing optimization for same expiry time in mod_timer() checks whether
the timer expiry time is the same as the new requested expiry time. In the old
timer wheel implementation this does not take the slack batching into account,
neither does the new implementation evaluate whether the new expiry time will
requeue the timer to the same bucket.

To optimize that, we can calculate the resulting bucket and check if the new
expiry time is different from the current expiry time. This calculation
happens outside the base lock held region. If the resulting bucket is the same
we can avoid taking the base lock and requeueing the timer.

If the timer needs to be requeued then we have to check under the base lock
whether the base time has changed between the lockless calculation and taking
the lock. If it has changed we need to recalculate under the lock.

This optimization takes effect for timers which are enqueued into the less
granular wheel levels (1 and above). With a simple test case the functionality
has been verified:

    	    Before	After
Match:	     5.5%	86.6%
Requeue:    94.5%	13.4%
Recalc:  		<0.01%

In the non optimized case the timer is requeued in 94.5% of the cases. With
the index optimization in place the requeue rate drops to 13.4%. The case
where the lockless index calculation has to be redone is less than 0.01%.

With a real world test case (networking) we observed the following changes:

    	    Before	After
Match:	    97.8%	99.7%
Requeue:     2.2%	 0.3%
Recalc:  		<0.001%

That means two percent less lock/requeue/unlock operations in one of the hot
path use cases of timers.


Signed-off-by: Anna-Maria Gleixner <anna-maria@linutronix.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Chris Mason <clm@fb.com>
Cc: Eric Dumazet <edumazet@google.com>
Cc: rt@linutronix.de
Cc: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Cc: Arjan van de Ven <arjan@infradead.org>

---
 kernel/time/timer.c |   51 +++++++++++++++++++++++++++++++++++----------------
 1 file changed, 35 insertions(+), 16 deletions(-)

--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -951,28 +951,36 @@ static inline int
 __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only)
 {
 	struct timer_base *base, *new_base;
-	unsigned long flags;
+	unsigned int idx = UINT_MAX;
+	unsigned long clk = 0, flags;
 	int ret = 0;
 
 	/*
-	 * TODO: Calculate the array bucket of the timer right here w/o
-	 * holding the base lock. This allows to check not only
-	 * timer->expires == expires below, but also whether the timer
-	 * ends up in the same bucket. If we really need to requeue
-	 * the timer then we check whether base->clk have
-	 * advanced between here and locking the timer base. If
-	 * jiffies advanced we have to recalc the array bucket with the
-	 * lock held.
-	 */
-
-	/*
-	 * This is a common optimization triggered by the
-	 * networking code - if the timer is re-modified
-	 * to be the same thing then just return:
+	 * This is a common optimization triggered by the networking code - if
+	 * the timer is re-modified to be the same thing or ends up in the
+	 * same array bucket then just return:
 	 */
 	if (timer_pending(timer)) {
 		if (timer->expires == expires)
 			return 1;
+		/*
+		 * Take the current timer_jiffies of base, but without holding
+		 * the lock!
+		 */
+		base = get_timer_base(timer->flags);
+		clk = base->clk;
+
+		idx = calc_wheel_index(expires, clk);
+
+		/*
+		 * Retrieve and compare the array index of the pending
+		 * timer. If it matches set the expiry to the new value so a
+		 * subsequent call will exit in the expires check above.
+		 */
+		if (idx == timer_get_idx(timer)) {
+			timer->expires = expires;
+			return 1;
+		}
 	}
 
 	timer_stats_timer_set_start_info(timer);
@@ -1009,7 +1017,18 @@ static inline int
 	}
 
 	timer->expires = expires;
-	internal_add_timer(base, timer);
+	/*
+	 * If idx was calculated above and the base time did not advance
+	 * between calculating idx and taking the lock, only enqueue_timer()
+	 * and trigger_dyntick_cpu() is required. Otherwise we need to
+	 * (re)calculate the wheel index via internal_add_timer().
+	 */
+	if (idx != UINT_MAX && clk == base->clk) {
+		enqueue_timer(base, timer, idx);
+		trigger_dyntick_cpu(base, timer);
+	} else {
+		internal_add_timer(base, timer);
+	}
 
 out_unlock:
 	spin_unlock_irqrestore(&base->lock, flags);

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [patch V3 00/22] timer: Refactor the timer wheel
  2016-06-24 14:32 [patch V3 00/22] timer: Refactor the timer wheel Thomas Gleixner
                   ` (21 preceding siblings ...)
  2016-06-24 14:32 ` [patch V3 22/22] timer: Optimization for same expiry time in mod_timer() Thomas Gleixner
@ 2016-06-24 17:04 ` Josh Triplett
  2016-06-27  4:22 ` Paul E. McKenney
  23 siblings, 0 replies; 30+ messages in thread
From: Josh Triplett @ 2016-06-24 17:04 UTC (permalink / raw)
  To: Thomas Gleixner
  Cc: LKML, Ingo Molnar, Peter Zijlstra, Paul E. McKenney,
	Eric Dumazet, Frederic Weisbecker, Chris Mason, Arjan van de Ven,
	rt, Rik van Riel, Linus Torvalds, George Spelvin, Len Brown

On Fri, Jun 24, 2016 at 02:32:00PM -0000, Thomas Gleixner wrote:
>  - Removed the 1000Hz granularity reduction to 4ms. Eric explained that
>    datacenter workloads require the granularity in the first level wheel.
> 
>  - Fixed the typo in tilepro. 
> 
>  - Converted sigtimedwait() to hrtimers
> 
>  - To avoid the cascading I extended the wheel by another level. This removes
>    the rarely executed cascading code path, but increases the storage size for
>    HZ>100 slightly. If the tiny folks care, there is an simple option to cut
>    the storage size in half for the price of reduced granularity.

No objection here.  This series is a massive improvement for low-power
systems, and the tiniest kernels will probably need to poke at this even
more aggressively anyway.  (For instance, compiling out connection
tracking and its incredibly long timer, and then removing the bucket it
needs.)

- Josh Triplett

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [patch V3 19/22] timer: Forward wheel clock whenever possible
  2016-06-24 14:32 ` [patch V3 19/22] timer: Forward wheel clock whenever possible Thomas Gleixner
@ 2016-06-24 20:45   ` Richard Cochran
  2016-06-25  8:55     ` Thomas Gleixner
  0 siblings, 1 reply; 30+ messages in thread
From: Richard Cochran @ 2016-06-24 20:45 UTC (permalink / raw)
  To: Thomas Gleixner
  Cc: LKML, Ingo Molnar, Peter Zijlstra, Paul E. McKenney,
	Eric Dumazet, Frederic Weisbecker, Chris Mason, Arjan van de Ven,
	rt, Rik van Riel, Linus Torvalds, George Spelvin, Len Brown,
	Josh Triplett

On Fri, Jun 24, 2016 at 02:32:19PM -0000, Thomas Gleixner wrote:
> @@ -809,6 +815,12 @@ static void tick_nohz_restart_sched_tick
>  	tick_do_update_jiffies64(now);
>  	cpu_load_update_nohz_stop();
>  
> +	/*
> +	 * Clear the timer idle flag, so we avoid IPIs on remote queueing and
> +	 * the clock forward checks in the enqueue path.
> +	 */
> +	timer_clear_idle();

Since function calls timer_clear_idle() already ...

>  	calc_load_exit_idle();
>  	touch_softlockup_watchdog_sched();
>  	/*
> @@ -1025,6 +1037,7 @@ void tick_nohz_idle_exit(void)
>  		tick_nohz_stop_idle(ts, now);
>  
>  	if (ts->tick_stopped) {
> +		timer_clear_idle();
>  		tick_nohz_restart_sched_tick(ts, now);
>  		tick_nohz_account_idle_ticks(ts);
>  	}

... there is no need to call timer_clear_idle() here.

Thanks,
Richard

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [patch V3 19/22] timer: Forward wheel clock whenever possible
  2016-06-24 20:45   ` Richard Cochran
@ 2016-06-25  8:55     ` Thomas Gleixner
  0 siblings, 0 replies; 30+ messages in thread
From: Thomas Gleixner @ 2016-06-25  8:55 UTC (permalink / raw)
  To: Richard Cochran
  Cc: LKML, Ingo Molnar, Peter Zijlstra, Paul E. McKenney,
	Eric Dumazet, Frederic Weisbecker, Chris Mason, Arjan van de Ven,
	rt, Rik van Riel, Linus Torvalds, George Spelvin, Len Brown,
	Josh Triplett

On Fri, 24 Jun 2016, Richard Cochran wrote:

> On Fri, Jun 24, 2016 at 02:32:19PM -0000, Thomas Gleixner wrote:
> > @@ -809,6 +815,12 @@ static void tick_nohz_restart_sched_tick
> >  	tick_do_update_jiffies64(now);
> >  	cpu_load_update_nohz_stop();
> >  
> > +	/*
> > +	 * Clear the timer idle flag, so we avoid IPIs on remote queueing and
> > +	 * the clock forward checks in the enqueue path.
> > +	 */
> > +	timer_clear_idle();
> 
> Since function calls timer_clear_idle() already ...
>
> >  	calc_load_exit_idle();
> >  	touch_softlockup_watchdog_sched();
> >  	/*
> > @@ -1025,6 +1037,7 @@ void tick_nohz_idle_exit(void)
> >  		tick_nohz_stop_idle(ts, now);
> >  
> >  	if (ts->tick_stopped) {
> > +		timer_clear_idle();
> >  		tick_nohz_restart_sched_tick(ts, now);
> >  		tick_nohz_account_idle_ticks(ts);
> >  	}
> 
> ... there is no need to call timer_clear_idle() here.

Indeed.

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [patch V3 00/22] timer: Refactor the timer wheel
  2016-06-24 14:32 [patch V3 00/22] timer: Refactor the timer wheel Thomas Gleixner
                   ` (22 preceding siblings ...)
  2016-06-24 17:04 ` [patch V3 00/22] timer: Refactor the timer wheel Josh Triplett
@ 2016-06-27  4:22 ` Paul E. McKenney
  2016-06-27 23:56   ` Paul E. McKenney
  23 siblings, 1 reply; 30+ messages in thread
From: Paul E. McKenney @ 2016-06-27  4:22 UTC (permalink / raw)
  To: Thomas Gleixner
  Cc: LKML, Ingo Molnar, Peter Zijlstra, Eric Dumazet,
	Frederic Weisbecker, Chris Mason, Arjan van de Ven, rt,
	Rik van Riel, Linus Torvalds, George Spelvin, Len Brown,
	Josh Triplett

On Fri, Jun 24, 2016 at 02:32:00PM -0000, Thomas Gleixner wrote:
> This is the third version of the timer wheel rework series. The previous
> versions can be found here:
> 
> V1:   http://lkml.kernel.org/r/20160613070440.950649741@linutronix.de
> V2:   http://lkml.kernel.org/r/20160617121134.417319325@linutronix.de
> 
> The series is also available in git:
> 
>    git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git WIP.timers
> 
> Changes vs. V2:
> 
>  - Removed the 1000Hz granularity reduction to 4ms. Eric explained that
>    datacenter workloads require the granularity in the first level wheel.
> 
>  - Fixed the typo in tilepro. 
> 
>  - Converted sigtimedwait() to hrtimers
> 
>  - To avoid the cascading I extended the wheel by another level. This removes
>    the rarely executed cascading code path, but increases the storage size for
>    HZ>100 slightly. If the tiny folks care, there is an simple option to cut
>    the storage size in half for the price of reduced granularity.
> 
> Thanks,

And this series avoids the strange SRCU behavior that I saw with the
last version.  So looks to be getting there!

I will be torturing it more vigorously, but looking good so far.

							Thanx, Paul

> 	tglx
> 
> ----
>  arch/x86/kernel/apic/x2apic_uv_x.c  |    4 
>  arch/x86/kernel/cpu/mcheck/mce.c    |    4 
>  block/genhd.c                       |    5 
>  drivers/cpufreq/powernv-cpufreq.c   |    5 
>  drivers/mmc/host/jz4740_mmc.c       |    2 
>  drivers/net/ethernet/tile/tilepro.c |    4 
>  drivers/power/bq27xxx_battery.c     |    5 
>  drivers/tty/metag_da.c              |    4 
>  drivers/tty/mips_ejtag_fdc.c        |    4 
>  drivers/usb/host/ohci-hcd.c         |    1 
>  drivers/usb/host/xhci.c             |    2 
>  include/linux/list.h                |   10 
>  include/linux/timer.h               |   34 -
>  kernel/signal.c                     |   24 
>  kernel/time/tick-internal.h         |    1 
>  kernel/time/tick-sched.c            |   46 -
>  kernel/time/timer.c                 | 1096 +++++++++++++++++++++---------------
>  lib/random32.c                      |    1 
>  net/ipv4/inet_connection_sock.c     |    7 
>  net/ipv4/inet_timewait_sock.c       |    5 
>  20 files changed, 734 insertions(+), 530 deletions(-)
> 
> 

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [patch V3 00/22] timer: Refactor the timer wheel
  2016-06-27  4:22 ` Paul E. McKenney
@ 2016-06-27 23:56   ` Paul E. McKenney
  0 siblings, 0 replies; 30+ messages in thread
From: Paul E. McKenney @ 2016-06-27 23:56 UTC (permalink / raw)
  To: Thomas Gleixner
  Cc: LKML, Ingo Molnar, Peter Zijlstra, Eric Dumazet,
	Frederic Weisbecker, Chris Mason, Arjan van de Ven, rt,
	Rik van Riel, Linus Torvalds, George Spelvin, Len Brown,
	Josh Triplett

On Sun, Jun 26, 2016 at 09:22:36PM -0700, Paul E. McKenney wrote:
> On Fri, Jun 24, 2016 at 02:32:00PM -0000, Thomas Gleixner wrote:
> > This is the third version of the timer wheel rework series. The previous
> > versions can be found here:
> > 
> > V1:   http://lkml.kernel.org/r/20160613070440.950649741@linutronix.de
> > V2:   http://lkml.kernel.org/r/20160617121134.417319325@linutronix.de
> > 
> > The series is also available in git:
> > 
> >    git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git WIP.timers
> > 
> > Changes vs. V2:
> > 
> >  - Removed the 1000Hz granularity reduction to 4ms. Eric explained that
> >    datacenter workloads require the granularity in the first level wheel.
> > 
> >  - Fixed the typo in tilepro. 
> > 
> >  - Converted sigtimedwait() to hrtimers
> > 
> >  - To avoid the cascading I extended the wheel by another level. This removes
> >    the rarely executed cascading code path, but increases the storage size for
> >    HZ>100 slightly. If the tiny folks care, there is an simple option to cut
> >    the storage size in half for the price of reduced granularity.
> > 
> > Thanks,
> 
> And this series avoids the strange SRCU behavior that I saw with the
> last version.  So looks to be getting there!
> 
> I will be torturing it more vigorously, but looking good so far.

And from RCU and rcutorture perspectives:

Tested-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>

							Thanx, Paul

> > 	tglx
> > 
> > ----
> >  arch/x86/kernel/apic/x2apic_uv_x.c  |    4 
> >  arch/x86/kernel/cpu/mcheck/mce.c    |    4 
> >  block/genhd.c                       |    5 
> >  drivers/cpufreq/powernv-cpufreq.c   |    5 
> >  drivers/mmc/host/jz4740_mmc.c       |    2 
> >  drivers/net/ethernet/tile/tilepro.c |    4 
> >  drivers/power/bq27xxx_battery.c     |    5 
> >  drivers/tty/metag_da.c              |    4 
> >  drivers/tty/mips_ejtag_fdc.c        |    4 
> >  drivers/usb/host/ohci-hcd.c         |    1 
> >  drivers/usb/host/xhci.c             |    2 
> >  include/linux/list.h                |   10 
> >  include/linux/timer.h               |   34 -
> >  kernel/signal.c                     |   24 
> >  kernel/time/tick-internal.h         |    1 
> >  kernel/time/tick-sched.c            |   46 -
> >  kernel/time/timer.c                 | 1096 +++++++++++++++++++++---------------
> >  lib/random32.c                      |    1 
> >  net/ipv4/inet_connection_sock.c     |    7 
> >  net/ipv4/inet_timewait_sock.c       |    5 
> >  20 files changed, 734 insertions(+), 530 deletions(-)
> > 
> > 

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [patch V3 01/22] timer: Make pinned a timer property
  2016-06-24 14:32 ` [patch V3 01/22] timer: Make pinned a timer property Thomas Gleixner
@ 2016-07-01 13:03   ` Frederic Weisbecker
  0 siblings, 0 replies; 30+ messages in thread
From: Frederic Weisbecker @ 2016-07-01 13:03 UTC (permalink / raw)
  To: Thomas Gleixner
  Cc: LKML, Ingo Molnar, Peter Zijlstra, Paul E. McKenney,
	Eric Dumazet, Chris Mason, Arjan van de Ven, rt, Rik van Riel,
	Linus Torvalds, George Spelvin, Len Brown, Josh Triplett

On Fri, Jun 24, 2016 at 02:32:01PM -0000, Thomas Gleixner wrote:
> We want to move the timer migration from a push to a pull model. This requires
> to store the pinned attribute of a timer in the timer itself. This must happen
> at initialization time.
> 
> Add the helper macros for this.
> 
> Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
> Cc: Peter Zijlstra <peterz@infradead.org>
> Cc: Frederic Weisbecker <fweisbec@gmail.com>
> Cc: Chris Mason <clm@fb.com>
> Cc: Eric Dumazet <edumazet@google.com>
> Cc: rt@linutronix.de
> Cc: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
> Cc: Arjan van de Ven <arjan@infradead.org>
> 
> 
> ---
>  include/linux/timer.h |   25 ++++++++++++++++++++++---
>  kernel/time/timer.c   |   10 +++++-----
>  2 files changed, 27 insertions(+), 8 deletions(-)
> 
> --- a/include/linux/timer.h
> +++ b/include/linux/timer.h
> @@ -62,7 +62,8 @@ struct timer_list {
>  #define TIMER_MIGRATING		0x00080000
>  #define TIMER_BASEMASK		(TIMER_CPUMASK | TIMER_MIGRATING)
>  #define TIMER_DEFERRABLE	0x00100000
> -#define TIMER_IRQSAFE		0x00200000
> +#define TIMER_PINNED		0x00200000
> +#define TIMER_IRQSAFE		0x00400000
>  
>  #define __TIMER_INITIALIZER(_function, _expires, _data, _flags) { \
>  		.entry = { .next = TIMER_ENTRY_STATIC },	\
> @@ -78,9 +79,15 @@ struct timer_list {
>  #define TIMER_INITIALIZER(_function, _expires, _data)		\
>  	__TIMER_INITIALIZER((_function), (_expires), (_data), 0)
>  
> +#define TIMER_PINNED_INITIALIZER(_function, _expires, _data)	\
> +	__TIMER_INITIALIZER((_function), (_expires), (_data), TIMER_PINNED)
> +
>  #define TIMER_DEFERRED_INITIALIZER(_function, _expires, _data)	\
>  	__TIMER_INITIALIZER((_function), (_expires), (_data), TIMER_DEFERRABLE)
>  
> +#define TIMER_PINNED_DEFERRED_INITIALIZER(_function, _expires, _data)	\
> +	__TIMER_INITIALIZER((_function), (_expires), (_data), TIMER_DEFERRABLE | TIMER_PINNED)
> +

I guess it means that no timer took the freedom to be sometimes enqueued pinned and
sometimes unpinned. Which is a good news :-)

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [patch V3 13/22] timer: Reduce the CPU index space to 256k
  2016-06-24 14:32 ` [patch V3 13/22] timer: Reduce the CPU index space to 256k Thomas Gleixner
@ 2016-07-01 14:41   ` Frederic Weisbecker
  0 siblings, 0 replies; 30+ messages in thread
From: Frederic Weisbecker @ 2016-07-01 14:41 UTC (permalink / raw)
  To: Thomas Gleixner
  Cc: LKML, Ingo Molnar, Peter Zijlstra, Paul E. McKenney,
	Eric Dumazet, Chris Mason, Arjan van de Ven, rt, Rik van Riel,
	Linus Torvalds, George Spelvin, Len Brown, Josh Triplett

On Fri, Jun 24, 2016 at 02:32:13PM -0000, Thomas Gleixner wrote:
> We want to store the array index in the flags space. 256k CPUs should be
> enough for a while.
> 
> Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

For the easy part (Patches 1 to 13): Reviewed-by: Frederic Weisbecker <fweisbec@gmail.com>

Now I'm going to dive in the hard part :-)

Thanks.

^ permalink raw reply	[flat|nested] 30+ messages in thread

end of thread, other threads:[~2016-07-01 14:42 UTC | newest]

Thread overview: 30+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2016-06-24 14:32 [patch V3 00/22] timer: Refactor the timer wheel Thomas Gleixner
2016-06-24 14:32 ` [patch V3 01/22] timer: Make pinned a timer property Thomas Gleixner
2016-07-01 13:03   ` Frederic Weisbecker
2016-06-24 14:32 ` [patch V3 02/22] x86/apic/uv: Initialize timer as pinned Thomas Gleixner
2016-06-24 14:32 ` [patch V3 03/22] x86/mce: " Thomas Gleixner
2016-06-24 14:32 ` [patch V3 04/22] cpufreq/powernv: " Thomas Gleixner
2016-06-24 14:32 ` [patch V3 05/22] driver/net/ethernet/tile: " Thomas Gleixner
2016-06-24 14:32 ` [patch V3 06/22] drivers/tty/metag_da: " Thomas Gleixner
2016-06-24 14:32 ` [patch V3 07/22] drivers/tty/mips_ejtag: " Thomas Gleixner
2016-06-24 14:32 ` [patch V3 08/22] net/ipv4/inet: Initialize timers " Thomas Gleixner
2016-06-24 14:32 ` [patch V3 09/22] timer: Remove mod_timer_pinned Thomas Gleixner
2016-06-24 14:32 ` [patch V3 10/22] signal: Use hrtimer for sigtimedwait Thomas Gleixner
2016-06-24 14:32 ` [patch V3 11/22] hlist: Add hlist_is_singular_node() helper Thomas Gleixner
2016-06-24 14:32 ` [patch V3 12/22] timer: Give a few structs and members proper names Thomas Gleixner
2016-06-24 14:32 ` [patch V3 13/22] timer: Reduce the CPU index space to 256k Thomas Gleixner
2016-07-01 14:41   ` Frederic Weisbecker
2016-06-24 14:32 ` [patch V3 14/22] timer: Switch to a non cascading wheel Thomas Gleixner
2016-06-24 14:32 ` [patch V3 15/22] timer: Remove slack leftovers Thomas Gleixner
2016-06-24 14:32 ` [patch V3 16/22] timer: Move __run_timers() function Thomas Gleixner
2016-06-24 14:32 ` [patch V3 17/22] timer: Optimize collect timers for NOHZ Thomas Gleixner
2016-06-24 14:32 ` [patch V3 18/22] tick/sched: Remove pointless empty function Thomas Gleixner
2016-06-24 14:32 ` [patch V3 19/22] timer: Forward wheel clock whenever possible Thomas Gleixner
2016-06-24 20:45   ` Richard Cochran
2016-06-25  8:55     ` Thomas Gleixner
2016-06-24 14:32 ` [patch V3 20/22] timer: Only wake softirq if necessary Thomas Gleixner
2016-06-24 14:32 ` [patch V3 21/22] timer: Split out index calculation Thomas Gleixner
2016-06-24 14:32 ` [patch V3 22/22] timer: Optimization for same expiry time in mod_timer() Thomas Gleixner
2016-06-24 17:04 ` [patch V3 00/22] timer: Refactor the timer wheel Josh Triplett
2016-06-27  4:22 ` Paul E. McKenney
2016-06-27 23:56   ` Paul E. McKenney

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.