From: Peter Zijlstra <peterz@infradead.org>
To: umgwanakikbuti@gmail.com, mingo@elte.hu
Cc: ktkhai@parallels.com, rostedt@goodmis.org, tglx@linutronix.de,
	juri.lelli@gmail.com, pang.xunlei@linaro.org, oleg@redhat.com,
	wanpeng.li@linux.intel.com, linux-kernel@vger.kernel.org,
	peterz@infradead.org, Al Viro <viro@ZenIV.linux.org.uk>,
	Linus Torvalds <torvalds@linux-foundation.org>,
	Paul McKenney <paulmck@linux.vnet.ibm.com>
Subject: [PATCH 12/18] hrtimer: Allow hrtimer::function() to free the timer
Date: Thu, 11 Jun 2015 14:46:48 +0200
Message-ID: <20150611124743.471563047@infradead.org>
In-Reply-To: <20150611124636.448700267@infradead.org>

[-- Attachment #1: peterz-hrtimer-base-running.patch --]
[-- Type: text/plain, Size: 10456 bytes --]

Currently an hrtimer callback function cannot free its own timer
because __run_hrtimer() still needs to clear HRTIMER_STATE_CALLBACK
after it. Freeing the timer would result in a clear use-after-free.

Solve this by using a scheme similar to that of regular timers: track the
currently running timer in hrtimer_cpu_base::running.

Cc: Al Viro <viro@ZenIV.linux.org.uk>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paul McKenney <paulmck@linux.vnet.ibm.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Suggested-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 include/linux/hrtimer.h |   41 ++++++-----------
 kernel/time/hrtimer.c   |  114 ++++++++++++++++++++++++++++++++++++++----------
 2 files changed, 107 insertions(+), 48 deletions(-)
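
Not part of the patch: a minimal sketch of the use-case this change enables,
i.e. a callback that frees the structure embedding its own hrtimer. The
identifiers my_work, my_timeout and my_arm below are made up for
illustration; only the regular hrtimer API is assumed.

#include <linux/hrtimer.h>
#include <linux/ktime.h>
#include <linux/slab.h>

struct my_work {
	struct hrtimer	timer;
	void		*payload;
};

static enum hrtimer_restart my_timeout(struct hrtimer *t)
{
	struct my_work *w = container_of(t, struct my_work, timer);

	/*
	 * With this patch __run_hrtimer() no longer writes to the timer
	 * after the callback returns, so the embedding object can be
	 * freed right here.
	 */
	kfree(w->payload);
	kfree(w);

	return HRTIMER_NORESTART;
}

static void my_arm(struct my_work *w)
{
	hrtimer_init(&w->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
	w->timer.function = my_timeout;
	hrtimer_start(&w->timer, ms_to_ktime(10), HRTIMER_MODE_REL);
}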

--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -53,30 +53,25 @@ enum hrtimer_restart {
  *
  * 0x00		inactive
  * 0x01		enqueued into rbtree
- * 0x02		callback function running
- * 0x04		timer is migrated to another cpu
  *
- * Special cases:
- * 0x03		callback function running and enqueued
- *		(was requeued on another CPU)
- * 0x05		timer was migrated on CPU hotunplug
+ * The callback state is not part of the timer->state because clearing it would
+ * mean touching the timer after the callback; that would make it impossible to
+ * free the timer from the callback function.
  *
- * The "callback function running and enqueued" status is only possible on
- * SMP. It happens for example when a posix timer expired and the callback
+ * Therefore we track the callback state in:
+ *
+ *	timer->base->cpu_base->running == timer
+ *
+ * On SMP it is possible to have a "callback function running and enqueued"
+ * status. It happens for example when a posix timer expired and the callback
  * queued a signal. Between dropping the lock which protects the posix timer
  * and reacquiring the base lock of the hrtimer, another CPU can deliver the
- * signal and rearm the timer. We have to preserve the callback running state,
- * as otherwise the timer could be removed before the softirq code finishes the
- * the handling of the timer.
- *
- * The HRTIMER_STATE_ENQUEUED bit is always or'ed to the current state
- * to preserve the HRTIMER_STATE_CALLBACK in the above scenario.
+ * signal and rearm the timer.
  *
  * All state transitions are protected by cpu_base->lock.
  */
 #define HRTIMER_STATE_INACTIVE	0x00
 #define HRTIMER_STATE_ENQUEUED	0x01
-#define HRTIMER_STATE_CALLBACK	0x02
 
 /**
  * struct hrtimer - the basic hrtimer structure
@@ -163,6 +158,8 @@ enum  hrtimer_base_type {
  * struct hrtimer_cpu_base - the per cpu clock bases
  * @lock:		lock protecting the base and associated clock bases
  *			and timers
+ * @seq:		seqcount around __run_hrtimer
+ * @running:		pointer to the currently running hrtimer
  * @cpu:		cpu number
  * @active_bases:	Bitfield to mark bases with active timers
  * @clock_was_set_seq:	Sequence counter of clock was set events
@@ -184,6 +181,8 @@ enum  hrtimer_base_type {
  */
 struct hrtimer_cpu_base {
 	raw_spinlock_t			lock;
+	seqcount_t			seq;
+	struct hrtimer			*running;
 	unsigned int			cpu;
 	unsigned int			active_bases;
 	unsigned int			clock_was_set_seq;
@@ -391,15 +390,7 @@ extern ktime_t hrtimer_get_remaining(con
 
 extern u64 hrtimer_get_next_event(void);
 
-/*
- * A timer is active, when it is enqueued into the rbtree or the
- * callback function is running or it's in the state of being migrated
- * to another cpu.
- */
-static inline int hrtimer_active(const struct hrtimer *timer)
-{
-	return timer->state != HRTIMER_STATE_INACTIVE;
-}
+extern bool hrtimer_active(const struct hrtimer *timer);
 
 /*
  * Helper function to check, whether the timer is on one of the queues
@@ -415,7 +406,7 @@ static inline int hrtimer_is_queued(stru
  */
 static inline int hrtimer_callback_running(struct hrtimer *timer)
 {
-	return timer->state & HRTIMER_STATE_CALLBACK;
+	return timer->base->cpu_base->running == timer;
 }
 
 /* Forward a hrtimer so it expires after now: */
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -67,6 +67,7 @@
 DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) =
 {
 	.lock = __RAW_SPIN_LOCK_UNLOCKED(hrtimer_bases.lock),
+	.seq = SEQCNT_ZERO(hrtimer_bases.seq),
 	.clock_base =
 	{
 		{
@@ -111,6 +112,18 @@ static inline int hrtimer_clockid_to_bas
 #ifdef CONFIG_SMP
 
 /*
+ * We require the migration_base for lock_hrtimer_base()/switch_hrtimer_base()
+ * such that hrtimer_callback_running() can unconditionally dereference
+ * timer->base->cpu_base
+ */
+static struct hrtimer_cpu_base migration_cpu_base = {
+	.seq = SEQCNT_ZERO(migration_cpu_base),
+	.clock_base = { { .cpu_base = &migration_cpu_base, }, },
+};
+
+#define migration_base	migration_cpu_base.clock_base[0]
+
+/*
  * We are using hashed locking: holding per_cpu(hrtimer_bases)[n].lock
  * means that all timers which are tied to this base via timer->base are
  * locked, and the base itself is locked too.
@@ -119,8 +132,8 @@ static inline int hrtimer_clockid_to_bas
  * be found on the lists/queues.
  *
  * When the timer's base is locked, and the timer removed from list, it is
- * possible to set timer->base = NULL and drop the lock: the timer remains
- * locked.
+ * possible to set timer->base = &migration_base and drop the lock: the timer
+ * remains locked.
  */
 static
 struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer,
@@ -130,7 +143,7 @@ struct hrtimer_clock_base *lock_hrtimer_
 
 	for (;;) {
 		base = timer->base;
-		if (likely(base != NULL)) {
+		if (likely(base != &migration_base)) {
 			raw_spin_lock_irqsave(&base->cpu_base->lock, *flags);
 			if (likely(base == timer->base))
 				return base;
@@ -194,8 +207,8 @@ switch_hrtimer_base(struct hrtimer *time
 		if (unlikely(hrtimer_callback_running(timer)))
 			return base;
 
-		/* See the comment in lock_timer_base() */
-		timer->base = NULL;
+		/* See the comment in lock_hrtimer_base() */
+		timer->base = &migration_base;
 		raw_spin_unlock(&base->cpu_base->lock);
 		raw_spin_lock(&new_base->cpu_base->lock);
 
@@ -840,11 +853,7 @@ static int enqueue_hrtimer(struct hrtime
 
 	base->cpu_base->active_bases |= 1 << base->index;
 
-	/*
-	 * HRTIMER_STATE_ENQUEUED is or'ed to the current state to preserve the
-	 * state of a possibly running callback.
-	 */
-	timer->state |= HRTIMER_STATE_ENQUEUED;
+	timer->state = HRTIMER_STATE_ENQUEUED;
 
 	return timerqueue_add(&base->active, &timer->node);
 }
@@ -909,14 +918,9 @@ remove_hrtimer(struct hrtimer *timer, st
 		timer_stats_hrtimer_clear_start_info(timer);
 		reprogram = base->cpu_base == this_cpu_ptr(&hrtimer_bases);
 
-		if (!restart) {
-			/*
-			 * We must preserve the CALLBACK state flag here,
-			 * otherwise we could move the timer base in
-			 * switch_hrtimer_base.
-			 */
-			state &= HRTIMER_STATE_CALLBACK;
-		}
+		if (!restart)
+			state = HRTIMER_STATE_INACTIVE;
+
 		__remove_hrtimer(timer, base, state, reprogram);
 		return 1;
 	}
@@ -1117,6 +1121,51 @@ void hrtimer_init(struct hrtimer *timer,
 }
 EXPORT_SYMBOL_GPL(hrtimer_init);
 
+/*
+ * A timer is active when it is enqueued into the rbtree or the
+ * callback function is running or it's in the state of being migrated
+ * to another cpu.
+ *
+ * It is important for this function to not return a false negative.
+ */
+bool hrtimer_active(const struct hrtimer *timer)
+{
+	struct hrtimer_cpu_base *cpu_base;
+	unsigned int seq;
+
+	do {
+		cpu_base = READ_ONCE(timer->base->cpu_base);
+		seq = raw_read_seqcount_begin(&cpu_base->seq);
+
+		if (timer->state != HRTIMER_STATE_INACTIVE ||
+		    cpu_base->running == timer)
+			return true;
+
+	} while (read_seqcount_retry(&cpu_base->seq, seq) ||
+		 cpu_base != READ_ONCE(timer->base->cpu_base));
+
+	return false;
+}
+EXPORT_SYMBOL_GPL(hrtimer_active);
+
+/*
+ * The raw_write_seqcount_barrier()s in __run_hrtimer() split the thing into 3
+ * distinct sections:
+ *
+ *  - queued:	the timer is queued
+ *  - callback:	the timer is being run
+ *  - post:	the timer is inactive or (re)queued
+ *
+ * On the read side we ensure we observe timer->state and cpu_base->running
+ * from the same section; if anything changed while we looked at it, we retry.
+ * This includes timer->base changing because sequence numbers alone are
+ * insufficient for that.
+ *
+ * The sequence numbers are required because otherwise we could still observe
+ * a false negative if the read side got smeared over multiple consecutive
+ * __run_hrtimer() invocations.
+ */
+
 static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base,
 			  struct hrtimer_clock_base *base,
 			  struct hrtimer *timer, ktime_t *now)
@@ -1124,10 +1173,21 @@ static void __run_hrtimer(struct hrtimer
 	enum hrtimer_restart (*fn)(struct hrtimer *);
 	int restart;
 
-	WARN_ON(!irqs_disabled());
+	lockdep_assert_held(&cpu_base->lock);
 
 	debug_deactivate(timer);
-	__remove_hrtimer(timer, base, HRTIMER_STATE_CALLBACK, 0);
+	cpu_base->running = timer;
+
+	/*
+	 * Separate the ->running assignment from the ->state assignment.
+	 *
+	 * As with a regular write barrier, this ensures the read side in
+	 * hrtimer_active() cannot observe cpu_base->running == NULL &&
+	 * timer->state == INACTIVE.
+	 */
+	raw_write_seqcount_barrier(&cpu_base->seq);
+
+	__remove_hrtimer(timer, base, HRTIMER_STATE_INACTIVE, 0);
 	timer_stats_account_hrtimer(timer);
 	fn = timer->function;
 
@@ -1143,7 +1203,7 @@ static void __run_hrtimer(struct hrtimer
 	raw_spin_lock(&cpu_base->lock);
 
 	/*
-	 * Note: We clear the CALLBACK bit after enqueue_hrtimer and
+	 * Note: We clear the running state after enqueue_hrtimer and
 	 * we do not reprogramm the event hardware. Happens either in
 	 * hrtimer_start_range_ns() or in hrtimer_interrupt()
 	 *
@@ -1155,9 +1215,17 @@ static void __run_hrtimer(struct hrtimer
 	    !(timer->state & HRTIMER_STATE_ENQUEUED))
 		enqueue_hrtimer(timer, base);
 
-	WARN_ON_ONCE(!(timer->state & HRTIMER_STATE_CALLBACK));
+	/*
+	 * Separate the ->running assignment from the ->state assignment.
+	 *
+	 * As with a regular write barrier, this ensures the read side in
+	 * hrtimer_active() cannot observe cpu_base->running == NULL &&
+	 * timer->state == INACTIVE.
+	 */
+	raw_write_seqcount_barrier(&cpu_base->seq);
 
-	timer->state &= ~HRTIMER_STATE_CALLBACK;
+	WARN_ON_ONCE(cpu_base->running != timer);
+	cpu_base->running = NULL;
 }
 
 static void __hrtimer_run_queues(struct hrtimer_cpu_base *cpu_base, ktime_t now)
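
Not part of the patch: a condensed model of the scheme the hunks above
implement, for review only. mini_cpu_base, mini_run and mini_active are
made-up names; the real code additionally re-reads timer->base->cpu_base in
the retry loop to cope with base migration, and the writer runs under
cpu_base->lock.

#include <linux/hrtimer.h>
#include <linux/seqlock.h>

struct mini_cpu_base {
	seqcount_t	seq;
	struct hrtimer	*running;
};

/* Writer, mirroring __run_hrtimer(); cpu_base->lock is held around this. */
static void mini_run(struct mini_cpu_base *cb, struct hrtimer *timer)
{
	cb->running = timer;
	raw_write_seqcount_barrier(&cb->seq);	/* "queued" -> "callback" */
	timer->state = HRTIMER_STATE_INACTIVE;

	/* drop the lock, run timer->function(), possibly requeue, retake lock */

	raw_write_seqcount_barrier(&cb->seq);	/* "callback" -> "post" */
	cb->running = NULL;
}

/* Reader, mirroring hrtimer_active(); it must never return a false negative. */
static bool mini_active(struct mini_cpu_base *cb, struct hrtimer *timer)
{
	unsigned int seq;

	do {
		seq = raw_read_seqcount_begin(&cb->seq);
		if (timer->state != HRTIMER_STATE_INACTIVE ||
		    cb->running == timer)
			return true;
	} while (read_seqcount_retry(&cb->seq, seq));

	return false;
}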




Thread overview: 58+ messages
2015-06-11 12:46 [PATCH 00/18] sched: balance callbacks v4 Peter Zijlstra
2015-06-11 12:46 ` [PATCH 01/18] sched: Replace post_schedule with a balance callback list Peter Zijlstra
2015-06-11 15:32   ` Kirill Tkhai
2015-06-18 23:00   ` [tip:sched/hrtimers] " tip-bot for Peter Zijlstra
2015-06-11 12:46 ` [PATCH 02/18] sched: Use replace normalize_task() with __sched_setscheduler() Peter Zijlstra
2015-06-18 23:00   ` [tip:sched/hrtimers] " tip-bot for Peter Zijlstra
2015-06-11 12:46 ` [PATCH 03/18] sched: Allow balance callbacks for check_class_changed() Peter Zijlstra
2015-06-18 23:01   ` [tip:sched/hrtimers] " tip-bot for Peter Zijlstra
2015-06-11 12:46 ` [PATCH 04/18] sched,rt: Remove return value from pull_rt_task() Peter Zijlstra
2015-06-18 23:01   ` [tip:sched/hrtimers] " tip-bot for Peter Zijlstra
2015-06-11 12:46 ` [PATCH 05/18] sched,rt: Convert switched_{from,to}_rt() / prio_changed_rt() to balance callbacks Peter Zijlstra
2015-06-18 23:01   ` [tip:sched/hrtimers] sched, rt: Convert switched_{from, to}_rt() " tip-bot for Peter Zijlstra
2015-06-11 12:46 ` [PATCH 06/18] sched,dl: Remove return value from pull_dl_task() Peter Zijlstra
2015-06-18 23:02   ` [tip:sched/hrtimers] " tip-bot for Peter Zijlstra
2015-06-11 12:46 ` [PATCH 07/18] sched,dl: Convert switched_{from,to}_dl() / prio_changed_dl() to balance callbacks Peter Zijlstra
2015-06-18 23:02   ` [tip:sched/hrtimers] sched, dl: Convert switched_{from, to}_dl() " tip-bot for Peter Zijlstra
2015-06-11 12:46 ` [PATCH 08/18] hrtimer: Remove HRTIMER_STATE_MIGRATE Peter Zijlstra
2015-06-18 22:18   ` [tip:timers/core] " tip-bot for Oleg Nesterov
2015-06-11 12:46 ` [PATCH 09/18] hrtimer: Fix hrtimer_is_queued() hole Peter Zijlstra
2015-06-18 22:18   ` [tip:timers/core] " tip-bot for Peter Zijlstra
2015-06-11 12:46 ` [PATCH 10/18] seqcount: Rename write_seqcount_barrier() Peter Zijlstra
2015-06-18 22:19   ` [tip:timers/core] " tip-bot for Peter Zijlstra
2015-06-11 12:46 ` [PATCH 11/18] seqcount: Introduce raw_write_seqcount_barrier() Peter Zijlstra
2015-06-11 15:33   ` Paul E. McKenney
2015-06-11 21:45     ` Paul E. McKenney
2015-06-12  7:08       ` Peter Zijlstra
2015-06-12 18:59       ` Oleg Nesterov
2015-06-17 12:29       ` Peter Zijlstra
2015-06-17 14:57         ` Paul E. McKenney
2015-06-17 15:11           ` Peter Zijlstra
2015-06-17 15:42             ` Paul E. McKenney
2015-06-17 16:58               ` Peter Zijlstra
2015-06-17 15:49           ` Peter Zijlstra
2015-06-17 16:37             ` Paul E. McKenney
2015-06-17 17:11               ` Peter Zijlstra
2015-06-17 18:02                 ` Paul E. McKenney
2015-06-18  9:15                   ` Peter Zijlstra
2015-06-18  9:40                     ` Ingo Molnar
2015-06-18 10:40                       ` Peter Zijlstra
2015-06-18 16:54                         ` Paul E. McKenney
2015-06-18 17:10                           ` Steven Rostedt
2015-06-18 17:51                             ` Paul E. McKenney
2015-06-18 22:19         ` [tip:timers/core] seqcount: Introduce raw_write_seqcount_barrier( ) tip-bot for Peter Zijlstra
2015-06-11 12:46 ` Peter Zijlstra [this message]
2015-06-18 22:19   ` [tip:timers/core] hrtimer: Allow hrtimer::function() to free the timer tip-bot for Peter Zijlstra
2015-06-11 12:46 ` [PATCH 13/18] sched,dl: Fix sched class hopping CBS hole Peter Zijlstra
2015-06-18 23:02   ` [tip:sched/hrtimers] " tip-bot for Peter Zijlstra
2015-06-11 12:46 ` [PATCH 14/18] sched: Move code around Peter Zijlstra
2015-06-18 23:02   ` [tip:sched/hrtimers] " tip-bot for Peter Zijlstra
2015-06-11 12:46 ` [PATCH 15/18] sched: Streamline the task migration locking a little Peter Zijlstra
2015-06-18 23:03   ` [tip:sched/hrtimers] " tip-bot for Peter Zijlstra
2015-06-11 12:46 ` [PATCH 16/18] lockdep: Simplify lock_release() Peter Zijlstra
2015-06-18 23:03   ` [tip:sched/hrtimers] " tip-bot for Peter Zijlstra
2015-06-11 12:46 ` [PATCH 17/18] lockdep: Implement lock pinning Peter Zijlstra
2015-06-18 23:03   ` [tip:sched/hrtimers] " tip-bot for Peter Zijlstra
2015-06-11 12:46 ` [PATCH 18/18] sched,lockdep: Employ " Peter Zijlstra
2015-06-18 23:04   ` [tip:sched/hrtimers] " tip-bot for Peter Zijlstra
2015-12-29  5:41 ` [PATCH 00/18] sched: balance callbacks v4 Byungchul Park
