From: Daniel Thompson <daniel.thompson@linaro.org>
To: John Stultz <john.stultz@linaro.org>
Cc: Daniel Thompson <daniel.thompson@linaro.org>,
	linux-kernel@vger.kernel.org, patches@linaro.org,
	linaro-kernel@lists.linaro.org,
	Sumit Semwal <sumit.semwal@linaro.org>,
	Thomas Gleixner <tglx@linutronix.de>,
	Stephen Boyd <sboyd@codeaurora.org>,
	Steven Rostedt <rostedt@goodmis.org>
Subject: [PATCH v2] sched_clock: Avoid deadlock during read from NMI
Date: Thu, 22 Jan 2015 13:06:37 +0000
Message-ID: <1421931997-13609-1-git-send-email-daniel.thompson@linaro.org>
In-Reply-To: <1421859236-19782-1-git-send-email-daniel.thompson@linaro.org>

Currently it is possible for an NMI (or FIQ on ARM) to come in and
read sched_clock() whilst update_sched_clock() has locked the seqcount
for writing. This results in the NMI handler locking up when it calls
raw_read_seqcount_begin().
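
To make the failure mode concrete, the broken pattern reduces to the
following (a simplified model with illustrative names, not the actual
kernel/time/sched_clock.c code):

#include <linux/seqlock.h>
#include <linux/types.h>

static seqcount_t model_seq;
static u64 model_epoch_ns;

static void model_update(u64 new_ns)		/* update_sched_clock() */
{
	raw_write_seqcount_begin(&model_seq);	/* sequence count becomes odd */
	/* If the NMI/FIQ fires here, model_read() below spins forever. */
	model_epoch_ns = new_ns;
	raw_write_seqcount_end(&model_seq);	/* sequence count becomes even */
}

static u64 model_read(void)			/* sched_clock() from NMI/FIQ */
{
	unsigned long seq;
	u64 ns;

	do {
		/*
		 * raw_read_seqcount_begin() spins while the sequence
		 * count is odd, but the interrupted writer can never
		 * make it even again: deadlock.
		 */
		seq = raw_read_seqcount_begin(&model_seq);
		ns = model_epoch_ns;
	} while (read_seqcount_retry(&model_seq, seq));

	return ns;
}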

This patch fixes that problem by providing banked clock data in a
similar manner to Thomas Gleixner's 4396e058c52e ("timekeeping: Provide
fast and NMI safe access to CLOCK_MONOTONIC").
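
The heart of that approach is the latch idiom sketched below, distilled
here to a single value (illustrative names again; the patch banks the
whole of the clock data rather than one field):

#include <linux/seqlock.h>
#include <linux/types.h>

static seqcount_t latch_seq;
static u64 latch_ns[2];			/* [0] = even bank, [1] = odd bank */

static void latch_update(u64 new_ns)	/* callers must not run concurrently */
{
	/* copy the live data into the odd bank and steer readers to it */
	latch_ns[1] = latch_ns[0];
	raw_write_seqcount_latch(&latch_seq);

	/* update the even bank; a reader racing with the latch will retry */
	latch_ns[0] = new_ns;

	/* switch readers back to the freshly updated even bank */
	raw_write_seqcount_latch(&latch_seq);
}

static u64 latch_read(void)		/* NMI/FIQ safe: never waits for a writer */
{
	unsigned long seq;
	u64 ns;

	do {
		seq = raw_read_seqcount(&latch_seq);
		ns = latch_ns[seq & 1];	/* pick whichever bank is stable */
	} while (read_seqcount_retry(&latch_seq, seq));

	return ns;
}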

Changing the mode of operation of the seqcount away from the traditional
LSB-means-interrupted-write to a banked approach also revealed a good deal
of "fake" write locking within sched_clock_register(). This is likely
to be a latent issue because sched_clock_register() is typically called
before we enable interrupts. Nevertheless, the issue has been eliminated
by increasing the scope of the read locking performed by sched_clock().

Suggested-by: Stephen Boyd <sboyd@codeaurora.org>
Signed-off-by: Daniel Thompson <daniel.thompson@linaro.org>
---

Notes:
    Tested on i.MX6 with perf drowning the system in FIQs and using the perf
    handler to check that sched_clock() returns monotonic values. At the
    same time I forcefully reduced wrap_kt so that update_sched_clock()
    is called at >1000Hz.
    
    Without the patch the above system is grossly unstable, surviving
    [9K,115K,25K] perf event cycles during three separate runs. With the
    patch I ran for over 11M perf event cycles before getting bored.
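
    The per-sample check in the FIQ/perf handler amounts to something
    like the following (a hypothetical sketch for illustration; the real
    test instrumentation is not part of this patch):

        #include <linux/bug.h>
        #include <linux/sched.h>
        #include <linux/types.h>

        static u64 last_stamp;		/* simplified: ignores SMP */

        static void check_sched_clock_monotonic(void)
        {
                u64 now = sched_clock();

                WARN_ON_ONCE(now < last_stamp);
                last_stamp = now;
        }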
    
    v2:
    
    * Extended the scope of the read lock in sched_clock() so we can bank
      all data consumed there (John Stultz)
    

 kernel/time/sched_clock.c | 157 +++++++++++++++++++++++++++++-----------------
 1 file changed, 100 insertions(+), 57 deletions(-)

diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c
index 01d2d15aa662..a2ea66944bc1 100644
--- a/kernel/time/sched_clock.c
+++ b/kernel/time/sched_clock.c
@@ -18,28 +18,28 @@
 #include <linux/seqlock.h>
 #include <linux/bitops.h>

-struct clock_data {
-	ktime_t wrap_kt;
+struct clock_data_banked {
 	u64 epoch_ns;
 	u64 epoch_cyc;
-	seqcount_t seq;
-	unsigned long rate;
+	u64 (*read_sched_clock)(void);
+	u64 sched_clock_mask;
 	u32 mult;
 	u32 shift;
 	bool suspended;
 };

+struct clock_data {
+	ktime_t wrap_kt;
+	seqcount_t seq;
+	unsigned long rate;
+	struct clock_data_banked bank[2];
+};
+
 static struct hrtimer sched_clock_timer;
 static int irqtime = -1;

 core_param(irqtime, irqtime, int, 0400);

-static struct clock_data cd = {
-	.mult	= NSEC_PER_SEC / HZ,
-};
-
-static u64 __read_mostly sched_clock_mask;
-
 static u64 notrace jiffy_sched_clock_read(void)
 {
 	/*
@@ -49,7 +49,14 @@ static u64 notrace jiffy_sched_clock_read(void)
 	return (u64)(jiffies - INITIAL_JIFFIES);
 }

-static u64 __read_mostly (*read_sched_clock)(void) = jiffy_sched_clock_read;
+static struct clock_data cd = {
+	.bank = {
+		[0] = {
+			.mult	= NSEC_PER_SEC / HZ,
+			.read_sched_clock = jiffy_sched_clock_read,
+		},
+	},
+};

 static inline u64 notrace cyc_to_ns(u64 cyc, u32 mult, u32 shift)
 {
@@ -58,50 +65,82 @@ static inline u64 notrace cyc_to_ns(u64 cyc, u32 mult, u32 shift)

 unsigned long long notrace sched_clock(void)
 {
-	u64 epoch_ns;
-	u64 epoch_cyc;
 	u64 cyc;
 	unsigned long seq;
-
-	if (cd.suspended)
-		return cd.epoch_ns;
+	struct clock_data_banked *b;
+	u64 res;

 	do {
-		seq = raw_read_seqcount_begin(&cd.seq);
-		epoch_cyc = cd.epoch_cyc;
-		epoch_ns = cd.epoch_ns;
+		seq = raw_read_seqcount(&cd.seq);
+		b = cd.bank + (seq & 1);
+		if (b->suspended) {
+			res = b->epoch_ns;
+		} else {
+			cyc = b->read_sched_clock();
+			cyc = (cyc - b->epoch_cyc) & b->sched_clock_mask;
+			res = b->epoch_ns + cyc_to_ns(cyc, b->mult, b->shift);
+		}
 	} while (read_seqcount_retry(&cd.seq, seq));

-	cyc = read_sched_clock();
-	cyc = (cyc - epoch_cyc) & sched_clock_mask;
-	return epoch_ns + cyc_to_ns(cyc, cd.mult, cd.shift);
+	return res;
+}
+
+/*
+ * Start updating the banked clock data.
+ *
+ * sched_clock will never observe mis-matched data even if called from
+ * an NMI. We do this by maintaining an odd/even copy of the data and
+ * steering sched_clock to one or the other using a sequence counter.
+ * In order to preserve the data cache profile of sched_clock as much
+ * as possible the system reverts back to the even copy when the update
+ * completes; the odd copy is used *only* during an update.
+ *
+ * The caller is responsible for avoiding simultaneous updates.
+ */
+static struct clock_data_banked *update_bank_begin(void)
+{
+	/* update the backup (odd) bank and steer readers towards it */
+	memcpy(cd.bank + 1, cd.bank, sizeof(struct clock_data_banked));
+	raw_write_seqcount_latch(&cd.seq);
+
+	return cd.bank;
+}
+
+/*
+ * Finalize update of banked clock data.
+ *
+ * This is just a trivial switch back to the primary (even) copy.
+ */
+static void update_bank_end(void)
+{
+	raw_write_seqcount_latch(&cd.seq);
 }

 /*
  * Atomically update the sched_clock epoch.
  */
-static void notrace update_sched_clock(void)
+static void notrace update_sched_clock(bool suspended)
 {
-	unsigned long flags;
+	struct clock_data_banked *b;
 	u64 cyc;
 	u64 ns;

-	cyc = read_sched_clock();
-	ns = cd.epoch_ns +
-		cyc_to_ns((cyc - cd.epoch_cyc) & sched_clock_mask,
-			  cd.mult, cd.shift);
-
-	raw_local_irq_save(flags);
-	raw_write_seqcount_begin(&cd.seq);
-	cd.epoch_ns = ns;
-	cd.epoch_cyc = cyc;
-	raw_write_seqcount_end(&cd.seq);
-	raw_local_irq_restore(flags);
+	b = update_bank_begin();
+
+	cyc = b->read_sched_clock();
+	ns = b->epoch_ns + cyc_to_ns((cyc - b->epoch_cyc) & b->sched_clock_mask,
+				     b->mult, b->shift);
+
+	b->epoch_ns = ns;
+	b->epoch_cyc = cyc;
+	b->suspended = suspended;
+
+	update_bank_end();
 }

 static enum hrtimer_restart sched_clock_poll(struct hrtimer *hrt)
 {
-	update_sched_clock();
+	update_sched_clock(false);
 	hrtimer_forward_now(hrt, cd.wrap_kt);
 	return HRTIMER_RESTART;
 }
@@ -111,9 +150,9 @@ void __init sched_clock_register(u64 (*read)(void), int bits,
 {
 	u64 res, wrap, new_mask, new_epoch, cyc, ns;
 	u32 new_mult, new_shift;
-	ktime_t new_wrap_kt;
 	unsigned long r;
 	char r_unit;
+	struct clock_data_banked *b;

 	if (cd.rate > rate)
 		return;
@@ -122,29 +161,30 @@ void __init sched_clock_register(u64 (*read)(void), int bits,

 	/* calculate the mult/shift to convert counter ticks to ns. */
 	clocks_calc_mult_shift(&new_mult, &new_shift, rate, NSEC_PER_SEC, 3600);
+	cd.rate = rate;

 	new_mask = CLOCKSOURCE_MASK(bits);

 	/* calculate how many ns until we wrap */
 	wrap = clocks_calc_max_nsecs(new_mult, new_shift, 0, new_mask);
-	new_wrap_kt = ns_to_ktime(wrap - (wrap >> 3));
+	cd.wrap_kt = ns_to_ktime(wrap - (wrap >> 3));
+
+	b = update_bank_begin();

 	/* update epoch for new counter and update epoch_ns from old counter*/
 	new_epoch = read();
-	cyc = read_sched_clock();
-	ns = cd.epoch_ns + cyc_to_ns((cyc - cd.epoch_cyc) & sched_clock_mask,
-			  cd.mult, cd.shift);
+	cyc = b->read_sched_clock();
+	ns = b->epoch_ns + cyc_to_ns((cyc - b->epoch_cyc) & b->sched_clock_mask,
+				     b->mult, b->shift);

-	raw_write_seqcount_begin(&cd.seq);
-	read_sched_clock = read;
-	sched_clock_mask = new_mask;
-	cd.rate = rate;
-	cd.wrap_kt = new_wrap_kt;
-	cd.mult = new_mult;
-	cd.shift = new_shift;
-	cd.epoch_cyc = new_epoch;
-	cd.epoch_ns = ns;
-	raw_write_seqcount_end(&cd.seq);
+	b->read_sched_clock = read;
+	b->sched_clock_mask = new_mask;
+	b->mult = new_mult;
+	b->shift = new_shift;
+	b->epoch_cyc = new_epoch;
+	b->epoch_ns = ns;
+
+	update_bank_end();

 	r = rate;
 	if (r >= 4000000) {
@@ -175,10 +215,10 @@ void __init sched_clock_postinit(void)
 	 * If no sched_clock function has been provided at that point,
 	 * make it the final one one.
 	 */
-	if (read_sched_clock == jiffy_sched_clock_read)
+	if (cd.bank[0].read_sched_clock == jiffy_sched_clock_read)
 		sched_clock_register(jiffy_sched_clock_read, BITS_PER_LONG, HZ);

-	update_sched_clock();
+	update_sched_clock(false);

 	/*
 	 * Start the timer to keep sched_clock() properly updated and
@@ -191,17 +231,20 @@ void __init sched_clock_postinit(void)

 static int sched_clock_suspend(void)
 {
-	update_sched_clock();
+	update_sched_clock(true);
 	hrtimer_cancel(&sched_clock_timer);
-	cd.suspended = true;
 	return 0;
 }

 static void sched_clock_resume(void)
 {
-	cd.epoch_cyc = read_sched_clock();
+	struct clock_data_banked *b;
+
+	b = update_bank_begin();
+	b->epoch_cyc = b->read_sched_clock();
 	hrtimer_start(&sched_clock_timer, cd.wrap_kt, HRTIMER_MODE_REL);
-	cd.suspended = false;
+	b->suspended = false;
+	update_bank_end();
 }

 static struct syscore_ops sched_clock_ops = {
--
1.9.3



Thread overview: 35+ messages
2015-01-21 16:53 [RFC PATCH] sched_clock: Avoid tearing during read from NMI Daniel Thompson
2015-01-21 17:29 ` John Stultz
2015-01-21 20:20   ` Daniel Thompson
2015-01-21 20:58   ` Stephen Boyd
2015-01-22 13:06 ` Daniel Thompson [this message]
2015-01-30 19:03 ` [PATCH v3 0/4] sched_clock: Optimize and avoid deadlock " Daniel Thompson
2015-01-30 19:03   ` [PATCH v3 1/4] sched_clock: Match scope of read and write seqcounts Daniel Thompson
2015-01-30 19:03   ` [PATCH v3 2/4] sched_clock: Optimize cache line usage Daniel Thompson
2015-02-05  1:14     ` Stephen Boyd
2015-02-05 10:21       ` Daniel Thompson
2015-01-30 19:03   ` [PATCH v3 3/4] sched_clock: Remove suspend from clock_read_data Daniel Thompson
2015-01-30 19:03   ` [PATCH v3 4/4] sched_clock: Avoid deadlock during read from NMI Daniel Thompson
2015-02-05  1:23     ` Stephen Boyd
2015-02-05  1:48       ` Steven Rostedt
2015-02-05  6:23         ` Stephen Boyd
2015-02-05  0:50   ` [PATCH v3 0/4] sched_clock: Optimize and avoid " Stephen Boyd
2015-02-05  9:05     ` Daniel Thompson
2015-02-08 12:09       ` Daniel Thompson
2015-02-09 22:08         ` Stephen Boyd
2015-02-08 12:02 ` [PATCH v4 0/5] " Daniel Thompson
2015-02-08 12:02   ` [PATCH v4 1/5] sched_clock: Match scope of read and write seqcounts Daniel Thompson
2015-02-08 12:02   ` [PATCH v4 2/5] sched_clock: Optimize cache line usage Daniel Thompson
2015-02-09  1:28     ` Will Deacon
2015-02-09  9:47       ` Daniel Thompson
2015-02-10  2:37         ` Stephen Boyd
2015-02-08 12:02   ` [PATCH v4 3/5] sched_clock: Remove suspend from clock_read_data Daniel Thompson
2015-02-08 12:02   ` [PATCH v4 4/5] sched_clock: Remove redundant notrace from update function Daniel Thompson
2015-02-08 12:02   ` [PATCH v4 5/5] sched_clock: Avoid deadlock during read from NMI Daniel Thompson
2015-02-13  3:49   ` [PATCH v4 0/5] sched_clock: Optimize and avoid " Stephen Boyd
2015-03-02 15:56 ` [PATCH v5 " Daniel Thompson
2015-03-02 15:56   ` [PATCH v5 1/5] sched_clock: Match scope of read and write seqcounts Daniel Thompson
2015-03-02 15:56   ` [PATCH v5 2/5] sched_clock: Optimize cache line usage Daniel Thompson
2015-03-02 15:56   ` [PATCH v5 3/5] sched_clock: Remove suspend from clock_read_data Daniel Thompson
2015-03-02 15:56   ` [PATCH v5 4/5] sched_clock: Remove redundant notrace from update function Daniel Thompson
2015-03-02 15:56   ` [PATCH v5 5/5] sched_clock: Avoid deadlock during read from NMI Daniel Thompson
