[PATCH v1] watchdog: Use a reference cycle counter to avoid scaling issues

* [PATCH v1] watchdog: Use a reference cycle counter to avoid scaling issues
@ 2015-04-23 15:03 Alexander Shishkin
  2015-04-23 20:01 ` Thomas Gleixner
  0 siblings, 1 reply; 12+ messages in thread
From: Alexander Shishkin @ 2015-04-23 15:03 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: H. Peter Anvin, x86, Don Zickus, Frederic Weisbecker, Andi Kleen,
	Adrian Hunter, Anton Blanchard, Michael Ellerman, linux-kernel,
	Alexander Shishkin, stable

The problem with using cycle counter for NMI watchdog is that its
frequency changes with the corresponding core's frequency. This means
that, in particular, if the core frequency scales up, watchdog NMI will
arrive more frequently than what user requested through watchdog_thresh
and also increasing the probability of setting off the hardlockup detector,
because the corresponding hrtimer will keep firing at the same intervals
regardless of the core frequency. And, if the core can turbo to up to 2.5x
its base frequency (and therefore TSC) [1], we'll have the hrtimer and NMI
counter firing at the same frequency; anything that disables interrupts at
an unfortunate moment can set off the hardlockup detector then.

The proposed solution is to use reference cycle counter instead, which is
guaranteed to run at the same frequency regardless of the cpu speed. This
will also ensure that the watchdog_thresh timeout value is honored.

One issue with the reference counter is that its frequency is not the same
as that of tsc on older cpu models and therefore it needs calibration.

This patch adds code to calibrate ref-cycles counter using an hrtimer and
a perf counter, which runs in parallel with the rest of kernel init
sequence.

This patch leaves intact the hw_nmi_get_sample_period() arch call which is
still used in case of a fallback to the cycle counter.

[1] http://ark.intel.com/products/83610/Intel-Core-M-5Y10-Processor-4M-Cache-up-to-2_00-GHz

Signed-off-by: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: stable@vger.kernel.org
---
Changes since the initial version:

 * kept hw_nmi_* intact to avoid build-breaking powerpc; on powerpc, the
   calibration should fail because of the missing REF_CYCLES counter and
   therefore fall back to the old way of doing things;
 * rebased on top of Linus' master.

 kernel/watchdog.c | 134 ++++++++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 131 insertions(+), 3 deletions(-)

diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 2316f50b07..7e8e031544 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -259,15 +259,129 @@ static int is_softlockup(unsigned long touch_ts)
 }
 
 #ifdef CONFIG_HARDLOCKUP_DETECTOR
-
 static struct perf_event_attr wd_hw_attr = {
 	.type		= PERF_TYPE_HARDWARE,
-	.config		= PERF_COUNT_HW_CPU_CYCLES,
+	.config		= PERF_COUNT_HW_REF_CPU_CYCLES,
 	.size		= sizeof(struct perf_event_attr),
 	.pinned		= 1,
 	.disabled	= 1,
 };
 
+/*
+ * Reference cycle counter calibration
+ *
+ * In order for nmi watchdog to be bound to the flow of time instead
+ * of cycles, which changes with cpu frequency scaling, we need to use
+ * PERF_COUNT_HW_REF_CPU_CYCLES for it. And for this we need to know its
+ * resolution since it's not necessarily the same as that of TSC.
+ *
+ * Thus, reference clock counter is calibrated using a perf counter and
+ * an hrtimer taking samples of the former's overflow counter. When the
+ * hrtimer callback has enough samples, it kicks off a work, which does
+ * the "math" and kickstarts the NMI watchdog if needed.
+ *
+ * This method of calibration does not give stellar precision, but should
+ * be enough for a watchdog timer. And it runs in parallel to the rest of
+ * the bootup sequence. The main factor contributing to the error in this
+ * calibration method is the nmi handling path leading up to the overflow
+ * handler, that is a greater REF_CAL_SAMPLE_CYCLES value gives better
+ * precision.
+ */
+
+/* PERF_COUNT_HW_REF_CPU_CYCLES timer resolution */
+static unsigned long	wd_hw_ref_khz;
+static local_t		wd_hw_ref_cycles;
+static struct hrtimer	wd_hw_cal_timer;
+static struct perf_event *wd_hw_cal_event;
+
+#define REF_CAL_POINTS 9
+static unsigned long ref_cal_point[REF_CAL_POINTS];
+static unsigned int ref_cal_points;
+
+#define REF_CAL_SAMPLE_CYCLES	1000000
+#define REF_CAL_MS		100
+#define REF_CAL_PERIOD		(REF_CAL_MS * 1000000)
+
+static void wd_hw_cal_overflow(struct perf_event *event,
+			       struct perf_sample_data *data,
+			       struct pt_regs *regs)
+{
+	event->hw.interrupts = 0;
+	local_inc(&wd_hw_ref_cycles);
+}
+
+static void watchdog_calibration_failed(void)
+{
+	wd_hw_attr.config = PERF_COUNT_HW_CPU_CYCLES;
+	pr_warn("reference counter calibration failed, falling back to cycle counter for NMI watchdog\n");
+}
+
+static int watchdog_enable_all_cpus(void);
+
+static void watchdog_finish_ref_calibration(struct work_struct *work)
+{
+	unsigned int point;
+	unsigned long diff;
+
+	hrtimer_cancel(&wd_hw_cal_timer);
+	perf_event_release_kernel(wd_hw_cal_event);
+
+	for (diff = 0, point = 0; point < REF_CAL_POINTS - 1; point++)
+		diff += ref_cal_point[point + 1] - ref_cal_point[point];
+
+	diff /= REF_CAL_POINTS - 1;
+
+	wd_hw_ref_khz = diff * REF_CAL_SAMPLE_CYCLES / REF_CAL_MS;
+	if (!wd_hw_ref_khz)
+		watchdog_calibration_failed();
+	else
+		pr_info("reference clock frequency: %ldkHz\n", wd_hw_ref_khz);
+
+	if (watchdog_enabled)
+		watchdog_enable_all_cpus();
+}
+
+static DECLARE_WORK(wd_hw_ref_work, watchdog_finish_ref_calibration);
+
+static enum hrtimer_restart wd_hw_cal_timer_fn(struct hrtimer *hrtimer)
+{
+	hrtimer_forward_now(hrtimer, ns_to_ktime(REF_CAL_PERIOD));
+	ref_cal_point[ref_cal_points] =	local_read(&wd_hw_ref_cycles);
+
+	if (++ref_cal_points < REF_CAL_POINTS)
+		return HRTIMER_RESTART;
+
+	schedule_work(&wd_hw_ref_work);
+
+	return HRTIMER_NORESTART;
+}
+
+/*
+ * Calibrate HW_REF_CYCLE counter for NMI watchdog
+ */
+static int watchdog_calibrate(void)
+{
+	hrtimer_init(&wd_hw_cal_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+	wd_hw_cal_timer.function = wd_hw_cal_timer_fn;
+
+	wd_hw_attr.sample_period = REF_CAL_SAMPLE_CYCLES;
+	wd_hw_cal_event = perf_event_create_kernel_counter(&wd_hw_attr,
+							   smp_processor_id(),
+							   NULL,
+							   wd_hw_cal_overflow,
+							   NULL);
+	if (IS_ERR(wd_hw_cal_event)) {
+		watchdog_calibration_failed();
+		return PTR_ERR(wd_hw_cal_event);
+	}
+
+	perf_event_enable(wd_hw_cal_event);
+	hrtimer_start(&wd_hw_cal_timer, ns_to_ktime(REF_CAL_PERIOD),
+		      HRTIMER_MODE_REL_PINNED);
+
+	return 0;
+}
+
 /* Callback function for perf event subsystem */
 static void watchdog_overflow_callback(struct perf_event *event,
 		 struct perf_sample_data *data,
@@ -308,6 +422,11 @@ static void watchdog_overflow_callback(struct perf_event *event,
 	__this_cpu_write(hard_watchdog_warn, false);
 	return;
 }
+#else
+static int watchdog_calibrate(void)
+{
+       return -EINVAL;
+}
 #endif /* CONFIG_HARDLOCKUP_DETECTOR */
 
 static void watchdog_interrupt_count(void)
@@ -532,7 +651,9 @@ static int watchdog_nmi_enable(unsigned int cpu)
 		goto out_enable;
 
 	wd_attr = &wd_hw_attr;
-	wd_attr->sample_period = hw_nmi_get_sample_period(watchdog_thresh);
+	wd_attr->sample_period = wd_hw_ref_khz ?
+		wd_hw_ref_khz * 1000 * watchdog_thresh :
+		hw_nmi_get_sample_period(watchdog_thresh);
 
 	/* Try to register using hardware perf events */
 	event = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback, NULL);
@@ -875,6 +996,13 @@ void __init lockup_detector_init(void)
 {
 	set_sample_period();
 
+	/*
+	 * watchdog calibration work will take care of the rest when
+	 * the calibration is done
+	 */
+	if (!watchdog_calibrate())
+		return;
+
 	if (watchdog_enabled)
 		watchdog_enable_all_cpus();
 }
-- 
2.1.4


^ permalink raw reply related	[flat|nested] 12+ messages in thread