linux-kernel.vger.kernel.org archive mirror
* [PATCH V3 1/2] irq: Track the interrupt timings
@ 2016-02-16 15:43 Daniel Lezcano
  2016-02-16 15:43 ` [PATCH V3 2/2] sched: idle: IRQ based next prediction for idle period Daniel Lezcano
                   ` (2 more replies)
  0 siblings, 3 replies; 18+ messages in thread
From: Daniel Lezcano @ 2016-02-16 15:43 UTC (permalink / raw)
  To: tglx
  Cc: peterz, rafael, linux-pm, linux-kernel, nicolas.pitre, vincent.guittot

The interrupt framework gives a lot of information about each interrupt.
It does not, however, keep track of when those interrupts occur.

This patch provides a means to record the elapsed time between successive
interrupt occurrences in a per-IRQ per-CPU circular buffer, to help with
the prediction of the next occurrence using a statistical model.

A new function is added to browse the different interrupts and retrieve
the timing information stored for each of them.

A static key will be introduced so that the irq prediction can be switched
on at runtime, reducing the overhead to near zero when the kernel is not
using it.
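
As a rough illustration only (this is not part of the patch and the key
name is hypothetical), the planned runtime switch could be wired up along
these lines:

	/* needs <linux/jump_label.h> */
	DEFINE_STATIC_KEY_FALSE(irq_timing_enabled);

	static inline void record_irq_timing(struct irq_desc *desc)
	{
		/* near-zero cost while the prediction is disabled */
		if (static_branch_unlikely(&irq_timing_enabled))
			handle_timings(desc);
	}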

Signed-off-by: Daniel Lezcano <daniel.lezcano@linaro.org>
---
 include/linux/interrupt.h |  17 +++++++
 include/linux/irqdesc.h   |   4 ++
 kernel/irq/Kconfig        |   3 ++
 kernel/irq/Makefile       |   1 +
 kernel/irq/handle.c       |   2 +
 kernel/irq/internals.h    |  42 ++++++++++++++++++
 kernel/irq/irqdesc.c      |  10 +++++
 kernel/irq/manage.c       |   3 ++
 kernel/irq/timings.c      | 110 ++++++++++++++++++++++++++++++++++++++++++++++
 9 files changed, 192 insertions(+)
 create mode 100644 kernel/irq/timings.c

diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
index 0e95fcc..f053596 100644
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -665,6 +665,23 @@ static inline void init_irq_proc(void)
 }
 #endif
 
+#ifdef CONFIG_IRQ_TIMINGS
+
+#define IRQ_TIMINGS_SHIFT	2
+#define IRQ_TIMINGS_SIZE	(1 << IRQ_TIMINGS_SHIFT)
+#define IRQ_TIMINGS_MASK	(IRQ_TIMINGS_SIZE - 1)
+
+struct irq_timings {
+	u32 values[IRQ_TIMINGS_SIZE];	/* our circular buffer */
+	u64 sum;			/* sum of values */
+	u64 timestamp;			/* latest timestamp */
+	unsigned int w_index;		/* current buffer index */
+};
+
+struct irq_timings *irqtiming_get_next(int *irq);
+
+#endif
+
 struct seq_file;
 int show_interrupts(struct seq_file *p, void *v);
 int arch_show_interrupts(struct seq_file *p, int prec);
diff --git a/include/linux/irqdesc.h b/include/linux/irqdesc.h
index dcca77c..f4e29b2 100644
--- a/include/linux/irqdesc.h
+++ b/include/linux/irqdesc.h
@@ -12,6 +12,7 @@ struct proc_dir_entry;
 struct module;
 struct irq_desc;
 struct irq_domain;
+struct irq_timings;
 struct pt_regs;
 
 /**
@@ -51,6 +52,9 @@ struct irq_desc {
 	struct irq_data		irq_data;
 	unsigned int __percpu	*kstat_irqs;
 	irq_flow_handler_t	handle_irq;
+#ifdef CONFIG_IRQ_TIMINGS
+	struct irq_timings __percpu *timings;
+#endif
 #ifdef CONFIG_IRQ_PREFLOW_FASTEOI
 	irq_preflow_handler_t	preflow_handler;
 #endif
diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig
index 3b48dab..392c9f5 100644
--- a/kernel/irq/Kconfig
+++ b/kernel/irq/Kconfig
@@ -77,6 +77,9 @@ config GENERIC_MSI_IRQ_DOMAIN
 config HANDLE_DOMAIN_IRQ
 	bool
 
+config IRQ_TIMINGS
+	bool
+
 config IRQ_DOMAIN_DEBUG
 	bool "Expose hardware/virtual IRQ mapping via debugfs"
 	depends on IRQ_DOMAIN && DEBUG_FS
diff --git a/kernel/irq/Makefile b/kernel/irq/Makefile
index 2fc9cbd..9c6d3e8 100644
--- a/kernel/irq/Makefile
+++ b/kernel/irq/Makefile
@@ -8,3 +8,4 @@ obj-$(CONFIG_GENERIC_PENDING_IRQ) += migration.o
 obj-$(CONFIG_GENERIC_IRQ_MIGRATION) += cpuhotplug.o
 obj-$(CONFIG_PM_SLEEP) += pm.o
 obj-$(CONFIG_GENERIC_MSI_IRQ) += msi.o
+obj-$(CONFIG_IRQ_TIMINGS) += timings.o
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index a15b548..cd37536 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -138,6 +138,8 @@ irqreturn_t handle_irq_event_percpu(struct irq_desc *desc)
 	unsigned int flags = 0, irq = desc->irq_data.irq;
 	struct irqaction *action;
 
+	handle_timings(desc);
+
 	for_each_action_of_desc(desc, action) {
 		irqreturn_t res;
 
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index eab521fc..3d100af 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -56,6 +56,7 @@ enum {
 	IRQS_WAITING		= 0x00000080,
 	IRQS_PENDING		= 0x00000200,
 	IRQS_SUSPENDED		= 0x00000800,
+	IRQS_TIMINGS		= 0x00001000,
 };
 
 #include "debug.h"
@@ -218,3 +219,44 @@ irq_pm_install_action(struct irq_desc *desc, struct irqaction *action) { }
 static inline void
 irq_pm_remove_action(struct irq_desc *desc, struct irqaction *action) { }
 #endif
+
+#ifdef CONFIG_IRQ_TIMINGS
+static inline int alloc_timings(struct irq_desc *desc)
+{
+	desc->timings = alloc_percpu(struct irq_timings);
+	if (!desc->timings)
+		return -ENOMEM;
+
+	return 0;
+}
+
+static inline void free_timings(struct irq_desc *desc)
+{
+	free_percpu(desc->timings);
+}
+
+static inline void remove_timings(struct irq_desc *desc)
+{
+	desc->istate &= ~IRQS_TIMINGS;
+}
+
+static inline void setup_timings(struct irq_desc *desc, struct irqaction *act)
+{
+	/*
+	 * Timers are deterministic, so no need to do any measurement
+	 * on them.
+	 */
+	if (act->flags & __IRQF_TIMER)
+		return;
+
+	desc->istate |= IRQS_TIMINGS;
+}
+extern void handle_timings(struct irq_desc *desc);
+#else
+static inline int alloc_timings(struct irq_desc *desc) { return 0; }
+static inline void free_timings(struct irq_desc *desc) {}
+static inline void handle_timings(struct irq_desc *desc) {}
+static inline void remove_timings(struct irq_desc *desc) {}
+static inline void setup_timings(struct irq_desc *desc,
+				 struct irqaction *act) {};
+#endif
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index 0ccd028..577686b 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -174,6 +174,9 @@ static struct irq_desc *alloc_desc(int irq, int node, struct module *owner)
 	if (alloc_masks(desc, gfp, node))
 		goto err_kstat;
 
+	if (alloc_timings(desc))
+		goto err_mask;
+
 	raw_spin_lock_init(&desc->lock);
 	lockdep_set_class(&desc->lock, &irq_desc_lock_class);
 	init_rcu_head(&desc->rcu);
@@ -182,6 +185,8 @@ static struct irq_desc *alloc_desc(int irq, int node, struct module *owner)
 
 	return desc;
 
+err_mask:
+	free_masks(desc);
 err_kstat:
 	free_percpu(desc->kstat_irqs);
 err_desc:
@@ -220,6 +225,11 @@ static void free_desc(unsigned int irq)
 	 * the child interrupts.
 	 */
 	call_rcu(&desc->rcu, delayed_free_desc);
+
+	free_timings(desc);
+	free_masks(desc);
+	free_percpu(desc->kstat_irqs);
+	kfree(desc);
 }
 
 static int alloc_descs(unsigned int start, unsigned int cnt, int node,
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 3ddd229..132c2d7 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -1343,6 +1343,8 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
 		__enable_irq(desc);
 	}
 
+	setup_timings(desc, new);
+
 	raw_spin_unlock_irqrestore(&desc->lock, flags);
 
 	/*
@@ -1465,6 +1467,7 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id)
 		irq_settings_clr_disable_unlazy(desc);
 		irq_shutdown(desc);
 		irq_release_resources(desc);
+		remove_timings(desc);
 	}
 
 #ifdef CONFIG_SMP
diff --git a/kernel/irq/timings.c b/kernel/irq/timings.c
new file mode 100644
index 0000000..95976fa0
--- /dev/null
+++ b/kernel/irq/timings.c
@@ -0,0 +1,110 @@
+/*
+ * linux/kernel/irq/timings.c
+ *
+ * Copyright (C) 2016, Linaro Ltd - Daniel Lezcano <daniel.lezcano@linaro.org>
+ *
+ */
+#include <linux/interrupt.h>
+#include <linux/irq.h>
+#include <linux/irqdesc.h>
+#include <linux/percpu.h>
+
+#include "internals.h"
+
+/**
+ * handle_timings - store an irq timing when an interrupt occurs
+ *
+ * @desc: the irq descriptor
+ *
+ * For all interrupts with the IRQS_TIMINGS flag set, the function
+ * computes the time interval between two interrupt events and stores it
+ * in a circular buffer.
+ */
+void handle_timings(struct irq_desc *desc)
+{
+	struct irq_timings *timings;
+	u64 prev, now, diff;
+
+	if (!(desc->istate & IRQS_TIMINGS))
+		return;
+
+	timings = this_cpu_ptr(desc->timings);
+	now = local_clock();
+	prev = timings->timestamp;
+	timings->timestamp = now;
+
+	/*
+	 * If it is the first interrupt of the series, we can't
+	 * compute an interval, just store the timestamp and exit.
+	 */
+	if (unlikely(!prev))
+		return;
+
+	diff = now - prev;
+
+	/*
+	 * Microsecond (actually 1/1024th of a millisecond) precision is
+	 * good enough for our purpose.
+	 */
+	diff >>= 10;
+
+	/*
+	 * There is no point in storing intervals from interrupts more
+	 * than ~1 second apart. Furthermore that increases the risk
+	 * of overflowing our variance computation. Reset all values
+	 * in that case. Otherwise we know the magnitude of diff is
+	 * well within 32 bits.
+	 */
+	if (unlikely(diff > USEC_PER_SEC)) {
+		memset(timings, 0, sizeof(*timings));
+		timings->timestamp = now;
+		return;
+	}
+
+	/* The oldest value corresponds to the next index. */
+	timings->w_index = (timings->w_index + 1) & IRQ_TIMINGS_MASK;
+
+	/*
+	 * Remove the oldest value from the summing. If this is the
+	 * first time we go through this array slot, the previous
+	 * value will be zero and we won't subtract anything from the
+	 * current sum. Hence this code relies on a zeroed structure.
+	 */
+	timings->sum -= timings->values[timings->w_index];
+	timings->values[timings->w_index] = diff;
+	timings->sum += diff;
+}
+
+/**
+ * irqtiming_get_next - return the next irq timing
+ *
+ * @irq: a pointer to the interrupt number to start the lookup from
+ *
+ * Returns a pointer to a struct irq_timings, or NULL if we reach the
+ * end of the interrupt list.
+ */
+struct irq_timings *irqtiming_get_next(int *irq)
+{
+	struct irq_desc *desc;
+	int next;
+
+again:
+	/* Do a racy lookup of the next allocated irq */
+	next = irq_get_next_irq(*irq);
+	if (next >= nr_irqs)
+		return NULL;
+
+	*irq = next + 1;
+
+	/*
+	 * Now lookup the descriptor. It's RCU protected. This
+	 * descriptor might belong to an uninteresting interrupt or
+	 * one that is not measured. Look for the next interrupt in
+	 * that case.
+	 */
+	desc = irq_to_desc(next);
+	if (!desc || !(desc->istate & IRQS_TIMINGS))
+		goto again;
+
+	return this_cpu_ptr(desc->timings);
+}
-- 
1.9.1

^ permalink raw reply related	[flat|nested] 18+ messages in thread

* [PATCH V3 2/2] sched: idle: IRQ based next prediction for idle period
  2016-02-16 15:43 [PATCH V3 1/2] irq: Track the interrupt timings Daniel Lezcano
@ 2016-02-16 15:43 ` Daniel Lezcano
  2016-02-16 16:44   ` Nicolas Pitre
                     ` (2 more replies)
  2016-02-16 16:45 ` [PATCH V3 1/2] irq: Track the interrupt timings Nicolas Pitre
  2016-02-22 14:48 ` Shreyas B Prabhu
  2 siblings, 3 replies; 18+ messages in thread
From: Daniel Lezcano @ 2016-02-16 15:43 UTC (permalink / raw)
  To: tglx
  Cc: peterz, rafael, linux-pm, linux-kernel, nicolas.pitre, vincent.guittot

Many IRQs are quiet most of the time, or they tend to come in bursts of
fairly equal time intervals within each burst. It is therefore possible
to detect those IRQs with stable intervals and guesstimate when the next
IRQ event is most likely to happen.

Examples of such IRQs may include audio-related IRQs where the FIFO size
and/or DMA descriptor size together with the sample rate create stable intervals,
block devices during large data transfers, etc.  Even network streaming
of multimedia content creates patterns of periodic network interface IRQs
in some cases.

This patch adds code to compute the mean interval and variance for each IRQ
over a window of time intervals between IRQ events. Those statistics can
be used to assist cpuidle in selecting the most appropriate sleep state
by predicting the most likely time for the next interrupt.
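
As a rough illustration with made-up numbers: given a window of four
intervals of 995, 1000, 1005 and 1000 us, the mean is 1000 us and the
variance is (25 + 0 + 25 + 0) / 4 = ~12 us^2, i.e. a standard deviation
of about 3.5 us. The last interval deviates from the mean by less than
that, so such an IRQ would be considered predictable and its next event
expected roughly one mean interval after the last one.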

Signed-off-by: Daniel Lezcano <daniel.lezcano@linaro.org>
---
 drivers/cpuidle/Kconfig   |   9 ++
 kernel/sched/Makefile     |   1 +
 kernel/sched/idle-sched.c | 280 ++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 290 insertions(+)
 create mode 100644 kernel/sched/idle-sched.c

diff --git a/drivers/cpuidle/Kconfig b/drivers/cpuidle/Kconfig
index 7e48eb5..d64445b 100644
--- a/drivers/cpuidle/Kconfig
+++ b/drivers/cpuidle/Kconfig
@@ -23,6 +23,15 @@ config CPU_IDLE_GOV_LADDER
 config CPU_IDLE_GOV_MENU
 	bool "Menu governor (for tickless system)"
 
+config CPU_IDLE_GOV_SCHED
+	bool "Sched idle governor"
+	select IRQ_TIMINGS
+	help
+	  Enables an irq timings tracking mechanism to monitor the wakeup
+	  sources of the platform.
+
+	  If you are unsure, it is safe to say N.
+
 config DT_IDLE_STATES
 	bool
 
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
index 6768797..f7d5a35 100644
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -19,3 +19,4 @@ obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o
 obj-$(CONFIG_SCHEDSTATS) += stats.o
 obj-$(CONFIG_SCHED_DEBUG) += debug.o
 obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o
+obj-$(CONFIG_CPU_IDLE_GOV_SCHED) += idle-sched.o
diff --git a/kernel/sched/idle-sched.c b/kernel/sched/idle-sched.c
new file mode 100644
index 0000000..4dc03da
--- /dev/null
+++ b/kernel/sched/idle-sched.c
@@ -0,0 +1,280 @@
+/*
+ *  Copyright (C) 2016 Linaro Ltd, Daniel Lezcano <daniel.lezcano@linaro.org>
+ *                                 Nicolas Pitre <nicolas.pitre@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ */
+#include <linux/cpuidle.h>
+#include <linux/interrupt.h>
+#include <linux/ktime.h>
+#include <linux/tick.h>
+
+/**
+ * irqt_mean - compute the average
+ *
+ * @irqt: the irq timings structure
+ *
+ * Returns a u32 corresponding to the mean value, or zero if there is
+ * no data
+ */
+static inline u32 irqt_mean(struct irq_timings *irqt)
+{
+	return irqt->sum >> IRQ_TIMINGS_SHIFT;
+}
+
+/**
+ * irqt_variance - compute the variance
+ *
+ * @irqt: the irq timings structure
+ *
+ * Returns a u64 corresponding to the variance, or zero if there is
+ * no data
+ */
+static u64 irqt_variance(struct irq_timings *irqt, u32 mean)
+{
+	int i;
+	u64 variance = 0;
+
+	/*
+	 * The variance is the sum of the squared differences from the
+	 * mean, divided by the number of elements.
+	 */
+	for (i = 0; i < IRQ_TIMINGS_SIZE; i++) {
+		s32 diff = irqt->values[i] - mean;
+		variance += (s64)diff * diff;
+	}
+
+	return variance >> IRQ_TIMINGS_SHIFT;
+}
+
+/**
+ * next_irq_event - estimate the time before the next irq event
+ *
+ * Returns an estimate of the remaining time before an interrupt
+ * occurs, as an s64 duration in microseconds.
+ *
+ * The value is always greater than or equal to zero. If no prediction
+ * is possible, S64_MAX is returned.
+ */
+static s64 next_irq_event(void)
+{
+	struct irq_timings *irqt;
+	unsigned int irq = 0;
+	u64 variance, now;
+	s64 diff, next = 0, min = S64_MAX;
+	u32 interval, mean;
+	s32 deviation;
+
+	now = local_clock();
+
+	/*
+	 * Search for the earliest expected interrupt.
+	 */
+	rcu_read_lock();
+	while ((irqt = irqtiming_get_next(&irq))) {
+
+		/*
+		 * No values available.
+		 */
+		if (!irqt->timestamp)
+			continue;
+
+		/*
+		 * This interrupt last triggered more than a second ago.
+		 * It is definitely not predictable for our purpose anymore.
+		 */
+		if ((now - irqt->timestamp) > NSEC_PER_SEC)
+			continue;
+
+		/*
+		 * If the mean value is zero, just ignore this wakeup
+		 * source.
+		 */
+		mean = irqt_mean(irqt);
+		if (!mean)
+			continue;
+
+		variance = irqt_variance(irqt, mean);
+		/*
+		 * We want to check that the last interval is:
+		 *
+		 *  mean - stddev < interval < mean + stddev
+		 *
+		 * That simplifies to:
+		 *
+		 * -stddev < interval - mean < stddev
+		 *
+		 * abs(interval - mean) < stddev
+		 *
+		 * The standard deviation is the sqrt of the variance:
+		 *
+		 * abs(interval - mean) < sqrt(variance)
+		 *
+		 * and we want to avoid the sqrt, so we square the
+		 * equation:
+		 *
+		 * (interval - mean)^2 < variance
+		 *
+		 * So if the latest value of the stats complies with
+		 * this condition, then the wakeup source is
+		 * considered predictable and can be used to predict
+		 * the next event.
+		 */
+		interval = irqt->values[irqt->w_index];
+		deviation = interval - mean;
+		if ((s64)deviation * deviation > variance)
+			continue;
+
+		/*
+		 * Let's compute the next event: the wakeup source is
+		 * considered predictable, so we add the average
+		 * interval to the latest interrupt event
+		 * time. Note the timestamp is stored in nsec while
+		 * the mean time is in microsec.
+		 */
+		next = irqt->timestamp + (mean << 10);
+
+		/*
+		 * If the interrupt is supposed to happen before the
+		 * minimum time, then it becomes the minimum.
+		 */
+		if (next < min)
+			min = next;
+	}
+	rcu_read_unlock();
+
+	/*
+	 * There is no prediction for any interrupt, so we return
+	 * S64_MAX.
+	 */
+	if (!next)
+		return S64_MAX;
+
+	/*
+	 * At this point, we have our prediction but the caller is
+	 * expecting the remaining time before the next event, so
+	 * compute the expected sleep length.
+	 */
+	diff = min - now;
+
+	/*
+	 * min and now are in nanoseconds; convert the result of
+	 * the diff to microseconds.
+	 */
+	diff >>= 10;
+
+	/*
+	 * The result could be negative for different reasons:
+	 * - the prediction is incorrect, but we can't do anything about
+	 *   it here except assume the interrupt effectively occurred;
+	 *
+	 * - the prediction was too close to now and expired while we
+	 *   were in this function.
+	 *
+	 * In those cases, we return 0. Otherwise we return the expected
+	 * duration before an interrupt occurs.
+	 */
+	return diff > 0 ? diff : 0;
+}
+
+/**
+ * sched_idle_next_wakeup - Predict the next wakeup on the current cpu
+ *
+ * The next event on the cpu is estimated from the interrupt event
+ * statistics and the deterministic timer value. From the timer and
+ * the irqs, we return the one expected to occur first.
+ *
+ * Returns the expected remaining idle time before being woken up by
+ * an interrupt.
+ */
+s64 sched_idle_next_wakeup(void)
+{
+	s64 next_timer = ktime_to_us(tick_nohz_get_sleep_length());
+	s64 next_irq = next_irq_event();
+
+	return min(next_irq, next_timer);
+}
+
+/**
+ * sched_idle - go to idle for a specified amount of time
+ *
+ * @duration: the expected idle duration
+ * @latency: the latency constraint
+ *
+ * Returns 0 on success, < 0 otherwise.
+ */
+int sched_idle(s64 duration, unsigned int latency)
+{
+	struct cpuidle_device *dev = __this_cpu_read(cpuidle_devices);
+	struct cpuidle_driver *drv = cpuidle_get_cpu_driver(dev);
+	struct cpuidle_state_usage *su;
+	struct cpuidle_state *s;
+	int i, ret = 0, index = -1;
+
+	rcu_idle_enter();
+
+	/*
+	 * No cpuidle driver is available, let's use the default arch
+	 * idle function.
+	 */
+	if (cpuidle_not_available(drv, dev))
+		goto default_idle;
+
+	/*
+	 * Find the idle state with the lowest power while satisfying
+	 * our constraints. We will save energy if the idle duration is
+	 * longer than the target residency, which is the break-even
+	 * point. The choice is further constrained by the exit
+	 * latency.
+	 */
+	for (i = 0; i < drv->state_count; i++) {
+
+		s = &drv->states[i];
+
+		su = &dev->states_usage[i];
+
+		if (s->disabled || su->disable)
+			continue;
+		if (s->target_residency > duration)
+			continue;
+		if (s->exit_latency > latency)
+			continue;
+
+		index = i;
+	}
+
+	/*
+	 * A reschedule is pending, so it is pointless to enter idle;
+	 * just re-enable the local interrupts and return.
+	 */
+	if (current_clr_polling_and_test()) {
+		local_irq_enable();
+		goto out;
+	}
+
+	if (index < 0) {
+		/*
+		 * No idle state fulfilled the constraints; jump to
+		 * the default function as if there were no
+		 * cpuidle driver.
+		 */
+		goto default_idle;
+	} else {
+		/*
+		 * Enter the idle state selected above. This
+		 * function will block until an interrupt occurs
+		 * and will take care of re-enabling the local
+		 * interrupts.
+		 */
+		return cpuidle_enter(drv, dev, index);
+	}
+
+default_idle:
+	default_idle_call();
+out:
+	rcu_idle_exit();
+	return ret;
+}
-- 
1.9.1

^ permalink raw reply related	[flat|nested] 18+ messages in thread

* Re: [PATCH V3 2/2] sched: idle: IRQ based next prediction for idle period
  2016-02-16 15:43 ` [PATCH V3 2/2] sched: idle: IRQ based next prediction for idle period Daniel Lezcano
@ 2016-02-16 16:44   ` Nicolas Pitre
  2016-02-17 23:09     ` Rafael J. Wysocki
  2016-02-22 15:02   ` Shreyas B Prabhu
  2016-02-23 10:06   ` Shreyas B Prabhu
  2 siblings, 1 reply; 18+ messages in thread
From: Nicolas Pitre @ 2016-02-16 16:44 UTC (permalink / raw)
  To: Daniel Lezcano
  Cc: tglx, peterz, rafael, linux-pm, linux-kernel, vincent.guittot

On Tue, 16 Feb 2016, Daniel Lezcano wrote:

> Many IRQs are quiet most of the time, or they tend to come in bursts of
> fairly equal time intervals within each burst. It is therefore possible
> to detect those IRQs with stable intervals and guestimate when the next
> IRQ event is most likely to happen.
> 
> Examples of such IRQs may include audio related IRQs where the FIFO size
> and/or DMA descriptor size with the sample rate create stable intervals,
> block devices during large data transfers, etc.  Even network streaming
> of multimedia content creates patterns of periodic network interface IRQs
> in some cases.
> 
> This patch adds code to compute the mean interval and variance for each IRQ
> over a window of time intervals between IRQ events. Those statistics can
> be used to assist cpuidle in selecting the most appropriate sleep state
> by predicting the most likely time for the next interrupt.
> 
> Signed-off-by: Daniel Lezcano <daniel.lezcano@linaro.org>

The math in next_irq_event() is correct even though I think it could be 
done more simply.  But that can be optimized at a later time.
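
For instance (purely illustrative, and not necessarily what is meant
above): if handle_timings() also maintained a sum of squares in a
hypothetical sum_sq field when it replaces the oldest value 'old', the
per-element loop in irqt_variance() could be replaced by using
Var(X) = E[X^2] - E[X]^2:

	timings->sum    += diff - old;
	timings->sum_sq += (u64)diff * diff - (u64)old * old;

	mean     = timings->sum >> IRQ_TIMINGS_SHIFT;
	variance = (timings->sum_sq >> IRQ_TIMINGS_SHIFT) - (u64)mean * mean;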

Reviewed-by: Nicolas Pitre <nico@linaro.org>

> ---
>  drivers/cpuidle/Kconfig   |   9 ++
>  kernel/sched/Makefile     |   1 +
>  kernel/sched/idle-sched.c | 280 ++++++++++++++++++++++++++++++++++++++++++++++
>  3 files changed, 290 insertions(+)
>  create mode 100644 kernel/sched/idle-sched.c
> 
> diff --git a/drivers/cpuidle/Kconfig b/drivers/cpuidle/Kconfig
> index 7e48eb5..d64445b 100644
> --- a/drivers/cpuidle/Kconfig
> +++ b/drivers/cpuidle/Kconfig
> @@ -23,6 +23,15 @@ config CPU_IDLE_GOV_LADDER
>  config CPU_IDLE_GOV_MENU
>  	bool "Menu governor (for tickless system)"
>  
> +config CPU_IDLE_GOV_SCHED
> +	bool "Sched idle governor"
> +	select IRQ_TIMINGS
> +	help
> +	  Enables an irq timings tracking mechanism to track the wakeup sources
> +	  of the platform.
> +
> +	  If you are unsure, it is safe to say N.
> +
>  config DT_IDLE_STATES
>  	bool
>  
> diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
> index 6768797..f7d5a35 100644
> --- a/kernel/sched/Makefile
> +++ b/kernel/sched/Makefile
> @@ -19,3 +19,4 @@ obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o
>  obj-$(CONFIG_SCHEDSTATS) += stats.o
>  obj-$(CONFIG_SCHED_DEBUG) += debug.o
>  obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o
> +obj-$(CONFIG_CPU_IDLE_GOV_SCHED) += idle-sched.o
> diff --git a/kernel/sched/idle-sched.c b/kernel/sched/idle-sched.c
> new file mode 100644
> index 0000000..4dc03da
> --- /dev/null
> +++ b/kernel/sched/idle-sched.c
> @@ -0,0 +1,280 @@
> +/*
> + *  Copyright (C) 2016 Linaro Ltd, Daniel Lezcano <daniel.lezcano@linaro.org>
> + *                                 Nicolas Pitre <nicolas.pitre@linaro.org>
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License version 2 as
> + * published by the Free Software Foundation.
> + *
> + */
> +#include <linux/cpuidle.h>
> +#include <linux/interrupt.h>
> +#include <linux/ktime.h>
> +#include <linux/tick.h>
> +
> +/**
> + * irqt_mean - compute the average
> + *
> + * @irqt: the irq timings structure
> + *
> + * Returns an u32 corresponding to the mean value, or zero if there is
> + * no data
> + */
> +static inline u32 irqt_mean(struct irq_timings *irqt)
> +{
> +	return irqt->sum >> IRQ_TIMINGS_SHIFT;
> +}
> +
> +/**
> + * irqt_variance - compute the variance
> + *
> + * @irqt: the irq timings structure
> + *
> + * Returns an u64 corresponding to the variance, or zero if there is
> + * no data
> + */
> +static u64 irqt_variance(struct irq_timings *irqt, u32 mean)
> +{
> +	int i;
> +	u64 variance = 0;
> +
> +	/*
> +	 * The variance is the sum of the squared difference to the
> +	 * average divided by the number of elements.
> +	 */
> +	for (i = 0; i < IRQ_TIMINGS_SIZE; i++) {
> +		s32 diff = irqt->values[i] - mean;
> +		variance += (s64)diff * diff;
> +	}
> +
> +	return variance >> IRQ_TIMINGS_SHIFT;
> +}
> +
> +/**
> + * next_irq_event - returns the next irq event
> + *
> + * Returns a guess estimate remaining time before an interrupt
> + * occurs. The type s64 corresponding in a duration in microsecond.
> + *
> + * The value is always greater or equal to zero. In case, there is no
> + * possible prediction, S64_MAX is returned.
> + */
> +static s64 next_irq_event(void)
> +{
> +	struct irq_timings *irqt;
> +	unsigned int irq = 0;
> +	u64 variance, now;
> +	s64 diff, next = 0, min = S64_MAX;
> +	u32 interval, mean;
> +	s32 deviation;
> +
> +	now = local_clock();
> +
> +	/*
> +	 * Search for the earliest expected interruption.
> +	 */
> +	rcu_read_lock();
> +	while ((irqt = irqtiming_get_next(&irq))) {
> +
> +		/*
> +		 * No values available.
> +		 */
> +		if (!irqt->timestamp)
> +			continue;
> +
> +		/*
> +		 * This interrupt last triggered more than a second ago.
> +		 * It is definitely not predictable for our purpose anymore.
> +		 */
> +		if ((now - irqt->timestamp) > NSEC_PER_SEC)
> +			continue;
> +
> +		/*
> +		 * If the mean value is null, just ignore this wakeup
> +		 * source.
> +		 */
> +		mean = irqt_mean(irqt);
> +		if (!mean)
> +			continue;
> +
> +		variance = irqt_variance(irqt, mean);
> +		/*
> +		 * We want to check the last interval is:
> +		 *
> +		 *  mean - stddev < interval < mean + stddev
> +		 *
> +		 * That simplifies to:
> +		 *
> +		 * -stddev < interval - mean < stddev
> +		 *
> +		 * abs(interval - mean) < stddev
> +		 *
> +		 * The standard deviation is the sqrt of the variance:
> +		 *
> +		 * abs(interval - mean) < sqrt(variance)
> +		 *
> +		 * and we want to avoid the sqrt, so we square the
> +		 * equation:
> +		 *
> +		 * (interval - mean)^2 < variance
> +		 *
> +		 * So if the latest value of the stats complies with
> +		 * this condition, then the wakeup source is
> +		 * considered predictable and can be used to predict
> +		 * the next event.
> +		 */
> +		interval = irqt->values[irqt->w_index];
> +		deviation = interval - mean;
> +		if ((s64)deviation * deviation > variance)
> +			continue;
> +
> +		/*
> +		 * Let's compute the next event: the wakeup source is
> +		 * considered predictable, we add the average interval
> +		 * time added to the latest interruption event
> +		 * time. Note the timestamp is stored in nsec while
> +		 * the mean time is microsec.
> +		 */
> +		next = irqt->timestamp + (mean << 10);
> +
> +		/*
> +		 * If the interrupt is supposed to happen before the
> +		 * minimum time, then it becomes the minimum.
> +		 */
> +		if (next < min)
> +			min = next;
> +	}
> +	rcu_read_unlock();
> +
> +	/*
> +	 * There is no prediction for any interrupt, so we return
> +	 * S64_MAX.
> +	 */
> +	if (!next)
> +		return S64_MAX;
> +
> +	/*
> +	 * At this point, we have our prediction but the caller is
> +	 * expecting the remaining time before the next event, so
> +	 * compute the expected sleep length.
> +	 */
> +	diff = min - now;
> +
> +	/*
> +	 * min and now are nanosecond units, let's convert to
> +	 * microsec the result of the diff.
> +	 */
> +	diff >>= 10;
> +
> +	/*
> +	 * The result could be negative for different reasons:
> +	 * - the prediction is incorrect but we can't do anything here
> +	 *   except assuming the interrupt occured effectively.
> +	 *    
> +	 * - the prediction was too near now and expired while we were
> +	 *   in this function.
> +	 *
> +	 * In those cases, we return 0. Otherwise we return the expected
> +	 * duration before an interrupt occurs.
> +	 */
> +	return diff > 0 ? diff : 0;
> +}
> +
> +/**
> + * sched_idle_next_wakeup - Predict the next wakeup on the current cpu
> + *
> + * The next event on the cpu is based on a statistic approach of the
> + * interrupt events and the timer deterministic value. From the timer
> + * or the irqs, we return the one expected to occur first.
> + *
> + * Returns the expected remaining idle time before being woken up by
> + * an interruption.
> + */
> +s64 sched_idle_next_wakeup(void)
> +{
> +	s64 next_timer = ktime_to_us(tick_nohz_get_sleep_length());
> +	s64 next_irq = next_irq_event();
> +
> +	return min(next_irq, next_timer);
> +}
> +
> +/**
> + * sched_idle - go to idle for a specified amount of time
> + *
> + * @duration: the idle duration time
> + * @latency: the latency constraint
> + *
> + * Returns 0 on success, < 0 otherwise.
> + */
> +int sched_idle(s64 duration, unsigned int latency)
> +{
> +	struct cpuidle_device *dev = __this_cpu_read(cpuidle_devices);
> +	struct cpuidle_driver *drv = cpuidle_get_cpu_driver(dev);
> +	struct cpuidle_state_usage *su;
> +	struct cpuidle_state *s;
> +	int i, ret = 0, index = -1;
> +
> +	rcu_idle_enter();
> +
> +	/*
> +	 * No cpuidle driver is available, let's use the default arch
> +	 * idle function.
> +	 */
> +	if (cpuidle_not_available(drv, dev))
> +		goto default_idle;
> +
> +	/*
> +	 * Find the idle state with the lowest power while satisfying
> +	 * our constraints. We will save energy if the duration of the
> +	 * idle time is bigger than the target residency which is the
> +	 * break even point. The choice will be modulated by the
> +	 * latency.
> +	 */
> +	for (i = 0; i < drv->state_count; i++) {
> +
> +		s = &drv->states[i];
> +
> +		su = &dev->states_usage[i];
> +
> +		if (s->disabled || su->disable)
> +			continue;
> +		if (s->target_residency > duration)
> +			continue;
> +		if (s->exit_latency > latency)
> +			continue;
> +
> +		index = i;
> +	}
> +
> +	/*
> +	 * The idle task must be scheduled, it is pointless to go to
> +	 * idle, just re-enable the interrupt and return.
> +	 */
> +	if (current_clr_polling_and_test()) {
> +		local_irq_enable();
> +		goto out;
> +	}
> +
> +	if (index < 0) {
> +		/*
> +		 * No idle callbacks fulfilled the constraints, jump
> +		 * to the default function like there wasn't any
> +		 * cpuidle driver.
> +		 */
> +		goto default_idle;
> +	} else {
> +		/*
> +		 * Enter the idle state previously returned by the
> +		 * governor decision.  This function will block until
> +		 * an interrupt occurs and will take care of
> +		 * re-enabling the local interrupts
> +		 */
> +		return cpuidle_enter(drv, dev, index);
> +	}
> +
> +default_idle:
> +	default_idle_call();
> +out:
> +	rcu_idle_exit();
> +	return ret;
> +}
> -- 
> 1.9.1
> 
> 

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH V3 1/2] irq: Track the interrupt timings
  2016-02-16 15:43 [PATCH V3 1/2] irq: Track the interrupt timings Daniel Lezcano
  2016-02-16 15:43 ` [PATCH V3 2/2] sched: idle: IRQ based next prediction for idle period Daniel Lezcano
@ 2016-02-16 16:45 ` Nicolas Pitre
  2016-02-22 14:48 ` Shreyas B Prabhu
  2 siblings, 0 replies; 18+ messages in thread
From: Nicolas Pitre @ 2016-02-16 16:45 UTC (permalink / raw)
  To: Daniel Lezcano
  Cc: tglx, peterz, rafael, linux-pm, linux-kernel, vincent.guittot

On Tue, 16 Feb 2016, Daniel Lezcano wrote:

> The interrupt framework gives a lot of information about each interrupt.
> It does not keep track of when those interrupts occur though.
> 
> This patch provides a mean to record the elapsed time between successive
> interrupt occurrences in a per-IRQ per-CPU circular buffer to help with
> the prediction of the next occurrence using a statistical model.
> 
> A new function is added to browse the different interrupts and retrieve the
> timing information stored in it.
> 
> A static key will be introduced when the irq prediction is switched on at
> runtime in order to reduce an overhead near to zero when the kernel is not
> using it.
> 
> Signed-off-by: Daniel Lezcano <daniel.lezcano@linaro.org>

Acked-by: Nicolas Pitre <nico@linaro.org>



> ---
>  include/linux/interrupt.h |  17 +++++++
>  include/linux/irqdesc.h   |   4 ++
>  kernel/irq/Kconfig        |   3 ++
>  kernel/irq/Makefile       |   1 +
>  kernel/irq/handle.c       |   2 +
>  kernel/irq/internals.h    |  42 ++++++++++++++++++
>  kernel/irq/irqdesc.c      |  10 +++++
>  kernel/irq/manage.c       |   3 ++
>  kernel/irq/timings.c      | 110 ++++++++++++++++++++++++++++++++++++++++++++++
>  9 files changed, 192 insertions(+)
>  create mode 100644 kernel/irq/timings.c
> 
> diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
> index 0e95fcc..f053596 100644
> --- a/include/linux/interrupt.h
> +++ b/include/linux/interrupt.h
> @@ -665,6 +665,23 @@ static inline void init_irq_proc(void)
>  }
>  #endif
>  
> +#ifdef CONFIG_IRQ_TIMINGS
> +
> +#define IRQ_TIMINGS_SHIFT	2
> +#define IRQ_TIMINGS_SIZE	(1 << IRQ_TIMINGS_SHIFT)
> +#define IRQ_TIMINGS_MASK	(IRQ_TIMINGS_SIZE - 1)
> +
> +struct irq_timings {
> +	u32 values[IRQ_TIMINGS_SIZE];	/* our circular buffer */
> +	u64 sum;			/* sum of values */
> +	u64 timestamp;			/* latest timestamp */
> +	unsigned int w_index;		/* current buffer index */
> +};
> +
> +struct irq_timings *irqtiming_get_next(int *irq);
> +
> +#endif
> +
>  struct seq_file;
>  int show_interrupts(struct seq_file *p, void *v);
>  int arch_show_interrupts(struct seq_file *p, int prec);
> diff --git a/include/linux/irqdesc.h b/include/linux/irqdesc.h
> index dcca77c..f4e29b2 100644
> --- a/include/linux/irqdesc.h
> +++ b/include/linux/irqdesc.h
> @@ -12,6 +12,7 @@ struct proc_dir_entry;
>  struct module;
>  struct irq_desc;
>  struct irq_domain;
> +struct irq_timings;
>  struct pt_regs;
>  
>  /**
> @@ -51,6 +52,9 @@ struct irq_desc {
>  	struct irq_data		irq_data;
>  	unsigned int __percpu	*kstat_irqs;
>  	irq_flow_handler_t	handle_irq;
> +#ifdef CONFIG_IRQ_TIMINGS
> +	struct irq_timings __percpu *timings;
> +#endif
>  #ifdef CONFIG_IRQ_PREFLOW_FASTEOI
>  	irq_preflow_handler_t	preflow_handler;
>  #endif
> diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig
> index 3b48dab..392c9f5 100644
> --- a/kernel/irq/Kconfig
> +++ b/kernel/irq/Kconfig
> @@ -77,6 +77,9 @@ config GENERIC_MSI_IRQ_DOMAIN
>  config HANDLE_DOMAIN_IRQ
>  	bool
>  
> +config IRQ_TIMINGS
> +	bool
> +
>  config IRQ_DOMAIN_DEBUG
>  	bool "Expose hardware/virtual IRQ mapping via debugfs"
>  	depends on IRQ_DOMAIN && DEBUG_FS
> diff --git a/kernel/irq/Makefile b/kernel/irq/Makefile
> index 2fc9cbd..9c6d3e8 100644
> --- a/kernel/irq/Makefile
> +++ b/kernel/irq/Makefile
> @@ -8,3 +8,4 @@ obj-$(CONFIG_GENERIC_PENDING_IRQ) += migration.o
>  obj-$(CONFIG_GENERIC_IRQ_MIGRATION) += cpuhotplug.o
>  obj-$(CONFIG_PM_SLEEP) += pm.o
>  obj-$(CONFIG_GENERIC_MSI_IRQ) += msi.o
> +obj-$(CONFIG_IRQ_TIMINGS) += timings.o
> diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
> index a15b548..cd37536 100644
> --- a/kernel/irq/handle.c
> +++ b/kernel/irq/handle.c
> @@ -138,6 +138,8 @@ irqreturn_t handle_irq_event_percpu(struct irq_desc *desc)
>  	unsigned int flags = 0, irq = desc->irq_data.irq;
>  	struct irqaction *action;
>  
> +	handle_timings(desc);
> +
>  	for_each_action_of_desc(desc, action) {
>  		irqreturn_t res;
>  
> diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
> index eab521fc..3d100af 100644
> --- a/kernel/irq/internals.h
> +++ b/kernel/irq/internals.h
> @@ -56,6 +56,7 @@ enum {
>  	IRQS_WAITING		= 0x00000080,
>  	IRQS_PENDING		= 0x00000200,
>  	IRQS_SUSPENDED		= 0x00000800,
> +	IRQS_TIMINGS		= 0x00001000,
>  };
>  
>  #include "debug.h"
> @@ -218,3 +219,44 @@ irq_pm_install_action(struct irq_desc *desc, struct irqaction *action) { }
>  static inline void
>  irq_pm_remove_action(struct irq_desc *desc, struct irqaction *action) { }
>  #endif
> +
> +#ifdef CONFIG_IRQ_TIMINGS
> +static inline int alloc_timings(struct irq_desc *desc)
> +{
> +	desc->timings = alloc_percpu(struct irq_timings);
> +	if (!desc->timings)
> +		return -ENOMEM;
> +
> +	return 0;
> +}
> +
> +static inline void free_timings(struct irq_desc *desc)
> +{
> +	free_percpu(desc->timings);
> +}
> +
> +static inline void remove_timings(struct irq_desc *desc)
> +{
> +	desc->istate &= ~IRQS_TIMINGS;
> +}
> +
> +static inline void setup_timings(struct irq_desc *desc, struct irqaction *act)
> +{
> +	/*
> +	 * Timers are deterministic, so no need to do any measurement
> +	 * on them.
> +	 */
> +	if (act->flags & __IRQF_TIMER)
> +		return;
> +
> +	desc->istate |= IRQS_TIMINGS;
> +}
> +extern void handle_timings(struct irq_desc *desc);
> +#else
> +static inline int alloc_timings(struct irq_desc *desc) { return 0; }
> +static inline void free_timings(struct irq_desc *desc) {}
> +static inline void handle_timings(struct irq_desc *desc) {}
> +static inline void remove_timings(struct irq_desc *desc) {}
> +static inline void setup_timings(struct irq_desc *desc,
> +				 struct irqaction *act) {};
> +#endif
> diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
> index 0ccd028..577686b 100644
> --- a/kernel/irq/irqdesc.c
> +++ b/kernel/irq/irqdesc.c
> @@ -174,6 +174,9 @@ static struct irq_desc *alloc_desc(int irq, int node, struct module *owner)
>  	if (alloc_masks(desc, gfp, node))
>  		goto err_kstat;
>  
> +	if (alloc_timings(desc))
> +		goto err_mask;
> +
>  	raw_spin_lock_init(&desc->lock);
>  	lockdep_set_class(&desc->lock, &irq_desc_lock_class);
>  	init_rcu_head(&desc->rcu);
> @@ -182,6 +185,8 @@ static struct irq_desc *alloc_desc(int irq, int node, struct module *owner)
>  
>  	return desc;
>  
> +err_mask:
> +	free_masks(desc);
>  err_kstat:
>  	free_percpu(desc->kstat_irqs);
>  err_desc:
> @@ -220,6 +225,11 @@ static void free_desc(unsigned int irq)
>  	 * the child interrupts.
>  	 */
>  	call_rcu(&desc->rcu, delayed_free_desc);
> +
> +	free_timings(desc);
> +	free_masks(desc);
> +	free_percpu(desc->kstat_irqs);
> +	kfree(desc);
>  }
>  
>  static int alloc_descs(unsigned int start, unsigned int cnt, int node,
> diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
> index 3ddd229..132c2d7 100644
> --- a/kernel/irq/manage.c
> +++ b/kernel/irq/manage.c
> @@ -1343,6 +1343,8 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
>  		__enable_irq(desc);
>  	}
>  
> +	setup_timings(desc, new);
> +
>  	raw_spin_unlock_irqrestore(&desc->lock, flags);
>  
>  	/*
> @@ -1465,6 +1467,7 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id)
>  		irq_settings_clr_disable_unlazy(desc);
>  		irq_shutdown(desc);
>  		irq_release_resources(desc);
> +		remove_timings(desc);
>  	}
>  
>  #ifdef CONFIG_SMP
> diff --git a/kernel/irq/timings.c b/kernel/irq/timings.c
> new file mode 100644
> index 0000000..95976fa0
> --- /dev/null
> +++ b/kernel/irq/timings.c
> @@ -0,0 +1,110 @@
> +/*
> + * linux/kernel/irq/timings.c
> + *
> + * Copyright (C) 2016, Linaro Ltd - Daniel Lezcano <daniel.lezcano@linaro.org>
> + *
> + */
> +#include <linux/interrupt.h>
> +#include <linux/irq.h>
> +#include <linux/irqdesc.h>
> +#include <linux/percpu.h>
> +
> +#include "internals.h"
> +
> +/**
> + * handle_timings - stores an irq timing  when an interrupt occurs
> + *
> + * @desc: the irq descriptor
> + *
> + * For all interruptions with their IRQS_TIMINGS flag set, the function
> + * computes the time interval between two interrupt events and store it
> + * in a circular buffer.
> + */
> +void handle_timings(struct irq_desc *desc)
> +{
> +	struct irq_timings *timings;
> +	u64 prev, now, diff;
> +
> +	if (!(desc->istate & IRQS_TIMINGS))
> +		return;
> +
> +	timings = this_cpu_ptr(desc->timings);
> +	now = local_clock();
> +	prev = timings->timestamp;
> +	timings->timestamp = now;
> +
> +	/*
> +	 * If it is the first interrupt of the series, we can't
> +	 * compute an interval, just store the timestamp and exit.
> +	 */
> +	if (unlikely(!prev))
> +		return;
> +
> +	diff = now - prev;
> +
> +	/*
> +	 * microsec (actually 1024th of a milisec) precision is good
> +	 * enough for our purpose.
> +	 */
> +	diff >>= 10;
> +
> +	/*
> +	 * There is no point to store intervals from interrupts more
> +	 * than ~1 second apart. Furthermore that increases the risk
> +	 * of overflowing our variance computation. Reset all values
> +	 * in that case. Otherwise we know the magnitude of diff is
> +	 * well within 32 bits.
> +	 */
> +	if (unlikely(diff > USEC_PER_SEC)) {
> +		memset(timings, 0, sizeof(*timings));
> +		timings->timestamp = now;
> +		return;
> +	}
> +
> +	/* The oldest value corresponds to the next index. */
> +	timings->w_index = (timings->w_index + 1) & IRQ_TIMINGS_MASK;
> +
> +	/*
> +	 * Remove the oldest value from the summing. If this is the
> +	 * first time we go through this array slot, the previous
> +	 * value will be zero and we won't substract anything from the
> +	 * current sum. Hence this code relies on a zero-ed structure.
> +	 */
> +	timings->sum -= timings->values[timings->w_index];
> +	timings->values[timings->w_index] = diff;
> +	timings->sum += diff;
> +}
> +
> +/**
> + * irqtiming_get_next - return the next irq timing
> + *
> + * @int: an integer representing the interrupt number
> + *
> + * Returns a struct irq_timings, NULL if we reach the end of the
> + * interrupts list.
> + */
> +struct irq_timings *irqtiming_get_next(int *irq)
> +{
> +	struct irq_desc *desc;
> +	int next;
> +
> +again:
> +	/* Do a racy lookup of the next allocated irq */
> +	next = irq_get_next_irq(*irq);
> +	if (next >= nr_irqs)
> +		return NULL;
> +
> +	*irq = next + 1;
> +
> +	/*
> +	 * Now lookup the descriptor. It's RCU protected. This
> +	 * descriptor might belong to an uninteresting interrupt or
> +	 * one that is not measured. Look for the next interrupt in
> +	 * that case.
> +	 */
> +	desc = irq_to_desc(next);
> +	if (!desc || !(desc->istate & IRQS_TIMINGS))
> +		goto again;
> +
> +	return this_cpu_ptr(desc->timings);
> +}
> -- 
> 1.9.1
> 
> 

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH V3 2/2] sched: idle: IRQ based next prediction for idle period
  2016-02-16 16:44   ` Nicolas Pitre
@ 2016-02-17 23:09     ` Rafael J. Wysocki
  2016-02-17 23:21       ` Rafael J. Wysocki
  0 siblings, 1 reply; 18+ messages in thread
From: Rafael J. Wysocki @ 2016-02-17 23:09 UTC (permalink / raw)
  To: Nicolas Pitre, Daniel Lezcano
  Cc: Thomas Gleixner, Peter Zijlstra, Rafael J. Wysocki, linux-pm,
	Linux Kernel Mailing List, Vincent Guittot

On Tue, Feb 16, 2016 at 5:44 PM, Nicolas Pitre <nicolas.pitre@linaro.org> wrote:
> On Tue, 16 Feb 2016, Daniel Lezcano wrote:
>
>> Many IRQs are quiet most of the time, or they tend to come in bursts of
>> fairly equal time intervals within each burst. It is therefore possible
>> to detect those IRQs with stable intervals and guestimate when the next
>> IRQ event is most likely to happen.
>>
>> Examples of such IRQs may include audio related IRQs where the FIFO size
>> and/or DMA descriptor size with the sample rate create stable intervals,
>> block devices during large data transfers, etc.  Even network streaming
>> of multimedia content creates patterns of periodic network interface IRQs
>> in some cases.
>>
>> This patch adds code to compute the mean interval and variance for each IRQ
>> over a window of time intervals between IRQ events. Those statistics can
>> be used to assist cpuidle in selecting the most appropriate sleep state
>> by predicting the most likely time for the next interrupt.
>>
>> Signed-off-by: Daniel Lezcano <daniel.lezcano@linaro.org>
>
> The math in next_irq_event() is correct even though I think it could be
> done more simply.  But that can be optimized at a later time.
>
> Reviewed-by: Nicolas Pitre <nico@linaro.org>

Well, I'm likely overlooking something, but how is this going to be
hooked up to the code in idle.c?

Thanks,
Rafael

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH V3 2/2] sched: idle: IRQ based next prediction for idle period
  2016-02-17 23:09     ` Rafael J. Wysocki
@ 2016-02-17 23:21       ` Rafael J. Wysocki
  2016-02-18 10:25         ` Daniel Lezcano
  0 siblings, 1 reply; 18+ messages in thread
From: Rafael J. Wysocki @ 2016-02-17 23:21 UTC (permalink / raw)
  To: Daniel Lezcano
  Cc: Nicolas Pitre, Thomas Gleixner, Peter Zijlstra, linux-pm,
	Linux Kernel Mailing List, Vincent Guittot

On Thu, Feb 18, 2016 at 12:09 AM, Rafael J. Wysocki <rafael@kernel.org> wrote:
> On Tue, Feb 16, 2016 at 5:44 PM, Nicolas Pitre <nicolas.pitre@linaro.org> wrote:
>> On Tue, 16 Feb 2016, Daniel Lezcano wrote:
>>
>>> Many IRQs are quiet most of the time, or they tend to come in bursts of
>>> fairly equal time intervals within each burst. It is therefore possible
>>> to detect those IRQs with stable intervals and guestimate when the next
>>> IRQ event is most likely to happen.
>>>
>>> Examples of such IRQs may include audio related IRQs where the FIFO size
>>> and/or DMA descriptor size with the sample rate create stable intervals,
>>> block devices during large data transfers, etc.  Even network streaming
>>> of multimedia content creates patterns of periodic network interface IRQs
>>> in some cases.
>>>
>>> This patch adds code to compute the mean interval and variance for each IRQ
>>> over a window of time intervals between IRQ events. Those statistics can
>>> be used to assist cpuidle in selecting the most appropriate sleep state
>>> by predicting the most likely time for the next interrupt.
>>>
>>> Signed-off-by: Daniel Lezcano <daniel.lezcano@linaro.org>
>>
>> The math in next_irq_event() is correct even though I think it could be
>> done more simply.  But that can be optimized at a later time.
>>
>> Reviewed-by: Nicolas Pitre <nico@linaro.org>
>
> Well, I'm likely overlooking something, but how is this going to be
> hooked up to the code in idle.c?

My somewhat educated guess is that sched_idle() in your patch is
intended to replace cpuidle_idle_call(), right?

If so, why do you want to replace it?

And assuming that you have a good enough reason to do that, you need
to ensure that suspend-to-idle will work anyway.

Thanks,
Rafael

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH V3 2/2] sched: idle: IRQ based next prediction for idle period
  2016-02-17 23:21       ` Rafael J. Wysocki
@ 2016-02-18 10:25         ` Daniel Lezcano
  2016-02-18 19:57           ` Rafael J. Wysocki
  0 siblings, 1 reply; 18+ messages in thread
From: Daniel Lezcano @ 2016-02-18 10:25 UTC (permalink / raw)
  To: Rafael J. Wysocki
  Cc: Nicolas Pitre, Thomas Gleixner, Peter Zijlstra, linux-pm,
	Linux Kernel Mailing List, Vincent Guittot

On 02/17/2016 11:21 PM, Rafael J. Wysocki wrote:

[ ... ]

>>> Reviewed-by: Nicolas Pitre <nico@linaro.org>
>>
>> Well, I'm likely overlooking something, but how is this going to be
>> hooked up to the code in idle.c?
>
> My somewhat educated guess is that sched_idle() in your patch is
> intended to replace cpuidle_idle_call(), right?

Well, no. I was planning to first have it use a different code path
as experimental code in order to focus on improving the accuracy of the
prediction, and then merge or replace cpuidle_idle_call() with sched_idle().

> If so, why do you want to replace it?
>
> And assuming that you have a good enough reason to do that, you need
> to ensure that suspend-to-idle will work anyway.

Yes, sure.


-- 
  <http://www.linaro.org/> Linaro.org │ Open source software for ARM SoCs

Follow Linaro:  <http://www.facebook.com/pages/Linaro> Facebook |
<http://twitter.com/#!/linaroorg> Twitter |
<http://www.linaro.org/linaro-blog/> Blog

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH V3 2/2] sched: idle: IRQ based next prediction for idle period
  2016-02-18 10:25         ` Daniel Lezcano
@ 2016-02-18 19:57           ` Rafael J. Wysocki
  2016-02-19 15:01             ` Daniel Lezcano
  0 siblings, 1 reply; 18+ messages in thread
From: Rafael J. Wysocki @ 2016-02-18 19:57 UTC (permalink / raw)
  To: Daniel Lezcano
  Cc: Rafael J. Wysocki, Nicolas Pitre, Thomas Gleixner,
	Peter Zijlstra, linux-pm, Linux Kernel Mailing List,
	Vincent Guittot

On Thu, Feb 18, 2016 at 11:25 AM, Daniel Lezcano
<daniel.lezcano@linaro.org> wrote:
> On 02/17/2016 11:21 PM, Rafael J. Wysocki wrote:
>
> [ ... ]
>
>>>> Reviewed-by: Nicolas Pitre <nico@linaro.org>
>>>
>>>
>>> Well, I'm likely overlooking something, but how is this going to be
>>> hooked up to the code in idle.c?
>>
>>
>> My somewhat educated guess is that sched_idle() in your patch is
>> intended to replace cpuidle_idle_call(), right?
>
>
> Well, no. I was planning to first have it to use a different code path as
> experimental code in order to focus improving the accuracy of the prediction
> and then merge or replace cpuidle_idle_call() with sched_idle().

In that case, what about making it a proper cpuidle governor that
people can test and play with in a usual way?  Then it may potentially
benefit everybody and not just your experimental setup and you may get
coverage on systems you have no access to normally.

There is some boilerplate code to add for this purpose, but that's not
that bad IMO.
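
Schematically, and only to illustrate the shape of that boilerplate (the
names are made up; sched_idle_next_wakeup() is the helper from patch 2/2),
it would be something like:

	static int irqt_select(struct cpuidle_driver *drv,
			       struct cpuidle_device *dev)
	{
		s64 duration = sched_idle_next_wakeup();
		int latency = pm_qos_request(PM_QOS_CPU_DMA_LATENCY);
		int i, index = 0;

		for (i = 1; i < drv->state_count; i++) {
			struct cpuidle_state *s = &drv->states[i];

			if (s->disabled || dev->states_usage[i].disable)
				continue;
			if (s->target_residency > duration ||
			    s->exit_latency > latency)
				continue;
			index = i;
		}
		return index;
	}

	static struct cpuidle_governor irqt_governor = {
		.name   = "irqt",
		.rating = 20,
		.select = irqt_select,
	};

	/* registered at init time with cpuidle_register_governor() */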

>
>> If so, why do you want to replace it?
>>

So I'm still unsure why you want to replace cpuidle_idle_call() with
sched_idle().  Is there anything wrong with it that it needs to be
replaced?

Thanks,
Rafael

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH V3 2/2] sched: idle: IRQ based next prediction for idle period
  2016-02-18 19:57           ` Rafael J. Wysocki
@ 2016-02-19 15:01             ` Daniel Lezcano
  2016-02-19 23:43               ` Rafael J. Wysocki
  0 siblings, 1 reply; 18+ messages in thread
From: Daniel Lezcano @ 2016-02-19 15:01 UTC (permalink / raw)
  To: Rafael J. Wysocki
  Cc: Nicolas Pitre, Thomas Gleixner, Peter Zijlstra, linux-pm,
	Linux Kernel Mailing List, Vincent Guittot

On 02/18/2016 07:57 PM, Rafael J. Wysocki wrote:
> On Thu, Feb 18, 2016 at 11:25 AM, Daniel Lezcano
> <daniel.lezcano@linaro.org> wrote:
>> On 02/17/2016 11:21 PM, Rafael J. Wysocki wrote:
>>
>> [ ... ]
>>
>>>>> Reviewed-by: Nicolas Pitre <nico@linaro.org>
>>>>
>>>>
>>>> Well, I'm likely overlooking something, but how is this going to be
>>>> hooked up to the code in idle.c?
>>>
>>>
>>> My somewhat educated guess is that sched_idle() in your patch is
>>> intended to replace cpuidle_idle_call(), right?
>>
>>
>> Well, no. I was planning to first have it to use a different code path as
>> experimental code in order to focus improving the accuracy of the prediction
>> and then merge or replace cpuidle_idle_call() with sched_idle().
>
> In that case, what about making it a proper cpuidle governor that
> people can test and play with in a usual way?  Then it may potentially
> benefit everybody and not just your experimental setup and you may get
> coverage on systems you have no access to normally.
>
> There is some boilerplate code to add for this purpose, but that's not
> that bad IMO.

Hi Rafael,

sorry for the delay in the responses.

Actually, adding a new governor is precisely what I would like to avoid
because the objective is for the scheduler to act as the governor. Here,
it is the 'predictor' plus the API to enter an idle state that conforms
to the idle duration and the latency constraint.

Concerning the testing, it is quite easy to switch from idle_sched to
'menu' via sched_debug or whatever option we want to add.

>
> So I'm still unsure why you want to replace cpuidle_idle_call() with
> sched_idle().  Is there anything wrong with it that it needs to be
> replaced?

I don't want to replace cpuidle_idle_call() with sched_idle(). How we 
integrate the API is something I would like to discuss with another 
patchset focused on this integration only.

For reference: https://lkml.org/lkml/2016/1/12/530

-- Daniel





-- 
  <http://www.linaro.org/> Linaro.org │ Open source software for ARM SoCs

Follow Linaro:  <http://www.facebook.com/pages/Linaro> Facebook |
<http://twitter.com/#!/linaroorg> Twitter |
<http://www.linaro.org/linaro-blog/> Blog

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH V3 2/2] sched: idle: IRQ based next prediction for idle period
  2016-02-19 15:01             ` Daniel Lezcano
@ 2016-02-19 23:43               ` Rafael J. Wysocki
  2016-02-23  9:49                 ` Daniel Lezcano
  0 siblings, 1 reply; 18+ messages in thread
From: Rafael J. Wysocki @ 2016-02-19 23:43 UTC (permalink / raw)
  To: Daniel Lezcano
  Cc: Rafael J. Wysocki, Nicolas Pitre, Thomas Gleixner,
	Peter Zijlstra, linux-pm, Linux Kernel Mailing List,
	Vincent Guittot

On Fri, Feb 19, 2016 at 4:01 PM, Daniel Lezcano
<daniel.lezcano@linaro.org> wrote:
> On 02/18/2016 07:57 PM, Rafael J. Wysocki wrote:
>>
>> On Thu, Feb 18, 2016 at 11:25 AM, Daniel Lezcano
>> <daniel.lezcano@linaro.org> wrote:
>>>
>>> On 02/17/2016 11:21 PM, Rafael J. Wysocki wrote:
>>>
>>> [ ... ]
>>>
>>>>>> Reviewed-by: Nicolas Pitre <nico@linaro.org>
>>>>>
>>>>>
>>>>>
>>>>> Well, I'm likely overlooking something, but how is this going to be
>>>>> hooked up to the code in idle.c?
>>>>
>>>>
>>>>
>>>> My somewhat educated guess is that sched_idle() in your patch is
>>>> intended to replace cpuidle_idle_call(), right?
>>>
>>>
>>>
>>> Well, no. I was planning to first have it to use a different code path as
>>> experimental code in order to focus improving the accuracy of the
>>> prediction
>>> and then merge or replace cpuidle_idle_call() with sched_idle().
>>
>>
>> In that case, what about making it a proper cpuidle governor that
>> people can test and play with in a usual way?  Then it may potentially
>> benefit everybody and not just your experimental setup and you may get
>> coverage on systems you have no access to normally.
>>
>> There is some boilerplate code to add for this purpose, but that's not
>> that bad IMO.
>
>
> Hi Rafael,
>
> sorry for the delay in the responses.
>
> Actually, adding a new governor is precisely what I would like to avoid
> because the objective is the scheduler acts as the governor.

But why, really?

Well, first of all I'm not sure what "the scheduler acts as the
governor" means.  For the lack of a better explanation I'll refer to
the message at https://lkml.org/lkml/2016/1/12/530 that you pointed me
at.

Your code in there does something like:

if (sched_idle_enabled()) {
        int latency = pm_qos_request(PM_QOS_CPU_DMA_LATENCY);
        s64 duration = sched_idle_next_wakeup();
        sched_idle(duration, latency);
} else {
        cpuidle_idle_call();
}

which is quite questionable to be honest as it adds an extra branch to
the idle loop for no real benefit.

Now, what really is the difference between "governor" and "predictor"?
 I don't quite see it except that the former is expected to provide a
specific interface.

The way the idle loop works now (and I'm not sure if you can really
change it) is that when you get into it, you're idle no matter what
and you simply need to choose an idle state for the CPU to go into.
Some code needs to select that state, regardless of what name you want
to give to that code.

In the current setup, which I really don't think is unreasonable, this
is done by cpuidle_select() that simply invokes the governor's
->select() callback and that's it.  That callback may very well be
part of the scheduler and registered from there if you want that, but
why do you want to change the whole mechanism?  What's wrong with it
now?

Further, if you look at your sched_idle(), it looks almost like
cpuidle_idle_call() with a few really minor differences (apart from
the fact that it doesn't cover suspend-to-idle which it will have to
do eventually) that really look arbitrary and the "selection" if () in
it simply plays the role of the invocation of ->select().  So how is
it different really?

> Here, it is the 'predictor' and the API to enter an idle state conforming the idle duration
> and the latency constraint.

Isn't that just a simple rearrangement of the code?  The latency still
comes from PM QoS and the duration is computed by your new code
instead of that being done by ->select() itself, but why can't
->select() call sched_idle_next_wakeup() to get the duration
value it needs?  Why do those values need to be passed to a
cpuidle_idle_call() replacement as arguments?  Is there any particular
technical reason for doing that?

And why that name, sched_idle_next_wakeup()?  Does that function
really have anything to do with the scheduler now?

> Concerning the testing, it is quite easy to switch from idle_sched to 'menu'
> via on sched_debug or whatever option we want to add.
>
>>
>> So I'm still unsure why you want to replace cpuidle_idle_call() with
>> sched_idle().  Is there anything wrong with it that it needs to be
>> replaced?
>
>
> I don't want to replace cpuidle_idle_call() with sched_idle(). How we
> integrate the API is something I would like to discuss with another patchset
> focused in this integration only.
>
> For reference: https://lkml.org/lkml/2016/1/12/530

Please answer my questions above.  If you need to post a patchset for
this purpose, please do that.

I have to say that I was looking forward to the IRQ timings based
duration prediction, but the way you want to use it now is seriously
disappointing.

Thanks,
Rafael

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH V3 1/2] irq: Track the interrupt timings
  2016-02-16 15:43 [PATCH V3 1/2] irq: Track the interrupt timings Daniel Lezcano
  2016-02-16 15:43 ` [PATCH V3 2/2] sched: idle: IRQ based next prediction for idle period Daniel Lezcano
  2016-02-16 16:45 ` [PATCH V3 1/2] irq: Track the interrupt timings Nicolas Pitre
@ 2016-02-22 14:48 ` Shreyas B Prabhu
  2016-02-22 17:24   ` Daniel Lezcano
  2 siblings, 1 reply; 18+ messages in thread
From: Shreyas B Prabhu @ 2016-02-22 14:48 UTC (permalink / raw)
  To: Daniel Lezcano, tglx
  Cc: peterz, rafael, linux-pm, linux-kernel, nicolas.pitre, vincent.guittot

Hi Daniel,

On 02/16/2016 09:13 PM, Daniel Lezcano wrote:
> diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
> index 0ccd028..577686b 100644
> --- a/kernel/irq/irqdesc.c
> +++ b/kernel/irq/irqdesc.c
> @@ -174,6 +174,9 @@ static struct irq_desc *alloc_desc(int irq, int node, struct module *owner)
>  	if (alloc_masks(desc, gfp, node))
>  		goto err_kstat;
> 
> +	if (alloc_timings(desc))
> +		goto err_mask;
> +
>  	raw_spin_lock_init(&desc->lock);
>  	lockdep_set_class(&desc->lock, &irq_desc_lock_class);
>  	init_rcu_head(&desc->rcu);
> @@ -182,6 +185,8 @@ static struct irq_desc *alloc_desc(int irq, int node, struct module *owner)
> 
>  	return desc;
> 
> +err_mask:
> +	free_masks(desc);
>  err_kstat:
>  	free_percpu(desc->kstat_irqs);
>  err_desc:
> @@ -220,6 +225,11 @@ static void free_desc(unsigned int irq)
>  	 * the child interrupts.
>  	 */
>  	call_rcu(&desc->rcu, delayed_free_desc);
> +
> +	free_timings(desc);
> +	free_masks(desc);
> +	free_percpu(desc->kstat_irqs);
> +	kfree(desc);
>  }

I think this is a rebase error. If I am not wrong, instead of this you
need to add free_timings(desc) to delayed_free_desc.
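
For reference, something along these lines (sketch only, assuming the
existing delayed_free_desc() in kernel/irq/irqdesc.c):

static void delayed_free_desc(struct rcu_head *rhp)
{
        struct irq_desc *desc = container_of(rhp, struct irq_desc, rcu);

        free_timings(desc);     /* moved here from free_desc() */
        free_masks(desc);
        free_percpu(desc->kstat_irqs);
        kfree(desc);
}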

Thanks,
Shreyas

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH V3 2/2] sched: idle: IRQ based next prediction for idle period
  2016-02-16 15:43 ` [PATCH V3 2/2] sched: idle: IRQ based next prediction for idle period Daniel Lezcano
  2016-02-16 16:44   ` Nicolas Pitre
@ 2016-02-22 15:02   ` Shreyas B Prabhu
  2016-02-22 17:29     ` Daniel Lezcano
  2016-02-23 10:06   ` Shreyas B Prabhu
  2 siblings, 1 reply; 18+ messages in thread
From: Shreyas B Prabhu @ 2016-02-22 15:02 UTC (permalink / raw)
  To: Daniel Lezcano
  Cc: tglx, peterz, rafael, linux-pm, linux-kernel, nicolas.pitre,
	vincent.guittot

Hi Daniel,

On 02/16/2016 09:13 PM, Daniel Lezcano wrote:
> +
> +/**
> + * sched_idle_next_wakeup - Predict the next wakeup on the current cpu
> + *
> + * The next event on the cpu is based on a statistic approach of the
> + * interrupt events and the timer deterministic value. From the timer
> + * or the irqs, we return the one expected to occur first.
> + *
> + * Returns the expected remaining idle time before being woken up by
> + * an interruption.
> + */
> +s64 sched_idle_next_wakeup(void)
> +{
> +	s64 next_timer = ktime_to_us(tick_nohz_get_sleep_length());
> +	s64 next_irq = next_irq_event();

Since next_irq_event() uses RCU and we are in idle this should probably
be wrapped in RCU_NONIDLE().

> +
> +	return min(next_irq, next_timer);
> +}
> +
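
Something like this, perhaps (sketch only, using the RCU_NONIDLE()
statement wrapper from include/linux/rcupdate.h):

        s64 next_timer = ktime_to_us(tick_nohz_get_sleep_length());
        s64 next_irq;

        /* Keep RCU watching while the prediction code uses it. */
        RCU_NONIDLE(next_irq = next_irq_event());

        return min(next_irq, next_timer);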

Thanks,
Shreyas

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH V3 1/2] irq: Track the interrupt timings
  2016-02-22 14:48 ` Shreyas B Prabhu
@ 2016-02-22 17:24   ` Daniel Lezcano
  0 siblings, 0 replies; 18+ messages in thread
From: Daniel Lezcano @ 2016-02-22 17:24 UTC (permalink / raw)
  To: Shreyas B Prabhu, tglx
  Cc: peterz, rafael, linux-pm, linux-kernel, nicolas.pitre, vincent.guittot

On 02/22/2016 03:48 PM, Shreyas B Prabhu wrote:

[ ... ]

>> +
>> +	free_timings(desc);
>> +	free_masks(desc);
>> +	free_percpu(desc->kstat_irqs);
>> +	kfree(desc);
>>   }
>
> I think this is a rebase error. If I am not wrong, instead of this you
> need to add free_timings(desc) to delayed_free_desc.

Ah, yes. Good catch.

Thanks for spotting it.

   -- Daniel

-- 
  <http://www.linaro.org/> Linaro.org │ Open source software for ARM SoCs

Follow Linaro:  <http://www.facebook.com/pages/Linaro> Facebook |
<http://twitter.com/#!/linaroorg> Twitter |
<http://www.linaro.org/linaro-blog/> Blog

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH V3 2/2] sched: idle: IRQ based next prediction for idle period
  2016-02-22 15:02   ` Shreyas B Prabhu
@ 2016-02-22 17:29     ` Daniel Lezcano
  2016-02-23 10:08       ` Shreyas B Prabhu
  0 siblings, 1 reply; 18+ messages in thread
From: Daniel Lezcano @ 2016-02-22 17:29 UTC (permalink / raw)
  To: Shreyas B Prabhu
  Cc: tglx, peterz, rafael, linux-pm, linux-kernel, nicolas.pitre,
	vincent.guittot

On 02/22/2016 04:02 PM, Shreyas B Prabhu wrote:
> Hi Daniel,
>
> On 02/16/2016 09:13 PM, Daniel Lezcano wrote:
>> +
>> +/**
>> + * sched_idle_next_wakeup - Predict the next wakeup on the current cpu
>> + *
>> + * The next event on the cpu is based on a statistic approach of the
>> + * interrupt events and the timer deterministic value. From the timer
>> + * or the irqs, we return the one expected to occur first.
>> + *
>> + * Returns the expected remaining idle time before being woken up by
>> + * an interruption.
>> + */
>> +s64 sched_idle_next_wakeup(void)
>> +{
>> +	s64 next_timer = ktime_to_us(tick_nohz_get_sleep_length());
>> +	s64 next_irq = next_irq_event();
>
> Since next_irq_event() uses RCU and we are in idle this should probably
> be wrapped in RCU_NONIDLE().

That is a good point but the function is not supposed to be called in 
the rcu_idle_enter/rcu_idle_exit section which is inside 
cpuidle_idle_call().


-- 
  <http://www.linaro.org/> Linaro.org │ Open source software for ARM SoCs

Follow Linaro:  <http://www.facebook.com/pages/Linaro> Facebook |
<http://twitter.com/#!/linaroorg> Twitter |
<http://www.linaro.org/linaro-blog/> Blog

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH V3 2/2] sched: idle: IRQ based next prediction for idle period
  2016-02-19 23:43               ` Rafael J. Wysocki
@ 2016-02-23  9:49                 ` Daniel Lezcano
  0 siblings, 0 replies; 18+ messages in thread
From: Daniel Lezcano @ 2016-02-23  9:49 UTC (permalink / raw)
  To: Rafael J. Wysocki
  Cc: Nicolas Pitre, Thomas Gleixner, Peter Zijlstra, linux-pm,
	Linux Kernel Mailing List, Vincent Guittot, Ingo Molnar


Hi Rafael,

Added Ingo.

On 02/20/2016 12:43 AM, Rafael J. Wysocki wrote:
> On Fri, Feb 19, 2016 at 4:01 PM, Daniel Lezcano
> <daniel.lezcano@linaro.org> wrote:
>> On 02/18/2016 07:57 PM, Rafael J. Wysocki wrote:
>>>
>>> On Thu, Feb 18, 2016 at 11:25 AM, Daniel Lezcano
>>> <daniel.lezcano@linaro.org> wrote:
>>>>
>>>> On 02/17/2016 11:21 PM, Rafael J. Wysocki wrote:
>>>>
>>>> [ ... ]
>>>>
>>>>>>> Reviewed-by: Nicolas Pitre <nico@linaro.org>
>>>>>>
>>>>>>
>>>>>>
>>>>>> Well, I'm likely overlooking something, but how is this going to be
>>>>>> hooked up to the code in idle.c?
>>>>>
>>>>>
>>>>>
>>>>> My somewhat educated guess is that sched_idle() in your patch is
>>>>> intended to replace cpuidle_idle_call(), right?
>>>>
>>>>
>>>>
>>>> Well, no. I was planning to first have it to use a different code path as
>>>> experimental code in order to focus improving the accuracy of the
>>>> prediction
>>>> and then merge or replace cpuidle_idle_call() with sched_idle().
>>>
>>>
>>> In that case, what about making it a proper cpuidle governor that
>>> people can test and play with in a usual way?  Then it may potentially
>>> benefit everybody and not just your experimental setup and you may get
>>> coverage on systems you have no access to normally.
>>>
>>> There is some boilerplate code to add for this purpose, but that's not
>>> that bad IMO.
>>
>>
>> Hi Rafael,
>>
>> sorry for the delay in the responses.
>>
>> Actually, adding a new governor is precisely what I would like to avoid
>> because the objective is the scheduler acts as the governor.
>
> But why, really?
>
> Well, first of all I'm not sure what "the scheduler acts as the
> governor" means.  For the lack of a better explanation I'll refer to
> the message at https://lkml.org/lkml/2016/1/12/530 that you pointed me
> at.

For a better explanation, let's refer to Ingo's email:

http://lwn.net/Articles/552889/

Ingo said:

"Note that I still disagree with the whole design notion of having an "idle
back-end" (and a 'cpufreq back end') separate from scheduler power saving
policy, and none of the patch-sets offered so far solve this fundamental
design problem. [...]"

"[...] so the scheduler is in an _ideal_ position to do a judgement call 
about the near future and estimate how deep an idle state a CPU core 
should enter into and what frequency it should run at."

The patches I sent are supposed to provide the basic building blocks to 
solve 'this fundamental design problem' by giving a couple of functions:

  - when is the next event?
  - sleep X us with Y latency.

The component responsible for choosing X and Y will be the scheduler, 
based on its knowledge of 'the near future' coming from 'when is the 
next event?'.

Moving the code I submitted into a governor makes little sense given the 
goal described above, because sched_next_event() will be called from the 
scheduler code directly. Today the integration is not obvious.

There is a branch in the cpuidle loop below, but this is just for testing 
purposes at the moment and is not part of the submitted patches.


> Your code in there does something like:
>
> if (sched_idle_enabled()) {
>          int latency = pm_qos_request(PM_QOS_CPU_DMA_LATENCY);
>          s64 duration = sched_idle_next_wakeup();
>          sched_idle(duration, latency);
> } else {
>          cpuidle_idle_call();
> }
>
> which is quite questionable to be honest as it adds an extra branch to
> the idle loop for no real benefit.

That is not part of the submitted patchset. It is an example, based on 
embryonic code, showing how to use the API.

How those APIs are integrated with the scheduler will be submitted in a 
separate patchset.

> Now, what really is the difference between "governor" and "predictor"?
>   I don't quite see it except that the former is expected to provide a
> specific interface.

The predictor is a standalone component answering the question 'when is 
the next event?'.

The governor uses the next event information from the prediction to make 
a decision (governor == scheduler here).

If the code I am proposing is buried in the cpuidle framework inside a 
governor, I don't see how the scheduler can become the ideal place to 
make that decision.

> The way the idle loop works now (and I'm not sure if you can really
> change it) is that when you get into it, you're idle no matter what
> and you simply need to choose an idle state for the CPU to go into.
> Some code needs to select that state, regardless of what name you want
> to give to that code.
>
> In the current setup, which I really don't think is unreasonable, this
> is done by cpuidle_select() that simply invokes the governor's
> ->select() callback and that's it.  That callback may very well be
> part of the scheduler and registered from there if you want that, but
> why do you want to change the whole mechanism?  What's wrong with it
> now?

There is nothing wrong with it. We have lived with it for years and it 
has satisfied everyone.

My point is that I am sticking to how Ingo and Peter envision the 
integration working. It is clearly stated in Ingo's mail: he disagrees 
with having the notion of an 'idle back-end'.

The couple of patches here are meant to be used by the scheduler directly.

> Further, if you look at your sched_idle(), it looks almost like
> cpuidle_idle_call() with a few really minor differences (apart from
> the fact that it doesn't cover suspend-to-idle which it will have to
> do eventually) that really look arbitrary and the "selection" if () in
> it simply plays the role of the invocation of ->select().  So how is
> it different really?

The question is not whether there is duplicate code. Factoring out code 
is easy, but factoring out code against a moving target is much more 
difficult.

The moving target is the cpuidle code itself. I have consolidated the 
code several times and the effort was ruined each time by subsequent 
changes.

It is much easier to introduce the experimental code, let it evolve 
wisely, and then consolidate it with the existing code.

And it makes even more sense to branch it when the feature is by design 
the exact opposite of the current approach (idle-based statistics vs. 
irq events), even if there is common code.

>> Here, it is the 'predictor' and the API to enter an idle state conforming the idle duration
>> and the latency constraint.
>
> Isn't that just a simple rearrangement of the code?  The latency still
> comes from PM QoS and the duration is computed by your new code
> instead of that being done by ->select() itself, but why actually
> ->select() cannot call sched_idle_next_wakeup() to get the duration
> value it needs?  Why do those values need to be passed to a
> cpuidle_idle_call() replacement as arguments?  Is there any particular
> technical reason for doing that?

Yes. ->select() returns an idle state index. Even if, at some point, we 
may need to know which idle state fits a set of timing constraints, the 
scheduler is interested in how long the CPU will be idle in order to 
make its decision.

One example of how things would work once everything is integrated:

The topology tells us that CPU2 and CPU3 belong to cluster1.

1. CPU3 enters idle; its predicted next wakeup is stored for this cpu
2. CPU2 enters idle and looks up when it should wake up
3. CPU2 looks up when CPU3 will wake up
4. CPU2 takes the intersection of the CPU2/CPU3 idle intervals in order 
to know how long the whole cluster will be idle
5. CPU2 decides whether a cluster power down is possible (see the sketch 
below)

With the current design it is not possible to do that.
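
A purely illustrative sketch of the step 4/5 decision (the helper and 
its parameters are made up and are not part of this patchset):

static bool cluster_power_down_ok(u64 now_us, u64 cpu2_wakeup_us,
                                  u64 cpu3_wakeup_us,
                                  u64 cluster_min_residency_us)
{
        /* The cluster is only idle until the first CPU wakes up. */
        u64 cluster_wakeup_us = min(cpu2_wakeup_us, cpu3_wakeup_us);

        if (cluster_wakeup_us <= now_us)
                return false;

        /* Power down only if the shared idle window is long enough. */
        return cluster_wakeup_us - now_us >= cluster_min_residency_us;
}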

> And why that name, sched_idle_next_wakeup()?  Does that function
> really have anything to do with the scheduler now?
>
>> Concerning the testing, it is quite easy to switch from idle_sched to 'menu'
>> via on sched_debug or whatever option we want to add.
>>
>>>
>>> So I'm still unsure why you want to replace cpuidle_idle_call() with
>>> sched_idle().  Is there anything wrong with it that it needs to be
>>> replaced?
>>
>>
>> I don't want to replace cpuidle_idle_call() with sched_idle(). How we
>> integrate the API is something I would like to discuss with another patchset
>> focused in this integration only.
>>
>> For reference: https://lkml.org/lkml/2016/1/12/530
>
> Please answer my questions above.  If you need to post a patchset for
> this purpose, please do that.

...

> I have to say that I was looking forward to the IRQ timings based
> duration prediction, but the way you want to use it now is seriously
> disappointing.

This comment is a bit patronizing, but I can understand that you were 
expecting more.

I will let Peter and Ingo give their opinions and I will follow the 
consensus.

Thanks for the review.

   -- Daniel

-- 
  <http://www.linaro.org/> Linaro.org │ Open source software for ARM SoCs

Follow Linaro:  <http://www.facebook.com/pages/Linaro> Facebook |
<http://twitter.com/#!/linaroorg> Twitter |
<http://www.linaro.org/linaro-blog/> Blog

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH V3 2/2] sched: idle: IRQ based next prediction for idle period
  2016-02-16 15:43 ` [PATCH V3 2/2] sched: idle: IRQ based next prediction for idle period Daniel Lezcano
  2016-02-16 16:44   ` Nicolas Pitre
  2016-02-22 15:02   ` Shreyas B Prabhu
@ 2016-02-23 10:06   ` Shreyas B Prabhu
  2016-02-23 10:13     ` Daniel Lezcano
  2 siblings, 1 reply; 18+ messages in thread
From: Shreyas B Prabhu @ 2016-02-23 10:06 UTC (permalink / raw)
  To: Daniel Lezcano
  Cc: tglx, peterz, rafael, linux-pm, linux-kernel, nicolas.pitre,
	vincent.guittot



On 02/16/2016 09:13 PM, Daniel Lezcano wrote:
[...]
> +	if (index < 0) {
> +		/*
> +		 * No idle callbacks fulfilled the constraints, jump
> +		 * to the default function like there wasn't any
> +		 * cpuidle driver.
> +		 */
> +		goto default_idle;
> +	} else {
> +		/*
> +		 * Enter the idle state previously returned by the
> +		 * governor decision.  This function will block until
> +		 * an interrupt occurs and will take care of
> +		 * re-enabling the local interrupts
> +		 */
> +		return cpuidle_enter(drv, dev, index);

Minor point. You are not calling rcu_idle_exit() in else block. This
should probably be
		ret = cpuidle_enter(drv, dev, index);
                goto out;

> +	}
> +
> +default_idle:
> +	default_idle_call();
> +out:
> +	rcu_idle_exit();
> +	return ret;
> +}
> 
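
i.e. something like this for the tail of the function (sketch only, 
context taken from the quoted patch):

        if (index < 0) {
                /* No state satisfied the constraints, use the fallback. */
                goto default_idle;
        }

        /*
         * Enter the selected idle state; cpuidle_enter() blocks until
         * an interrupt occurs and re-enables local interrupts.
         */
        ret = cpuidle_enter(drv, dev, index);
        goto out;

default_idle:
        default_idle_call();
out:
        rcu_idle_exit();
        return ret;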

Thanks,
Shreyas

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH V3 2/2] sched: idle: IRQ based next prediction for idle period
  2016-02-22 17:29     ` Daniel Lezcano
@ 2016-02-23 10:08       ` Shreyas B Prabhu
  0 siblings, 0 replies; 18+ messages in thread
From: Shreyas B Prabhu @ 2016-02-23 10:08 UTC (permalink / raw)
  To: Daniel Lezcano
  Cc: tglx, peterz, rafael, linux-pm, linux-kernel, nicolas.pitre,
	vincent.guittot



On 02/22/2016 10:59 PM, Daniel Lezcano wrote:
> On 02/22/2016 04:02 PM, Shreyas B Prabhu wrote:
>> Hi Daniel,
>>
>> On 02/16/2016 09:13 PM, Daniel Lezcano wrote:
>>> +
>>> +/**
>>> + * sched_idle_next_wakeup - Predict the next wakeup on the current cpu
>>> + *
>>> + * The next event on the cpu is based on a statistic approach of the
>>> + * interrupt events and the timer deterministic value. From the timer
>>> + * or the irqs, we return the one expected to occur first.
>>> + *
>>> + * Returns the expected remaining idle time before being woken up by
>>> + * an interruption.
>>> + */
>>> +s64 sched_idle_next_wakeup(void)
>>> +{
>>> +    s64 next_timer = ktime_to_us(tick_nohz_get_sleep_length());
>>> +    s64 next_irq = next_irq_event();
>>
>> Since next_irq_event() uses RCU and we are in idle this should probably
>> be wrapped in RCU_NONIDLE().
> 
> That is a good point but the function is not supposed to be called in
> the rcu_idle_enter/rcu_idle_exit section which is inside
> cpuidle_idle_call().
> 
> 
Right. My bad.

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH V3 2/2] sched: idle: IRQ based next prediction for idle period
  2016-02-23 10:06   ` Shreyas B Prabhu
@ 2016-02-23 10:13     ` Daniel Lezcano
  0 siblings, 0 replies; 18+ messages in thread
From: Daniel Lezcano @ 2016-02-23 10:13 UTC (permalink / raw)
  To: Shreyas B Prabhu
  Cc: tglx, peterz, rafael, linux-pm, linux-kernel, nicolas.pitre,
	vincent.guittot

On 02/23/2016 11:06 AM, Shreyas B Prabhu wrote:
>
>
> On 02/16/2016 09:13 PM, Daniel Lezcano wrote:
> [...]
>> +	if (index < 0) {
>> +		/*
>> +		 * No idle callbacks fulfilled the constraints, jump
>> +		 * to the default function like there wasn't any
>> +		 * cpuidle driver.
>> +		 */
>> +		goto default_idle;
>> +	} else {
>> +		/*
>> +		 * Enter the idle state previously returned by the
>> +		 * governor decision.  This function will block until
>> +		 * an interrupt occurs and will take care of
>> +		 * re-enabling the local interrupts
>> +		 */
>> +		return cpuidle_enter(drv, dev, index);
>
> Minor point. You are not calling rcu_idle_exit() in else block. This
> should probably be
> 		ret = cpuidle_enter(drv, dev, index);
>                  goto out;

Yes. Right.

Thanks for the review.

   -- Daniel


-- 
  <http://www.linaro.org/> Linaro.org │ Open source software for ARM SoCs

Follow Linaro:  <http://www.facebook.com/pages/Linaro> Facebook |
<http://twitter.com/#!/linaroorg> Twitter |
<http://www.linaro.org/linaro-blog/> Blog

^ permalink raw reply	[flat|nested] 18+ messages in thread

end of thread, other threads:[~2016-02-23 10:13 UTC | newest]

Thread overview: 18+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2016-02-16 15:43 [PATCH V3 1/2] irq: Track the interrupt timings Daniel Lezcano
2016-02-16 15:43 ` [PATCH V3 2/2] sched: idle: IRQ based next prediction for idle period Daniel Lezcano
2016-02-16 16:44   ` Nicolas Pitre
2016-02-17 23:09     ` Rafael J. Wysocki
2016-02-17 23:21       ` Rafael J. Wysocki
2016-02-18 10:25         ` Daniel Lezcano
2016-02-18 19:57           ` Rafael J. Wysocki
2016-02-19 15:01             ` Daniel Lezcano
2016-02-19 23:43               ` Rafael J. Wysocki
2016-02-23  9:49                 ` Daniel Lezcano
2016-02-22 15:02   ` Shreyas B Prabhu
2016-02-22 17:29     ` Daniel Lezcano
2016-02-23 10:08       ` Shreyas B Prabhu
2016-02-23 10:06   ` Shreyas B Prabhu
2016-02-23 10:13     ` Daniel Lezcano
2016-02-16 16:45 ` [PATCH V3 1/2] irq: Track the interrupt timings Nicolas Pitre
2016-02-22 14:48 ` Shreyas B Prabhu
2016-02-22 17:24   ` Daniel Lezcano

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).