linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [RFC] cpuidle : Add debugfs support for cpuidle core
@ 2019-12-17 14:38 Abhishek Goel
  2019-12-17 16:51 ` Rafael J. Wysocki
  0 siblings, 1 reply; 5+ messages in thread
From: Abhishek Goel @ 2019-12-17 14:38 UTC (permalink / raw)
  To: linux-pm, linux-kernel
  Cc: rjw, daniel.lezcano, ego, oohall, mpe, svaidy, Abhishek Goel

Up until now, we did not have a way to tune cpuidle attribute like
residency in kernel. This patch adds support for debugfs in cpuidle core.
Thereby providing support for tuning cpuidle attributes like residency in
kernel at runtime.
For example: Tuning residency at runtime can be used to quantify governors
decision making as governor uses residency as one of the parameter to
take decision about the state that needs to be entered while idling.

Currently, only residency has been added to the debugfs framework. We can
later on add attributes like latency too.

Signed-off-by: Abhishek Goel <huntbag@linux.vnet.ibm.com>
---
 drivers/cpuidle/Kconfig           |   8 ++
 drivers/cpuidle/Makefile          |   1 +
 drivers/cpuidle/cpuidle-debugfs.c | 188 ++++++++++++++++++++++++++++++
 drivers/cpuidle/cpuidle.c         |  19 ++-
 drivers/cpuidle/cpuidle.h         |  24 ++++
 include/linux/cpuidle.h           |   3 +
 6 files changed, 241 insertions(+), 2 deletions(-)
 create mode 100644 drivers/cpuidle/cpuidle-debugfs.c

diff --git a/drivers/cpuidle/Kconfig b/drivers/cpuidle/Kconfig
index 88727b7c0d59..630a5a90744b 100644
--- a/drivers/cpuidle/Kconfig
+++ b/drivers/cpuidle/Kconfig
@@ -71,6 +71,14 @@ config HALTPOLL_CPUIDLE
          before halting in the guest (more efficient than polling in the
          host via halt_poll_ns for some scenarios).
 
+config CPUIDLE_DEBUG_FS
+	bool "cpuidle state debugging information in debugfs"
+	default y
+	depends on DEBUG_FS
+	help
+	  Include cpuidle state debugging information in debugfs. This is
+	  mostly useful for kernel developers, but it does not incur any
+	  cost at runtime.
+
 endif
 
 config ARCH_NEEDS_CPU_IDLE_COUPLED
diff --git a/drivers/cpuidle/Makefile b/drivers/cpuidle/Makefile
index ee70d5cc5b99..8b5a3f4fe7ec 100644
--- a/drivers/cpuidle/Makefile
+++ b/drivers/cpuidle/Makefile
@@ -8,6 +8,7 @@ obj-$(CONFIG_ARCH_NEEDS_CPU_IDLE_COUPLED) += coupled.o
 obj-$(CONFIG_DT_IDLE_STATES)		  += dt_idle_states.o
 obj-$(CONFIG_ARCH_HAS_CPU_RELAX)	  += poll_state.o
 obj-$(CONFIG_HALTPOLL_CPUIDLE)		  += cpuidle-haltpoll.o
+obj-$(CONFIG_CPUIDLE_DEBUG_FS)		  += cpuidle-debugfs.o
 
 ##################################################################################
 # ARM SoC drivers
diff --git a/drivers/cpuidle/cpuidle-debugfs.c b/drivers/cpuidle/cpuidle-debugfs.c
new file mode 100644
index 000000000000..4ebb9d6debae
--- /dev/null
+++ b/drivers/cpuidle/cpuidle-debugfs.c
@@ -0,0 +1,188 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * cpuidle-debugfs : Debugfs for cpuidle driver
+ */
+#include <linux/kernel.h>
+#include <linux/cpu.h>
+#include <linux/cpuidle.h>
+#include <linux/debugfs.h>
+#include <linux/uaccess.h>
+
+#include "cpuidle.h"
+
+#define BUFF_SIZE 12
+
+/*
+ * residency_show - debugfs read handler for a state's target_residency
+ * @data: the cpuidle_state owning this attribute (parent dir i_private)
+ * @m:    seq_file to print into
+ *
+ * Prints the state's target residency in microseconds.
+ */
+static int residency_show(void *data, struct seq_file *m)
+{
+	struct cpuidle_state *state = data;
+
+	/* ktime_to_us() returns s64, so the format must be %lld, not %llu */
+	seq_printf(m, "%lld\n", ktime_to_us(state->target_residency_ns));
+	return 0;
+}
+
+/*
+ * residency_write - debugfs write handler for a state's target_residency
+ * @data:  the cpuidle_state owning this attribute (parent dir i_private)
+ * @buf:   user buffer containing the new residency in microseconds
+ * @count: number of bytes in @buf
+ * @ppos:  file position (unused)
+ *
+ * Parses an unsigned decimal value and applies it if it keeps the
+ * residency values strictly increasing across the driver's states.
+ * Returns @count on success or a negative errno on failure.
+ */
+static ssize_t residency_write(void *data, const char __user *buf,
+			       size_t count, loff_t *ppos)
+{
+	struct cpuidle_driver *drv = cpuidle_get_driver();
+	struct cpuidle_state *state = data;
+	char opbuf[BUFF_SIZE] = { }, *op;
+	unsigned long long res;
+	int err, i, curr_index = -1;
+
+	if (count >= BUFF_SIZE) {
+		pr_err("%s: operation too long\n", __func__);
+		return -EINVAL;
+	}
+
+	if (copy_from_user(opbuf, buf, count))
+		return -EFAULT;
+
+	op = strstrip(opbuf);
+	err = kstrtoull(op, 0, &res);
+	if (err)
+		return err;
+
+	/* Locate this state's index within the driver's state table */
+	for (i = 0; i < drv->state_count; i++) {
+		if (&drv->states[i] == state) {
+			curr_index = i;
+			break;
+		}
+	}
+	if (curr_index < 0) {
+		pr_err("state not found\n");
+		/* a bad lookup is invalid input, not a bad user pointer */
+		return -EINVAL;
+	}
+
+	/*
+	 * Maintain linearity of residency values: the new value must lie
+	 * strictly between the residencies of the neighbouring states
+	 * (when such neighbours exist).
+	 */
+	mutex_lock(&cpuidle_lock);
+	if ((curr_index > 0 &&
+	     drv->states[curr_index - 1].target_residency >= res) ||
+	    (curr_index < drv->state_count - 1 &&
+	     drv->states[curr_index + 1].target_residency <= res)) {
+		mutex_unlock(&cpuidle_lock);
+		pr_err("Input value out of bound\n");
+		return -EINVAL;
+	}
+
+	/* Update both the residency and residency_ns */
+	state->target_residency = (unsigned int)res;
+	/* cast before multiplying to avoid 32-bit overflow */
+	state->target_residency_ns =
+		(u64)state->target_residency * NSEC_PER_USEC;
+	mutex_unlock(&cpuidle_lock);
+
+	return count;
+}
+
+/*
+ * Generic seq_file show callback. The attribute descriptor was stashed in
+ * m->private by single_open(); the per-state data pointer lives in the
+ * parent directory inode's i_private, set by debugfs_create_files().
+ */
+static int cpuidle_debugfs_show(struct seq_file *m, void *v)
+{
+	const struct cpuidle_debugfs_attr *attr = m->private;
+	void *data = d_inode(m->file->f_path.dentry->d_parent)->i_private;
+
+	return attr->show(data, m);
+}
+
+/*
+ * Generic write callback: dispatches to the attribute's write handler,
+ * recovering the per-state data from the parent directory inode's
+ * i_private. Returns -EPERM for read-only attributes (no write handler).
+ */
+static ssize_t cpuidle_debugfs_write(struct file *file, const char __user *buf,
+				     size_t count, loff_t *ppos)
+{
+	struct seq_file *m = file->private_data;
+	const struct cpuidle_debugfs_attr *attr = m->private;
+	void *data = d_inode(file->f_path.dentry->d_parent)->i_private;
+
+	if (!attr->write)
+		return -EPERM;
+
+	return attr->write(data, buf, count, ppos);
+}
+
+/*
+ * Open callback: the file inode's i_private is the attribute descriptor
+ * (passed as the data argument to debugfs_create_file()). It is handed to
+ * single_open() so the show/write callbacks can reach it via m->private.
+ */
+static int cpuidle_debugfs_open(struct inode *inode, struct file *file)
+{
+	const struct cpuidle_debugfs_attr *attr = inode->i_private;
+
+	if (WARN_ON_ONCE(!attr->show))
+		return -EPERM;
+
+	return single_open(file, cpuidle_debugfs_show, inode->i_private);
+}
+
+
+static const struct file_operations cpuidle_debugfs_fops = {
+	.open		= cpuidle_debugfs_open,
+	.read		= seq_read,
+	.write		= cpuidle_debugfs_write,
+	.llseek		= seq_lseek,
+	/* single_release() frees the seq_file allocated by single_open();
+	 * without it every open of these files leaks memory. */
+	.release	= single_release,
+};
+
+/* Per-state debugfs files; the table is terminated by an empty entry */
+static const struct cpuidle_debugfs_attr cpuidle_debugfs_attrs[] = {
+	{ "target_residency", 0600, residency_show, residency_write},
+	/* We can add other attributes like latency in future */
+	{ },
+};
+
+/*
+ * Create one debugfs file per attribute under @parent. The shared @data
+ * pointer (a cpuidle_state) is stored in the directory inode's i_private
+ * so each file's callbacks can recover it via d_parent; each file's own
+ * i_private points at its attribute descriptor.
+ */
+static void debugfs_create_files(struct dentry *parent, void *data,
+				 const struct cpuidle_debugfs_attr *attr)
+{
+	/* debugfs_create_dir() may return an ERR_PTR or NULL on failure */
+	if (IS_ERR_OR_NULL(parent))
+		return;
+
+	d_inode(parent)->i_private = data;
+
+	for (; attr->name; attr++)
+		debugfs_create_file(attr->name, attr->mode, parent,
+				   (void *)attr, &cpuidle_debugfs_fops);
+}
+
+/*
+ * cpuidle_state_debugfs_unregister : removes the debugfs directory (and
+ *				      all files within) of the given
+ *				      cpuidle state
+ * @index : index of cpuidle state
+ */
+void cpuidle_state_debugfs_unregister(int index)
+{
+	struct cpuidle_driver *drv;
+
+	drv = cpuidle_get_driver();
+	debugfs_remove_recursive(drv->states[index].debugfs_dir);
+}
+
+/*
+ * cpuidle_state_debugfs_register : creates a debugfs directory
+ *				    ("stateN" under the cpuidle root) for
+ *				    the given cpuidle state and populates
+ *				    it with the attribute files
+ * @index : index of cpuidle state
+ */
+void cpuidle_state_debugfs_register(int index)
+{
+	struct cpuidle_driver *drv;
+	struct cpuidle_state *state;
+	char state_name[BUFF_SIZE];
+
+	drv = cpuidle_get_driver();
+	state = &drv->states[index];
+
+	snprintf(state_name, BUFF_SIZE, "state%d", index);
+	state->debugfs_dir = debugfs_create_dir(state_name,
+						cpuidle_debugfs_root);
+
+	/* handles an ERR_PTR/NULL debugfs_dir internally */
+	debugfs_create_files(state->debugfs_dir, state, cpuidle_debugfs_attrs);
+}
+
diff --git a/drivers/cpuidle/cpuidle.c b/drivers/cpuidle/cpuidle.c
index 569dbac443bd..33d7fb25d879 100644
--- a/drivers/cpuidle/cpuidle.c
+++ b/drivers/cpuidle/cpuidle.c
@@ -23,6 +23,7 @@
 #include <linux/suspend.h>
 #include <linux/tick.h>
 #include <trace/events/power.h>
+#include <linux/debugfs.h>
 
 #include "cpuidle.h"
 
@@ -32,6 +33,10 @@ DEFINE_PER_CPU(struct cpuidle_device, cpuidle_dev);
 DEFINE_MUTEX(cpuidle_lock);
 LIST_HEAD(cpuidle_detected_devices);
 
+#ifdef CONFIG_DEBUG_FS
+struct dentry *cpuidle_debugfs_root;
+#endif
+
 static int enabled_devices;
 static int off __read_mostly;
 static int initialized __read_mostly;
@@ -666,9 +671,12 @@ EXPORT_SYMBOL_GPL(cpuidle_unregister_device);
  */
 void cpuidle_unregister(struct cpuidle_driver *drv)
 {
-	int cpu;
+	int cpu, i;
 	struct cpuidle_device *device;
 
+	for (i = 0; i < drv->state_count; i++)
+		cpuidle_state_debugfs_unregister(i);
+
 	for_each_cpu(cpu, drv->cpumask) {
 		device = &per_cpu(cpuidle_dev, cpu);
 		cpuidle_unregister_device(device);
@@ -692,7 +700,7 @@ EXPORT_SYMBOL_GPL(cpuidle_unregister);
 int cpuidle_register(struct cpuidle_driver *drv,
 		     const struct cpumask *const coupled_cpus)
 {
-	int ret, cpu;
+	int ret, cpu, i;
 	struct cpuidle_device *device;
 
 	ret = cpuidle_register_driver(drv);
@@ -724,6 +732,9 @@ int cpuidle_register(struct cpuidle_driver *drv,
 		break;
 	}
 
+	for (i = 0; i < drv->state_count; i++)
+		cpuidle_state_debugfs_register(i);
+
 	return ret;
 }
 EXPORT_SYMBOL_GPL(cpuidle_register);
@@ -774,6 +785,10 @@ static int __init cpuidle_init(void)
 
 	latency_notifier_init(&cpuidle_latency_notifier);
 
+#ifdef CONFIG_DEBUG_FS
+	cpuidle_debugfs_root = debugfs_create_dir("cpuidle", NULL);
+#endif
+
 	return 0;
 }
 
diff --git a/drivers/cpuidle/cpuidle.h b/drivers/cpuidle/cpuidle.h
index 9f336af17fa6..781a9d0f4b43 100644
--- a/drivers/cpuidle/cpuidle.h
+++ b/drivers/cpuidle/cpuidle.h
@@ -37,6 +37,30 @@ extern void cpuidle_remove_device_sysfs(struct cpuidle_device *device);
 extern int cpuidle_add_sysfs(struct cpuidle_device *dev);
 extern void cpuidle_remove_sysfs(struct cpuidle_device *dev);
 
+#ifdef CONFIG_CPUIDLE_DEBUG_FS
+
+#include <linux/seq_file.h>
+
+extern struct dentry *cpuidle_debugfs_root;
+
+/*
+ * Descriptor for one debugfs file under a cpuidle state's directory.
+ * @name:  file name created under the state's directory
+ * @mode:  file permission bits
+ * @show:  seq_file read callback; @data is the owning cpuidle_state
+ * @write: optional write callback; NULL for read-only attributes
+ */
+struct cpuidle_debugfs_attr {
+	const char *name;
+	umode_t mode;
+	int (*show)(void *, struct seq_file *);
+	ssize_t (*write)(void *, const char __user *, size_t, loff_t *);
+};
+
+extern void cpuidle_state_debugfs_register(int index);
+extern void cpuidle_state_debugfs_unregister(int index);
+#else
+/*
+ * Stubs must be static inline: plain definitions in a header are emitted
+ * in every translation unit that includes it, causing multiple-definition
+ * link errors when CONFIG_CPUIDLE_DEBUG_FS is disabled.
+ */
+static inline void cpuidle_state_debugfs_register(int index)
+{
+}
+
+static inline void cpuidle_state_debugfs_unregister(int index)
+{
+}
+#endif
+
 #ifdef CONFIG_ARCH_NEEDS_CPU_IDLE_COUPLED
 bool cpuidle_state_is_coupled(struct cpuidle_driver *drv, int state);
 int cpuidle_coupled_state_verify(struct cpuidle_driver *drv);
diff --git a/include/linux/cpuidle.h b/include/linux/cpuidle.h
index 2dbe46b7c213..fc4f2d7d7419 100644
--- a/include/linux/cpuidle.h
+++ b/include/linux/cpuidle.h
@@ -70,6 +70,9 @@ struct cpuidle_state {
 	void (*enter_s2idle) (struct cpuidle_device *dev,
 			      struct cpuidle_driver *drv,
 			      int index);
+#ifdef CONFIG_CPUIDLE_DEBUG_FS
+	struct dentry	*debugfs_dir;
+#endif
 };
 
 /* Idle State Flags */
-- 
2.17.1


^ permalink raw reply related	[flat|nested] 5+ messages in thread

* Re: [RFC] cpuidle : Add debugfs support for cpuidle core
  2019-12-17 14:38 [RFC] cpuidle : Add debugfs support for cpuidle core Abhishek Goel
@ 2019-12-17 16:51 ` Rafael J. Wysocki
  2019-12-18 14:26   ` Oliver O'Halloran
  0 siblings, 1 reply; 5+ messages in thread
From: Rafael J. Wysocki @ 2019-12-17 16:51 UTC (permalink / raw)
  To: Abhishek Goel
  Cc: Linux PM, Linux Kernel Mailing List, Rafael J. Wysocki,
	Daniel Lezcano, Gautham R. Shenoy, Oliver O'Halloran,
	Michael Ellerman, svaidy

On Tue, Dec 17, 2019 at 3:42 PM Abhishek Goel
<huntbag@linux.vnet.ibm.com> wrote:
>
> Up until now, we did not have a way to tune cpuidle attribute like
> residency in kernel. This patch adds support for debugfs in cpuidle core.
> Thereby providing support for tuning cpuidle attributes like residency in
> kernel at runtime.

This is not a good idea in my view, for a couple of reasons.

First off, if the target residency of an idle state is changed, it
effectively becomes a different one and all of the statistics
regarding it become outdated at that point.  Synchronizing that would
be a pain.

Next, governors may get confused if idle state parameters are changed
on the fly.  In particular, the statistics collected by the teo
governor depend on the target residencies of idle states, so if one of
them changes, the governor needs to be reloaded.

Next, idle states are expected to be ordered by the target residency
(and by the exit latency), so their parameters cannot be allowed to
change freely anyway.

Finally, the idle state parameters are expected to reflect the
properties of the hardware, which wouldn't hold any more if they were
allowed to change at any time.

> For example: Tuning residency at runtime can be used to quantify governors
> decision making as governor uses residency as one of the parameter to
> take decision about the state that needs to be entered while idling.

IMO it would be better to introduce a testing cpuidle driver with an
artificial set of idle states (or even such that the set of idle
states to be used by it can be defined by the user e.g. via module
parameters) for this purpose.

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [RFC] cpuidle : Add debugfs support for cpuidle core
  2019-12-17 16:51 ` Rafael J. Wysocki
@ 2019-12-18 14:26   ` Oliver O'Halloran
  2019-12-19  9:04     ` Rafael J. Wysocki
  0 siblings, 1 reply; 5+ messages in thread
From: Oliver O'Halloran @ 2019-12-18 14:26 UTC (permalink / raw)
  To: Rafael J. Wysocki
  Cc: Abhishek Goel, Linux PM, Linux Kernel Mailing List,
	Rafael J. Wysocki, Daniel Lezcano, Gautham R. Shenoy,
	Michael Ellerman, Vaidyanathan Srinivasan

On Wed, Dec 18, 2019 at 3:51 AM Rafael J. Wysocki <rafael@kernel.org> wrote:
>
> On Tue, Dec 17, 2019 at 3:42 PM Abhishek Goel
> <huntbag@linux.vnet.ibm.com> wrote:
> >
> > Up until now, we did not have a way to tune cpuidle attribute like
> > residency in kernel. This patch adds support for debugfs in cpuidle core.
> > Thereby providing support for tuning cpuidle attributes like residency in
> > kernel at runtime.
>
> This is not a good idea in my view, for a couple of reasons.
>
> First off, if the target residency of an idle state is changed, it
> effectively becomes a different one and all of the statistics
> regarding it become outdated at that point.  Synchronizing that would
> be a pain.
>
> Next, governors may get confused if idle state parameters are changed
> on the fly.  In particular, the statistics collected by the teo
> governor depend on the target residencies of idle states, so if one of
> them changes, the governor needs to be reloaded.
>
> Next, idle states are expected to be ordered by the target residency
> (and by the exit latency), so their parameters cannot be allowed to
> change freely anyway.
>
> Finally, the idle state parameters are expected to reflect the
> properties of the hardware, which wouldn't hold any more if they were
> allowed to change at any time.

Certainly does sound like a headache.

> > For example: Tuning residency at runtime can be used to quantify governors
> > decision making as governor uses residency as one of the parameter to
> > take decision about the state that needs to be entered while idling.
>
> IMO it would be better to introduce a testing cpuidle driver with an
> artificial set of idle states (or even such that the set of idle
> states to be used by it can be defined by the user e.g. via module
> parameters) for this purpose.

The motivation for this patch isn't really a desire to test / tune the
governor. It's intended to allow working around a performance problem
caused by using high-latency idle states on some interrupt heavy GPU
workload. The interrupts occur around ~30ms apart which is long enough
for the governor to put the CPU into the deeper states and over the
course of long job the additional wakeup latency adds up. The initial
fix someone came up with was cooking the residency values so the
high-latency states had a residency of +50ms to prevent the govenor
from using them. However, that fix is supposed to go into a bit of
firmware I maintain and I'm not terribly happy with the idea. I'm
fairly sure that ~30ms value is workload dependent and personally I
don't think firmware should be making up numbers to trick specific
kernel versions into doing specific things.

My impression is the right solution is to have the GPU driver set a PM
QoS constraint on the CPUs receiving interrupts while a job is
on-going. However, interrupt latency sensitivity isn't something
that's unique to GPUs so I'm wondering it it makes sense to have the
governor factor in interrupt traffic when deciding what state to use.
Is that something that's been tried before?

Oliver

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [RFC] cpuidle : Add debugfs support for cpuidle core
  2019-12-18 14:26   ` Oliver O'Halloran
@ 2019-12-19  9:04     ` Rafael J. Wysocki
  2019-12-20 12:50       ` Abhishek
  0 siblings, 1 reply; 5+ messages in thread
From: Rafael J. Wysocki @ 2019-12-19  9:04 UTC (permalink / raw)
  To: Oliver O'Halloran
  Cc: Rafael J. Wysocki, Abhishek Goel, Linux PM,
	Linux Kernel Mailing List, Rafael J. Wysocki, Daniel Lezcano,
	Gautham R. Shenoy, Michael Ellerman, Vaidyanathan Srinivasan

On Wed, Dec 18, 2019 at 3:26 PM Oliver O'Halloran <oohall@gmail.com> wrote:
>
> On Wed, Dec 18, 2019 at 3:51 AM Rafael J. Wysocki <rafael@kernel.org> wrote:
> >
> > On Tue, Dec 17, 2019 at 3:42 PM Abhishek Goel
> > <huntbag@linux.vnet.ibm.com> wrote:
> > >
> > > Up until now, we did not have a way to tune cpuidle attribute like
> > > residency in kernel. This patch adds support for debugfs in cpuidle core.
> > > Thereby providing support for tuning cpuidle attributes like residency in
> > > kernel at runtime.
> >
> > This is not a good idea in my view, for a couple of reasons.
> >
> > First off, if the target residency of an idle state is changed, it
> > effectively becomes a different one and all of the statistics
> > regarding it become outdated at that point.  Synchronizing that would
> > be a pain.
> >
> > Next, governors may get confused if idle state parameters are changed
> > on the fly.  In particular, the statistics collected by the teo
> > governor depend on the target residencies of idle states, so if one of
> > them changes, the governor needs to be reloaded.
> >
> > Next, idle states are expected to be ordered by the target residency
> > (and by the exit latency), so their parameters cannot be allowed to
> > change freely anyway.
> >
> > Finally, the idle state parameters are expected to reflect the
> > properties of the hardware, which wouldn't hold any more if they were
> > allowed to change at any time.
>
> Certainly does sound like a headache.
>
> > > For example: Tuning residency at runtime can be used to quantify governors
> > > decision making as governor uses residency as one of the parameter to
> > > take decision about the state that needs to be entered while idling.
> >
> > IMO it would be better to introduce a testing cpuidle driver with an
> > artificial set of idle states (or even such that the set of idle
> > states to be used by it can be defined by the user e.g. via module
> > parameters) for this purpose.
>
> The motivation for this patch isn't really a desire to test / tune the
> governor. It's intended to allow working around a performance problem
> caused by using high-latency idle states on some interrupt heavy GPU
> workload. The interrupts occur around ~30ms apart which is long enough
> for the governor to put the CPU into the deeper states and over the
> course of long job the additional wakeup latency adds up. The initial
> fix someone came up with was cooking the residency values so the
> high-latency states had a residency of +50ms to prevent the govenor
> from using them. However, that fix is supposed to go into a bit of
> firmware I maintain and I'm not terribly happy with the idea. I'm
> fairly sure that ~30ms value is workload dependent and personally I
> don't think firmware should be making up numbers to trick specific
> kernel versions into doing specific things.
>
> My impression is the right solution is to have the GPU driver set a PM
> QoS constraint on the CPUs receiving interrupts while a job is
> on-going.

Yes, that would address the GPU problem.

> However, interrupt latency sensitivity isn't something
> that's unique to GPUs so I'm wondering it it makes sense to have the
> governor factor in interrupt traffic when deciding what state to use.
> Is that something that's been tried before?

Yes, that is in the works.

The existing governors should take interrupts into account too in the
form of the expected idle duration corrections, but that may not be
particularly precise.  If the governor currently in use (I guess menu)
doesn't to that, you may try an alternative one (e.g. teo).

That said, work is in progress on taking the actual interrupt
frequency into account in idle duration prediction.

Thanks!

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [RFC] cpuidle : Add debugfs support for cpuidle core
  2019-12-19  9:04     ` Rafael J. Wysocki
@ 2019-12-20 12:50       ` Abhishek
  0 siblings, 0 replies; 5+ messages in thread
From: Abhishek @ 2019-12-20 12:50 UTC (permalink / raw)
  To: Rafael J. Wysocki, Oliver O'Halloran
  Cc: Linux PM, Linux Kernel Mailing List, Rafael J. Wysocki,
	Daniel Lezcano, Gautham R. Shenoy, Michael Ellerman,
	Vaidyanathan Srinivasan

Hi Rafael,


On 12/19/2019 02:34 PM, Rafael J. Wysocki wrote:
> On Wed, Dec 18, 2019 at 3:26 PM Oliver O'Halloran <oohall@gmail.com> wrote:
>> On Wed, Dec 18, 2019 at 3:51 AM Rafael J. Wysocki <rafael@kernel.org> wrote:
>>> On Tue, Dec 17, 2019 at 3:42 PM Abhishek Goel
>>> <huntbag@linux.vnet.ibm.com> wrote:
>>>> Up until now, we did not have a way to tune cpuidle attribute like
>>>> residency in kernel. This patch adds support for debugfs in cpuidle core.
>>>> Thereby providing support for tuning cpuidle attributes like residency in
>>>> kernel at runtime.
>>> This is not a good idea in my view, for a couple of reasons.
>>>
>>> First off, if the target residency of an idle state is changed, it
>>> effectively becomes a different one and all of the statistics
>>> regarding it become outdated at that point.  Synchronizing that would
>>> be a pain.
>>>
>>> Next, governors may get confused if idle state parameters are changed
>>> on the fly.  In particular, the statistics collected by the teo
>>> governor depend on the target residencies of idle states, so if one of
>>> them changes, the governor needs to be reloaded.
>>>
>>> Next, idle states are expected to be ordered by the target residency
>>> (and by the exit latency), so their parameters cannot be allowed to
>>> change freely anyway.
>>>
>>> Finally, the idle state parameters are expected to reflect the
>>> properties of the hardware, which wouldn't hold any more if they were
>>> allowed to change at any time.
>> Certainly does sound like a headache.
>>
>>>> For example: Tuning residency at runtime can be used to quantify governors
>>>> decision making as governor uses residency as one of the parameter to
>>>> take decision about the state that needs to be entered while idling.
>>> IMO it would be better to introduce a testing cpuidle driver with an
>>> artificial set of idle states (or even such that the set of idle
>>> states to be used by it can be defined by the user e.g. via module
>>> parameters) for this purpose.
>> The motivation for this patch isn't really a desire to test / tune the
>> governor. It's intended to allow working around a performance problem
>> caused by using high-latency idle states on some interrupt heavy GPU
>> workload. The interrupts occur around ~30ms apart which is long enough
>> for the governor to put the CPU into the deeper states and over the
>> course of long job the additional wakeup latency adds up. The initial
>> fix someone came up with was cooking the residency values so the
>> high-latency states had a residency of +50ms to prevent the govenor
>> from using them. However, that fix is supposed to go into a bit of
>> firmware I maintain and I'm not terribly happy with the idea. I'm
>> fairly sure that ~30ms value is workload dependent and personally I
>> don't think firmware should be making up numbers to trick specific
>> kernel versions into doing specific things.
>>
>> My impression is the right solution is to have the GPU driver set a PM
>> QoS constraint on the CPUs receiving interrupts while a job is
>> on-going.
> Yes, that would address the GPU problem.
>
>> However, interrupt latency sensitivity isn't something
>> that's unique to GPUs so I'm wondering it it makes sense to have the
>> governor factor in interrupt traffic when deciding what state to use.
>> Is that something that's been tried before?
> Yes, that is in the works.
>
> The existing governors should take interrupts into account too in the
> form of the expected idle duration corrections, but that may not be
> particularly precise.  If the governor currently in use (I guess menu)
> doesn't to that, you may try an alternative one (e.g. teo).

For this particular case, I tried TEO but it did not solve the issue.
> That said, work is in progress on taking the actual interrupt
> frequency into account in idle duration prediction.
>
> Thanks!

Could you please point out to the patch looking into device interrupt
frequency for cpuidle.

Thanks.


^ permalink raw reply	[flat|nested] 5+ messages in thread

end of thread, other threads:[~2019-12-20 12:51 UTC | newest]

Thread overview: 5+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2019-12-17 14:38 [RFC] cpuidle : Add debugfs support for cpuidle core Abhishek Goel
2019-12-17 16:51 ` Rafael J. Wysocki
2019-12-18 14:26   ` Oliver O'Halloran
2019-12-19  9:04     ` Rafael J. Wysocki
2019-12-20 12:50       ` Abhishek

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).