All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH v2 0/2] Adds cpu power accounting per-pid basis.
@ 2015-05-15  0:12 Ruchi Kandoi
  2015-05-15  0:12 ` [PATCH v2 1/2] cpufreq_stats: Adds sysfs file /sys/devices/system/cpu/cpufreq/current_in_state Ruchi Kandoi
                   ` (3 more replies)
  0 siblings, 4 replies; 10+ messages in thread
From: Ruchi Kandoi @ 2015-05-15  0:12 UTC (permalink / raw)
  To: kandoiruchi, Rafael J. Wysocki, Viresh Kumar, Ingo Molnar,
	Peter Zijlstra, Andrew Morton, Oleg Nesterov, Kirill A. Shutemov,
	Vladimir Davydov, Heinrich Schuchardt, Thomas Gleixner,
	Kees Cook, Konstantin Khlebnikov, Davidlohr Bueso, linux-pm,
	linux-kernel

These patches add a mechanism which will accurately caculate the CPU power
used by all the processes in the system. In order to account for the power
used by all the processes a data field "cpu_power" has been added in the
task_struct. This field adds power for both the system as well as user
time. cpu_power contains the total amount of charge(in uAmsec units) used
by the process. This model takes into account the frequency at which the
process was running(i.e higher power for processes running at higher
frequencies). It requires the cpufreq_stats module to be initialized with
the current numbers for each of the CPU core at each frequency. This will
be initialized during init time.

Ruchi Kandoi (2):
  cpufreq_stats: Adds sysfs file    
    /sys/devices/system/cpu/cpufreq/current_in_state
  sched: cpufreq: Adds a field cpu_power in the task_struct

 drivers/cpufreq/cpufreq_stats.c | 191 +++++++++++++++++++++++++++++++++++++++-
 include/linux/cpufreq.h         |   8 ++
 include/linux/sched.h           |   2 +
 kernel/fork.c                   |   1 +
 kernel/sched/cputime.c          |   7 ++
 5 files changed, 207 insertions(+), 2 deletions(-)

-- 
2.2.0.rc0.207.ga3a616c


^ permalink raw reply	[flat|nested] 10+ messages in thread

* [PATCH v2 1/2] cpufreq_stats: Adds sysfs file /sys/devices/system/cpu/cpufreq/current_in_state
  2015-05-15  0:12 [PATCH v2 0/2] Adds cpu power accounting per-pid basis Ruchi Kandoi
@ 2015-05-15  0:12 ` Ruchi Kandoi
  2015-05-15  2:48   ` Viresh Kumar
  2015-05-15  0:12 ` [PATCH v2 2/2] sched: cpufreq: Adds a field cpu_power in the task_struct Ruchi Kandoi
                   ` (2 subsequent siblings)
  3 siblings, 1 reply; 10+ messages in thread
From: Ruchi Kandoi @ 2015-05-15  0:12 UTC (permalink / raw)
  To: kandoiruchi, Rafael J. Wysocki, Viresh Kumar, Ingo Molnar,
	Peter Zijlstra, Andrew Morton, Oleg Nesterov, Kirill A. Shutemov,
	Vladimir Davydov, Heinrich Schuchardt, Thomas Gleixner,
	Kees Cook, Konstantin Khlebnikov, Davidlohr Bueso, linux-pm,
	linux-kernel

Adds the sysfs file for userspace to initialize the active current
values for all the cores at each of the frequencies.

The format for storing the values is as follows:
echo "CPU<cpu#>:<freq1>=<current in uA> <freq2>=<current>,CPU<cpu#>:
..." > /sys/devices/system/cpu/cpufreq/current_in_state

Signed-off-by: Ruchi Kandoi <kandoiruchi@google.com>
---
 drivers/cpufreq/cpufreq_stats.c | 163 +++++++++++++++++++++++++++++++++++++++-
 1 file changed, 161 insertions(+), 2 deletions(-)

diff --git a/drivers/cpufreq/cpufreq_stats.c b/drivers/cpufreq/cpufreq_stats.c
index 5e370a3..6f0b562 100644
--- a/drivers/cpufreq/cpufreq_stats.c
+++ b/drivers/cpufreq/cpufreq_stats.c
@@ -30,6 +30,14 @@ struct cpufreq_stats {
 #endif
 };
 
+struct cpufreq_power_stats {
+	unsigned int state_num;
+	unsigned int *curr;
+	unsigned int *freq_table;
+};
+
+static DEFINE_PER_CPU(struct cpufreq_power_stats *, cpufreq_power_stats);
+
 static int cpufreq_stats_update(struct cpufreq_stats *stats)
 {
 	unsigned long long cur_time = get_jiffies_64();
@@ -61,6 +69,87 @@ static ssize_t show_time_in_state(struct cpufreq_policy *policy, char *buf)
 	return len;
 }
 
+static void store_current_value(struct cpufreq_power_stats *powerstats,
+		int freq, int curr)
+{
+	int i;
+
+	/* freq_table doesn't contain any CPU_FREQ_INVALID */
+	for (i = 0; i < powerstats->state_num; i++) {
+		if (powerstats->freq_table[i] == freq) {
+			powerstats->curr[i] = curr;
+			break;
+		}
+	}
+}
+
+static ssize_t store_current_in_state(struct cpufreq_policy *policy,
+		const char *buf, size_t len)
+{
+	char *cp, *cp2, *start, *buffer;
+	unsigned int cpu_num, ret, curr, freq;
+	struct cpufreq_power_stats *powerstats;
+
+	if (!buf || len < 0)
+		return len;
+
+	buffer = kzalloc(len + 1, GFP_KERNEL);
+	if (!buffer)
+		return len;
+
+	strncpy(buffer, buf, len);
+	buffer[len] = '\0';
+	cp = buffer;
+	spin_lock(&cpufreq_stats_lock);
+	while ((start = strsep(&cp, ","))) {
+		ret = sscanf(start, "CPU%u:", &cpu_num);
+		if (ret != 1 || cpu_num > (num_possible_cpus() - 1)) {
+			ret = -EINVAL;
+			goto error;
+		}
+		powerstats = per_cpu(cpufreq_power_stats, cpu_num);
+		if (!powerstats)
+			continue;
+
+		/* sscanf makes sure that strchr doesn't return a NULL */
+		cp2 = strchr(start, ':') + 1;
+		while ((start = strsep(&cp2, " "))) {
+			if (sscanf(start, "%u=%u", &freq, &curr) != 2) {
+				ret = -EINVAL;
+				goto error;
+			}
+			store_current_value(powerstats, freq, curr);
+		}
+	}
+	ret = len;
+error:
+	spin_unlock(&cpufreq_stats_lock);
+	kfree(buffer);
+	return ret;
+}
+
+static ssize_t show_current_in_state(struct cpufreq_policy *policy, char *buf)
+{
+	ssize_t len = 0;
+	unsigned int i, cpu;
+	struct cpufreq_power_stats *powerstats;
+
+	spin_lock(&cpufreq_stats_lock);
+	for_each_possible_cpu(cpu) {
+		powerstats = per_cpu(cpufreq_power_stats, cpu);
+		if (!powerstats)
+			continue;
+		len += scnprintf(buf + len, PAGE_SIZE - len, "CPU%d:", cpu);
+		for (i = 0; i < powerstats->state_num; i++)
+			len += scnprintf(buf + len, PAGE_SIZE - len,
+					"%d=%d ", powerstats->freq_table[i],
+					powerstats->curr[i]);
+		len += scnprintf(buf + len, PAGE_SIZE - len, "\n");
+	}
+	spin_unlock(&cpufreq_stats_lock);
+	return len;
+}
+
 #ifdef CONFIG_CPU_FREQ_STAT_DETAILS
 static ssize_t show_trans_table(struct cpufreq_policy *policy, char *buf)
 {
@@ -107,6 +196,7 @@ cpufreq_freq_attr_ro(trans_table);
 
 cpufreq_freq_attr_ro(total_trans);
 cpufreq_freq_attr_ro(time_in_state);
+cpufreq_freq_attr_rw(current_in_state);
 
 static struct attribute *default_attrs[] = {
 	&total_trans.attr,
@@ -159,6 +249,67 @@ static void cpufreq_stats_free_table(unsigned int cpu)
 	cpufreq_cpu_put(policy);
 }
 
+static void cpufreq_powerstats_free(void)
+{
+	int cpu;
+	struct cpufreq_power_stats *powerstats;
+
+	sysfs_remove_file(cpufreq_global_kobject, &current_in_state.attr);
+
+	for_each_possible_cpu(cpu) {
+		powerstats = per_cpu(cpufreq_power_stats, cpu);
+		if (!powerstats)
+			continue;
+		kfree(powerstats->curr);
+		kfree(powerstats);
+		per_cpu(cpufreq_power_stats, cpu) = NULL;
+	}
+}
+
+static void cpufreq_powerstats_create(unsigned int cpu)
+{
+	unsigned int alloc_size, i = 0, j = 0, count = 0;
+	struct cpufreq_power_stats *powerstats;
+	struct cpufreq_frequency_table *pos, *table;
+
+	/* We need cpufreq table for creating power stats table */
+	table = cpufreq_frequency_get_table(cpu);
+	if (unlikely(!table))
+		return;
+
+	powerstats = kzalloc(sizeof(struct cpufreq_power_stats),
+			GFP_KERNEL);
+	if (!powerstats)
+		return;
+
+	/* Find total allocation size */
+	cpufreq_for_each_valid_entry(pos, table)
+		count++;
+
+	/* Allocate memory for freq table per cpu as well as clockticks per
+	 * freq*/
+	alloc_size = count * sizeof(unsigned int) +
+		count * sizeof(unsigned int);
+	powerstats->curr = kzalloc(alloc_size, GFP_KERNEL);
+	if (!powerstats->curr) {
+		kfree(powerstats);
+		return;
+	}
+	powerstats->freq_table = powerstats->curr + count;
+
+	spin_lock(&cpufreq_stats_lock);
+	for (i = 0; table[i].frequency != CPUFREQ_TABLE_END && j < count; i++) {
+		unsigned int freq = table[i].frequency;
+
+		if (freq == CPUFREQ_ENTRY_INVALID)
+			continue;
+		powerstats->freq_table[j++] = freq;
+	}
+	powerstats->state_num = j;
+	per_cpu(cpufreq_power_stats, cpu) = powerstats;
+	spin_unlock(&cpufreq_stats_lock);
+}
+
 static int __cpufreq_stats_create_table(struct cpufreq_policy *policy)
 {
 	unsigned int i = 0, count = 0, ret = -ENOMEM;
@@ -249,9 +400,11 @@ static int cpufreq_stat_notifier_policy(struct notifier_block *nb,
 	int ret = 0;
 	struct cpufreq_policy *policy = data;
 
-	if (val == CPUFREQ_CREATE_POLICY)
+	if (val == CPUFREQ_CREATE_POLICY) {
 		ret = __cpufreq_stats_create_table(policy);
-	else if (val == CPUFREQ_REMOVE_POLICY)
+		if (!per_cpu(cpufreq_power_stats, policy->cpu))
+			cpufreq_powerstats_create(policy->cpu);
+	} else if (val == CPUFREQ_REMOVE_POLICY)
 		__cpufreq_stats_free_table(policy);
 
 	return ret;
@@ -335,8 +488,13 @@ static int __init cpufreq_stats_init(void)
 		return ret;
 	}
 
+	ret = sysfs_create_file(cpufreq_global_kobject, &current_in_state.attr);
+	if (ret)
+		pr_warn("Cannot create sysfs file for cpufreq current stats\n");
+
 	return 0;
 }
+
 static void __exit cpufreq_stats_exit(void)
 {
 	unsigned int cpu;
@@ -347,6 +505,7 @@ static void __exit cpufreq_stats_exit(void)
 			CPUFREQ_TRANSITION_NOTIFIER);
 	for_each_online_cpu(cpu)
 		cpufreq_stats_free_table(cpu);
+	cpufreq_powerstats_free();
 }
 
 MODULE_AUTHOR("Zou Nan hai <nanhai.zou@intel.com>");
-- 
2.2.0.rc0.207.ga3a616c


^ permalink raw reply related	[flat|nested] 10+ messages in thread

* [PATCH v2 2/2] sched: cpufreq: Adds a field cpu_power in the task_struct
  2015-05-15  0:12 [PATCH v2 0/2] Adds cpu power accounting per-pid basis Ruchi Kandoi
  2015-05-15  0:12 ` [PATCH v2 1/2] cpufreq_stats: Adds sysfs file /sys/devices/system/cpu/cpufreq/current_in_state Ruchi Kandoi
@ 2015-05-15  0:12 ` Ruchi Kandoi
  2015-05-15  6:34 ` [PATCH v2 0/2] Adds cpu power accounting per-pid basis Heinrich Schuchardt
  2015-05-21 14:34 ` Daniel Lezcano
  3 siblings, 0 replies; 10+ messages in thread
From: Ruchi Kandoi @ 2015-05-15  0:12 UTC (permalink / raw)
  To: kandoiruchi, Rafael J. Wysocki, Viresh Kumar, Ingo Molnar,
	Peter Zijlstra, Andrew Morton, Oleg Nesterov, Kirill A. Shutemov,
	Vladimir Davydov, Heinrich Schuchardt, Thomas Gleixner,
	Kees Cook, Konstantin Khlebnikov, Davidlohr Bueso, linux-pm,
	linux-kernel

cpu_power has been added to keep track of amount of power each task is
consuming. cpu_power is updated whenever stime and utime are updated for
a task. power is computed by taking into account the frequency at which
the current core was running and the current for cpu actively
running at hat frequency.

Signed-off-by: Ruchi Kandoi <kandoiruchi@google.com>
---
 drivers/cpufreq/cpufreq_stats.c | 28 ++++++++++++++++++++++++++++
 include/linux/cpufreq.h         |  8 ++++++++
 include/linux/sched.h           |  2 ++
 kernel/fork.c                   |  1 +
 kernel/sched/cputime.c          |  7 +++++++
 5 files changed, 46 insertions(+)

diff --git a/drivers/cpufreq/cpufreq_stats.c b/drivers/cpufreq/cpufreq_stats.c
index 6f0b562..682ed898 100644
--- a/drivers/cpufreq/cpufreq_stats.c
+++ b/drivers/cpufreq/cpufreq_stats.c
@@ -14,6 +14,7 @@
 #include <linux/module.h>
 #include <linux/slab.h>
 #include <linux/cputime.h>
+#include <linux/sched.h>
 
 static spinlock_t cpufreq_stats_lock;
 
@@ -83,6 +84,33 @@ static void store_current_value(struct cpufreq_power_stats *powerstats,
 	}
 }
 
+void acct_update_power(struct task_struct *task, cputime_t cputime)
+{
+	struct cpufreq_power_stats *powerstats;
+	struct cpufreq_stats *stats;
+	struct cpufreq_policy *policy;
+	unsigned int cpu_num, curr;
+
+	if (!task)
+		return;
+	cpu_num = task_cpu(task);
+	powerstats = per_cpu(cpufreq_power_stats, cpu_num);
+	policy = cpufreq_cpu_get(cpu_num);
+	if (!policy)
+		return;
+
+	if (!powerstats || !(policy->stats)) {
+		cpufreq_cpu_put(policy);
+		return;
+	}
+
+	stats = policy->stats;
+	curr = powerstats->curr[stats->last_index];
+	task->cpu_power += curr * cputime_to_usecs(cputime);
+	cpufreq_cpu_put(policy);
+}
+EXPORT_SYMBOL_GPL(acct_update_power);
+
 static ssize_t store_current_in_state(struct cpufreq_policy *policy,
 		const char *buf, size_t len)
 {
diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h
index 2ee4888..86826c8 100644
--- a/include/linux/cpufreq.h
+++ b/include/linux/cpufreq.h
@@ -18,6 +18,7 @@
 #include <linux/notifier.h>
 #include <linux/spinlock.h>
 #include <linux/sysfs.h>
+#include <asm/cputime.h>
 
 /*********************************************************************
  *                        CPUFREQ INTERFACE                          *
@@ -601,4 +602,11 @@ unsigned int cpufreq_generic_get(unsigned int cpu);
 int cpufreq_generic_init(struct cpufreq_policy *policy,
 		struct cpufreq_frequency_table *table,
 		unsigned int transition_latency);
+
+/*********************************************************************
+ *                         CPUFREQ STATS                             *
+ *********************************************************************/
+
+void acct_update_power(struct task_struct *p, cputime_t cputime);
+
 #endif /* _LINUX_CPUFREQ_H */
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 26a2e61..1f2400a 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1429,6 +1429,7 @@ struct task_struct {
 	int __user *clear_child_tid;		/* CLONE_CHILD_CLEARTID */
 
 	cputime_t utime, stime, utimescaled, stimescaled;
+	unsigned long long cpu_power;
 	cputime_t gtime;
 #ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
 	struct cputime prev_cputime;
@@ -1441,6 +1442,7 @@ struct task_struct {
 		VTIME_USER,
 		VTIME_SYS,
 	} vtime_snap_whence;
+
 #endif
 	unsigned long nvcsw, nivcsw; /* context switch counts */
 	u64 start_time;		/* monotonic time in nsec */
diff --git a/kernel/fork.c b/kernel/fork.c
index 03c1eaa..2ca0e9e 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1341,6 +1341,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 
 	p->utime = p->stime = p->gtime = 0;
 	p->utimescaled = p->stimescaled = 0;
+	p->cpu_power = 0;
 #ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
 	p->prev_cputime.utime = p->prev_cputime.stime = 0;
 #endif
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 8394b1e..53a79d5 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -4,6 +4,7 @@
 #include <linux/kernel_stat.h>
 #include <linux/static_key.h>
 #include <linux/context_tracking.h>
+#include <linux/cpufreq.h>
 #include "sched.h"
 
 
@@ -149,6 +150,9 @@ void account_user_time(struct task_struct *p, cputime_t cputime,
 
 	/* Account for user time used */
 	acct_account_cputime(p);
+
+	/* Account power usage for user time */
+	acct_update_power(p, cputime);
 }
 
 /*
@@ -199,6 +203,9 @@ void __account_system_time(struct task_struct *p, cputime_t cputime,
 
 	/* Account for system time used */
 	acct_account_cputime(p);
+
+	/* Account power usage for system time */
+	acct_update_power(p, cputime);
 }
 
 /*
-- 
2.2.0.rc0.207.ga3a616c


^ permalink raw reply related	[flat|nested] 10+ messages in thread

* Re: [PATCH v2 1/2] cpufreq_stats: Adds sysfs file /sys/devices/system/cpu/cpufreq/current_in_state
  2015-05-15  0:12 ` [PATCH v2 1/2] cpufreq_stats: Adds sysfs file /sys/devices/system/cpu/cpufreq/current_in_state Ruchi Kandoi
@ 2015-05-15  2:48   ` Viresh Kumar
  2015-05-16  0:55     ` Ruchi Kandoi
  0 siblings, 1 reply; 10+ messages in thread
From: Viresh Kumar @ 2015-05-15  2:48 UTC (permalink / raw)
  To: Ruchi Kandoi
  Cc: Rafael J. Wysocki, Ingo Molnar, Peter Zijlstra, Andrew Morton,
	Oleg Nesterov, Kirill A. Shutemov, Vladimir Davydov,
	Heinrich Schuchardt, Thomas Gleixner, Kees Cook,
	Konstantin Khlebnikov, Davidlohr Bueso, linux-pm, linux-kernel

I am not replying for concept here, as sched maintainers are in a
better position for that, but a nit below..

On 14-05-15, 17:12, Ruchi Kandoi wrote:
> Adds the sysfs file for userspace to initialize the active current
> values for all the cores at each of the frequencies.
> 
> The format for storing the values is as follows:
> echo "CPU<cpu#>:<freq1>=<current in uA> <freq2>=<current>,CPU<cpu#>:
> ..." > /sys/devices/system/cpu/cpufreq/current_in_state

Why this file? And not
/sys/devices/system/cpu/cpuX/cpufreq/stats/current_in_state ? That way
you don't have to replicate the same information for all CPUs, as the
stats folder can be shared by multiple CPUs (which share their
clock/voltage rails)..

-- 
viresh

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH v2 0/2] Adds cpu power accounting per-pid basis.
  2015-05-15  0:12 [PATCH v2 0/2] Adds cpu power accounting per-pid basis Ruchi Kandoi
  2015-05-15  0:12 ` [PATCH v2 1/2] cpufreq_stats: Adds sysfs file /sys/devices/system/cpu/cpufreq/current_in_state Ruchi Kandoi
  2015-05-15  0:12 ` [PATCH v2 2/2] sched: cpufreq: Adds a field cpu_power in the task_struct Ruchi Kandoi
@ 2015-05-15  6:34 ` Heinrich Schuchardt
  2015-05-18 21:00   ` Ruchi Kandoi
  2015-05-21 14:34 ` Daniel Lezcano
  3 siblings, 1 reply; 10+ messages in thread
From: Heinrich Schuchardt @ 2015-05-15  6:34 UTC (permalink / raw)
  To: Ruchi Kandoi, Rafael J. Wysocki, Viresh Kumar, Ingo Molnar,
	Peter Zijlstra, Andrew Morton, Oleg Nesterov, Kirill A. Shutemov,
	Vladimir Davydov, Thomas Gleixner, Kees Cook,
	Konstantin Khlebnikov, Davidlohr Bueso, linux-pm, linux-kernel

On 15.05.2015 02:12, Ruchi Kandoi wrote:
> These patches add a mechanism which will accurately caculate the CPU power
> used by all the processes in the system. In order to account for the power
> used by all the processes a data field "cpu_power" has been added in the
> task_struct.

Hello Ruchi,

could you, please, explain why the CPU power consumption per task
information is needed. Please, consider that the CPU causes only part of
the total system power consumption which also comprises GPU, cooling,
RAM, etc.

The patch series increases the memory size of the kernel, the memory
consumption per thread and the thread switching time. So, please,
introduce a configuration switch to enable/disable the function.

> This field adds power for both the system as well as user
> time. cpu_power contains the total amount of charge(in uAmsec units) used
> by the process.

Is there any reasonable way to assign the power consumption to a single
task if multiple tasks are executed on the same core at the same time
(e.g. using hyperthreading)?

> This model takes into account the frequency at which the
> process was running(i.e higher power for processes running at higher
> frequencies). It requires the cpufreq_stats module to be initialized with
> the current numbers for each of the CPU core at each frequency. This will
> be initialized during init time.

This does not account for power consumption depending on anything else
but frequency, e.g. floating point commands consuming more power than NOPs.

Best regards

Heinrich Schuchardt
> 
> Ruchi Kandoi (2):
>   cpufreq_stats: Adds sysfs file    
>     /sys/devices/system/cpu/cpufreq/current_in_state
>   sched: cpufreq: Adds a field cpu_power in the task_struct
> 
>  drivers/cpufreq/cpufreq_stats.c | 191 +++++++++++++++++++++++++++++++++++++++-
>  include/linux/cpufreq.h         |   8 ++
>  include/linux/sched.h           |   2 +
>  kernel/fork.c                   |   1 +
>  kernel/sched/cputime.c          |   7 ++
>  5 files changed, 207 insertions(+), 2 deletions(-)
> 


^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH v2 1/2] cpufreq_stats: Adds sysfs file /sys/devices/system/cpu/cpufreq/current_in_state
  2015-05-15  2:48   ` Viresh Kumar
@ 2015-05-16  0:55     ` Ruchi Kandoi
  2015-05-16  2:15       ` Viresh Kumar
  0 siblings, 1 reply; 10+ messages in thread
From: Ruchi Kandoi @ 2015-05-16  0:55 UTC (permalink / raw)
  To: Viresh Kumar
  Cc: Rafael J. Wysocki, Ingo Molnar, Peter Zijlstra, Andrew Morton,
	Oleg Nesterov, Kirill A. Shutemov, Vladimir Davydov,
	Heinrich Schuchardt, Thomas Gleixner, Kees Cook,
	Konstantin Khlebnikov, Davidlohr Bueso, linux-pm, linux-kernel

On Thu, May 14, 2015 at 7:48 PM, Viresh Kumar <viresh.kumar@linaro.org> wrote:
> I am not replying for concept here, as sched maintainers are in a
> better position for that, but a nit below..
>
> On 14-05-15, 17:12, Ruchi Kandoi wrote:
>> Adds the sysfs file for userspace to initialize the active current
>> values for all the cores at each of the frequencies.
>>
>> The format for storing the values is as follows:
>> echo "CPU<cpu#>:<freq1>=<current in uA> <freq2>=<current>,CPU<cpu#>:
>> ..." > /sys/devices/system/cpu/cpufreq/current_in_state
>
> Why this file? And not
> /sys/devices/system/cpu/cpuX/cpufreq/stats/current_in_state ? That way
> you don't have to replicate the same information for all CPUs, as the
> stats folder can be shared by multiple CPUs (which share their
> clock/voltage rails)..

Some of the hand-held devices support hot-plugging of the cpus and
when the core is hot-plugged out the
/sys/devices/system/cpu/cpuX/cpufreq directory is removed too. So it
won't be possible to share folders by multiple CPUs.

>
> --
> viresh

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH v2 1/2] cpufreq_stats: Adds sysfs file /sys/devices/system/cpu/cpufreq/current_in_state
  2015-05-16  0:55     ` Ruchi Kandoi
@ 2015-05-16  2:15       ` Viresh Kumar
  0 siblings, 0 replies; 10+ messages in thread
From: Viresh Kumar @ 2015-05-16  2:15 UTC (permalink / raw)
  To: Ruchi Kandoi
  Cc: Rafael J. Wysocki, Ingo Molnar, Peter Zijlstra, Andrew Morton,
	Oleg Nesterov, Kirill A. Shutemov, Vladimir Davydov,
	Heinrich Schuchardt, Thomas Gleixner, Kees Cook,
	Konstantin Khlebnikov, Davidlohr Bueso, linux-pm, linux-kernel

On 15-05-15, 17:55, Ruchi Kandoi wrote:
> Some of the hand-held devices support hot-plugging of the cpus and
> when the core is hot-plugged out the
> /sys/devices/system/cpu/cpuX/cpufreq directory is removed too. So it
> won't be possible to share folders by multiple CPUs.

Okay, that is changing now..

http://permalink.gmane.org/gmane.linux.power-management.general/59841

-- 
viresh

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH v2 0/2] Adds cpu power accounting per-pid basis.
  2015-05-15  6:34 ` [PATCH v2 0/2] Adds cpu power accounting per-pid basis Heinrich Schuchardt
@ 2015-05-18 21:00   ` Ruchi Kandoi
  0 siblings, 0 replies; 10+ messages in thread
From: Ruchi Kandoi @ 2015-05-18 21:00 UTC (permalink / raw)
  To: Heinrich Schuchardt
  Cc: Rafael J. Wysocki, Viresh Kumar, Ingo Molnar, Peter Zijlstra,
	Andrew Morton, Oleg Nesterov, Kirill A. Shutemov,
	Vladimir Davydov, Thomas Gleixner, Kees Cook,
	Konstantin Khlebnikov, Davidlohr Bueso, linux-pm, linux-kernel

On Thu, May 14, 2015 at 11:34 PM, Heinrich Schuchardt
<xypron.glpk@gmx.de> wrote:
>
> On 15.05.2015 02:12, Ruchi Kandoi wrote:
> > These patches add a mechanism which will accurately caculate the CPU power
> > used by all the processes in the system. In order to account for the power
> > used by all the processes a data field "cpu_power" has been added in the
> > task_struct.
>
> Hello Ruchi,
>
> could you, please, explain why the CPU power consumption per task
> information is needed. Please, consider that the CPU causes only part of
> the total system power consumption which also comprises GPU, cooling,
> RAM, etc.

In order to accurately account for the battery used by each of the
process, keeping a track of how long the process ran is not
sufficient. Since running at different frequency has varying power
consumption, we want to track a power number which takes into
consideration the frequency as well as the core on which it was
running. There are similar efforts for other subsystems too to account
for the power used by each process which can then accurately be
aggregated for an application.
>
> The patch series increases the memory size of the kernel, the memory
> consumption per thread and the thread switching time. So, please,
> introduce a configuration switch to enable/disable the function.
>

Yes, configuration can be added. Will update that in the next patch.

> > This field adds power for both the system as well as user
> > time. cpu_power contains the total amount of charge(in uAmsec units) used
> > by the process.
>
> Is there any reasonable way to assign the power consumption to a single
> task if multiple tasks are executed on the same core at the same time
> (e.g. using hyperthreading)?
>
I think the power will be accounted for both the processes on their
respective cores. With hyperthreading, as far as kernel is concerned
they are running on different cores and the time for all the tasks
will be accounted appropriately and hence power. Correct me if I am
wrong.

> > This model takes into account the frequency at which the
> > process was running(i.e higher power for processes running at higher
> > frequencies). It requires the cpufreq_stats module to be initialized with
> > the current numbers for each of the CPU core at each frequency. This will
> > be initialized during init time.
>
> This does not account for power consumption depending on anything else
> but frequency, e.g. floating point commands consuming more power than NOPs.

Currently we have been able to get power numbers for a core when they
are active and running at a particular frequency. Agreed that will be
a better and more accurate mode.But getting the power numbers for the
type of instruction and keeping track of number of such instructions
will be cumbersome.
>
>
> Best regards
>
> Heinrich Schuchardt
> >
> > Ruchi Kandoi (2):
> >   cpufreq_stats: Adds sysfs file
> >     /sys/devices/system/cpu/cpufreq/current_in_state
> >   sched: cpufreq: Adds a field cpu_power in the task_struct
> >
> >  drivers/cpufreq/cpufreq_stats.c | 191 +++++++++++++++++++++++++++++++++++++++-
> >  include/linux/cpufreq.h         |   8 ++
> >  include/linux/sched.h           |   2 +
> >  kernel/fork.c                   |   1 +
> >  kernel/sched/cputime.c          |   7 ++
> >  5 files changed, 207 insertions(+), 2 deletions(-)
> >
>

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH v2 0/2] Adds cpu power accounting per-pid basis.
  2015-05-15  0:12 [PATCH v2 0/2] Adds cpu power accounting per-pid basis Ruchi Kandoi
                   ` (2 preceding siblings ...)
  2015-05-15  6:34 ` [PATCH v2 0/2] Adds cpu power accounting per-pid basis Heinrich Schuchardt
@ 2015-05-21 14:34 ` Daniel Lezcano
  2015-05-28 19:37   ` Ruchi Kandoi
  3 siblings, 1 reply; 10+ messages in thread
From: Daniel Lezcano @ 2015-05-21 14:34 UTC (permalink / raw)
  To: Ruchi Kandoi, Rafael J. Wysocki, Viresh Kumar, Ingo Molnar,
	Peter Zijlstra, Andrew Morton, Oleg Nesterov, Kirill A. Shutemov,
	Vladimir Davydov, Heinrich Schuchardt, Thomas Gleixner,
	Kees Cook, Konstantin Khlebnikov, Davidlohr Bueso, linux-pm,
	linux-kernel

Hi Ruchi,

On 05/15/2015 02:12 AM, Ruchi Kandoi wrote:
> These patches add a mechanism which will accurately caculate the CPU power
> used by all the processes in the system. In order to account for the power
> used by all the processes a data field "cpu_power" has been added in the
> task_struct.

The term 'energy' makes more sense than 'power'.

> This field adds power for both the system as well as user
> time. cpu_power contains the total amount of charge(in uAmsec units) used

Why not use the Joules unit ?

> by the process. This model takes into account the frequency at which the
> process was running(i.e higher power for processes running at higher
> frequencies). It requires the cpufreq_stats module to be initialized with
> the current numbers for each of the CPU core at each frequency. This will
> be initialized during init time.

The energy task accounting is an interesting feature in my opinion. But 
your patchset does not deal with the power management hardware complexity.

If we reduce the scope of the task energy accounting to the cpu, we are 
facing several issues:

  * A cpu may be supposed to run at a specific OPP but it could share a 
clock line with another cpu which is in a higher frequency. So the 
frequency is actually at a higher rate than what is assumed

  * The firmware may override the cpufreq decisions

  * A process may be idle but its behavior forces the cpuidle governor 
to choose shallow states (that won't occur without the process). For 
example, the process is using very short timers, does a small processing 
and then go to sleep again waiting for the next timer expiration. The 
result will be a process having a low energy consumption but actually 
because of these timers, it will prevent the cpu to enter deep idle state

Beside that, the process may be soliciting a subsystem (another process 
or hardware) which consumes a lot of energy. That won't be accounted 
even if the process is responsible of this extra consumption.

And the last point is: how do you expect to have the energy numbers as 
nobody is willing to share them for their platform ?

> Ruchi Kandoi (2):
>    cpufreq_stats: Adds sysfs file
>      /sys/devices/system/cpu/cpufreq/current_in_state
>    sched: cpufreq: Adds a field cpu_power in the task_struct
>
>   drivers/cpufreq/cpufreq_stats.c | 191 +++++++++++++++++++++++++++++++++++++++-
>   include/linux/cpufreq.h         |   8 ++
>   include/linux/sched.h           |   2 +
>   kernel/fork.c                   |   1 +
>   kernel/sched/cputime.c          |   7 ++
>   5 files changed, 207 insertions(+), 2 deletions(-)



-- 
  <http://www.linaro.org/> Linaro.org │ Open source software for ARM SoCs

Follow Linaro:  <http://www.facebook.com/pages/Linaro> Facebook |
<http://twitter.com/#!/linaroorg> Twitter |
<http://www.linaro.org/linaro-blog/> Blog


^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH v2 0/2] Adds cpu power accounting per-pid basis.
  2015-05-21 14:34 ` Daniel Lezcano
@ 2015-05-28 19:37   ` Ruchi Kandoi
  0 siblings, 0 replies; 10+ messages in thread
From: Ruchi Kandoi @ 2015-05-28 19:37 UTC (permalink / raw)
  To: Daniel Lezcano
  Cc: Rafael J. Wysocki, Viresh Kumar, Ingo Molnar, Peter Zijlstra,
	Andrew Morton, Oleg Nesterov, Kirill A. Shutemov,
	Vladimir Davydov, Heinrich Schuchardt, Thomas Gleixner,
	Kees Cook, Konstantin Khlebnikov, Davidlohr Bueso, linux-pm,
	linux-kernel

On Thu, May 21, 2015 at 7:34 AM, Daniel Lezcano
<daniel.lezcano@linaro.org> wrote:
> Hi Ruchi,
>
> On 05/15/2015 02:12 AM, Ruchi Kandoi wrote:
>>
>> These patches add a mechanism which will accurately caculate the CPU power
>> used by all the processes in the system. In order to account for the power
>> used by all the processes a data field "cpu_power" has been added in the
>> task_struct.
>
>
> The term 'energy' makes more sense than 'power'.

>> This field adds power for both the system as well as user
>> time. cpu_power contains the total amount of charge(in uAmsec units) used
>
>
> Why not use the Joules unit ?
>
Because most of the devices working on battery has their capacity
defined in mAh(to avoid floating point and to prevent losing precision
uAmsec is used). It will be be easier to keep it in that unit so that
it can be aggregated when we are trying to find the total capacity
which was used by a process(which will be combined for a particular
application).

>> by the process. This model takes into account the frequency at which the
>> process was running(i.e higher power for processes running at higher
>> frequencies). It requires the cpufreq_stats module to be initialized with
>> the current numbers for each of the CPU core at each frequency. This will
>> be initialized during init time.
>
>
> The energy task accounting is an interesting feature in my opinion. But your
> patchset does not deal with the power management hardware complexity.
>
> If we reduce the scope of the task energy accounting to the cpu, we are
> facing several issues:
>
>  * A cpu may be supposed to run at a specific OPP but it could share a clock
> line with another cpu which is in a higher frequency. So the frequency is
> actually at a higher rate than what is assumed
>
>  * The firmware may override the cpufreq decisions
>
>  * A process may be idle but its behavior forces the cpuidle governor to
> choose shallow states (that won't occur without the process). For example,
> the process is using very short timers, does a small processing and then go
> to sleep again waiting for the next timer expiration. The result will be a
> process having a low energy consumption but actually because of these
> timers, it will prevent the cpu to enter deep idle state
>
> Beside that, the process may be soliciting a subsystem (another process or
> hardware) which consumes a lot of energy. That won't be accounted even if
> the process is responsible of this extra consumption.
>
True, there will be cases where the accounting for the energy/power
will be deceptive, because we are not taking into consideration the
idle time and time intervals between which the process is running.
This was aimed to be a simplistic model where only the active time for
the process were taken into account and the processes were blamed for
the active power that they are consuming.
 There are similar efforts for other subsystem too which will be
keeping track of the subsystem power used by a particular pid/uid.

> And the last point is: how do you expect to have the energy numbers as
> nobody is willing to share them for their platform ?
>
This is a tough question. Yes it is difficult to get these numbers,
but I don't think it is unfeasible. We get some numbers from SoC
vendors for the CPUs, trying to drive it to a point where we can get
more accurate numbers.
>> Ruchi Kandoi (2):
>>    cpufreq_stats: Adds sysfs file
>>      /sys/devices/system/cpu/cpufreq/current_in_state
>>    sched: cpufreq: Adds a field cpu_power in the task_struct
>>
>>   drivers/cpufreq/cpufreq_stats.c | 191
>> +++++++++++++++++++++++++++++++++++++++-
>>   include/linux/cpufreq.h         |   8 ++
>>   include/linux/sched.h           |   2 +
>>   kernel/fork.c                   |   1 +
>>   kernel/sched/cputime.c          |   7 ++
>>   5 files changed, 207 insertions(+), 2 deletions(-)
>
>
>
>
> --
>  <http://www.linaro.org/> Linaro.org │ Open source software for ARM SoCs
>
> Follow Linaro:  <http://www.facebook.com/pages/Linaro> Facebook |
> <http://twitter.com/#!/linaroorg> Twitter |
> <http://www.linaro.org/linaro-blog/> Blog
>

^ permalink raw reply	[flat|nested] 10+ messages in thread

end of thread, other threads:[~2015-05-28 19:37 UTC | newest]

Thread overview: 10+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2015-05-15  0:12 [PATCH v2 0/2] Adds cpu power accounting per-pid basis Ruchi Kandoi
2015-05-15  0:12 ` [PATCH v2 1/2] cpufreq_stats: Adds sysfs file /sys/devices/system/cpu/cpufreq/current_in_state Ruchi Kandoi
2015-05-15  2:48   ` Viresh Kumar
2015-05-16  0:55     ` Ruchi Kandoi
2015-05-16  2:15       ` Viresh Kumar
2015-05-15  0:12 ` [PATCH v2 2/2] sched: cpufreq: Adds a field cpu_power in the task_struct Ruchi Kandoi
2015-05-15  6:34 ` [PATCH v2 0/2] Adds cpu power accounting per-pid basis Heinrich Schuchardt
2015-05-18 21:00   ` Ruchi Kandoi
2015-05-21 14:34 ` Daniel Lezcano
2015-05-28 19:37   ` Ruchi Kandoi

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.