* [RFC PATCH 0/3] psi: support cgroup v1
@ 2019-06-04  1:57 Joseph Qi
  2019-06-04  1:57 ` [RFC PATCH 1/3] psi: make cgroup psi helpers public Joseph Qi
                   ` (2 more replies)
  0 siblings, 3 replies; 6+ messages in thread
From: Joseph Qi @ 2019-06-04  1:57 UTC (permalink / raw)
  To: linux-mm, cgroups
  Cc: Johannes Weiner, akpm, Tejun Heo, Jiufei Xue, Caspar Zhang, Joseph Qi

Currently psi supports system-wide pressure tracking as well as
cgroup2. Since most use cases are still on cgroup v1, this patchset
adds the corresponding support for cgroup v1.

Joseph Qi (3):
  psi: make cgroup psi helpers public
  psi: cgroup v1 support
  psi: add cgroup v1 interfaces

 block/blk-throttle.c   | 10 +++++++
 include/linux/cgroup.h | 21 ++++++++++++++
 kernel/cgroup/cgroup.c | 33 +++++++++++----------
 kernel/sched/cpuacct.c | 10 +++++++
 kernel/sched/psi.c     | 65 ++++++++++++++++++++++++++++++++++++------
 mm/memcontrol.c        | 10 +++++++
 6 files changed, 125 insertions(+), 24 deletions(-)

-- 
2.19.1.856.g8858448bb



* [RFC PATCH 1/3] psi: make cgroup psi helpers public
  2019-06-04  1:57 [RFC PATCH 0/3] psi: support cgroup v1 Joseph Qi
@ 2019-06-04  1:57 ` Joseph Qi
  2019-06-04  1:57 ` [RFC PATCH 2/3] psi: cgroup v1 support Joseph Qi
  2019-06-04  1:57 ` [RFC PATCH 3/3] psi: add cgroup v1 interfaces Joseph Qi
  2 siblings, 0 replies; 6+ messages in thread
From: Joseph Qi @ 2019-06-04  1:57 UTC (permalink / raw)
  To: linux-mm, cgroups
  Cc: Johannes Weiner, akpm, Tejun Heo, Jiufei Xue, Caspar Zhang, Joseph Qi

Make cgroup psi helpers public for later cgroup v1 support.

Signed-off-by: Joseph Qi <joseph.qi@linux.alibaba.com>
---
 include/linux/cgroup.h | 21 +++++++++++++++++++++
 kernel/cgroup/cgroup.c | 33 ++++++++++++++++++---------------
 2 files changed, 39 insertions(+), 15 deletions(-)

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index c0077adeea83..a5adb98490c9 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -682,6 +682,27 @@ static inline union kernfs_node_id *cgroup_get_kernfs_id(struct cgroup *cgrp)
 
 void cgroup_path_from_kernfs_id(const union kernfs_node_id *id,
 					char *buf, size_t buflen);
+
+#ifdef CONFIG_PSI
+int cgroup_io_pressure_show(struct seq_file *seq, void *v);
+int cgroup_memory_pressure_show(struct seq_file *seq, void *v);
+int cgroup_cpu_pressure_show(struct seq_file *seq, void *v);
+
+ssize_t cgroup_io_pressure_write(struct kernfs_open_file *of,
+				 char *buf, size_t nbytes,
+				 loff_t off);
+ssize_t cgroup_memory_pressure_write(struct kernfs_open_file *of,
+				     char *buf, size_t nbytes,
+				     loff_t off);
+ssize_t cgroup_cpu_pressure_write(struct kernfs_open_file *of,
+				  char *buf, size_t nbytes,
+				  loff_t off);
+
+__poll_t cgroup_pressure_poll(struct kernfs_open_file *of,
+			      struct poll_table_struct *pt);
+void cgroup_pressure_release(struct kernfs_open_file *of);
+#endif /* CONFIG_PSI */
+
 #else /* !CONFIG_CGROUPS */
 
 struct cgroup_subsys_state;
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 426a0026225c..cd3207454f8c 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -3550,21 +3550,23 @@ static int cpu_stat_show(struct seq_file *seq, void *v)
 }
 
 #ifdef CONFIG_PSI
-static int cgroup_io_pressure_show(struct seq_file *seq, void *v)
+int cgroup_io_pressure_show(struct seq_file *seq, void *v)
 {
 	struct cgroup *cgroup = seq_css(seq)->cgroup;
 	struct psi_group *psi = cgroup->id == 1 ? &psi_system : &cgroup->psi;
 
 	return psi_show(seq, psi, PSI_IO);
 }
-static int cgroup_memory_pressure_show(struct seq_file *seq, void *v)
+
+int cgroup_memory_pressure_show(struct seq_file *seq, void *v)
 {
 	struct cgroup *cgroup = seq_css(seq)->cgroup;
 	struct psi_group *psi = cgroup->id == 1 ? &psi_system : &cgroup->psi;
 
 	return psi_show(seq, psi, PSI_MEM);
 }
-static int cgroup_cpu_pressure_show(struct seq_file *seq, void *v)
+
+int cgroup_cpu_pressure_show(struct seq_file *seq, void *v)
 {
 	struct cgroup *cgroup = seq_css(seq)->cgroup;
 	struct psi_group *psi = cgroup->id == 1 ? &psi_system : &cgroup->psi;
@@ -3598,34 +3600,35 @@ static ssize_t cgroup_pressure_write(struct kernfs_open_file *of, char *buf,
 	return nbytes;
 }
 
-static ssize_t cgroup_io_pressure_write(struct kernfs_open_file *of,
-					  char *buf, size_t nbytes,
-					  loff_t off)
+ssize_t cgroup_io_pressure_write(struct kernfs_open_file *of,
+				 char *buf, size_t nbytes,
+				 loff_t off)
 {
 	return cgroup_pressure_write(of, buf, nbytes, PSI_IO);
 }
 
-static ssize_t cgroup_memory_pressure_write(struct kernfs_open_file *of,
-					  char *buf, size_t nbytes,
-					  loff_t off)
+ssize_t cgroup_memory_pressure_write(struct kernfs_open_file *of,
+				     char *buf, size_t nbytes,
+				     loff_t off)
 {
 	return cgroup_pressure_write(of, buf, nbytes, PSI_MEM);
 }
 
-static ssize_t cgroup_cpu_pressure_write(struct kernfs_open_file *of,
-					  char *buf, size_t nbytes,
-					  loff_t off)
+ssize_t cgroup_cpu_pressure_write(struct kernfs_open_file *of,
+				  char *buf, size_t nbytes,
+				  loff_t off)
 {
 	return cgroup_pressure_write(of, buf, nbytes, PSI_CPU);
 }
 
-static __poll_t cgroup_pressure_poll(struct kernfs_open_file *of,
-					  poll_table *pt)
+struct poll_table_struct;
+__poll_t cgroup_pressure_poll(struct kernfs_open_file *of,
+			      struct poll_table_struct *pt)
 {
 	return psi_trigger_poll(&of->priv, of->file, pt);
 }
 
-static void cgroup_pressure_release(struct kernfs_open_file *of)
+void cgroup_pressure_release(struct kernfs_open_file *of)
 {
 	psi_trigger_replace(&of->priv, NULL);
 }
-- 
2.19.1.856.g8858448bb



* [RFC PATCH 2/3] psi: cgroup v1 support
  2019-06-04  1:57 [RFC PATCH 0/3] psi: support cgroup v1 Joseph Qi
  2019-06-04  1:57 ` [RFC PATCH 1/3] psi: make cgroup psi helpers public Joseph Qi
@ 2019-06-04  1:57 ` Joseph Qi
  2019-06-04 11:55   ` Johannes Weiner
  2019-06-04  1:57 ` [RFC PATCH 3/3] psi: add cgroup v1 interfaces Joseph Qi
  2 siblings, 1 reply; 6+ messages in thread
From: Joseph Qi @ 2019-06-04  1:57 UTC (permalink / raw)
  To: linux-mm, cgroups
  Cc: Johannes Weiner, akpm, Tejun Heo, Jiufei Xue, Caspar Zhang, Joseph Qi

Implements pressure stall tracking for cgroup v1.

Signed-off-by: Joseph Qi <joseph.qi@linux.alibaba.com>
---
 kernel/sched/psi.c | 65 +++++++++++++++++++++++++++++++++++++++-------
 1 file changed, 56 insertions(+), 9 deletions(-)

diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
index 7acc632c3b82..909083c828d5 100644
--- a/kernel/sched/psi.c
+++ b/kernel/sched/psi.c
@@ -719,13 +719,30 @@ static u32 psi_group_change(struct psi_group *group, int cpu,
 	return state_mask;
 }
 
-static struct psi_group *iterate_groups(struct task_struct *task, void **iter)
+static struct cgroup *psi_task_cgroup(struct task_struct *task, enum psi_res res)
+{
+	switch (res) {
+	case NR_PSI_RESOURCES:
+		return task_dfl_cgroup(task);
+	case PSI_IO:
+		return task_cgroup(task, io_cgrp_subsys.id);
+	case PSI_MEM:
+		return task_cgroup(task, memory_cgrp_subsys.id);
+	case PSI_CPU:
+		return task_cgroup(task, cpu_cgrp_subsys.id);
+	default:  /* won't reach here */
+		return NULL;
+	}
+}
+
+static struct psi_group *iterate_groups(struct task_struct *task, void **iter,
+					enum psi_res res)
 {
 #ifdef CONFIG_CGROUPS
 	struct cgroup *cgroup = NULL;
 
 	if (!*iter)
-		cgroup = task->cgroups->dfl_cgrp;
+		cgroup = psi_task_cgroup(task, res);
 	else if (*iter == &psi_system)
 		return NULL;
 	else
@@ -776,15 +793,45 @@ void psi_task_change(struct task_struct *task, int clear, int set)
 		     wq_worker_last_func(task) == psi_avgs_work))
 		wake_clock = false;
 
-	while ((group = iterate_groups(task, &iter))) {
-		u32 state_mask = psi_group_change(group, cpu, clear, set);
+	if (cgroup_subsys_on_dfl(cpu_cgrp_subsys) ||
+	    cgroup_subsys_on_dfl(memory_cgrp_subsys) ||
+	    cgroup_subsys_on_dfl(io_cgrp_subsys)) {
+		while ((group = iterate_groups(task, &iter, NR_PSI_RESOURCES))) {
+			u32 state_mask = psi_group_change(group, cpu, clear, set);
 
-		if (state_mask & group->poll_states)
-			psi_schedule_poll_work(group, 1);
+			if (state_mask & group->poll_states)
+				psi_schedule_poll_work(group, 1);
 
-		if (wake_clock && !delayed_work_pending(&group->avgs_work))
-			schedule_delayed_work(&group->avgs_work, PSI_FREQ);
+			if (wake_clock && !delayed_work_pending(&group->avgs_work))
+				schedule_delayed_work(&group->avgs_work, PSI_FREQ);
+		}
+	} else {
+		enum psi_task_count i;
+		enum psi_res res;
+		int psi_flags = clear | set;
+
+		for (i = NR_IOWAIT; i < NR_PSI_TASK_COUNTS; i++) {
+			if ((i == NR_IOWAIT) && (psi_flags & TSK_IOWAIT))
+				res = PSI_IO;
+			else if ((i == NR_MEMSTALL) && (psi_flags & TSK_MEMSTALL))
+				res = PSI_MEM;
+			else if ((i == NR_RUNNING) && (psi_flags & TSK_RUNNING))
+				res = PSI_CPU;
+			else
+				continue;
+
+			while ((group = iterate_groups(task, &iter, res))) {
+				u32 state_mask = psi_group_change(group, cpu, clear, set);
+
+				if (state_mask & group->poll_states)
+					psi_schedule_poll_work(group, 1);
+
+				if (wake_clock && !delayed_work_pending(&group->avgs_work))
+					schedule_delayed_work(&group->avgs_work, PSI_FREQ);
+			}
+		}
 	}
+
 }
 
 void psi_memstall_tick(struct task_struct *task, int cpu)
@@ -792,7 +839,7 @@ void psi_memstall_tick(struct task_struct *task, int cpu)
 	struct psi_group *group;
 	void *iter = NULL;
 
-	while ((group = iterate_groups(task, &iter))) {
+	while ((group = iterate_groups(task, &iter, PSI_MEM))) {
 		struct psi_group_cpu *groupc;
 
 		groupc = per_cpu_ptr(group->pcpu, cpu);
-- 
2.19.1.856.g8858448bb



* [RFC PATCH 3/3] psi: add cgroup v1 interfaces
  2019-06-04  1:57 [RFC PATCH 0/3] psi: support cgroup v1 Joseph Qi
  2019-06-04  1:57 ` [RFC PATCH 1/3] psi: make cgroup psi helpers public Joseph Qi
  2019-06-04  1:57 ` [RFC PATCH 2/3] psi: cgroup v1 support Joseph Qi
@ 2019-06-04  1:57 ` Joseph Qi
  2 siblings, 0 replies; 6+ messages in thread
From: Joseph Qi @ 2019-06-04  1:57 UTC (permalink / raw)
  To: linux-mm, cgroups
  Cc: Johannes Weiner, akpm, Tejun Heo, Jiufei Xue, Caspar Zhang, Joseph Qi

For cgroup v1, the pressure interfaces are created under each
controller's own hierarchy:

  /sys/fs/cgroup/cpuacct/cpu.pressure
  /sys/fs/cgroup/memory/memory.pressure
  /sys/fs/cgroup/blkio/io.pressure
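
Each file reuses the existing psi output format, so e.g.
memory.pressure is expected to read back as (values illustrative):

  some avg10=0.00 avg60=0.00 avg300=0.00 total=0
  full avg10=0.00 avg60=0.00 avg300=0.00 total=0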

Signed-off-by: Joseph Qi <joseph.qi@linux.alibaba.com>
---
 block/blk-throttle.c   | 10 ++++++++++
 kernel/sched/cpuacct.c | 10 ++++++++++
 mm/memcontrol.c        | 10 ++++++++++
 3 files changed, 30 insertions(+)

diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index 9ea7c0ecad10..b802262ecf8a 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -1510,6 +1510,16 @@ static struct cftype throtl_legacy_files[] = {
 		.private = (unsigned long)&blkcg_policy_throtl,
 		.seq_show = blkg_print_stat_ios_recursive,
 	},
+#ifdef CONFIG_PSI
+	{
+		.name = "io.pressure",
+		.flags = CFTYPE_NO_PREFIX,
+		.seq_show = cgroup_io_pressure_show,
+		.write = cgroup_io_pressure_write,
+		.poll = cgroup_pressure_poll,
+		.release = cgroup_pressure_release,
+	},
+#endif /* CONFIG_PSI */
 	{ }	/* terminate */
 };
 
diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c
index 9fbb10383434..58ccfaf996aa 100644
--- a/kernel/sched/cpuacct.c
+++ b/kernel/sched/cpuacct.c
@@ -327,6 +327,16 @@ static struct cftype files[] = {
 		.name = "stat",
 		.seq_show = cpuacct_stats_show,
 	},
+#ifdef CONFIG_PSI
+	{
+		.name = "cpu.pressure",
+		.flags = CFTYPE_NO_PREFIX,
+		.seq_show = cgroup_cpu_pressure_show,
+		.write = cgroup_cpu_pressure_write,
+		.poll = cgroup_pressure_poll,
+		.release = cgroup_pressure_release,
+	},
+#endif /* CONFIG_PSI */
 	{ }	/* terminate */
 };
 
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index ca0bc6e6be13..4fc752719412 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -4391,6 +4391,16 @@ static struct cftype mem_cgroup_legacy_files[] = {
 		.write = mem_cgroup_reset,
 		.read_u64 = mem_cgroup_read_u64,
 	},
+#ifdef CONFIG_PSI
+	{
+		.name = "memory.pressure",
+		.flags = CFTYPE_NO_PREFIX,
+		.seq_show = cgroup_memory_pressure_show,
+		.write = cgroup_memory_pressure_write,
+		.poll = cgroup_pressure_poll,
+		.release = cgroup_pressure_release,
+	},
+#endif /* CONFIG_PSI */
 	{ },	/* terminate */
 };
 
-- 
2.19.1.856.g8858448bb



* Re: [RFC PATCH 2/3] psi: cgroup v1 support
  2019-06-04  1:57 ` [RFC PATCH 2/3] psi: cgroup v1 support Joseph Qi
@ 2019-06-04 11:55   ` Johannes Weiner
  2019-06-05  1:15     ` Joseph Qi
  0 siblings, 1 reply; 6+ messages in thread
From: Johannes Weiner @ 2019-06-04 11:55 UTC (permalink / raw)
  To: Joseph Qi; +Cc: linux-mm, cgroups, akpm, Tejun Heo, Jiufei Xue, Caspar Zhang

On Tue, Jun 04, 2019 at 09:57:44AM +0800, Joseph Qi wrote:
> Implements pressure stall tracking for cgroup v1.
> 
> Signed-off-by: Joseph Qi <joseph.qi@linux.alibaba.com>
> ---
>  kernel/sched/psi.c | 65 +++++++++++++++++++++++++++++++++++++++-------
>  1 file changed, 56 insertions(+), 9 deletions(-)
> 
> diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
> index 7acc632c3b82..909083c828d5 100644
> --- a/kernel/sched/psi.c
> +++ b/kernel/sched/psi.c
> @@ -719,13 +719,30 @@ static u32 psi_group_change(struct psi_group *group, int cpu,
>  	return state_mask;
>  }
>  
> -static struct psi_group *iterate_groups(struct task_struct *task, void **iter)
> +static struct cgroup *psi_task_cgroup(struct task_struct *task, enum psi_res res)
> +{
> +	switch (res) {
> +	case NR_PSI_RESOURCES:
> +		return task_dfl_cgroup(task);
> +	case PSI_IO:
> +		return task_cgroup(task, io_cgrp_subsys.id);
> +	case PSI_MEM:
> +		return task_cgroup(task, memory_cgrp_subsys.id);
> +	case PSI_CPU:
> +		return task_cgroup(task, cpu_cgrp_subsys.id);
> +	default:  /* won't reach here */
> +		return NULL;
> +	}
> +}
> +
> +static struct psi_group *iterate_groups(struct task_struct *task, void **iter,
> +					enum psi_res res)
>  {
>  #ifdef CONFIG_CGROUPS
>  	struct cgroup *cgroup = NULL;
>  
>  	if (!*iter)
> -		cgroup = task->cgroups->dfl_cgrp;
> +		cgroup = psi_task_cgroup(task, res);
>  	else if (*iter == &psi_system)
>  		return NULL;
>  	else
> @@ -776,15 +793,45 @@ void psi_task_change(struct task_struct *task, int clear, int set)
>  		     wq_worker_last_func(task) == psi_avgs_work))
>  		wake_clock = false;
>  
> -	while ((group = iterate_groups(task, &iter))) {
> -		u32 state_mask = psi_group_change(group, cpu, clear, set);
> +	if (cgroup_subsys_on_dfl(cpu_cgrp_subsys) ||
> +	    cgroup_subsys_on_dfl(memory_cgrp_subsys) ||
> +	    cgroup_subsys_on_dfl(io_cgrp_subsys)) {
> +		while ((group = iterate_groups(task, &iter, NR_PSI_RESOURCES))) {
> +			u32 state_mask = psi_group_change(group, cpu, clear, set);
>  
> -		if (state_mask & group->poll_states)
> -			psi_schedule_poll_work(group, 1);
> +			if (state_mask & group->poll_states)
> +				psi_schedule_poll_work(group, 1);
>  
> -		if (wake_clock && !delayed_work_pending(&group->avgs_work))
> -			schedule_delayed_work(&group->avgs_work, PSI_FREQ);
> +			if (wake_clock && !delayed_work_pending(&group->avgs_work))
> +				schedule_delayed_work(&group->avgs_work, PSI_FREQ);
> +		}
> +	} else {
> +		enum psi_task_count i;
> +		enum psi_res res;
> +		int psi_flags = clear | set;
> +
> +		for (i = NR_IOWAIT; i < NR_PSI_TASK_COUNTS; i++) {
> +			if ((i == NR_IOWAIT) && (psi_flags & TSK_IOWAIT))
> +				res = PSI_IO;
> +			else if ((i == NR_MEMSTALL) && (psi_flags & TSK_MEMSTALL))
> +				res = PSI_MEM;
> +			else if ((i == NR_RUNNING) && (psi_flags & TSK_RUNNING))
> +				res = PSI_CPU;
> +			else
> +				continue;
> +
> +			while ((group = iterate_groups(task, &iter, res))) {
> +				u32 state_mask = psi_group_change(group, cpu, clear, set);

This doesn't work. Each resource state is composed of all possible
task states:

static bool test_state(unsigned int *tasks, enum psi_states state)
{
	switch (state) {
	case PSI_IO_SOME:
		return tasks[NR_IOWAIT];
	case PSI_IO_FULL:
		return tasks[NR_IOWAIT] && !tasks[NR_RUNNING];
	case PSI_MEM_SOME:
		return tasks[NR_MEMSTALL];
	case PSI_MEM_FULL:
		return tasks[NR_MEMSTALL] && !tasks[NR_RUNNING];
	case PSI_CPU_SOME:
		return tasks[NR_RUNNING] > 1;
	case PSI_NONIDLE:
		return tasks[NR_IOWAIT] || tasks[NR_MEMSTALL] ||
			tasks[NR_RUNNING];
	default:
		return false;
	}
}

So the IO controller needs to know of NR_RUNNING to tell some vs full,
the memory controller needs to know of NR_IOWAIT to tell nonidle etc.
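
For example: in the version above, a task that simply wakes up and
runs (set = TSK_RUNNING, no iowait bit) never reaches the io
hierarchy's groups, so tasks[NR_RUNNING] there goes stale, and
PSI_IO_FULL (tasks[NR_IOWAIT] && !tasks[NR_RUNNING]) can report a
full IO stall even while other tasks in the same io cgroup are
running.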

You need to run the full psi task tracking and aggregation machinery
separately for each of the different cgroups a task can belong to in v1.

Needless to say, that is expensive. For cpu, memory and io, it's
triple the scheduling overhead with three ancestor walks and three
times the cache footprint; three times more aggregation workers every
two seconds... We could never turn this on by default.

Have you considered just co-mounting cgroup2, if for nothing else, to
get the pressure numbers?
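
For reference, a minimal sketch of such a co-mount (the mount point
is hypothetical; the v2 pressure files are part of the cgroup core,
so no controllers need to be moved over to v2 just to read them):

/* Co-mount the unified (v2) hierarchy next to the v1 ones. */
#include <stdio.h>
#include <sys/mount.h>

int main(void)
{
	if (mount("none", "/sys/fs/cgroup/unified", "cgroup2", 0, NULL)) {
		perror("mount");
		return 1;
	}
	return 0;
}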



* Re: [RFC PATCH 2/3] psi: cgroup v1 support
  2019-06-04 11:55   ` Johannes Weiner
@ 2019-06-05  1:15     ` Joseph Qi
  0 siblings, 0 replies; 6+ messages in thread
From: Joseph Qi @ 2019-06-05  1:15 UTC (permalink / raw)
  To: Johannes Weiner
  Cc: linux-mm, cgroups, akpm, Tejun Heo, Jiufei Xue, Caspar Zhang

Hi Johannes,

Thanks for the quick comments.

On 19/6/4 19:55, Johannes Weiner wrote:
> On Tue, Jun 04, 2019 at 09:57:44AM +0800, Joseph Qi wrote:
>> Implements pressure stall tracking for cgroup v1.
>>
>> Signed-off-by: Joseph Qi <joseph.qi@linux.alibaba.com>
>> ---
>>  kernel/sched/psi.c | 65 +++++++++++++++++++++++++++++++++++++++-------
>>  1 file changed, 56 insertions(+), 9 deletions(-)
>>
>> diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
>> index 7acc632c3b82..909083c828d5 100644
>> --- a/kernel/sched/psi.c
>> +++ b/kernel/sched/psi.c
>> @@ -719,13 +719,30 @@ static u32 psi_group_change(struct psi_group *group, int cpu,
>>  	return state_mask;
>>  }
>>  
>> -static struct psi_group *iterate_groups(struct task_struct *task, void **iter)
>> +static struct cgroup *psi_task_cgroup(struct task_struct *task, enum psi_res res)
>> +{
>> +	switch (res) {
>> +	case NR_PSI_RESOURCES:
>> +		return task_dfl_cgroup(task);
>> +	case PSI_IO:
>> +		return task_cgroup(task, io_cgrp_subsys.id);
>> +	case PSI_MEM:
>> +		return task_cgroup(task, memory_cgrp_subsys.id);
>> +	case PSI_CPU:
>> +		return task_cgroup(task, cpu_cgrp_subsys.id);
>> +	default:  /* won't reach here */
>> +		return NULL;
>> +	}
>> +}
>> +
>> +static struct psi_group *iterate_groups(struct task_struct *task, void **iter,
>> +					enum psi_res res)
>>  {
>>  #ifdef CONFIG_CGROUPS
>>  	struct cgroup *cgroup = NULL;
>>  
>>  	if (!*iter)
>> -		cgroup = task->cgroups->dfl_cgrp;
>> +		cgroup = psi_task_cgroup(task, res);
>>  	else if (*iter == &psi_system)
>>  		return NULL;
>>  	else
>> @@ -776,15 +793,45 @@ void psi_task_change(struct task_struct *task, int clear, int set)
>>  		     wq_worker_last_func(task) == psi_avgs_work))
>>  		wake_clock = false;
>>  
>> -	while ((group = iterate_groups(task, &iter))) {
>> -		u32 state_mask = psi_group_change(group, cpu, clear, set);
>> +	if (cgroup_subsys_on_dfl(cpu_cgrp_subsys) ||
>> +	    cgroup_subsys_on_dfl(memory_cgrp_subsys) ||
>> +	    cgroup_subsys_on_dfl(io_cgrp_subsys)) {
>> +		while ((group = iterate_groups(task, &iter, NR_PSI_RESOURCES))) {
>> +			u32 state_mask = psi_group_change(group, cpu, clear, set);
>>  
>> -		if (state_mask & group->poll_states)
>> -			psi_schedule_poll_work(group, 1);
>> +			if (state_mask & group->poll_states)
>> +				psi_schedule_poll_work(group, 1);
>>  
>> -		if (wake_clock && !delayed_work_pending(&group->avgs_work))
>> -			schedule_delayed_work(&group->avgs_work, PSI_FREQ);
>> +			if (wake_clock && !delayed_work_pending(&group->avgs_work))
>> +				schedule_delayed_work(&group->avgs_work, PSI_FREQ);
>> +		}
>> +	} else {
>> +		enum psi_task_count i;
>> +		enum psi_res res;
>> +		int psi_flags = clear | set;
>> +
>> +		for (i = NR_IOWAIT; i < NR_PSI_TASK_COUNTS; i++) {
>> +			if ((i == NR_IOWAIT) && (psi_flags & TSK_IOWAIT))
>> +				res = PSI_IO;
>> +			else if ((i == NR_MEMSTALL) && (psi_flags & TSK_MEMSTALL))
>> +				res = PSI_MEM;
>> +			else if ((i == NR_RUNNING) && (psi_flags & TSK_RUNNING))
>> +				res = PSI_CPU;
>> +			else
>> +				continue;
>> +
>> +			while ((group = iterate_groups(task, &iter, res))) {
>> +				u32 state_mask = psi_group_change(group, cpu, clear, set);
> 
> This doesn't work. Each resource state is composed of all possible
> task states:
> 
> static bool test_state(unsigned int *tasks, enum psi_states state)
> {
> 	switch (state) {
> 	case PSI_IO_SOME:
> 		return tasks[NR_IOWAIT];
> 	case PSI_IO_FULL:
> 		return tasks[NR_IOWAIT] && !tasks[NR_RUNNING];
> 	case PSI_MEM_SOME:
> 		return tasks[NR_MEMSTALL];
> 	case PSI_MEM_FULL:
> 		return tasks[NR_MEMSTALL] && !tasks[NR_RUNNING];
> 	case PSI_CPU_SOME:
> 		return tasks[NR_RUNNING] > 1;
> 	case PSI_NONIDLE:
> 		return tasks[NR_IOWAIT] || tasks[NR_MEMSTALL] ||
> 			tasks[NR_RUNNING];
> 	default:
> 		return false;
> 	}
> }
> 
> So the IO controller needs to know of NR_RUNNING to tell some vs full,
> the memory controller needs to know of NR_IOWAIT to tell nonidle etc.
> 
> You need to run the full psi task tracking and aggregation machinery
> separately for each of the different cgroups a task can belong to in v1.
> 
Yes, since different controllers have their own hierarchies.

> Needless to say, that is expensive. For cpu, memory and io, it's
> triple the scheduling overhead with three ancestor walks and three
> times the cache footprint; three times more aggregation workers every
> two seconds... We could never turn this on by default.
> 
I see. But even on cgroup v2, would it still be expensive if we have
many cgroups?

> Have you considered just co-mounting cgroup2, if for nothing else, to
> get the pressure numbers?
> 
Do you mean mounting cgroup v1 and cgroup v2 at the same time?
IIUC, this may not work, since much of the cgroup code has
xxx_on_dfl checks.

Thanks,
Joseph



