* [PATCH] psi: reduce calls to sched_clock() in psi
@ 2021-03-21 20:51 Shakeel Butt
  2021-03-22  7:45 ` Peter Zijlstra
                   ` (2 more replies)
  0 siblings, 3 replies; 4+ messages in thread
From: Shakeel Butt @ 2021-03-21 20:51 UTC
  To: Johannes Weiner, Peter Zijlstra, Ingo Molnar; +Cc: linux-kernel, Shakeel Butt

We noticed that the cost of psi increases with the number of levels in
the cgroup hierarchy. In particular, the cost of cpu_clock() sticks out,
since the kernel calls it once per level as it traverses up the cgroup
tree. (cpu_clock() is the per-CPU interface to the scheduler clock,
hence sched_clock() in the subject.) This patch reduces the calls to
cpu_clock() by reading the clock once per event and passing the
timestamp down the hierarchy; all per-group updates for a given event
happen back to back on the same CPU, so a single timestamp can be
safely reused.
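
The change amounts to hoisting the clock read out of the per-group
loop. A simplified sketch of the pattern, as it appears in
psi_task_change() (the comments are annotations, not kernel code):

	/* Before: record_times() reads cpu_clock(cpu) once per level */
	while ((group = iterate_groups(task, &iter)))
		psi_group_change(group, cpu, clear, set, wake_clock);

	/* After: read the clock once, pass the timestamp down */
	u64 now = cpu_clock(cpu);

	while ((group = iterate_groups(task, &iter)))
		psi_group_change(group, cpu, clear, set, now, wake_clock);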

Ran perf bench on an Intel Broadwell machine with a 3-level cgroup
hierarchy.

Before the patch:

$ perf bench sched all
 # Running sched/messaging benchmark...
 # 20 sender and receiver processes per group
 # 10 groups == 400 processes run

     Total time: 0.747 [sec]

 # Running sched/pipe benchmark...
 # Executed 1000000 pipe operations between two processes

     Total time: 3.516 [sec]

       3.516689 usecs/op
         284358 ops/sec

After the patch:

$ perf bench sched all
 # Running sched/messaging benchmark...
 # 20 sender and receiver processes per group
 # 10 groups == 400 processes run

     Total time: 0.640 [sec]

 # Running sched/pipe benchmark...
 # Executed 1000000 pipe operations between two processes

     Total time: 3.329 [sec]

       3.329820 usecs/op
         300316 ops/sec
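
That works out to a ~14% improvement on sched/messaging
(0.747 -> 0.640 sec) and ~5% on sched/pipe
(3.516 -> 3.330 usecs/op).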

Signed-off-by: Shakeel Butt <shakeelb@google.com>
---
 kernel/sched/psi.c | 19 ++++++++++---------
 1 file changed, 10 insertions(+), 9 deletions(-)

diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
index ee3c5b48622f..16348b269713 100644
--- a/kernel/sched/psi.c
+++ b/kernel/sched/psi.c
@@ -644,12 +644,10 @@ static void poll_timer_fn(struct timer_list *t)
 	wake_up_interruptible(&group->poll_wait);
 }
 
-static void record_times(struct psi_group_cpu *groupc, int cpu)
+static void record_times(struct psi_group_cpu *groupc, u64 now)
 {
 	u32 delta;
-	u64 now;
 
-	now = cpu_clock(cpu);
 	delta = now - groupc->state_start;
 	groupc->state_start = now;
 
@@ -676,7 +674,7 @@ static void record_times(struct psi_group_cpu *groupc, int cpu)
 }
 
 static void psi_group_change(struct psi_group *group, int cpu,
-			     unsigned int clear, unsigned int set,
+			     unsigned int clear, unsigned int set, u64 now,
 			     bool wake_clock)
 {
 	struct psi_group_cpu *groupc;
@@ -696,7 +694,7 @@ static void psi_group_change(struct psi_group *group, int cpu,
 	 */
 	write_seqcount_begin(&groupc->seq);
 
-	record_times(groupc, cpu);
+	record_times(groupc, now);
 
 	for (t = 0, m = clear; m; m &= ~(1 << t), t++) {
 		if (!(m & (1 << t)))
@@ -788,12 +786,14 @@ void psi_task_change(struct task_struct *task, int clear, int set)
 	struct psi_group *group;
 	bool wake_clock = true;
 	void *iter = NULL;
+	u64 now;
 
 	if (!task->pid)
 		return;
 
 	psi_flags_change(task, clear, set);
 
+	now = cpu_clock(cpu);
 	/*
 	 * Periodic aggregation shuts off if there is a period of no
 	 * task changes, so we wake it back up if necessary. However,
@@ -806,7 +806,7 @@ void psi_task_change(struct task_struct *task, int clear, int set)
 		wake_clock = false;
 
 	while ((group = iterate_groups(task, &iter)))
-		psi_group_change(group, cpu, clear, set, wake_clock);
+		psi_group_change(group, cpu, clear, set, now, wake_clock);
 }
 
 void psi_task_switch(struct task_struct *prev, struct task_struct *next,
@@ -815,6 +815,7 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next,
 	struct psi_group *group, *common = NULL;
 	int cpu = task_cpu(prev);
 	void *iter;
+	u64 now = cpu_clock(cpu);
 
 	if (next->pid) {
 		bool identical_state;
@@ -836,7 +837,7 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next,
 				break;
 			}
 
-			psi_group_change(group, cpu, 0, TSK_ONCPU, true);
+			psi_group_change(group, cpu, 0, TSK_ONCPU, now, true);
 		}
 	}
 
@@ -858,7 +859,7 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next,
 
 		iter = NULL;
 		while ((group = iterate_groups(prev, &iter)) && group != common)
-			psi_group_change(group, cpu, clear, set, true);
+			psi_group_change(group, cpu, clear, set, now, true);
 
 		/*
 		 * TSK_ONCPU is handled up to the common ancestor. If we're tasked
@@ -867,7 +868,7 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next,
 		if (sleep) {
 			clear &= ~TSK_ONCPU;
 			for (; group; group = iterate_groups(prev, &iter))
-				psi_group_change(group, cpu, clear, set, true);
+				psi_group_change(group, cpu, clear, set, now, true);
 		}
 	}
 }
-- 
2.31.0.291.g576ba9dcdaf-goog


* Re: [PATCH] psi: reduce calls to sched_clock() in psi
  2021-03-21 20:51 [PATCH] psi: reduce calls to sched_clock() in psi Shakeel Butt
@ 2021-03-22  7:45 ` Peter Zijlstra
  2021-03-22 14:19 ` Johannes Weiner
  2021-03-23 15:08 ` [tip: sched/core] psi: Reduce " tip-bot2 for Shakeel Butt
  2 siblings, 0 replies; 4+ messages in thread
From: Peter Zijlstra @ 2021-03-22  7:45 UTC
  To: Shakeel Butt; +Cc: Johannes Weiner, Ingo Molnar, linux-kernel

On Sun, Mar 21, 2021 at 01:51:56PM -0700, Shakeel Butt wrote:
> We noticed that the cost of psi increases with the number of levels in
> the cgroup hierarchy. In particular, the cost of cpu_clock() sticks out,
> since the kernel calls it once per level as it traverses up the cgroup
> tree. (cpu_clock() is the per-CPU interface to the scheduler clock,
> hence sched_clock() in the subject.) This patch reduces the calls to
> cpu_clock() by reading the clock once per event and passing the
> timestamp down the hierarchy; all per-group updates for a given event
> happen back to back on the same CPU, so a single timestamp can be
> safely reused.
> 
> Ran perf bench on an Intel Broadwell machine with a 3-level cgroup
> hierarchy.
> 
> Before the patch:
> 
> $ perf bench sched all
>  # Running sched/messaging benchmark...
>  # 20 sender and receiver processes per group
>  # 10 groups == 400 processes run
> 
>      Total time: 0.747 [sec]
> 
>  # Running sched/pipe benchmark...
>  # Executed 1000000 pipe operations between two processes
> 
>      Total time: 3.516 [sec]
> 
>        3.516689 usecs/op
>          284358 ops/sec
> 
> After the patch:
> 
> $ perf bench sched all
>  # Running sched/messaging benchmark...
>  # 20 sender and receiver processes per group
>  # 10 groups == 400 processes run
> 
>      Total time: 0.640 [sec]
> 
>  # Running sched/pipe benchmark...
>  # Executed 1000000 pipe operations between two processes
> 
>      Total time: 3.329 [sec]
> 
>        3.329820 usecs/op
>          300316 ops/sec
> 
> Signed-off-by: Shakeel Butt <shakeelb@google.com>

Thanks!

* Re: [PATCH] psi: reduce calls to sched_clock() in psi
  2021-03-21 20:51 [PATCH] psi: reduce calls to sched_clock() in psi Shakeel Butt
  2021-03-22  7:45 ` Peter Zijlstra
@ 2021-03-22 14:19 ` Johannes Weiner
  2021-03-23 15:08 ` [tip: sched/core] psi: Reduce " tip-bot2 for Shakeel Butt
  2 siblings, 0 replies; 4+ messages in thread
From: Johannes Weiner @ 2021-03-22 14:19 UTC
  To: Shakeel Butt; +Cc: Peter Zijlstra, Ingo Molnar, linux-kernel

On Sun, Mar 21, 2021 at 01:51:56PM -0700, Shakeel Butt wrote:
> We noticed that the cost of psi increases with the number of levels in
> the cgroup hierarchy. In particular, the cost of cpu_clock() sticks out,
> since the kernel calls it once per level as it traverses up the cgroup
> tree. (cpu_clock() is the per-CPU interface to the scheduler clock,
> hence sched_clock() in the subject.) This patch reduces the calls to
> cpu_clock() by reading the clock once per event and passing the
> timestamp down the hierarchy; all per-group updates for a given event
> happen back to back on the same CPU, so a single timestamp can be
> safely reused.
> 
> Ran perf bench on an Intel Broadwell machine with a 3-level cgroup
> hierarchy.
> 
> Before the patch:
> 
> $ perf bench sched all
>  # Running sched/messaging benchmark...
>  # 20 sender and receiver processes per group
>  # 10 groups == 400 processes run
> 
>      Total time: 0.747 [sec]
> 
>  # Running sched/pipe benchmark...
>  # Executed 1000000 pipe operations between two processes
> 
>      Total time: 3.516 [sec]
> 
>        3.516689 usecs/op
>          284358 ops/sec
> 
> After the patch:
> 
> $ perf bench sched all
>  # Running sched/messaging benchmark...
>  # 20 sender and receiver processes per group
>  # 10 groups == 400 processes run
> 
>      Total time: 0.640 [sec]
> 
>  # Running sched/pipe benchmark...
>  # Executed 1000000 pipe operations between two processes
> 
>      Total time: 3.329 [sec]
> 
>        3.329820 usecs/op
>          300316 ops/sec
> 
> Signed-off-by: Shakeel Butt <shakeelb@google.com>

Acked-by: Johannes Weiner <hannes@cmpxchg.org>

* [tip: sched/core] psi: Reduce calls to sched_clock() in psi
  2021-03-21 20:51 [PATCH] psi: reduce calls to sched_clock() in psi Shakeel Butt
  2021-03-22  7:45 ` Peter Zijlstra
  2021-03-22 14:19 ` Johannes Weiner
@ 2021-03-23 15:08 ` tip-bot2 for Shakeel Butt
  2 siblings, 0 replies; 4+ messages in thread
From: tip-bot2 for Shakeel Butt @ 2021-03-23 15:08 UTC
  To: linux-tip-commits
  Cc: Shakeel Butt, Peter Zijlstra (Intel), Johannes Weiner, x86, linux-kernel

The following commit has been merged into the sched/core branch of tip:

Commit-ID:     df77430639c9cf73559bac0f25084518bf9a812d
Gitweb:        https://git.kernel.org/tip/df77430639c9cf73559bac0f25084518bf9a812d
Author:        Shakeel Butt <shakeelb@google.com>
AuthorDate:    Sun, 21 Mar 2021 13:51:56 -07:00
Committer:     Peter Zijlstra <peterz@infradead.org>
CommitterDate: Tue, 23 Mar 2021 16:01:58 +01:00

psi: Reduce calls to sched_clock() in psi

We noticed that the cost of psi increases with the number of levels in
the cgroup hierarchy. In particular, the cost of cpu_clock() sticks out,
since the kernel calls it once per level as it traverses up the cgroup
tree. (cpu_clock() is the per-CPU interface to the scheduler clock,
hence sched_clock() in the subject.) This patch reduces the calls to
cpu_clock() by reading the clock once per event and passing the
timestamp down the hierarchy; all per-group updates for a given event
happen back to back on the same CPU, so a single timestamp can be
safely reused.

Ran perf bench on an Intel Broadwell machine with a 3-level cgroup
hierarchy.

Before the patch:

$ perf bench sched all
 # Running sched/messaging benchmark...
 # 20 sender and receiver processes per group
 # 10 groups == 400 processes run

     Total time: 0.747 [sec]

 # Running sched/pipe benchmark...
 # Executed 1000000 pipe operations between two processes

     Total time: 3.516 [sec]

       3.516689 usecs/op
         284358 ops/sec

After the patch:

$ perf bench sched all
 # Running sched/messaging benchmark...
 # 20 sender and receiver processes per group
 # 10 groups == 400 processes run

     Total time: 0.640 [sec]

 # Running sched/pipe benchmark...
 # Executed 1000000 pipe operations between two processes

     Total time: 3.329 [sec]

       3.329820 usecs/op
         300316 ops/sec

Signed-off-by: Shakeel Butt <shakeelb@google.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Link: https://lkml.kernel.org/r/20210321205156.4186483-1-shakeelb@google.com
---
 kernel/sched/psi.c | 19 ++++++++++---------
 1 file changed, 10 insertions(+), 9 deletions(-)

diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
index c8480d7..b1b00e9 100644
--- a/kernel/sched/psi.c
+++ b/kernel/sched/psi.c
@@ -644,12 +644,10 @@ static void poll_timer_fn(struct timer_list *t)
 	wake_up_interruptible(&group->poll_wait);
 }
 
-static void record_times(struct psi_group_cpu *groupc, int cpu)
+static void record_times(struct psi_group_cpu *groupc, u64 now)
 {
 	u32 delta;
-	u64 now;
 
-	now = cpu_clock(cpu);
 	delta = now - groupc->state_start;
 	groupc->state_start = now;
 
@@ -676,7 +674,7 @@ static void record_times(struct psi_group_cpu *groupc, int cpu)
 }
 
 static void psi_group_change(struct psi_group *group, int cpu,
-			     unsigned int clear, unsigned int set,
+			     unsigned int clear, unsigned int set, u64 now,
 			     bool wake_clock)
 {
 	struct psi_group_cpu *groupc;
@@ -696,7 +694,7 @@ static void psi_group_change(struct psi_group *group, int cpu,
 	 */
 	write_seqcount_begin(&groupc->seq);
 
-	record_times(groupc, cpu);
+	record_times(groupc, now);
 
 	for (t = 0, m = clear; m; m &= ~(1 << t), t++) {
 		if (!(m & (1 << t)))
@@ -788,12 +786,14 @@ void psi_task_change(struct task_struct *task, int clear, int set)
 	struct psi_group *group;
 	bool wake_clock = true;
 	void *iter = NULL;
+	u64 now;
 
 	if (!task->pid)
 		return;
 
 	psi_flags_change(task, clear, set);
 
+	now = cpu_clock(cpu);
 	/*
 	 * Periodic aggregation shuts off if there is a period of no
 	 * task changes, so we wake it back up if necessary. However,
@@ -806,7 +806,7 @@ void psi_task_change(struct task_struct *task, int clear, int set)
 		wake_clock = false;
 
 	while ((group = iterate_groups(task, &iter)))
-		psi_group_change(group, cpu, clear, set, wake_clock);
+		psi_group_change(group, cpu, clear, set, now, wake_clock);
 }
 
 void psi_task_switch(struct task_struct *prev, struct task_struct *next,
@@ -815,6 +815,7 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next,
 	struct psi_group *group, *common = NULL;
 	int cpu = task_cpu(prev);
 	void *iter;
+	u64 now = cpu_clock(cpu);
 
 	if (next->pid) {
 		bool identical_state;
@@ -836,7 +837,7 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next,
 				break;
 			}
 
-			psi_group_change(group, cpu, 0, TSK_ONCPU, true);
+			psi_group_change(group, cpu, 0, TSK_ONCPU, now, true);
 		}
 	}
 
@@ -858,7 +859,7 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next,
 
 		iter = NULL;
 		while ((group = iterate_groups(prev, &iter)) && group != common)
-			psi_group_change(group, cpu, clear, set, true);
+			psi_group_change(group, cpu, clear, set, now, true);
 
 		/*
 		 * TSK_ONCPU is handled up to the common ancestor. If we're tasked
@@ -867,7 +868,7 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next,
 		if (sleep) {
 			clear &= ~TSK_ONCPU;
 			for (; group; group = iterate_groups(prev, &iter))
-				psi_group_change(group, cpu, clear, set, true);
+				psi_group_change(group, cpu, clear, set, now, true);
 		}
 	}
 }
