linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* Subject: [PATCH] sched/fair: prioritize normal task over sched_idle task with vruntime offset
@ 2022-03-11  7:58 chenying
  2022-03-12 12:03 ` Peter Zijlstra
  0 siblings, 1 reply; 8+ messages in thread
From: chenying @ 2022-03-11  7:58 UTC (permalink / raw)
  To: mingo, peterz, juri.lelli, vincent.guittot, dietmar.eggemann,
	rostedt, mgorman, bristot, bsegall
  Cc: linux-kernel, duanxiongchun, zhouchengming, songmuchun,
	zhengqi.arch, zhoufeng.zf, ligang.bdlg

We add a time offset to the se->vruntime when the idle sched_entity
is enqueued, so that the idle entity will always be on the right of
the non-idle in the runqueue. This can allow non-idle tasks to be
selected and run before the idle.

A use-case is that sched_idle for background tasks and non-idle
for foreground. The foreground tasks are latency sensitive and do
not want to be disturbed by the background. It is well known that
the idle tasks can be preempted by the non-idle tasks when waking up,
but the scheduler will not distinguish between idle and non-idle when picking the next
entity. This may cause background tasks to disturb the foreground.

Test results as below:

~$ ./loop.sh &
[1] 764
~$ chrt -i 0 ./loop.sh &
[2] 765
~$ taskset -p 04 764
~$ taskset -p 04 765

~$ top -p 764 -p 765
top - 13:10:01 up 1 min,  2 users,  load average: 1.30, 0.38, 0.13
Tasks:   2 total,   2 running,   0 sleeping,   0 stopped,   0 zombie
%Cpu(s): 12.5 us,  0.0 sy,  0.0 ni, 87.4 id,  0.0 wa,  0.0 hi, 0.0 si,  
0.0 st
KiB Mem : 16393492 total, 16142256 free,   111028 used,   140208 buff/cache
KiB Swap:   385836 total,   385836 free,        0 used. 16037992 avail Mem

   PID USER      PR  NI    VIRT    RES    SHR S  %CPU %MEM TIME+ COMMAND
   764 chenyin+  20   0   12888   1144   1004 R 100.0  0.0 1:05.12 loop.sh
   765 chenyin+  20   0   12888   1224   1080 R   0.0  0.0 0:16.21 loop.sh

The non-idle process (764) can run at 100% and without being disturbed by
the idle process (765).

~$ cat /sys/fs/cgroup/cpu/background/cgroup.procs
765
~$ cat /sys/fs/cgroup/cpu/foreground/cgroup.procs
764
~$ top -p 764 -p 765
top - 13:17:19 up 9 min,  2 users,  load average: 2.00, 1.64, 0.86
Tasks:   2 total,   2 running,   0 sleeping,   0 stopped,   0 zombie
%Cpu(s): 12.5 us,  0.0 sy,  0.0 ni, 87.5 id,  0.0 wa,  0.0 hi, 0.0 si,  
0.0 st
KiB Mem : 16393492 total, 16139576 free,   112732 used,   141184 buff/cache
KiB Swap:   385836 total,   385836 free,        0 used. 16036236 avail Mem

   PID USER      PR  NI    VIRT    RES    SHR S  %CPU %MEM TIME+ COMMAND
   764 chenyin+  20   0   12888   1144   1004 R 100.0  0.0 8:23.51 loop.sh
   765 chenyin+  20   0   12888   1224   1080 R   0.0  0.0 0:16.21 loop.sh

The non-idle group can run at 100% and without being disturbed by the
idle group.

Co-developed-by: chengming zhou <zhouchengming@bytedance.com>
Signed-off-by: chenying <chenying.kernel@bytedance.com>
---
  include/linux/sched.h   |  1 +
  kernel/sched/core.c     |  6 +++++-
  kernel/sched/debug.c    |  2 ++
  kernel/sched/fair.c     | 26 ++++++++++++++++++++++----
  kernel/sched/features.h |  2 ++
  kernel/sched/sched.h    |  1 +
  6 files changed, 33 insertions(+), 5 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 75ba8aa60248..20412f353cad 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -545,6 +545,7 @@ struct sched_entity {
      u64                exec_start;
      u64                sum_exec_runtime;
      u64                vruntime;
+    u64                vruntime_offset;
      u64                prev_sum_exec_runtime;

      u64                nr_migrations;
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 9745613d531c..beb9d6f54c52 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -4239,6 +4239,7 @@ static void __sched_fork(unsigned long 
clone_flags, struct task_struct *p)
      p->se.prev_sum_exec_runtime    = 0;
      p->se.nr_migrations        = 0;
      p->se.vruntime            = 0;
+    p->se.vruntime_offset        = 0;
      INIT_LIST_HEAD(&p->se.group_node);

  #ifdef CONFIG_FAIR_GROUP_SCHED
@@ -7211,8 +7212,11 @@ static void __setscheduler_params(struct 
task_struct *p,

      if (dl_policy(policy))
          __setparam_dl(p, attr);
-    else if (fair_policy(policy))
+    else if (fair_policy(policy)) {
          p->static_prio = NICE_TO_PRIO(attr->sched_nice);
+        p->se.vruntime_offset = 0;
+    } else if (idle_policy(policy))
+        p->se.vruntime_offset = sched_idle_vruntime_offset;

      /*
       * __sched_setscheduler() ensures attr->sched_priority == 0 when
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index aa29211de1bf..701496626830 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -460,6 +460,7 @@ static void print_cfs_group_stats(struct seq_file 
*m, int cpu, struct task_group

      PN(se->exec_start);
      PN(se->vruntime);
+    PN(se->vruntime_offset);
      PN(se->sum_exec_runtime);

      if (schedstat_enabled()) {
@@ -969,6 +970,7 @@ void proc_sched_show_task(struct task_struct *p, 
struct pid_namespace *ns,

      PN(se.exec_start);
      PN(se.vruntime);
+    PN(se.vruntime_offset);
      PN(se.sum_exec_runtime);

      nr_switches = p->nvcsw + p->nivcsw;
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 5146163bfabb..6a2cba63b4a9 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -92,6 +92,8 @@ static unsigned int 
normalized_sysctl_sched_wakeup_granularity    = 1000000UL;

  const_debug unsigned int sysctl_sched_migration_cost    = 500000UL;

+unsigned long long sched_idle_vruntime_offset    = 2592000000000000; /* 
30 days */
+
  int sched_thermal_decay_shift;
  static int __init setup_sched_thermal_decay_shift(char *str)
  {
@@ -535,10 +537,19 @@ static inline u64 min_vruntime(u64 min_vruntime, 
u64 vruntime)
      return min_vruntime;
  }

+static inline s64  vtime_diff(struct sched_entity *a,
+                struct sched_entity *b)
+{
+    if (sched_feat(VRUNTIME_OFFSET))
+        return (s64)(a->vruntime_offset - b->vruntime_offset);
+    else
+        return 0;
+}
+
  static inline bool entity_before(struct sched_entity *a,
                  struct sched_entity *b)
  {
-    return (s64)(a->vruntime - b->vruntime) < 0;
+    return (s64)(a->vruntime - b->vruntime + vtime_diff(a, b)) < 0;
  }

  #define __node_2_se(node) \
@@ -4445,7 +4456,7 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct 
sched_entity *curr)
          return;

      se = __pick_first_entity(cfs_rq);
-    delta = curr->vruntime - se->vruntime;
+    delta = curr->vruntime - se->vruntime + vtime_diff(curr, se);

      if (delta < 0)
          return;
@@ -7036,7 +7047,7 @@ static unsigned long wakeup_gran(struct 
sched_entity *se)
  static int
  wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se)
  {
-    s64 gran, vdiff = curr->vruntime - se->vruntime;
+    s64 gran, vdiff = curr->vruntime - se->vruntime + vtime_diff(curr, se);

      if (vdiff <= 0)
          return -1;
@@ -11131,7 +11142,7 @@ bool cfs_prio_less(struct task_struct *a, struct 
task_struct *b, bool in_fi)
       * min_vruntime_fi, which would have been updated in prior calls
       * to se_fi_update().
       */
-    delta = (s64)(sea->vruntime - seb->vruntime) +
+    delta = (s64)(sea->vruntime - seb->vruntime + vtime_diff(sea, seb)) +
          (s64)(cfs_rqb->min_vruntime_fi - cfs_rqa->min_vruntime_fi);

      return delta > 0;
@@ -11190,6 +11201,9 @@ static void task_fork_fair(struct task_struct *p)
      }
      place_entity(cfs_rq, se, 1);

+    if (task_has_idle_policy(p))
+        se->vruntime_offset = sched_idle_vruntime_offset;
+
      if (sysctl_sched_child_runs_first && curr && entity_before(curr, 
se)) {
          /*
           * Upon rescheduling, sched_class::put_prev_task() will place
@@ -11655,6 +11669,10 @@ int sched_group_set_idle(struct task_group *tg, 
long idle)
          rq_lock_irqsave(rq, &rf);

          grp_cfs_rq->idle = idle;
+        if (idle)
+            se->vruntime_offset = sched_idle_vruntime_offset;
+        else
+            se->vruntime_offset = 0;
          if (WARN_ON_ONCE(was_idle == cfs_rq_is_idle(grp_cfs_rq)))
              goto next_cpu;

diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index 1cf435bbcd9c..f59f507e6dba 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -100,3 +100,5 @@ SCHED_FEAT(LATENCY_WARN, false)

  SCHED_FEAT(ALT_PERIOD, true)
  SCHED_FEAT(BASE_SLICE, true)
+
+SCHED_FEAT(VRUNTIME_OFFSET, true)
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index de53be905739..1bc0c0756fd4 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -95,6 +95,7 @@ extern __read_mostly int scheduler_running;

  extern unsigned long calc_load_update;
  extern atomic_long_t calc_load_tasks;
+extern unsigned long long sched_idle_vruntime_offset;

  extern void calc_global_load_tick(struct rq *this_rq);
  extern long calc_load_fold_active(struct rq *this_rq, long adjust);
-- 
2.11.0


^ permalink raw reply related	[flat|nested] 8+ messages in thread

* Re: Subject: [PATCH] sched/fair: prioritize normal task over sched_idle task with vruntime offset
  2022-03-11  7:58 Subject: [PATCH] sched/fair: prioritize normal task over sched_idle task with vruntime offset chenying
@ 2022-03-12 12:03 ` Peter Zijlstra
  2022-03-13  5:37   ` [External] " chenying
  0 siblings, 1 reply; 8+ messages in thread
From: Peter Zijlstra @ 2022-03-12 12:03 UTC (permalink / raw)
  To: chenying
  Cc: mingo, juri.lelli, vincent.guittot, dietmar.eggemann, rostedt,
	mgorman, bristot, bsegall, linux-kernel, duanxiongchun,
	zhouchengming, songmuchun, zhengqi.arch, zhoufeng.zf,
	ligang.bdlg

On Fri, Mar 11, 2022 at 03:58:47PM +0800, chenying wrote:
> We add a time offset to the se->vruntime when the idle sched_entity
> is enqueued, so that the idle entity will always be on the right of
> the non-idle in the runqueue. This can allow non-idle tasks to be
> selected and run before the idle.
> 
> A use-case is that sched_idle for background tasks and non-idle
> for foreground. The foreground tasks are latency sensitive and do
> not want to be disturbed by the background. It is well known that
> the idle tasks can be preempted by the non-idle tasks when waking up,
> but the scheduler will not distinguish between idle and non-idle when picking the next
> entity. This may cause background tasks to disturb the foreground.
> 
> Test results as below:
> 
> ~$ ./loop.sh &
> [1] 764
> ~$ chrt -i 0 ./loop.sh &
> [2] 765
> ~$ taskset -p 04 764
> ~$ taskset -p 04 765
> 
> ~$ top -p 764 -p 765
> top - 13:10:01 up 1 min,  2 users,  load average: 1.30, 0.38, 0.13
> Tasks:   2 total,   2 running,   0 sleeping,   0 stopped,   0 zombie
> %Cpu(s): 12.5 us,  0.0 sy,  0.0 ni, 87.4 id,  0.0 wa,  0.0 hi, 0.0 si,  0.0
> st
> KiB Mem : 16393492 total, 16142256 free,   111028 used,   140208 buff/cache
> KiB Swap:   385836 total,   385836 free,        0 used. 16037992 avail Mem
> 
>   PID USER      PR  NI    VIRT    RES    SHR S  %CPU %MEM TIME+ COMMAND
>   764 chenyin+  20   0   12888   1144   1004 R 100.0  0.0 1:05.12 loop.sh
>   765 chenyin+  20   0   12888   1224   1080 R   0.0  0.0 0:16.21 loop.sh
> 
> The non-idle process (764) can run at 100% and without being disturbed by
> the idle process (765).

Did you just do a very complicated true idle time scheduler, with all
the problems that brings?

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [External] Re: Subject: [PATCH] sched/fair: prioritize normal task over sched_idle task with vruntime offset
  2022-03-12 12:03 ` Peter Zijlstra
@ 2022-03-13  5:37   ` chenying
  2022-03-13  9:02     ` Peter Zijlstra
  0 siblings, 1 reply; 8+ messages in thread
From: chenying @ 2022-03-13  5:37 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: mingo, juri.lelli, vincent.guittot, dietmar.eggemann, rostedt,
	mgorman, bristot, bsegall, linux-kernel, duanxiongchun,
	zhouchengming, songmuchun, zhengqi.arch, zhoufeng.zf,
	ligang.bdlg

在 2022/3/12 20:03, Peter Zijlstra 写道:
> On Fri, Mar 11, 2022 at 03:58:47PM +0800, chenying wrote:
>> We add a time offset to the se->vruntime when the idle sched_entity
>> is enqueued, so that the idle entity will always be on the right of
>> the non-idle in the runqueue. This can allow non-idle tasks to be
>> selected and run before the idle.
>>
>> A use-case is that sched_idle for background tasks and non-idle
>> for foreground. The foreground tasks are latency sensitive and do
>> not want to be disturbed by the background. It is well known that
>> the idle tasks can be preempted by the non-idle tasks when waking up,
>> but the scheduler will not distinguish between idle and non-idle when picking the next
>> entity. This may cause background tasks to disturb the foreground.
>>
>> Test results as below:
>>
>> ~$ ./loop.sh &
>> [1] 764
>> ~$ chrt -i 0 ./loop.sh &
>> [2] 765
>> ~$ taskset -p 04 764
>> ~$ taskset -p 04 765
>>
>> ~$ top -p 764 -p 765
>> top - 13:10:01 up 1 min,  2 users,  load average: 1.30, 0.38, 0.13
>> Tasks:   2 total,   2 running,   0 sleeping,   0 stopped,   0 zombie
>> %Cpu(s): 12.5 us,  0.0 sy,  0.0 ni, 87.4 id,  0.0 wa,  0.0 hi, 0.0 si,  0.0
>> st
>> KiB Mem : 16393492 total, 16142256 free,   111028 used,   140208 buff/cache
>> KiB Swap:   385836 total,   385836 free,        0 used. 16037992 avail Mem
>>
>>    PID USER      PR  NI    VIRT    RES    SHR S  %CPU %MEM TIME+ COMMAND
>>    764 chenyin+  20   0   12888   1144   1004 R 100.0  0.0 1:05.12 loop.sh
>>    765 chenyin+  20   0   12888   1224   1080 R   0.0  0.0 0:16.21 loop.sh
>>
>> The non-idle process (764) can run at 100% and without being disturbed by
>> the idle process (765).
> 
> Did you just do a very complicated true idle time scheduler, with all
> the problems that brings?

Colocating CPU-intensive jobs with latency-sensitive services can 
improve CPU utilization, but it is difficult to meet the stringent 
tail-latency requirements of latency-sensitive services. We use a true 
idle time scheduler for CPU-intensive jobs to minimize the impact on 
latency-sensitive services.

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [External] Re: Subject: [PATCH] sched/fair: prioritize normal task over sched_idle task with vruntime offset
  2022-03-13  5:37   ` [External] " chenying
@ 2022-03-13  9:02     ` Peter Zijlstra
  2022-03-13 10:06       ` chenying
  0 siblings, 1 reply; 8+ messages in thread
From: Peter Zijlstra @ 2022-03-13  9:02 UTC (permalink / raw)
  To: chenying
  Cc: mingo, juri.lelli, vincent.guittot, dietmar.eggemann, rostedt,
	mgorman, bristot, bsegall, linux-kernel, duanxiongchun,
	zhouchengming, songmuchun, zhengqi.arch, zhoufeng.zf,
	ligang.bdlg

On Sun, Mar 13, 2022 at 01:37:37PM +0800, chenying wrote:
> 在 2022/3/12 20:03, Peter Zijlstra 写道:
> > On Fri, Mar 11, 2022 at 03:58:47PM +0800, chenying wrote:
> > > We add a time offset to the se->vruntime when the idle sched_entity
> > > is enqueued, so that the idle entity will always be on the right of
> > > the non-idle in the runqueue. This can allow non-idle tasks to be
> > > selected and run before the idle.
> > > 
> > > A use-case is that sched_idle for background tasks and non-idle
> > > for foreground. The foreground tasks are latency sensitive and do
> > > not want to be disturbed by the background. It is well known that
> > > the idle tasks can be preempted by the non-idle tasks when waking up,
> > > but the scheduler will not distinguish between idle and non-idle when picking the next
> > > entity. This may cause background tasks to disturb the foreground.
> > > 
> > > Test results as below:
> > > 
> > > ~$ ./loop.sh &
> > > [1] 764
> > > ~$ chrt -i 0 ./loop.sh &
> > > [2] 765
> > > ~$ taskset -p 04 764
> > > ~$ taskset -p 04 765
> > > 
> > > ~$ top -p 764 -p 765
> > > top - 13:10:01 up 1 min,  2 users,  load average: 1.30, 0.38, 0.13
> > > Tasks:   2 total,   2 running,   0 sleeping,   0 stopped,   0 zombie
> > > %Cpu(s): 12.5 us,  0.0 sy,  0.0 ni, 87.4 id,  0.0 wa,  0.0 hi, 0.0 si,  0.0
> > > st
> > > KiB Mem : 16393492 total, 16142256 free,   111028 used,   140208 buff/cache
> > > KiB Swap:   385836 total,   385836 free,        0 used. 16037992 avail Mem
> > > 
> > >    PID USER      PR  NI    VIRT    RES    SHR S  %CPU %MEM TIME+ COMMAND
> > >    764 chenyin+  20   0   12888   1144   1004 R 100.0  0.0 1:05.12 loop.sh
> > >    765 chenyin+  20   0   12888   1224   1080 R   0.0  0.0 0:16.21 loop.sh
> > > 
> > > The non-idle process (764) can run at 100% and without being disturbed by
> > > the idle process (765).
> > 
> > Did you just do a very complicated true idle time scheduler, with all
> > the problems that brings?
> 
> Colocating CPU-intensive jobs with latency-sensitive services can
> improve CPU utilization, but it is difficult to meet the stringent
> tail-latency requirements of latency-sensitive services. We use a true idle
> time scheduler for CPU-intensive jobs to minimize the impact on
> latency-sensitive services.

Hard NAK on any true idle-time scheduler until you make the whole kernel
immune to lock holder starvation issues.

And as said; this is a terrible way to do a true idle-time scheduler.

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [External] Re: Subject: [PATCH] sched/fair: prioritize normal task over sched_idle task with vruntime offset
  2022-03-13  9:02     ` Peter Zijlstra
@ 2022-03-13 10:06       ` chenying
  2022-03-15  0:30         ` Josh Don
  0 siblings, 1 reply; 8+ messages in thread
From: chenying @ 2022-03-13 10:06 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: mingo, juri.lelli, vincent.guittot, dietmar.eggemann, rostedt,
	mgorman, bristot, bsegall, linux-kernel, duanxiongchun,
	zhouchengming, songmuchun, zhengqi.arch, zhoufeng.zf,
	ligang.bdlg

在 2022/3/13 17:02, Peter Zijlstra 写道:
> On Sun, Mar 13, 2022 at 01:37:37PM +0800, chenying wrote:
>> 在 2022/3/12 20:03, Peter Zijlstra 写道:
>>> On Fri, Mar 11, 2022 at 03:58:47PM +0800, chenying wrote:
>>>> We add a time offset to the se->vruntime when the idle sched_entity
>>>> is enqueued, so that the idle entity will always be on the right of
>>>> the non-idle in the runqueue. This can allow non-idle tasks to be
>>>> selected and run before the idle.
>>>>
>>>> A use-case is that sched_idle for background tasks and non-idle
>>>> for foreground. The foreground tasks are latency sensitive and do
>>>> not want to be disturbed by the background. It is well known that
>>>> the idle tasks can be preempted by the non-idle tasks when waking up,
>>>> but the scheduler will not distinguish between idle and non-idle when picking the next
>>>> entity. This may cause background tasks to disturb the foreground.
>>>>
>>>> Test results as below:
>>>>
>>>> ~$ ./loop.sh &
>>>> [1] 764
>>>> ~$ chrt -i 0 ./loop.sh &
>>>> [2] 765
>>>> ~$ taskset -p 04 764
>>>> ~$ taskset -p 04 765
>>>>
>>>> ~$ top -p 764 -p 765
>>>> top - 13:10:01 up 1 min,  2 users,  load average: 1.30, 0.38, 0.13
>>>> Tasks:   2 total,   2 running,   0 sleeping,   0 stopped,   0 zombie
>>>> %Cpu(s): 12.5 us,  0.0 sy,  0.0 ni, 87.4 id,  0.0 wa,  0.0 hi, 0.0 si,  0.0
>>>> st
>>>> KiB Mem : 16393492 total, 16142256 free,   111028 used,   140208 buff/cache
>>>> KiB Swap:   385836 total,   385836 free,        0 used. 16037992 avail Mem
>>>>
>>>>     PID USER      PR  NI    VIRT    RES    SHR S  %CPU %MEM TIME+ COMMAND
>>>>     764 chenyin+  20   0   12888   1144   1004 R 100.0  0.0 1:05.12 loop.sh
>>>>     765 chenyin+  20   0   12888   1224   1080 R   0.0  0.0 0:16.21 loop.sh
>>>>
>>>> The non-idle process (764) can run at 100% and without being disturbed by
>>>> the idle process (765).
>>>
>>> Did you just do a very complicated true idle time scheduler, with all
>>> the problems that brings?
>>
>> Colocating CPU-intensive jobs with latency-sensitive services can
>> improve CPU utilization, but it is difficult to meet the stringent
>> tail-latency requirements of latency-sensitive services. We use a true idle
>> time scheduler for CPU-intensive jobs to minimize the impact on
>> latency-sensitive services.
> 
> Hard NAK on any true idle-time scheduler until you make the whole kernel
> immune to lock holder starvation issues.

If I set the sched_idle_vruntime_offset to a relatively small value 
(e.g. 10 minutes), can this issue be avoided?


^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [External] Re: Subject: [PATCH] sched/fair: prioritize normal task over sched_idle task with vruntime offset
  2022-03-13 10:06       ` chenying
@ 2022-03-15  0:30         ` Josh Don
  2022-03-15  2:04           ` chenying
  0 siblings, 1 reply; 8+ messages in thread
From: Josh Don @ 2022-03-15  0:30 UTC (permalink / raw)
  To: chenying
  Cc: Peter Zijlstra, Ingo Molnar, Juri Lelli, Vincent Guittot,
	Dietmar Eggemann, Steven Rostedt, Mel Gorman,
	Daniel Bristot de Oliveira, Benjamin Segall, linux-kernel,
	duanxiongchun, zhouchengming, songmuchun, zhengqi.arch,
	zhoufeng.zf, ligang.bdlg

On Sun, Mar 13, 2022 at 3:07 AM chenying <chenying.kernel@bytedance.com> wrote:
>
> If I set the sched_idle_vruntime_offset to a relatively small value
> (e.g. 10 minutes), can this issue be avoided?

That's still long enough to cause lockups.

Is the issue that you have a large number of sched_idle entities, and
the occasional latency sensitive thing that wakes up for a short
duration? Have you considered approaching this from the other
direction (ie. if we have a latency sensitive thing wake onto a cpu
running only sched idle stuff, we could change entity placement to
position the latency sensitive thing further left on the timeline,
akin to !GENTLE_FAIR_SLEEPERS).

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [External] Re: Subject: [PATCH] sched/fair: prioritize normal task over sched_idle task with vruntime offset
  2022-03-15  0:30         ` Josh Don
@ 2022-03-15  2:04           ` chenying
  2022-03-15  2:21             ` Josh Don
  0 siblings, 1 reply; 8+ messages in thread
From: chenying @ 2022-03-15  2:04 UTC (permalink / raw)
  To: Josh Don
  Cc: Peter Zijlstra, Ingo Molnar, Juri Lelli, Vincent Guittot,
	Dietmar Eggemann, Steven Rostedt, Mel Gorman,
	Daniel Bristot de Oliveira, Benjamin Segall, linux-kernel,
	duanxiongchun, zhouchengming, songmuchun, zhengqi.arch,
	zhoufeng.zf, ligang.bdlg

在 2022/3/15 8:30, Josh Don 写道:
> On Sun, Mar 13, 2022 at 3:07 AM chenying <chenying.kernel@bytedance.com> wrote:
>>
>> If I set the sched_idle_vruntime_offset to a relatively small value
>> (e.g. 10 minutes), can this issue be avoided?
> 
> That's still long enough to cause lockups.
> 
> Is the issue that you have a large number of sched_idle entities, and
> the occasional latency sensitive thing that wakes up for a short
> duration? Have you considered approaching this from the other
> direction (ie. if we have a latency sensitive thing wake onto a cpu
> running only sched idle stuff, we could change entity placement to
> position the latency sensitive thing further left on the timeline,
> akin to !GENTLE_FAIR_SLEEPERS).

I think this may not guarantee that latency-sensitive tasks are always 
to the left of idle tasks. And it may get complicated if a 
latency-sensitive task is woken up onto a cpu on which there are already 
multiple latency-sensitive tasks and sched_idle tasks.

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [External] Re: Subject: [PATCH] sched/fair: prioritize normal task over sched_idle task with vruntime offset
  2022-03-15  2:04           ` chenying
@ 2022-03-15  2:21             ` Josh Don
  0 siblings, 0 replies; 8+ messages in thread
From: Josh Don @ 2022-03-15  2:21 UTC (permalink / raw)
  To: chenying
  Cc: Peter Zijlstra, Ingo Molnar, Juri Lelli, Vincent Guittot,
	Dietmar Eggemann, Steven Rostedt, Mel Gorman,
	Daniel Bristot de Oliveira, Benjamin Segall, linux-kernel,
	duanxiongchun, zhouchengming, songmuchun, zhengqi.arch,
	zhoufeng.zf, ligang.bdlg

On Mon, Mar 14, 2022 at 7:05 PM chenying <chenying.kernel@bytedance.com> wrote:
>
> 在 2022/3/15 8:30, Josh Don 写道:
> > On Sun, Mar 13, 2022 at 3:07 AM chenying <chenying.kernel@bytedance.com> wrote:
> >>
> >> If I set the sched_idle_vruntime_offset to a relatively small value
> >> (e.g. 10 minutes), can this issue be avoided?
> >
> > That's still long enough to cause lockups.
> >
> > Is the issue that you have a large number of sched_idle entities, and
> > the occasional latency sensitive thing that wakes up for a short
> > duration? Have you considered approaching this from the other
> > direction (ie. if we have a latency sensitive thing wake onto a cpu
> > running only sched idle stuff, we could change entity placement to
> > position the latency sensitive thing further left on the timeline,
> > akin to !GENTLE_FAIR_SLEEPERS).
>
> I think this may not guarantee that latency-sensitive tasks are always
> to the left of idle tasks. And it may get complicated if a
> latency-sensitive task is woken up onto a cpu on which there are already
> multiple latency-sensitive tasks and sched_idle tasks.

If you're waking onto a cpu with lots of latency-sensitive tasks
already, you're already outside the bounds of being able to guarantee
the latency tails you're after (given that the default
idle_min_granularity and idle weight aren't giving you the performance
at the tails that you want right now). It would be helpful to get a
clearer statement as to the problem you're trying to solve.

Perhaps Vincent's recent patch series adding latency support to CFS
("Add latency_nice priority") would be of interest?

^ permalink raw reply	[flat|nested] 8+ messages in thread

end of thread, other threads:[~2022-03-15  2:21 UTC | newest]

Thread overview: 8+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-03-11  7:58 Subject: [PATCH] sched/fair: prioritize normal task over sched_idle task with vruntime offset chenying
2022-03-12 12:03 ` Peter Zijlstra
2022-03-13  5:37   ` [External] " chenying
2022-03-13  9:02     ` Peter Zijlstra
2022-03-13 10:06       ` chenying
2022-03-15  0:30         ` Josh Don
2022-03-15  2:04           ` chenying
2022-03-15  2:21             ` Josh Don

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).