linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
From: Ingo Molnar <mingo@kernel.org>
To: Linus Torvalds <torvalds@linux-foundation.org>
Cc: linux-kernel@vger.kernel.org,
	Peter Zijlstra <a.p.zijlstra@chello.nl>,
	Thomas Gleixner <tglx@linutronix.de>,
	Andrew Morton <akpm@linux-foundation.org>
Subject: [GIT PULL] scheduler fix
Date: Sat, 17 Nov 2018 11:57:57 +0100	[thread overview]
Message-ID: <20181117105757.GA40115@gmail.com> (raw)

Linus,

Please pull the latest sched-urgent-for-linus git tree from:

   git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git sched-urgent-for-linus

   # HEAD: c469933e772132aad040bd6a2adc8edf9ad6f825 sched/fair: Fix cpu_util_wake() for 'execl' type workloads

Fix an exec() related scalability/performance regression, which was 
caused by incorrectly calculating load and migrating tasks on exec() when 
they shouldn't be.

 Thanks,

	Ingo

------------------>
Patrick Bellasi (1):
      sched/fair: Fix cpu_util_wake() for 'execl' type workloads


 kernel/sched/fair.c | 62 +++++++++++++++++++++++++++++++++++++++++------------
 1 file changed, 48 insertions(+), 14 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 3648d0300fdf..ac855b2f4774 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5674,11 +5674,11 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p,
 	return target;
 }
 
-static unsigned long cpu_util_wake(int cpu, struct task_struct *p);
+static unsigned long cpu_util_without(int cpu, struct task_struct *p);
 
-static unsigned long capacity_spare_wake(int cpu, struct task_struct *p)
+static unsigned long capacity_spare_without(int cpu, struct task_struct *p)
 {
-	return max_t(long, capacity_of(cpu) - cpu_util_wake(cpu, p), 0);
+	return max_t(long, capacity_of(cpu) - cpu_util_without(cpu, p), 0);
 }
 
 /*
@@ -5738,7 +5738,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
 
 			avg_load += cfs_rq_load_avg(&cpu_rq(i)->cfs);
 
-			spare_cap = capacity_spare_wake(i, p);
+			spare_cap = capacity_spare_without(i, p);
 
 			if (spare_cap > max_spare_cap)
 				max_spare_cap = spare_cap;
@@ -5889,8 +5889,8 @@ static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p
 		return prev_cpu;
 
 	/*
-	 * We need task's util for capacity_spare_wake, sync it up to prev_cpu's
-	 * last_update_time.
+	 * We need task's util for capacity_spare_without, sync it up to
+	 * prev_cpu's last_update_time.
 	 */
 	if (!(sd_flag & SD_BALANCE_FORK))
 		sync_entity_load_avg(&p->se);
@@ -6216,10 +6216,19 @@ static inline unsigned long cpu_util(int cpu)
 }
 
 /*
- * cpu_util_wake: Compute CPU utilization with any contributions from
- * the waking task p removed.
+ * cpu_util_without: compute cpu utilization without any contributions from *p
+ * @cpu: the CPU which utilization is requested
+ * @p: the task which utilization should be discounted
+ *
+ * The utilization of a CPU is defined by the utilization of tasks currently
+ * enqueued on that CPU as well as tasks which are currently sleeping after an
+ * execution on that CPU.
+ *
+ * This method returns the utilization of the specified CPU by discounting the
+ * utilization of the specified task, whenever the task is currently
+ * contributing to the CPU utilization.
  */
-static unsigned long cpu_util_wake(int cpu, struct task_struct *p)
+static unsigned long cpu_util_without(int cpu, struct task_struct *p)
 {
 	struct cfs_rq *cfs_rq;
 	unsigned int util;
@@ -6231,7 +6240,7 @@ static unsigned long cpu_util_wake(int cpu, struct task_struct *p)
 	cfs_rq = &cpu_rq(cpu)->cfs;
 	util = READ_ONCE(cfs_rq->avg.util_avg);
 
-	/* Discount task's blocked util from CPU's util */
+	/* Discount task's util from CPU's util */
 	util -= min_t(unsigned int, util, task_util(p));
 
 	/*
@@ -6240,14 +6249,14 @@ static unsigned long cpu_util_wake(int cpu, struct task_struct *p)
 	 * a) if *p is the only task sleeping on this CPU, then:
 	 *      cpu_util (== task_util) > util_est (== 0)
 	 *    and thus we return:
-	 *      cpu_util_wake = (cpu_util - task_util) = 0
+	 *      cpu_util_without = (cpu_util - task_util) = 0
 	 *
 	 * b) if other tasks are SLEEPING on this CPU, which is now exiting
 	 *    IDLE, then:
 	 *      cpu_util >= task_util
 	 *      cpu_util > util_est (== 0)
 	 *    and thus we discount *p's blocked utilization to return:
-	 *      cpu_util_wake = (cpu_util - task_util) >= 0
+	 *      cpu_util_without = (cpu_util - task_util) >= 0
 	 *
 	 * c) if other tasks are RUNNABLE on that CPU and
 	 *      util_est > cpu_util
@@ -6260,8 +6269,33 @@ static unsigned long cpu_util_wake(int cpu, struct task_struct *p)
 	 * covered by the following code when estimated utilization is
 	 * enabled.
 	 */
-	if (sched_feat(UTIL_EST))
-		util = max(util, READ_ONCE(cfs_rq->avg.util_est.enqueued));
+	if (sched_feat(UTIL_EST)) {
+		unsigned int estimated =
+			READ_ONCE(cfs_rq->avg.util_est.enqueued);
+
+		/*
+		 * Despite the following checks we still have a small window
+		 * for a possible race, when an execl's select_task_rq_fair()
+		 * races with LB's detach_task():
+		 *
+		 *   detach_task()
+		 *     p->on_rq = TASK_ON_RQ_MIGRATING;
+		 *     ---------------------------------- A
+		 *     deactivate_task()                   \
+		 *       dequeue_task()                     + RaceTime
+		 *         util_est_dequeue()              /
+		 *     ---------------------------------- B
+		 *
+		 * The additional check on "current == p" it's required to
+		 * properly fix the execl regression and it helps in further
+		 * reducing the chances for the above race.
+		 */
+		if (unlikely(task_on_rq_queued(p) || current == p)) {
+			estimated -= min_t(unsigned int, estimated,
+					   (_task_util_est(p) | UTIL_AVG_UNCHANGED));
+		}
+		util = max(util, estimated);
+	}
 
 	/*
 	 * Utilization (estimated) can exceed the CPU capacity, thus let's

             reply	other threads:[~2018-11-17 10:58 UTC|newest]

Thread overview: 69+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2018-11-17 10:57 Ingo Molnar [this message]
2018-11-18 20:05 ` [GIT PULL] scheduler fix pr-tracker-bot
  -- strict thread matches above, loose matches on Subject: below --
2023-10-01  8:43 Ingo Molnar
2023-10-01 17:08 ` pr-tracker-bot
2023-09-22 10:26 Ingo Molnar
2023-09-22 20:19 ` pr-tracker-bot
2021-06-24  7:06 Ingo Molnar
2021-06-24 16:34 ` pr-tracker-bot
2020-12-27  9:16 Ingo Molnar
2020-12-27 17:27 ` pr-tracker-bot
2020-03-02  7:51 Ingo Molnar
2020-03-03 23:35 ` pr-tracker-bot
2019-12-17 11:54 Ingo Molnar
2019-12-17 19:20 ` pr-tracker-bot
2019-07-14 10:19 Ingo Molnar
2019-07-14 18:45 ` pr-tracker-bot
2019-05-05 11:02 Ingo Molnar
2019-05-05 22:10 ` pr-tracker-bot
2019-04-27 14:39 Ingo Molnar
2019-04-27 18:45 ` pr-tracker-bot
2019-04-12 13:08 Ingo Molnar
2019-04-13  4:05 ` pr-tracker-bot
2018-12-31 14:58 Ingo Molnar
2018-12-31 18:05 ` pr-tracker-bot
2018-10-11  9:12 Ingo Molnar
2018-10-11 12:32 ` Greg Kroah-Hartman
2018-10-11  9:02 Ingo Molnar
2018-01-17 15:34 Ingo Molnar
2017-10-27 19:16 Ingo Molnar
2016-12-07 18:48 Ingo Molnar
2016-10-28  8:35 Ingo Molnar
2016-10-19 15:52 Ingo Molnar
2016-10-18 11:17 Ingo Molnar
2016-09-13 18:17 Ingo Molnar
2016-07-14 18:56 Ingo Molnar
2016-05-13 18:54 Ingo Molnar
2016-05-06 11:31 Ingo Molnar
2015-07-18  2:56 Ingo Molnar
2015-03-28 13:45 Ingo Molnar
2014-01-15 18:19 Ingo Molnar
2013-09-28 18:08 Ingo Molnar
2013-09-12 12:58 Ingo Molnar
2012-05-17  8:46 Ingo Molnar
2012-03-02 10:57 Ingo Molnar
2012-02-27 10:29 Ingo Molnar
2011-04-07 17:38 Ingo Molnar
2011-03-18 13:52 Ingo Molnar
2011-03-10  8:01 Ingo Molnar
2011-01-24 13:07 Ingo Molnar
2010-04-08 15:38 Ingo Molnar
2010-04-08 15:42 ` Linus Torvalds
2010-04-08 16:03   ` Andreas Schwab
2010-04-08 18:26     ` Ingo Molnar
2010-04-08 18:36       ` Linus Torvalds
2010-04-08 18:52         ` Ingo Molnar
2009-12-23 16:03 Ingo Molnar
2009-10-08 19:01 Ingo Molnar
2009-05-05  9:35 Ingo Molnar
2009-02-17 16:40 [git pull] " Ingo Molnar
2009-02-04 19:18 Ingo Molnar
2009-01-07 22:26 Ingo Molnar
2009-01-07 23:47 ` Linus Torvalds
2009-01-08  7:50   ` Peter Zijlstra
2008-12-04 19:41 Ingo Molnar
2008-04-14 15:07 Ingo Molnar
2008-01-22 10:33 Ingo Molnar
2007-10-29 20:39 [git pull] scheduler fixes Ingo Molnar
2007-10-29 23:34 ` [git pull] scheduler fix Ingo Molnar
2007-10-30 10:15   ` Guillaume Chazarain
2007-11-01  8:39     ` Ingo Molnar

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20181117105757.GA40115@gmail.com \
    --to=mingo@kernel.org \
    --cc=a.p.zijlstra@chello.nl \
    --cc=akpm@linux-foundation.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=tglx@linutronix.de \
    --cc=torvalds@linux-foundation.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).