All of lore.kernel.org
 help / color / mirror / Atom feed
From: David Dai <davidai@google.com>
To: "Rafael J. Wysocki" <rafael@kernel.org>,
	Viresh Kumar <viresh.kumar@linaro.org>,
	Rob Herring <robh+dt@kernel.org>,
	Krzysztof Kozlowski <krzysztof.kozlowski+dt@linaro.org>,
	Paolo Bonzini <pbonzini@redhat.com>,
	Jonathan Corbet <corbet@lwn.net>, Marc Zyngier <maz@kernel.org>,
	Oliver Upton <oliver.upton@linux.dev>,
	James Morse <james.morse@arm.com>,
	Suzuki K Poulose <suzuki.poulose@arm.com>,
	Zenghui Yu <yuzenghui@huawei.com>,
	Catalin Marinas <catalin.marinas@arm.com>,
	Will Deacon <will@kernel.org>,
	Mark Rutland <mark.rutland@arm.com>,
	Lorenzo Pieralisi <lpieralisi@kernel.org>,
	Sudeep Holla <sudeep.holla@arm.com>,
	Ingo Molnar <mingo@redhat.com>,
	Peter Zijlstra <peterz@infradead.org>,
	Juri Lelli <juri.lelli@redhat.com>,
	Vincent Guittot <vincent.guittot@linaro.org>,
	Dietmar Eggemann <dietmar.eggemann@arm.com>,
	Steven Rostedt <rostedt@goodmis.org>,
	Ben Segall <bsegall@google.com>, Mel Gorman <mgorman@suse.de>,
	Daniel Bristot de Oliveira <bristot@redhat.com>,
	Valentin Schneider <vschneid@redhat.com>,
	David Dai <davidai@google.com>
Cc: Saravana Kannan <saravanak@google.com>,
	kernel-team@android.com, linux-pm@vger.kernel.org,
	devicetree@vger.kernel.org, linux-kernel@vger.kernel.org,
	kvm@vger.kernel.org, linux-doc@vger.kernel.org,
	linux-arm-kernel@lists.infradead.org, kvmarm@lists.linux.dev
Subject: [RFC PATCH v2 1/6] sched/fair: Add util_guest for tasks
Date: Thu, 30 Mar 2023 18:43:45 -0700	[thread overview]
Message-ID: <20230331014356.1033759-2-davidai@google.com> (raw)
In-Reply-To: <20230331014356.1033759-1-davidai@google.com>

For virtualization usecases, util_est and util_avg currently tracked
on the host aren't sufficient to accurately represent the workload on
vCPU threads, which results in poor frequency selection and performance.
For example, when a large workload migrates from a busy vCPU thread to
an idle vCPU thread, it incurs additional DVFS ramp-up latencies
as util accumulates.

Introduce a new "util_guest" member as an additional PELT signal that's
independently updated by the guest. When used, it's max aggregated to
provide a boost to both task_util and task_util_est.

Updating task_util and task_util_est will ensure:
-Better task placement decisions for vCPU threads on the host
-Correctly updating util_est.ewma during dequeue
-Additive util with other threads on the same runqueue for more
accurate frequency responses

Co-developed-by: Saravana Kannan <saravanak@google.com>
Signed-off-by: Saravana Kannan <saravanak@google.com>
Signed-off-by: David Dai <davidai@google.com>
---
 include/linux/sched.h | 11 +++++++++++
 kernel/sched/core.c   | 18 +++++++++++++++++-
 kernel/sched/fair.c   | 15 +++++++++++++--
 3 files changed, 41 insertions(+), 3 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 63d242164b1a..d8c346fcdf52 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -445,6 +445,16 @@ struct util_est {
 #define UTIL_AVG_UNCHANGED		0x80000000
 } __attribute__((__aligned__(sizeof(u64))));
 
+/*
+ * For sched_setattr_nocheck() (kernel) only
+ *
+ * Allow vCPU threads to use UTIL_GUEST as a way to hint the scheduler with more
+ * accurate utilization info. This is useful when guest kernels have some way of
+ * tracking its own runqueue's utilization.
+ *
+ */
+#define SCHED_FLAG_UTIL_GUEST   0x20000000
+
 /*
  * The load/runnable/util_avg accumulates an infinite geometric series
  * (see __update_load_avg_cfs_rq() in kernel/sched/pelt.c).
@@ -499,6 +509,7 @@ struct sched_avg {
 	unsigned long			load_avg;
 	unsigned long			runnable_avg;
 	unsigned long			util_avg;
+	unsigned long			util_guest;
 	struct util_est			util_est;
 } ____cacheline_aligned;
 
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 0d18c3969f90..7700ef5610c1 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2024,6 +2024,16 @@ static inline void uclamp_post_fork(struct task_struct *p) { }
 static inline void init_uclamp(void) { }
 #endif /* CONFIG_UCLAMP_TASK */
 
+static void __setscheduler_task_util(struct task_struct *p,
+				  const struct sched_attr *attr)
+{
+
+	if (likely(!(attr->sched_flags & SCHED_FLAG_UTIL_GUEST)))
+		return;
+
+	p->se.avg.util_guest = attr->sched_util_min;
+}
+
 bool sched_task_on_rq(struct task_struct *p)
 {
 	return task_on_rq_queued(p);
@@ -7561,7 +7571,7 @@ static int __sched_setscheduler(struct task_struct *p,
 			return -EINVAL;
 	}
 
-	if (attr->sched_flags & ~(SCHED_FLAG_ALL | SCHED_FLAG_SUGOV))
+	if (attr->sched_flags & ~(SCHED_FLAG_ALL | SCHED_FLAG_SUGOV | SCHED_FLAG_UTIL_GUEST))
 		return -EINVAL;
 
 	/*
@@ -7583,6 +7593,9 @@ static int __sched_setscheduler(struct task_struct *p,
 		if (attr->sched_flags & SCHED_FLAG_SUGOV)
 			return -EINVAL;
 
+		if (attr->sched_flags & SCHED_FLAG_UTIL_GUEST)
+			return -EINVAL;
+
 		retval = security_task_setscheduler(p);
 		if (retval)
 			return retval;
@@ -7629,6 +7642,8 @@ static int __sched_setscheduler(struct task_struct *p,
 			goto change;
 		if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP)
 			goto change;
+		if (attr->sched_flags & SCHED_FLAG_UTIL_GUEST)
+			goto change;
 
 		p->sched_reset_on_fork = reset_on_fork;
 		retval = 0;
@@ -7718,6 +7733,7 @@ static int __sched_setscheduler(struct task_struct *p,
 		__setscheduler_prio(p, newprio);
 	}
 	__setscheduler_uclamp(p, attr);
+	__setscheduler_task_util(p, attr);
 
 	if (queued) {
 		/*
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 6986ea31c984..998649554344 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4276,14 +4276,16 @@ static int newidle_balance(struct rq *this_rq, struct rq_flags *rf);
 
 static inline unsigned long task_util(struct task_struct *p)
 {
-	return READ_ONCE(p->se.avg.util_avg);
+	return max(READ_ONCE(p->se.avg.util_avg),
+			READ_ONCE(p->se.avg.util_guest));
 }
 
 static inline unsigned long _task_util_est(struct task_struct *p)
 {
 	struct util_est ue = READ_ONCE(p->se.avg.util_est);
 
-	return max(ue.ewma, (ue.enqueued & ~UTIL_AVG_UNCHANGED));
+	return max_t(unsigned long, READ_ONCE(p->se.avg.util_guest),
+			max(ue.ewma, (ue.enqueued & ~UTIL_AVG_UNCHANGED)));
 }
 
 static inline unsigned long task_util_est(struct task_struct *p)
@@ -6242,6 +6244,15 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 	 */
 	util_est_enqueue(&rq->cfs, p);
 
+	/*
+	 * The normal code path for host thread enqueue doesn't take into
+	 * account guest task migrations when updating cpufreq util.
+	 * So, always update the cpufreq when a vCPU thread has a
+	 * non-zero util_guest value.
+	 */
+	if (READ_ONCE(p->se.avg.util_guest))
+		cpufreq_update_util(rq, 0);
+
 	/*
 	 * If in_iowait is set, the code below may not trigger any cpufreq
 	 * utilization updates, so do it here explicitly with the IOWAIT flag
-- 
2.40.0.348.gf938b09366-goog


WARNING: multiple messages have this Message-ID (diff)
From: David Dai <davidai@google.com>
To: "Rafael J. Wysocki" <rafael@kernel.org>,
	Viresh Kumar <viresh.kumar@linaro.org>,
	 Rob Herring <robh+dt@kernel.org>,
	 Krzysztof Kozlowski <krzysztof.kozlowski+dt@linaro.org>,
	Paolo Bonzini <pbonzini@redhat.com>,
	 Jonathan Corbet <corbet@lwn.net>, Marc Zyngier <maz@kernel.org>,
	Oliver Upton <oliver.upton@linux.dev>,
	 James Morse <james.morse@arm.com>,
	Suzuki K Poulose <suzuki.poulose@arm.com>,
	 Zenghui Yu <yuzenghui@huawei.com>,
	Catalin Marinas <catalin.marinas@arm.com>,
	 Will Deacon <will@kernel.org>,
	Mark Rutland <mark.rutland@arm.com>,
	 Lorenzo Pieralisi <lpieralisi@kernel.org>,
	Sudeep Holla <sudeep.holla@arm.com>,
	 Ingo Molnar <mingo@redhat.com>,
	Peter Zijlstra <peterz@infradead.org>,
	 Juri Lelli <juri.lelli@redhat.com>,
	Vincent Guittot <vincent.guittot@linaro.org>,
	 Dietmar Eggemann <dietmar.eggemann@arm.com>,
	Steven Rostedt <rostedt@goodmis.org>,
	 Ben Segall <bsegall@google.com>, Mel Gorman <mgorman@suse.de>,
	 Daniel Bristot de Oliveira <bristot@redhat.com>,
	Valentin Schneider <vschneid@redhat.com>,
	David Dai <davidai@google.com>
Cc: Saravana Kannan <saravanak@google.com>,
	kernel-team@android.com, linux-pm@vger.kernel.org,
	 devicetree@vger.kernel.org, linux-kernel@vger.kernel.org,
	kvm@vger.kernel.org,  linux-doc@vger.kernel.org,
	linux-arm-kernel@lists.infradead.org,  kvmarm@lists.linux.dev
Subject: [RFC PATCH v2 1/6] sched/fair: Add util_guest for tasks
Date: Thu, 30 Mar 2023 18:43:45 -0700	[thread overview]
Message-ID: <20230331014356.1033759-2-davidai@google.com> (raw)
In-Reply-To: <20230331014356.1033759-1-davidai@google.com>

For virtualization usecases, util_est and util_avg currently tracked
on the host aren't sufficient to accurately represent the workload on
vCPU threads, which results in poor frequency selection and performance.
For example, when a large workload migrates from a busy vCPU thread to
an idle vCPU thread, it incurs additional DVFS ramp-up latencies
as util accumulates.

Introduce a new "util_guest" member as an additional PELT signal that's
independently updated by the guest. When used, it's max aggregated to
provide a boost to both task_util and task_util_est.

Updating task_util and task_util_est will ensure:
-Better task placement decisions for vCPU threads on the host
-Correctly updating util_est.ewma during dequeue
-Additive util with other threads on the same runqueue for more
accurate frequency responses

Co-developed-by: Saravana Kannan <saravanak@google.com>
Signed-off-by: Saravana Kannan <saravanak@google.com>
Signed-off-by: David Dai <davidai@google.com>
---
 include/linux/sched.h | 11 +++++++++++
 kernel/sched/core.c   | 18 +++++++++++++++++-
 kernel/sched/fair.c   | 15 +++++++++++++--
 3 files changed, 41 insertions(+), 3 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 63d242164b1a..d8c346fcdf52 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -445,6 +445,16 @@ struct util_est {
 #define UTIL_AVG_UNCHANGED		0x80000000
 } __attribute__((__aligned__(sizeof(u64))));
 
+/*
+ * For sched_setattr_nocheck() (kernel) only
+ *
+ * Allow vCPU threads to use UTIL_GUEST as a way to hint the scheduler with more
+ * accurate utilization info. This is useful when guest kernels have some way of
+ * tracking its own runqueue's utilization.
+ *
+ */
+#define SCHED_FLAG_UTIL_GUEST   0x20000000
+
 /*
  * The load/runnable/util_avg accumulates an infinite geometric series
  * (see __update_load_avg_cfs_rq() in kernel/sched/pelt.c).
@@ -499,6 +509,7 @@ struct sched_avg {
 	unsigned long			load_avg;
 	unsigned long			runnable_avg;
 	unsigned long			util_avg;
+	unsigned long			util_guest;
 	struct util_est			util_est;
 } ____cacheline_aligned;
 
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 0d18c3969f90..7700ef5610c1 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2024,6 +2024,16 @@ static inline void uclamp_post_fork(struct task_struct *p) { }
 static inline void init_uclamp(void) { }
 #endif /* CONFIG_UCLAMP_TASK */
 
+static void __setscheduler_task_util(struct task_struct *p,
+				  const struct sched_attr *attr)
+{
+
+	if (likely(!(attr->sched_flags & SCHED_FLAG_UTIL_GUEST)))
+		return;
+
+	p->se.avg.util_guest = attr->sched_util_min;
+}
+
 bool sched_task_on_rq(struct task_struct *p)
 {
 	return task_on_rq_queued(p);
@@ -7561,7 +7571,7 @@ static int __sched_setscheduler(struct task_struct *p,
 			return -EINVAL;
 	}
 
-	if (attr->sched_flags & ~(SCHED_FLAG_ALL | SCHED_FLAG_SUGOV))
+	if (attr->sched_flags & ~(SCHED_FLAG_ALL | SCHED_FLAG_SUGOV | SCHED_FLAG_UTIL_GUEST))
 		return -EINVAL;
 
 	/*
@@ -7583,6 +7593,9 @@ static int __sched_setscheduler(struct task_struct *p,
 		if (attr->sched_flags & SCHED_FLAG_SUGOV)
 			return -EINVAL;
 
+		if (attr->sched_flags & SCHED_FLAG_UTIL_GUEST)
+			return -EINVAL;
+
 		retval = security_task_setscheduler(p);
 		if (retval)
 			return retval;
@@ -7629,6 +7642,8 @@ static int __sched_setscheduler(struct task_struct *p,
 			goto change;
 		if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP)
 			goto change;
+		if (attr->sched_flags & SCHED_FLAG_UTIL_GUEST)
+			goto change;
 
 		p->sched_reset_on_fork = reset_on_fork;
 		retval = 0;
@@ -7718,6 +7733,7 @@ static int __sched_setscheduler(struct task_struct *p,
 		__setscheduler_prio(p, newprio);
 	}
 	__setscheduler_uclamp(p, attr);
+	__setscheduler_task_util(p, attr);
 
 	if (queued) {
 		/*
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 6986ea31c984..998649554344 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4276,14 +4276,16 @@ static int newidle_balance(struct rq *this_rq, struct rq_flags *rf);
 
 static inline unsigned long task_util(struct task_struct *p)
 {
-	return READ_ONCE(p->se.avg.util_avg);
+	return max(READ_ONCE(p->se.avg.util_avg),
+			READ_ONCE(p->se.avg.util_guest));
 }
 
 static inline unsigned long _task_util_est(struct task_struct *p)
 {
 	struct util_est ue = READ_ONCE(p->se.avg.util_est);
 
-	return max(ue.ewma, (ue.enqueued & ~UTIL_AVG_UNCHANGED));
+	return max_t(unsigned long, READ_ONCE(p->se.avg.util_guest),
+			max(ue.ewma, (ue.enqueued & ~UTIL_AVG_UNCHANGED)));
 }
 
 static inline unsigned long task_util_est(struct task_struct *p)
@@ -6242,6 +6244,15 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 	 */
 	util_est_enqueue(&rq->cfs, p);
 
+	/*
+	 * The normal code path for host thread enqueue doesn't take into
+	 * account guest task migrations when updating cpufreq util.
+	 * So, always update the cpufreq when a vCPU thread has a
+	 * non-zero util_guest value.
+	 */
+	if (READ_ONCE(p->se.avg.util_guest))
+		cpufreq_update_util(rq, 0);
+
 	/*
 	 * If in_iowait is set, the code below may not trigger any cpufreq
 	 * utilization updates, so do it here explicitly with the IOWAIT flag
-- 
2.40.0.348.gf938b09366-goog


_______________________________________________
linux-arm-kernel mailing list
linux-arm-kernel@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-arm-kernel

  reply	other threads:[~2023-03-31  1:44 UTC|newest]

Thread overview: 32+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2023-03-31  1:43 [RFC PATCH v2 0/6] Improve VM CPUfreq and task placement behavior David Dai
2023-03-31  1:43 ` David Dai
2023-03-31  1:43 ` David Dai [this message]
2023-03-31  1:43   ` [RFC PATCH v2 1/6] sched/fair: Add util_guest for tasks David Dai
2023-03-31  8:19   ` kernel test robot
2023-03-31  9:20   ` kernel test robot
2023-03-31  1:43 ` [RFC PATCH v2 2/6] kvm: arm64: Add support for get_cur_cpufreq service David Dai
2023-03-31  1:43   ` David Dai
2023-04-01  3:12   ` Bagas Sanjaya
2023-04-01  3:12     ` Bagas Sanjaya
2023-04-01  3:16   ` Bagas Sanjaya
2023-04-01  3:16     ` Bagas Sanjaya
2023-03-31  1:43 ` [RFC PATCH v2 3/6] kvm: arm64: Add support for util_hint service David Dai
2023-03-31  1:43   ` David Dai
2023-04-01  3:22   ` Bagas Sanjaya
2023-04-01  3:22     ` Bagas Sanjaya
2023-03-31  1:43 ` [RFC PATCH v2 4/6] kvm: arm64: Add support for get_freqtbl service David Dai
2023-03-31  1:43   ` David Dai
2023-04-01  3:28   ` Bagas Sanjaya
2023-04-01  3:28     ` Bagas Sanjaya
2023-03-31  1:43 ` [RFC PATCH v2 5/6] dt-bindings: cpufreq: add bindings for virtual kvm cpufreq David Dai
2023-03-31  1:43   ` David Dai
2023-03-31  8:55   ` Krzysztof Kozlowski
2023-03-31  8:55     ` Krzysztof Kozlowski
2023-03-31 12:42   ` Rob Herring
2023-03-31 12:42     ` Rob Herring
2023-03-31 12:46   ` Rob Herring
2023-03-31 12:46     ` Rob Herring
2023-04-05 22:07     ` Saravana Kannan
2023-04-05 22:07       ` Saravana Kannan
2023-03-31  1:43 ` [RFC PATCH v2 6/6] cpufreq: add kvm-cpufreq driver David Dai
2023-03-31  1:43   ` David Dai

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20230331014356.1033759-2-davidai@google.com \
    --to=davidai@google.com \
    --cc=bristot@redhat.com \
    --cc=bsegall@google.com \
    --cc=catalin.marinas@arm.com \
    --cc=corbet@lwn.net \
    --cc=devicetree@vger.kernel.org \
    --cc=dietmar.eggemann@arm.com \
    --cc=james.morse@arm.com \
    --cc=juri.lelli@redhat.com \
    --cc=kernel-team@android.com \
    --cc=krzysztof.kozlowski+dt@linaro.org \
    --cc=kvm@vger.kernel.org \
    --cc=kvmarm@lists.linux.dev \
    --cc=linux-arm-kernel@lists.infradead.org \
    --cc=linux-doc@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-pm@vger.kernel.org \
    --cc=lpieralisi@kernel.org \
    --cc=mark.rutland@arm.com \
    --cc=maz@kernel.org \
    --cc=mgorman@suse.de \
    --cc=mingo@redhat.com \
    --cc=oliver.upton@linux.dev \
    --cc=pbonzini@redhat.com \
    --cc=peterz@infradead.org \
    --cc=rafael@kernel.org \
    --cc=robh+dt@kernel.org \
    --cc=rostedt@goodmis.org \
    --cc=saravanak@google.com \
    --cc=sudeep.holla@arm.com \
    --cc=suzuki.poulose@arm.com \
    --cc=vincent.guittot@linaro.org \
    --cc=viresh.kumar@linaro.org \
    --cc=vschneid@redhat.com \
    --cc=will@kernel.org \
    --cc=yuzenghui@huawei.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.