linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
From: Paul Turner <pjt@google.com>
To: linux-kernel@vger.kernel.org
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>,
	Bharata B Rao <bharata@linux.vnet.ibm.com>,
	Dhaval Giani <dhaval.giani@gmail.com>,
	Balbir Singh <balbir@linux.vnet.ibm.com>,
	Vaidyanathan Srinivasan <svaidy@linux.vnet.ibm.com>,
	Srivatsa Vaddagiri <vatsa@in.ibm.com>,
	Kamalesh Babulal <kamalesh@linux.vnet.ibm.com>,
	Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>,
	Ingo Molnar <mingo@elte.hu>, Pavel Emelyanov <xemul@openvz.org>,
	Hu Tao <hutao@cn.fujitsu.com>, Nikhil Rao <ncrao@google.com>
Subject: [patch 03/17] sched: introduce primitives to account for CFS bandwidth tracking
Date: Wed, 06 Jul 2011 22:30:39 -0700	[thread overview]
Message-ID: <20110707053059.981950189@google.com> (raw)
In-Reply-To: 20110707053036.173186930@google.com

[-- Attachment #1: sched-bwc-add_cfs_tg_bandwidth.patch --]
[-- Type: text/plain, Size: 9996 bytes --]

In this patch we introduce the notion of CFS bandwidth, partitioned into 
globally unassigned bandwidth, and locally claimed bandwidth.

- The global bandwidth is per task_group, it represents a pool of unclaimed
  bandwidth that cfs_rqs can allocate from.  
- The local bandwidth is tracked per-cfs_rq, this represents allotments from
  the global pool bandwidth assigned to a specific cpu.

Bandwidth is managed via cgroupfs, adding two new interfaces to the cpu subsystem:
- cpu.cfs_period_us : the bandwidth period in usecs
- cpu.cfs_quota_us : the cpu bandwidth (in usecs) that this tg will be allowed
  to consume over period above.

Signed-off-by: Paul Turner <pjt@google.com>
Signed-off-by: Nikhil Rao <ncrao@google.com>
Signed-off-by: Bharata B Rao <bharata@linux.vnet.ibm.com>
Reviewed-by: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>

---
 init/Kconfig        |   13 +++
 kernel/sched.c      |  194 ++++++++++++++++++++++++++++++++++++++++++++++++++--
 kernel/sched_fair.c |   16 ++++
 3 files changed, 219 insertions(+), 4 deletions(-)

Index: tip/init/Kconfig
===================================================================
--- tip.orig/init/Kconfig
+++ tip/init/Kconfig
@@ -715,6 +715,19 @@ config FAIR_GROUP_SCHED
 	depends on CGROUP_SCHED
 	default CGROUP_SCHED
 
+config CFS_BANDWIDTH
+	bool "CPU bandwidth provisioning for FAIR_GROUP_SCHED"
+	depends on EXPERIMENTAL
+	depends on FAIR_GROUP_SCHED
+	depends on SMP
+	default n
+	help
+	  This option allows users to define CPU bandwidth rates (limits) for
+	  tasks running within the fair group scheduler.  Groups with no limit
+	  set are considered to be unconstrained and will run with no
+	  restriction.
+	  See tip/Documentation/scheduler/sched-bwc.txt for more information.
+
 config RT_GROUP_SCHED
 	bool "Group scheduling for SCHED_RR/FIFO"
 	depends on EXPERIMENTAL
Index: tip/kernel/sched.c
===================================================================
--- tip.orig/kernel/sched.c
+++ tip/kernel/sched.c
@@ -244,6 +244,14 @@ struct cfs_rq;
 
 static LIST_HEAD(task_groups);
 
+struct cfs_bandwidth {
+#ifdef CONFIG_CFS_BANDWIDTH
+	raw_spinlock_t lock;
+	ktime_t period;
+	u64 quota;
+#endif
+};
+
 /* task group related information */
 struct task_group {
 	struct cgroup_subsys_state css;
@@ -275,6 +283,8 @@ struct task_group {
 #ifdef CONFIG_SCHED_AUTOGROUP
 	struct autogroup *autogroup;
 #endif
+
+	struct cfs_bandwidth cfs_bandwidth;
 };
 
 /* task_group_lock serializes the addition/removal of task groups */
@@ -374,9 +384,46 @@ struct cfs_rq {
 
 	unsigned long load_contribution;
 #endif
+#ifdef CONFIG_CFS_BANDWIDTH
+	int runtime_enabled;
+	s64 runtime_remaining;
+#endif
 #endif
 };
 
+#ifdef CONFIG_CFS_BANDWIDTH
+static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
+{
+	return &tg->cfs_bandwidth;
+}
+
+static inline u64 default_cfs_period(void);
+
+static void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
+{
+	raw_spin_lock_init(&cfs_b->lock);
+	cfs_b->quota = RUNTIME_INF;
+	cfs_b->period = ns_to_ktime(default_cfs_period());
+}
+
+static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
+{
+	cfs_rq->runtime_enabled = 0;
+}
+
+static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
+{}
+#else
+static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
+static void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
+static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
+
+static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
+{
+	return NULL;
+}
+#endif /* CONFIG_CFS_BANDWIDTH */
+
 /* Real-Time classes' related field in a runqueue: */
 struct rt_rq {
 	struct rt_prio_array active;
@@ -7795,6 +7842,7 @@ static void init_tg_cfs_entry(struct tas
 	tg->cfs_rq[cpu] = cfs_rq;
 	init_cfs_rq(cfs_rq, rq);
 	cfs_rq->tg = tg;
+	init_cfs_rq_runtime(cfs_rq);
 
 	tg->se[cpu] = se;
 	/* se could be NULL for root_task_group */
@@ -7930,6 +7978,7 @@ void __init sched_init(void)
 		 * We achieve this by letting root_task_group's tasks sit
 		 * directly in rq->cfs (i.e root_task_group->se[] = NULL).
 		 */
+		init_cfs_bandwidth(&root_task_group.cfs_bandwidth);
 		init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, NULL);
 #endif /* CONFIG_FAIR_GROUP_SCHED */
 
@@ -8171,6 +8220,8 @@ static void free_fair_sched_group(struct
 {
 	int i;
 
+	destroy_cfs_bandwidth(tg_cfs_bandwidth(tg));
+
 	for_each_possible_cpu(i) {
 		if (tg->cfs_rq)
 			kfree(tg->cfs_rq[i]);
@@ -8198,6 +8249,8 @@ int alloc_fair_sched_group(struct task_g
 
 	tg->shares = NICE_0_LOAD;
 
+	init_cfs_bandwidth(tg_cfs_bandwidth(tg));
+
 	for_each_possible_cpu(i) {
 		cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
 				      GFP_KERNEL, cpu_to_node(i));
@@ -8569,7 +8622,7 @@ static int __rt_schedulable(struct task_
 	return walk_tg_tree(tg_schedulable, tg_nop, &data);
 }
 
-static int tg_set_bandwidth(struct task_group *tg,
+static int tg_set_rt_bandwidth(struct task_group *tg,
 		u64 rt_period, u64 rt_runtime)
 {
 	int i, err = 0;
@@ -8608,7 +8661,7 @@ int sched_group_set_rt_runtime(struct ta
 	if (rt_runtime_us < 0)
 		rt_runtime = RUNTIME_INF;
 
-	return tg_set_bandwidth(tg, rt_period, rt_runtime);
+	return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
 }
 
 long sched_group_rt_runtime(struct task_group *tg)
@@ -8633,7 +8686,7 @@ int sched_group_set_rt_period(struct tas
 	if (rt_period == 0)
 		return -EINVAL;
 
-	return tg_set_bandwidth(tg, rt_period, rt_runtime);
+	return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
 }
 
 long sched_group_rt_period(struct task_group *tg)
@@ -8823,6 +8876,128 @@ static u64 cpu_shares_read_u64(struct cg
 
 	return (u64) scale_load_down(tg->shares);
 }
+
+#ifdef CONFIG_CFS_BANDWIDTH
+const u64 max_cfs_quota_period = 1 * NSEC_PER_SEC; /* 1s */
+const u64 min_cfs_quota_period = 1 * NSEC_PER_MSEC; /* 1ms */
+
+static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
+{
+	int i;
+	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);
+	static DEFINE_MUTEX(mutex);
+
+	if (tg == &root_task_group)
+		return -EINVAL;
+
+	/*
+	 * Ensure we have at some amount of bandwidth every period.  This is
+	 * to prevent reaching a state of large arrears when throttled via
+	 * entity_tick() resulting in prolonged exit starvation.
+	 */
+	if (quota < min_cfs_quota_period || period < min_cfs_quota_period)
+		return -EINVAL;
+
+	/*
+	 * Likewise, bound things on the otherside by preventing insane quota
+	 * periods.  This also allows us to normalize in computing quota
+	 * feasibility.
+	 */
+	if (period > max_cfs_quota_period)
+		return -EINVAL;
+
+	mutex_lock(&mutex);
+	raw_spin_lock_irq(&cfs_b->lock);
+	cfs_b->period = ns_to_ktime(period);
+	cfs_b->quota = quota;
+	raw_spin_unlock_irq(&cfs_b->lock);
+
+	for_each_possible_cpu(i) {
+		struct cfs_rq *cfs_rq = tg->cfs_rq[i];
+		struct rq *rq = rq_of(cfs_rq);
+
+		raw_spin_lock_irq(&rq->lock);
+		cfs_rq->runtime_enabled = quota != RUNTIME_INF;
+		cfs_rq->runtime_remaining = 0;
+		raw_spin_unlock_irq(&rq->lock);
+	}
+	mutex_unlock(&mutex);
+
+	return 0;
+}
+
+int tg_set_cfs_quota(struct task_group *tg, long cfs_quota_us)
+{
+	u64 quota, period;
+
+	period = ktime_to_ns(tg_cfs_bandwidth(tg)->period);
+	if (cfs_quota_us < 0)
+		quota = RUNTIME_INF;
+	else
+		quota = (u64)cfs_quota_us * NSEC_PER_USEC;
+
+	return tg_set_cfs_bandwidth(tg, period, quota);
+}
+
+long tg_get_cfs_quota(struct task_group *tg)
+{
+	u64 quota_us;
+
+	if (tg_cfs_bandwidth(tg)->quota == RUNTIME_INF)
+		return -1;
+
+	quota_us = tg_cfs_bandwidth(tg)->quota;
+	do_div(quota_us, NSEC_PER_USEC);
+
+	return quota_us;
+}
+
+int tg_set_cfs_period(struct task_group *tg, long cfs_period_us)
+{
+	u64 quota, period;
+
+	period = (u64)cfs_period_us * NSEC_PER_USEC;
+	quota = tg_cfs_bandwidth(tg)->quota;
+
+	if (period <= 0)
+		return -EINVAL;
+
+	return tg_set_cfs_bandwidth(tg, period, quota);
+}
+
+long tg_get_cfs_period(struct task_group *tg)
+{
+	u64 cfs_period_us;
+
+	cfs_period_us = ktime_to_ns(tg_cfs_bandwidth(tg)->period);
+	do_div(cfs_period_us, NSEC_PER_USEC);
+
+	return cfs_period_us;
+}
+
+static s64 cpu_cfs_quota_read_s64(struct cgroup *cgrp, struct cftype *cft)
+{
+	return tg_get_cfs_quota(cgroup_tg(cgrp));
+}
+
+static int cpu_cfs_quota_write_s64(struct cgroup *cgrp, struct cftype *cftype,
+				s64 cfs_quota_us)
+{
+	return tg_set_cfs_quota(cgroup_tg(cgrp), cfs_quota_us);
+}
+
+static u64 cpu_cfs_period_read_u64(struct cgroup *cgrp, struct cftype *cft)
+{
+	return tg_get_cfs_period(cgroup_tg(cgrp));
+}
+
+static int cpu_cfs_period_write_u64(struct cgroup *cgrp, struct cftype *cftype,
+				u64 cfs_period_us)
+{
+	return tg_set_cfs_period(cgroup_tg(cgrp), cfs_period_us);
+}
+
+#endif /* CONFIG_CFS_BANDWIDTH */
 #endif /* CONFIG_FAIR_GROUP_SCHED */
 
 #ifdef CONFIG_RT_GROUP_SCHED
@@ -8857,6 +9032,18 @@ static struct cftype cpu_files[] = {
 		.write_u64 = cpu_shares_write_u64,
 	},
 #endif
+#ifdef CONFIG_CFS_BANDWIDTH
+	{
+		.name = "cfs_quota_us",
+		.read_s64 = cpu_cfs_quota_read_s64,
+		.write_s64 = cpu_cfs_quota_write_s64,
+	},
+	{
+		.name = "cfs_period_us",
+		.read_u64 = cpu_cfs_period_read_u64,
+		.write_u64 = cpu_cfs_period_write_u64,
+	},
+#endif
 #ifdef CONFIG_RT_GROUP_SCHED
 	{
 		.name = "rt_runtime_us",
@@ -9166,4 +9353,3 @@ struct cgroup_subsys cpuacct_subsys = {
 	.subsys_id = cpuacct_subsys_id,
 };
 #endif	/* CONFIG_CGROUP_CPUACCT */
-
Index: tip/kernel/sched_fair.c
===================================================================
--- tip.orig/kernel/sched_fair.c
+++ tip/kernel/sched_fair.c
@@ -1256,6 +1256,22 @@ entity_tick(struct cfs_rq *cfs_rq, struc
 		check_preempt_tick(cfs_rq, curr);
 }
 
+
+/**************************************************
+ * CFS bandwidth control machinery
+ */
+
+#ifdef CONFIG_CFS_BANDWIDTH
+/*
+ * default period for cfs group bandwidth.
+ * default: 0.1s, units: nanoseconds
+ */
+static inline u64 default_cfs_period(void)
+{
+	return 100000000ULL;
+}
+#endif
+
 /**************************************************
  * CFS operations on tasks:
  */



  parent reply	other threads:[~2011-07-07  5:33 UTC|newest]

Thread overview: 47+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2011-07-07  5:30 [patch 00/17] CFS Bandwidth Control v7.1 Paul Turner
2011-07-07  5:30 ` [patch 01/17] sched: (fixlet) dont update shares twice on on_rq parent Paul Turner
2011-07-21 18:28   ` [tip:sched/core] sched: Don't " tip-bot for Paul Turner
2011-07-07  5:30 ` [patch 02/17] sched: hierarchical task accounting for SCHED_OTHER Paul Turner
2011-07-07  5:30 ` Paul Turner [this message]
2011-07-07 13:48   ` [patch 03/17] sched: introduce primitives to account for CFS bandwidth tracking Peter Zijlstra
2011-07-07 21:30     ` Paul Turner
2011-07-07  5:30 ` [patch 04/17] sched: validate CFS quota hierarchies Paul Turner
2011-07-07  5:30 ` [patch 05/17] sched: accumulate per-cfs_rq cpu usage and charge against bandwidth Paul Turner
2011-07-07  5:30 ` [patch 06/17] sched: add a timer to handle CFS bandwidth refresh Paul Turner
2011-07-07  5:30 ` [patch 07/17] sched: expire invalid runtime Paul Turner
2011-07-07  5:30 ` [patch 08/17] sched: add support for throttling group entities Paul Turner
2011-07-07  5:30 ` [patch 09/17] sched: add support for unthrottling " Paul Turner
2011-07-07  5:30 ` [patch 10/17] sched: allow for positional tg_tree walks Paul Turner
2011-07-07  5:30 ` [patch 11/17] sched: prevent interactions with throttled entities Paul Turner
2011-07-07  5:30 ` [patch 12/17] sched: prevent buddy " Paul Turner
2011-07-07  5:30 ` [patch 13/17] sched: migrate throttled tasks on HOTPLUG Paul Turner
2011-07-07  5:30 ` [patch 14/17] sched: throttle entities exceeding their allowed bandwidth Paul Turner
2011-07-07  5:30 ` [patch 15/17] sched: add exports tracking cfs bandwidth control statistics Paul Turner
2011-07-07  5:30 ` [patch 16/17] sched: return unused runtime on group dequeue Paul Turner
2011-07-07  5:30 ` [patch 17/17] sched: add documentation for bandwidth control Paul Turner
2011-07-07 11:13 ` [patch 00/17] CFS Bandwidth Control v7.1 Peter Zijlstra
2011-07-11  1:22   ` Hu Tao
2011-07-07 11:23 ` Ingo Molnar
2011-07-07 11:28   ` Peter Zijlstra
2011-07-07 14:38   ` Peter Zijlstra
2011-07-07 14:51     ` Ingo Molnar
2011-07-07 14:54       ` Peter Zijlstra
2011-07-07 14:56         ` Ingo Molnar
2011-07-07 16:23           ` Jason Baron
2011-07-07 17:20             ` Peter Zijlstra
2011-07-07 18:15               ` Jason Baron
2011-07-07 20:36                 ` jump_label defaults (was Re: [patch 00/17] CFS Bandwidth Control v7.1) Peter Zijlstra
2011-07-08  9:20                   ` Peter Zijlstra
2011-07-08 15:47                     ` Jason Baron
2011-07-07 16:52     ` [patch 00/17] CFS Bandwidth Control v7.1 Andi Kleen
2011-07-07 17:08       ` Peter Zijlstra
2011-07-07 17:59     ` Peter Zijlstra
2011-07-07 19:36       ` Jason Baron
2011-07-08  7:45       ` Paul Turner
2011-07-08  7:39     ` Paul Turner
2011-07-08 10:32       ` Peter Zijlstra
2011-07-09  7:34         ` Paul Turner
2011-07-10 18:12           ` Ingo Molnar
2011-07-07 14:06 ` Peter Zijlstra
2011-07-08  7:35   ` Paul Turner
2011-07-11  1:22 ` Hu Tao

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20110707053059.981950189@google.com \
    --to=pjt@google.com \
    --cc=a.p.zijlstra@chello.nl \
    --cc=balbir@linux.vnet.ibm.com \
    --cc=bharata@linux.vnet.ibm.com \
    --cc=dhaval.giani@gmail.com \
    --cc=hutao@cn.fujitsu.com \
    --cc=kamalesh@linux.vnet.ibm.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=mingo@elte.hu \
    --cc=ncrao@google.com \
    --cc=seto.hidetoshi@jp.fujitsu.com \
    --cc=svaidy@linux.vnet.ibm.com \
    --cc=vatsa@in.ibm.com \
    --cc=xemul@openvz.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).