From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S934797AbeE2Nns (ORCPT ); Tue, 29 May 2018 09:43:48 -0400 Received: from mx3-rdu2.redhat.com ([66.187.233.73]:54510 "EHLO mx1.redhat.com" rhost-flags-OK-OK-OK-FAIL) by vger.kernel.org with ESMTP id S934316AbeE2NmN (ORCPT ); Tue, 29 May 2018 09:42:13 -0400 From: Waiman Long To: Tejun Heo , Li Zefan , Johannes Weiner , Peter Zijlstra , Ingo Molnar Cc: cgroups@vger.kernel.org, linux-kernel@vger.kernel.org, linux-doc@vger.kernel.org, kernel-team@fb.com, pjt@google.com, luto@amacapital.net, Mike Galbraith , torvalds@linux-foundation.org, Roman Gushchin , Juri Lelli , Patrick Bellasi , Waiman Long Subject: [PATCH v9 3/7] cpuset: Add cpuset.sched.load_balance flag to v2 Date: Tue, 29 May 2018 09:41:30 -0400 Message-Id: <1527601294-3444-4-git-send-email-longman@redhat.com> In-Reply-To: <1527601294-3444-1-git-send-email-longman@redhat.com> References: <1527601294-3444-1-git-send-email-longman@redhat.com> Sender: linux-kernel-owner@vger.kernel.org List-ID: X-Mailing-List: linux-kernel@vger.kernel.org The sched.load_balance flag is needed to enable CPU isolation similar to what can be done with the "isolcpus" kernel boot parameter. Its value can only be changed in a scheduling domain with no child cpusets. On a non-scheduling domain cpuset, the value of sched.load_balance is inherited from its parent. This is to make sure that all the cpusets within the same scheduling domain or partition have the same load balancing state. This flag is set by the parent and is not delegatable. 
Signed-off-by: Waiman Long --- Documentation/cgroup-v2.txt | 26 +++++++++++++++++++++ kernel/cgroup/cpuset.c | 55 +++++++++++++++++++++++++++++++++++++++++---- 2 files changed, 77 insertions(+), 4 deletions(-) diff --git a/Documentation/cgroup-v2.txt b/Documentation/cgroup-v2.txt index e7534c5..681a809 100644 --- a/Documentation/cgroup-v2.txt +++ b/Documentation/cgroup-v2.txt @@ -1542,6 +1542,32 @@ Cpuset Interface Files Further changes made to "cpuset.cpus" is allowed as long as the first condition above is still true. + A parent scheduling domain root cgroup cannot distribute all + its CPUs to its child scheduling domain root cgroups unless + its load balancing flag is turned off. + + cpuset.sched.load_balance + A read-write single value file which exists on non-root + cpuset-enabled cgroups. It is a binary value flag that accepts + either "0" (off) or "1" (on). This flag is set by the parent + and is not delegatable. It is on by default in the root cgroup. + + When it is on, tasks within this cpuset will be load-balanced + by the kernel scheduler. Tasks will be moved from CPUs with + high load to other CPUs within the same cpuset with less load + periodically. + + When it is off, there will be no load balancing among CPUs on + this cgroup. Tasks will stay in the CPUs they are running on + and will not be moved to other CPUs. + + The load balancing state of a cgroup can only be changed on a + scheduling domain root cgroup with no cpuset-enabled children. + All cgroups within a scheduling domain or partition must have + the same load balancing state. As descendant cgroups of a + scheduling domain root are created, they inherit the same load + balancing state of their root. 
+ Device controller ----------------- diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 405b072..b94d4a0 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -510,7 +510,7 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial) par = parent_cs(cur); - /* On legacy hiearchy, we must be a subset of our parent cpuset. */ + /* On legacy hierarchy, we must be a subset of our parent cpuset. */ ret = -EACCES; if (!is_in_v2_mode() && !is_cpuset_subset(trial, par)) goto out; @@ -1063,6 +1063,14 @@ static int update_isolated_cpumask(struct cpuset *cpuset, goto out; /* + * A parent can't distribute all its CPUs to child scheduling + * domain root cpusets unless load balancing is off. + */ + if (adding & !deleting && is_sched_load_balance(parent) && + cpumask_equal(addmask, parent->effective_cpus)) + goto out; + + /* * Check if any CPUs in addmask or delmask are in a sibling cpuset. * An empty sibling cpus_allowed means it is the same as parent's * effective_cpus. This checking is skipped if the cpuset is dying. @@ -1540,6 +1548,18 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, domain_flag_changed = (is_sched_domain_root(cs) != is_sched_domain_root(trialcs)); + /* + * On default hierarchy, a load balance flag change is only allowed + * in a scheduling domain root with no child cpuset as all the + * cpusets within the same scheduling domain/partition must have the + * same load balancing state. + */ + if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys) && balance_flag_changed && + (!is_sched_domain_root(cs) || css_has_online_children(&cs->css))) { + err = -EINVAL; + goto out; + } + if (domain_flag_changed) { err = turning_on ? 
update_isolated_cpumask(cs, NULL, cs->cpus_allowed) @@ -2196,6 +2216,14 @@ static s64 cpuset_read_s64(struct cgroup_subsys_state *css, struct cftype *cft) .flags = CFTYPE_NOT_ON_ROOT, }, + { + .name = "sched.load_balance", + .read_u64 = cpuset_read_u64, + .write_u64 = cpuset_write_u64, + .private = FILE_SCHED_LOAD_BALANCE, + .flags = CFTYPE_NOT_ON_ROOT, + }, + { } /* terminate */ }; @@ -2209,19 +2237,38 @@ static s64 cpuset_read_s64(struct cgroup_subsys_state *css, struct cftype *cft) cpuset_css_alloc(struct cgroup_subsys_state *parent_css) { struct cpuset *cs; + struct cgroup_subsys_state *errptr = ERR_PTR(-ENOMEM); if (!parent_css) return &top_cpuset.css; cs = kzalloc(sizeof(*cs), GFP_KERNEL); if (!cs) - return ERR_PTR(-ENOMEM); + return errptr; if (!alloc_cpumask_var(&cs->cpus_allowed, GFP_KERNEL)) goto free_cs; if (!alloc_cpumask_var(&cs->effective_cpus, GFP_KERNEL)) goto free_cpus; - set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags); + /* + * On default hierarchy, inherit parent's CS_SCHED_LOAD_BALANCE flag. + * Creating new cpuset is also not allowed if the effective_cpus of + * its parent is empty. 
+ */ + if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys)) { + struct cpuset *parent = css_cs(parent_css); + + if (test_bit(CS_SCHED_LOAD_BALANCE, &parent->flags)) + set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags); + + if (cpumask_empty(parent->effective_cpus)) { + errptr = ERR_PTR(-EINVAL); + goto free_cpus; + } + } else { + set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags); + } + cpumask_clear(cs->cpus_allowed); nodes_clear(cs->mems_allowed); cpumask_clear(cs->effective_cpus); @@ -2235,7 +2282,7 @@ static s64 cpuset_read_s64(struct cgroup_subsys_state *css, struct cftype *cft) free_cpumask_var(cs->cpus_allowed); free_cs: kfree(cs); - return ERR_PTR(-ENOMEM); + return errptr; } static int cpuset_css_online(struct cgroup_subsys_state *css) -- 1.8.3.1 From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.1 (2015-04-28) on archive.lwn.net X-Spam-Level: X-Spam-Status: No, score=-5.8 required=5.0 tests=HEADER_FROM_DIFFERENT_DOMAINS, MAILING_LIST_MULTI,RCVD_IN_DNSWL_HI autolearn=unavailable autolearn_force=no version=3.4.1 Received: from vger.kernel.org (vger.kernel.org [209.132.180.67]) by archive.lwn.net (Postfix) with ESMTP id 308007DF87 for ; Tue, 29 May 2018 13:45:29 +0000 (UTC) Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S934078AbeE2Nnp (ORCPT ); Tue, 29 May 2018 09:43:45 -0400 Received: from mx3-rdu2.redhat.com ([66.187.233.73]:54510 "EHLO mx1.redhat.com" rhost-flags-OK-OK-OK-FAIL) by vger.kernel.org with ESMTP id S934316AbeE2NmN (ORCPT ); Tue, 29 May 2018 09:42:13 -0400 Received: from smtp.corp.redhat.com (int-mx05.intmail.prod.int.rdu2.redhat.com [10.11.54.5]) (using TLSv1.2 with cipher AECDH-AES256-SHA (256/256 bits)) (No client certificate requested) by mx1.redhat.com (Postfix) with ESMTPS id E6B7D401EF09; Tue, 29 May 2018 13:42:12 +0000 (UTC) Received: from llong.com (dhcp-17-81.bos.redhat.com [10.18.17.81]) by smtp.corp.redhat.com (Postfix) with ESMTP id 1FC986466D; Tue, 29 May 2018 13:42:12 +0000 (UTC) 
From: Waiman Long To: Tejun Heo , Li Zefan , Johannes Weiner , Peter Zijlstra , Ingo Molnar Cc: cgroups@vger.kernel.org, linux-kernel@vger.kernel.org, linux-doc@vger.kernel.org, kernel-team@fb.com, pjt@google.com, luto@amacapital.net, Mike Galbraith , torvalds@linux-foundation.org, Roman Gushchin , Juri Lelli , Patrick Bellasi , Waiman Long Subject: [PATCH v9 3/7] cpuset: Add cpuset.sched.load_balance flag to v2 Date: Tue, 29 May 2018 09:41:30 -0400 Message-Id: <1527601294-3444-4-git-send-email-longman@redhat.com> In-Reply-To: <1527601294-3444-1-git-send-email-longman@redhat.com> References: <1527601294-3444-1-git-send-email-longman@redhat.com> X-Scanned-By: MIMEDefang 2.79 on 10.11.54.5 X-Greylist: Sender IP whitelisted, not delayed by milter-greylist-4.5.16 (mx1.redhat.com [10.11.55.5]); Tue, 29 May 2018 13:42:13 +0000 (UTC) X-Greylist: inspected by milter-greylist-4.5.16 (mx1.redhat.com [10.11.55.5]); Tue, 29 May 2018 13:42:13 +0000 (UTC) for IP:'10.11.54.5' DOMAIN:'int-mx05.intmail.prod.int.rdu2.redhat.com' HELO:'smtp.corp.redhat.com' FROM:'longman@redhat.com' RCPT:'' Sender: linux-doc-owner@vger.kernel.org Precedence: bulk List-ID: X-Mailing-List: linux-doc@vger.kernel.org The sched.load_balance flag is needed to enable CPU isolation similar to what can be done with the "isolcpus" kernel boot parameter. Its value can only be changed in a scheduling domain with no child cpusets. On a non-scheduling domain cpuset, the value of sched.load_balance is inherited from its parent. This is to make sure that all the cpusets within the same scheduling domain or partition have the same load balancing state. This flag is set by the parent and is not delegatable. 
Signed-off-by: Waiman Long --- Documentation/cgroup-v2.txt | 26 +++++++++++++++++++++ kernel/cgroup/cpuset.c | 55 +++++++++++++++++++++++++++++++++++++++++---- 2 files changed, 77 insertions(+), 4 deletions(-) diff --git a/Documentation/cgroup-v2.txt b/Documentation/cgroup-v2.txt index e7534c5..681a809 100644 --- a/Documentation/cgroup-v2.txt +++ b/Documentation/cgroup-v2.txt @@ -1542,6 +1542,32 @@ Cpuset Interface Files Further changes made to "cpuset.cpus" is allowed as long as the first condition above is still true. + A parent scheduling domain root cgroup cannot distribute all + its CPUs to its child scheduling domain root cgroups unless + its load balancing flag is turned off. + + cpuset.sched.load_balance + A read-write single value file which exists on non-root + cpuset-enabled cgroups. It is a binary value flag that accepts + either "0" (off) or "1" (on). This flag is set by the parent + and is not delegatable. It is on by default in the root cgroup. + + When it is on, tasks within this cpuset will be load-balanced + by the kernel scheduler. Tasks will be moved from CPUs with + high load to other CPUs within the same cpuset with less load + periodically. + + When it is off, there will be no load balancing among CPUs on + this cgroup. Tasks will stay in the CPUs they are running on + and will not be moved to other CPUs. + + The load balancing state of a cgroup can only be changed on a + scheduling domain root cgroup with no cpuset-enabled children. + All cgroups within a scheduling domain or partition must have + the same load balancing state. As descendant cgroups of a + scheduling domain root are created, they inherit the same load + balancing state of their root. 
+ Device controller ----------------- diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 405b072..b94d4a0 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -510,7 +510,7 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial) par = parent_cs(cur); - /* On legacy hiearchy, we must be a subset of our parent cpuset. */ + /* On legacy hierarchy, we must be a subset of our parent cpuset. */ ret = -EACCES; if (!is_in_v2_mode() && !is_cpuset_subset(trial, par)) goto out; @@ -1063,6 +1063,14 @@ static int update_isolated_cpumask(struct cpuset *cpuset, goto out; /* + * A parent can't distribute all its CPUs to child scheduling + * domain root cpusets unless load balancing is off. + */ + if (adding & !deleting && is_sched_load_balance(parent) && + cpumask_equal(addmask, parent->effective_cpus)) + goto out; + + /* * Check if any CPUs in addmask or delmask are in a sibling cpuset. * An empty sibling cpus_allowed means it is the same as parent's * effective_cpus. This checking is skipped if the cpuset is dying. @@ -1540,6 +1548,18 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, domain_flag_changed = (is_sched_domain_root(cs) != is_sched_domain_root(trialcs)); + /* + * On default hierarchy, a load balance flag change is only allowed + * in a scheduling domain root with no child cpuset as all the + * cpusets within the same scheduling domain/partition must have the + * same load balancing state. + */ + if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys) && balance_flag_changed && + (!is_sched_domain_root(cs) || css_has_online_children(&cs->css))) { + err = -EINVAL; + goto out; + } + if (domain_flag_changed) { err = turning_on ? 
update_isolated_cpumask(cs, NULL, cs->cpus_allowed) @@ -2196,6 +2216,14 @@ static s64 cpuset_read_s64(struct cgroup_subsys_state *css, struct cftype *cft) .flags = CFTYPE_NOT_ON_ROOT, }, + { + .name = "sched.load_balance", + .read_u64 = cpuset_read_u64, + .write_u64 = cpuset_write_u64, + .private = FILE_SCHED_LOAD_BALANCE, + .flags = CFTYPE_NOT_ON_ROOT, + }, + { } /* terminate */ }; @@ -2209,19 +2237,38 @@ static s64 cpuset_read_s64(struct cgroup_subsys_state *css, struct cftype *cft) cpuset_css_alloc(struct cgroup_subsys_state *parent_css) { struct cpuset *cs; + struct cgroup_subsys_state *errptr = ERR_PTR(-ENOMEM); if (!parent_css) return &top_cpuset.css; cs = kzalloc(sizeof(*cs), GFP_KERNEL); if (!cs) - return ERR_PTR(-ENOMEM); + return errptr; if (!alloc_cpumask_var(&cs->cpus_allowed, GFP_KERNEL)) goto free_cs; if (!alloc_cpumask_var(&cs->effective_cpus, GFP_KERNEL)) goto free_cpus; - set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags); + /* + * On default hierarchy, inherit parent's CS_SCHED_LOAD_BALANCE flag. + * Creating new cpuset is also not allowed if the effective_cpus of + * its parent is empty. 
+ */ + if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys)) { + struct cpuset *parent = css_cs(parent_css); + + if (test_bit(CS_SCHED_LOAD_BALANCE, &parent->flags)) + set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags); + + if (cpumask_empty(parent->effective_cpus)) { + errptr = ERR_PTR(-EINVAL); + goto free_cpus; + } + } else { + set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags); + } + cpumask_clear(cs->cpus_allowed); nodes_clear(cs->mems_allowed); cpumask_clear(cs->effective_cpus); @@ -2235,7 +2282,7 @@ static s64 cpuset_read_s64(struct cgroup_subsys_state *css, struct cftype *cft) free_cpumask_var(cs->cpus_allowed); free_cs: kfree(cs); - return ERR_PTR(-ENOMEM); + return errptr; } static int cpuset_css_online(struct cgroup_subsys_state *css) -- 1.8.3.1 -- To unsubscribe from this list: send the line "unsubscribe linux-doc" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html