From: Waiman Long <longman@redhat.com>
To: Tejun Heo <tj@kernel.org>, Li Zefan <lizefan@huawei.com>,
	Johannes Weiner <hannes@cmpxchg.org>,
	Peter Zijlstra <peterz@infradead.org>,
	Ingo Molnar <mingo@redhat.com>
Cc: cgroups@vger.kernel.org, linux-kernel@vger.kernel.org,
	linux-doc@vger.kernel.org, kernel-team@fb.com, pjt@google.com,
	luto@amacapital.net, Mike Galbraith <efault@gmx.de>,
	torvalds@linux-foundation.org, Roman Gushchin <guro@fb.com>,
	Juri Lelli <juri.lelli@redhat.com>,
	Patrick Bellasi <patrick.bellasi@arm.com>,
	Tom Hromatka <tom.hromatka@oracle.com>,
	Waiman Long <longman@redhat.com>
Subject: [PATCH v14 07/12] cpuset: Make CPU hotplug work with partition
Date: Mon, 15 Oct 2018 16:29:32 -0400
Message-ID: <1539635377-22335-8-git-send-email-longman@redhat.com>
In-Reply-To: <1539635377-22335-1-git-send-email-longman@redhat.com>

When there is a CPU hotplug event (CPU online or offline), the
partitions may need to be reconfigured and regenerated. So code is
added to the hotplug handlers to make them work with the new
subparts_cpus mask and compute the right effective_cpus for each of
the affected cpusets. A hotplug event may also change the state of a
partition root from a real one to an erroneous one, or vice versa.
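
As an illustration, consider the new computation for a cpuset whose
parent has subpartition CPUs (the CPU numbers below are made up for
this sketch, not taken from the patch):

    /*
     * Minimal sketch mirroring the compute_effective_cpumask()
     * change below; all mask values are illustrative only.
     *
     *   parent->effective_cpus = 2-3
     *   parent->subparts_cpus  = 0-1  (CPU 1 just went offline but
     *                                  is deliberately kept here)
     *   cs->cpus_allowed       = 0-1
     *   cpu_active_mask        = 0,2-3
     */
    cpumask_or(new_cpus, parent->effective_cpus,
               parent->subparts_cpus);                  /* 0-3 */
    cpumask_and(new_cpus, new_cpus, cs->cpus_allowed);  /* 0-1 */
    cpumask_and(new_cpus, new_cpus, cpu_active_mask);   /* 0   */

Without the cpu_active_mask step, the offlined CPU 1 would leak back
into effective_cpus through subparts_cpus. Had cs->cpus_allowed
contained only CPU 1, new_cpus would have become empty and the
partition root would be switched to the erroneous state.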

Signed-off-by: Waiman Long <longman@redhat.com>
---
 kernel/cgroup/cpuset.c | 131 ++++++++++++++++++++++++++++++++++++-----
 1 file changed, 116 insertions(+), 15 deletions(-)
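
The partition state handling added to cpuset_hotplug_update_tasks()
condenses to roughly the following sketch. This is illustrative only
(it elides the nr_subparts_cpus cleanup and the nested empty-mask
recheck); see the diff below for the exact flow.

    /*
     * A real partition root is demoted when its new effective_cpus
     * would become empty or when its parent has itself turned
     * erroneous ...
     */
    if (is_partition_root(cs) && (cpumask_empty(&new_cpus) ||
        parent->partition_root_state == PRS_ERROR)) {
        update_parent_subparts_cpumask(cs, partcmd_disable, NULL, tmp);
        cs->partition_root_state = PRS_ERROR;
        cpuset_force_rebuild();
    }

    /*
     * ... while an erroneous partition root is given a chance to
     * become a real one again whenever partcmd_update against the
     * parent succeeds.
     */
    if (is_partition_root(parent) &&
        (cs->partition_root_state == PRS_ERROR ||
         !cpumask_intersects(&new_cpus, parent->subparts_cpus)) &&
        update_parent_subparts_cpumask(cs, partcmd_update, NULL, tmp))
        cpuset_force_rebuild();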

diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 596e0b16fb96..2e9fab21ea57 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -113,6 +113,9 @@ struct cpuset {
 	 * CPUs allocated to child sub-partitions (default hierarchy only)
 	 * - CPUs granted by the parent = effective_cpus U subparts_cpus
 	 * - effective_cpus and subparts_cpus are mutually exclusive.
+	 *
+	 * effective_cpus contains only onlined CPUs, but subparts_cpus
+	 * may have offlined ones.
 	 */
 	cpumask_var_t subparts_cpus;
 
@@ -994,7 +997,9 @@ static void update_tasks_cpumask(struct cpuset *cs)
  * @parent: the parent cpuset
  *
  * If the parent has subpartition CPUs, include them in the list of
- * allowable CPUs in computing the new effective_cpus mask.
+ * allowable CPUs in computing the new effective_cpus mask. Since offlined
+ * CPUs are not removed from subparts_cpus, we have to use cpu_active_mask
+ * to mask those out.
  */
 static void compute_effective_cpumask(struct cpumask *new_cpus,
 				      struct cpuset *cs, struct cpuset *parent)
@@ -1003,6 +1008,7 @@ static void compute_effective_cpumask(struct cpumask *new_cpus,
 		cpumask_or(new_cpus, parent->effective_cpus,
 			   parent->subparts_cpus);
 		cpumask_and(new_cpus, new_cpus, cs->cpus_allowed);
+		cpumask_and(new_cpus, new_cpus, cpu_active_mask);
 	} else {
 		cpumask_and(new_cpus, cs->cpus_allowed, parent->effective_cpus);
 	}
@@ -1125,9 +1131,20 @@ static int update_parent_subparts_cpumask(struct cpuset *cpuset, int cmd,
 		/*
 		 * Return error if the new effective_cpus could become empty.
 		 */
-		if (adding && !deleting &&
-		    cpumask_equal(parent->effective_cpus, tmp->addmask))
-			return -EINVAL;
+		if (adding &&
+		    cpumask_equal(parent->effective_cpus, tmp->addmask)) {
+			if (!deleting)
+				return -EINVAL;
+			/*
+			 * As some of the CPUs in subparts_cpus might have
+			 * been offlined, we need to compute the real delmask
+			 * to confirm that.
+			 */
+			if (!cpumask_and(tmp->addmask, tmp->delmask,
+					 cpu_active_mask))
+				return -EINVAL;
+			cpumask_copy(tmp->addmask, parent->effective_cpus);
+		}
 	} else {
 		/*
 		 * partcmd_update w/o newmask:
@@ -1197,6 +1214,10 @@ static int update_parent_subparts_cpumask(struct cpuset *cpuset, int cmd,
 	if (deleting) {
 		cpumask_andnot(parent->subparts_cpus,
 			       parent->subparts_cpus, tmp->delmask);
+		/*
+		 * Some of the CPUs in subparts_cpus might have been offlined.
+		 */
+		cpumask_and(tmp->delmask, tmp->delmask, cpu_active_mask);
 		cpumask_or(parent->effective_cpus,
 			   parent->effective_cpus, tmp->delmask);
 	}
@@ -2858,20 +2879,29 @@ hotplug_update_tasks(struct cpuset *cs,
 		update_tasks_nodemask(cs);
 }
 
+static bool force_rebuild;
+
+void cpuset_force_rebuild(void)
+{
+	force_rebuild = true;
+}
+
 /**
  * cpuset_hotplug_update_tasks - update tasks in a cpuset for hotunplug
  * @cs: cpuset in interest
+ * @tmp: the tmpmasks structure pointer
  *
  * Compare @cs's cpu and mem masks against top_cpuset and if some have gone
  * offline, update @cs accordingly.  If @cs ends up with no CPU or memory,
  * all its tasks are moved to the nearest ancestor with both resources.
  */
-static void cpuset_hotplug_update_tasks(struct cpuset *cs)
+static void cpuset_hotplug_update_tasks(struct cpuset *cs, struct tmpmasks *tmp)
 {
 	static cpumask_t new_cpus;
 	static nodemask_t new_mems;
 	bool cpus_updated;
 	bool mems_updated;
+	struct cpuset *parent;
 retry:
 	wait_event(cpuset_attach_wq, cs->attach_in_progress == 0);
 
@@ -2886,9 +2916,60 @@ static void cpuset_hotplug_update_tasks(struct cpuset *cs)
 		goto retry;
 	}
 
-	cpumask_and(&new_cpus, cs->cpus_allowed, parent_cs(cs)->effective_cpus);
-	nodes_and(new_mems, cs->mems_allowed, parent_cs(cs)->effective_mems);
+	parent = parent_cs(cs);
+	compute_effective_cpumask(&new_cpus, cs, parent);
+	nodes_and(new_mems, cs->mems_allowed, parent->effective_mems);
+
+	if (cs->nr_subparts_cpus)
+		/*
+		 * Make sure that CPUs allocated to child partitions
+		 * do not show up in effective_cpus.
+		 */
+		cpumask_andnot(&new_cpus, &new_cpus, cs->subparts_cpus);
+
+	if (!tmp || !cs->partition_root_state)
+		goto update_tasks;
+
+	/*
+	 * In the unlikely event that a partition root has empty
+	 * effective_cpus or its parent becomes erroneous, we have to
+	 * transition it to the erroneous state.
+	 */
+	if (is_partition_root(cs) && (cpumask_empty(&new_cpus) ||
+	   (parent->partition_root_state == PRS_ERROR))) {
+		if (cs->nr_subparts_cpus) {
+			cs->nr_subparts_cpus = 0;
+			cpumask_clear(cs->subparts_cpus);
+			compute_effective_cpumask(&new_cpus, cs, parent);
+		}
 
+		/*
+		 * If the effective_cpus is empty because the child
+		 * partitions take away all the CPUs, we can keep
+		 * the current partition and let the child partitions
+		 * fight for available CPUs.
+		 */
+		if ((parent->partition_root_state == PRS_ERROR) ||
+		     cpumask_empty(&new_cpus)) {
+			update_parent_subparts_cpumask(cs, partcmd_disable,
+						       NULL, tmp);
+			cs->partition_root_state = PRS_ERROR;
+		}
+		cpuset_force_rebuild();
+	}
+
+	/*
+	 * On the other hand, an erroneous partition root may be transitioned
+	 * back to a regular one or a partition root with no CPU allocated
+	 * from the parent may change to erroneous.
+	 */
+	if (is_partition_root(parent) &&
+	   ((cs->partition_root_state == PRS_ERROR) ||
+	    !cpumask_intersects(&new_cpus, parent->subparts_cpus)) &&
+	     update_parent_subparts_cpumask(cs, partcmd_update, NULL, tmp))
+		cpuset_force_rebuild();
+
+update_tasks:
 	cpus_updated = !cpumask_equal(&new_cpus, cs->effective_cpus);
 	mems_updated = !nodes_equal(new_mems, cs->effective_mems);
 
@@ -2902,13 +2983,6 @@ static void cpuset_hotplug_update_tasks(struct cpuset *cs)
 	mutex_unlock(&cpuset_mutex);
 }
 
-static bool force_rebuild;
-
-void cpuset_force_rebuild(void)
-{
-	force_rebuild = true;
-}
-
 /**
  * cpuset_hotplug_workfn - handle CPU/memory hotunplug for a cpuset
  *
@@ -2931,6 +3005,10 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
 	static nodemask_t new_mems;
 	bool cpus_updated, mems_updated;
 	bool on_dfl = is_in_v2_mode();
+	struct tmpmasks tmp, *ptmp = NULL;
+
+	if (on_dfl && !alloc_cpumasks(NULL, &tmp))
+		ptmp = &tmp;
 
 	mutex_lock(&cpuset_mutex);
 
@@ -2938,6 +3016,11 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
 	cpumask_copy(&new_cpus, cpu_active_mask);
 	new_mems = node_states[N_MEMORY];
 
+	/*
+	 * If subparts_cpus is populated, it is likely that the check below
+	 * will produce a false positive on cpus_updated when the cpu list
+	 * isn't changed. It is extra work, but it is better to be safe.
+	 */
 	cpus_updated = !cpumask_equal(top_cpuset.effective_cpus, &new_cpus);
 	mems_updated = !nodes_equal(top_cpuset.effective_mems, new_mems);
 
@@ -2946,6 +3029,22 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
 		spin_lock_irq(&callback_lock);
 		if (!on_dfl)
 			cpumask_copy(top_cpuset.cpus_allowed, &new_cpus);
+		/*
+		 * Make sure that CPUs allocated to child partitions
+		 * do not show up in effective_cpus. If no CPU is left,
+		 * we clear the subparts_cpus & let the child partitions
+		 * fight for the CPUs again.
+		 */
+		if (top_cpuset.nr_subparts_cpus) {
+			if (cpumask_subset(&new_cpus,
+					   top_cpuset.subparts_cpus)) {
+				top_cpuset.nr_subparts_cpus = 0;
+				cpumask_clear(top_cpuset.subparts_cpus);
+			} else {
+				cpumask_andnot(&new_cpus, &new_cpus,
+					       top_cpuset.subparts_cpus);
+			}
+		}
 		cpumask_copy(top_cpuset.effective_cpus, &new_cpus);
 		spin_unlock_irq(&callback_lock);
 		/* we don't mess with cpumasks of tasks in top_cpuset */
@@ -2974,7 +3073,7 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
 				continue;
 			rcu_read_unlock();
 
-			cpuset_hotplug_update_tasks(cs);
+			cpuset_hotplug_update_tasks(cs, ptmp);
 
 			rcu_read_lock();
 			css_put(&cs->css);
@@ -2987,6 +3086,8 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
 		force_rebuild = false;
 		rebuild_sched_domains();
 	}
+
+	free_cpumasks(NULL, ptmp);
 }
 
 void cpuset_update_active_cpus(void)
-- 
2.18.0

