linux-kernel.vger.kernel.org archive mirror
* [PATCH] cgroup/cpuset: remove circular dependency deadlock
@ 2017-09-06 11:48 Prateek Sood
  2017-09-06 12:56 ` Waiman Long
  0 siblings, 1 reply; 19+ messages in thread
From: Prateek Sood @ 2017-09-06 11:48 UTC (permalink / raw)
  To: tj, lizefan, cgroups, mingo, longman; +Cc: Prateek Sood, linux-kernel, sramana

Remove a circular dependency deadlock in a scenario where a CPU is being
hotplugged while cgroup and cpuset updates are triggered from
userspace.

Example scenario:
kworker/0:0 => kthreadd => init:729 => init:1 => kworker/0:0

kworker/0:0 - lock(cpuhotplug.mutex)  [held]
              flush(work)   [no high prio workqueue available on CPU]
              wait_for_completion()

kthreadd    - percpu_down_read(cgroup_threadgroup_rwsem)  [waiting]

init:729    - percpu_down_write(cgroup_threadgroup_rwsem)   [held]
              lock(cpuset_mutex)   [waiting]

init:1      - lock(cpuset_mutex)   [held]
              lock(cpuhotplug.mutex)   [waiting]

Eliminate this dependency by reordering the locking of cpuset_mutex
and cpuhotplug.mutex in the following order:
1. Acquire cpuhotplug.mutex
2. Acquire cpuset_mutex
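
With this ordering, every user-space path that can end up rebuilding the
sched domains takes the locks roughly as in the sketch below (see the
cpuset_write_*() hunks in the diff):

    get_online_cpus();                /* 1. hotplug lock first */
    mutex_lock(&cpuset_mutex);        /* 2. cpuset_mutex nests inside it */
    /* ... update the cpuset, possibly rebuild_sched_domains_unlocked() ... */
    mutex_unlock(&cpuset_mutex);
    put_online_cpus();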

Signed-off-by: Prateek Sood <prsood@codeaurora.org>
---
 kernel/cgroup/cpuset.c | 70 +++++++++++++++++++++++++++++++++++++++++++-------
 1 file changed, 61 insertions(+), 9 deletions(-)

diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 2f4039b..c7a3901 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -843,10 +843,41 @@ static void rebuild_sched_domains_locked(void)
 out:
 	put_online_cpus();
 }
+
+/*
+ * Rebuild scheduler domains.
+ * Call with following mutex held in the order
+ * 1. cpuhotplug.mutex
+ * 2. cpuset_mutex
+ */
+static void rebuild_sched_domains_unlocked(void)
+{
+        struct sched_domain_attr *attr;
+        cpumask_var_t *doms;
+        int ndoms;
+
+        /*
+         * We have raced with CPU hotplug. Don't do anything to avoid
+         * passing doms with offlined cpu to partition_sched_domains().
+         * Anyways, hotplug work item will rebuild sched domains.
+         */
+        if (!cpumask_equal(top_cpuset.effective_cpus, cpu_active_mask))
+                return;
+
+        /* Generate domain masks and attrs */
+        ndoms = generate_sched_domains(&doms, &attr);
+
+        /* Have scheduler rebuild the domains */
+        partition_sched_domains(ndoms, doms, attr);
+}
 #else /* !CONFIG_SMP */
 static void rebuild_sched_domains_locked(void)
 {
 }
+
+static void rebuild_sched_domains_unlocked(void)
+{
+}
 #endif /* CONFIG_SMP */
 
 void rebuild_sched_domains(void)
@@ -885,7 +916,9 @@ static void update_tasks_cpumask(struct cpuset *cs)
  *
  * On legacy hierachy, effective_cpus will be the same with cpu_allowed.
  *
- * Called with cpuset_mutex held
+ * Called with following mutex held in order
+ * 1. cpuhotplug.mutex
+ * 2. cpuset_mutex
  */
 static void update_cpumasks_hier(struct cpuset *cs, struct cpumask *new_cpus)
 {
@@ -940,7 +973,7 @@ static void update_cpumasks_hier(struct cpuset *cs, struct cpumask *new_cpus)
 	rcu_read_unlock();
 
 	if (need_rebuild_sched_domains)
-		rebuild_sched_domains_locked();
+		rebuild_sched_domains_unlocked();
 }
 
 /**
@@ -1262,6 +1295,11 @@ int current_cpuset_is_being_rebound(void)
 	return ret;
 }
 
+/*
+ * Call with following mutex held in order
+ * 1. cpuhotplug.mutex
+ * 2. cpuset_mutex
+ */
 static int update_relax_domain_level(struct cpuset *cs, s64 val)
 {
 #ifdef CONFIG_SMP
@@ -1273,7 +1311,7 @@ static int update_relax_domain_level(struct cpuset *cs, s64 val)
 		cs->relax_domain_level = val;
 		if (!cpumask_empty(cs->cpus_allowed) &&
 		    is_sched_load_balance(cs))
-			rebuild_sched_domains_locked();
+			rebuild_sched_domains_unlocked();
 	}
 
 	return 0;
@@ -1304,9 +1342,10 @@ static void update_tasks_flags(struct cpuset *cs)
  * cs:		the cpuset to update
  * turning_on: 	whether the flag is being set or cleared
  *
- * Call with cpuset_mutex held.
+ * Call with following mutex held in order
+ * 1. cpuhotplug.mutex
+ * 2. cpuset_mutex
  */
-
 static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
 		       int turning_on)
 {
@@ -1339,7 +1378,7 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
 	spin_unlock_irq(&callback_lock);
 
 	if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed)
-		rebuild_sched_domains_locked();
+		rebuild_sched_domains_unlocked();
 
 	if (spread_flag_changed)
 		update_tasks_flags(cs);
@@ -1607,6 +1646,7 @@ static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft,
 	cpuset_filetype_t type = cft->private;
 	int retval = 0;
 
+	get_online_cpus();
 	mutex_lock(&cpuset_mutex);
 	if (!is_cpuset_online(cs)) {
 		retval = -ENODEV;
@@ -1644,6 +1684,7 @@ static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft,
 	}
 out_unlock:
 	mutex_unlock(&cpuset_mutex);
+	put_online_cpus();
 	return retval;
 }
 
@@ -1654,6 +1695,7 @@ static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft,
 	cpuset_filetype_t type = cft->private;
 	int retval = -ENODEV;
 
+	get_online_cpus();
 	mutex_lock(&cpuset_mutex);
 	if (!is_cpuset_online(cs))
 		goto out_unlock;
@@ -1668,6 +1710,7 @@ static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft,
 	}
 out_unlock:
 	mutex_unlock(&cpuset_mutex);
+	put_online_cpus();
 	return retval;
 }
 
@@ -1706,6 +1749,7 @@ static ssize_t cpuset_write_resmask(struct kernfs_open_file *of,
 	kernfs_break_active_protection(of->kn);
 	flush_work(&cpuset_hotplug_work);
 
+	get_online_cpus();
 	mutex_lock(&cpuset_mutex);
 	if (!is_cpuset_online(cs))
 		goto out_unlock;
@@ -1731,6 +1775,7 @@ static ssize_t cpuset_write_resmask(struct kernfs_open_file *of,
 	free_trial_cpuset(trialcs);
 out_unlock:
 	mutex_unlock(&cpuset_mutex);
+	put_online_cpus();
 	kernfs_unbreak_active_protection(of->kn);
 	css_put(&cs->css);
 	flush_workqueue(cpuset_migrate_mm_wq);
@@ -2031,13 +2076,14 @@ static int cpuset_css_online(struct cgroup_subsys_state *css)
 /*
  * If the cpuset being removed has its flag 'sched_load_balance'
  * enabled, then simulate turning sched_load_balance off, which
- * will call rebuild_sched_domains_locked().
+ * will call rebuild_sched_domains_unlocked().
  */
 
 static void cpuset_css_offline(struct cgroup_subsys_state *css)
 {
 	struct cpuset *cs = css_cs(css);
 
+	get_online_cpus();
 	mutex_lock(&cpuset_mutex);
 
 	if (is_sched_load_balance(cs))
@@ -2047,6 +2093,7 @@ static void cpuset_css_offline(struct cgroup_subsys_state *css)
 	clear_bit(CS_ONLINE, &cs->flags);
 
 	mutex_unlock(&cpuset_mutex);
+	put_online_cpus();
 }
 
 static void cpuset_css_free(struct cgroup_subsys_state *css)
@@ -2341,8 +2388,13 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
 	}
 
 	/* rebuild sched domains if cpus_allowed has changed */
-	if (cpus_updated)
-		rebuild_sched_domains();
+	if (cpus_updated) {
+		get_online_cpus();
+		mutex_lock(&cpuset_mutex);
+		rebuild_sched_domains_unlocked();
+		mutex_unlock(&cpuset_mutex);
+		put_online_cpus();
+	}
 }
 
 void cpuset_update_active_cpus(void)
-- 
Qualcomm India Private Limited, on behalf of Qualcomm Innovation Center, Inc., 
is a member of Code Aurora Forum, a Linux Foundation Collaborative Project.

^ permalink raw reply related	[flat|nested] 19+ messages in thread

* Re: [PATCH] cgroup/cpuset: remove circular dependency deadlock
  2017-09-06 11:48 [PATCH] cgroup/cpuset: remove circular dependency deadlock Prateek Sood
@ 2017-09-06 12:56 ` Waiman Long
  2017-09-06 14:23   ` Prateek Sood
  0 siblings, 1 reply; 19+ messages in thread
From: Waiman Long @ 2017-09-06 12:56 UTC (permalink / raw)
  To: Prateek Sood, tj, lizefan, cgroups, mingo; +Cc: linux-kernel, sramana

On 09/06/2017 07:48 AM, Prateek Sood wrote:
> Remove circular dependency deadlock in a scenario where hotplug of CPU is
> being done while there is updation in cgroup and cpuset triggered from
> userspace.
>
> Example scenario:
> kworker/0:0 => kthreadd => init:729 => init:1 => kworker/0:0
>
> kworker/0:0 - lock(cpuhotplug.mutex)  [held]
>               flush(work)   [no high prio workqueue available on CPU]
>               wait_for_completion()
>
> kthreadd    - percpu_down_read(cgroup_threadgroup_rwsem)  [waiting]
>
> init:729    - percpu_down_write(cgroup_threadgroup_rwsem)   [held]
>               lock(cpuset_mutex)   [waiting]
>
> init:1      - lock(cpuset_mutex)   [held]
>               lock(cpuhotplug.mutex)   [waiting]
>
> Eliminate this dependecy by reordering locking of cpuset_mutex
> and cpuhotplug.mutex in following order
> 1. Acquire cpuhotplug.mutex
> 2. Acquire cpuset_mutex
>
> Signed-off-by: Prateek Sood <prsood@codeaurora.org>

Is this patch for the latest upstream kernel or 4.4? There is no
cpuhotplug.mutex anymore in upstream kernel. It is a per-cpu rwsem
cpu_hotplug_lock.
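
For reference, the upstream lock is roughly the following (simplified sketch
of kernel/cpu.c, not the exact code):

    DEFINE_STATIC_PERCPU_RWSEM(cpu_hotplug_lock);

    void cpus_read_lock(void)
    {
            percpu_down_read(&cpu_hotplug_lock);
    }

    void cpus_write_lock(void)
    {
            percpu_down_write(&cpu_hotplug_lock);
    }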

Cheers,
Longman

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH] cgroup/cpuset: remove circular dependency deadlock
  2017-09-06 12:56 ` Waiman Long
@ 2017-09-06 14:23   ` Prateek Sood
  0 siblings, 0 replies; 19+ messages in thread
From: Prateek Sood @ 2017-09-06 14:23 UTC (permalink / raw)
  To: Waiman Long, tj, lizefan, cgroups, mingo; +Cc: linux-kernel, sramana

On 09/06/2017 06:26 PM, Waiman Long wrote:
> On 09/06/2017 07:48 AM, Prateek Sood wrote:
>> Remove circular dependency deadlock in a scenario where hotplug of CPU is
>> being done while there is updation in cgroup and cpuset triggered from
>> userspace.
>>
>> Example scenario:
>> kworker/0:0 => kthreadd => init:729 => init:1 => kworker/0:0
>>
>> kworker/0:0 - lock(cpuhotplug.mutex)  [held]
>>               flush(work)   [no high prio workqueue available on CPU]
>>               wait_for_completion()
>>
>> kthreadd    - percpu_down_read(cgroup_threadgroup_rwsem)  [waiting]
>>
>> init:729    - percpu_down_write(cgroup_threadgroup_rwsem)   [held]
>>               lock(cpuset_mutex)   [waiting]
>>
>> init:1      - lock(cpuset_mutex)   [held]
>>               lock(cpuhotplug.mutex)   [waiting]
>>
>> Eliminate this dependecy by reordering locking of cpuset_mutex
>> and cpuhotplug.mutex in following order
>> 1. Acquire cpuhotplug.mutex
>> 2. Acquire cpuset_mutex
>>
>> Signed-off-by: Prateek Sood <prsood@codeaurora.org>
> 
> Is this patch for the latest upstream kernel or 4.4? There is no
> cpuhotplug.mutex anymore in upstream kernel. It is a per-cpu rwsem
> cpu_hotplug_lock.
> 
> Cheers,
> Longman
> 

Thanks for inputs, I will check latest kernel for details

Regards
Prateek

-- 
Qualcomm India Private Limited, on behalf of Qualcomm Innovation
Center, Inc., is a member of Code Aurora Forum, a Linux Foundation
Collaborative Project

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH] cgroup/cpuset: remove circular dependency deadlock
  2017-10-26 14:05             ` Waiman Long
@ 2017-10-27  8:03               ` Prateek Sood
  0 siblings, 0 replies; 19+ messages in thread
From: Prateek Sood @ 2017-10-27  8:03 UTC (permalink / raw)
  To: Waiman Long, peterz, tj, lizefan, mingo, boqun.feng, tglx
  Cc: cgroups, sramana, linux-kernel

On 10/26/2017 07:35 PM, Waiman Long wrote:
> On 10/26/2017 07:52 AM, Prateek Sood wrote:
>> Remove circular dependency deadlock in a scenario where hotplug of CPU is
>> being done while there is updation in cgroup and cpuset triggered from
>> userspace.
>>
>> Process A => kthreadd => Process B => Process C => Process A
>>
>> Process A
>> cpu_subsys_offline();
>>   cpu_down();
>>     _cpu_down();
>>       percpu_down_write(&cpu_hotplug_lock); //held
>>       cpuhp_invoke_callback();
>>         workqueue_offline_cpu();
>>           wq_update_unbound_numa();
>>             kthread_create_on_node();
>>               wake_up_process();  //wakeup kthreadd
>>           flush_work();
>>           wait_for_completion();
>>
>> kthreadd
>> kthreadd();
>>   kernel_thread();
>>     do_fork();
>>       copy_process();
>>         percpu_down_read(&cgroup_threadgroup_rwsem);
>>           __rwsem_down_read_failed_common(); //waiting
>>
>> Process B
>> kernfs_fop_write();
>>   cgroup_file_write();
>>     cgroup_procs_write();
>>       percpu_down_write(&cgroup_threadgroup_rwsem); //held
>>       cgroup_attach_task();
>>         cgroup_migrate();
>>           cgroup_migrate_execute();
>>             cpuset_can_attach();
>>               mutex_lock(&cpuset_mutex); //waiting
>>
>> Process C
>> kernfs_fop_write();
>>   cgroup_file_write();
>>     cpuset_write_resmask();
>>       mutex_lock(&cpuset_mutex); //held
>>       update_cpumask();
>>         update_cpumasks_hier();
>>           rebuild_sched_domains_locked();
>>             get_online_cpus();
>>               percpu_down_read(&cpu_hotplug_lock); //waiting
>>
>> Eliminating deadlock by reversing the locking order for cpuset_mutex and
>> cpu_hotplug_lock.
> 
> General comments:
> 
> Please add a version number of your patch. I have seen multiple versions
> of this patch and have lost track how many are there as there is no
> version number information.  In addition, there are changes beyond just
> swapping the lock order and they are not documented in this change log.
> I would like to see you discuss about those additional changes here as well.
Thanks for the comments, Longman. I will introduce patch versioning and update
the commit text to document the extra changes.

Explanation of the extra changes in this patch:
After inverting the locking order of cpu_hotplug_lock and cpuset_mutex, the
work done by cpuset_hotplug_workfn() can be performed synchronously from the
context doing the CPU hotplug. The extra changes in this patch therefore
remove the queuing of cpuset_hotplug_workfn() as a work item for the CPU
hotplug path. For memory hotplug it is still queued as a work item; the
sketch below shows the resulting shape.
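
A rough sketch of that shape, condensed from the patch hunks:

    static void cpuset_hotplug(bool use_cpu_hp_lock)
    {
            ...
            /* rebuild sched domains if cpus_allowed has changed */
            if (cpus_updated) {
                    if (use_cpu_hp_lock)
                            rebuild_sched_domains();
                    else {
                            /* hotplug path: cpu_hotplug_lock is already
                             * held by the thread doing the hotplug
                             */
                            mutex_lock(&cpuset_mutex);
                            rebuild_sched_domains_cpuslocked();
                            mutex_unlock(&cpuset_mutex);
                    }
            }
    }

    /* memory hotplug: still deferred to a work item */
    static void cpuset_hotplug_workfn(struct work_struct *work)
    {
            cpuset_hotplug(true);
    }

    /* CPU hotplug: now handled synchronously */
    void cpuset_update_active_cpus(void)
    {
            cpuset_hotplug(false);
    }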


This suggestion came from Peter.
Peter, could you please elaborate if I have missed anything?

> 
>>  void rebuild_sched_domains(void)
>>  {
>> +	cpus_read_lock();
>>  	mutex_lock(&cpuset_mutex);
>> -	rebuild_sched_domains_locked();
>> +	rebuild_sched_domains_cpuslocked();
>>  	mutex_unlock(&cpuset_mutex);
>> +	cpus_read_unlock();
>>  }
> 
> I saw a lot of instances where cpus_read_lock() and mutex_lock() come
> together. Maybe some new lock/unlock helper functions may help.
OK, I will introduce a single wrapper for locking and unlocking
both locks; a possible shape is sketched below.
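
A minimal sketch of what such a wrapper pair could look like (the names here
are only placeholders, not from the patch):

    static void cpuset_sched_change_begin(void)
    {
            cpus_read_lock();
            mutex_lock(&cpuset_mutex);
    }

    static void cpuset_sched_change_end(void)
    {
            mutex_unlock(&cpuset_mutex);
            cpus_read_unlock();
    }

Call sites such as cpuset_write_u64() would then bracket their critical
section with these helpers instead of open-coding both locks.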
 
> 
>> @@ -2356,25 +2354,29 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
>>  	}
>>  
>>  	/* rebuild sched domains if cpus_allowed has changed */
>> -	if (cpus_updated || force_rebuild) {
>> -		force_rebuild = false;
>> -		rebuild_sched_domains();
>> +	if (cpus_updated) {
>> +		if (use_cpu_hp_lock)
>> +			rebuild_sched_domains();
>> +		else {
>> +			/* When called during cpu hotplug cpu_hotplug_lock
>> +			 * is held by the calling thread, not
>> +			 * not cpuhp_thread_fun
>> +			 */
> 
> ??? The comment is not clear.

The following is the scenario described by the comment:

Process A
_cpu_down()
   cpus_write_lock() //cpu_hotplug_lock held
      cpuhp_kick_ap_work()
         cpuhp_kick_ap()
            wake_up_process() // wake up cpuhp_thread_fun
            wait_for_ap_thread() //wait for hotplug thread to signal completion


cpuhp_thread_fun()
   cpuhp_invoke_callback()
     sched_cpu_deactivate()
        cpuset_cpu_inactive()
           cpuset_update_active_cpus()
             cpuset_hotplug(false) // do not use cpu_hotplug_lock from _cpu_down() path
                
I will update the comment in the next version of the patch to elaborate further.

> 
> Cheers,
> Longman
> 


-- 
Qualcomm India Private Limited, on behalf of Qualcomm Innovation
Center, Inc., is a member of Code Aurora Forum, a Linux Foundation
Collaborative Project

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH] cgroup/cpuset: remove circular dependency deadlock
  2017-10-26 11:52           ` Prateek Sood
@ 2017-10-26 14:05             ` Waiman Long
  2017-10-27  8:03               ` Prateek Sood
  0 siblings, 1 reply; 19+ messages in thread
From: Waiman Long @ 2017-10-26 14:05 UTC (permalink / raw)
  To: Prateek Sood, peterz, tj, lizefan, mingo, boqun.feng, tglx
  Cc: cgroups, sramana, linux-kernel

On 10/26/2017 07:52 AM, Prateek Sood wrote:
> Remove circular dependency deadlock in a scenario where hotplug of CPU is
> being done while there is updation in cgroup and cpuset triggered from
> userspace.
>
> Process A => kthreadd => Process B => Process C => Process A
>
> Process A
> cpu_subsys_offline();
>   cpu_down();
>     _cpu_down();
>       percpu_down_write(&cpu_hotplug_lock); //held
>       cpuhp_invoke_callback();
>         workqueue_offline_cpu();
>           wq_update_unbound_numa();
>             kthread_create_on_node();
>               wake_up_process();  //wakeup kthreadd
>           flush_work();
>           wait_for_completion();
>
> kthreadd
> kthreadd();
>   kernel_thread();
>     do_fork();
>       copy_process();
>         percpu_down_read(&cgroup_threadgroup_rwsem);
>           __rwsem_down_read_failed_common(); //waiting
>
> Process B
> kernfs_fop_write();
>   cgroup_file_write();
>     cgroup_procs_write();
>       percpu_down_write(&cgroup_threadgroup_rwsem); //held
>       cgroup_attach_task();
>         cgroup_migrate();
>           cgroup_migrate_execute();
>             cpuset_can_attach();
>               mutex_lock(&cpuset_mutex); //waiting
>
> Process C
> kernfs_fop_write();
>   cgroup_file_write();
>     cpuset_write_resmask();
>       mutex_lock(&cpuset_mutex); //held
>       update_cpumask();
>         update_cpumasks_hier();
>           rebuild_sched_domains_locked();
>             get_online_cpus();
>               percpu_down_read(&cpu_hotplug_lock); //waiting
>
> Eliminating deadlock by reversing the locking order for cpuset_mutex and
> cpu_hotplug_lock.

General comments:

Please add a version number of your patch. I have seen multiple versions
of this patch and have lost track how many are there as there is no
version number information.  In addition, there are changes beyond just
swapping the lock order and they are not documented in this change log.
I would like to see you discuss about those additional changes here as well.

>  void rebuild_sched_domains(void)
>  {
> +	cpus_read_lock();
>  	mutex_lock(&cpuset_mutex);
> -	rebuild_sched_domains_locked();
> +	rebuild_sched_domains_cpuslocked();
>  	mutex_unlock(&cpuset_mutex);
> +	cpus_read_unlock();
>  }

I saw a lot of instances where cpus_read_lock() and mutex_lock() come
together. Maybe some new lock/unlock helper functions may help.

> @@ -2356,25 +2354,29 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
>  	}
>  
>  	/* rebuild sched domains if cpus_allowed has changed */
> -	if (cpus_updated || force_rebuild) {
> -		force_rebuild = false;
> -		rebuild_sched_domains();
> +	if (cpus_updated) {
> +		if (use_cpu_hp_lock)
> +			rebuild_sched_domains();
> +		else {
> +			/* When called during cpu hotplug cpu_hotplug_lock
> +			 * is held by the calling thread, not
> +			 * not cpuhp_thread_fun
> +			 */

??? The comment is not clear.

Cheers,
Longman

^ permalink raw reply	[flat|nested] 19+ messages in thread

* [PATCH] cgroup/cpuset: remove circular dependency deadlock
  2017-10-25  9:30         ` Peter Zijlstra
@ 2017-10-26 11:52           ` Prateek Sood
  2017-10-26 14:05             ` Waiman Long
  0 siblings, 1 reply; 19+ messages in thread
From: Prateek Sood @ 2017-10-26 11:52 UTC (permalink / raw)
  To: peterz, tj, lizefan, mingo, longman, boqun.feng, tglx
  Cc: Prateek Sood, cgroups, sramana, linux-kernel

Remove a circular dependency deadlock in a scenario where a CPU is being
hotplugged while cgroup and cpuset updates are triggered from
userspace.

Process A => kthreadd => Process B => Process C => Process A

Process A
cpu_subsys_offline();
  cpu_down();
    _cpu_down();
      percpu_down_write(&cpu_hotplug_lock); //held
      cpuhp_invoke_callback();
        workqueue_offline_cpu();
          wq_update_unbound_numa();
            kthread_create_on_node();
              wake_up_process();  //wakeup kthreadd
          flush_work();
          wait_for_completion();

kthreadd
kthreadd();
  kernel_thread();
    do_fork();
      copy_process();
        percpu_down_read(&cgroup_threadgroup_rwsem);
          __rwsem_down_read_failed_common(); //waiting

Process B
kernfs_fop_write();
  cgroup_file_write();
    cgroup_procs_write();
      percpu_down_write(&cgroup_threadgroup_rwsem); //held
      cgroup_attach_task();
        cgroup_migrate();
          cgroup_migrate_execute();
            cpuset_can_attach();
              mutex_lock(&cpuset_mutex); //waiting

Process C
kernfs_fop_write();
  cgroup_file_write();
    cpuset_write_resmask();
      mutex_lock(&cpuset_mutex); //held
      update_cpumask();
        update_cpumasks_hier();
          rebuild_sched_domains_locked();
            get_online_cpus();
              percpu_down_read(&cpu_hotplug_lock); //waiting

Eliminate the deadlock by reversing the locking order for cpuset_mutex and
cpu_hotplug_lock.

Signed-off-by: Prateek Sood <prsood@codeaurora.org>
---
 include/linux/cpuset.h |  6 -----
 kernel/cgroup/cpuset.c | 70 ++++++++++++++++++++++++++------------------------
 kernel/power/process.c |  2 --
 kernel/sched/core.c    |  1 -
 4 files changed, 36 insertions(+), 43 deletions(-)

diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h
index a1e6a33..e74655d 100644
--- a/include/linux/cpuset.h
+++ b/include/linux/cpuset.h
@@ -51,9 +51,7 @@ static inline void cpuset_dec(void)
 
 extern int cpuset_init(void);
 extern void cpuset_init_smp(void);
-extern void cpuset_force_rebuild(void);
 extern void cpuset_update_active_cpus(void);
-extern void cpuset_wait_for_hotplug(void);
 extern void cpuset_cpus_allowed(struct task_struct *p, struct cpumask *mask);
 extern void cpuset_cpus_allowed_fallback(struct task_struct *p);
 extern nodemask_t cpuset_mems_allowed(struct task_struct *p);
@@ -166,15 +164,11 @@ static inline void set_mems_allowed(nodemask_t nodemask)
 static inline int cpuset_init(void) { return 0; }
 static inline void cpuset_init_smp(void) {}
 
-static inline void cpuset_force_rebuild(void) { }
-
 static inline void cpuset_update_active_cpus(void)
 {
 	partition_sched_domains(1, NULL, NULL);
 }
 
-static inline void cpuset_wait_for_hotplug(void) { }
-
 static inline void cpuset_cpus_allowed(struct task_struct *p,
 				       struct cpumask *mask)
 {
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 4657e29..a8213c2 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -826,16 +826,14 @@ static int generate_sched_domains(cpumask_var_t **domains,
  * 'cpus' is removed, then call this routine to rebuild the
  * scheduler's dynamic sched domains.
  *
- * Call with cpuset_mutex held.  Takes get_online_cpus().
  */
-static void rebuild_sched_domains_locked(void)
+static void rebuild_sched_domains_cpuslocked(void)
 {
 	struct sched_domain_attr *attr;
 	cpumask_var_t *doms;
 	int ndoms;
 
 	lockdep_assert_held(&cpuset_mutex);
-	get_online_cpus();
 
 	/*
 	 * We have raced with CPU hotplug. Don't do anything to avoid
@@ -843,27 +841,27 @@ static void rebuild_sched_domains_locked(void)
 	 * Anyways, hotplug work item will rebuild sched domains.
 	 */
 	if (!cpumask_equal(top_cpuset.effective_cpus, cpu_active_mask))
-		goto out;
+		return;
 
 	/* Generate domain masks and attrs */
 	ndoms = generate_sched_domains(&doms, &attr);
 
 	/* Have scheduler rebuild the domains */
 	partition_sched_domains(ndoms, doms, attr);
-out:
-	put_online_cpus();
 }
 #else /* !CONFIG_SMP */
-static void rebuild_sched_domains_locked(void)
+static void rebuild_sched_domains_cpuslocked(void)
 {
 }
 #endif /* CONFIG_SMP */
 
 void rebuild_sched_domains(void)
 {
+	cpus_read_lock();
 	mutex_lock(&cpuset_mutex);
-	rebuild_sched_domains_locked();
+	rebuild_sched_domains_cpuslocked();
 	mutex_unlock(&cpuset_mutex);
+	cpus_read_unlock();
 }
 
 /**
@@ -949,7 +947,7 @@ static void update_cpumasks_hier(struct cpuset *cs, struct cpumask *new_cpus)
 	rcu_read_unlock();
 
 	if (need_rebuild_sched_domains)
-		rebuild_sched_domains_locked();
+		rebuild_sched_domains_cpuslocked();
 }
 
 /**
@@ -1281,7 +1279,7 @@ static int update_relax_domain_level(struct cpuset *cs, s64 val)
 		cs->relax_domain_level = val;
 		if (!cpumask_empty(cs->cpus_allowed) &&
 		    is_sched_load_balance(cs))
-			rebuild_sched_domains_locked();
+			rebuild_sched_domains_cpuslocked();
 	}
 
 	return 0;
@@ -1314,7 +1312,6 @@ static void update_tasks_flags(struct cpuset *cs)
  *
  * Call with cpuset_mutex held.
  */
-
 static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
 		       int turning_on)
 {
@@ -1347,7 +1344,7 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
 	spin_unlock_irq(&callback_lock);
 
 	if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed)
-		rebuild_sched_domains_locked();
+		rebuild_sched_domains_cpuslocked();
 
 	if (spread_flag_changed)
 		update_tasks_flags(cs);
@@ -1615,6 +1612,7 @@ static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft,
 	cpuset_filetype_t type = cft->private;
 	int retval = 0;
 
+	cpus_read_lock();
 	mutex_lock(&cpuset_mutex);
 	if (!is_cpuset_online(cs)) {
 		retval = -ENODEV;
@@ -1652,6 +1650,7 @@ static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft,
 	}
 out_unlock:
 	mutex_unlock(&cpuset_mutex);
+	cpus_read_unlock();
 	return retval;
 }
 
@@ -1662,6 +1661,7 @@ static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft,
 	cpuset_filetype_t type = cft->private;
 	int retval = -ENODEV;
 
+	cpus_read_lock();
 	mutex_lock(&cpuset_mutex);
 	if (!is_cpuset_online(cs))
 		goto out_unlock;
@@ -1676,6 +1676,7 @@ static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft,
 	}
 out_unlock:
 	mutex_unlock(&cpuset_mutex);
+	cpus_read_unlock();
 	return retval;
 }
 
@@ -1714,6 +1715,7 @@ static ssize_t cpuset_write_resmask(struct kernfs_open_file *of,
 	kernfs_break_active_protection(of->kn);
 	flush_work(&cpuset_hotplug_work);
 
+	cpus_read_lock();
 	mutex_lock(&cpuset_mutex);
 	if (!is_cpuset_online(cs))
 		goto out_unlock;
@@ -1739,6 +1741,7 @@ static ssize_t cpuset_write_resmask(struct kernfs_open_file *of,
 	free_trial_cpuset(trialcs);
 out_unlock:
 	mutex_unlock(&cpuset_mutex);
+	cpus_read_unlock();
 	kernfs_unbreak_active_protection(of->kn);
 	css_put(&cs->css);
 	flush_workqueue(cpuset_migrate_mm_wq);
@@ -2039,13 +2042,14 @@ static int cpuset_css_online(struct cgroup_subsys_state *css)
 /*
  * If the cpuset being removed has its flag 'sched_load_balance'
  * enabled, then simulate turning sched_load_balance off, which
- * will call rebuild_sched_domains_locked().
+ * will call rebuild_sched_domains_cpuslocked().
  */
 
 static void cpuset_css_offline(struct cgroup_subsys_state *css)
 {
 	struct cpuset *cs = css_cs(css);
 
+	cpus_read_lock();
 	mutex_lock(&cpuset_mutex);
 
 	if (is_sched_load_balance(cs))
@@ -2055,6 +2059,7 @@ static void cpuset_css_offline(struct cgroup_subsys_state *css)
 	clear_bit(CS_ONLINE, &cs->flags);
 
 	mutex_unlock(&cpuset_mutex);
+	cpus_read_unlock();
 }
 
 static void cpuset_css_free(struct cgroup_subsys_state *css)
@@ -2275,15 +2280,8 @@ static void cpuset_hotplug_update_tasks(struct cpuset *cs)
 	mutex_unlock(&cpuset_mutex);
 }
 
-static bool force_rebuild;
-
-void cpuset_force_rebuild(void)
-{
-	force_rebuild = true;
-}
-
 /**
- * cpuset_hotplug_workfn - handle CPU/memory hotunplug for a cpuset
+ * cpuset_hotplug - handle CPU/memory hotunplug for a cpuset
  *
  * This function is called after either CPU or memory configuration has
  * changed and updates cpuset accordingly.  The top_cpuset is always
@@ -2298,7 +2296,7 @@ void cpuset_force_rebuild(void)
  * Note that CPU offlining during suspend is ignored.  We don't modify
  * cpusets across suspend/resume cycles at all.
  */
-static void cpuset_hotplug_workfn(struct work_struct *work)
+static void cpuset_hotplug(bool use_cpu_hp_lock)
 {
 	static cpumask_t new_cpus;
 	static nodemask_t new_mems;
@@ -2356,25 +2354,29 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
 	}
 
 	/* rebuild sched domains if cpus_allowed has changed */
-	if (cpus_updated || force_rebuild) {
-		force_rebuild = false;
-		rebuild_sched_domains();
+	if (cpus_updated) {
+		if (use_cpu_hp_lock)
+			rebuild_sched_domains();
+		else {
+			/* When called during cpu hotplug cpu_hotplug_lock
+			 * is held by the calling thread, not
+			 * not cpuhp_thread_fun
+			 */
+			mutex_lock(&cpuset_mutex);
+			rebuild_sched_domains_cpuslocked();
+			mutex_unlock(&cpuset_mutex);
+		}
 	}
 }
 
-void cpuset_update_active_cpus(void)
+static void cpuset_hotplug_workfn(struct work_struct *work)
 {
-	/*
-	 * We're inside cpu hotplug critical region which usually nests
-	 * inside cgroup synchronization.  Bounce actual hotplug processing
-	 * to a work item to avoid reverse locking order.
-	 */
-	schedule_work(&cpuset_hotplug_work);
+	cpuset_hotplug(true);
 }
 
-void cpuset_wait_for_hotplug(void)
+void cpuset_update_active_cpus(void)
 {
-	flush_work(&cpuset_hotplug_work);
+	cpuset_hotplug(false);
 }
 
 /*
diff --git a/kernel/power/process.c b/kernel/power/process.c
index 50f25cb..28772b405 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -203,8 +203,6 @@ void thaw_processes(void)
 	__usermodehelper_set_disable_depth(UMH_FREEZING);
 	thaw_workqueues();
 
-	cpuset_wait_for_hotplug();
-
 	read_lock(&tasklist_lock);
 	for_each_process_thread(g, p) {
 		/* No other threads should have PF_SUSPEND_TASK set */
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index d17c5da..25b8717 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5590,7 +5590,6 @@ static void cpuset_cpu_active(void)
 		 * restore the original sched domains by considering the
 		 * cpuset configurations.
 		 */
-		cpuset_force_rebuild();
 	}
 	cpuset_update_active_cpus();
 }
-- 
Qualcomm India Private Limited, on behalf of Qualcomm Innovation Center, Inc., 
is a member of Code Aurora Forum, a Linux Foundation Collaborative Project.

^ permalink raw reply related	[flat|nested] 19+ messages in thread

* Re: [PATCH] cgroup/cpuset: remove circular dependency deadlock
  2017-10-25  8:39       ` Prateek Sood
@ 2017-10-25  9:30         ` Peter Zijlstra
  2017-10-26 11:52           ` Prateek Sood
  0 siblings, 1 reply; 19+ messages in thread
From: Peter Zijlstra @ 2017-10-25  9:30 UTC (permalink / raw)
  To: Prateek Sood
  Cc: tj, lizefan, cgroups, mingo, longman, boqun.feng, tglx,
	linux-kernel, sramana

On Wed, Oct 25, 2017 at 02:09:54PM +0530, Prateek Sood wrote:
> >  void cpuset_update_active_cpus(void)
> >  {
> > +	mutex_lock(&cpuset_mutex);
> > +	rebuild_sched_domains_cpuslocked();
> > +	mutex_unlock(&cpuset_mutex);
> >  }
> 
> In the above patch rebuild_sched_domains_cpuslocked() has been
> used directly. Earlier cpuset_hotplug_update_tasks() was also
> called from cpuset_hotplug_workfn(). So migration of tasks
> related to cgroup which has empty cpuset would not happen
> during cpu hotplug.
> 
> 
> Could you please help in understanding more on this.
> 

That was me being lazy...

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH] cgroup/cpuset: remove circular dependency deadlock
  2017-10-11  9:48     ` Peter Zijlstra
@ 2017-10-25  8:39       ` Prateek Sood
  2017-10-25  9:30         ` Peter Zijlstra
  0 siblings, 1 reply; 19+ messages in thread
From: Prateek Sood @ 2017-10-25  8:39 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: tj, lizefan, cgroups, mingo, longman, boqun.feng, tglx,
	linux-kernel, sramana

On 10/11/2017 03:18 PM, Peter Zijlstra wrote:
> On Mon, Oct 09, 2017 at 06:57:46PM +0530, Prateek Sood wrote:
>> On 09/07/2017 11:21 PM, Peter Zijlstra wrote:
> 
>>> But if you invert these locks, the need for cpuset_hotplug_workfn() goes
>>> away, at least for the CPU part, and we can make in synchronous again.
>>> Yay!!
> 
>> The callback making a call to cpuset_hotplug_workfn()in hotplug path are
>>         [CPUHP_AP_ACTIVE] = {
>>                 .name                   = "sched:active",
>>                 .startup.single         = sched_cpu_activate,
>>                 .teardown.single        = sched_cpu_deactivate,
>>         },
>>
>> if we make cpuset_hotplug_workfn() synchronous, deadlock might happen:
>> _cpu_down()
>>    cpus_write_lock()  //held
>>       cpuhp_kick_ap_work()
>>         cpuhp_kick_ap()
>>            __cpuhp_kick_ap()
>>               wake_up_process() //cpuhp_thread_fun
>>                 wait_for_ap_thread() //wait for complete from cpuhp_thread_fun()
>>
>> cpuhp_thread_fun()
>>    cpuhp_invoke_callback()
>>      sched_cpu_deactivate()
>>        cpuset_cpu_inactive()
>>           cpuset_update_active_cpus()
>>              cpuset_hotplug_work()
>>                 rebuild_sched_domains()
>>                    cpus_read_lock() //waiting as acquired in _cpu_down()
> 
> Well, duh, don't use rebuild_sched_domains() 'obviously' :-) use
> rebuild_sched_domains_cpuslocked() instead and it works just fine.
> 
> After applying your patch, the below boots and survives a hotplug.
> 
> ---
>  include/linux/cpuset.h |    6 ------
>  kernel/cgroup/cpuset.c |   30 +++++++++---------------------
>  kernel/power/process.c |    2 --
>  kernel/sched/core.c    |    1 -
>  4 files changed, 9 insertions(+), 30 deletions(-)
> 
> --- a/include/linux/cpuset.h
> +++ b/include/linux/cpuset.h
> @@ -51,9 +51,7 @@ static inline void cpuset_dec(void)
>  
>  extern int cpuset_init(void);
>  extern void cpuset_init_smp(void);
> -extern void cpuset_force_rebuild(void);
>  extern void cpuset_update_active_cpus(void);
> -extern void cpuset_wait_for_hotplug(void);
>  extern void cpuset_cpus_allowed(struct task_struct *p, struct cpumask *mask);
>  extern void cpuset_cpus_allowed_fallback(struct task_struct *p);
>  extern nodemask_t cpuset_mems_allowed(struct task_struct *p);
> @@ -166,15 +164,11 @@ static inline bool cpusets_enabled(void)
>  static inline int cpuset_init(void) { return 0; }
>  static inline void cpuset_init_smp(void) {}
>  
> -static inline void cpuset_force_rebuild(void) { }
> -
>  static inline void cpuset_update_active_cpus(void)
>  {
>  	partition_sched_domains(1, NULL, NULL);
>  }
>  
> -static inline void cpuset_wait_for_hotplug(void) { }
> -
>  static inline void cpuset_cpus_allowed(struct task_struct *p,
>  				       struct cpumask *mask)
>  {
> --- a/kernel/cgroup/cpuset.c
> +++ b/kernel/cgroup/cpuset.c
> @@ -833,7 +833,12 @@ static void rebuild_sched_domains_cpuslo
>  	cpumask_var_t *doms;
>  	int ndoms;
>  
> +	/*
> +	 * When called during hotplug, this lock is held by the calling
> +	 * thread, not cpuhp_thread_fun :/
> +	 *
>  	lockdep_assert_cpus_held();
> +	 */
>  	lockdep_assert_held(&cpuset_mutex);
>  
>  	/*
> @@ -2281,13 +2286,6 @@ static void cpuset_hotplug_update_tasks(
>  	mutex_unlock(&cpuset_mutex);
>  }
>  
> -static bool force_rebuild;
> -
> -void cpuset_force_rebuild(void)
> -{
> -	force_rebuild = true;
> -}
> -
>  /**
>   * cpuset_hotplug_workfn - handle CPU/memory hotunplug for a cpuset
>   *
> @@ -2362,25 +2360,15 @@ static void cpuset_hotplug_workfn(struct
>  	}
>  
>  	/* rebuild sched domains if cpus_allowed has changed */
> -	if (cpus_updated || force_rebuild) {
> -		force_rebuild = false;
> +	if (cpus_updated)
>  		rebuild_sched_domains();
> -	}
>  }
>  
>  void cpuset_update_active_cpus(void)
>  {
> -	/*
> -	 * We're inside cpu hotplug critical region which usually nests
> -	 * inside cgroup synchronization.  Bounce actual hotplug processing
> -	 * to a work item to avoid reverse locking order.
> -	 */
> -	schedule_work(&cpuset_hotplug_work);
> -}
> -
> -void cpuset_wait_for_hotplug(void)
> -{
> -	flush_work(&cpuset_hotplug_work);
> +	mutex_lock(&cpuset_mutex);
> +	rebuild_sched_domains_cpuslocked();
> +	mutex_unlock(&cpuset_mutex);
>  }
>  
>  /*
> --- a/kernel/power/process.c
> +++ b/kernel/power/process.c
> @@ -203,8 +203,6 @@ void thaw_processes(void)
>  	__usermodehelper_set_disable_depth(UMH_FREEZING);
>  	thaw_workqueues();
>  
> -	cpuset_wait_for_hotplug();
> -
>  	read_lock(&tasklist_lock);
>  	for_each_process_thread(g, p) {
>  		/* No other threads should have PF_SUSPEND_TASK set */
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -5598,7 +5598,6 @@ static void cpuset_cpu_active(void)
>  		 * restore the original sched domains by considering the
>  		 * cpuset configurations.
>  		 */
> -		cpuset_force_rebuild();
>  	}
>  	cpuset_update_active_cpus();
>  }
> 


Thanks Peter for sharing the patch and test results.

>  void cpuset_update_active_cpus(void)
>  {
> -	/*
> -	 * We're inside cpu hotplug critical region which usually nests
> -	 * inside cgroup synchronization.  Bounce actual hotplug processing
> -	 * to a work item to avoid reverse locking order.
> -	 */
> -	schedule_work(&cpuset_hotplug_work);
> -}
> -
> -void cpuset_wait_for_hotplug(void)
> -{
> -	flush_work(&cpuset_hotplug_work);
> +	mutex_lock(&cpuset_mutex);
> +	rebuild_sched_domains_cpuslocked();
> +	mutex_unlock(&cpuset_mutex);
>  }

In the above patch rebuild_sched_domains_cpuslocked() has been
used directly. Earlier cpuset_hotplug_update_tasks() was also
called from cpuset_hotplug_workfn(). So migration of tasks
related to cgroup which has empty cpuset would not happen
during cpu hotplug.


Could you please help in understanding more on this.



-- 
Qualcomm India Private Limited, on behalf of Qualcomm Innovation
Center, Inc., is a member of Code Aurora Forum, a Linux Foundation
Collaborative Project

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH] cgroup/cpuset: remove circular dependency deadlock
  2017-10-09 13:27   ` Prateek Sood
@ 2017-10-11  9:48     ` Peter Zijlstra
  2017-10-25  8:39       ` Prateek Sood
  0 siblings, 1 reply; 19+ messages in thread
From: Peter Zijlstra @ 2017-10-11  9:48 UTC (permalink / raw)
  To: Prateek Sood
  Cc: tj, lizefan, cgroups, mingo, longman, boqun.feng, tglx,
	linux-kernel, sramana

On Mon, Oct 09, 2017 at 06:57:46PM +0530, Prateek Sood wrote:
> On 09/07/2017 11:21 PM, Peter Zijlstra wrote:

> > But if you invert these locks, the need for cpuset_hotplug_workfn() goes
> > away, at least for the CPU part, and we can make in synchronous again.
> > Yay!!

> The callback making a call to cpuset_hotplug_workfn()in hotplug path are
>         [CPUHP_AP_ACTIVE] = {
>                 .name                   = "sched:active",
>                 .startup.single         = sched_cpu_activate,
>                 .teardown.single        = sched_cpu_deactivate,
>         },
> 
> if we make cpuset_hotplug_workfn() synchronous, deadlock might happen:
> _cpu_down()
>    cpus_write_lock()  //held
>       cpuhp_kick_ap_work()
>         cpuhp_kick_ap()
>            __cpuhp_kick_ap()
>               wake_up_process() //cpuhp_thread_fun
>                 wait_for_ap_thread() //wait for complete from cpuhp_thread_fun()
> 
> cpuhp_thread_fun()
>    cpuhp_invoke_callback()
>      sched_cpu_deactivate()
>        cpuset_cpu_inactive()
>           cpuset_update_active_cpus()
>              cpuset_hotplug_work()
>                 rebuild_sched_domains()
>                    cpus_read_lock() //waiting as acquired in _cpu_down()

Well, duh, don't use rebuild_sched_domains() 'obviously' :-) use
rebuild_sched_domains_cpuslocked() instead and it works just fine.

After applying your patch, the below boots and survives a hotplug.

---
 include/linux/cpuset.h |    6 ------
 kernel/cgroup/cpuset.c |   30 +++++++++---------------------
 kernel/power/process.c |    2 --
 kernel/sched/core.c    |    1 -
 4 files changed, 9 insertions(+), 30 deletions(-)

--- a/include/linux/cpuset.h
+++ b/include/linux/cpuset.h
@@ -51,9 +51,7 @@ static inline void cpuset_dec(void)
 
 extern int cpuset_init(void);
 extern void cpuset_init_smp(void);
-extern void cpuset_force_rebuild(void);
 extern void cpuset_update_active_cpus(void);
-extern void cpuset_wait_for_hotplug(void);
 extern void cpuset_cpus_allowed(struct task_struct *p, struct cpumask *mask);
 extern void cpuset_cpus_allowed_fallback(struct task_struct *p);
 extern nodemask_t cpuset_mems_allowed(struct task_struct *p);
@@ -166,15 +164,11 @@ static inline bool cpusets_enabled(void)
 static inline int cpuset_init(void) { return 0; }
 static inline void cpuset_init_smp(void) {}
 
-static inline void cpuset_force_rebuild(void) { }
-
 static inline void cpuset_update_active_cpus(void)
 {
 	partition_sched_domains(1, NULL, NULL);
 }
 
-static inline void cpuset_wait_for_hotplug(void) { }
-
 static inline void cpuset_cpus_allowed(struct task_struct *p,
 				       struct cpumask *mask)
 {
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -833,7 +833,12 @@ static void rebuild_sched_domains_cpuslo
 	cpumask_var_t *doms;
 	int ndoms;
 
+	/*
+	 * When called during hotplug, this lock is held by the calling
+	 * thread, not cpuhp_thread_fun :/
+	 *
 	lockdep_assert_cpus_held();
+	 */
 	lockdep_assert_held(&cpuset_mutex);
 
 	/*
@@ -2281,13 +2286,6 @@ static void cpuset_hotplug_update_tasks(
 	mutex_unlock(&cpuset_mutex);
 }
 
-static bool force_rebuild;
-
-void cpuset_force_rebuild(void)
-{
-	force_rebuild = true;
-}
-
 /**
  * cpuset_hotplug_workfn - handle CPU/memory hotunplug for a cpuset
  *
@@ -2362,25 +2360,15 @@ static void cpuset_hotplug_workfn(struct
 	}
 
 	/* rebuild sched domains if cpus_allowed has changed */
-	if (cpus_updated || force_rebuild) {
-		force_rebuild = false;
+	if (cpus_updated)
 		rebuild_sched_domains();
-	}
 }
 
 void cpuset_update_active_cpus(void)
 {
-	/*
-	 * We're inside cpu hotplug critical region which usually nests
-	 * inside cgroup synchronization.  Bounce actual hotplug processing
-	 * to a work item to avoid reverse locking order.
-	 */
-	schedule_work(&cpuset_hotplug_work);
-}
-
-void cpuset_wait_for_hotplug(void)
-{
-	flush_work(&cpuset_hotplug_work);
+	mutex_lock(&cpuset_mutex);
+	rebuild_sched_domains_cpuslocked();
+	mutex_unlock(&cpuset_mutex);
 }
 
 /*
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -203,8 +203,6 @@ void thaw_processes(void)
 	__usermodehelper_set_disable_depth(UMH_FREEZING);
 	thaw_workqueues();
 
-	cpuset_wait_for_hotplug();
-
 	read_lock(&tasklist_lock);
 	for_each_process_thread(g, p) {
 		/* No other threads should have PF_SUSPEND_TASK set */
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5598,7 +5598,6 @@ static void cpuset_cpu_active(void)
 		 * restore the original sched domains by considering the
 		 * cpuset configurations.
 		 */
-		cpuset_force_rebuild();
 	}
 	cpuset_update_active_cpus();
 }

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH] cgroup/cpuset: remove circular dependency deadlock
  2017-09-07 17:51 ` Peter Zijlstra
@ 2017-10-09 13:27   ` Prateek Sood
  2017-10-11  9:48     ` Peter Zijlstra
  0 siblings, 1 reply; 19+ messages in thread
From: Prateek Sood @ 2017-10-09 13:27 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: tj, lizefan, cgroups, mingo, longman, boqun.feng, tglx,
	linux-kernel, sramana

On 09/07/2017 11:21 PM, Peter Zijlstra wrote:
> On Thu, Sep 07, 2017 at 07:26:23PM +0530, Prateek Sood wrote:
>> Remove circular dependency deadlock in a scenario where hotplug of CPU is
>> being done while there is updation in cgroup and cpuset triggered from
>> userspace.
> 
> You've forgotten to mention your solution to the deadlock, namely
> inverting cpuset_mutex and cpu_hotplug_lock.
> 
>> Signed-off-by: Prateek Sood <prsood@codeaurora.org>
>> ---
>>  kernel/cgroup/cpuset.c | 32 +++++++++++++++++++-------------
>>  1 file changed, 19 insertions(+), 13 deletions(-)
>>
>> diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
>> index 2f4039b..60dc0ac 100644
>> --- a/kernel/cgroup/cpuset.c
>> +++ b/kernel/cgroup/cpuset.c
>> @@ -816,16 +816,15 @@ static int generate_sched_domains(cpumask_var_t **domains,
>>   * 'cpus' is removed, then call this routine to rebuild the
>>   * scheduler's dynamic sched domains.
>>   *
>> - * Call with cpuset_mutex held.  Takes get_online_cpus().
>>   */
>> -static void rebuild_sched_domains_locked(void)
>> +static void rebuild_sched_domains_cpuslocked(void)
>>  {
>>  	struct sched_domain_attr *attr;
>>  	cpumask_var_t *doms;
>>  	int ndoms;
>>  
>> +	lockdep_assert_cpus_held();
>>  	lockdep_assert_held(&cpuset_mutex);
>> -	get_online_cpus();
>>  
>>  	/*
>>  	 * We have raced with CPU hotplug. Don't do anything to avoid
>> @@ -833,27 +832,27 @@ static void rebuild_sched_domains_locked(void)
>>  	 * Anyways, hotplug work item will rebuild sched domains.
>>  	 */
>>  	if (!cpumask_equal(top_cpuset.effective_cpus, cpu_active_mask))
>> -		goto out;
>> +		return;
>>  
>>  	/* Generate domain masks and attrs */
>>  	ndoms = generate_sched_domains(&doms, &attr);
>>  
>>  	/* Have scheduler rebuild the domains */
>>  	partition_sched_domains(ndoms, doms, attr);
>> -out:
>> -	put_online_cpus();
>>  }
>>  #else /* !CONFIG_SMP */
>> -static void rebuild_sched_domains_locked(void)
>> +static void rebuild_sched_domains_cpuslocked(void)
>>  {
>>  }
>>  #endif /* CONFIG_SMP */
>>  
>>  void rebuild_sched_domains(void)
>>  {
>> +	get_online_cpus();
>>  	mutex_lock(&cpuset_mutex);
>> -	rebuild_sched_domains_locked();
>> +	rebuild_sched_domains_cpuslocked();
>>  	mutex_unlock(&cpuset_mutex);
>> +	put_online_cpus();
>>  }
> 
> But if you invert these locks, the need for cpuset_hotplug_workfn() goes
> away, at least for the CPU part, and we can make in synchronous again.
> Yay!!
> 
> Also, I think new code should use cpus_read_lock() instead of
> get_online_cpus().
> 

Thanks for the review comments, Peter.
For the patch related to the circular deadlock, I will send an updated version.

The callback making a call to cpuset_hotplug_workfn() in the hotplug path is:
        [CPUHP_AP_ACTIVE] = {
                .name                   = "sched:active",
                .startup.single         = sched_cpu_activate,
                .teardown.single        = sched_cpu_deactivate,
        },

If we make cpuset_hotplug_workfn() synchronous, a deadlock might happen:
_cpu_down()
   cpus_write_lock()  //held
      cpuhp_kick_ap_work()
        cpuhp_kick_ap()
           __cpuhp_kick_ap()
              wake_up_process() //cpuhp_thread_fun
                wait_for_ap_thread() //wait for complete from cpuhp_thread_fun()

cpuhp_thread_fun()
   cpuhp_invoke_callback()
     sched_cpu_deactivate()
       cpuset_cpu_inactive()
          cpuset_update_active_cpus()
             cpuset_hotplug_work()
                rebuild_sched_domains()
                   cpus_read_lock() //waiting as acquired in _cpu_down()
                  

-- 
Qualcomm India Private Limited, on behalf of Qualcomm Innovation
Center, Inc., is a member of Code Aurora Forum, a Linux Foundation
Collaborative Project

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH] cgroup/cpuset: remove circular dependency deadlock
  2017-09-07 17:45 ` Peter Zijlstra
@ 2017-09-08  2:13   ` Prateek Sood
  0 siblings, 0 replies; 19+ messages in thread
From: Prateek Sood @ 2017-09-08  2:13 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: tj, lizefan, cgroups, mingo, longman, boqun.feng, tglx,
	linux-kernel, sramana

On 09/07/2017 11:15 PM, Peter Zijlstra wrote:
> On Thu, Sep 07, 2017 at 07:26:23PM +0530, Prateek Sood wrote:
>> Remove circular dependency deadlock in a scenario where hotplug of CPU is
>> being done while there is updation in cgroup and cpuset triggered from
>> userspace.
>>
>> Process A => kthreadd => Process B => Process C => Process A
> 
>> Process A
>> cpu_subsys_offline();
>>   cpu_down();
>>     _cpu_down();
>>       percpu_down_write(&cpu_hotplug_lock); //held
>>       cpuhp_invoke_callback();
>>         workqueue_offline_cpu();
>>           wq_update_unbound_numa();
>>             kthread_create_on_node();
>>               wake_up_process();  //wakeup kthreadd
> 
> TJ, I'm puzzled, why would we need to spawn new threads to update NUMA
> affinity when taking a CPU out? That doesn't make sense to me, we can
> either shrink the affinity of an existing thread or completely kill of a
> thread if the mask becomes empty. But why spawn a new thread?
> 
>>           flush_work();
>>           wait_for_completion();
>>

> 
> Yes, inverting cpuset and hotplug would break that chain, but I'm still
> wondering why workqueue needs to spawn threads on CPU down.
> 

Thanks for the comments, Peter.

You rightly mentioned that a new thread is not spawned while updating
NUMA affinity when taking a CPU out.

While a CPU is being taken offline, an attempt is made to unbind the
per-cpu workers for the CPU going down. This is done by queuing the
unbind work to system_highpri_wq, which results in an attempt to create
a bound worker thread, as there is none available.

wait_for_completion() in flush_work() then waits for the unbinding to
finish for the CPU going down.

Process A
cpu_subsys_offline();
   cpu_down();
     _cpu_down();
       percpu_down_write(&cpu_hotplug_lock); //held
       cpuhp_invoke_callback();
         workqueue_offline_cpu();
           queue_work_on(system_highpri_wq);
             __queue_work();
               insert_work();
                 wake_up_worker(); //pool->nr_running = 0
           flush_work();
		   wait_for_completion();
		   
	   
worker_thread();
  need_more_worker(); // returns true
  manage_workers();
    maybe_create_worker();
	  create_worker();
	    kthread_create_on_node();
		  wake_up_process(kthreadd_task);



-- 
Qualcomm India Private Limited, on behalf of Qualcomm Innovation
Center, Inc., is a member of Code Aurora Forum, a Linux Foundation
Collaborative Project

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH] cgroup/cpuset: remove circular dependency deadlock
  2017-09-07 13:56 Prateek Sood
  2017-09-07 17:45 ` Peter Zijlstra
@ 2017-09-07 17:51 ` Peter Zijlstra
  2017-10-09 13:27   ` Prateek Sood
  1 sibling, 1 reply; 19+ messages in thread
From: Peter Zijlstra @ 2017-09-07 17:51 UTC (permalink / raw)
  To: Prateek Sood
  Cc: tj, lizefan, cgroups, mingo, longman, boqun.feng, tglx,
	linux-kernel, sramana

On Thu, Sep 07, 2017 at 07:26:23PM +0530, Prateek Sood wrote:
> Remove circular dependency deadlock in a scenario where hotplug of CPU is
> being done while there is updation in cgroup and cpuset triggered from
> userspace.

You've forgotten to mention your solution to the deadlock, namely
inverting cpuset_mutex and cpu_hotplug_lock.

> Signed-off-by: Prateek Sood <prsood@codeaurora.org>
> ---
>  kernel/cgroup/cpuset.c | 32 +++++++++++++++++++-------------
>  1 file changed, 19 insertions(+), 13 deletions(-)
> 
> diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
> index 2f4039b..60dc0ac 100644
> --- a/kernel/cgroup/cpuset.c
> +++ b/kernel/cgroup/cpuset.c
> @@ -816,16 +816,15 @@ static int generate_sched_domains(cpumask_var_t **domains,
>   * 'cpus' is removed, then call this routine to rebuild the
>   * scheduler's dynamic sched domains.
>   *
> - * Call with cpuset_mutex held.  Takes get_online_cpus().
>   */
> -static void rebuild_sched_domains_locked(void)
> +static void rebuild_sched_domains_cpuslocked(void)
>  {
>  	struct sched_domain_attr *attr;
>  	cpumask_var_t *doms;
>  	int ndoms;
>  
> +	lockdep_assert_cpus_held();
>  	lockdep_assert_held(&cpuset_mutex);
> -	get_online_cpus();
>  
>  	/*
>  	 * We have raced with CPU hotplug. Don't do anything to avoid
> @@ -833,27 +832,27 @@ static void rebuild_sched_domains_locked(void)
>  	 * Anyways, hotplug work item will rebuild sched domains.
>  	 */
>  	if (!cpumask_equal(top_cpuset.effective_cpus, cpu_active_mask))
> -		goto out;
> +		return;
>  
>  	/* Generate domain masks and attrs */
>  	ndoms = generate_sched_domains(&doms, &attr);
>  
>  	/* Have scheduler rebuild the domains */
>  	partition_sched_domains(ndoms, doms, attr);
> -out:
> -	put_online_cpus();
>  }
>  #else /* !CONFIG_SMP */
> -static void rebuild_sched_domains_locked(void)
> +static void rebuild_sched_domains_cpuslocked(void)
>  {
>  }
>  #endif /* CONFIG_SMP */
>  
>  void rebuild_sched_domains(void)
>  {
> +	get_online_cpus();
>  	mutex_lock(&cpuset_mutex);
> -	rebuild_sched_domains_locked();
> +	rebuild_sched_domains_cpuslocked();
>  	mutex_unlock(&cpuset_mutex);
> +	put_online_cpus();
>  }

But if you invert these locks, the need for cpuset_hotplug_workfn() goes
away, at least for the CPU part, and we can make in synchronous again.
Yay!!

Also, I think new code should use cpus_read_lock() instead of
get_online_cpus().
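
At this point get_online_cpus()/put_online_cpus() are only thin wrappers,
roughly (sketch of the include/linux/cpu.h definitions in this era):

    static inline void get_online_cpus(void) { cpus_read_lock(); }
    static inline void put_online_cpus(void) { cpus_read_unlock(); }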

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH] cgroup/cpuset: remove circular dependency deadlock
  2017-09-07 13:56 Prateek Sood
@ 2017-09-07 17:45 ` Peter Zijlstra
  2017-09-08  2:13   ` Prateek Sood
  2017-09-07 17:51 ` Peter Zijlstra
  1 sibling, 1 reply; 19+ messages in thread
From: Peter Zijlstra @ 2017-09-07 17:45 UTC (permalink / raw)
  To: Prateek Sood
  Cc: tj, lizefan, cgroups, mingo, longman, boqun.feng, tglx,
	linux-kernel, sramana

On Thu, Sep 07, 2017 at 07:26:23PM +0530, Prateek Sood wrote:
> Remove circular dependency deadlock in a scenario where hotplug of CPU is
> being done while there is updation in cgroup and cpuset triggered from
> userspace.
> 
> Process A => kthreadd => Process B => Process C => Process A

> Process A
> cpu_subsys_offline();
>   cpu_down();
>     _cpu_down();
>       percpu_down_write(&cpu_hotplug_lock); //held
>       cpuhp_invoke_callback();
>         workqueue_offline_cpu();
>           wq_update_unbound_numa();
>             kthread_create_on_node();
>               wake_up_process();  //wakeup kthreadd

TJ, I'm puzzled, why would we need to spawn new threads to update NUMA
affinity when taking a CPU out? That doesn't make sense to me, we can
either shrink the affinity of an existing thread or completely kill of a
thread if the mask becomes empty. But why spawn a new thread?

>           flush_work();
>           wait_for_completion();
> 
> kthreadd
> kthreadd();
>   kernel_thread();
>     do_fork();
>       copy_process();
>         percpu_down_read(&cgroup_threadgroup_rwsem);
>           __rwsem_down_read_failed_common(); //waiting

So this will eventually do our:

		complete() to make A go.


> Process B
> kernfs_fop_write();
>   cgroup_file_write();
>     cgroup_procs_write();
>       percpu_down_write(&cgroup_threadgroup_rwsem); //held
>       cgroup_attach_task();
>         cgroup_migrate();
>           cgroup_migrate_execute();
>             cpuset_can_attach();
>               mutex_lock(&cpuset_mutex); //waiting
> 
> Process C
> kernfs_fop_write();
>   cgroup_file_write();
>     cpuset_write_resmask();
>       mutex_lock(&cpuset_mutex); //held
>       update_cpumask();
>         update_cpumasks_hier();
>           rebuild_sched_domains_locked();
>             get_online_cpus();
>               percpu_down_read(&cpu_hotplug_lock); //waiting


So the whole thing looks like:


A		B			C		D

L(hotplug)
		L(threadgroup)
					L(cpuset)
							L(threadgroup)
WFC(c)
		L(cpuset)
					L(hotplug)
							C(c)

Yes, inverting cpuset and hotplug would break that chain, but I'm still
wondering why workqueue needs to spawn threads on CPU down.

^ permalink raw reply	[flat|nested] 19+ messages in thread

* [PATCH] cgroup/cpuset: remove circular dependency deadlock
@ 2017-09-07 13:56 Prateek Sood
  2017-09-07 17:45 ` Peter Zijlstra
  2017-09-07 17:51 ` Peter Zijlstra
  0 siblings, 2 replies; 19+ messages in thread
From: Prateek Sood @ 2017-09-07 13:56 UTC (permalink / raw)
  To: tj, lizefan, cgroups, mingo, longman, peterz, boqun.feng, tglx
  Cc: Prateek Sood, linux-kernel, sramana

Remove a circular dependency deadlock in a scenario where a CPU is being
hotplugged while cgroup and cpuset updates are triggered from
userspace.

Process A => kthreadd => Process B => Process C => Process A

Process A
cpu_subsys_offline();
  cpu_down();
    _cpu_down();
      percpu_down_write(&cpu_hotplug_lock); //held
      cpuhp_invoke_callback();
        workqueue_offline_cpu();
          wq_update_unbound_numa();
            kthread_create_on_node();
              wake_up_process();  //wakeup kthreadd
          flush_work();
          wait_for_completion();

kthreadd
kthreadd();
  kernel_thread();
    do_fork();
      copy_process();
        percpu_down_read(&cgroup_threadgroup_rwsem);
          __rwsem_down_read_failed_common(); //waiting

Process B
kernfs_fop_write();
  cgroup_file_write();
    cgroup_procs_write();
      percpu_down_write(&cgroup_threadgroup_rwsem); //held
      cgroup_attach_task();
        cgroup_migrate();
          cgroup_migrate_execute();
            cpuset_can_attach();
              mutex_lock(&cpuset_mutex); //waiting

Process C
kernfs_fop_write();
  cgroup_file_write();
    cpuset_write_resmask();
      mutex_lock(&cpuset_mutex); //held
      update_cpumask();
        update_cpumasks_hier();
          rebuild_sched_domains_locked();
            get_online_cpus();
              percpu_down_read(&cpu_hotplug_lock); //waiting

Signed-off-by: Prateek Sood <prsood@codeaurora.org>
---
 kernel/cgroup/cpuset.c | 32 +++++++++++++++++++-------------
 1 file changed, 19 insertions(+), 13 deletions(-)

diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 2f4039b..60dc0ac 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -816,16 +816,15 @@ static int generate_sched_domains(cpumask_var_t **domains,
  * 'cpus' is removed, then call this routine to rebuild the
  * scheduler's dynamic sched domains.
  *
- * Call with cpuset_mutex held.  Takes get_online_cpus().
  */
-static void rebuild_sched_domains_locked(void)
+static void rebuild_sched_domains_cpuslocked(void)
 {
 	struct sched_domain_attr *attr;
 	cpumask_var_t *doms;
 	int ndoms;
 
+	lockdep_assert_cpus_held();
 	lockdep_assert_held(&cpuset_mutex);
-	get_online_cpus();
 
 	/*
 	 * We have raced with CPU hotplug. Don't do anything to avoid
@@ -833,27 +832,27 @@ static void rebuild_sched_domains_locked(void)
 	 * Anyways, hotplug work item will rebuild sched domains.
 	 */
 	if (!cpumask_equal(top_cpuset.effective_cpus, cpu_active_mask))
-		goto out;
+		return;
 
 	/* Generate domain masks and attrs */
 	ndoms = generate_sched_domains(&doms, &attr);
 
 	/* Have scheduler rebuild the domains */
 	partition_sched_domains(ndoms, doms, attr);
-out:
-	put_online_cpus();
 }
 #else /* !CONFIG_SMP */
-static void rebuild_sched_domains_locked(void)
+static void rebuild_sched_domains_cpuslocked(void)
 {
 }
 #endif /* CONFIG_SMP */
 
 void rebuild_sched_domains(void)
 {
+	get_online_cpus();
 	mutex_lock(&cpuset_mutex);
-	rebuild_sched_domains_locked();
+	rebuild_sched_domains_cpuslocked();
 	mutex_unlock(&cpuset_mutex);
+	put_online_cpus();
 }
 
 /**
@@ -940,7 +939,7 @@ static void update_cpumasks_hier(struct cpuset *cs, struct cpumask *new_cpus)
 	rcu_read_unlock();
 
 	if (need_rebuild_sched_domains)
-		rebuild_sched_domains_locked();
+		rebuild_sched_domains_cpuslocked();
 }
 
 /**
@@ -1273,7 +1272,7 @@ static int update_relax_domain_level(struct cpuset *cs, s64 val)
 		cs->relax_domain_level = val;
 		if (!cpumask_empty(cs->cpus_allowed) &&
 		    is_sched_load_balance(cs))
-			rebuild_sched_domains_locked();
+			rebuild_sched_domains_cpuslocked();
 	}
 
 	return 0;
@@ -1306,7 +1305,6 @@ static void update_tasks_flags(struct cpuset *cs)
  *
  * Call with cpuset_mutex held.
  */
-
 static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
 		       int turning_on)
 {
@@ -1339,7 +1337,7 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
 	spin_unlock_irq(&callback_lock);
 
 	if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed)
-		rebuild_sched_domains_locked();
+		rebuild_sched_domains_cpuslocked();
 
 	if (spread_flag_changed)
 		update_tasks_flags(cs);
@@ -1607,6 +1605,7 @@ static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft,
 	cpuset_filetype_t type = cft->private;
 	int retval = 0;
 
+	get_online_cpus();
 	mutex_lock(&cpuset_mutex);
 	if (!is_cpuset_online(cs)) {
 		retval = -ENODEV;
@@ -1644,6 +1643,7 @@ static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft,
 	}
 out_unlock:
 	mutex_unlock(&cpuset_mutex);
+	put_online_cpus();
 	return retval;
 }
 
@@ -1654,6 +1654,7 @@ static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft,
 	cpuset_filetype_t type = cft->private;
 	int retval = -ENODEV;
 
+	get_online_cpus();
 	mutex_lock(&cpuset_mutex);
 	if (!is_cpuset_online(cs))
 		goto out_unlock;
@@ -1668,6 +1669,7 @@ static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft,
 	}
 out_unlock:
 	mutex_unlock(&cpuset_mutex);
+	put_online_cpus();
 	return retval;
 }
 
@@ -1706,6 +1708,7 @@ static ssize_t cpuset_write_resmask(struct kernfs_open_file *of,
 	kernfs_break_active_protection(of->kn);
 	flush_work(&cpuset_hotplug_work);
 
+	get_online_cpus();
 	mutex_lock(&cpuset_mutex);
 	if (!is_cpuset_online(cs))
 		goto out_unlock;
@@ -1731,6 +1734,7 @@ static ssize_t cpuset_write_resmask(struct kernfs_open_file *of,
 	free_trial_cpuset(trialcs);
 out_unlock:
 	mutex_unlock(&cpuset_mutex);
+	put_online_cpus();
 	kernfs_unbreak_active_protection(of->kn);
 	css_put(&cs->css);
 	flush_workqueue(cpuset_migrate_mm_wq);
@@ -2031,13 +2035,14 @@ static int cpuset_css_online(struct cgroup_subsys_state *css)
 /*
  * If the cpuset being removed has its flag 'sched_load_balance'
  * enabled, then simulate turning sched_load_balance off, which
- * will call rebuild_sched_domains_locked().
+ * will call rebuild_sched_domains_cpuslocked().
  */
 
 static void cpuset_css_offline(struct cgroup_subsys_state *css)
 {
 	struct cpuset *cs = css_cs(css);
 
+	get_online_cpus();
 	mutex_lock(&cpuset_mutex);
 
 	if (is_sched_load_balance(cs))
@@ -2047,6 +2052,7 @@ static void cpuset_css_offline(struct cgroup_subsys_state *css)
 	clear_bit(CS_ONLINE, &cs->flags);
 
 	mutex_unlock(&cpuset_mutex);
+	put_online_cpus();
 }
 
 static void cpuset_css_free(struct cgroup_subsys_state *css)
-- 
Qualcomm India Private Limited, on behalf of Qualcomm Innovation Center, Inc., 
is a member of Code Aurora Forum, a Linux Foundation Collaborative Project.

^ permalink raw reply related	[flat|nested] 19+ messages in thread

* Re: [PATCH] cgroup/cpuset: remove circular dependency deadlock
  2017-09-07  8:56   ` Boqun Feng
@ 2017-09-07  9:07     ` Prateek Sood
  0 siblings, 0 replies; 19+ messages in thread
From: Prateek Sood @ 2017-09-07  9:07 UTC (permalink / raw)
  To: Boqun Feng, Peter Zijlstra
  Cc: tj, lizefan, cgroups, mingo, longman, linux-kernel, sramana,
	Thomas Gleixner

On 09/07/2017 02:26 PM, Boqun Feng wrote:
> On Thu, Sep 07, 2017 at 09:28:48AM +0200, Peter Zijlstra wrote:
>> On Thu, Sep 07, 2017 at 11:34:12AM +0530, Prateek Sood wrote:
>>> Remove circular dependency deadlock in a scenario where hotplug of CPU is
>>> being done while there is updation in cgroup and cpuset triggered from
>>> userspace.
>>>
>>> Example scenario:
>>> kworker/0:0 => kthreadd => init:729 => init:1 => kworker/0:0
>>>
>>> kworker/0:0 - percpu_down_write(&cpu_hotplug_lock)  [held]
>>>               flush(work)   [no high prio workqueue available on CPU]
>>>               wait_for_completion()
> 
> Hi Prateek,
> 
> so this is:
> 
> 	_cpu_down():
> 	  cpus_write_lock(); // percpu_down_write(&cpu_hotplug_lock)
> 	  cpuhp_invoke_callbacks():
> 	    workqueue_offline_cpu():
> 	      wq_update_unbound_numa():
> 	        alloc_unbound_pool():
> 	          get_unbound_pool():
> 		    create_worker():
> 		      kthread_create_on_node():
> 		        wake_up_process(kthreadd_task);
> 		        wait_for_completion(); // create->done
> 
> , right?
> 
> Wonder running in a kworker is necessary to trigger this, I mean running
> a cpu_down() in a normal process context could also trigger this, no?
> Just ask out of curiosity.
> 
> Regards,
> Boqun

Hi Boqun,

Yes, cpu_down() called from a normal process context can also trigger this;
the sketch below shows one easy way to get there from an ordinary task.
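
A minimal userspace sketch (illustrative only; needs root and a kernel built
with CONFIG_HOTPLUG_CPU, and "cpu1" is just an example CPU): writing "0" to
the cpuN online file enters cpu_subsys_offline() -> cpu_down() in the writing
task's own context, so no kworker is involved at all.

/* offline_cpu1.c -- echo 0 > /sys/devices/system/cpu/cpu1/online, from C */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/sys/devices/system/cpu/cpu1/online", O_WRONLY);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	if (write(fd, "0", 1) != 1) {
		perror("write");
		close(fd);
		return 1;
	}
	close(fd);
	return 0;
}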



Regards
Prateek
> 
>>>
>>> kthreadd    - percpu_down_read(cgroup_threadgroup_rwsem)  [waiting]
>>>
>>> init:729    - percpu_down_write(cgroup_threadgroup_rwsem)   [held]
>>>               lock(cpuset_mutex)   [waiting]
>>>
>>> init:1      - lock(cpuset_mutex)   [held]
>>>               percpu_down_read(&cpu_hotplug_lock)   [waiting]
>>
>> That's both unreadable and useless :/ You want to tell what code paths
>> that were, not which random tasks happened to run them.
>>
>>
> [...]
> 


-- 
Qualcomm India Private Limited, on behalf of Qualcomm Innovation
Center, Inc., is a member of Code Aurora Forum, a Linux Foundation
Collaborative Project

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH] cgroup/cpuset: remove circular dependency deadlock
  2017-09-07  7:28 ` Peter Zijlstra
  2017-09-07  8:56   ` Boqun Feng
@ 2017-09-07  9:05   ` Prateek Sood
  1 sibling, 0 replies; 19+ messages in thread
From: Prateek Sood @ 2017-09-07  9:05 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: tj, lizefan, cgroups, mingo, longman, linux-kernel, sramana,
	Thomas Gleixner

On 09/07/2017 12:58 PM, Peter Zijlstra wrote:
> On Thu, Sep 07, 2017 at 11:34:12AM +0530, Prateek Sood wrote:
>> Remove circular dependency deadlock in a scenario where hotplug of CPU is
>> being done while there is updation in cgroup and cpuset triggered from
>> userspace.
>>
>> Example scenario:
>> kworker/0:0 => kthreadd => init:729 => init:1 => kworker/0:0
>>
>> kworker/0:0 - percpu_down_write(&cpu_hotplug_lock)  [held]
>>               flush(work)   [no high prio workqueue available on CPU]
>>               wait_for_completion()
>>
>> kthreadd    - percpu_down_read(cgroup_threadgroup_rwsem)  [waiting]
>>
>> init:729    - percpu_down_write(cgroup_threadgroup_rwsem)   [held]
>>               lock(cpuset_mutex)   [waiting]
>>
>> init:1      - lock(cpuset_mutex)   [held]
>>               percpu_down_read(&cpu_hotplug_lock)   [waiting]
> 
> That's both unreadable and useless :/ You want to tell what code paths
> that were, not which random tasks happened to run them.
> 
> 
> 
>> Eliminate this dependency by reordering locking of cpuset_mutex
>> and cpu_hotplug_lock in following order
>> 1. Acquire cpu_hotplug_lock (read)
>> 2. Acquire cpuset_mutex
>>
>> Signed-off-by: Prateek Sood <prsood@codeaurora.org>
>> ---
>>  kernel/cgroup/cpuset.c | 70 +++++++++++++++++++++++++++++++++++++++++++-------
>>  1 file changed, 61 insertions(+), 9 deletions(-)
>>
>> diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
>> index 2f4039b..687be57 100644
>> --- a/kernel/cgroup/cpuset.c
>> +++ b/kernel/cgroup/cpuset.c
>> @@ -843,10 +843,41 @@ static void rebuild_sched_domains_locked(void)
>>  out:
>>  	put_online_cpus();
>>  }
>> +
>> +/*
>> + * Rebuild scheduler domains.
>> + * Call with following lock held in the order
>> + * 1. cpu_hotplug_lock (read)
>> + * 2. cpuset_mutex
> 
> Do not put that in comments, nobody ever reads comments.
> 
>> + */
>> +static void rebuild_sched_domains_unlocked(void)
> 
> The common postfix for a function called with the cpuhotplug lock held
> is: _cpuslocked()
> 
>> +{
>> +        struct sched_domain_attr *attr;
>> +        cpumask_var_t *doms;
>> +        int ndoms;
> 
> 	lockdep_assert_cpus_held();
> 	lockdep_assert_held(&cpuset_mutex);
> 
>> +
>> +        /*
>> +         * We have raced with CPU hotplug. Don't do anything to avoid
>> +         * passing doms with offlined cpu to partition_sched_domains().
>> +         * Anyways, hotplug work item will rebuild sched domains.
>> +         */
>> +        if (!cpumask_equal(top_cpuset.effective_cpus, cpu_active_mask))
>> +                return;
>> +
>> +        /* Generate domain masks and attrs */
>> +        ndoms = generate_sched_domains(&doms, &attr);
>> +
>> +        /* Have scheduler rebuild the domains */
>> +        partition_sched_domains(ndoms, doms, attr);
>> +}
> 
> And you couldn't come up with a way to share _anything_ with the
> existing rebuild_sched_domains_locked() function?
> 
> *sigh*.. please try again.
> 

Thanks for the suggestion, Peter. I will resend the patch.

-- 
Qualcomm India Private Limited, on behalf of Qualcomm Innovation
Center, Inc., is a member of Code Aurora Forum, a Linux Foundation
Collaborative Project

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH] cgroup/cpuset: remove circular dependency deadlock
  2017-09-07  7:28 ` Peter Zijlstra
@ 2017-09-07  8:56   ` Boqun Feng
  2017-09-07  9:07     ` Prateek Sood
  2017-09-07  9:05   ` Prateek Sood
  1 sibling, 1 reply; 19+ messages in thread
From: Boqun Feng @ 2017-09-07  8:56 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Prateek Sood, tj, lizefan, cgroups, mingo, longman, linux-kernel,
	sramana, Thomas Gleixner


On Thu, Sep 07, 2017 at 09:28:48AM +0200, Peter Zijlstra wrote:
> On Thu, Sep 07, 2017 at 11:34:12AM +0530, Prateek Sood wrote:
> > Remove circular dependency deadlock in a scenario where hotplug of CPU is
> > being done while there is updation in cgroup and cpuset triggered from
> > userspace.
> > 
> > Example scenario:
> > kworker/0:0 => kthreadd => init:729 => init:1 => kworker/0:0
> > 
> > kworker/0:0 - percpu_down_write(&cpu_hotplug_lock)  [held]
> >               flush(work)   [no high prio workqueue available on CPU]
> >               wait_for_completion()

Hi Prateek,

so this is:

	_cpu_down():
	  cpus_write_lock(); // percpu_down_write(&cpu_hotplug_lock)
	  cpuhp_invoke_callbacks():
	    workqueue_offline_cpu():
	      wq_update_unbound_numa():
	        alloc_unbound_pool():
	          get_unbound_pool():
		    create_worker():
		      kthread_create_on_node():
		        wake_up_process(kthreadd_task);
		        wait_for_completion(); // create->done

, right?

I wonder whether running in a kworker is necessary to trigger this; I mean,
running a cpu_down() in a normal process context could also trigger this, no?
Just asking out of curiosity.

Regards,
Boqun

> > 
> > kthreadd    - percpu_down_read(cgroup_threadgroup_rwsem)  [waiting]
> > 
> > init:729    - percpu_down_write(cgroup_threadgroup_rwsem)   [held]
> >               lock(cpuset_mutex)   [waiting]
> > 
> > init:1      - lock(cpuset_mutex)   [held]
> >               percpu_down_read(&cpu_hotplug_lock)   [waiting]
> 
> That's both unreadable and useless :/ You want to tell what code paths
> that were, not which random tasks happened to run them.
> 
> 
[...]


^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH] cgroup/cpuset: remove circular dependency deadlock
  2017-09-07  6:04 Prateek Sood
@ 2017-09-07  7:28 ` Peter Zijlstra
  2017-09-07  8:56   ` Boqun Feng
  2017-09-07  9:05   ` Prateek Sood
  0 siblings, 2 replies; 19+ messages in thread
From: Peter Zijlstra @ 2017-09-07  7:28 UTC (permalink / raw)
  To: Prateek Sood
  Cc: tj, lizefan, cgroups, mingo, longman, linux-kernel, sramana,
	Thomas Gleixner

On Thu, Sep 07, 2017 at 11:34:12AM +0530, Prateek Sood wrote:
> Remove circular dependency deadlock in a scenario where hotplug of CPU is
> being done while there is updation in cgroup and cpuset triggered from
> userspace.
> 
> Example scenario:
> kworker/0:0 => kthreadd => init:729 => init:1 => kworker/0:0
> 
> kworker/0:0 - percpu_down_write(&cpu_hotplug_lock)  [held]
>               flush(work)   [no high prio workqueue available on CPU]
>               wait_for_completion()
> 
> kthreadd    - percpu_down_read(cgroup_threadgroup_rwsem)  [waiting]
> 
> init:729    - percpu_down_write(cgroup_threadgroup_rwsem)   [held]
>               lock(cpuset_mutex)   [waiting]
> 
> init:1      - lock(cpuset_mutex)   [held]
>               percpu_down_read(&cpu_hotplug_lock)   [waiting]

That's both unreadable and useless :/ You want to tell us which code paths
those were, not which random tasks happened to run them.



> Eliminate this dependency by reordering locking of cpuset_mutex
> and cpu_hotplug_lock in following order
> 1. Acquire cpu_hotplug_lock (read)
> 2. Acquire cpuset_mutex
> 
> Signed-off-by: Prateek Sood <prsood@codeaurora.org>
> ---
>  kernel/cgroup/cpuset.c | 70 +++++++++++++++++++++++++++++++++++++++++++-------
>  1 file changed, 61 insertions(+), 9 deletions(-)
> 
> diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
> index 2f4039b..687be57 100644
> --- a/kernel/cgroup/cpuset.c
> +++ b/kernel/cgroup/cpuset.c
> @@ -843,10 +843,41 @@ static void rebuild_sched_domains_locked(void)
>  out:
>  	put_online_cpus();
>  }
> +
> +/*
> + * Rebuild scheduler domains.
> + * Call with following lock held in the order
> + * 1. cpu_hotplug_lock (read)
> + * 2. cpuset_mutex

Do not put that in comments, nobody ever reads comments.

> + */
> +static void rebuild_sched_domains_unlocked(void)

The common postfix for a function called with the cpuhotplug lock held
is: _cpuslocked()

> +{
> +        struct sched_domain_attr *attr;
> +        cpumask_var_t *doms;
> +        int ndoms;

	lockdep_assert_cpus_held();
	lockdep_assert_held(&cpuset_mutex);

> +
> +        /*
> +         * We have raced with CPU hotplug. Don't do anything to avoid
> +         * passing doms with offlined cpu to partition_sched_domains().
> +         * Anyways, hotplug work item will rebuild sched domains.
> +         */
> +        if (!cpumask_equal(top_cpuset.effective_cpus, cpu_active_mask))
> +                return;
> +
> +        /* Generate domain masks and attrs */
> +        ndoms = generate_sched_domains(&doms, &attr);
> +
> +        /* Have scheduler rebuild the domains */
> +        partition_sched_domains(ndoms, doms, attr);
> +}

And you couldn't come up with a way to share _anything_ with the
existing rebuild_sched_domains_locked() function?

*sigh*.. please try again.
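
For reference, the shape being asked for here -- one worker that only asserts
its locking preconditions, plus a plain wrapper that takes and drops the
locks -- is essentially what the 2017-09-07 13:56 respin in this thread does
with rebuild_sched_domains_cpuslocked(). Below is a userspace sketch of that
convention only; the names are hypothetical and a simple flag plus assert()
stands in for lockdep_assert_cpus_held()/lockdep_assert_held().

/* Sketch of the *_cpuslocked() convention.  Build with: cc -pthread conv.c */
#include <assert.h>
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_rwlock_t hotplug_lock = PTHREAD_RWLOCK_INITIALIZER;
static pthread_mutex_t cpuset_lock = PTHREAD_MUTEX_INITIALIZER;
static bool locks_held;	/* poor man's lockdep state, set by the wrapper */

/* does the real work; callers must already hold both locks */
static void rebuild_domains_cpuslocked(void)
{
	assert(locks_held);	/* kernel: lockdep_assert_cpus_held(); lockdep_assert_held(&cpuset_mutex); */
	puts("generate domain masks and repartition");
}

/* convenience wrapper for callers that hold neither lock */
static void rebuild_domains(void)
{
	pthread_rwlock_rdlock(&hotplug_lock);	/* kernel: get_online_cpus() */
	pthread_mutex_lock(&cpuset_lock);	/* kernel: mutex_lock(&cpuset_mutex) */
	locks_held = true;

	rebuild_domains_cpuslocked();

	locks_held = false;
	pthread_mutex_unlock(&cpuset_lock);
	pthread_rwlock_unlock(&hotplug_lock);	/* kernel: put_online_cpus() */
}

int main(void)
{
	rebuild_domains();
	return 0;
}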

^ permalink raw reply	[flat|nested] 19+ messages in thread

* [PATCH] cgroup/cpuset: remove circular dependency deadlock
@ 2017-09-07  6:04 Prateek Sood
  2017-09-07  7:28 ` Peter Zijlstra
  0 siblings, 1 reply; 19+ messages in thread
From: Prateek Sood @ 2017-09-07  6:04 UTC (permalink / raw)
  To: tj, lizefan, cgroups, mingo, longman; +Cc: Prateek Sood, linux-kernel, sramana

Remove a circular dependency deadlock that can occur when a CPU is being
hotplugged while a cgroup/cpuset update is triggered from userspace.

Example scenario:
kworker/0:0 => kthreadd => init:729 => init:1 => kworker/0:0

kworker/0:0 - percpu_down_write(&cpu_hotplug_lock)  [held]
              flush(work)   [no high prio workqueue available on CPU]
              wait_for_completion()

kthreadd    - percpu_down_read(cgroup_threadgroup_rwsem)  [waiting]

init:729    - percpu_down_write(cgroup_threadgroup_rwsem)   [held]
              lock(cpuset_mutex)   [waiting]

init:1      - lock(cpuset_mutex)   [held]
              percpu_down_read(&cpu_hotplug_lock)   [waiting]

Eliminate this dependency by reordering the locking of cpuset_mutex
and cpu_hotplug_lock into the following order:
1. Acquire cpu_hotplug_lock (read)
2. Acquire cpuset_mutex

Signed-off-by: Prateek Sood <prsood@codeaurora.org>
---
 kernel/cgroup/cpuset.c | 70 +++++++++++++++++++++++++++++++++++++++++++-------
 1 file changed, 61 insertions(+), 9 deletions(-)

diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 2f4039b..687be57 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -843,10 +843,41 @@ static void rebuild_sched_domains_locked(void)
 out:
 	put_online_cpus();
 }
+
+/*
+ * Rebuild scheduler domains.
+ * Call with following lock held in the order
+ * 1. cpu_hotplug_lock (read)
+ * 2. cpuset_mutex
+ */
+static void rebuild_sched_domains_unlocked(void)
+{
+        struct sched_domain_attr *attr;
+        cpumask_var_t *doms;
+        int ndoms;
+
+        /*
+         * We have raced with CPU hotplug. Don't do anything to avoid
+         * passing doms with offlined cpu to partition_sched_domains().
+         * Anyways, hotplug work item will rebuild sched domains.
+         */
+        if (!cpumask_equal(top_cpuset.effective_cpus, cpu_active_mask))
+                return;
+
+        /* Generate domain masks and attrs */
+        ndoms = generate_sched_domains(&doms, &attr);
+
+        /* Have scheduler rebuild the domains */
+        partition_sched_domains(ndoms, doms, attr);
+}
 #else /* !CONFIG_SMP */
 static void rebuild_sched_domains_locked(void)
 {
 }
+
+static void rebuild_sched_domains_unlocked(void)
+{
+}
 #endif /* CONFIG_SMP */
 
 void rebuild_sched_domains(void)
@@ -885,7 +916,9 @@ static void update_tasks_cpumask(struct cpuset *cs)
  *
  * On legacy hierachy, effective_cpus will be the same with cpu_allowed.
  *
- * Called with cpuset_mutex held
+ * Called with following lock held in order
+ * 1. cpu_hotplug_lock (read)
+ * 2. cpuset_mutex
  */
 static void update_cpumasks_hier(struct cpuset *cs, struct cpumask *new_cpus)
 {
@@ -940,7 +973,7 @@ static void update_cpumasks_hier(struct cpuset *cs, struct cpumask *new_cpus)
 	rcu_read_unlock();
 
 	if (need_rebuild_sched_domains)
-		rebuild_sched_domains_locked();
+		rebuild_sched_domains_unlocked();
 }
 
 /**
@@ -1262,6 +1295,11 @@ int current_cpuset_is_being_rebound(void)
 	return ret;
 }
 
+/*
+ * Call with following lock held in order
+ * 1. cpu_hotplug_lock (read)
+ * 2. cpuset_mutex
+ */
 static int update_relax_domain_level(struct cpuset *cs, s64 val)
 {
 #ifdef CONFIG_SMP
@@ -1273,7 +1311,7 @@ static int update_relax_domain_level(struct cpuset *cs, s64 val)
 		cs->relax_domain_level = val;
 		if (!cpumask_empty(cs->cpus_allowed) &&
 		    is_sched_load_balance(cs))
-			rebuild_sched_domains_locked();
+			rebuild_sched_domains_unlocked();
 	}
 
 	return 0;
@@ -1304,9 +1342,10 @@ static void update_tasks_flags(struct cpuset *cs)
  * cs:		the cpuset to update
  * turning_on: 	whether the flag is being set or cleared
  *
- * Call with cpuset_mutex held.
+ * Call with following lock held in order
+ * 1. cpu_hotplug_lock (read)
+ * 2. cpuset_mutex
  */
-
 static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
 		       int turning_on)
 {
@@ -1339,7 +1378,7 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
 	spin_unlock_irq(&callback_lock);
 
 	if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed)
-		rebuild_sched_domains_locked();
+		rebuild_sched_domains_unlocked();
 
 	if (spread_flag_changed)
 		update_tasks_flags(cs);
@@ -1607,6 +1646,7 @@ static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft,
 	cpuset_filetype_t type = cft->private;
 	int retval = 0;
 
+	get_online_cpus();
 	mutex_lock(&cpuset_mutex);
 	if (!is_cpuset_online(cs)) {
 		retval = -ENODEV;
@@ -1644,6 +1684,7 @@ static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft,
 	}
 out_unlock:
 	mutex_unlock(&cpuset_mutex);
+	put_online_cpus();
 	return retval;
 }
 
@@ -1654,6 +1695,7 @@ static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft,
 	cpuset_filetype_t type = cft->private;
 	int retval = -ENODEV;
 
+	get_online_cpus();
 	mutex_lock(&cpuset_mutex);
 	if (!is_cpuset_online(cs))
 		goto out_unlock;
@@ -1668,6 +1710,7 @@ static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft,
 	}
 out_unlock:
 	mutex_unlock(&cpuset_mutex);
+	put_online_cpus();
 	return retval;
 }
 
@@ -1706,6 +1749,7 @@ static ssize_t cpuset_write_resmask(struct kernfs_open_file *of,
 	kernfs_break_active_protection(of->kn);
 	flush_work(&cpuset_hotplug_work);
 
+	get_online_cpus();
 	mutex_lock(&cpuset_mutex);
 	if (!is_cpuset_online(cs))
 		goto out_unlock;
@@ -1731,6 +1775,7 @@ static ssize_t cpuset_write_resmask(struct kernfs_open_file *of,
 	free_trial_cpuset(trialcs);
 out_unlock:
 	mutex_unlock(&cpuset_mutex);
+	put_online_cpus();
 	kernfs_unbreak_active_protection(of->kn);
 	css_put(&cs->css);
 	flush_workqueue(cpuset_migrate_mm_wq);
@@ -2031,13 +2076,14 @@ static int cpuset_css_online(struct cgroup_subsys_state *css)
 /*
  * If the cpuset being removed has its flag 'sched_load_balance'
  * enabled, then simulate turning sched_load_balance off, which
- * will call rebuild_sched_domains_locked().
+ * will call rebuild_sched_domains_unlocked().
  */
 
 static void cpuset_css_offline(struct cgroup_subsys_state *css)
 {
 	struct cpuset *cs = css_cs(css);
 
+	get_online_cpus();
 	mutex_lock(&cpuset_mutex);
 
 	if (is_sched_load_balance(cs))
@@ -2047,6 +2093,7 @@ static void cpuset_css_offline(struct cgroup_subsys_state *css)
 	clear_bit(CS_ONLINE, &cs->flags);
 
 	mutex_unlock(&cpuset_mutex);
+	put_online_cpus();
 }
 
 static void cpuset_css_free(struct cgroup_subsys_state *css)
@@ -2341,8 +2388,13 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
 	}
 
 	/* rebuild sched domains if cpus_allowed has changed */
-	if (cpus_updated)
-		rebuild_sched_domains();
+	if (cpus_updated) {
+		get_online_cpus();
+		mutex_lock(&cpuset_mutex);
+		rebuild_sched_domains_unlocked();
+		mutex_unlock(&cpuset_mutex);
+		put_online_cpus();
+	}
 }
 
 void cpuset_update_active_cpus(void)
-- 
Qualcomm India Private Limited, on behalf of Qualcomm Innovation Center, Inc., 
is a member of Code Aurora Forum, a Linux Foundation Collaborative Project.

^ permalink raw reply related	[flat|nested] 19+ messages in thread

end of thread, other threads:[~2017-10-27  8:04 UTC | newest]

Thread overview: 19+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2017-09-06 11:48 [PATCH] cgroup/cpuset: remove circular dependency deadlock Prateek Sood
2017-09-06 12:56 ` Waiman Long
2017-09-06 14:23   ` Prateek Sood
2017-09-07  6:04 Prateek Sood
2017-09-07  7:28 ` Peter Zijlstra
2017-09-07  8:56   ` Boqun Feng
2017-09-07  9:07     ` Prateek Sood
2017-09-07  9:05   ` Prateek Sood
2017-09-07 13:56 Prateek Sood
2017-09-07 17:45 ` Peter Zijlstra
2017-09-08  2:13   ` Prateek Sood
2017-09-07 17:51 ` Peter Zijlstra
2017-10-09 13:27   ` Prateek Sood
2017-10-11  9:48     ` Peter Zijlstra
2017-10-25  8:39       ` Prateek Sood
2017-10-25  9:30         ` Peter Zijlstra
2017-10-26 11:52           ` Prateek Sood
2017-10-26 14:05             ` Waiman Long
2017-10-27  8:03               ` Prateek Sood
