* [PATCH] cgroup/cpuset: remove circular dependency deadlock @ 2017-09-07 13:56 Prateek Sood 2017-09-07 17:45 ` Peter Zijlstra 2017-09-07 17:51 ` Peter Zijlstra 0 siblings, 2 replies; 21+ messages in thread From: Prateek Sood @ 2017-09-07 13:56 UTC (permalink / raw) To: tj, lizefan, cgroups, mingo, longman, peterz, boqun.feng, tglx Cc: Prateek Sood, linux-kernel, sramana Remove circular dependency deadlock in a scenario where hotplug of CPU is being done while there is updation in cgroup and cpuset triggered from userspace. Process A => kthreadd => Process B => Process C => Process A Process A cpu_subsys_offline(); cpu_down(); _cpu_down(); percpu_down_write(&cpu_hotplug_lock); //held cpuhp_invoke_callback(); workqueue_offline_cpu(); wq_update_unbound_numa(); kthread_create_on_node(); wake_up_process(); //wakeup kthreadd flush_work(); wait_for_completion(); kthreadd kthreadd(); kernel_thread(); do_fork(); copy_process(); percpu_down_read(&cgroup_threadgroup_rwsem); __rwsem_down_read_failed_common(); //waiting Process B kernfs_fop_write(); cgroup_file_write(); cgroup_procs_write(); percpu_down_write(&cgroup_threadgroup_rwsem); //held cgroup_attach_task(); cgroup_migrate(); cgroup_migrate_execute(); cpuset_can_attach(); mutex_lock(&cpuset_mutex); //waiting Process C kernfs_fop_write(); cgroup_file_write(); cpuset_write_resmask(); mutex_lock(&cpuset_mutex); //held update_cpumask(); update_cpumasks_hier(); rebuild_sched_domains_locked(); get_online_cpus(); percpu_down_read(&cpu_hotplug_lock); //waiting Signed-off-by: Prateek Sood <prsood@codeaurora.org> --- kernel/cgroup/cpuset.c | 32 +++++++++++++++++++------------- 1 file changed, 19 insertions(+), 13 deletions(-) diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 2f4039b..60dc0ac 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -816,16 +816,15 @@ static int generate_sched_domains(cpumask_var_t **domains, * 'cpus' is removed, then call this routine to rebuild the * scheduler's dynamic sched domains. * - * Call with cpuset_mutex held. Takes get_online_cpus(). */ -static void rebuild_sched_domains_locked(void) +static void rebuild_sched_domains_cpuslocked(void) { struct sched_domain_attr *attr; cpumask_var_t *doms; int ndoms; + lockdep_assert_cpus_held(); lockdep_assert_held(&cpuset_mutex); - get_online_cpus(); /* * We have raced with CPU hotplug. Don't do anything to avoid @@ -833,27 +832,27 @@ static void rebuild_sched_domains_locked(void) * Anyways, hotplug work item will rebuild sched domains. 
*/ if (!cpumask_equal(top_cpuset.effective_cpus, cpu_active_mask)) - goto out; + return; /* Generate domain masks and attrs */ ndoms = generate_sched_domains(&doms, &attr); /* Have scheduler rebuild the domains */ partition_sched_domains(ndoms, doms, attr); -out: - put_online_cpus(); } #else /* !CONFIG_SMP */ -static void rebuild_sched_domains_locked(void) +static void rebuild_sched_domains_cpuslocked(void) { } #endif /* CONFIG_SMP */ void rebuild_sched_domains(void) { + get_online_cpus(); mutex_lock(&cpuset_mutex); - rebuild_sched_domains_locked(); + rebuild_sched_domains_cpuslocked(); mutex_unlock(&cpuset_mutex); + put_online_cpus(); } /** @@ -940,7 +939,7 @@ static void update_cpumasks_hier(struct cpuset *cs, struct cpumask *new_cpus) rcu_read_unlock(); if (need_rebuild_sched_domains) - rebuild_sched_domains_locked(); + rebuild_sched_domains_cpuslocked(); } /** @@ -1273,7 +1272,7 @@ static int update_relax_domain_level(struct cpuset *cs, s64 val) cs->relax_domain_level = val; if (!cpumask_empty(cs->cpus_allowed) && is_sched_load_balance(cs)) - rebuild_sched_domains_locked(); + rebuild_sched_domains_cpuslocked(); } return 0; @@ -1306,7 +1305,6 @@ static void update_tasks_flags(struct cpuset *cs) * * Call with cpuset_mutex held. */ - static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, int turning_on) { @@ -1339,7 +1337,7 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, spin_unlock_irq(&callback_lock); if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed) - rebuild_sched_domains_locked(); + rebuild_sched_domains_cpuslocked(); if (spread_flag_changed) update_tasks_flags(cs); @@ -1607,6 +1605,7 @@ static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft, cpuset_filetype_t type = cft->private; int retval = 0; + get_online_cpus(); mutex_lock(&cpuset_mutex); if (!is_cpuset_online(cs)) { retval = -ENODEV; @@ -1644,6 +1643,7 @@ static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft, } out_unlock: mutex_unlock(&cpuset_mutex); + put_online_cpus(); return retval; } @@ -1654,6 +1654,7 @@ static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft, cpuset_filetype_t type = cft->private; int retval = -ENODEV; + get_online_cpus(); mutex_lock(&cpuset_mutex); if (!is_cpuset_online(cs)) goto out_unlock; @@ -1668,6 +1669,7 @@ static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft, } out_unlock: mutex_unlock(&cpuset_mutex); + put_online_cpus(); return retval; } @@ -1706,6 +1708,7 @@ static ssize_t cpuset_write_resmask(struct kernfs_open_file *of, kernfs_break_active_protection(of->kn); flush_work(&cpuset_hotplug_work); + get_online_cpus(); mutex_lock(&cpuset_mutex); if (!is_cpuset_online(cs)) goto out_unlock; @@ -1731,6 +1734,7 @@ static ssize_t cpuset_write_resmask(struct kernfs_open_file *of, free_trial_cpuset(trialcs); out_unlock: mutex_unlock(&cpuset_mutex); + put_online_cpus(); kernfs_unbreak_active_protection(of->kn); css_put(&cs->css); flush_workqueue(cpuset_migrate_mm_wq); @@ -2031,13 +2035,14 @@ static int cpuset_css_online(struct cgroup_subsys_state *css) /* * If the cpuset being removed has its flag 'sched_load_balance' * enabled, then simulate turning sched_load_balance off, which - * will call rebuild_sched_domains_locked(). + * will call rebuild_sched_domains_cpuslocked(). 
*/ static void cpuset_css_offline(struct cgroup_subsys_state *css) { struct cpuset *cs = css_cs(css); + get_online_cpus(); mutex_lock(&cpuset_mutex); if (is_sched_load_balance(cs)) @@ -2047,6 +2052,7 @@ static void cpuset_css_offline(struct cgroup_subsys_state *css) clear_bit(CS_ONLINE, &cs->flags); mutex_unlock(&cpuset_mutex); + put_online_cpus(); } static void cpuset_css_free(struct cgroup_subsys_state *css) -- Qualcomm India Private Limited, on behalf of Qualcomm Innovation Center, Inc., is a member of Code Aurora Forum, a Linux Foundation Collaborative Project. ^ permalink raw reply related [flat|nested] 21+ messages in thread
* Re: [PATCH] cgroup/cpuset: remove circular dependency deadlock 2017-09-07 13:56 [PATCH] cgroup/cpuset: remove circular dependency deadlock Prateek Sood @ 2017-09-07 17:45 ` Peter Zijlstra 2017-09-08 2:13 ` Prateek Sood 2017-09-07 17:51 ` Peter Zijlstra 1 sibling, 1 reply; 21+ messages in thread From: Peter Zijlstra @ 2017-09-07 17:45 UTC (permalink / raw) To: Prateek Sood Cc: tj, lizefan, cgroups, mingo, longman, boqun.feng, tglx, linux-kernel, sramana On Thu, Sep 07, 2017 at 07:26:23PM +0530, Prateek Sood wrote: > Remove circular dependency deadlock in a scenario where hotplug of CPU is > being done while there is updation in cgroup and cpuset triggered from > userspace. > > Process A => kthreadd => Process B => Process C => Process A > Process A > cpu_subsys_offline(); > cpu_down(); > _cpu_down(); > percpu_down_write(&cpu_hotplug_lock); //held > cpuhp_invoke_callback(); > workqueue_offline_cpu(); > wq_update_unbound_numa(); > kthread_create_on_node(); > wake_up_process(); //wakeup kthreadd TJ, I'm puzzled, why would we need to spawn new threads to update NUMA affinity when taking a CPU out? That doesn't make sense to me, we can either shrink the affinity of an existing thread or completely kill of a thread if the mask becomes empty. But why spawn a new thread? > flush_work(); > wait_for_completion(); > > kthreadd > kthreadd(); > kernel_thread(); > do_fork(); > copy_process(); > percpu_down_read(&cgroup_threadgroup_rwsem); > __rwsem_down_read_failed_common(); //waiting So this will eventually do our: complete() to make A go. > Process B > kernfs_fop_write(); > cgroup_file_write(); > cgroup_procs_write(); > percpu_down_write(&cgroup_threadgroup_rwsem); //held > cgroup_attach_task(); > cgroup_migrate(); > cgroup_migrate_execute(); > cpuset_can_attach(); > mutex_lock(&cpuset_mutex); //waiting > > Process C > kernfs_fop_write(); > cgroup_file_write(); > cpuset_write_resmask(); > mutex_lock(&cpuset_mutex); //held > update_cpumask(); > update_cpumasks_hier(); > rebuild_sched_domains_locked(); > get_online_cpus(); > percpu_down_read(&cpu_hotplug_lock); //waiting So the whole thing looks like: A B C D L(hotplug) L(threadgroup) L(cpuset) L(threadgroup) WFC(c) L(cpuset) L(hotplug) C(c) Yes, inverting cpuset and hotplug would break that chain, but I'm still wondering why workqueue needs to spawn threads on CPU down. ^ permalink raw reply [flat|nested] 21+ messages in thread
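The four-party cycle Peter sketches above is an ordinary circular wait, just spread
across tasks. A minimal userspace analogue deadlocks the same way; it is purely
illustrative (plain pthread mutexes stand in for the kernel's percpu rwsems, and the
wait-for-completion leg between process A and kthreadd is collapsed into a direct
lock acquisition):

/* Illustrative only: three mutexes standing in for cpu_hotplug_lock,
 * cgroup_threadgroup_rwsem and cpuset_mutex. Each thread takes one lock,
 * then waits on the next one in the ring, so no thread can ever proceed.
 * Build with: gcc -pthread ring-deadlock.c
 */
#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

static pthread_mutex_t hotplug     = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t threadgroup = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t cpuset      = PTHREAD_MUTEX_INITIALIZER;

static void *hold_then_wait(pthread_mutex_t *held, pthread_mutex_t *wanted)
{
	pthread_mutex_lock(held);
	sleep(1);			/* let every thread take its first lock */
	pthread_mutex_lock(wanted);	/* circular wait: blocks forever */
	return NULL;
}

/* A: holds hotplug, (indirectly, via kthreadd) waits on threadgroup */
static void *task_a(void *u) { return hold_then_wait(&hotplug, &threadgroup); }
/* B: holds threadgroup, waits on cpuset */
static void *task_b(void *u) { return hold_then_wait(&threadgroup, &cpuset); }
/* C: holds cpuset, waits on hotplug */
static void *task_c(void *u) { return hold_then_wait(&cpuset, &hotplug); }

int main(void)
{
	pthread_t a, b, c;

	pthread_create(&a, NULL, task_a, NULL);
	pthread_create(&b, NULL, task_b, NULL);
	pthread_create(&c, NULL, task_c, NULL);
	pthread_join(a, NULL);		/* never returns: A -> B -> C -> A */
	puts("unreachable");
	return 0;
}

Breaking any single edge breaks the cycle; the patch picks the cpuset -> hotplug
edge, by making every cpuset path take the hotplug lock first.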
* Re: [PATCH] cgroup/cpuset: remove circular dependency deadlock
  2017-09-07 17:45 ` Peter Zijlstra
@ 2017-09-08  2:13   ` Prateek Sood
  0 siblings, 0 replies; 21+ messages in thread
From: Prateek Sood @ 2017-09-08 2:13 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: tj, lizefan, cgroups, mingo, longman, boqun.feng, tglx,
	linux-kernel, sramana

On 09/07/2017 11:15 PM, Peter Zijlstra wrote:
> On Thu, Sep 07, 2017 at 07:26:23PM +0530, Prateek Sood wrote:
>> Remove circular dependency deadlock in a scenario where hotplug of CPU is
>> being done while there is updation in cgroup and cpuset triggered from
>> userspace.
>>
>> Process A => kthreadd => Process B => Process C => Process A
>
>> Process A
>> cpu_subsys_offline();
>>   cpu_down();
>>     _cpu_down();
>>       percpu_down_write(&cpu_hotplug_lock); //held
>>       cpuhp_invoke_callback();
>>         workqueue_offline_cpu();
>>           wq_update_unbound_numa();
>>             kthread_create_on_node();
>>               wake_up_process();  //wakeup kthreadd
>
> TJ, I'm puzzled, why would we need to spawn new threads to update NUMA
> affinity when taking a CPU out? That doesn't make sense to me, we can
> either shrink the affinity of an existing thread or completely kill of a
> thread if the mask becomes empty. But why spawn a new thread?
>
>>           flush_work();
>>           wait_for_completion();
>>
>
> Yes, inverting cpuset and hotplug would break that chain, but I'm still
> wondering why workqueue needs to spawn threads on CPU down.
>

Thanks for the comments, Peter.

You rightly mentioned that no new thread will be spawned while updating NUMA
affinity when taking a CPU out. What actually happens is: while a CPU is being
taken offline, an attempt is made to unbind the per-cpu workers of the CPU
going down. This is done by queuing the unbind work on system_highpri_wq,
which results in an attempt to create one bound worker thread, as there is
none. The wait_for_completion() in flush_work() then waits for the unbinding
of the CPU going down to finish.

Process A
cpu_subsys_offline();
  cpu_down();
    _cpu_down();
      percpu_down_write(&cpu_hotplug_lock); //held
      cpuhp_invoke_callback();
        workqueue_offline_cpu();
          queue_work_on(system_highpri_wq);
            __queue_work();
              insert_work();
                wake_up_worker(); //pool->nr_running = 0
      flush_work();
      wait_for_completion();

worker_thread();
  need_more_worker(); // returns true
  manage_workers();
    maybe_create_worker();
      create_worker();
        kthread_create_on_node();
          wake_up_process(kthreadd_task);

--
Qualcomm India Private Limited, on behalf of Qualcomm Innovation Center,
Inc., is a member of Code Aurora Forum, a Linux Foundation Collaborative Project

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH] cgroup/cpuset: remove circular dependency deadlock 2017-09-07 13:56 [PATCH] cgroup/cpuset: remove circular dependency deadlock Prateek Sood 2017-09-07 17:45 ` Peter Zijlstra @ 2017-09-07 17:51 ` Peter Zijlstra 2017-10-09 13:27 ` Prateek Sood 1 sibling, 1 reply; 21+ messages in thread From: Peter Zijlstra @ 2017-09-07 17:51 UTC (permalink / raw) To: Prateek Sood Cc: tj, lizefan, cgroups, mingo, longman, boqun.feng, tglx, linux-kernel, sramana On Thu, Sep 07, 2017 at 07:26:23PM +0530, Prateek Sood wrote: > Remove circular dependency deadlock in a scenario where hotplug of CPU is > being done while there is updation in cgroup and cpuset triggered from > userspace. You've forgotten to mention your solution to the deadlock, namely inverting cpuset_mutex and cpu_hotplug_lock. > Signed-off-by: Prateek Sood <prsood@codeaurora.org> > --- > kernel/cgroup/cpuset.c | 32 +++++++++++++++++++------------- > 1 file changed, 19 insertions(+), 13 deletions(-) > > diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c > index 2f4039b..60dc0ac 100644 > --- a/kernel/cgroup/cpuset.c > +++ b/kernel/cgroup/cpuset.c > @@ -816,16 +816,15 @@ static int generate_sched_domains(cpumask_var_t **domains, > * 'cpus' is removed, then call this routine to rebuild the > * scheduler's dynamic sched domains. > * > - * Call with cpuset_mutex held. Takes get_online_cpus(). > */ > -static void rebuild_sched_domains_locked(void) > +static void rebuild_sched_domains_cpuslocked(void) > { > struct sched_domain_attr *attr; > cpumask_var_t *doms; > int ndoms; > > + lockdep_assert_cpus_held(); > lockdep_assert_held(&cpuset_mutex); > - get_online_cpus(); > > /* > * We have raced with CPU hotplug. Don't do anything to avoid > @@ -833,27 +832,27 @@ static void rebuild_sched_domains_locked(void) > * Anyways, hotplug work item will rebuild sched domains. > */ > if (!cpumask_equal(top_cpuset.effective_cpus, cpu_active_mask)) > - goto out; > + return; > > /* Generate domain masks and attrs */ > ndoms = generate_sched_domains(&doms, &attr); > > /* Have scheduler rebuild the domains */ > partition_sched_domains(ndoms, doms, attr); > -out: > - put_online_cpus(); > } > #else /* !CONFIG_SMP */ > -static void rebuild_sched_domains_locked(void) > +static void rebuild_sched_domains_cpuslocked(void) > { > } > #endif /* CONFIG_SMP */ > > void rebuild_sched_domains(void) > { > + get_online_cpus(); > mutex_lock(&cpuset_mutex); > - rebuild_sched_domains_locked(); > + rebuild_sched_domains_cpuslocked(); > mutex_unlock(&cpuset_mutex); > + put_online_cpus(); > } But if you invert these locks, the need for cpuset_hotplug_workfn() goes away, at least for the CPU part, and we can make in synchronous again. Yay!! Also, I think new code should use cpus_read_lock() instead of get_online_cpus(). ^ permalink raw reply [flat|nested] 21+ messages in thread
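On Peter's last point: since the 4.13 hotplug-locking rework turned
cpu_hotplug_lock into a percpu rwsem, get_online_cpus()/put_online_cpus()
survive only as compatibility wrappers, so the request is about naming rather
than behaviour. From memory (worth double-checking against include/linux/cpu.h
in the tree at hand), the mapping is simply:

/* Sketch, from memory, of the include/linux/cpu.h compatibility wrappers;
 * verify against the actual tree before relying on the exact form.
 */
static inline void get_online_cpus(void) { cpus_read_lock(); }
static inline void put_online_cpus(void) { cpus_read_unlock(); }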
* Re: [PATCH] cgroup/cpuset: remove circular dependency deadlock 2017-09-07 17:51 ` Peter Zijlstra @ 2017-10-09 13:27 ` Prateek Sood 2017-10-11 9:48 ` Peter Zijlstra 0 siblings, 1 reply; 21+ messages in thread From: Prateek Sood @ 2017-10-09 13:27 UTC (permalink / raw) To: Peter Zijlstra Cc: tj, lizefan, cgroups, mingo, longman, boqun.feng, tglx, linux-kernel, sramana On 09/07/2017 11:21 PM, Peter Zijlstra wrote: > On Thu, Sep 07, 2017 at 07:26:23PM +0530, Prateek Sood wrote: >> Remove circular dependency deadlock in a scenario where hotplug of CPU is >> being done while there is updation in cgroup and cpuset triggered from >> userspace. > > You've forgotten to mention your solution to the deadlock, namely > inverting cpuset_mutex and cpu_hotplug_lock. > >> Signed-off-by: Prateek Sood <prsood@codeaurora.org> >> --- >> kernel/cgroup/cpuset.c | 32 +++++++++++++++++++------------- >> 1 file changed, 19 insertions(+), 13 deletions(-) >> >> diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c >> index 2f4039b..60dc0ac 100644 >> --- a/kernel/cgroup/cpuset.c >> +++ b/kernel/cgroup/cpuset.c >> @@ -816,16 +816,15 @@ static int generate_sched_domains(cpumask_var_t **domains, >> * 'cpus' is removed, then call this routine to rebuild the >> * scheduler's dynamic sched domains. >> * >> - * Call with cpuset_mutex held. Takes get_online_cpus(). >> */ >> -static void rebuild_sched_domains_locked(void) >> +static void rebuild_sched_domains_cpuslocked(void) >> { >> struct sched_domain_attr *attr; >> cpumask_var_t *doms; >> int ndoms; >> >> + lockdep_assert_cpus_held(); >> lockdep_assert_held(&cpuset_mutex); >> - get_online_cpus(); >> >> /* >> * We have raced with CPU hotplug. Don't do anything to avoid >> @@ -833,27 +832,27 @@ static void rebuild_sched_domains_locked(void) >> * Anyways, hotplug work item will rebuild sched domains. >> */ >> if (!cpumask_equal(top_cpuset.effective_cpus, cpu_active_mask)) >> - goto out; >> + return; >> >> /* Generate domain masks and attrs */ >> ndoms = generate_sched_domains(&doms, &attr); >> >> /* Have scheduler rebuild the domains */ >> partition_sched_domains(ndoms, doms, attr); >> -out: >> - put_online_cpus(); >> } >> #else /* !CONFIG_SMP */ >> -static void rebuild_sched_domains_locked(void) >> +static void rebuild_sched_domains_cpuslocked(void) >> { >> } >> #endif /* CONFIG_SMP */ >> >> void rebuild_sched_domains(void) >> { >> + get_online_cpus(); >> mutex_lock(&cpuset_mutex); >> - rebuild_sched_domains_locked(); >> + rebuild_sched_domains_cpuslocked(); >> mutex_unlock(&cpuset_mutex); >> + put_online_cpus(); >> } > > But if you invert these locks, the need for cpuset_hotplug_workfn() goes > away, at least for the CPU part, and we can make in synchronous again. > Yay!! > > Also, I think new code should use cpus_read_lock() instead of > get_online_cpus(). > Thanks for the review comments Peter. For patch related to circular deadlock, I will send an updated version. 
The hotplug callback whose teardown path leads to cpuset_hotplug_workfn() is

	[CPUHP_AP_ACTIVE] = {
		.name			= "sched:active",
		.startup.single		= sched_cpu_activate,
		.teardown.single	= sched_cpu_deactivate,
	},

If we make cpuset_hotplug_workfn() synchronous, a deadlock might happen:

_cpu_down()
  cpus_write_lock()  //held
  cpuhp_kick_ap_work()
    cpuhp_kick_ap()
      __cpuhp_kick_ap()
        wake_up_process()  //cpuhp_thread_fun
        wait_for_ap_thread()  //wait for complete from cpuhp_thread_fun()

cpuhp_thread_fun()
  cpuhp_invoke_callback()
    sched_cpu_deactivate()
      cpuset_cpu_inactive()
        cpuset_update_active_cpus()
          cpuset_hotplug_work()
            rebuild_sched_domains()
              cpus_read_lock()  //waiting, as acquired in _cpu_down()

--
Qualcomm India Private Limited, on behalf of Qualcomm Innovation Center,
Inc., is a member of Code Aurora Forum, a Linux Foundation Collaborative Project

^ permalink raw reply	[flat|nested] 21+ messages in thread
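The trace above deadlocks, rather than recursing, because cpu_hotplug_lock is a
percpu rw-semaphore: once a writer holds it, any new reader blocks until the
writer releases it, even when that reader is doing work the writer is waiting
for. A small, runnable userspace sketch of the same shape (a plain pthread
rwlock stands in for the percpu rwsem; all names are illustrative):

/* Why the cpuhp thread's read acquisition can never succeed while
 * _cpu_down() holds the write side and waits for the cpuhp thread.
 * Build with: gcc -pthread rwsem-self-deadlock.c
 */
#include <pthread.h>
#include <stdio.h>

static pthread_rwlock_t hotplug = PTHREAD_RWLOCK_INITIALIZER;

static void *cpuhp_thread(void *unused)
{
	/* corresponds to rebuild_sched_domains() -> cpus_read_lock() */
	pthread_rwlock_rdlock(&hotplug);	/* blocks: writer is held */
	puts("never reached while the writer waits on this thread");
	pthread_rwlock_unlock(&hotplug);
	return NULL;
}

int main(void)
{
	pthread_t t;

	pthread_rwlock_wrlock(&hotplug);	/* _cpu_down() -> cpus_write_lock() */
	pthread_create(&t, NULL, cpuhp_thread, NULL);
	pthread_join(t, NULL);			/* wait_for_ap_thread() analogue: deadlock */
	pthread_rwlock_unlock(&hotplug);
	return 0;
}

This is why Peter's answer below is to call the _cpuslocked variant, which
expects the lock to already be held instead of taking it.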
* Re: [PATCH] cgroup/cpuset: remove circular dependency deadlock 2017-10-09 13:27 ` Prateek Sood @ 2017-10-11 9:48 ` Peter Zijlstra 2017-10-25 8:39 ` Prateek Sood 0 siblings, 1 reply; 21+ messages in thread From: Peter Zijlstra @ 2017-10-11 9:48 UTC (permalink / raw) To: Prateek Sood Cc: tj, lizefan, cgroups, mingo, longman, boqun.feng, tglx, linux-kernel, sramana On Mon, Oct 09, 2017 at 06:57:46PM +0530, Prateek Sood wrote: > On 09/07/2017 11:21 PM, Peter Zijlstra wrote: > > But if you invert these locks, the need for cpuset_hotplug_workfn() goes > > away, at least for the CPU part, and we can make in synchronous again. > > Yay!! > The callback making a call to cpuset_hotplug_workfn()in hotplug path are > [CPUHP_AP_ACTIVE] = { > .name = "sched:active", > .startup.single = sched_cpu_activate, > .teardown.single = sched_cpu_deactivate, > }, > > if we make cpuset_hotplug_workfn() synchronous, deadlock might happen: > _cpu_down() > cpus_write_lock() //held > cpuhp_kick_ap_work() > cpuhp_kick_ap() > __cpuhp_kick_ap() > wake_up_process() //cpuhp_thread_fun > wait_for_ap_thread() //wait for complete from cpuhp_thread_fun() > > cpuhp_thread_fun() > cpuhp_invoke_callback() > sched_cpu_deactivate() > cpuset_cpu_inactive() > cpuset_update_active_cpus() > cpuset_hotplug_work() > rebuild_sched_domains() > cpus_read_lock() //waiting as acquired in _cpu_down() Well, duh, don't use rebuild_sched_domains() 'obviously' :-) use rebuild_sched_domains_cpuslocked() instead and it works just fine. After applying your patch, the below boots and survives a hotplug. --- include/linux/cpuset.h | 6 ------ kernel/cgroup/cpuset.c | 30 +++++++++--------------------- kernel/power/process.c | 2 -- kernel/sched/core.c | 1 - 4 files changed, 9 insertions(+), 30 deletions(-) --- a/include/linux/cpuset.h +++ b/include/linux/cpuset.h @@ -51,9 +51,7 @@ static inline void cpuset_dec(void) extern int cpuset_init(void); extern void cpuset_init_smp(void); -extern void cpuset_force_rebuild(void); extern void cpuset_update_active_cpus(void); -extern void cpuset_wait_for_hotplug(void); extern void cpuset_cpus_allowed(struct task_struct *p, struct cpumask *mask); extern void cpuset_cpus_allowed_fallback(struct task_struct *p); extern nodemask_t cpuset_mems_allowed(struct task_struct *p); @@ -166,15 +164,11 @@ static inline bool cpusets_enabled(void) static inline int cpuset_init(void) { return 0; } static inline void cpuset_init_smp(void) {} -static inline void cpuset_force_rebuild(void) { } - static inline void cpuset_update_active_cpus(void) { partition_sched_domains(1, NULL, NULL); } -static inline void cpuset_wait_for_hotplug(void) { } - static inline void cpuset_cpus_allowed(struct task_struct *p, struct cpumask *mask) { --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -833,7 +833,12 @@ static void rebuild_sched_domains_cpuslo cpumask_var_t *doms; int ndoms; + /* + * When called during hotplug, this lock is held by the calling + * thread, not cpuhp_thread_fun :/ + * lockdep_assert_cpus_held(); + */ lockdep_assert_held(&cpuset_mutex); /* @@ -2281,13 +2286,6 @@ static void cpuset_hotplug_update_tasks( mutex_unlock(&cpuset_mutex); } -static bool force_rebuild; - -void cpuset_force_rebuild(void) -{ - force_rebuild = true; -} - /** * cpuset_hotplug_workfn - handle CPU/memory hotunplug for a cpuset * @@ -2362,25 +2360,15 @@ static void cpuset_hotplug_workfn(struct } /* rebuild sched domains if cpus_allowed has changed */ - if (cpus_updated || force_rebuild) { - force_rebuild = false; + if (cpus_updated) 
rebuild_sched_domains(); - } } void cpuset_update_active_cpus(void) { - /* - * We're inside cpu hotplug critical region which usually nests - * inside cgroup synchronization. Bounce actual hotplug processing - * to a work item to avoid reverse locking order. - */ - schedule_work(&cpuset_hotplug_work); -} - -void cpuset_wait_for_hotplug(void) -{ - flush_work(&cpuset_hotplug_work); + mutex_lock(&cpuset_mutex); + rebuild_sched_domains_cpuslocked(); + mutex_unlock(&cpuset_mutex); } /* --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -203,8 +203,6 @@ void thaw_processes(void) __usermodehelper_set_disable_depth(UMH_FREEZING); thaw_workqueues(); - cpuset_wait_for_hotplug(); - read_lock(&tasklist_lock); for_each_process_thread(g, p) { /* No other threads should have PF_SUSPEND_TASK set */ --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5598,7 +5598,6 @@ static void cpuset_cpu_active(void) * restore the original sched domains by considering the * cpuset configurations. */ - cpuset_force_rebuild(); } cpuset_update_active_cpus(); } ^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: [PATCH] cgroup/cpuset: remove circular dependency deadlock 2017-10-11 9:48 ` Peter Zijlstra @ 2017-10-25 8:39 ` Prateek Sood 2017-10-25 9:30 ` Peter Zijlstra 0 siblings, 1 reply; 21+ messages in thread From: Prateek Sood @ 2017-10-25 8:39 UTC (permalink / raw) To: Peter Zijlstra Cc: tj, lizefan, cgroups, mingo, longman, boqun.feng, tglx, linux-kernel, sramana On 10/11/2017 03:18 PM, Peter Zijlstra wrote: > On Mon, Oct 09, 2017 at 06:57:46PM +0530, Prateek Sood wrote: >> On 09/07/2017 11:21 PM, Peter Zijlstra wrote: > >>> But if you invert these locks, the need for cpuset_hotplug_workfn() goes >>> away, at least for the CPU part, and we can make in synchronous again. >>> Yay!! > >> The callback making a call to cpuset_hotplug_workfn()in hotplug path are >> [CPUHP_AP_ACTIVE] = { >> .name = "sched:active", >> .startup.single = sched_cpu_activate, >> .teardown.single = sched_cpu_deactivate, >> }, >> >> if we make cpuset_hotplug_workfn() synchronous, deadlock might happen: >> _cpu_down() >> cpus_write_lock() //held >> cpuhp_kick_ap_work() >> cpuhp_kick_ap() >> __cpuhp_kick_ap() >> wake_up_process() //cpuhp_thread_fun >> wait_for_ap_thread() //wait for complete from cpuhp_thread_fun() >> >> cpuhp_thread_fun() >> cpuhp_invoke_callback() >> sched_cpu_deactivate() >> cpuset_cpu_inactive() >> cpuset_update_active_cpus() >> cpuset_hotplug_work() >> rebuild_sched_domains() >> cpus_read_lock() //waiting as acquired in _cpu_down() > > Well, duh, don't use rebuild_sched_domains() 'obviously' :-) use > rebuild_sched_domains_cpuslocked() instead and it works just fine. > > After applying your patch, the below boots and survives a hotplug. > > --- > include/linux/cpuset.h | 6 ------ > kernel/cgroup/cpuset.c | 30 +++++++++--------------------- > kernel/power/process.c | 2 -- > kernel/sched/core.c | 1 - > 4 files changed, 9 insertions(+), 30 deletions(-) > > --- a/include/linux/cpuset.h > +++ b/include/linux/cpuset.h > @@ -51,9 +51,7 @@ static inline void cpuset_dec(void) > > extern int cpuset_init(void); > extern void cpuset_init_smp(void); > -extern void cpuset_force_rebuild(void); > extern void cpuset_update_active_cpus(void); > -extern void cpuset_wait_for_hotplug(void); > extern void cpuset_cpus_allowed(struct task_struct *p, struct cpumask *mask); > extern void cpuset_cpus_allowed_fallback(struct task_struct *p); > extern nodemask_t cpuset_mems_allowed(struct task_struct *p); > @@ -166,15 +164,11 @@ static inline bool cpusets_enabled(void) > static inline int cpuset_init(void) { return 0; } > static inline void cpuset_init_smp(void) {} > > -static inline void cpuset_force_rebuild(void) { } > - > static inline void cpuset_update_active_cpus(void) > { > partition_sched_domains(1, NULL, NULL); > } > > -static inline void cpuset_wait_for_hotplug(void) { } > - > static inline void cpuset_cpus_allowed(struct task_struct *p, > struct cpumask *mask) > { > --- a/kernel/cgroup/cpuset.c > +++ b/kernel/cgroup/cpuset.c > @@ -833,7 +833,12 @@ static void rebuild_sched_domains_cpuslo > cpumask_var_t *doms; > int ndoms; > > + /* > + * When called during hotplug, this lock is held by the calling > + * thread, not cpuhp_thread_fun :/ > + * > lockdep_assert_cpus_held(); > + */ > lockdep_assert_held(&cpuset_mutex); > > /* > @@ -2281,13 +2286,6 @@ static void cpuset_hotplug_update_tasks( > mutex_unlock(&cpuset_mutex); > } > > -static bool force_rebuild; > - > -void cpuset_force_rebuild(void) > -{ > - force_rebuild = true; > -} > - > /** > * cpuset_hotplug_workfn - handle CPU/memory hotunplug for a cpuset > * > @@ 
-2362,25 +2360,15 @@ static void cpuset_hotplug_workfn(struct > } > > /* rebuild sched domains if cpus_allowed has changed */ > - if (cpus_updated || force_rebuild) { > - force_rebuild = false; > + if (cpus_updated) > rebuild_sched_domains(); > - } > } > > void cpuset_update_active_cpus(void) > { > - /* > - * We're inside cpu hotplug critical region which usually nests > - * inside cgroup synchronization. Bounce actual hotplug processing > - * to a work item to avoid reverse locking order. > - */ > - schedule_work(&cpuset_hotplug_work); > -} > - > -void cpuset_wait_for_hotplug(void) > -{ > - flush_work(&cpuset_hotplug_work); > + mutex_lock(&cpuset_mutex); > + rebuild_sched_domains_cpuslocked(); > + mutex_unlock(&cpuset_mutex); > } > > /* > --- a/kernel/power/process.c > +++ b/kernel/power/process.c > @@ -203,8 +203,6 @@ void thaw_processes(void) > __usermodehelper_set_disable_depth(UMH_FREEZING); > thaw_workqueues(); > > - cpuset_wait_for_hotplug(); > - > read_lock(&tasklist_lock); > for_each_process_thread(g, p) { > /* No other threads should have PF_SUSPEND_TASK set */ > --- a/kernel/sched/core.c > +++ b/kernel/sched/core.c > @@ -5598,7 +5598,6 @@ static void cpuset_cpu_active(void) > * restore the original sched domains by considering the > * cpuset configurations. > */ > - cpuset_force_rebuild(); > } > cpuset_update_active_cpus(); > } > Thanks Peter for sharing the patch and test results. > void cpuset_update_active_cpus(void) > { > - /* > - * We're inside cpu hotplug critical region which usually nests > - * inside cgroup synchronization. Bounce actual hotplug processing > - * to a work item to avoid reverse locking order. > - */ > - schedule_work(&cpuset_hotplug_work); > -} > - > -void cpuset_wait_for_hotplug(void) > -{ > - flush_work(&cpuset_hotplug_work); > + mutex_lock(&cpuset_mutex); > + rebuild_sched_domains_cpuslocked(); > + mutex_unlock(&cpuset_mutex); > } In the above patch rebuild_sched_domains_cpuslocked() has been used directly. Earlier cpuset_hotplug_update_tasks() was also called from cpuset_hotplug_workfn(). So migration of tasks related to cgroup which has empty cpuset would not happen during cpu hotplug. Could you please help in understanding more on this. -- Qualcomm India Private Limited, on behalf of Qualcomm Innovation Center, Inc., is a member of Code Aurora Forum, a Linux Foundation Collaborative Project ^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: [PATCH] cgroup/cpuset: remove circular dependency deadlock 2017-10-25 8:39 ` Prateek Sood @ 2017-10-25 9:30 ` Peter Zijlstra 2017-10-26 11:52 ` Prateek Sood 0 siblings, 1 reply; 21+ messages in thread From: Peter Zijlstra @ 2017-10-25 9:30 UTC (permalink / raw) To: Prateek Sood Cc: tj, lizefan, cgroups, mingo, longman, boqun.feng, tglx, linux-kernel, sramana On Wed, Oct 25, 2017 at 02:09:54PM +0530, Prateek Sood wrote: > > void cpuset_update_active_cpus(void) > > { > > + mutex_lock(&cpuset_mutex); > > + rebuild_sched_domains_cpuslocked(); > > + mutex_unlock(&cpuset_mutex); > > } > > In the above patch rebuild_sched_domains_cpuslocked() has been > used directly. Earlier cpuset_hotplug_update_tasks() was also > called from cpuset_hotplug_workfn(). So migration of tasks > related to cgroup which has empty cpuset would not happen > during cpu hotplug. > > > Could you please help in understanding more on this. > That was me being lazy... ^ permalink raw reply [flat|nested] 21+ messages in thread
* [PATCH] cgroup/cpuset: remove circular dependency deadlock 2017-10-25 9:30 ` Peter Zijlstra @ 2017-10-26 11:52 ` Prateek Sood 2017-10-26 14:05 ` Waiman Long 0 siblings, 1 reply; 21+ messages in thread From: Prateek Sood @ 2017-10-26 11:52 UTC (permalink / raw) To: peterz, tj, lizefan, mingo, longman, boqun.feng, tglx Cc: Prateek Sood, cgroups, sramana, linux-kernel Remove circular dependency deadlock in a scenario where hotplug of CPU is being done while there is updation in cgroup and cpuset triggered from userspace. Process A => kthreadd => Process B => Process C => Process A Process A cpu_subsys_offline(); cpu_down(); _cpu_down(); percpu_down_write(&cpu_hotplug_lock); //held cpuhp_invoke_callback(); workqueue_offline_cpu(); wq_update_unbound_numa(); kthread_create_on_node(); wake_up_process(); //wakeup kthreadd flush_work(); wait_for_completion(); kthreadd kthreadd(); kernel_thread(); do_fork(); copy_process(); percpu_down_read(&cgroup_threadgroup_rwsem); __rwsem_down_read_failed_common(); //waiting Process B kernfs_fop_write(); cgroup_file_write(); cgroup_procs_write(); percpu_down_write(&cgroup_threadgroup_rwsem); //held cgroup_attach_task(); cgroup_migrate(); cgroup_migrate_execute(); cpuset_can_attach(); mutex_lock(&cpuset_mutex); //waiting Process C kernfs_fop_write(); cgroup_file_write(); cpuset_write_resmask(); mutex_lock(&cpuset_mutex); //held update_cpumask(); update_cpumasks_hier(); rebuild_sched_domains_locked(); get_online_cpus(); percpu_down_read(&cpu_hotplug_lock); //waiting Eliminating deadlock by reversing the locking order for cpuset_mutex and cpu_hotplug_lock. Signed-off-by: Prateek Sood <prsood@codeaurora.org> --- include/linux/cpuset.h | 6 ----- kernel/cgroup/cpuset.c | 70 ++++++++++++++++++++++++++------------------------ kernel/power/process.c | 2 -- kernel/sched/core.c | 1 - 4 files changed, 36 insertions(+), 43 deletions(-) diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h index a1e6a33..e74655d 100644 --- a/include/linux/cpuset.h +++ b/include/linux/cpuset.h @@ -51,9 +51,7 @@ static inline void cpuset_dec(void) extern int cpuset_init(void); extern void cpuset_init_smp(void); -extern void cpuset_force_rebuild(void); extern void cpuset_update_active_cpus(void); -extern void cpuset_wait_for_hotplug(void); extern void cpuset_cpus_allowed(struct task_struct *p, struct cpumask *mask); extern void cpuset_cpus_allowed_fallback(struct task_struct *p); extern nodemask_t cpuset_mems_allowed(struct task_struct *p); @@ -166,15 +164,11 @@ static inline void set_mems_allowed(nodemask_t nodemask) static inline int cpuset_init(void) { return 0; } static inline void cpuset_init_smp(void) {} -static inline void cpuset_force_rebuild(void) { } - static inline void cpuset_update_active_cpus(void) { partition_sched_domains(1, NULL, NULL); } -static inline void cpuset_wait_for_hotplug(void) { } - static inline void cpuset_cpus_allowed(struct task_struct *p, struct cpumask *mask) { diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 4657e29..a8213c2 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -826,16 +826,14 @@ static int generate_sched_domains(cpumask_var_t **domains, * 'cpus' is removed, then call this routine to rebuild the * scheduler's dynamic sched domains. * - * Call with cpuset_mutex held. Takes get_online_cpus(). 
*/ -static void rebuild_sched_domains_locked(void) +static void rebuild_sched_domains_cpuslocked(void) { struct sched_domain_attr *attr; cpumask_var_t *doms; int ndoms; lockdep_assert_held(&cpuset_mutex); - get_online_cpus(); /* * We have raced with CPU hotplug. Don't do anything to avoid @@ -843,27 +841,27 @@ static void rebuild_sched_domains_locked(void) * Anyways, hotplug work item will rebuild sched domains. */ if (!cpumask_equal(top_cpuset.effective_cpus, cpu_active_mask)) - goto out; + return; /* Generate domain masks and attrs */ ndoms = generate_sched_domains(&doms, &attr); /* Have scheduler rebuild the domains */ partition_sched_domains(ndoms, doms, attr); -out: - put_online_cpus(); } #else /* !CONFIG_SMP */ -static void rebuild_sched_domains_locked(void) +static void rebuild_sched_domains_cpuslocked(void) { } #endif /* CONFIG_SMP */ void rebuild_sched_domains(void) { + cpus_read_lock(); mutex_lock(&cpuset_mutex); - rebuild_sched_domains_locked(); + rebuild_sched_domains_cpuslocked(); mutex_unlock(&cpuset_mutex); + cpus_read_unlock(); } /** @@ -949,7 +947,7 @@ static void update_cpumasks_hier(struct cpuset *cs, struct cpumask *new_cpus) rcu_read_unlock(); if (need_rebuild_sched_domains) - rebuild_sched_domains_locked(); + rebuild_sched_domains_cpuslocked(); } /** @@ -1281,7 +1279,7 @@ static int update_relax_domain_level(struct cpuset *cs, s64 val) cs->relax_domain_level = val; if (!cpumask_empty(cs->cpus_allowed) && is_sched_load_balance(cs)) - rebuild_sched_domains_locked(); + rebuild_sched_domains_cpuslocked(); } return 0; @@ -1314,7 +1312,6 @@ static void update_tasks_flags(struct cpuset *cs) * * Call with cpuset_mutex held. */ - static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, int turning_on) { @@ -1347,7 +1344,7 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, spin_unlock_irq(&callback_lock); if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed) - rebuild_sched_domains_locked(); + rebuild_sched_domains_cpuslocked(); if (spread_flag_changed) update_tasks_flags(cs); @@ -1615,6 +1612,7 @@ static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft, cpuset_filetype_t type = cft->private; int retval = 0; + cpus_read_lock(); mutex_lock(&cpuset_mutex); if (!is_cpuset_online(cs)) { retval = -ENODEV; @@ -1652,6 +1650,7 @@ static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft, } out_unlock: mutex_unlock(&cpuset_mutex); + cpus_read_unlock(); return retval; } @@ -1662,6 +1661,7 @@ static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft, cpuset_filetype_t type = cft->private; int retval = -ENODEV; + cpus_read_lock(); mutex_lock(&cpuset_mutex); if (!is_cpuset_online(cs)) goto out_unlock; @@ -1676,6 +1676,7 @@ static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft, } out_unlock: mutex_unlock(&cpuset_mutex); + cpus_read_unlock(); return retval; } @@ -1714,6 +1715,7 @@ static ssize_t cpuset_write_resmask(struct kernfs_open_file *of, kernfs_break_active_protection(of->kn); flush_work(&cpuset_hotplug_work); + cpus_read_lock(); mutex_lock(&cpuset_mutex); if (!is_cpuset_online(cs)) goto out_unlock; @@ -1739,6 +1741,7 @@ static ssize_t cpuset_write_resmask(struct kernfs_open_file *of, free_trial_cpuset(trialcs); out_unlock: mutex_unlock(&cpuset_mutex); + cpus_read_unlock(); kernfs_unbreak_active_protection(of->kn); css_put(&cs->css); flush_workqueue(cpuset_migrate_mm_wq); @@ -2039,13 +2042,14 @@ static int cpuset_css_online(struct cgroup_subsys_state *css) 
/* * If the cpuset being removed has its flag 'sched_load_balance' * enabled, then simulate turning sched_load_balance off, which - * will call rebuild_sched_domains_locked(). + * will call rebuild_sched_domains_cpuslocked(). */ static void cpuset_css_offline(struct cgroup_subsys_state *css) { struct cpuset *cs = css_cs(css); + cpus_read_lock(); mutex_lock(&cpuset_mutex); if (is_sched_load_balance(cs)) @@ -2055,6 +2059,7 @@ static void cpuset_css_offline(struct cgroup_subsys_state *css) clear_bit(CS_ONLINE, &cs->flags); mutex_unlock(&cpuset_mutex); + cpus_read_unlock(); } static void cpuset_css_free(struct cgroup_subsys_state *css) @@ -2275,15 +2280,8 @@ static void cpuset_hotplug_update_tasks(struct cpuset *cs) mutex_unlock(&cpuset_mutex); } -static bool force_rebuild; - -void cpuset_force_rebuild(void) -{ - force_rebuild = true; -} - /** - * cpuset_hotplug_workfn - handle CPU/memory hotunplug for a cpuset + * cpuset_hotplug - handle CPU/memory hotunplug for a cpuset * * This function is called after either CPU or memory configuration has * changed and updates cpuset accordingly. The top_cpuset is always @@ -2298,7 +2296,7 @@ void cpuset_force_rebuild(void) * Note that CPU offlining during suspend is ignored. We don't modify * cpusets across suspend/resume cycles at all. */ -static void cpuset_hotplug_workfn(struct work_struct *work) +static void cpuset_hotplug(bool use_cpu_hp_lock) { static cpumask_t new_cpus; static nodemask_t new_mems; @@ -2356,25 +2354,29 @@ static void cpuset_hotplug_workfn(struct work_struct *work) } /* rebuild sched domains if cpus_allowed has changed */ - if (cpus_updated || force_rebuild) { - force_rebuild = false; - rebuild_sched_domains(); + if (cpus_updated) { + if (use_cpu_hp_lock) + rebuild_sched_domains(); + else { + /* When called during cpu hotplug cpu_hotplug_lock + * is held by the calling thread, not + * not cpuhp_thread_fun + */ + mutex_lock(&cpuset_mutex); + rebuild_sched_domains_cpuslocked(); + mutex_unlock(&cpuset_mutex); + } } } -void cpuset_update_active_cpus(void) +static void cpuset_hotplug_workfn(struct work_struct *work) { - /* - * We're inside cpu hotplug critical region which usually nests - * inside cgroup synchronization. Bounce actual hotplug processing - * to a work item to avoid reverse locking order. - */ - schedule_work(&cpuset_hotplug_work); + cpuset_hotplug(true); } -void cpuset_wait_for_hotplug(void) +void cpuset_update_active_cpus(void) { - flush_work(&cpuset_hotplug_work); + cpuset_hotplug(false); } /* diff --git a/kernel/power/process.c b/kernel/power/process.c index 50f25cb..28772b405 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -203,8 +203,6 @@ void thaw_processes(void) __usermodehelper_set_disable_depth(UMH_FREEZING); thaw_workqueues(); - cpuset_wait_for_hotplug(); - read_lock(&tasklist_lock); for_each_process_thread(g, p) { /* No other threads should have PF_SUSPEND_TASK set */ diff --git a/kernel/sched/core.c b/kernel/sched/core.c index d17c5da..25b8717 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5590,7 +5590,6 @@ static void cpuset_cpu_active(void) * restore the original sched domains by considering the * cpuset configurations. */ - cpuset_force_rebuild(); } cpuset_update_active_cpus(); } -- Qualcomm India Private Limited, on behalf of Qualcomm Innovation Center, Inc., is a member of Code Aurora Forum, a Linux Foundation Collaborative Project. ^ permalink raw reply related [flat|nested] 21+ messages in thread
* Re: [PATCH] cgroup/cpuset: remove circular dependency deadlock 2017-10-26 11:52 ` Prateek Sood @ 2017-10-26 14:05 ` Waiman Long 2017-10-27 8:03 ` Prateek Sood 0 siblings, 1 reply; 21+ messages in thread From: Waiman Long @ 2017-10-26 14:05 UTC (permalink / raw) To: Prateek Sood, peterz, tj, lizefan, mingo, boqun.feng, tglx Cc: cgroups, sramana, linux-kernel On 10/26/2017 07:52 AM, Prateek Sood wrote: > Remove circular dependency deadlock in a scenario where hotplug of CPU is > being done while there is updation in cgroup and cpuset triggered from > userspace. > > Process A => kthreadd => Process B => Process C => Process A > > Process A > cpu_subsys_offline(); > cpu_down(); > _cpu_down(); > percpu_down_write(&cpu_hotplug_lock); //held > cpuhp_invoke_callback(); > workqueue_offline_cpu(); > wq_update_unbound_numa(); > kthread_create_on_node(); > wake_up_process(); //wakeup kthreadd > flush_work(); > wait_for_completion(); > > kthreadd > kthreadd(); > kernel_thread(); > do_fork(); > copy_process(); > percpu_down_read(&cgroup_threadgroup_rwsem); > __rwsem_down_read_failed_common(); //waiting > > Process B > kernfs_fop_write(); > cgroup_file_write(); > cgroup_procs_write(); > percpu_down_write(&cgroup_threadgroup_rwsem); //held > cgroup_attach_task(); > cgroup_migrate(); > cgroup_migrate_execute(); > cpuset_can_attach(); > mutex_lock(&cpuset_mutex); //waiting > > Process C > kernfs_fop_write(); > cgroup_file_write(); > cpuset_write_resmask(); > mutex_lock(&cpuset_mutex); //held > update_cpumask(); > update_cpumasks_hier(); > rebuild_sched_domains_locked(); > get_online_cpus(); > percpu_down_read(&cpu_hotplug_lock); //waiting > > Eliminating deadlock by reversing the locking order for cpuset_mutex and > cpu_hotplug_lock. General comments: Please add a version number of your patch. I have seen multiple versions of this patch and have lost track how many are there as there is no version number information. In addition, there are changes beyond just swapping the lock order and they are not documented in this change log. I would like to see you discuss about those additional changes here as well. > void rebuild_sched_domains(void) > { > + cpus_read_lock(); > mutex_lock(&cpuset_mutex); > - rebuild_sched_domains_locked(); > + rebuild_sched_domains_cpuslocked(); > mutex_unlock(&cpuset_mutex); > + cpus_read_unlock(); > } I saw a lot of instances where cpus_read_lock() and mutex_lock() come together. Maybe some new lock/unlock helper functions may help. > @@ -2356,25 +2354,29 @@ static void cpuset_hotplug_workfn(struct work_struct *work) > } > > /* rebuild sched domains if cpus_allowed has changed */ > - if (cpus_updated || force_rebuild) { > - force_rebuild = false; > - rebuild_sched_domains(); > + if (cpus_updated) { > + if (use_cpu_hp_lock) > + rebuild_sched_domains(); > + else { > + /* When called during cpu hotplug cpu_hotplug_lock > + * is held by the calling thread, not > + * not cpuhp_thread_fun > + */ ??? The comment is not clear. Cheers, Longman ^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: [PATCH] cgroup/cpuset: remove circular dependency deadlock
  2017-10-26 14:05 ` Waiman Long
@ 2017-10-27  8:03   ` Prateek Sood
  2017-10-30  7:16     ` [PATCH v2] " Prateek Sood
  0 siblings, 1 reply; 21+ messages in thread
From: Prateek Sood @ 2017-10-27 8:03 UTC (permalink / raw)
  To: Waiman Long, peterz, tj, lizefan, mingo, boqun.feng, tglx
  Cc: cgroups, sramana, linux-kernel

On 10/26/2017 07:35 PM, Waiman Long wrote:
> On 10/26/2017 07:52 AM, Prateek Sood wrote:
>> Remove circular dependency deadlock in a scenario where hotplug of CPU is
>> being done while there is updation in cgroup and cpuset triggered from
>> userspace.
>>
>> Process A => kthreadd => Process B => Process C => Process A
>>
>> Process A
>> cpu_subsys_offline();
>>   cpu_down();
>>     _cpu_down();
>>       percpu_down_write(&cpu_hotplug_lock); //held
>>       cpuhp_invoke_callback();
>>         workqueue_offline_cpu();
>>           wq_update_unbound_numa();
>>             kthread_create_on_node();
>>               wake_up_process();  //wakeup kthreadd
>>       flush_work();
>>       wait_for_completion();
>>
>> kthreadd
>> kthreadd();
>>   kernel_thread();
>>     do_fork();
>>       copy_process();
>>         percpu_down_read(&cgroup_threadgroup_rwsem);
>>           __rwsem_down_read_failed_common(); //waiting
>>
>> Process B
>> kernfs_fop_write();
>>   cgroup_file_write();
>>     cgroup_procs_write();
>>       percpu_down_write(&cgroup_threadgroup_rwsem); //held
>>       cgroup_attach_task();
>>         cgroup_migrate();
>>           cgroup_migrate_execute();
>>             cpuset_can_attach();
>>               mutex_lock(&cpuset_mutex); //waiting
>>
>> Process C
>> kernfs_fop_write();
>>   cgroup_file_write();
>>     cpuset_write_resmask();
>>       mutex_lock(&cpuset_mutex); //held
>>       update_cpumask();
>>         update_cpumasks_hier();
>>           rebuild_sched_domains_locked();
>>             get_online_cpus();
>>               percpu_down_read(&cpu_hotplug_lock); //waiting
>>
>> Eliminating deadlock by reversing the locking order for cpuset_mutex and
>> cpu_hotplug_lock.
>
> General comments:
>
> Please add a version number of your patch. I have seen multiple versions
> of this patch and have lost track how many are there as there is no
> version number information. In addition, there are changes beyond just
> swapping the lock order and they are not documented in this change log.
> I would like to see you discuss about those additional changes here as well.

Thanks for the comments, Longman.

I will introduce patch versioning and update the commit text to document
the extra changes.

Explanation of the extra changes in this patch:
After inverting the locking sequence of cpu_hotplug_lock and cpuset_mutex,
the cpuset_hotplug_workfn() functionality can be done synchronously from
the context doing the cpu hotplug. The extra changes in this patch remove
the queuing of cpuset_hotplug_workfn() as a work item for the cpu hotplug
path. For memory hotplug it still gets queued as a work item.

This suggestion came from Peter. Peter, could you please elaborate if I
have missed anything?

>
>>  void rebuild_sched_domains(void)
>>  {
>> +	cpus_read_lock();
>>  	mutex_lock(&cpuset_mutex);
>> -	rebuild_sched_domains_locked();
>> +	rebuild_sched_domains_cpuslocked();
>>  	mutex_unlock(&cpuset_mutex);
>> +	cpus_read_unlock();
>>  }
>
> I saw a lot of instances where cpus_read_lock() and mutex_lock() come
> together. Maybe some new lock/unlock helper functions may help.
Ok, I will introduce a single wrapper for locking and unlocking of
both locks.

>
>> @@ -2356,25 +2354,29 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
>>  	}
>>
>>  	/* rebuild sched domains if cpus_allowed has changed */
>> -	if (cpus_updated || force_rebuild) {
>> -		force_rebuild = false;
>> -		rebuild_sched_domains();
>> +	if (cpus_updated) {
>> +		if (use_cpu_hp_lock)
>> +			rebuild_sched_domains();
>> +		else {
>> +			/* When called during cpu hotplug cpu_hotplug_lock
>> +			 * is held by the calling thread, not
>> +			 * not cpuhp_thread_fun
>> +			 */
>
> ??? The comment is not clear.

The following is the scenario described by the comment:

Process A
_cpu_down()
  cpus_write_lock()  //cpu_hotplug_lock held
  cpuhp_kick_ap_work()
    cpuhp_kick_ap()
      wake_up_process()  // wake up cpuhp_thread_fun
      wait_for_ap_thread()  //wait for hotplug thread to signal completion

cpuhp_thread_fun()
  cpuhp_invoke_callback()
    sched_cpu_deactivate()
      cpuset_cpu_inactive()
        cpuset_update_active_cpus()
          cpuset_hotplug(false)  // do not use cpu_hotplug_lock from _cpu_down() path

I will update the comment in the next version of the patch to elaborate more.

>
> Cheers,
> Longman
>

--
Qualcomm India Private Limited, on behalf of Qualcomm Innovation Center,
Inc., is a member of Code Aurora Forum, a Linux Foundation Collaborative Project

^ permalink raw reply	[flat|nested] 21+ messages in thread
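For reference, the combined helper Prateek agrees to here is what v2 below
implements as cpuset_sched_change_begin()/cpuset_sched_change_end(); the
essential properties are the fixed lock order (hotplug read lock strictly
before cpuset_mutex) and release in the reverse order:

static void cpuset_sched_change_begin(void)
{
	cpus_read_lock();		/* always taken before cpuset_mutex */
	mutex_lock(&cpuset_mutex);
}

static void cpuset_sched_change_end(void)
{
	mutex_unlock(&cpuset_mutex);	/* released in reverse order */
	cpus_read_unlock();
}

Callers such as cpuset_write_u64() then become a simple begin()/end() pair
around the existing body.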
* [PATCH v2] cgroup/cpuset: remove circular dependency deadlock 2017-10-27 8:03 ` Prateek Sood @ 2017-10-30 7:16 ` Prateek Sood 2017-11-06 4:01 ` Prateek Sood 2017-11-15 10:26 ` Prateek Sood 0 siblings, 2 replies; 21+ messages in thread From: Prateek Sood @ 2017-10-30 7:16 UTC (permalink / raw) To: longman, peterz, tj, lizefan, mingo, boqun.feng, tglx Cc: Prateek Sood, cgroups, sramana, linux-kernel Remove circular dependency deadlock in a scenario where hotplug of CPU is being done while there is updation in cgroup and cpuset triggered from userspace. Process A => kthreadd => Process B => Process C => Process A Process A cpu_subsys_offline(); cpu_down(); _cpu_down(); percpu_down_write(&cpu_hotplug_lock); //held cpuhp_invoke_callback(); workqueue_offline_cpu(); queue_work_on(); // unbind_work on system_highpri_wq __queue_work(); insert_work(); wake_up_worker(); flush_work(); wait_for_completion(); worker_thread(); manage_workers(); create_worker(); kthread_create_on_node(); wake_up_process(kthreadd_task); kthreadd kthreadd(); kernel_thread(); do_fork(); copy_process(); percpu_down_read(&cgroup_threadgroup_rwsem); __rwsem_down_read_failed_common(); //waiting Process B kernfs_fop_write(); cgroup_file_write(); cgroup_procs_write(); percpu_down_write(&cgroup_threadgroup_rwsem); //held cgroup_attach_task(); cgroup_migrate(); cgroup_migrate_execute(); cpuset_can_attach(); mutex_lock(&cpuset_mutex); //waiting Process C kernfs_fop_write(); cgroup_file_write(); cpuset_write_resmask(); mutex_lock(&cpuset_mutex); //held update_cpumask(); update_cpumasks_hier(); rebuild_sched_domains_locked(); get_online_cpus(); percpu_down_read(&cpu_hotplug_lock); //waiting Eliminating deadlock by reversing the locking order for cpuset_mutex and cpu_hotplug_lock. After inverting the locking sequence of cpu_hotplug_lock and cpuset_mutex, cpuset_hotplug_workfn() related functionality can be done synchronously from the context doing cpu hotplug. For memory hotplug it still gets queued as a work item. 
Signed-off-by: Prateek Sood <prsood@codeaurora.org> --- include/linux/cpuset.h | 6 ---- kernel/cgroup/cpuset.c | 94 +++++++++++++++++++++++++++----------------------- kernel/power/process.c | 2 -- kernel/sched/core.c | 1 - 4 files changed, 50 insertions(+), 53 deletions(-) diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h index a1e6a33..e74655d 100644 --- a/include/linux/cpuset.h +++ b/include/linux/cpuset.h @@ -51,9 +51,7 @@ static inline void cpuset_dec(void) extern int cpuset_init(void); extern void cpuset_init_smp(void); -extern void cpuset_force_rebuild(void); extern void cpuset_update_active_cpus(void); -extern void cpuset_wait_for_hotplug(void); extern void cpuset_cpus_allowed(struct task_struct *p, struct cpumask *mask); extern void cpuset_cpus_allowed_fallback(struct task_struct *p); extern nodemask_t cpuset_mems_allowed(struct task_struct *p); @@ -166,15 +164,11 @@ static inline void set_mems_allowed(nodemask_t nodemask) static inline int cpuset_init(void) { return 0; } static inline void cpuset_init_smp(void) {} -static inline void cpuset_force_rebuild(void) { } - static inline void cpuset_update_active_cpus(void) { partition_sched_domains(1, NULL, NULL); } -static inline void cpuset_wait_for_hotplug(void) { } - static inline void cpuset_cpus_allowed(struct task_struct *p, struct cpumask *mask) { diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 4657e29..ec44aaa 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -817,6 +817,18 @@ static int generate_sched_domains(cpumask_var_t **domains, return ndoms; } +static void cpuset_sched_change_begin(void) +{ + cpus_read_lock(); + mutex_lock(&cpuset_mutex); +} + +static void cpuset_sched_change_end(void) +{ + mutex_unlock(&cpuset_mutex); + cpus_read_unlock(); +} + /* * Rebuild scheduler domains. * @@ -826,16 +838,14 @@ static int generate_sched_domains(cpumask_var_t **domains, * 'cpus' is removed, then call this routine to rebuild the * scheduler's dynamic sched domains. * - * Call with cpuset_mutex held. Takes get_online_cpus(). */ -static void rebuild_sched_domains_locked(void) +static void rebuild_sched_domains_cpuslocked(void) { struct sched_domain_attr *attr; cpumask_var_t *doms; int ndoms; lockdep_assert_held(&cpuset_mutex); - get_online_cpus(); /* * We have raced with CPU hotplug. Don't do anything to avoid @@ -843,27 +853,25 @@ static void rebuild_sched_domains_locked(void) * Anyways, hotplug work item will rebuild sched domains. 
*/ if (!cpumask_equal(top_cpuset.effective_cpus, cpu_active_mask)) - goto out; + return; /* Generate domain masks and attrs */ ndoms = generate_sched_domains(&doms, &attr); /* Have scheduler rebuild the domains */ partition_sched_domains(ndoms, doms, attr); -out: - put_online_cpus(); } #else /* !CONFIG_SMP */ -static void rebuild_sched_domains_locked(void) +static void rebuild_sched_domains_cpuslocked(void) { } #endif /* CONFIG_SMP */ void rebuild_sched_domains(void) { - mutex_lock(&cpuset_mutex); - rebuild_sched_domains_locked(); - mutex_unlock(&cpuset_mutex); + cpuset_sched_change_begin(); + rebuild_sched_domains_cpuslocked(); + cpuset_sched_change_end(); } /** @@ -949,7 +957,7 @@ static void update_cpumasks_hier(struct cpuset *cs, struct cpumask *new_cpus) rcu_read_unlock(); if (need_rebuild_sched_domains) - rebuild_sched_domains_locked(); + rebuild_sched_domains_cpuslocked(); } /** @@ -1281,7 +1289,7 @@ static int update_relax_domain_level(struct cpuset *cs, s64 val) cs->relax_domain_level = val; if (!cpumask_empty(cs->cpus_allowed) && is_sched_load_balance(cs)) - rebuild_sched_domains_locked(); + rebuild_sched_domains_cpuslocked(); } return 0; @@ -1314,7 +1322,6 @@ static void update_tasks_flags(struct cpuset *cs) * * Call with cpuset_mutex held. */ - static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, int turning_on) { @@ -1347,7 +1354,7 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, spin_unlock_irq(&callback_lock); if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed) - rebuild_sched_domains_locked(); + rebuild_sched_domains_cpuslocked(); if (spread_flag_changed) update_tasks_flags(cs); @@ -1615,7 +1622,7 @@ static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft, cpuset_filetype_t type = cft->private; int retval = 0; - mutex_lock(&cpuset_mutex); + cpuset_sched_change_begin(); if (!is_cpuset_online(cs)) { retval = -ENODEV; goto out_unlock; @@ -1651,7 +1658,7 @@ static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft, break; } out_unlock: - mutex_unlock(&cpuset_mutex); + cpuset_sched_change_end(); return retval; } @@ -1662,7 +1669,7 @@ static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft, cpuset_filetype_t type = cft->private; int retval = -ENODEV; - mutex_lock(&cpuset_mutex); + cpuset_sched_change_begin(); if (!is_cpuset_online(cs)) goto out_unlock; @@ -1675,7 +1682,7 @@ static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft, break; } out_unlock: - mutex_unlock(&cpuset_mutex); + cpuset_sched_change_end(); return retval; } @@ -1714,7 +1721,7 @@ static ssize_t cpuset_write_resmask(struct kernfs_open_file *of, kernfs_break_active_protection(of->kn); flush_work(&cpuset_hotplug_work); - mutex_lock(&cpuset_mutex); + cpuset_sched_change_begin(); if (!is_cpuset_online(cs)) goto out_unlock; @@ -1738,7 +1745,7 @@ static ssize_t cpuset_write_resmask(struct kernfs_open_file *of, free_trial_cpuset(trialcs); out_unlock: - mutex_unlock(&cpuset_mutex); + cpuset_sched_change_end(); kernfs_unbreak_active_protection(of->kn); css_put(&cs->css); flush_workqueue(cpuset_migrate_mm_wq); @@ -2039,14 +2046,14 @@ static int cpuset_css_online(struct cgroup_subsys_state *css) /* * If the cpuset being removed has its flag 'sched_load_balance' * enabled, then simulate turning sched_load_balance off, which - * will call rebuild_sched_domains_locked(). + * will call rebuild_sched_domains_cpuslocked(). 
*/ static void cpuset_css_offline(struct cgroup_subsys_state *css) { struct cpuset *cs = css_cs(css); - mutex_lock(&cpuset_mutex); + cpuset_sched_change_begin(); if (is_sched_load_balance(cs)) update_flag(CS_SCHED_LOAD_BALANCE, cs, 0); @@ -2054,7 +2061,7 @@ static void cpuset_css_offline(struct cgroup_subsys_state *css) cpuset_dec(); clear_bit(CS_ONLINE, &cs->flags); - mutex_unlock(&cpuset_mutex); + cpuset_sched_change_end(); } static void cpuset_css_free(struct cgroup_subsys_state *css) @@ -2275,15 +2282,8 @@ static void cpuset_hotplug_update_tasks(struct cpuset *cs) mutex_unlock(&cpuset_mutex); } -static bool force_rebuild; - -void cpuset_force_rebuild(void) -{ - force_rebuild = true; -} - /** - * cpuset_hotplug_workfn - handle CPU/memory hotunplug for a cpuset + * cpuset_hotplug - handle CPU/memory hotunplug for a cpuset * * This function is called after either CPU or memory configuration has * changed and updates cpuset accordingly. The top_cpuset is always @@ -2298,7 +2298,7 @@ void cpuset_force_rebuild(void) * Note that CPU offlining during suspend is ignored. We don't modify * cpusets across suspend/resume cycles at all. */ -static void cpuset_hotplug_workfn(struct work_struct *work) +static void cpuset_hotplug(bool use_cpu_hp_lock) { static cpumask_t new_cpus; static nodemask_t new_mems; @@ -2356,25 +2356,31 @@ static void cpuset_hotplug_workfn(struct work_struct *work) } /* rebuild sched domains if cpus_allowed has changed */ - if (cpus_updated || force_rebuild) { - force_rebuild = false; - rebuild_sched_domains(); + if (cpus_updated) { + if (use_cpu_hp_lock) + rebuild_sched_domains(); + else { + /* Acquiring cpu_hotplug_lock is not required. + * When cpuset_hotplug() is called in hotplug path, + * cpu_hotplug_lock is held by the hotplug context + * which is waiting for cpuhp_thread_fun to indicate + * completion of callback. + */ + mutex_lock(&cpuset_mutex); + rebuild_sched_domains_cpuslocked(); + mutex_unlock(&cpuset_mutex); + } } } -void cpuset_update_active_cpus(void) +static void cpuset_hotplug_workfn(struct work_struct *work) { - /* - * We're inside cpu hotplug critical region which usually nests - * inside cgroup synchronization. Bounce actual hotplug processing - * to a work item to avoid reverse locking order. - */ - schedule_work(&cpuset_hotplug_work); + cpuset_hotplug(true); } -void cpuset_wait_for_hotplug(void) +void cpuset_update_active_cpus(void) { - flush_work(&cpuset_hotplug_work); + cpuset_hotplug(false); } /* diff --git a/kernel/power/process.c b/kernel/power/process.c index 50f25cb..28772b405 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -203,8 +203,6 @@ void thaw_processes(void) __usermodehelper_set_disable_depth(UMH_FREEZING); thaw_workqueues(); - cpuset_wait_for_hotplug(); - read_lock(&tasklist_lock); for_each_process_thread(g, p) { /* No other threads should have PF_SUSPEND_TASK set */ diff --git a/kernel/sched/core.c b/kernel/sched/core.c index d17c5da..25b8717 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5590,7 +5590,6 @@ static void cpuset_cpu_active(void) * restore the original sched domains by considering the * cpuset configurations. */ - cpuset_force_rebuild(); } cpuset_update_active_cpus(); } -- Qualcomm India Private Limited, on behalf of Qualcomm Innovation Center, Inc., is a member of Code Aurora Forum, a Linux Foundation Collaborative Project. ^ permalink raw reply related [flat|nested] 21+ messages in thread
* Re: [PATCH v2] cgroup/cpuset: remove circular dependency deadlock 2017-10-30 7:16 ` [PATCH v2] " Prateek Sood @ 2017-11-06 4:01 ` Prateek Sood 2017-11-15 10:26 ` Prateek Sood 1 sibling, 0 replies; 21+ messages in thread From: Prateek Sood @ 2017-11-06 4:01 UTC (permalink / raw) To: longman, peterz, tj, lizefan, mingo, boqun.feng, tglx Cc: cgroups, sramana, linux-kernel On 10/30/2017 12:46 PM, Prateek Sood wrote: > Remove circular dependency deadlock in a scenario where hotplug of CPU is > being done while there is updation in cgroup and cpuset triggered from > userspace. > > Process A => kthreadd => Process B => Process C => Process A > > Process A > cpu_subsys_offline(); > cpu_down(); > _cpu_down(); > percpu_down_write(&cpu_hotplug_lock); //held > cpuhp_invoke_callback(); > workqueue_offline_cpu(); > queue_work_on(); // unbind_work on system_highpri_wq > __queue_work(); > insert_work(); > wake_up_worker(); > flush_work(); > wait_for_completion(); > > worker_thread(); > manage_workers(); > create_worker(); > kthread_create_on_node(); > wake_up_process(kthreadd_task); > > kthreadd > kthreadd(); > kernel_thread(); > do_fork(); > copy_process(); > percpu_down_read(&cgroup_threadgroup_rwsem); > __rwsem_down_read_failed_common(); //waiting > > Process B > kernfs_fop_write(); > cgroup_file_write(); > cgroup_procs_write(); > percpu_down_write(&cgroup_threadgroup_rwsem); //held > cgroup_attach_task(); > cgroup_migrate(); > cgroup_migrate_execute(); > cpuset_can_attach(); > mutex_lock(&cpuset_mutex); //waiting > > Process C > kernfs_fop_write(); > cgroup_file_write(); > cpuset_write_resmask(); > mutex_lock(&cpuset_mutex); //held > update_cpumask(); > update_cpumasks_hier(); > rebuild_sched_domains_locked(); > get_online_cpus(); > percpu_down_read(&cpu_hotplug_lock); //waiting > > Eliminating deadlock by reversing the locking order for cpuset_mutex and > cpu_hotplug_lock. After inverting the locking sequence of cpu_hotplug_lock > and cpuset_mutex, cpuset_hotplug_workfn() related functionality can be > done synchronously from the context doing cpu hotplug. For memory hotplug > it still gets queued as a work item. 
> > Signed-off-by: Prateek Sood <prsood@codeaurora.org> > --- > include/linux/cpuset.h | 6 ---- > kernel/cgroup/cpuset.c | 94 +++++++++++++++++++++++++++----------------------- > kernel/power/process.c | 2 -- > kernel/sched/core.c | 1 - > 4 files changed, 50 insertions(+), 53 deletions(-) > > [...] Hi Folks, Are there any more feedback/suggestions to improve this patch? Thanks PS -- Qualcomm India Private Limited, on behalf of Qualcomm Innovation Center, Inc., is a member of Code Aurora Forum, a Linux Foundation Collaborative Project ^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: [PATCH v2] cgroup/cpuset: remove circular dependency deadlock 2017-10-30 7:16 ` [PATCH v2] " Prateek Sood 2017-11-06 4:01 ` Prateek Sood @ 2017-11-15 10:26 ` Prateek Sood 2017-11-15 10:37 ` Peter Zijlstra 1 sibling, 1 reply; 21+ messages in thread From: Prateek Sood @ 2017-11-15 10:26 UTC (permalink / raw) To: longman, peterz, tj, lizefan, mingo, boqun.feng, tglx Cc: cgroups, sramana, linux-kernel On 10/30/2017 12:46 PM, Prateek Sood wrote: > [...] Any improvement/suggestion for this patch? -- Qualcomm India Private Limited, on behalf of Qualcomm Innovation Center, Inc., is a member of Code Aurora Forum, a Linux Foundation Collaborative Project ^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: [PATCH v2] cgroup/cpuset: remove circular dependency deadlock 2017-11-15 10:26 ` Prateek Sood @ 2017-11-15 10:37 ` Peter Zijlstra 2017-11-15 14:20 ` [PATCH v3 0/2] Invert cpu_hotplug_lock and cpuset_mutex locking order Prateek Sood 2017-11-15 17:05 ` [PATCH v2] cgroup/cpuset: remove circular dependency deadlock Tejun Heo 0 siblings, 2 replies; 21+ messages in thread From: Peter Zijlstra @ 2017-11-15 10:37 UTC (permalink / raw) To: Prateek Sood Cc: longman, tj, lizefan, mingo, boqun.feng, tglx, cgroups, sramana, linux-kernel On Wed, Nov 15, 2017 at 03:56:26PM +0530, Prateek Sood wrote: > Any improvement/suggestion for this patch? I would have done 2 patches, the first one solving the locking issue, the second removing the then redundant async rebuild stuff. Other than that this looks OK I suppose, but I was expecting this to go through the cgroup tree, TJ? ^ permalink raw reply [flat|nested] 21+ messages in thread
* [PATCH v3 0/2] Invert cpu_hotplug_lock and cpuset_mutex locking order. 2017-11-15 10:37 ` Peter Zijlstra @ 2017-11-15 14:20 ` Prateek Sood 2017-11-15 14:20 ` [PATCH v3 1/2] cgroup/cpuset: remove circular dependency deadlock Prateek Sood ` (2 more replies) 2017-11-15 17:05 ` [PATCH v2] cgroup/cpuset: remove circular dependency deadlock Tejun Heo 1 sibling, 2 replies; 21+ messages in thread From: Prateek Sood @ 2017-11-15 14:20 UTC (permalink / raw) To: peterz, longman, tj, lizefan, mingo, boqun.feng, tglx Cc: cgroups, sramana, linux-kernel, clingutla, prsood This patch series does the following: 1- Remove the circular dependency deadlock by inverting the order of cpu_hotplug_lock and cpuset_mutex. 2- Make cpuset_hotplug_workfn() synchronous for the cpu hotplug path. For the memory hotplug path it still gets queued as a work item. Prateek Sood (2): cgroup/cpuset: remove circular dependency deadlock cpuset: Make cpuset hotplug synchronous include/linux/cpuset.h | 6 ---- kernel/cgroup/cpuset.c | 94 +++++++++++++++++++++++++++----------------------- kernel/power/process.c | 2 -- kernel/sched/core.c | 1 - 4 files changed, 50 insertions(+), 53 deletions(-) -- Qualcomm India Private Limited, on behalf of Qualcomm Innovation Center, Inc., is a member of Code Aurora Forum, a Linux Foundation Collaborative Project. ^ permalink raw reply [flat|nested] 21+ messages in thread
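The ordering change in patch 1 can be summarised in a standalone sketch. Again this is not the kernel code: a pthread rwlock models the read side of cpu_hotplug_lock, a pthread mutex models cpuset_mutex, and the two functions exist only to contrast the old and new nesting:

/* order-sketch.c - hypothetical illustration of the lock-order inversion */
#include <pthread.h>
#include <stdio.h>

static pthread_rwlock_t hotplug_lock = PTHREAD_RWLOCK_INITIALIZER;	/* ~cpu_hotplug_lock */
static pthread_mutex_t cpuset_lock = PTHREAD_MUTEX_INITIALIZER;		/* ~cpuset_mutex */

/* old nesting: cpuset_mutex outside, hotplug read lock inside; a writer
 * (cpu_down()) can then end up in a cycle with cpuset_mutex holders */
static void rebuild_old_order(void)
{
	pthread_mutex_lock(&cpuset_lock);
	pthread_rwlock_rdlock(&hotplug_lock);
	puts("old: cpuset_mutex -> cpu_hotplug_lock");
	pthread_rwlock_unlock(&hotplug_lock);
	pthread_mutex_unlock(&cpuset_lock);
}

/* new nesting: hotplug read lock always first, as in cpuset_sched_change_begin() */
static void rebuild_new_order(void)
{
	pthread_rwlock_rdlock(&hotplug_lock);
	pthread_mutex_lock(&cpuset_lock);
	puts("new: cpu_hotplug_lock -> cpuset_mutex");
	pthread_mutex_unlock(&cpuset_lock);
	pthread_rwlock_unlock(&hotplug_lock);
}

int main(void)
{
	rebuild_old_order();
	rebuild_new_order();
	return 0;
}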
* [PATCH v3 1/2] cgroup/cpuset: remove circular dependency deadlock 2017-11-15 14:20 ` [PATCH v3 0/2] Invert cpu_hotplug_lock and cpuset_mutex locking order Prateek Sood @ 2017-11-15 14:20 ` Prateek Sood 2017-11-15 14:20 ` [PATCH v3 2/2] cpuset: Make cpuset hotplug synchronous Prateek Sood 2017-11-27 16:48 ` [PATCH v3 0/2] Invert cpu_hotplug_lock and cpuset_mutex locking order Tejun Heo 2 siblings, 0 replies; 21+ messages in thread From: Prateek Sood @ 2017-11-15 14:20 UTC (permalink / raw) To: peterz, longman, tj, lizefan, mingo, boqun.feng, tglx Cc: cgroups, sramana, linux-kernel, clingutla, prsood Remove a circular dependency deadlock in a scenario where hotplug of a CPU is being done while there is an update to cgroup and cpuset triggered from userspace. Process A => kthreadd => Process B => Process C => Process A Process A cpu_subsys_offline(); cpu_down(); _cpu_down(); percpu_down_write(&cpu_hotplug_lock); //held cpuhp_invoke_callback(); workqueue_offline_cpu(); queue_work_on(); // unbind_work on system_highpri_wq __queue_work(); insert_work(); wake_up_worker(); flush_work(); wait_for_completion(); worker_thread(); manage_workers(); create_worker(); kthread_create_on_node(); wake_up_process(kthreadd_task); kthreadd kthreadd(); kernel_thread(); do_fork(); copy_process(); percpu_down_read(&cgroup_threadgroup_rwsem); __rwsem_down_read_failed_common(); //waiting Process B kernfs_fop_write(); cgroup_file_write(); cgroup_procs_write(); percpu_down_write(&cgroup_threadgroup_rwsem); //held cgroup_attach_task(); cgroup_migrate(); cgroup_migrate_execute(); cpuset_can_attach(); mutex_lock(&cpuset_mutex); //waiting Process C kernfs_fop_write(); cgroup_file_write(); cpuset_write_resmask(); mutex_lock(&cpuset_mutex); //held update_cpumask(); update_cpumasks_hier(); rebuild_sched_domains_locked(); get_online_cpus(); percpu_down_read(&cpu_hotplug_lock); //waiting Eliminate the deadlock by reversing the locking order of cpuset_mutex and cpu_hotplug_lock. Signed-off-by: Prateek Sood <prsood@codeaurora.org> --- kernel/cgroup/cpuset.c | 53 ++++++++++++++++++++++++++++---------------------- 1 file changed, 30 insertions(+), 23 deletions(-) diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index f7efa7b..cab5fd1 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -812,6 +812,18 @@ static int generate_sched_domains(cpumask_var_t **domains, return ndoms; } +static void cpuset_sched_change_begin(void) +{ + cpus_read_lock(); + mutex_lock(&cpuset_mutex); +} + +static void cpuset_sched_change_end(void) +{ + mutex_unlock(&cpuset_mutex); + cpus_read_unlock(); +} + /* * Rebuild scheduler domains. * @@ -821,16 +833,14 @@ static int generate_sched_domains(cpumask_var_t **domains, * 'cpus' is removed, then call this routine to rebuild the * scheduler's dynamic sched domains. * - * Call with cpuset_mutex held. Takes get_online_cpus(). */ -static void rebuild_sched_domains_locked(void) +static void rebuild_sched_domains_cpuslocked(void) { struct sched_domain_attr *attr; cpumask_var_t *doms; int ndoms; lockdep_assert_held(&cpuset_mutex); - get_online_cpus(); /* * We have raced with CPU hotplug. Don't do anything to avoid @@ -838,27 +848,25 @@ static void rebuild_sched_domains_locked(void) * Anyways, hotplug work item will rebuild sched domains.
*/ if (!cpumask_equal(top_cpuset.effective_cpus, cpu_active_mask)) - goto out; + return; /* Generate domain masks and attrs */ ndoms = generate_sched_domains(&doms, &attr); /* Have scheduler rebuild the domains */ partition_sched_domains(ndoms, doms, attr); -out: - put_online_cpus(); } #else /* !CONFIG_SMP */ -static void rebuild_sched_domains_locked(void) +static void rebuild_sched_domains_cpuslocked(void) { } #endif /* CONFIG_SMP */ void rebuild_sched_domains(void) { - mutex_lock(&cpuset_mutex); - rebuild_sched_domains_locked(); - mutex_unlock(&cpuset_mutex); + cpuset_sched_change_begin(); + rebuild_sched_domains_cpuslocked(); + cpuset_sched_change_end(); } /** @@ -944,7 +952,7 @@ static void update_cpumasks_hier(struct cpuset *cs, struct cpumask *new_cpus) rcu_read_unlock(); if (need_rebuild_sched_domains) - rebuild_sched_domains_locked(); + rebuild_sched_domains_cpuslocked(); } /** @@ -1276,7 +1284,7 @@ static int update_relax_domain_level(struct cpuset *cs, s64 val) cs->relax_domain_level = val; if (!cpumask_empty(cs->cpus_allowed) && is_sched_load_balance(cs)) - rebuild_sched_domains_locked(); + rebuild_sched_domains_cpuslocked(); } return 0; @@ -1309,7 +1317,6 @@ static void update_tasks_flags(struct cpuset *cs) * * Call with cpuset_mutex held. */ - static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, int turning_on) { @@ -1342,7 +1349,7 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, spin_unlock_irq(&callback_lock); if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed) - rebuild_sched_domains_locked(); + rebuild_sched_domains_cpuslocked(); if (spread_flag_changed) update_tasks_flags(cs); @@ -1610,7 +1617,7 @@ static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft, cpuset_filetype_t type = cft->private; int retval = 0; - mutex_lock(&cpuset_mutex); + cpuset_sched_change_begin(); if (!is_cpuset_online(cs)) { retval = -ENODEV; goto out_unlock; @@ -1646,7 +1653,7 @@ static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft, break; } out_unlock: - mutex_unlock(&cpuset_mutex); + cpuset_sched_change_end(); return retval; } @@ -1657,7 +1664,7 @@ static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft, cpuset_filetype_t type = cft->private; int retval = -ENODEV; - mutex_lock(&cpuset_mutex); + cpuset_sched_change_begin(); if (!is_cpuset_online(cs)) goto out_unlock; @@ -1670,7 +1677,7 @@ static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft, break; } out_unlock: - mutex_unlock(&cpuset_mutex); + cpuset_sched_change_end(); return retval; } @@ -1709,7 +1716,7 @@ static ssize_t cpuset_write_resmask(struct kernfs_open_file *of, kernfs_break_active_protection(of->kn); flush_work(&cpuset_hotplug_work); - mutex_lock(&cpuset_mutex); + cpuset_sched_change_begin(); if (!is_cpuset_online(cs)) goto out_unlock; @@ -1733,7 +1740,7 @@ static ssize_t cpuset_write_resmask(struct kernfs_open_file *of, free_trial_cpuset(trialcs); out_unlock: - mutex_unlock(&cpuset_mutex); + cpuset_sched_change_end(); kernfs_unbreak_active_protection(of->kn); css_put(&cs->css); flush_workqueue(cpuset_migrate_mm_wq); @@ -2034,14 +2041,14 @@ static int cpuset_css_online(struct cgroup_subsys_state *css) /* * If the cpuset being removed has its flag 'sched_load_balance' * enabled, then simulate turning sched_load_balance off, which - * will call rebuild_sched_domains_locked(). + * will call rebuild_sched_domains_cpuslocked(). 
*/ static void cpuset_css_offline(struct cgroup_subsys_state *css) { struct cpuset *cs = css_cs(css); - mutex_lock(&cpuset_mutex); + cpuset_sched_change_begin(); if (is_sched_load_balance(cs)) update_flag(CS_SCHED_LOAD_BALANCE, cs, 0); @@ -2049,7 +2056,7 @@ static void cpuset_css_offline(struct cgroup_subsys_state *css) cpuset_dec(); clear_bit(CS_ONLINE, &cs->flags); - mutex_unlock(&cpuset_mutex); + cpuset_sched_change_end(); } static void cpuset_css_free(struct cgroup_subsys_state *css) -- Qualcomm India Private Limited, on behalf of Qualcomm Innovation Center, Inc., is a member of Code Aurora Forum, a Linux Foundation Collaborative Project. ^ permalink raw reply related [flat|nested] 21+ messages in thread
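With the inverted order from this patch, the earlier userspace deadlock sketch no longer hangs. In the hypothetical variant below, a pthread rwlock stands in for cpu_hotplug_lock so that A can hold it for writing while C takes it for reading; C now acquires the hotplug lock before cpuset_lock, mirroring cpuset_sched_change_begin(), which gives the three locks a single global order and breaks the cycle:

/* fixed-demo.c - the earlier analogue with the new lock order; terminates */
#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

static pthread_rwlock_t hotplug_lock = PTHREAD_RWLOCK_INITIALIZER;
static pthread_mutex_t threadgroup_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t cpuset_lock = PTHREAD_MUTEX_INITIALIZER;

static void *process_a(void *arg)	/* cpu_down() */
{
	pthread_rwlock_wrlock(&hotplug_lock);
	sleep(1);
	pthread_mutex_lock(&threadgroup_lock);	/* B releases it, so A proceeds */
	pthread_mutex_unlock(&threadgroup_lock);
	pthread_rwlock_unlock(&hotplug_lock);
	return arg;
}

static void *process_b(void *arg)	/* cgroup_procs_write() */
{
	pthread_mutex_lock(&threadgroup_lock);
	sleep(1);
	pthread_mutex_lock(&cpuset_lock);	/* C never holds this while blocked */
	pthread_mutex_unlock(&cpuset_lock);
	pthread_mutex_unlock(&threadgroup_lock);
	return arg;
}

static void *process_c(void *arg)	/* cpuset_write_resmask(), new order */
{
	pthread_rwlock_rdlock(&hotplug_lock);	/* cpuset_sched_change_begin() */
	pthread_mutex_lock(&cpuset_lock);
	pthread_mutex_unlock(&cpuset_lock);
	pthread_rwlock_unlock(&hotplug_lock);	/* cpuset_sched_change_end() */
	return arg;
}

int main(void)
{
	pthread_t a, b, c;

	pthread_create(&a, NULL, process_a, NULL);
	pthread_create(&b, NULL, process_b, NULL);
	pthread_create(&c, NULL, process_c, NULL);
	pthread_join(a, NULL);
	pthread_join(b, NULL);
	pthread_join(c, NULL);
	puts("no deadlock: all three threads finished");
	return 0;
}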
* [PATCH v3 2/2] cpuset: Make cpuset hotplug synchronous 2017-11-15 14:20 ` [PATCH v3 0/2] Invert cpu_hotplug_lock and cpuset_mutex locking order Prateek Sood 2017-11-15 14:20 ` [PATCH v3 1/2] cgroup/cpuset: remove circular dependency deadlock Prateek Sood @ 2017-11-15 14:20 ` Prateek Sood 2017-11-27 16:48 ` [PATCH v3 0/2] Invert cpu_hotplug_lock and cpuset_mutex locking order Tejun Heo 2 siblings, 0 replies; 21+ messages in thread From: Prateek Sood @ 2017-11-15 14:20 UTC (permalink / raw) To: peterz, longman, tj, lizefan, mingo, boqun.feng, tglx Cc: cgroups, sramana, linux-kernel, clingutla, prsood Convert cpuset_hotplug_workfn() into a synchronous call for the cpu hotplug path. For the memory hotplug path it still gets queued as a work item. Since cpuset_hotplug_workfn() can be made synchronous for the cpu hotplug path, there is no need to wait for cpuset hotplug while thawing processes. Signed-off-by: Prateek Sood <prsood@codeaurora.org> --- include/linux/cpuset.h | 6 ------ kernel/cgroup/cpuset.c | 41 ++++++++++++++++++++--------------------- kernel/power/process.c | 2 -- kernel/sched/core.c | 1 - 4 files changed, 20 insertions(+), 30 deletions(-) diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h index 1b8e415..2ab910f 100644 --- a/include/linux/cpuset.h +++ b/include/linux/cpuset.h @@ -52,9 +52,7 @@ static inline void cpuset_dec(void) extern int cpuset_init(void); extern void cpuset_init_smp(void); -extern void cpuset_force_rebuild(void); extern void cpuset_update_active_cpus(void); -extern void cpuset_wait_for_hotplug(void); extern void cpuset_cpus_allowed(struct task_struct *p, struct cpumask *mask); extern void cpuset_cpus_allowed_fallback(struct task_struct *p); extern nodemask_t cpuset_mems_allowed(struct task_struct *p); @@ -167,15 +165,11 @@ static inline void set_mems_allowed(nodemask_t nodemask) static inline int cpuset_init(void) { return 0; } static inline void cpuset_init_smp(void) {} -static inline void cpuset_force_rebuild(void) { } - static inline void cpuset_update_active_cpus(void) { partition_sched_domains(1, NULL, NULL); } -static inline void cpuset_wait_for_hotplug(void) { } - static inline void cpuset_cpus_allowed(struct task_struct *p, struct cpumask *mask) { diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index cab5fd1..227bc25 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -2277,15 +2277,8 @@ static void cpuset_hotplug_update_tasks(struct cpuset *cs) mutex_unlock(&cpuset_mutex); } -static bool force_rebuild; - -void cpuset_force_rebuild(void) -{ - force_rebuild = true; -} - /** - * cpuset_hotplug_workfn - handle CPU/memory hotunplug for a cpuset + * cpuset_hotplug - handle CPU/memory hotunplug for a cpuset * * This function is called after either CPU or memory configuration has * changed and updates cpuset accordingly. The top_cpuset is always @@ -2300,7 +2293,7 @@ void cpuset_force_rebuild(void) * Note that CPU offlining during suspend is ignored. We don't modify * cpusets across suspend/resume cycles at all.
*/ -static void cpuset_hotplug_workfn(struct work_struct *work) +static void cpuset_hotplug(bool use_cpu_hp_lock) { static cpumask_t new_cpus; static nodemask_t new_mems; @@ -2358,25 +2351,31 @@ static void cpuset_hotplug_workfn(struct work_struct *work) } /* rebuild sched domains if cpus_allowed has changed */ - if (cpus_updated || force_rebuild) { - force_rebuild = false; - rebuild_sched_domains(); + if (cpus_updated) { + if (use_cpu_hp_lock) + rebuild_sched_domains(); + else { + /* Acquiring cpu_hotplug_lock is not required. + * When cpuset_hotplug() is called in hotplug path, + * cpu_hotplug_lock is held by the hotplug context + * which is waiting for cpuhp_thread_fun to indicate + * completion of callback. + */ + mutex_lock(&cpuset_mutex); + rebuild_sched_domains_cpuslocked(); + mutex_unlock(&cpuset_mutex); + } } } -void cpuset_update_active_cpus(void) +static void cpuset_hotplug_workfn(struct work_struct *work) { - /* - * We're inside cpu hotplug critical region which usually nests - * inside cgroup synchronization. Bounce actual hotplug processing - * to a work item to avoid reverse locking order. - */ - schedule_work(&cpuset_hotplug_work); + cpuset_hotplug(true); } -void cpuset_wait_for_hotplug(void) +void cpuset_update_active_cpus(void) { - flush_work(&cpuset_hotplug_work); + cpuset_hotplug(false); } /* diff --git a/kernel/power/process.c b/kernel/power/process.c index 7381d49..c326d72 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -204,8 +204,6 @@ void thaw_processes(void) __usermodehelper_set_disable_depth(UMH_FREEZING); thaw_workqueues(); - cpuset_wait_for_hotplug(); - read_lock(&tasklist_lock); for_each_process_thread(g, p) { /* No other threads should have PF_SUSPEND_TASK set */ diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 5b82a00..efcf753 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5600,7 +5600,6 @@ static void cpuset_cpu_active(void) * restore the original sched domains by considering the * cpuset configurations. */ - cpuset_force_rebuild(); } cpuset_update_active_cpus(); } -- Qualcomm India Private Limited, on behalf of Qualcomm Innovation Center, Inc., is a member of Code Aurora Forum, a Linux Foundation Collaborative Project. ^ permalink raw reply related [flat|nested] 21+ messages in thread
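The use_cpu_hp_lock flag follows the common locked/unlocked function-pair convention: the generic entry point takes cpu_hotplug_lock itself, while the hotplug path, whose caller already holds the lock, calls straight into the variant that assumes it is held (re-taking it there would block behind the hotplug writer). A hypothetical userspace reduction of the pattern, with invented names rather than kernel APIs:

/* pair-sketch.c - locked/unlocked function-pair convention, illustrative only */
#include <pthread.h>
#include <stdio.h>

static pthread_rwlock_t hotplug_lock = PTHREAD_RWLOCK_INITIALIZER;
static pthread_mutex_t cpuset_lock = PTHREAD_MUTEX_INITIALIZER;

/* caller must already hold hotplug_lock in some mode,
 * cf. rebuild_sched_domains_cpuslocked() */
static void rebuild_domains_cpuslocked(void)
{
	pthread_mutex_lock(&cpuset_lock);
	puts("rebuilding sched domains");
	pthread_mutex_unlock(&cpuset_lock);
}

/* generic entry point takes the lock itself, cf. rebuild_sched_domains() */
static void rebuild_domains(void)
{
	pthread_rwlock_rdlock(&hotplug_lock);
	rebuild_domains_cpuslocked();
	pthread_rwlock_unlock(&hotplug_lock);
}

/* hotplug callback runs with hotplug_lock already write-held by its caller,
 * so it must use the _cpuslocked flavour, cf. cpuset_hotplug(false) */
static void hotplug_callback(void)
{
	rebuild_domains_cpuslocked();
}

int main(void)
{
	pthread_rwlock_wrlock(&hotplug_lock);	/* the "hotplug machinery" */
	hotplug_callback();
	pthread_rwlock_unlock(&hotplug_lock);

	rebuild_domains();			/* the userspace-write path */
	return 0;
}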
* Re: [PATCH v3 0/2] Invert cpu_hotplug_lock and cpuset_mutex locking order. 2017-11-15 14:20 ` [PATCH v3 0/2] Invert cpu_hotplug_lock and cpuset_mutex locking order Prateek Sood 2017-11-15 14:20 ` [PATCH v3 1/2] cgroup/cpuset: remove circular dependency deadlock Prateek Sood 2017-11-15 14:20 ` [PATCH v3 2/2] cpuset: Make cpuset hotplug synchronous Prateek Sood @ 2017-11-27 16:48 ` Tejun Heo 2 siblings, 0 replies; 21+ messages in thread From: Tejun Heo @ 2017-11-27 16:48 UTC (permalink / raw) To: Prateek Sood Cc: peterz, longman, lizefan, mingo, boqun.feng, tglx, cgroups, sramana, linux-kernel, clingutla On Wed, Nov 15, 2017 at 07:50:13PM +0530, Prateek Sood wrote: > This patch does following > 1- Remove circular dependency deadlock by inverting order of > cpu_hotplug_lock and cpuset_mutex. > 2- Make cpuset_hotplug_workfn() synchronous for cpu hotplug path. > For memory hotplug path it still gets queued as a work item. > > Prateek Sood (2): > cgroup/cpuset: remove circular dependency deadlock > cpuset: Make cpuset hotplug synchronous Applied to cgroup/for-4.15-fixes. Thanks. -- tejun ^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: [PATCH v2] cgroup/cpuset: remove circular dependency deadlock 2017-11-15 10:37 ` Peter Zijlstra 2017-11-15 14:20 ` [PATCH v3 0/2] Invert cpu_hotplug_lock and cpuset_mutex locking order Prateek Sood @ 2017-11-15 17:05 ` Tejun Heo 2017-11-15 17:18 ` Prateek Sood 1 sibling, 1 reply; 21+ messages in thread From: Tejun Heo @ 2017-11-15 17:05 UTC (permalink / raw) To: Peter Zijlstra Cc: Prateek Sood, longman, lizefan, mingo, boqun.feng, tglx, cgroups, sramana, linux-kernel On Wed, Nov 15, 2017 at 11:37:42AM +0100, Peter Zijlstra wrote: > On Wed, Nov 15, 2017 at 03:56:26PM +0530, Prateek Sood wrote: > > Any improvement/suggestion for this patch? > > I would have done 2 patches, the first one solving the locking issue, > the second removing the then redundant async rebuild stuff. > > Other than that this looks OK I suppose, but I was expecting this to go > through the cgroup tree, TJ? Will pick them up after -rc1. Thanks. -- tejun ^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: [PATCH v2] cgroup/cpuset: remove circular dependency deadlock 2017-11-15 17:05 ` [PATCH v2] cgroup/cpuset: remove circular dependency deadlock Tejun Heo @ 2017-11-15 17:18 ` Prateek Sood 0 siblings, 0 replies; 21+ messages in thread From: Prateek Sood @ 2017-11-15 17:18 UTC (permalink / raw) To: Tejun Heo, Peter Zijlstra Cc: longman, lizefan, mingo, boqun.feng, tglx, cgroups, sramana, linux-kernel On 11/15/2017 10:35 PM, Tejun Heo wrote: > On Wed, Nov 15, 2017 at 11:37:42AM +0100, Peter Zijlstra wrote: >> On Wed, Nov 15, 2017 at 03:56:26PM +0530, Prateek Sood wrote: >>> Any improvement/suggestion for this patch? >> >> I would have done 2 patches, the first one solving the locking issue, >> the second removing the then redundant async rebuild stuff. >> >> Other than that this looks OK I suppose, but I was expecting this to go >> through the cgroup tree, TJ? > > Will pick them up after -rc1. > > Thanks. > I have made two patches as suggested by Peter and sent them for review. -- Qualcomm India Private Limited, on behalf of Qualcomm Innovation Center, Inc., is a member of Code Aurora Forum, a Linux Foundation Collaborative Project ^ permalink raw reply [flat|nested] 21+ messages in thread