* [PATCH] cgroup/cpuset: fix circular locking dependency [not found] <CANaxB-zT+sz=z1FFk5npnwMySdfKCBZDkM+P+=JXDkCXbh=rCw@mail.gmail.com> @ 2017-11-28 11:35 ` Prateek Sood 2017-12-04 5:14 ` Prateek Sood 0 siblings, 1 reply; 39+ messages in thread From: Prateek Sood @ 2017-11-28 11:35 UTC (permalink / raw) To: tj, avagin, mingo, peterz; +Cc: linux-kernel, cgroups, sramana, Prateek Sood CPU1 cpus_read_lock+0x3e/0x80 static_key_slow_inc+0xe/0xa0 cpuset_css_online+0x62/0x330 online_css+0x26/0x80 cgroup_apply_control_enable+0x266/0x3d0 cgroup_mkdir+0x37d/0x4f0 kernfs_iop_mkdir+0x53/0x80 vfs_mkdir+0x10e/0x1a0 SyS_mkdirat+0xb3/0xe0 entry_SYSCALL_64_fastpath+0x23/0x9a CPU0 lock_acquire+0xec/0x1e0 __mutex_lock+0x89/0x920 cpuset_write_resmask+0x61/0x1100 cgroup_file_write+0x7b/0x200 kernfs_fop_write+0x112/0x1a0 __vfs_write+0x23/0x150 vfs_write+0xc8/0x1c0 SyS_write+0x45/0xa0 entry_SYSCALL_64_fastpath+0x23/0x9a CPU0 CPU1 ---- ---- lock(cpu_hotplug_lock.rw_sem); lock(cpuset_mutex); lock(cpu_hotplug_lock.rw_sem); lock(cpuset_mutex); Change locking order of cpu_hotplug_lock.rw_sem and cpuset_mutex in cpuset_css_online(). Use _cpuslocked() version for static_branch_inc/static_branch_dec in cpuset_inc()/cpuset_dec(). 
Signed-off-by: Prateek Sood <prsood@codeaurora.org> --- include/linux/cpuset.h | 8 ++++---- include/linux/jump_label.h | 10 ++++++++-- kernel/cgroup/cpuset.c | 4 ++-- kernel/jump_label.c | 13 +++++++++++++ 4 files changed, 27 insertions(+), 8 deletions(-) diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h index 2ab910f..5aadc25 100644 --- a/include/linux/cpuset.h +++ b/include/linux/cpuset.h @@ -40,14 +40,14 @@ static inline bool cpusets_enabled(void) static inline void cpuset_inc(void) { - static_branch_inc(&cpusets_pre_enable_key); - static_branch_inc(&cpusets_enabled_key); + static_branch_inc_cpuslocked(&cpusets_pre_enable_key); + static_branch_inc_cpuslocked(&cpusets_enabled_key); } static inline void cpuset_dec(void) { - static_branch_dec(&cpusets_enabled_key); - static_branch_dec(&cpusets_pre_enable_key); + static_branch_dec_cpuslocked(&cpusets_enabled_key); + static_branch_dec_cpuslocked(&cpusets_pre_enable_key); } extern int cpuset_init(void); diff --git a/include/linux/jump_label.h b/include/linux/jump_label.h index c7b368c..890e21c 100644 --- a/include/linux/jump_label.h +++ b/include/linux/jump_label.h @@ -160,6 +160,8 @@ extern void arch_jump_label_transform_static(struct jump_entry *entry, extern int jump_label_text_reserved(void *start, void *end); extern void static_key_slow_inc(struct static_key *key); extern void static_key_slow_dec(struct static_key *key); +extern void static_key_slow_incr_cpuslocked(struct static_key *key); +extern void static_key_slow_decr_cpuslocked(struct static_key *key); extern void jump_label_apply_nops(struct module *mod); extern int static_key_count(struct static_key *key); extern void static_key_enable(struct static_key *key); @@ -259,6 +261,8 @@ static inline void static_key_disable(struct static_key *key) #define static_key_enable_cpuslocked(k) static_key_enable((k)) #define static_key_disable_cpuslocked(k) static_key_disable((k)) +#define static_key_slow_incr_cpuslocked(k) static_key_slow_inc((k)) +#define 
static_key_slow_decr_cpuslocked(k) static_key_slow_dec((k)) #define STATIC_KEY_INIT_TRUE { .enabled = ATOMIC_INIT(1) } #define STATIC_KEY_INIT_FALSE { .enabled = ATOMIC_INIT(0) } @@ -414,8 +418,10 @@ struct static_key_false { * Advanced usage; refcount, branch is enabled when: count != 0 */ -#define static_branch_inc(x) static_key_slow_inc(&(x)->key) -#define static_branch_dec(x) static_key_slow_dec(&(x)->key) +#define static_branch_inc(x) static_key_slow_inc(&(x)->key) +#define static_branch_dec(x) static_key_slow_dec(&(x)->key) +#define static_branch_inc_cpuslocked(x) static_key_slow_incr_cpuslocked(&(x)->key) +#define static_branch_dec_cpuslocked(x) static_key_slow_decr_cpuslocked(&(x)->key) /* * Normal usage; boolean enable/disable. diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 227bc25..4ad8bae 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -1985,7 +1985,7 @@ static int cpuset_css_online(struct cgroup_subsys_state *css) if (!parent) return 0; - mutex_lock(&cpuset_mutex); + cpuset_sched_change_begin(); set_bit(CS_ONLINE, &cs->flags); if (is_spread_page(parent)) @@ -2034,7 +2034,7 @@ static int cpuset_css_online(struct cgroup_subsys_state *css) cpumask_copy(cs->effective_cpus, parent->cpus_allowed); spin_unlock_irq(&callback_lock); out_unlock: - mutex_unlock(&cpuset_mutex); + cpuset_sched_change_end(); return 0; } diff --git a/kernel/jump_label.c b/kernel/jump_label.c index 8594d24..dde0eaa 100644 --- a/kernel/jump_label.c +++ b/kernel/jump_label.c @@ -126,6 +126,12 @@ void static_key_slow_inc(struct static_key *key) } EXPORT_SYMBOL_GPL(static_key_slow_inc); +void static_key_slow_incr_cpuslocked(struct static_key *key) +{ + static_key_slow_inc_cpuslocked(key); +} +EXPORT_SYMBOL_GPL(static_key_slow_incr_cpuslocked); + void static_key_enable_cpuslocked(struct static_key *key) { STATIC_KEY_CHECK_USE(key); @@ -229,6 +235,13 @@ void static_key_slow_dec(struct static_key *key) } EXPORT_SYMBOL_GPL(static_key_slow_dec); +void 
static_key_slow_decr_cpuslocked(struct static_key *key) +{ + STATIC_KEY_CHECK_USE(key); + static_key_slow_dec_cpuslocked(key, 0, NULL); +} +EXPORT_SYMBOL_GPL(static_key_slow_decr_cpuslocked); + void static_key_slow_dec_deferred(struct static_key_deferred *key) { STATIC_KEY_CHECK_USE(key); -- Qualcomm India Private Limited, on behalf of Qualcomm Innovation Center, Inc., is a member of Code Aurora Forum, a Linux Foundation Collaborative Project. ^ permalink raw reply related [flat|nested] 39+ messages in thread
* Re: [PATCH] cgroup/cpuset: fix circular locking dependency 2017-11-28 11:35 ` [PATCH] cgroup/cpuset: fix circular locking dependency Prateek Sood @ 2017-12-04 5:14 ` Prateek Sood 2017-12-04 20:22 ` Tejun Heo 0 siblings, 1 reply; 39+ messages in thread From: Prateek Sood @ 2017-12-04 5:14 UTC (permalink / raw) To: tj, avagin, mingo, peterz; +Cc: linux-kernel, cgroups, sramana On 11/28/2017 05:05 PM, Prateek Sood wrote: > CPU1 > cpus_read_lock+0x3e/0x80 > static_key_slow_inc+0xe/0xa0 > cpuset_css_online+0x62/0x330 > online_css+0x26/0x80 > cgroup_apply_control_enable+0x266/0x3d0 > cgroup_mkdir+0x37d/0x4f0 > kernfs_iop_mkdir+0x53/0x80 > vfs_mkdir+0x10e/0x1a0 > SyS_mkdirat+0xb3/0xe0 > entry_SYSCALL_64_fastpath+0x23/0x9a > > CPU0 > lock_acquire+0xec/0x1e0 > __mutex_lock+0x89/0x920 > cpuset_write_resmask+0x61/0x1100 > cgroup_file_write+0x7b/0x200 > kernfs_fop_write+0x112/0x1a0 > __vfs_write+0x23/0x150 > vfs_write+0xc8/0x1c0 > SyS_write+0x45/0xa0 > entry_SYSCALL_64_fastpath+0x23/0x9a > > CPU0 CPU1 > ---- ---- > lock(cpu_hotplug_lock.rw_sem); > lock(cpuset_mutex); > lock(cpu_hotplug_lock.rw_sem); > lock(cpuset_mutex); > > Change locking order of cpu_hotplug_lock.rw_sem and > cpuset_mutex in cpuset_css_online(). Use _cpuslocked() > version for static_branch_inc/static_branch_dec in > cpuset_inc()/cpuset_dec(). 
> > Signed-off-by: Prateek Sood <prsood@codeaurora.org> > --- > include/linux/cpuset.h | 8 ++++---- > include/linux/jump_label.h | 10 ++++++++-- > kernel/cgroup/cpuset.c | 4 ++-- > kernel/jump_label.c | 13 +++++++++++++ > 4 files changed, 27 insertions(+), 8 deletions(-) > > diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h > index 2ab910f..5aadc25 100644 > --- a/include/linux/cpuset.h > +++ b/include/linux/cpuset.h > @@ -40,14 +40,14 @@ static inline bool cpusets_enabled(void) > > static inline void cpuset_inc(void) > { > - static_branch_inc(&cpusets_pre_enable_key); > - static_branch_inc(&cpusets_enabled_key); > + static_branch_inc_cpuslocked(&cpusets_pre_enable_key); > + static_branch_inc_cpuslocked(&cpusets_enabled_key); > } > > static inline void cpuset_dec(void) > { > - static_branch_dec(&cpusets_enabled_key); > - static_branch_dec(&cpusets_pre_enable_key); > + static_branch_dec_cpuslocked(&cpusets_enabled_key); > + static_branch_dec_cpuslocked(&cpusets_pre_enable_key); > } > > extern int cpuset_init(void); > diff --git a/include/linux/jump_label.h b/include/linux/jump_label.h > index c7b368c..890e21c 100644 > --- a/include/linux/jump_label.h > +++ b/include/linux/jump_label.h > @@ -160,6 +160,8 @@ extern void arch_jump_label_transform_static(struct jump_entry *entry, > extern int jump_label_text_reserved(void *start, void *end); > extern void static_key_slow_inc(struct static_key *key); > extern void static_key_slow_dec(struct static_key *key); > +extern void static_key_slow_incr_cpuslocked(struct static_key *key); > +extern void static_key_slow_decr_cpuslocked(struct static_key *key); > extern void jump_label_apply_nops(struct module *mod); > extern int static_key_count(struct static_key *key); > extern void static_key_enable(struct static_key *key); > @@ -259,6 +261,8 @@ static inline void static_key_disable(struct static_key *key) > > #define static_key_enable_cpuslocked(k) static_key_enable((k)) > #define static_key_disable_cpuslocked(k) 
static_key_disable((k)) > +#define static_key_slow_incr_cpuslocked(k) static_key_slow_inc((k)) > +#define static_key_slow_decr_cpuslocked(k) static_key_slow_dec((k)) > > #define STATIC_KEY_INIT_TRUE { .enabled = ATOMIC_INIT(1) } > #define STATIC_KEY_INIT_FALSE { .enabled = ATOMIC_INIT(0) } > @@ -414,8 +418,10 @@ struct static_key_false { > * Advanced usage; refcount, branch is enabled when: count != 0 > */ > > -#define static_branch_inc(x) static_key_slow_inc(&(x)->key) > -#define static_branch_dec(x) static_key_slow_dec(&(x)->key) > +#define static_branch_inc(x) static_key_slow_inc(&(x)->key) > +#define static_branch_dec(x) static_key_slow_dec(&(x)->key) > +#define static_branch_inc_cpuslocked(x) static_key_slow_incr_cpuslocked(&(x)->key) > +#define static_branch_dec_cpuslocked(x) static_key_slow_decr_cpuslocked(&(x)->key) > > /* > * Normal usage; boolean enable/disable. > diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c > index 227bc25..4ad8bae 100644 > --- a/kernel/cgroup/cpuset.c > +++ b/kernel/cgroup/cpuset.c > @@ -1985,7 +1985,7 @@ static int cpuset_css_online(struct cgroup_subsys_state *css) > if (!parent) > return 0; > > - mutex_lock(&cpuset_mutex); > + cpuset_sched_change_begin(); > > set_bit(CS_ONLINE, &cs->flags); > if (is_spread_page(parent)) > @@ -2034,7 +2034,7 @@ static int cpuset_css_online(struct cgroup_subsys_state *css) > cpumask_copy(cs->effective_cpus, parent->cpus_allowed); > spin_unlock_irq(&callback_lock); > out_unlock: > - mutex_unlock(&cpuset_mutex); > + cpuset_sched_change_end(); > return 0; > } > > diff --git a/kernel/jump_label.c b/kernel/jump_label.c > index 8594d24..dde0eaa 100644 > --- a/kernel/jump_label.c > +++ b/kernel/jump_label.c > @@ -126,6 +126,12 @@ void static_key_slow_inc(struct static_key *key) > } > EXPORT_SYMBOL_GPL(static_key_slow_inc); > > +void static_key_slow_incr_cpuslocked(struct static_key *key) > +{ > + static_key_slow_inc_cpuslocked(key); > +} > +EXPORT_SYMBOL_GPL(static_key_slow_incr_cpuslocked); > 
+ > void static_key_enable_cpuslocked(struct static_key *key) > { > STATIC_KEY_CHECK_USE(key); > @@ -229,6 +235,13 @@ void static_key_slow_dec(struct static_key *key) > } > EXPORT_SYMBOL_GPL(static_key_slow_dec); > > +void static_key_slow_decr_cpuslocked(struct static_key *key) > +{ > + STATIC_KEY_CHECK_USE(key); > + static_key_slow_dec_cpuslocked(key, 0, NULL); > +} > +EXPORT_SYMBOL_GPL(static_key_slow_decr_cpuslocked); > + > void static_key_slow_dec_deferred(struct static_key_deferred *key) > { > STATIC_KEY_CHECK_USE(key); > TJ, Any feedback/suggestion for this patch? -- Qualcomm India Private Limited, on behalf of Qualcomm Innovation Center, Inc., is a member of Code Aurora Forum, a Linux Foundation Collaborative Project ^ permalink raw reply [flat|nested] 39+ messages in thread
* Re: [PATCH] cgroup/cpuset: fix circular locking dependency 2017-12-04 5:14 ` Prateek Sood @ 2017-12-04 20:22 ` Tejun Heo 2017-12-04 22:58 ` Tejun Heo 0 siblings, 1 reply; 39+ messages in thread From: Tejun Heo @ 2017-12-04 20:22 UTC (permalink / raw) To: Prateek Sood; +Cc: avagin, mingo, peterz, linux-kernel, cgroups, sramana Hello, On Mon, Dec 04, 2017 at 10:44:49AM +0530, Prateek Sood wrote: > Any feedback/suggestion for this patch? Sorry about the delay. I'm a bit worried because it feels like we're chasing a squirrel. I'll think through the recent changes and this one and get back to you. Thanks. -- tejun ^ permalink raw reply [flat|nested] 39+ messages in thread
* Re: [PATCH] cgroup/cpuset: fix circular locking dependency 2017-12-04 20:22 ` Tejun Heo @ 2017-12-04 22:58 ` Tejun Heo 2017-12-04 23:01 ` Peter Zijlstra 0 siblings, 1 reply; 39+ messages in thread From: Tejun Heo @ 2017-12-04 22:58 UTC (permalink / raw) To: Prateek Sood; +Cc: avagin, mingo, peterz, linux-kernel, cgroups, sramana Hello, again. On Mon, Dec 04, 2017 at 12:22:19PM -0800, Tejun Heo wrote: > Hello, > > On Mon, Dec 04, 2017 at 10:44:49AM +0530, Prateek Sood wrote: > > Any feedback/suggestion for this patch? > > Sorry about the delay. I'm a bit worried because it feels like we're > chasing a squirrel. I'll think through the recent changes and this > one and get back to you. Can you please take a look at the following pending commit? https://git.kernel.org/pub/scm/linux/kernel/git/tj/wq.git/commit/?h=for-4.15-fixes&id=e8b3f8db7aad99fcc5234fc5b89984ff6620de3d AFAICS, this should remove the circular dependency you originally reported. I'll revert the two cpuset commits for now. Thanks. -- tejun ^ permalink raw reply [flat|nested] 39+ messages in thread
* Re: [PATCH] cgroup/cpuset: fix circular locking dependency 2017-12-04 22:58 ` Tejun Heo @ 2017-12-04 23:01 ` Peter Zijlstra 2017-12-08 9:40 ` Prateek Sood 2017-12-11 15:20 ` [PATCH] cgroup/cpuset: fix circular locking dependency Tejun Heo 0 siblings, 2 replies; 39+ messages in thread From: Peter Zijlstra @ 2017-12-04 23:01 UTC (permalink / raw) To: Tejun Heo; +Cc: Prateek Sood, avagin, mingo, linux-kernel, cgroups, sramana On Mon, Dec 04, 2017 at 02:58:25PM -0800, Tejun Heo wrote: > Hello, again. > > On Mon, Dec 04, 2017 at 12:22:19PM -0800, Tejun Heo wrote: > > Hello, > > > > On Mon, Dec 04, 2017 at 10:44:49AM +0530, Prateek Sood wrote: > > > Any feedback/suggestion for this patch? > > > > Sorry about the delay. I'm a bit worried because it feels like we're > > chasing a squirrel. I'll think through the recent changes and this > > one and get back to you. > > Can you please take a look at the following pending commit? > > https://git.kernel.org/pub/scm/linux/kernel/git/tj/wq.git/commit/?h=for-4.15-fixes&id=e8b3f8db7aad99fcc5234fc5b89984ff6620de3d > > AFAICS, this should remove the circular dependency you originally > reported. I'll revert the two cpuset commits for now. So I liked his patches in that we would be able to go back to synchronous sched_domain building. ^ permalink raw reply [flat|nested] 39+ messages in thread
* Re: [PATCH] cgroup/cpuset: fix circular locking dependency 2017-12-04 23:01 ` Peter Zijlstra @ 2017-12-08 9:40 ` Prateek Sood 2017-12-08 11:45 ` Prateek Sood 2017-12-11 15:20 ` [PATCH] cgroup/cpuset: fix circular locking dependency Tejun Heo 1 sibling, 1 reply; 39+ messages in thread From: Prateek Sood @ 2017-12-08 9:40 UTC (permalink / raw) To: Peter Zijlstra, Tejun Heo; +Cc: avagin, mingo, linux-kernel, cgroups, sramana On 12/05/2017 04:31 AM, Peter Zijlstra wrote: > On Mon, Dec 04, 2017 at 02:58:25PM -0800, Tejun Heo wrote: >> Hello, again. >> >> On Mon, Dec 04, 2017 at 12:22:19PM -0800, Tejun Heo wrote: >>> Hello, >>> >>> On Mon, Dec 04, 2017 at 10:44:49AM +0530, Prateek Sood wrote: >>>> Any feedback/suggestion for this patch? >>> >>> Sorry about the delay. I'm a bit worried because it feels like we're >>> chasing a squirrel. I'll think through the recent changes and this >>> one and get back to you. >> >> Can you please take a look at the following pending commit? >> >> https://git.kernel.org/pub/scm/linux/kernel/git/tj/wq.git/commit/?h=for-4.15-fixes&id=e8b3f8db7aad99fcc5234fc5b89984ff6620de3d >> >> AFAICS, this should remove the circular dependency you originally >> reported. I'll revert the two cpuset commits for now. > > So I liked his patches in that we would be able to go back to > synchronous sched_domain building. > https://git.kernel.org/pub/scm/linux/kernel/git/tj/wq.git/commit/?h=for-4.15-fixes&id=e8b3f8db7aad99fcc5234fc5b89984ff6620de3d This will fix the original circular locking dependency issue. I will let you both (Peter & TJ) to decide on which one to pick. -- Qualcomm India Private Limited, on behalf of Qualcomm Innovation Center, Inc., is a member of Code Aurora Forum, a Linux Foundation Collaborative Project ^ permalink raw reply [flat|nested] 39+ messages in thread
* Re: [PATCH] cgroup/cpuset: fix circular locking dependency 2017-12-08 9:40 ` Prateek Sood @ 2017-12-08 11:45 ` Prateek Sood 2017-12-11 15:32 ` Tejun Heo 0 siblings, 1 reply; 39+ messages in thread From: Prateek Sood @ 2017-12-08 11:45 UTC (permalink / raw) To: Peter Zijlstra, Tejun Heo; +Cc: avagin, mingo, linux-kernel, cgroups, sramana On 12/08/2017 03:10 PM, Prateek Sood wrote: > On 12/05/2017 04:31 AM, Peter Zijlstra wrote: >> On Mon, Dec 04, 2017 at 02:58:25PM -0800, Tejun Heo wrote: >>> Hello, again. >>> >>> On Mon, Dec 04, 2017 at 12:22:19PM -0800, Tejun Heo wrote: >>>> Hello, >>>> >>>> On Mon, Dec 04, 2017 at 10:44:49AM +0530, Prateek Sood wrote: >>>>> Any feedback/suggestion for this patch? >>>> >>>> Sorry about the delay. I'm a bit worried because it feels like we're >>>> chasing a squirrel. I'll think through the recent changes and this >>>> one and get back to you. >>> >>> Can you please take a look at the following pending commit? >>> >>> https://git.kernel.org/pub/scm/linux/kernel/git/tj/wq.git/commit/?h=for-4.15-fixes&id=e8b3f8db7aad99fcc5234fc5b89984ff6620de3d >>> >>> AFAICS, this should remove the circular dependency you originally >>> reported. I'll revert the two cpuset commits for now. >> >> So I liked his patches in that we would be able to go back to >> synchronous sched_domain building. >> > https://git.kernel.org/pub/scm/linux/kernel/git/tj/wq.git/commit/?h=for-4.15-fixes&id=e8b3f8db7aad99fcc5234fc5b89984ff6620de3d > > This will fix the original circular locking dependency issue. > I will let you both (Peter & TJ) to decide on which one to > pick. > > > TJ & Peter, There is one deadlock issue during cgroup migration from cpu hotplug path when a task T is being moved from source to destination cgroup. 
kworker/0:0 cpuset_hotplug_workfn() cpuset_hotplug_update_tasks() hotplug_update_tasks_legacy() remove_tasks_in_empty_cpuset() cgroup_transfer_tasks() // stuck in iterator loop cgroup_migrate() cgroup_migrate_add_task() In cgroup_migrate_add_task() it checks for PF_EXITING flag of task T. Task T will not migrate to destination cgroup. css_task_iter_start() will keep pointing to task T in loop waiting for task T cg_list node to be removed. Task T do_exit() exit_signals() // sets PF_EXITING exit_task_namespaces() switch_task_namespaces() free_nsproxy() put_mnt_ns() drop_collected_mounts() namespace_unlock() synchronize_rcu() _synchronize_rcu_expedited() schedule_work() // on cpu0 low priority worker pool wait_event() // waiting for work item to execute Task T inserted a work item in the worklist of cpu0 low priority worker pool. It is waiting for expedited grace period work item to execute. This work item will only be executed once kworker/0:0 complete execution of cpuset_hotplug_workfn(). kworker/0:0 ==> Task T ==>kworker/0:0 Following suggested patch might not be able to fix the above mentioned case: https://git.kernel.org/pub/scm/linux/kernel/git/tj/wq.git/commit/?h=for-4.15-fixes&id=e8b3f8db7aad99fcc5234fc5b89984ff6620de3d Combination of following patches fixes above mentioned scenario as well: 1) Inverting cpuset_mutex and cpu_hotplug_lock locking sequence 2) Making cpuset hotplug work synchronous -- Qualcomm India Private Limited, on behalf of Qualcomm Innovation Center, Inc., is a member of Code Aurora Forum, a Linux Foundation Collaborative Project ^ permalink raw reply [flat|nested] 39+ messages in thread
* Re: [PATCH] cgroup/cpuset: fix circular locking dependency 2017-12-08 11:45 ` Prateek Sood @ 2017-12-11 15:32 ` Tejun Heo 2017-12-13 14:28 ` Prateek Sood 0 siblings, 1 reply; 39+ messages in thread From: Tejun Heo @ 2017-12-11 15:32 UTC (permalink / raw) To: Prateek Sood Cc: Peter Zijlstra, avagin, mingo, linux-kernel, cgroups, sramana Hello, Prateek. On Fri, Dec 08, 2017 at 05:15:55PM +0530, Prateek Sood wrote: > There is one deadlock issue during cgroup migration from cpu > hotplug path when a task T is being moved from source to > destination cgroup. > > kworker/0:0 > cpuset_hotplug_workfn() > cpuset_hotplug_update_tasks() > hotplug_update_tasks_legacy() > remove_tasks_in_empty_cpuset() > cgroup_transfer_tasks() // stuck in iterator loop > cgroup_migrate() > cgroup_migrate_add_task() > > In cgroup_migrate_add_task() it checks for PF_EXITING flag of task T. > Task T will not migrate to destination cgroup. css_task_iter_start() > will keep pointing to task T in loop waiting for task T cg_list node > to be removed. Heh, that's a bug in cgroup_transfer_tasks() which happened because I forgot to update when we changed how we handle exiting tasks. The right thing to do here is making cgroup_transfer_tasks() repeat iff there were a valid migration target which didn't get transferred. Thanks. -- tejun ^ permalink raw reply [flat|nested] 39+ messages in thread
* Re: [PATCH] cgroup/cpuset: fix circular locking dependency 2017-12-11 15:32 ` Tejun Heo @ 2017-12-13 14:28 ` Prateek Sood 2017-12-13 15:40 ` Tejun Heo 0 siblings, 1 reply; 39+ messages in thread From: Prateek Sood @ 2017-12-13 14:28 UTC (permalink / raw) To: Tejun Heo; +Cc: Peter Zijlstra, avagin, mingo, linux-kernel, cgroups, sramana On 12/11/2017 09:02 PM, Tejun Heo wrote: > Hello, Prateek. > > On Fri, Dec 08, 2017 at 05:15:55PM +0530, Prateek Sood wrote: >> There is one deadlock issue during cgroup migration from cpu >> hotplug path when a task T is being moved from source to >> destination cgroup. >> >> kworker/0:0 >> cpuset_hotplug_workfn() >> cpuset_hotplug_update_tasks() >> hotplug_update_tasks_legacy() >> remove_tasks_in_empty_cpuset() >> cgroup_transfer_tasks() // stuck in iterator loop >> cgroup_migrate() >> cgroup_migrate_add_task() >> >> In cgroup_migrate_add_task() it checks for PF_EXITING flag of task T. >> Task T will not migrate to destination cgroup. css_task_iter_start() >> will keep pointing to task T in loop waiting for task T cg_list node >> to be removed. > > Heh, that's a bug in cgroup_transfer_tasks() which happened because I > forgot to update when we changed how we handle exiting tasks. The > right thing to do here is making cgroup_transfer_tasks() repeat iff > there were a valid migration target which didn't get transferred. > > Thanks. > Hi TJ, Did you mean something like below. If not then could you please share a patch for this problem in cgroup_transfer_tasks(). 
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 473e0c0..41de618 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -143,6 +143,8 @@ struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset, void css_task_iter_start(struct cgroup_subsys_state *css, unsigned int flags, struct css_task_iter *it); +void css_task_migrate_iter_start(struct cgroup_subsys_state *css, + unsigned int flags, struct css_task_iter *it); struct task_struct *css_task_iter_next(struct css_task_iter *it); void css_task_iter_end(struct css_task_iter *it); diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index 024085d..12279ae 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -122,7 +122,7 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from) * ->can_attach() fails. */ do { - css_task_iter_start(&from->self, 0, &it); + css_task_migrate_iter_start(&from->self, 0, &it); task = css_task_iter_next(&it); if (task) get_task_struct(task); diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 0b1ffe1..3c1d2d2 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -4189,6 +4189,42 @@ void css_task_iter_start(struct cgroup_subsys_state *css, unsigned int flags, spin_unlock_irq(&css_set_lock); } +void css_task_migrate_iter_start(struct cgroup_subsys_state *css, + unsigned int flags, struct css_task_iter *it) +{ + struct task_struct *task = NULL; + /* no one should try to iterate before mounting cgroups */ + WARN_ON_ONCE(!use_task_css_set_links); + + memset(it, 0, sizeof(*it)); + + spin_lock_irq(&css_set_lock); + + it->ss = css->ss; + it->flags = flags; + + if (it->ss) + it->cset_pos = &css->cgroup->e_csets[css->ss->id]; + else + it->cset_pos = &css->cgroup->cset_links; + + it->cset_head = it->cset_pos; + + css_task_iter_advance_css_set(it); + + while (it->task_pos) { + task = list_entry(it->task_pos, struct task_struct, + cg_list); + + if (likely(!(task->flags & PF_EXITING))) + 
break; + + css_task_iter_advance(it); + } + + spin_unlock_irq(&css_set_lock); +} + /** * css_task_iter_next - return the next task for the iterator * @it: the task iterator being iterated Thanks -- Qualcomm India Private Limited, on behalf of Qualcomm Innovation Center, Inc., is a member of Code Aurora Forum, a Linux Foundation Collaborative Project ^ permalink raw reply related [flat|nested] 39+ messages in thread
* Re: [PATCH] cgroup/cpuset: fix circular locking dependency 2017-12-13 14:28 ` Prateek Sood @ 2017-12-13 15:40 ` Tejun Heo 2017-12-15 8:54 ` Prateek Sood 0 siblings, 1 reply; 39+ messages in thread From: Tejun Heo @ 2017-12-13 15:40 UTC (permalink / raw) To: Prateek Sood Cc: Peter Zijlstra, avagin, mingo, linux-kernel, cgroups, sramana Hello, Prateek. On Wed, Dec 13, 2017 at 07:58:24PM +0530, Prateek Sood wrote: > Did you mean something like below. If not then could you > please share a patch for this problem in > cgroup_transfer_tasks(). Oh we surely can add a new iterator but we can just count in cgroup_transfer_tasks() too, right? Thanks. -- tejun ^ permalink raw reply [flat|nested] 39+ messages in thread
* Re: [PATCH] cgroup/cpuset: fix circular locking dependency 2017-12-13 15:40 ` Tejun Heo @ 2017-12-15 8:54 ` Prateek Sood 2017-12-15 13:22 ` Tejun Heo 0 siblings, 1 reply; 39+ messages in thread From: Prateek Sood @ 2017-12-15 8:54 UTC (permalink / raw) To: Tejun Heo; +Cc: Peter Zijlstra, avagin, mingo, linux-kernel, cgroups, sramana On 12/13/2017 09:10 PM, Tejun Heo wrote: Hi TJ, > Hello, Prateek. > > On Wed, Dec 13, 2017 at 07:58:24PM +0530, Prateek Sood wrote: >> Did you mean something like below. If not then could you >> please share a patch for this problem in >> cgroup_transfer_tasks(). > > Oh we surely can add a new iterator but we can just count in > cgroup_transfer_tasks() too, right? I did not get what you meant by this. Could you please share a patch for this. > > Thanks. > Following are two ways to improve cgroup_transfer_tasks(). In both cases task in PF_EXITING state would be left in source cgroup. It would be removed from cgroup_exit() in exit path. diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index 024085d..e2bdcdb 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -123,7 +123,10 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from) */ do { css_task_iter_start(&from->self, 0, &it); - task = css_task_iter_next(&it); + do { + task = css_task_iter_next(&it); + } while (task && (task->flags & PF_EXITING)); + if (task) get_task_struct(task); css_task_iter_end(&it); ----------------------------8<--------------------------------- diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index 024085d..843b8bb 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -121,12 +121,11 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from) * Migrate tasks one-by-one until @from is empty. This fails iff * ->can_attach() fails. 
*/ + css_task_iter_start(&from->self, 0, &it); do { - css_task_iter_start(&from->self, 0, &it); task = css_task_iter_next(&it); if (task) get_task_struct(task); - css_task_iter_end(&it); if (task) { ret = cgroup_migrate(task, false, &mgctx); @@ -135,6 +134,7 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from) put_task_struct(task); } } while (task && !ret); + css_task_iter_end(&it); out_err: cgroup_migrate_finish(&mgctx); percpu_up_write(&cgroup_threadgroup_rwsem); Thanks -- Qualcomm India Private Limited, on behalf of Qualcomm Innovation Center, Inc., is a member of Code Aurora Forum, a Linux Foundation Collaborative Project ^ permalink raw reply related [flat|nested] 39+ messages in thread
* Re: [PATCH] cgroup/cpuset: fix circular locking dependency 2017-12-15 8:54 ` Prateek Sood @ 2017-12-15 13:22 ` Tejun Heo 2017-12-15 19:06 ` Prateek Sood 0 siblings, 1 reply; 39+ messages in thread From: Tejun Heo @ 2017-12-15 13:22 UTC (permalink / raw) To: Prateek Sood Cc: Peter Zijlstra, avagin, mingo, linux-kernel, cgroups, sramana Hello, Prateek. On Fri, Dec 15, 2017 at 02:24:55PM +0530, Prateek Sood wrote: > Following are two ways to improve cgroup_transfer_tasks(). In > both cases task in PF_EXITING state would be left in source > cgroup. It would be removed from cgroup_exit() in exit path. > > diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c > index 024085d..e2bdcdb 100644 > --- a/kernel/cgroup/cgroup-v1.c > +++ b/kernel/cgroup/cgroup-v1.c > @@ -123,7 +123,10 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from) > */ > do { > css_task_iter_start(&from->self, 0, &it); > - task = css_task_iter_next(&it); > + do { > + task = css_task_iter_next(&it); > + } while (task && (task->flags & PF_EXITING)); > + Yeah, this looks good to me. We can't just make a single pass as in the other one because we can race against fork. And PF_EXITING being left behind is what was happening previously too anyway. They can't be moved. Thanks. -- tejun ^ permalink raw reply [flat|nested] 39+ messages in thread
* Re: [PATCH] cgroup/cpuset: fix circular locking dependency 2017-12-15 13:22 ` Tejun Heo @ 2017-12-15 19:06 ` Prateek Sood 2017-12-19 7:26 ` [PATCH] cgroup: Fix deadlock in cpu hotplug path Prateek Sood 0 siblings, 1 reply; 39+ messages in thread From: Prateek Sood @ 2017-12-15 19:06 UTC (permalink / raw) To: Tejun Heo; +Cc: Peter Zijlstra, avagin, mingo, linux-kernel, cgroups, sramana On 12/15/2017 06:52 PM, Tejun Heo wrote: > Hello, Prateek. > > On Fri, Dec 15, 2017 at 02:24:55PM +0530, Prateek Sood wrote: >> Following are two ways to improve cgroup_transfer_tasks(). In >> both cases task in PF_EXITING state would be left in source >> cgroup. It would be removed from cgroup_exit() in exit path. >> >> diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c >> index 024085d..e2bdcdb 100644 >> --- a/kernel/cgroup/cgroup-v1.c >> +++ b/kernel/cgroup/cgroup-v1.c >> @@ -123,7 +123,10 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from) >> */ >> do { >> css_task_iter_start(&from->self, 0, &it); >> - task = css_task_iter_next(&it); >> + do { >> + task = css_task_iter_next(&it); >> + } while (task && (task->flags & PF_EXITING)); >> + > > Yeah, this looks good to me. We can't just make a single pass as in > the other one because we can race against fork. And PF_EXITING being > left behind is what was happening previously too anyway. They can't > be moved. > > Thanks. > Thanks TJ for reviewing. I will send a formal patch with the above approved approach. -- Qualcomm India Private Limited, on behalf of Qualcomm Innovation Center, Inc., is a member of Code Aurora Forum, a Linux Foundation Collaborative Project ^ permalink raw reply [flat|nested] 39+ messages in thread
* [PATCH] cgroup: Fix deadlock in cpu hotplug path 2017-12-15 19:06 ` Prateek Sood @ 2017-12-19 7:26 ` Prateek Sood 2017-12-19 13:39 ` Tejun Heo 0 siblings, 1 reply; 39+ messages in thread From: Prateek Sood @ 2017-12-19 7:26 UTC (permalink / raw) To: tj; +Cc: peterz, avagin, mingo, linux-kernel, cgroups, sramana, Prateek Sood Deadlock during cgroup migration from cpu hotplug path when a task T is being moved from source to destination cgroup. kworker/0:0 cpuset_hotplug_workfn() cpuset_hotplug_update_tasks() hotplug_update_tasks_legacy() remove_tasks_in_empty_cpuset() cgroup_transfer_tasks() // stuck in iterator loop cgroup_migrate() cgroup_migrate_add_task() In cgroup_migrate_add_task() it checks for PF_EXITING flag of task T. Task T will not migrate to destination cgroup. css_task_iter_start() will keep pointing to task T in loop waiting for task T cg_list node to be removed. Task T do_exit() exit_signals() // sets PF_EXITING exit_task_namespaces() switch_task_namespaces() free_nsproxy() put_mnt_ns() drop_collected_mounts() namespace_unlock() synchronize_rcu() _synchronize_rcu_expedited() schedule_work() // on cpu0 low priority worker pool wait_event() // waiting for work item to execute Task T inserted a work item in the worklist of cpu0 low priority worker pool. It is waiting for expedited grace period work item to execute. This work item will only be executed once kworker/0:0 complete execution of cpuset_hotplug_workfn(). kworker/0:0 ==> Task T ==>kworker/0:0 In case of PF_EXITING task being migrated from source to destination cgroup, migrate next available task in source cgroup. 
Change-Id: I8874fb04479c136cae4dabd5c168c7749df66664 Signed-off-by: Prateek Sood <prsood@codeaurora.org> --- kernel/cgroup/cgroup-v1.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index 024085d..a2c05d2 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -123,7 +123,11 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from) */ do { css_task_iter_start(&from->self, 0, &it); - task = css_task_iter_next(&it); + + do { + task = css_task_iter_next(&it); + } while (task && (task->flags & PF_EXITING)); + if (task) get_task_struct(task); css_task_iter_end(&it); -- Qualcomm India Private Limited, on behalf of Qualcomm Innovation Center, Inc., is a member of Code Aurora Forum, a Linux Foundation Collaborative Project. ^ permalink raw reply related [flat|nested] 39+ messages in thread
* Re: [PATCH] cgroup: Fix deadlock in cpu hotplug path 2017-12-19 7:26 ` [PATCH] cgroup: Fix deadlock in cpu hotplug path Prateek Sood @ 2017-12-19 13:39 ` Tejun Heo 0 siblings, 0 replies; 39+ messages in thread From: Tejun Heo @ 2017-12-19 13:39 UTC (permalink / raw) To: Prateek Sood; +Cc: peterz, avagin, mingo, linux-kernel, cgroups, sramana On Tue, Dec 19, 2017 at 12:56:57PM +0530, Prateek Sood wrote: > Deadlock during cgroup migration from cpu hotplug path when a task T is > being moved from source to destination cgroup. ... > Task T inserted a work item in the worklist of cpu0 low priority > worker pool. It is waiting for expedited grace period work item > to execute. This work item will only be executed once kworker/0:0 > complete execution of cpuset_hotplug_workfn(). > > kworker/0:0 ==> Task T ==>kworker/0:0 > > In case of PF_EXITING task being migrated from source to destination > cgroup, migrate next available task in source cgroup. > > Change-Id: I8874fb04479c136cae4dabd5c168c7749df66664 > Signed-off-by: Prateek Sood <prsood@codeaurora.org> Applied to cgroup/for-4.15-fixes. Thanks a lot, Prateek. -- tejun ^ permalink raw reply [flat|nested] 39+ messages in thread
* Re: [PATCH] cgroup/cpuset: fix circular locking dependency 2017-12-04 23:01 ` Peter Zijlstra 2017-12-08 9:40 ` Prateek Sood @ 2017-12-11 15:20 ` Tejun Heo 2017-12-13 7:50 ` Prateek Sood 1 sibling, 1 reply; 39+ messages in thread From: Tejun Heo @ 2017-12-11 15:20 UTC (permalink / raw) To: Peter Zijlstra Cc: Prateek Sood, avagin, mingo, linux-kernel, cgroups, sramana Hello, Peter. On Tue, Dec 05, 2017 at 12:01:17AM +0100, Peter Zijlstra wrote: > > AFAICS, this should remove the circular dependency you originally > > reported. I'll revert the two cpuset commits for now. > > So I liked his patches in that we would be able to go back to > synchronous sched_domain building. Ah, yeah, that's a separate issue but didn't we intentionally make that asynchronous? IIRC, cpuset migration can take a really long time when the memory migration is turned on and doing that synchronously could mess up the system. Thanks. -- tejun ^ permalink raw reply [flat|nested] 39+ messages in thread
* Re: [PATCH] cgroup/cpuset: fix circular locking dependency 2017-12-11 15:20 ` [PATCH] cgroup/cpuset: fix circular locking dependency Tejun Heo @ 2017-12-13 7:50 ` Prateek Sood 2017-12-13 16:06 ` Tejun Heo 0 siblings, 1 reply; 39+ messages in thread From: Prateek Sood @ 2017-12-13 7:50 UTC (permalink / raw) To: Tejun Heo, Peter Zijlstra; +Cc: avagin, mingo, linux-kernel, cgroups, sramana On 12/11/2017 08:50 PM, Tejun Heo wrote: > Hello, Peter. > > On Tue, Dec 05, 2017 at 12:01:17AM +0100, Peter Zijlstra wrote: >>> AFAICS, this should remove the circular dependency you originally >>> reported. I'll revert the two cpuset commits for now. >> >> So I liked his patches in that we would be able to go back to >> synchronous sched_domain building. > > Ah, yeah, that's a separate issue but didn't we intentionally make > that asynchronous? IIRC, cpuset migration can take a really long time > when the memory migration is turned on and doing that synchronously > could mess up the system. > > Thanks. > Hi TJ, This change makes the usage of cpuset_hotplug_workfn() from cpu hotplug path synchronous. For memory hotplug it still remains asynchronous. Memory migration happening from cpuset_hotplug_workfn() is already asynchronous by queuing cpuset_migrate_mm_workfn() in cpuset_migrate_mm_wq. cpuset_hotplug_workfn() cpuset_hotplug_workfn() cpuset_migrate_mm() queue_work(cpuset_migrate_mm_wq) It seems that memory migration latency might not have impact with this change. Please let me know if you meant something else by cpuset migration taking time when memory migration is turned on. Thanks -- Qualcomm India Private Limited, on behalf of Qualcomm Innovation Center, Inc., is a member of Code Aurora Forum, a Linux Foundation Collaborative Project ^ permalink raw reply [flat|nested] 39+ messages in thread
* Re: [PATCH] cgroup/cpuset: fix circular locking dependency 2017-12-13 7:50 ` Prateek Sood @ 2017-12-13 16:06 ` Tejun Heo 2017-12-15 19:04 ` Prateek Sood 2017-12-28 20:37 ` Prateek Sood 0 siblings, 2 replies; 39+ messages in thread From: Tejun Heo @ 2017-12-13 16:06 UTC (permalink / raw) To: Prateek Sood Cc: Peter Zijlstra, avagin, mingo, linux-kernel, cgroups, sramana Hello, Prateek. On Wed, Dec 13, 2017 at 01:20:46PM +0530, Prateek Sood wrote: > This change makes the usage of cpuset_hotplug_workfn() from cpu > hotplug path synchronous. For memory hotplug it still remains > asynchronous. Ah, right. > Memory migration happening from cpuset_hotplug_workfn() is > already asynchronous by queuing cpuset_migrate_mm_workfn() in > cpuset_migrate_mm_wq. > > cpuset_hotplug_workfn() > cpuset_hotplug_workfn(() > cpuset_migrate_mm() > queue_work(cpuset_migrate_mm_wq) > > It seems that memory migration latency might not have > impact with this change. > > Please let me know if you meant something else by cpuset > migration taking time when memory migration is turned on. No, I didn't. I was just confused about which part became synchronous. So, I don't have anything against making the cpu part synchronous, but let's not do that as the fix to the deadlocks cuz, while we can avoid them by changing cpuset, I don't think cpuset is the root cause for them. If there are benefits to making cpuset cpu migration synchronous, let's do that for those benefits. Thanks. -- tejun ^ permalink raw reply [flat|nested] 39+ messages in thread
* Re: [PATCH] cgroup/cpuset: fix circular locking dependency 2017-12-13 16:06 ` Tejun Heo @ 2017-12-15 19:04 ` Prateek Sood 2017-12-28 20:37 ` Prateek Sood 1 sibling, 0 replies; 39+ messages in thread From: Prateek Sood @ 2017-12-15 19:04 UTC (permalink / raw) To: Tejun Heo; +Cc: Peter Zijlstra, avagin, mingo, linux-kernel, cgroups, sramana On 12/13/2017 09:36 PM, Tejun Heo wrote: > Hello, Prateek. > > On Wed, Dec 13, 2017 at 01:20:46PM +0530, Prateek Sood wrote: >> This change makes the usage of cpuset_hotplug_workfn() from cpu >> hotplug path synchronous. For memory hotplug it still remains >> asynchronous. > > Ah, right. > >> Memory migration happening from cpuset_hotplug_workfn() is >> already asynchronous by queuing cpuset_migrate_mm_workfn() in >> cpuset_migrate_mm_wq. >> >> cpuset_hotplug_workfn() >> cpuset_hotplug_workfn(() >> cpuset_migrate_mm() >> queue_work(cpuset_migrate_mm_wq) >> >> It seems that memory migration latency might not have >> impact with this change. >> >> Please let me know if you meant something else by cpuset >> migration taking time when memory migration is turned on. > > No, I didn't. I was just confused about which part became > synchronous. So, I don't have anything against making the cpu part > synchronous, but let's not do that as the fix to the deadlocks cuz, > while we can avoid them by changing cpuset, I don't think cpuset is > the root cause for them. If there are benefits to making cpuset cpu > migration synchronous, let's do that for those benefits. Making CPU part synchronous can only be achieved if we retain the patch for inverting the locking order for cpuset_mutex and cpu_hotplug_lock. > > Thanks. > Peter, Do you suggest taking both the patches: 1) Inverting locking order of cpuset_mutex and cpu_hotplug_lock. 2) Make cpuset hotplug work synchronous. 
or https://git.kernel.org/pub/scm/linux/kernel/git/tj/wq.git/commit/?h=for-4.15-fixes&id=e8b3f8db7aad99fcc5234fc5b89984ff6620de3d I would leave it for you and TJ to decide on this. Thanks -- Qualcomm India Private Limited, on behalf of Qualcomm Innovation Center, Inc., is a member of Code Aurora Forum, a Linux Foundation Collaborative Project ^ permalink raw reply [flat|nested] 39+ messages in thread
* Re: [PATCH] cgroup/cpuset: fix circular locking dependency 2017-12-13 16:06 ` Tejun Heo 2017-12-15 19:04 ` Prateek Sood @ 2017-12-28 20:37 ` Prateek Sood 2018-01-02 16:16 ` Tejun Heo 1 sibling, 1 reply; 39+ messages in thread From: Prateek Sood @ 2017-12-28 20:37 UTC (permalink / raw) To: Tejun Heo; +Cc: Peter Zijlstra, avagin, mingo, linux-kernel, cgroups, sramana On 12/13/2017 09:36 PM, Tejun Heo wrote: > Hello, Prateek. > > On Wed, Dec 13, 2017 at 01:20:46PM +0530, Prateek Sood wrote: >> This change makes the usage of cpuset_hotplug_workfn() from cpu >> hotplug path synchronous. For memory hotplug it still remains >> asynchronous. > > Ah, right. > >> Memory migration happening from cpuset_hotplug_workfn() is >> already asynchronous by queuing cpuset_migrate_mm_workfn() in >> cpuset_migrate_mm_wq. >> >> cpuset_hotplug_workfn() >> cpuset_hotplug_workfn(() >> cpuset_migrate_mm() >> queue_work(cpuset_migrate_mm_wq) >> >> It seems that memory migration latency might not have >> impact with this change. >> >> Please let me know if you meant something else by cpuset >> migration taking time when memory migration is turned on. > > No, I didn't. I was just confused about which part became > synchronous. So, I don't have anything against making the cpu part > synchronous, but let's not do that as the fix to the deadlocks cuz, > while we can avoid them by changing cpuset, I don't think cpuset is > the root cause for them. If there are benefits to making cpuset cpu > migration synchronous, let's do that for those benefits. > > Thanks. 
> TJ, One more deadlock scenario Task: sh [<ffffff874f917290>] wait_for_completion+0x14 [<ffffff874e8a82e0>] cpuhp_kick_ap_work+0x80 //waiting for cpuhp/2 [<ffffff874f913780>] _cpu_down+0xe0 [<ffffff874e8a9934>] cpu_down+0x38 [<ffffff874ef6e4ec>] cpu_subsys_offline+0x10 Task: cpuhp/2 [<ffffff874f91645c>] schedule+0x38 [<ffffff874e92c76c>] _synchronize_rcu_expedited+0x2ec [<ffffff874e92c874>] synchronize_sched_expedited+0x60 [<ffffff874e92c9f8>] synchronize_sched+0xb0 [<ffffff874e9104e4>] sugov_stop+0x58 [<ffffff874f36967c>] cpufreq_stop_governor+0x48 [<ffffff874f36a89c>] cpufreq_offline+0x84 [<ffffff874f36aa30>] cpuhp_cpufreq_offline+0xc [<ffffff874e8a797c>] cpuhp_invoke_callback+0xac [<ffffff874e8a89b4>] cpuhp_down_callbacks+0x58 [<ffffff874e8a95e8>] cpuhp_thread_fun+0xa8 _synchronize_rcu_expedited is waiting for execution of rcu expedited grace period work item wait_rcu_exp_gp() Task: kworker/2:1 [<ffffff874f91645c>] schedule+0x38 [<ffffff874f916870>] schedule_preempt_disabled+0x20 [<ffffff874f918df8>] __mutex_lock_slowpath+0x158 [<ffffff874f919004>] mutex_lock+0x14 [<ffffff874e8a77b0>] get_online_cpus+0x34 //waiting for cpu_hotplug_lock [<ffffff874e96452c>] rebuild_sched_domains+0x30 [<ffffff874e964648>] cpuset_hotplug_workfn+0xb8 [<ffffff874e8c27b8>] process_one_work+0x168 [<ffffff874e8c2ff4>] worker_thread+0x140 [<ffffff874e8c95b8>] kthread+0xe0 cpu_hotplug_lock is acquired by task: sh Task: kworker/2:3 [<ffffff874f91645c>] schedule+0x38 [<ffffff874f91a384>] schedule_timeout+0x1d8 [<ffffff874f9171d4>] wait_for_common+0xb4 [<ffffff874f917304>] wait_for_completion_killable+0x14 //waiting for kthreadd [<ffffff874e8c931c>] __kthread_create_on_node+0xec [<ffffff874e8c9448>] kthread_create_on_node+0x64 [<ffffff874e8c2d88>] create_worker+0xb4 [<ffffff874e8c3194>] worker_thread+0x2e0 [<ffffff874e8c95b8>] kthread+0xe0 Task: kthreadd [<ffffff874e8858ec>] __switch_to+0x94 [<ffffff874f915ed8>] __schedule+0x2a8 [<ffffff874f91645c>] schedule+0x38 [<ffffff874f91a0ec>] 
rwsem_down_read_failed+0xe8 [<ffffff874e913dc4>] __percpu_down_read+0xfc [<ffffff874e8a4ab0>] copy_process.isra.72.part.73+0xf60 [<ffffff874e8a53b8>] _do_fork+0xc4 [<ffffff874e8a5720>] kernel_thread+0x34 [<ffffff874e8ca83c>] kthreadd+0x144 kthreadd is waiting for cgroup_threadgroup_rwsem acquired by task T Task: T [<ffffff874f91645c>] schedule+0x38 [<ffffff874f916870>] schedule_preempt_disabled+0x20 [<ffffff874f918df8>] __mutex_lock_slowpath+0x158 [<ffffff874f919004>] mutex_lock+0x14 [<ffffff874e962ff4>] cpuset_can_attach+0x58 [<ffffff874e95d640>] cgroup_taskset_migrate+0x8c [<ffffff874e95d9b4>] cgroup_migrate+0xa4 [<ffffff874e95daf0>] cgroup_attach_task+0x100 [<ffffff874e95df28>] __cgroup_procs_write.isra.35+0x228 [<ffffff874e95e00c>] cgroup_tasks_write+0x10 [<ffffff874e958294>] cgroup_file_write+0x44 [<ffffff874eaa4384>] kernfs_fop_write+0xc0 task T is waiting for cpuset_mutex acquired by kworker/2:1 sh ==> cpuhp/2 ==> kworker/2:1 ==> sh kworker/2:3 ==> kthreadd ==> Task T ==> kworker/2:1 It seems that my earlier patch set should fix this scenario: 1) Inverting locking order of cpuset_mutex and cpu_hotplug_lock. 2) Make cpuset hotplug work synchronous. Could you please share your feedback. Thanks -- Qualcomm India Private Limited, on behalf of Qualcomm Innovation Center, Inc., is a member of Code Aurora Forum, a Linux Foundation Collaborative Project ^ permalink raw reply [flat|nested] 39+ messages in thread
* Re: [PATCH] cgroup/cpuset: fix circular locking dependency 2017-12-28 20:37 ` Prateek Sood @ 2018-01-02 16:16 ` Tejun Heo 2018-01-02 17:44 ` Paul E. McKenney 2018-01-15 12:02 ` Prateek Sood 0 siblings, 2 replies; 39+ messages in thread From: Tejun Heo @ 2018-01-02 16:16 UTC (permalink / raw) To: Prateek Sood Cc: Peter Zijlstra, avagin, mingo, linux-kernel, cgroups, sramana, Paul E. McKenney Hello, On Fri, Dec 29, 2017 at 02:07:16AM +0530, Prateek Sood wrote: > task T is waiting for cpuset_mutex acquired > by kworker/2:1 > > sh ==> cpuhp/2 ==> kworker/2:1 ==> sh > > kworker/2:3 ==> kthreadd ==> Task T ==> kworker/2:1 > > It seems that my earlier patch set should fix this scenario: > 1) Inverting locking order of cpuset_mutex and cpu_hotplug_lock. > 2) Make cpuset hotplug work synchronous. > > Could you please share your feedback. Hmm... this can also be resolved by adding WQ_MEM_RECLAIM to the synchronize rcu workqueue, right? Given the wide-spread usages of synchronize_rcu and friends, maybe that's the right solution, or at least something we also need to do, for this particular deadlock? Again, I don't have anything against making the domain rebuilding part of cpuset operations synchronous and these tricky deadlock scenarios do indicate that doing so would probably be beneficial. That said, tho, these scenarios seem more of manifestations of other problems exposed through kthreadd dependency than anything else. Thanks. -- tejun ^ permalink raw reply [flat|nested] 39+ messages in thread
* Re: [PATCH] cgroup/cpuset: fix circular locking dependency 2018-01-02 16:16 ` Tejun Heo @ 2018-01-02 17:44 ` Paul E. McKenney 2018-01-02 18:01 ` Paul E. McKenney 2018-01-15 12:02 ` Prateek Sood 1 sibling, 1 reply; 39+ messages in thread From: Paul E. McKenney @ 2018-01-02 17:44 UTC (permalink / raw) To: Tejun Heo Cc: Prateek Sood, Peter Zijlstra, avagin, mingo, linux-kernel, cgroups, sramana On Tue, Jan 02, 2018 at 08:16:56AM -0800, Tejun Heo wrote: > Hello, > > On Fri, Dec 29, 2017 at 02:07:16AM +0530, Prateek Sood wrote: > > task T is waiting for cpuset_mutex acquired > > by kworker/2:1 > > > > sh ==> cpuhp/2 ==> kworker/2:1 ==> sh > > > > kworker/2:3 ==> kthreadd ==> Task T ==> kworker/2:1 > > > > It seems that my earlier patch set should fix this scenario: > > 1) Inverting locking order of cpuset_mutex and cpu_hotplug_lock. > > 2) Make cpuset hotplug work synchronous. > > > > Could you please share your feedback. > > Hmm... this can also be resolved by adding WQ_MEM_RECLAIM to the > synchronize rcu workqueue, right? Given the wide-spread usages of > synchronize_rcu and friends, maybe that's the right solution, or at > least something we also need to do, for this particular deadlock? To make WQ_MEM_RECLAIM work, I need to dynamically allocate RCU's workqueues, correct? Or is there some way to mark a statically allocated workqueue as WQ_MEM_RECLAIM after the fact? I can dynamically allocate them, but I need to carefully investigate boot-time use. So if it is possible to be lazy, I do want to take the easy way out. ;-) Thanx, Paul > Again, I don't have anything against making the domain rebuliding part > of cpuset operations synchronous and these tricky deadlock scenarios > do indicate that doing so would probably be beneficial. That said, > tho, these scenarios seem more of manifestations of other problems > exposed through kthreadd dependency than anything else. > > Thanks. > > -- > tejun > ^ permalink raw reply [flat|nested] 39+ messages in thread
* Re: [PATCH] cgroup/cpuset: fix circular locking dependency 2018-01-02 17:44 ` Paul E. McKenney @ 2018-01-02 18:01 ` Paul E. McKenney 2018-01-08 12:28 ` Tejun Heo 0 siblings, 1 reply; 39+ messages in thread From: Paul E. McKenney @ 2018-01-02 18:01 UTC (permalink / raw) To: Tejun Heo Cc: Prateek Sood, Peter Zijlstra, avagin, mingo, linux-kernel, cgroups, sramana On Tue, Jan 02, 2018 at 09:44:08AM -0800, Paul E. McKenney wrote: > On Tue, Jan 02, 2018 at 08:16:56AM -0800, Tejun Heo wrote: > > Hello, > > > > On Fri, Dec 29, 2017 at 02:07:16AM +0530, Prateek Sood wrote: > > > task T is waiting for cpuset_mutex acquired > > > by kworker/2:1 > > > > > > sh ==> cpuhp/2 ==> kworker/2:1 ==> sh > > > > > > kworker/2:3 ==> kthreadd ==> Task T ==> kworker/2:1 > > > > > > It seems that my earlier patch set should fix this scenario: > > > 1) Inverting locking order of cpuset_mutex and cpu_hotplug_lock. > > > 2) Make cpuset hotplug work synchronous. > > > > > > Could you please share your feedback. > > > > Hmm... this can also be resolved by adding WQ_MEM_RECLAIM to the > > synchronize rcu workqueue, right? Given the wide-spread usages of > > synchronize_rcu and friends, maybe that's the right solution, or at > > least something we also need to do, for this particular deadlock? > > To make WQ_MEM_RECLAIM work, I need to dynamically allocate RCU's > workqueues, correct? Or is there some way to mark a statically > allocated workqueue as WQ_MEM_RECLAIM after the fact? > > I can dynamically allocate them, but I need to carefully investigate > boot-time use. So if it is possible to be lazy, I do want to take > the easy way out. ;-) Actually, after taking a quick look, could you please supply me with a way of mark a statically allocated workqueue as WQ_MEM_RECLAIM after the fact? Otherwise, I end up having to check for the workqueue having been allocated pretty much each time I use it, which is going to be an open invitation for bugs. 
Plus it looks like there are ways that RCU's workqueue wakeups can be executed during very early boot, which can be handled, but again in a rather messy fashion. In contrast, given a way of marking a statically allocated workqueue as WQ_MEM_RECLAIM after the fact, I simply continue initializing the workqueue at early boot, and then add the WQ_MEM_RECLAIM marking some arbitrarily chosen time after the scheduler has been initialized. The required change to workqueues looks easy, just move the body of the "if (flags & WQ_MEM_RECLAIM) {" statement in __alloc_workqueue_key() to a separate function, right? Thanx, Paul ^ permalink raw reply [flat|nested] 39+ messages in thread
* Re: [PATCH] cgroup/cpuset: fix circular locking dependency 2018-01-02 18:01 ` Paul E. McKenney @ 2018-01-08 12:28 ` Tejun Heo 2018-01-08 13:47 ` [PATCH wq/for-4.16 1/2] workqueue: separate out init_rescuer() Tejun Heo 2018-01-08 22:52 ` [PATCH] cgroup/cpuset: fix circular locking dependency Paul E. McKenney 0 siblings, 2 replies; 39+ messages in thread From: Tejun Heo @ 2018-01-08 12:28 UTC (permalink / raw) To: Paul E. McKenney Cc: Prateek Sood, Peter Zijlstra, avagin, mingo, linux-kernel, cgroups, sramana Hello, Paul. Sorry about the delay. Travel followed by cold. :( On Tue, Jan 02, 2018 at 10:01:19AM -0800, Paul E. McKenney wrote: > Actually, after taking a quick look, could you please supply me with > a way of mark a statically allocated workqueue as WQ_MEM_RECLAIM after > the fact? Otherwise, I end up having to check for the workqueue having Hmmm... there is no statically allocated workqueue tho. If you're referring to the system-wide workqueues (system*_wq), they're just created dynamically early during boot. > been allocated pretty much each time I use it, which is going to be an > open invitation for bugs. Plus it looks like there are ways that RCU's > workqueue wakeups can be executed during very early boot, which can be > handled, but again in a rather messy fashion. > > In contrast, given a way of mark a statically allocated workqueue > as WQ_MEM_RECLAIM after the fact, I simply continue initializing the > workqueue at early boot, and then add the WQ_MEM_RECLAIM marking some > arbitrarily chosen time after the scheduler has been initialized. > > The required change to workqueues looks easy, just move the body of > the "if (flags & WQ_MEM_RECLAIM) {" statement in __alloc_workqueue_key() > to a separate function, right? Ah, okay, yes, currently, workqueue init is kinda silly in that while it allows init of non-mem-reclaiming workqueues way before workqueue is actually online, it doesn't allow the same for mem-reclaiming ones. 
As you pointed out, it's just an oversight on my part as the init path split was done initially to accommodate early init of system workqueues. I'll update the code so that rescuers can be added later too; however, please note that while the work items may be queued, they won't be executed until workqueue_init() is run (the same as now) as there can't be worker threads anyway before that point. Thanks. -- tejun ^ permalink raw reply [flat|nested] 39+ messages in thread
* [PATCH wq/for-4.16 1/2] workqueue: separate out init_rescuer() 2018-01-08 12:28 ` Tejun Heo @ 2018-01-08 13:47 ` Tejun Heo 2018-01-08 13:47 ` [PATCH wq/for-4.16 2/2] workqueue: allow WQ_MEM_RECLAIM on early init workqueues Tejun Heo 2018-01-08 22:52 ` [PATCH] cgroup/cpuset: fix circular locking dependency Paul E. McKenney 1 sibling, 1 reply; 39+ messages in thread From: Tejun Heo @ 2018-01-08 13:47 UTC (permalink / raw) To: Paul E. McKenney Cc: Prateek Sood, Peter Zijlstra, avagin, mingo, linux-kernel, cgroups, sramana Hello, Paul. Applied the following two patches to allow early init of WQ_MEM_RECLAIM workqueues to the following git branch. git://git.kernel.org/pub/scm/linux/kernel/git/tj/wq.git for-4.16 The branch won't be rebased and currently only contains these two patches. Please feel free to pull and put further RCU changes on top. Thanks. ------ 8< ------ >From 983c751532738b83c4c5b51192ebc6fbfe9330f7 Mon Sep 17 00:00:00 2001 From: Tejun Heo <tj@kernel.org> Date: Mon, 8 Jan 2018 05:38:32 -0800 Separate out init_rescuer() from __alloc_workqueue_key() to prepare for early init support for WQ_MEM_RECLAIM. This patch doesn't introduce any functional changes. Signed-off-by: Tejun Heo <tj@kernel.org> Cc: Paul McKenney <paulmck@linux.vnet.ibm.com> --- kernel/workqueue.c | 56 ++++++++++++++++++++++++++++++++---------------------- 1 file changed, 33 insertions(+), 23 deletions(-) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 43d18cb..5baac7c 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -3939,6 +3939,37 @@ static int wq_clamp_max_active(int max_active, unsigned int flags, return clamp_val(max_active, 1, lim); } +/* + * Workqueues which may be used during memory reclaim should have a rescuer + * to guarantee forward progress. 
+ */ +static int init_rescuer(struct workqueue_struct *wq) +{ + struct worker *rescuer; + int ret; + + if (!(wq->flags & WQ_MEM_RECLAIM)) + return 0; + + rescuer = alloc_worker(NUMA_NO_NODE); + if (!rescuer) + return -ENOMEM; + + rescuer->rescue_wq = wq; + rescuer->task = kthread_create(rescuer_thread, rescuer, "%s", wq->name); + ret = PTR_ERR_OR_ZERO(rescuer->task); + if (ret) { + kfree(rescuer); + return ret; + } + + wq->rescuer = rescuer; + kthread_bind_mask(rescuer->task, cpu_possible_mask); + wake_up_process(rescuer->task); + + return 0; +} + struct workqueue_struct *__alloc_workqueue_key(const char *fmt, unsigned int flags, int max_active, @@ -4001,29 +4032,8 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt, if (alloc_and_link_pwqs(wq) < 0) goto err_free_wq; - /* - * Workqueues which may be used during memory reclaim should - * have a rescuer to guarantee forward progress. - */ - if (flags & WQ_MEM_RECLAIM) { - struct worker *rescuer; - - rescuer = alloc_worker(NUMA_NO_NODE); - if (!rescuer) - goto err_destroy; - - rescuer->rescue_wq = wq; - rescuer->task = kthread_create(rescuer_thread, rescuer, "%s", - wq->name); - if (IS_ERR(rescuer->task)) { - kfree(rescuer); - goto err_destroy; - } - - wq->rescuer = rescuer; - kthread_bind_mask(rescuer->task, cpu_possible_mask); - wake_up_process(rescuer->task); - } + if (init_rescuer(wq) < 0) + goto err_destroy; if ((wq->flags & WQ_SYSFS) && workqueue_sysfs_register(wq)) goto err_destroy; -- 2.9.5 ^ permalink raw reply related [flat|nested] 39+ messages in thread
* [PATCH wq/for-4.16 2/2] workqueue: allow WQ_MEM_RECLAIM on early init workqueues 2018-01-08 13:47 ` [PATCH wq/for-4.16 1/2] workqueue: separate out init_rescuer() Tejun Heo @ 2018-01-08 13:47 ` Tejun Heo 0 siblings, 0 replies; 39+ messages in thread From: Tejun Heo @ 2018-01-08 13:47 UTC (permalink / raw) To: Paul E. McKenney Cc: Prateek Sood, Peter Zijlstra, avagin, mingo, linux-kernel, cgroups, sramana >From 40c17f75dfa9ec163478efd3f7443fd6af9992c9 Mon Sep 17 00:00:00 2001 From: Tejun Heo <tj@kernel.org> Date: Mon, 8 Jan 2018 05:38:37 -0800 Workqueues can be created early during boot before workqueue subsystem in fully online - work items are queued waiting for later full initialization. However, early init wasn't supported for WQ_MEM_RECLAIM workqueues causing unnecessary annoyances for a subset of users. Expand early init support to include WQ_MEM_RECLAIM workqueues. Signed-off-by: Tejun Heo <tj@kernel.org> Cc: Paul McKenney <paulmck@linux.vnet.ibm.com> --- kernel/workqueue.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 5baac7c..c86cc1e 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -4032,7 +4032,7 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt, if (alloc_and_link_pwqs(wq) < 0) goto err_free_wq; - if (init_rescuer(wq) < 0) + if (wq_online && init_rescuer(wq) < 0) goto err_destroy; if ((wq->flags & WQ_SYSFS) && workqueue_sysfs_register(wq)) @@ -5639,6 +5639,8 @@ int __init workqueue_init(void) * archs such as power and arm64. As per-cpu pools created * previously could be missing node hint and unbound pools NUMA * affinity, fix them up. + * + * Also, while iterating workqueues, create rescuers if requested. 
*/ wq_numa_init(); @@ -5650,8 +5652,12 @@ int __init workqueue_init(void) } } - list_for_each_entry(wq, &workqueues, list) + list_for_each_entry(wq, &workqueues, list) { wq_update_unbound_numa(wq, smp_processor_id(), true); + WARN(init_rescuer(wq), + "workqueue: failed to create early rescuer for %s", + wq->name); + } mutex_unlock(&wq_pool_mutex); -- 2.9.5 ^ permalink raw reply related [flat|nested] 39+ messages in thread
* Re: [PATCH] cgroup/cpuset: fix circular locking dependency 2018-01-08 12:28 ` Tejun Heo 2018-01-08 13:47 ` [PATCH wq/for-4.16 1/2] workqueue: separate out init_rescuer() Tejun Heo @ 2018-01-08 22:52 ` Paul E. McKenney 2018-01-09 0:31 ` Paul E. McKenney 1 sibling, 1 reply; 39+ messages in thread From: Paul E. McKenney @ 2018-01-08 22:52 UTC (permalink / raw) To: Tejun Heo Cc: Prateek Sood, Peter Zijlstra, avagin, mingo, linux-kernel, cgroups, sramana On Mon, Jan 08, 2018 at 04:28:23AM -0800, Tejun Heo wrote: > Hello, Paul. > > Sorry about the delay. Travel followed by cold. :( > > On Tue, Jan 02, 2018 at 10:01:19AM -0800, Paul E. McKenney wrote: > > Actually, after taking a quick look, could you please supply me with > > a way of mark a statically allocated workqueue as WQ_MEM_RECLAIM after > > the fact? Otherwise, I end up having to check for the workqueue having > > Hmmm... there is no statically allocated workqueue tho. If you're > referring to the system-wide workqueues (system*_wq), they're just > created dynamically early during boot. Good point, I was confused. But yes, they are conveniently allocated just before the call to rcu_init(), which does work out well. ;-) > > been allocated pretty much each time I use it, which is going to be an > > open invitation for bugs. Plus it looks like there are ways that RCU's > > workqueue wakeups can be executed during very early boot, which can be > > handled, but again in a rather messy fashion. > > > > In contrast, given a way of mark a statically allocated workqueue > > as WQ_MEM_RECLAIM after the fact, I simply continue initializing the > > workqueue at early boot, and then add the WQ_MEM_RECLAIM marking some > > arbitrarily chosen time after the scheduler has been initialized. > > > > The required change to workqueues looks easy, just move the body of > > the "if (flags & WQ_MEM_RECLAIM) {" statement in __alloc_workqueue_key() > > to a separate function, right? 
> > Ah, okay, yes, currently, workqueue init is kinda silly in that while > it allows init of non-mem-reclaiming workqueues way before workqueue > is actually online, it doesn't allow the same for mem-reclaiming ones. > As you pointed out, it's just an oversight on my part as the init path > split was done initially to accomodate early init of system > workqueues. > > I'll update the code so that rescuers can be added later too; however, > please note that while the work items may be queued, they won't be > executed until workqueue_init() is run (the same as now) as there > can't be worker threads anyway before that point. Thank you! I added the following patch to allow RCU access to the init_rescuer() function. Does that work for you, or did you have some other arrangement in mind? Thanx, Paul ------------------------------------------------------------------------ commit 66683a07503d71e5d5cceac72caf772e6e59c787 Author: Paul E. McKenney <paulmck@linux.vnet.ibm.com> Date: Mon Jan 8 14:27:46 2018 -0800 workqueue: Allow init_rescuer() to be invoked from other files This commit exports init_rescuer() so that RCU can invoke it. Signed-off-by: Paul E. 
McKenney <paulmck@linux.vnet.ibm.com> Cc: Tejun Heo <tj@kernel.org> Cc: Lai Jiangshan <jiangshanlai@gmail.com> diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index 4a54ef96aff5..31ce9343b4a9 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -433,6 +433,8 @@ __alloc_workqueue_key(const char *fmt, unsigned int flags, int max_active, #define create_singlethread_workqueue(name) \ alloc_ordered_workqueue("%s", __WQ_LEGACY | WQ_MEM_RECLAIM, name) +int init_rescuer(struct workqueue_struct *wq); + extern void destroy_workqueue(struct workqueue_struct *wq); struct workqueue_attrs *alloc_workqueue_attrs(gfp_t gfp_mask); diff --git a/kernel/workqueue.c b/kernel/workqueue.c index c86cc1ed678b..7440c61c6213 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -3943,7 +3943,7 @@ static int wq_clamp_max_active(int max_active, unsigned int flags, * Workqueues which may be used during memory reclaim should have a rescuer * to guarantee forward progress. */ -static int init_rescuer(struct workqueue_struct *wq) +int init_rescuer(struct workqueue_struct *wq) { struct worker *rescuer; int ret; ^ permalink raw reply related [flat|nested] 39+ messages in thread
* Re: [PATCH] cgroup/cpuset: fix circular locking dependency 2018-01-08 22:52 ` [PATCH] cgroup/cpuset: fix circular locking dependency Paul E. McKenney @ 2018-01-09 0:31 ` Paul E. McKenney 2018-01-09 3:42 ` Tejun Heo 0 siblings, 1 reply; 39+ messages in thread From: Paul E. McKenney @ 2018-01-09 0:31 UTC (permalink / raw) To: Tejun Heo Cc: Prateek Sood, Peter Zijlstra, avagin, mingo, linux-kernel, cgroups, sramana On Mon, Jan 08, 2018 at 02:52:38PM -0800, Paul E. McKenney wrote: > On Mon, Jan 08, 2018 at 04:28:23AM -0800, Tejun Heo wrote: > > Hello, Paul. > > > > Sorry about the delay. Travel followed by cold. :( > > > > On Tue, Jan 02, 2018 at 10:01:19AM -0800, Paul E. McKenney wrote: > > > Actually, after taking a quick look, could you please supply me with > > > a way of mark a statically allocated workqueue as WQ_MEM_RECLAIM after > > > the fact? Otherwise, I end up having to check for the workqueue having > > > > Hmmm... there is no statically allocated workqueue tho. If you're > > referring to the system-wide workqueues (system*_wq), they're just > > created dynamically early during boot. > > Good point, I was confused. But yes, they are conveniently allocated > just before the call to rcu_init(), which does work out well. ;-) > > > > been allocated pretty much each time I use it, which is going to be an > > > open invitation for bugs. Plus it looks like there are ways that RCU's > > > workqueue wakeups can be executed during very early boot, which can be > > > handled, but again in a rather messy fashion. > > > > > > In contrast, given a way of mark a statically allocated workqueue > > > as WQ_MEM_RECLAIM after the fact, I simply continue initializing the > > > workqueue at early boot, and then add the WQ_MEM_RECLAIM marking some > > > arbitrarily chosen time after the scheduler has been initialized. 
> > > > > > The required change to workqueues looks easy, just move the body of > > > the "if (flags & WQ_MEM_RECLAIM) {" statement in __alloc_workqueue_key() > > > to a separate function, right? > > > > Ah, okay, yes, currently, workqueue init is kinda silly in that while > > it allows init of non-mem-reclaiming workqueues way before workqueue > > is actually online, it doesn't allow the same for mem-reclaiming ones. > > As you pointed out, it's just an oversight on my part as the init path > > split was done initially to accomodate early init of system > > workqueues. > > > > I'll update the code so that rescuers can be added later too; however, > > please note that while the work items may be queued, they won't be > > executed until workqueue_init() is run (the same as now) as there > > can't be worker threads anyway before that point. > > Thank you! I added the following patch to allow RCU access to the > init_rescuer() function. Does that work for you, or did you have some > other arrangement in mind? And here are the corresponding changes to RCU, which pass light rcutorture testing. Thanx, Paul ------------------------------------------------------------------------ commit d0d6626927faf3421df6a1db875ad7099f7d49cd Author: Paul E. McKenney <paulmck@linux.vnet.ibm.com> Date: Mon Jan 8 14:35:52 2018 -0800 rcu: Create RCU-specific workqueues with rescuers RCU's expedited grace periods can participate in out-of-memory deadlocks due to all available system_wq kthreads being blocked and there not being memory available to create more. This commit prevents such deadlocks by allocating an RCU-specific workqueue_struct at early boot time, and providing it with a rescuer to ensure forward progress. This uses the shiny new init_rescuer() function provided by Tejun. This commit also causes SRCU to use this new RCU-specific workqueue_struct. Note that SRCU's use of workqueues never blocks them waiting for readers, so this should be safe from a forward-progress viewpoint. 
Reported-by: Prateek Sood <prsood@codeaurora.org> Reported-by: Tejun Heo <tj@kernel.org> Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com> diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index 59c471de342a..acabc4781b08 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -493,6 +493,7 @@ void show_rcu_gp_kthreads(void); void rcu_force_quiescent_state(void); void rcu_bh_force_quiescent_state(void); void rcu_sched_force_quiescent_state(void); +extern struct workqueue_struct *rcu_gp_workqueue; #endif /* #else #ifdef CONFIG_TINY_RCU */ #ifdef CONFIG_RCU_NOCB_CPU diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index 6d5880089ff6..89f0f6b3ce9a 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -465,7 +465,7 @@ static bool srcu_queue_delayed_work_on(int cpu, struct workqueue_struct *wq, */ static void srcu_schedule_cbs_sdp(struct srcu_data *sdp, unsigned long delay) { - srcu_queue_delayed_work_on(sdp->cpu, system_power_efficient_wq, + srcu_queue_delayed_work_on(sdp->cpu, rcu_gp_workqueue, &sdp->work, delay); } @@ -664,7 +664,7 @@ static void srcu_funnel_gp_start(struct srcu_struct *sp, struct srcu_data *sdp, rcu_seq_state(sp->srcu_gp_seq) == SRCU_STATE_IDLE) { WARN_ON_ONCE(ULONG_CMP_GE(sp->srcu_gp_seq, sp->srcu_gp_seq_needed)); srcu_gp_start(sp); - queue_delayed_work(system_power_efficient_wq, &sp->work, + queue_delayed_work(rcu_gp_workqueue, &sp->work, srcu_get_delay(sp)); } raw_spin_unlock_irqrestore_rcu_node(sp, flags); @@ -1198,7 +1198,7 @@ static void srcu_reschedule(struct srcu_struct *sp, unsigned long delay) raw_spin_unlock_irq_rcu_node(sp); if (pushgp) - queue_delayed_work(system_power_efficient_wq, &sp->work, delay); + queue_delayed_work(rcu_gp_workqueue, &sp->work, delay); } /* diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index f9c0ca2ccf0c..99c12650b9db 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -4272,6 +4272,15 @@ static void __init rcu_dump_rcu_node_tree(struct rcu_state *rsp) pr_cont("\n"); } 
+struct workqueue_struct *rcu_gp_workqueue; + +static int __init rcu_init_wq_rescuer(void) +{ + WARN_ON(init_rescuer(rcu_gp_workqueue)); + return 0; +} +core_initcall(rcu_init_wq_rescuer); + void __init rcu_init(void) { int cpu; @@ -4298,6 +4307,10 @@ void __init rcu_init(void) rcu_cpu_starting(cpu); rcutree_online_cpu(cpu); } + + /* Create workqueue for expedited GPs and for Tree SRCU. */ + rcu_gp_workqueue = alloc_workqueue("rcu_gp", 0, 0); + WARN_ON(!rcu_gp_workqueue); } #include "tree_exp.h" diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index 46d61b597731..3ba3ef4d4796 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -606,7 +606,7 @@ static void _synchronize_rcu_expedited(struct rcu_state *rsp, rew.rew_rsp = rsp; rew.rew_s = s; INIT_WORK_ONSTACK(&rew.rew_work, wait_rcu_exp_gp); - schedule_work(&rew.rew_work); + queue_work(rcu_gp_workqueue, &rew.rew_work); } /* Wait for expedited grace period to complete. */ ^ permalink raw reply related [flat|nested] 39+ messages in thread
* Re: [PATCH] cgroup/cpuset: fix circular locking dependency 2018-01-09 0:31 ` Paul E. McKenney @ 2018-01-09 3:42 ` Tejun Heo 2018-01-09 4:20 ` Paul E. McKenney 0 siblings, 1 reply; 39+ messages in thread From: Tejun Heo @ 2018-01-09 3:42 UTC (permalink / raw) To: Paul E. McKenney Cc: Prateek Sood, Peter Zijlstra, avagin, mingo, linux-kernel, cgroups, sramana Hello, Paul. On Mon, Jan 08, 2018 at 04:31:27PM -0800, Paul E. McKenney wrote: > +static int __init rcu_init_wq_rescuer(void) > +{ > + WARN_ON(init_rescuer(rcu_gp_workqueue)); > + return 0; > +} > +core_initcall(rcu_init_wq_rescuer); So, what I don't get is why RCU needs to call this explicitly. core_initcall() is after workqueue_init() anyway. What am I missing? Thanks. -- tejun ^ permalink raw reply [flat|nested] 39+ messages in thread
* Re: [PATCH] cgroup/cpuset: fix circular locking dependency 2018-01-09 3:42 ` Tejun Heo @ 2018-01-09 4:20 ` Paul E. McKenney 2018-01-09 13:44 ` Tejun Heo 0 siblings, 1 reply; 39+ messages in thread From: Paul E. McKenney @ 2018-01-09 4:20 UTC (permalink / raw) To: Tejun Heo Cc: Prateek Sood, Peter Zijlstra, avagin, mingo, linux-kernel, cgroups, sramana On Mon, Jan 08, 2018 at 07:42:11PM -0800, Tejun Heo wrote: > Hello, Paul. > > On Mon, Jan 08, 2018 at 04:31:27PM -0800, Paul E. McKenney wrote: > > +static int __init rcu_init_wq_rescuer(void) > > +{ > > + WARN_ON(init_rescuer(rcu_gp_workqueue)); > > + return 0; > > +} > > +core_initcall(rcu_init_wq_rescuer); > > So, what I don't get is why RCU needs to call this explicitly. > core_initcall() is after workqueue_init() anyway. Why am I missing? Me being stupid, I guess. OK, so I can put WQ_MEM_RECLAIM on the early boot creation of RCU's workqueue_struct as shown below, right? Thanx, Paul ------------------------------------------------------------------------ commit 9884a945a65837cda6de2ff621d47c59a6ca3e28 Author: Paul E. McKenney <paulmck@linux.vnet.ibm.com> Date: Mon Jan 8 14:35:52 2018 -0800 rcu: Create RCU-specific workqueues with rescuers RCU's expedited grace periods can participate in out-of-memory deadlocks due to all available system_wq kthreads being blocked and there not being memory available to create more. This commit prevents such deadlocks by allocating an RCU-specific workqueue_struct at early boot time, and providing it with a rescuer to ensure forward progress. This uses the shiny new init_rescuer() function provided by Tejun (but indirectly). This commit also causes SRCU to use this new RCU-specific workqueue_struct. Note that SRCU's use of workqueues never blocks them waiting for readers, so this should be safe from a forward-progress viewpoint. Reported-by: Prateek Sood <prsood@codeaurora.org> Reported-by: Tejun Heo <tj@kernel.org> Signed-off-by: Paul E. 
McKenney <paulmck@linux.vnet.ibm.com> diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index 59c471de342a..acabc4781b08 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -493,6 +493,7 @@ void show_rcu_gp_kthreads(void); void rcu_force_quiescent_state(void); void rcu_bh_force_quiescent_state(void); void rcu_sched_force_quiescent_state(void); +extern struct workqueue_struct *rcu_gp_workqueue; #endif /* #else #ifdef CONFIG_TINY_RCU */ #ifdef CONFIG_RCU_NOCB_CPU diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index 6d5880089ff6..89f0f6b3ce9a 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -465,7 +465,7 @@ static bool srcu_queue_delayed_work_on(int cpu, struct workqueue_struct *wq, */ static void srcu_schedule_cbs_sdp(struct srcu_data *sdp, unsigned long delay) { - srcu_queue_delayed_work_on(sdp->cpu, system_power_efficient_wq, + srcu_queue_delayed_work_on(sdp->cpu, rcu_gp_workqueue, &sdp->work, delay); } @@ -664,7 +664,7 @@ static void srcu_funnel_gp_start(struct srcu_struct *sp, struct srcu_data *sdp, rcu_seq_state(sp->srcu_gp_seq) == SRCU_STATE_IDLE) { WARN_ON_ONCE(ULONG_CMP_GE(sp->srcu_gp_seq, sp->srcu_gp_seq_needed)); srcu_gp_start(sp); - queue_delayed_work(system_power_efficient_wq, &sp->work, + queue_delayed_work(rcu_gp_workqueue, &sp->work, srcu_get_delay(sp)); } raw_spin_unlock_irqrestore_rcu_node(sp, flags); @@ -1198,7 +1198,7 @@ static void srcu_reschedule(struct srcu_struct *sp, unsigned long delay) raw_spin_unlock_irq_rcu_node(sp); if (pushgp) - queue_delayed_work(system_power_efficient_wq, &sp->work, delay); + queue_delayed_work(rcu_gp_workqueue, &sp->work, delay); } /* diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index f9c0ca2ccf0c..d658538e6f7d 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -4272,6 +4272,8 @@ static void __init rcu_dump_rcu_node_tree(struct rcu_state *rsp) pr_cont("\n"); } +struct workqueue_struct *rcu_gp_workqueue; + void __init rcu_init(void) { int cpu; @@ -4298,6 +4300,10 @@ void 
__init rcu_init(void) rcu_cpu_starting(cpu); rcutree_online_cpu(cpu); } + + /* Create workqueue for expedited GPs and for Tree SRCU. */ + rcu_gp_workqueue = alloc_workqueue("rcu_gp", WQ_MEM_RECLAIM, 0); + WARN_ON(!rcu_gp_workqueue); } #include "tree_exp.h" diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index 46d61b597731..3ba3ef4d4796 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -606,7 +606,7 @@ static void _synchronize_rcu_expedited(struct rcu_state *rsp, rew.rew_rsp = rsp; rew.rew_s = s; INIT_WORK_ONSTACK(&rew.rew_work, wait_rcu_exp_gp); - schedule_work(&rew.rew_work); + queue_work(rcu_gp_workqueue, &rew.rew_work); } /* Wait for expedited grace period to complete. */ ^ permalink raw reply related [flat|nested] 39+ messages in thread
* Re: [PATCH] cgroup/cpuset: fix circular locking dependency 2018-01-09 4:20 ` Paul E. McKenney @ 2018-01-09 13:44 ` Tejun Heo 2018-01-09 15:21 ` Paul E. McKenney 0 siblings, 1 reply; 39+ messages in thread From: Tejun Heo @ 2018-01-09 13:44 UTC (permalink / raw) To: Paul E. McKenney Cc: Prateek Sood, Peter Zijlstra, avagin, mingo, linux-kernel, cgroups, sramana Hello, Paul. On Mon, Jan 08, 2018 at 08:20:16PM -0800, Paul E. McKenney wrote: > OK, so I can put WQ_MEM_RECLAIM on the early boot creation of RCU's > workqueue_struct as shown below, right? Yes, this looks good to me. Just one question. > +struct workqueue_struct *rcu_gp_workqueue; > + > void __init rcu_init(void) > { > int cpu; > @@ -4298,6 +4300,10 @@ void __init rcu_init(void) > rcu_cpu_starting(cpu); > rcutree_online_cpu(cpu); > } > + > + /* Create workqueue for expedited GPs and for Tree SRCU. */ > + rcu_gp_workqueue = alloc_workqueue("rcu_gp", WQ_MEM_RECLAIM, 0); > + WARN_ON(!rcu_gp_workqueue); The code was previously using both system_power_efficient_wq and system_workqueue (for the expedited path). I guess the options were either using two workqueues or dropping POWER_EFFICIENT. I have no idea how big an impact this will make or whether it'd even be noticeable but maybe it'd be worthwhile to mention that in the description? Thanks. -- tejun ^ permalink raw reply [flat|nested] 39+ messages in thread
* Re: [PATCH] cgroup/cpuset: fix circular locking dependency 2018-01-09 13:44 ` Tejun Heo @ 2018-01-09 15:21 ` Paul E. McKenney 2018-01-09 15:37 ` Tejun Heo 0 siblings, 1 reply; 39+ messages in thread From: Paul E. McKenney @ 2018-01-09 15:21 UTC (permalink / raw) To: Tejun Heo Cc: Prateek Sood, Peter Zijlstra, avagin, mingo, linux-kernel, cgroups, sramana On Tue, Jan 09, 2018 at 05:44:48AM -0800, Tejun Heo wrote: > Hello, Paul. > > On Mon, Jan 08, 2018 at 08:20:16PM -0800, Paul E. McKenney wrote: > > OK, so I can put WQ_MEM_RECLAIM on the early boot creation of RCU's > > workqueue_struct as shown below, right? > > Yes, this looks good to me. Just one question. > > > +struct workqueue_struct *rcu_gp_workqueue; > > + > > void __init rcu_init(void) > > { > > int cpu; > > @@ -4298,6 +4300,10 @@ void __init rcu_init(void) > > rcu_cpu_starting(cpu); > > rcutree_online_cpu(cpu); > > } > > + > > + /* Create workqueue for expedited GPs and for Tree SRCU. */ > > + rcu_gp_workqueue = alloc_workqueue("rcu_gp", WQ_MEM_RECLAIM, 0); > > + WARN_ON(!rcu_gp_workqueue); > > The code was previously using both system_power_efficient_wq and > system_workqueue (for the expedited path). I guess the options were > either using two workqueues or dropping POWER_EFFICIENT. I have no > idea how big an impact this will make or whether it'd even be > noticeable but maybe it'd be worthwhile to mention that in the > description? Good point! How about if I change the last paragraph of the commit log to read as follows? Thanx, Paul ------------------------------------------------------------------------ This commit also causes SRCU to use this new RCU-specific workqueue_struct. Note that SRCU's use of workqueues never blocks them waiting for readers, so this should be safe from a forward-progress viewpoint. Note that this moves SRCU from system_power_efficient_wq to a normal workqueue. 
In the unlikely event that this results in measurable degradation, a separate power-efficient workqueue will be created for SRCU. ^ permalink raw reply [flat|nested] 39+ messages in thread
* Re: [PATCH] cgroup/cpuset: fix circular locking dependency 2018-01-09 15:21 ` Paul E. McKenney @ 2018-01-09 15:37 ` Tejun Heo 2018-01-09 16:00 ` Paul E. McKenney 0 siblings, 1 reply; 39+ messages in thread From: Tejun Heo @ 2018-01-09 15:37 UTC (permalink / raw) To: Paul E. McKenney Cc: Prateek Sood, Peter Zijlstra, avagin, mingo, linux-kernel, cgroups, sramana Hello, Paul. On Tue, Jan 09, 2018 at 07:21:12AM -0800, Paul E. McKenney wrote: > > The code was previously using both system_power_efficient_wq and > > system_workqueue (for the expedited path). I guess the options were > > either using two workqueues or dropping POWER_EFFICIENT. I have no > > idea how big an impact this will make or whether it'd even be > > noticeable but maybe it'd be worthwhile to mention that in the > > description? > > Good point! How about if I change the last paragraph of the commit > log to read as follows? > > Thanx, Paul > > ------------------------------------------------------------------------ > > This commit also causes SRCU to use this new RCU-specific > workqueue_struct. Note that SRCU's use of workqueues never blocks them > waiting for readers, so this should be safe from a forward-progress > viewpoint. Note that this moves SRCU from system_power_efficient_wq > to a normal workqueue. In the unlikely event that this results in > measurable degradation, a separate power-efficient workqueue will be > creates for SRCU. Sounds good. Please feel free to add Acked-by: Tejun Heo <tj@kernel.org> Thanks. -- tejun ^ permalink raw reply [flat|nested] 39+ messages in thread
* Re: [PATCH] cgroup/cpuset: fix circular locking dependency 2018-01-09 15:37 ` Tejun Heo @ 2018-01-09 16:00 ` Paul E. McKenney 2018-01-10 20:08 ` Paul E. McKenney 0 siblings, 1 reply; 39+ messages in thread From: Paul E. McKenney @ 2018-01-09 16:00 UTC (permalink / raw) To: Tejun Heo Cc: Prateek Sood, Peter Zijlstra, avagin, mingo, linux-kernel, cgroups, sramana On Tue, Jan 09, 2018 at 07:37:52AM -0800, Tejun Heo wrote: > Hello, Paul. > > On Tue, Jan 09, 2018 at 07:21:12AM -0800, Paul E. McKenney wrote: > > > The code was previously using both system_power_efficient_wq and > > > system_workqueue (for the expedited path). I guess the options were > > > either using two workqueues or dropping POWER_EFFICIENT. I have no > > > idea how big an impact this will make or whether it'd even be > > > noticeable but maybe it'd be worthwhile to mention that in the > > > description? > > > > Good point! How about if I change the last paragraph of the commit > > log to read as follows? > > > > Thanx, Paul > > > > ------------------------------------------------------------------------ > > > > This commit also causes SRCU to use this new RCU-specific > > workqueue_struct. Note that SRCU's use of workqueues never blocks them > > waiting for readers, so this should be safe from a forward-progress > > viewpoint. Note that this moves SRCU from system_power_efficient_wq > > to a normal workqueue. In the unlikely event that this results in > > measurable degradation, a separate power-efficient workqueue will be > > creates for SRCU. > > Sounds good. Please feel free to add > > Acked-by: Tejun Heo <tj@kernel.org> Done, thank you! Thanx, Paul ^ permalink raw reply [flat|nested] 39+ messages in thread
* Re: [PATCH] cgroup/cpuset: fix circular locking dependency 2018-01-09 16:00 ` Paul E. McKenney @ 2018-01-10 20:08 ` Paul E. McKenney 2018-01-10 21:41 ` Tejun Heo 0 siblings, 1 reply; 39+ messages in thread From: Paul E. McKenney @ 2018-01-10 20:08 UTC (permalink / raw) To: Tejun Heo Cc: Prateek Sood, Peter Zijlstra, avagin, mingo, linux-kernel, cgroups, sramana On Tue, Jan 09, 2018 at 08:00:22AM -0800, Paul E. McKenney wrote: > On Tue, Jan 09, 2018 at 07:37:52AM -0800, Tejun Heo wrote: > > Hello, Paul. > > > > On Tue, Jan 09, 2018 at 07:21:12AM -0800, Paul E. McKenney wrote: > > > > The code was previously using both system_power_efficient_wq and > > > > system_workqueue (for the expedited path). I guess the options were > > > > either using two workqueues or dropping POWER_EFFICIENT. I have no > > > > idea how big an impact this will make or whether it'd even be > > > > noticeable but maybe it'd be worthwhile to mention that in the > > > > description? > > > > > > Good point! How about if I change the last paragraph of the commit > > > log to read as follows? > > > > > > Thanx, Paul > > > > > > ------------------------------------------------------------------------ > > > > > > This commit also causes SRCU to use this new RCU-specific > > > workqueue_struct. Note that SRCU's use of workqueues never blocks them > > > waiting for readers, so this should be safe from a forward-progress > > > viewpoint. Note that this moves SRCU from system_power_efficient_wq > > > to a normal workqueue. In the unlikely event that this results in > > > measurable degradation, a separate power-efficient workqueue will be > > > creates for SRCU. > > > > Sounds good. Please feel free to add > > > > Acked-by: Tejun Heo <tj@kernel.org> > > Done, thank you! And one additional question... How are we pushing this upstream? 
By default, I would push things starting this late into the merge window following the next one (v4.17), but would be more than willing to make an exception given that this fixes a valid real-world complaint. For that matter, if you would rather push my commit along with your pair of commits, that works for me! Either way, please just let me know. Thanx, Paul ^ permalink raw reply [flat|nested] 39+ messages in thread
* Re: [PATCH] cgroup/cpuset: fix circular locking dependency 2018-01-10 20:08 ` Paul E. McKenney @ 2018-01-10 21:41 ` Tejun Heo 2018-01-10 22:10 ` Paul E. McKenney 0 siblings, 1 reply; 39+ messages in thread From: Tejun Heo @ 2018-01-10 21:41 UTC (permalink / raw) To: Paul E. McKenney Cc: Prateek Sood, Peter Zijlstra, avagin, mingo, linux-kernel, cgroups, sramana Hello, Paul. On Wed, Jan 10, 2018 at 12:08:21PM -0800, Paul E. McKenney wrote: > And one additional question... How are we pushing this upstream? By > default, I would push things starting this late into the merge window > following the next one (v4.17), but would be more than willing to make > an exception given that this fixes a valid real-world complaint. I put the workqueue part in the fixes branch. If there's gonna be another rc, I'm gonna push it to Linus. If not, I'll push when the next window opens. I think either way should be fine. > For that matter, if you would rather push my commit along with your pair > of commits, that works for me! Either way, please just let me know. Oh, let's push these through their respective trees. Please take the RCU one through the usual RCU tree. Thank you very much. -- tejun ^ permalink raw reply [flat|nested] 39+ messages in thread
* Re: [PATCH] cgroup/cpuset: fix circular locking dependency 2018-01-10 21:41 ` Tejun Heo @ 2018-01-10 22:10 ` Paul E. McKenney 0 siblings, 0 replies; 39+ messages in thread From: Paul E. McKenney @ 2018-01-10 22:10 UTC (permalink / raw) To: Tejun Heo Cc: Prateek Sood, Peter Zijlstra, avagin, mingo, linux-kernel, cgroups, sramana On Wed, Jan 10, 2018 at 01:41:01PM -0800, Tejun Heo wrote: > Hello, Paul. > > On Wed, Jan 10, 2018 at 12:08:21PM -0800, Paul E. McKenney wrote: > > And one additional question... How are we pushing this upstream? By > > default, I would push things starting this late into the merge window > > following the next one (v4.17), but would be more than willing to make > > an exception given that this fixes a valid real-world complaint. > > I put the workqueue part in the fixes branch. If there's gonna be > another rc, I'm gonna push it to Linus. If not, I'll push when the > next window opens. I think either way should be fine. > > > For that matter, if you would rather push my commit along with your pair > > of commits, that works for me! Either way, please just let me know. > > Oh, let's push these through their respective trees. Please take the > RCU one through the usual RCU tree. Works for me! > Thank you very much. And to you! Thanx, Paul ^ permalink raw reply [flat|nested] 39+ messages in thread
* Re: [PATCH] cgroup/cpuset: fix circular locking dependency 2018-01-02 16:16 ` Tejun Heo 2018-01-02 17:44 ` Paul E. McKenney @ 2018-01-15 12:02 ` Prateek Sood 2018-01-16 16:27 ` Tejun Heo 1 sibling, 1 reply; 39+ messages in thread From: Prateek Sood @ 2018-01-15 12:02 UTC (permalink / raw) To: Tejun Heo Cc: Peter Zijlstra, avagin, mingo, linux-kernel, cgroups, sramana, Paul E. McKenney On 01/02/2018 09:46 PM, Tejun Heo wrote: > Hello, > > On Fri, Dec 29, 2017 at 02:07:16AM +0530, Prateek Sood wrote: >> task T is waiting for cpuset_mutex acquired >> by kworker/2:1 >> >> sh ==> cpuhp/2 ==> kworker/2:1 ==> sh >> >> kworker/2:3 ==> kthreadd ==> Task T ==> kworker/2:1 >> >> It seems that my earlier patch set should fix this scenario: >> 1) Inverting locking order of cpuset_mutex and cpu_hotplug_lock. >> 2) Make cpuset hotplug work synchronous. >> >> Could you please share your feedback. > > Hmm... this can also be resolved by adding WQ_MEM_RECLAIM to the > synchronize rcu workqueue, right? Given the wide-spread usages of > synchronize_rcu and friends, maybe that's the right solution, or at > least something we also need to do, for this particular deadlock? > > Again, I don't have anything against making the domain rebuliding part > of cpuset operations synchronous and these tricky deadlock scenarios > do indicate that doing so would probably be beneficial. That said, > tho, these scenarios seem more of manifestations of other problems > exposed through kthreadd dependency than anything else. > > Thanks. > Hi TJ, Thanks for suggesting WQ_MEM_RECLAIM solution. My understanding of WQ_MEM_RECLAIM was that it needs to be used for cases where memory pressure could cause deadlocks. In this case it does not seem to be a memory pressure issue. Overloading WQ_MEM_RECLAIM usage for solution to another problem is the correct approach? This scenario can be resolved by using WQ_MEM_RECLAIM and a separate workqueue for rcu. 
But there seems to be a possibility in future if any cpu hotplug callbacks use other predefined workqueues which do not have WQ_MEM_RECLAIM option. Please let me know your feedback on this. Thanks -- Qualcomm India Private Limited, on behalf of Qualcomm Innovation Center, Inc., is a member of Code Aurora Forum, a Linux Foundation Collaborative Project ^ permalink raw reply [flat|nested] 39+ messages in thread
* Re: [PATCH] cgroup/cpuset: fix circular locking dependency 2018-01-15 12:02 ` Prateek Sood @ 2018-01-16 16:27 ` Tejun Heo 0 siblings, 0 replies; 39+ messages in thread From: Tejun Heo @ 2018-01-16 16:27 UTC (permalink / raw) To: Prateek Sood Cc: Peter Zijlstra, avagin, mingo, linux-kernel, cgroups, sramana, Paul E. McKenney Hello, Prateek. On Mon, Jan 15, 2018 at 05:32:18PM +0530, Prateek Sood wrote: > My understanding of WQ_MEM_RECLAIM was that it needs to be used for > cases where memory pressure could cause deadlocks. Yes, that is the primary role; however, there are a couple places where we need it to isolate a low level subsystem's forward progress from the dependencies of the dynanic worker pool management. > In this case it does not seem to be a memory pressure issue. > Overloading WQ_MEM_RECLAIM usage for solution to another problem > is the correct approach? In general, we don't face this issue but for things like RCU, I do believe this is the right approach. The name of the flag is a bit of misnomer for those cases as it only describes one of the dependencies of worker pool management. I don't think it's an actual problem given that vast majority of the usages are protecting against memory dependency. > This scenario can be resolved by using WQ_MEM_RECLAIM and a separate > workqueue for rcu. But there seems to be a possibility in future if > any cpu hotplug callbacks use other predefined workqueues which do not > have WQ_MEM_RECLAIM option. We want to isolate from this sort of dependencies anyway, especially how widely and deeply we depend on RCU these days. Maybe we will develop a justified case in the future but at least up until now every case around this issue seems to be something to be fixed on their own. Thanks. -- tejun ^ permalink raw reply [flat|nested] 39+ messages in thread
end of thread, other threads:[~2018-01-16 16:27 UTC | newest] Thread overview: 39+ messages (download: mbox.gz / follow: Atom feed) -- links below jump to the message on this page -- [not found] <CANaxB-zT+sz=z1FFk5npnwMySdfKCBZDkM+P+=JXDkCXbh=rCw@mail.gmail.com> 2017-11-28 11:35 ` [PATCH] cgroup/cpuset: fix circular locking dependency Prateek Sood 2017-12-04 5:14 ` Prateek Sood 2017-12-04 20:22 ` Tejun Heo 2017-12-04 22:58 ` Tejun Heo 2017-12-04 23:01 ` Peter Zijlstra 2017-12-08 9:40 ` Prateek Sood 2017-12-08 11:45 ` Prateek Sood 2017-12-11 15:32 ` Tejun Heo 2017-12-13 14:28 ` Prateek Sood 2017-12-13 15:40 ` Tejun Heo 2017-12-15 8:54 ` Prateek Sood 2017-12-15 13:22 ` Tejun Heo 2017-12-15 19:06 ` Prateek Sood 2017-12-19 7:26 ` [PATCH] cgroup: Fix deadlock in cpu hotplug path Prateek Sood 2017-12-19 13:39 ` Tejun Heo 2017-12-11 15:20 ` [PATCH] cgroup/cpuset: fix circular locking dependency Tejun Heo 2017-12-13 7:50 ` Prateek Sood 2017-12-13 16:06 ` Tejun Heo 2017-12-15 19:04 ` Prateek Sood 2017-12-28 20:37 ` Prateek Sood 2018-01-02 16:16 ` Tejun Heo 2018-01-02 17:44 ` Paul E. McKenney 2018-01-02 18:01 ` Paul E. McKenney 2018-01-08 12:28 ` Tejun Heo 2018-01-08 13:47 ` [PATCH wq/for-4.16 1/2] workqueue: separate out init_rescuer() Tejun Heo 2018-01-08 13:47 ` [PATCH wq/for-4.16 2/2] workqueue: allow WQ_MEM_RECLAIM on early init workqueues Tejun Heo 2018-01-08 22:52 ` [PATCH] cgroup/cpuset: fix circular locking dependency Paul E. McKenney 2018-01-09 0:31 ` Paul E. McKenney 2018-01-09 3:42 ` Tejun Heo 2018-01-09 4:20 ` Paul E. McKenney 2018-01-09 13:44 ` Tejun Heo 2018-01-09 15:21 ` Paul E. McKenney 2018-01-09 15:37 ` Tejun Heo 2018-01-09 16:00 ` Paul E. McKenney 2018-01-10 20:08 ` Paul E. McKenney 2018-01-10 21:41 ` Tejun Heo 2018-01-10 22:10 ` Paul E. McKenney 2018-01-15 12:02 ` Prateek Sood 2018-01-16 16:27 ` Tejun Heo
This is a public inbox, see mirroring instructions for how to clone and mirror all data and code used for this inbox; as well as URLs for NNTP newsgroup(s).