If a worker task for a normal (bound, non-CPU-intensive) workqueue calls
cond_resched(), this allows other non-workqueue processes to run, but
does *not* allow other workqueue workers to run.  This is because
workqueue will only attempt to run one work item at a time on each CPU,
unless the current worker actually sleeps.

This is already a problem for should_reclaim_retry() in mm/page_alloc.c,
which inserts a small sleep just to convince workqueue to allow other
workers to run.  It can also be a problem for NFS when closing a very
large file (e.g. 100 million pages in memory): NFS can call the final
iput() from a workqueue, which can then take long enough to trigger a
workqueue-lockup warning, and long enough for performance problems to be
observed.

I added a WARN_ON_ONCE() in cond_resched() to report when it is run from
a workqueue, and got about 20 hits during boot, many on system_wq (aka
"events"), which suggests there is a real chance that workers are being
delayed unnecessarily by tasks which are written to politely relinquish
the CPU.

I think that once a worker calls cond_resched(), it should be treated as
though it were running on a WQ_CPU_INTENSIVE workqueue, because only
cpu-intensive tasks need to call cond_resched().  This would allow other
workers to be scheduled.
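
Purely for illustration (this is not part of the patch; the work
function and the per-item helper are invented names), here is a minimal
sketch of the kind of work item that runs into this: a long loop on a
bound workqueue that calls cond_resched() on every iteration, yet still
keeps every other work item queued on the same per-CPU pool waiting
until it returns or actually sleeps.

  #include <linux/workqueue.h>
  #include <linux/sched.h>

  /* Stand-in for the real per-item work; hypothetical for this sketch. */
  static void process_one_item(unsigned long i)
  {
  }

  static void big_job_fn(struct work_struct *work)
  {
	unsigned long i;

	for (i = 0; i < 100UL * 1024 * 1024; i++) {
		process_one_item(i);
		/*
		 * Lets other runnable tasks preempt this worker, but the
		 * pool still treats this CPU as occupied by one work item,
		 * so other queued work items keep waiting.
		 */
		cond_resched();
	}
  }
  static DECLARE_WORK(big_job, big_job_fn);

  /* Queued on the default bound workqueue ("events"), e.g.:
   *	schedule_work(&big_job);
   */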
The following patch achieves this, I believe.

Signed-off-by: NeilBrown
---
 include/linux/sched.h     |  7 ++++++-
 include/linux/workqueue.h |  2 ++
 kernel/sched/core.c       |  4 ++++
 kernel/workqueue.c        | 16 +++++++++++++++-
 mm/page_alloc.c           | 12 +-----------
 5 files changed, 28 insertions(+), 13 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 4418f5cb8324..728870965df1 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1784,7 +1784,12 @@ static inline int test_tsk_need_resched(struct task_struct *tsk)
 #ifndef CONFIG_PREEMPTION
 extern int _cond_resched(void);
 #else
-static inline int _cond_resched(void) { return 0; }
+static inline int _cond_resched(void)
+{
+	if (current->flags & PF_WQ_WORKER)
+		workqueue_cond_resched();
+	return 0;
+}
 #endif
 
 #define cond_resched() ({			\
diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h
index 8b505d22fc0e..7bcc5717d80f 100644
--- a/include/linux/workqueue.h
+++ b/include/linux/workqueue.h
@@ -626,6 +626,8 @@ static inline bool schedule_delayed_work(struct delayed_work *dwork,
 	return queue_delayed_work(system_wq, dwork, delay);
 }
 
+void workqueue_cond_resched(void);
+
 #ifndef CONFIG_SMP
 static inline long work_on_cpu(int cpu, long (*fn)(void *), void *arg)
 {
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 9a2fbf98fd6f..5b2e38567a0c 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5620,6 +5620,8 @@ SYSCALL_DEFINE0(sched_yield)
 #ifndef CONFIG_PREEMPTION
 int __sched _cond_resched(void)
 {
+	if (current->flags & PF_WQ_WORKER)
+		workqueue_cond_resched();
 	if (should_resched(0)) {
 		preempt_schedule_common();
 		return 1;
@@ -5643,6 +5645,8 @@ int __cond_resched_lock(spinlock_t *lock)
 	int resched = should_resched(PREEMPT_LOCK_OFFSET);
 	int ret = 0;
 
+	if (current->flags & PF_WQ_WORKER)
+		workqueue_cond_resched();
 	lockdep_assert_held(lock);
 
 	if (spin_needbreak(lock) || resched) {
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 891ccad5f271..fd2e9557b1a6 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -2296,7 +2296,7 @@ __acquires(&pool->lock)
 	spin_lock_irq(&pool->lock);
 
 	/* clear cpu intensive status */
-	if (unlikely(cpu_intensive))
+	if (unlikely(worker->flags & WORKER_CPU_INTENSIVE))
 		worker_clr_flags(worker, WORKER_CPU_INTENSIVE);
 
 	/* tag the worker for identification in schedule() */
@@ -5330,6 +5330,20 @@ int workqueue_set_unbound_cpumask(cpumask_var_t cpumask)
 	return ret;
 }
 
+void workqueue_cond_resched(void)
+{
+	struct worker *worker = current_wq_worker();
+
+	if (worker && !(worker->flags & WORKER_CPU_INTENSIVE)) {
+		struct worker_pool *pool = worker->pool;
+
+		worker_set_flags(worker, WORKER_CPU_INTENSIVE);
+		if (need_more_worker(pool))
+			wake_up_worker(pool);
+	}
+}
+EXPORT_SYMBOL(workqueue_cond_resched);
+
 #ifdef CONFIG_SYSFS
 /*
  * Workqueues with WQ_SYSFS flag set is visible to userland via
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 13cc653122b7..0d5720ecbfe7 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -4420,17 +4420,7 @@ should_reclaim_retry(gfp_t gfp_mask, unsigned order,
 	}
 
 out:
-	/*
-	 * Memory allocation/reclaim might be called from a WQ context and the
-	 * current implementation of the WQ concurrency control doesn't
-	 * recognize that a particular WQ is congested if the worker thread is
-	 * looping without ever sleeping. Therefore we have to do a short sleep
-	 * here rather than calling cond_resched().
-	 */
-	if (current->flags & PF_WQ_WORKER)
-		schedule_timeout_uninterruptible(1);
-	else
-		cond_resched();
+	cond_resched();
 
 	return ret;
 }
-- 
2.29.2