io-uring.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
From: "Paul E. McKenney" <paulmck@kernel.org>
To: Jens Axboe <axboe@kernel.dk>
Cc: io-uring@vger.kernel.org
Subject: Re: [PATCH 2/2] io-wq: ensure task is valid before sending it a signal
Date: Wed, 13 Nov 2019 11:55:57 -0800	[thread overview]
Message-ID: <20191113195557.GG2865@paulmck-ThinkPad-P72> (raw)
In-Reply-To: <20191113194355.12107-3-axboe@kernel.dk>

On Wed, Nov 13, 2019 at 12:43:55PM -0700, Jens Axboe wrote:
> While we're always under RCU read protection when finding the worker
> to signal, that only protects the worker from being freed. The task
> could very well be exiting, if we get unlucky enough.
> 
> Same is true for ->cur_work, which is currently under protection by
> the wqe->lock that this worker belongs to. Add a specific worker lock
> that protects whether the task is exiting and also the current work
> item. Then we can guarantee that the task we're sending a signal to is:
> 
> 1) Currently processing the exact work we think it is
> 2) It's not currently exiting
> 
> It's important to not use the wqe->lock for ->cur_work, as we can run
> into lock ordering issues with io_poll_wake() being called under the
> signal lock if we're polling a signal fd, and io_poll_wake() then
> needing to call io_wq_enqueue() which grabs wqe->lock. For cancel, the
> ordering is exactly the opposite.
> 
> Reported-by: Paul E. McKenney <paulmck@kernel.org>
> Signed-off-by: Jens Axboe <axboe@kernel.dk>

Reviewed-by: Paul E. McKenney <paulmck@kernel.org>

> ---
>  fs/io-wq.c | 66 ++++++++++++++++++++++++++++++++++++------------------
>  1 file changed, 44 insertions(+), 22 deletions(-)
> 
> diff --git a/fs/io-wq.c b/fs/io-wq.c
> index 26d81540c1fc..f035460b9776 100644
> --- a/fs/io-wq.c
> +++ b/fs/io-wq.c
> @@ -49,7 +49,10 @@ struct io_worker {
>  	struct task_struct *task;
>  	wait_queue_head_t wait;
>  	struct io_wqe *wqe;
> +
>  	struct io_wq_work *cur_work;
> +	spinlock_t lock;
> +	int exiting;
>  
>  	struct rcu_head rcu;
>  	struct mm_struct *mm;
> @@ -223,6 +226,10 @@ static void io_worker_exit(struct io_worker *worker)
>  	if (!nr_workers && refcount_dec_and_test(&wqe->wq->refs))
>  		complete(&wqe->wq->done);
>  
> +	spin_lock_irq(&worker->lock);
> +	worker->exiting = true;
> +	spin_unlock_irq(&worker->lock);
> +
>  	kfree_rcu(worker, rcu);
>  }
>  
> @@ -323,7 +330,6 @@ static void __io_worker_busy(struct io_wqe *wqe, struct io_worker *worker,
>  		hlist_nulls_add_head_rcu(&worker->nulls_node,
>  						&wqe->busy_list.head);
>  	}
> -	worker->cur_work = work;
>  
>  	/*
>  	 * If worker is moving from bound to unbound (or vice versa), then
> @@ -402,17 +408,6 @@ static void io_worker_handle_work(struct io_worker *worker)
>  	do {
>  		unsigned hash = -1U;
>  
> -		/*
> -		 * Signals are either sent to cancel specific work, or to just
> -		 * cancel all work items. For the former, ->cur_work must
> -		 * match. ->cur_work is NULL at this point, since we haven't
> -		 * assigned any work, so it's safe to flush signals for that
> -		 * case. For the latter case of cancelling all work, the caller
> -		 * will have set IO_WQ_BIT_CANCEL.
> -		 */
> -		if (signal_pending(current))
> -			flush_signals(current);
> -
>  		/*
>  		 * If we got some work, mark us as busy. If we didn't, but
>  		 * the list isn't empty, it means we stalled on hashed work.
> @@ -432,6 +427,14 @@ static void io_worker_handle_work(struct io_worker *worker)
>  		if (!work)
>  			break;
>  next:
> +		/* flush any pending signals before assigning new work */
> +		if (signal_pending(current))
> +			flush_signals(current);
> +
> +		spin_lock_irq(&worker->lock);
> +		worker->cur_work = work;
> +		spin_unlock_irq(&worker->lock);
> +
>  		if ((work->flags & IO_WQ_WORK_NEEDS_FILES) &&
>  		    current->files != work->files) {
>  			task_lock(current);
> @@ -457,8 +460,12 @@ static void io_worker_handle_work(struct io_worker *worker)
>  		old_work = work;
>  		work->func(&work);
>  
> -		spin_lock_irq(&wqe->lock);
> +		spin_lock_irq(&worker->lock);
>  		worker->cur_work = NULL;
> +		spin_unlock_irq(&worker->lock);
> +
> +		spin_lock_irq(&wqe->lock);
> +
>  		if (hash != -1U) {
>  			wqe->hash_map &= ~BIT_ULL(hash);
>  			wqe->flags &= ~IO_WQE_FLAG_STALLED;
> @@ -577,6 +584,7 @@ static void create_io_worker(struct io_wq *wq, struct io_wqe *wqe, int index)
>  	worker->nulls_node.pprev = NULL;
>  	init_waitqueue_head(&worker->wait);
>  	worker->wqe = wqe;
> +	spin_lock_init(&worker->lock);
>  
>  	worker->task = kthread_create_on_node(io_wqe_worker, worker, wqe->node,
>  				"io_wqe_worker-%d/%d", index, wqe->node);
> @@ -721,7 +729,10 @@ void io_wq_enqueue_hashed(struct io_wq *wq, struct io_wq_work *work, void *val)
>  
>  static bool io_wqe_worker_send_sig(struct io_worker *worker, void *data)
>  {
> -	send_sig(SIGINT, worker->task, 1);
> +	spin_lock_irq(&worker->lock);
> +	if (!worker->exiting)
> +		send_sig(SIGINT, worker->task, 1);
> +	spin_unlock_irq(&worker->lock);
>  	return false;
>  }
>  
> @@ -783,7 +794,6 @@ struct io_cb_cancel_data {
>  static bool io_work_cancel(struct io_worker *worker, void *cancel_data)
>  {
>  	struct io_cb_cancel_data *data = cancel_data;
> -	struct io_wqe *wqe = data->wqe;
>  	unsigned long flags;
>  	bool ret = false;
>  
> @@ -791,13 +801,14 @@ static bool io_work_cancel(struct io_worker *worker, void *cancel_data)
>  	 * Hold the lock to avoid ->cur_work going out of scope, caller
>  	 * may dereference the passed in work.
>  	 */
> -	spin_lock_irqsave(&wqe->lock, flags);
> +	spin_lock_irqsave(&worker->lock, flags);
>  	if (worker->cur_work &&
>  	    data->cancel(worker->cur_work, data->caller_data)) {
> -		send_sig(SIGINT, worker->task, 1);
> +		if (!worker->exiting)
> +			send_sig(SIGINT, worker->task, 1);
>  		ret = true;
>  	}
> -	spin_unlock_irqrestore(&wqe->lock, flags);
> +	spin_unlock_irqrestore(&worker->lock, flags);
>  
>  	return ret;
>  }
> @@ -864,13 +875,21 @@ enum io_wq_cancel io_wq_cancel_cb(struct io_wq *wq, work_cancel_fn *cancel,
>  static bool io_wq_worker_cancel(struct io_worker *worker, void *data)
>  {
>  	struct io_wq_work *work = data;
> +	unsigned long flags;
> +	bool ret = false;
> +
> +	if (worker->cur_work != work)
> +		return false;
>  
> +	spin_lock_irqsave(&worker->lock, flags);
>  	if (worker->cur_work == work) {
> -		send_sig(SIGINT, worker->task, 1);
> -		return true;
> +		if (!worker->exiting)
> +			send_sig(SIGINT, worker->task, 1);
> +		ret = true;
>  	}
> +	spin_unlock_irqrestore(&worker->lock, flags);
>  
> -	return false;
> +	return ret;
>  }
>  
>  static enum io_wq_cancel io_wqe_cancel_work(struct io_wqe *wqe,
> @@ -1049,7 +1068,10 @@ struct io_wq *io_wq_create(unsigned bounded, struct mm_struct *mm,
>  
>  static bool io_wq_worker_wake(struct io_worker *worker, void *data)
>  {
> -	wake_up_process(worker->task);
> +	spin_lock_irq(&worker->lock);
> +	if (!worker->exiting)
> +		wake_up_process(worker->task);
> +	spin_unlock_irq(&worker->lock);
>  	return false;
>  }
>  
> -- 
> 2.24.0
> 

      reply	other threads:[~2019-11-13 19:55 UTC|newest]

Thread overview: 4+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2019-11-13 19:43 [PATCHSET] Improve io_uring cancellations Jens Axboe
2019-11-13 19:43 ` [PATCH 1/2] io_wq: add get/put_work handlers to io_wq_create() Jens Axboe
2019-11-13 19:43 ` [PATCH 2/2] io-wq: ensure task is valid before sending it a signal Jens Axboe
2019-11-13 19:55   ` Paul E. McKenney [this message]

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20191113195557.GG2865@paulmck-ThinkPad-P72 \
    --to=paulmck@kernel.org \
    --cc=axboe@kernel.dk \
    --cc=io-uring@vger.kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).