On 19/03/2020 21:56, Jens Axboe wrote:
> We always punt async buffered writes to an io-wq helper, as the core
> kernel does not have IOCB_NOWAIT support for that. Most buffered async
> writes complete very quickly, as it's just a copy operation. This means
> that doing multiple locking roundtrips on the shared wqe lock for each
> buffered write is wasteful. Additionally, buffered writes are hashed
> work items, which means that any buffered write to a given file is
> serialized.
> 
> When looking for a new work item, build a chain of identically hashed
> work items, and then hand back that batch. Until the batch is done, the
> caller doesn't have to synchronize with the wqe or worker locks again.
> 
> Signed-off-by: Jens Axboe
> 
> ---
> 
> Changes:
> - Don't overwrite passed back work
> 
> 
> diff --git a/fs/io-wq.c b/fs/io-wq.c
> index 9541df2729de..8402c6e417e1 100644
> --- a/fs/io-wq.c
> +++ b/fs/io-wq.c
> @@ -380,32 +380,65 @@ static inline unsigned int io_get_work_hash(struct io_wq_work *work)
>  	return work->flags >> IO_WQ_HASH_SHIFT;
>  }
> 
> -static struct io_wq_work *io_get_next_work(struct io_wqe *wqe)
> +/*
> + * Returns the next work item to process, if any. For hashed work that hashes
> + * to the same key, we can't process N+1 before N is done. To make the
> + * processing more efficient, return N+1 and later identically hashed work
> + * in the passed in list. This avoids repeated hammering on the wqe lock,
> + * as the caller can just process items in the on-stack list.
> + */
> +static struct io_wq_work *io_get_next_work(struct io_wqe *wqe,
> +					   struct io_wq_work_list *list)
>  	__must_hold(wqe->lock)
>  {
> -	struct io_wq_work_node *node, *prev;
> -	struct io_wq_work *work;
> -	unsigned int hash;
> +	struct io_wq_work *ret = NULL;
> 
> -	wq_list_for_each(node, prev, &wqe->work_list) {
> -		work = container_of(node, struct io_wq_work, list);
> +	do {
> +		unsigned int new_hash, hash;
> +		struct io_wq_work *work;
> +
> +		work = wq_first_entry(&wqe->work_list, struct io_wq_work, list);
> +		if (!work)
> +			break;
> 
>  		/* not hashed, can run anytime */
>  		if (!io_wq_is_hashed(work)) {
> -			wq_node_del(&wqe->work_list, node, prev);
> -			return work;
> +			/* already have hashed work, let new worker get this */
> +			if (ret) {
> +				struct io_wqe_acct *acct;
> +
> +				/* get new worker for unhashed, if none now */
> +				acct = io_work_get_acct(wqe, work);
> +				if (!atomic_read(&acct->nr_running))
> +					io_wqe_wake_worker(wqe, acct);
> +				break;
> +			}
> +			wq_node_del(&wqe->work_list, &work->list);
> +			ret = work;
> +			break;
>  		}
> 
>  		/* hashed, can run if not already running */
> -		hash = io_get_work_hash(work);
> -		if (!(wqe->hash_map & BIT(hash))) {
> +		new_hash = io_get_work_hash(work);
> +		if (wqe->hash_map & BIT(new_hash))
> +			break;

This will always break for subsequent hashed work, as the @hash_map bit is
set, won't it? And anyway, it doesn't seem to optimise non-contiguous
same-hashed requests, e.g.

0: Link(hash=0)
1: Link(hash=1)
2: Link(hash=0)
3: Link(not_hashed)
4: Link(hash=0)
...
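
To make that concrete, here is a toy userspace sketch of the scan (my own
simplification, not the kernel code: no locking, no per-wqe hash_map, and an
arbitrary example hash sequence). With the queue above, only the leading run
of identically hashed items ends up in the batch:

#include <stdio.h>

#define NOT_HASHED	(-1)

int main(void)
{
	/* pending work, oldest first: hashes 0, 1, 0, unhashed, 0 */
	int hashes[] = { 0, 1, 0, NOT_HASHED, 0 };
	int nr = sizeof(hashes) / sizeof(hashes[0]);
	int batch_hash = NOT_HASHED;

	for (int i = 0; i < nr; i++) {
		if (hashes[i] == NOT_HASHED) {
			/* unhashed: taken alone, or left for another worker */
			if (batch_hash == NOT_HASHED)
				printf("item %d: unhashed, run alone\n", i);
			break;
		}
		if (batch_hash == NOT_HASHED)
			batch_hash = hashes[i];	/* first item picks the hash */
		else if (hashes[i] != batch_hash)
			break;			/* different hash stops the scan */
		printf("item %d: hash %d, batched\n", i, hashes[i]);
	}
	return 0;
}

In this example only item 0 gets batched; items 2 and 4 stay queued and each
needs another trip through the locked path later.
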
> +
> +		if (!ret) {
> +			hash = new_hash;
>  			wqe->hash_map |= BIT(hash);
> -			wq_node_del(&wqe->work_list, node, prev);
> -			return work;
> +		} else if (hash != new_hash) {
> +			break;
>  		}
> -	}
> 
> -	return NULL;
> +		wq_node_del(&wqe->work_list, &work->list);
> +		/* return first node, add subsequent same hash to the list */
> +		if (!ret)
> +			ret = work;
> +		else
> +			wq_list_add_tail(&work->list, list);
> +	} while (1);
> +
> +	return ret;
>  }
> 
>  static void io_wq_switch_mm(struct io_worker *worker, struct io_wq_work *work)
> @@ -481,6 +514,7 @@ static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work);
>  static void io_worker_handle_work(struct io_worker *worker)
>  	__releases(wqe->lock)
>  {
> +	struct io_wq_work_list list = { .first = NULL, .last = NULL };
>  	struct io_wqe *wqe = worker->wqe;
>  	struct io_wq *wq = wqe->wq;
> 
> @@ -495,7 +529,7 @@ static void io_worker_handle_work(struct io_worker *worker)
>  		 * can't make progress, any work completion or insertion will
>  		 * clear the stalled flag.
>  		 */
> -		work = io_get_next_work(wqe);
> +		work = io_get_next_work(wqe, &list);
>  		if (work)
>  			__io_worker_busy(wqe, worker, work);
>  		else if (!wq_list_empty(&wqe->work_list))
> @@ -504,6 +538,7 @@ static void io_worker_handle_work(struct io_worker *worker)
>  		spin_unlock_irq(&wqe->lock);
>  		if (!work)
>  			break;
> +got_work:
>  		io_assign_current_work(worker, work);
> 
>  		/* handle a whole dependent link */
> @@ -530,6 +565,24 @@ static void io_worker_handle_work(struct io_worker *worker)
>  				work = NULL;
>  			}
>  			if (hash != -1U) {
> +				/*
> +				 * If the local list is non-empty, then we
> +				 * have work that hashed to the same key.
> +				 * No need for a lock round-trip, or fiddling
> +				 * the free/busy state of the worker, or
> +				 * clearing the hashed state. Just process the
> +				 * next one.
> +				 */
> +				if (!work) {
> +					work = wq_first_entry(&list,
> +							      struct io_wq_work,
> +							      list);

Wouldn't it just drop a linked request? It probably works because of the
comment above.

> +					if (work) {
> +						wq_node_del(&list, &work->list);

There is a bug, apparently from one of my commits, where it does
io_assign_current_work() but then re-enqueues and reassigns new work, and
there is a gap in between for a cancel to happen, which would screw
everything up. I'll send a patch, so it'll be clearer. However, this is a
good point to watch out for here as well.
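
As an aside, the goto-based fast path above boils down to a plain loop. This
is a rough userspace sketch of that shape, with made-up types and helpers
(struct work, batch_pop) standing in for io_wq_work and the wq_* helpers, and
the locking only marked by comments; it is not the kernel code:

#include <stdio.h>
#include <stddef.h>

struct work {
	int id;
	struct work *next;
};

/* pop the head of the on-stack batch, like wq_first_entry() + wq_node_del() */
static struct work *batch_pop(struct work **head)
{
	struct work *w = *head;

	if (w)
		*head = w->next;
	return w;
}

static void handle_batch(struct work *batch)
{
	struct work *work = batch_pop(&batch);

	while (work) {
		/* roughly where the assign/cancel gap mentioned above sits */
		printf("assign and run work %d (wqe->lock not taken)\n", work->id);
		work = batch_pop(&batch);
	}
	/* only now: take wqe->lock, clear the hash bit, find more work */
	printf("batch done, back to the locked path\n");
}

int main(void)
{
	struct work w2 = { .id = 2, .next = NULL };
	struct work w1 = { .id = 1, .next = &w2 };

	handle_batch(&w1);
	return 0;
}

The whole batch is run back to back without retaking the wqe lock, which is
exactly why anything racing with the current-work assignment matters here.
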
> +						goto got_work;
> +					}
> +				}
> +
>  				spin_lock_irq(&wqe->lock);
>  				wqe->hash_map &= ~BIT_ULL(hash);
>  				wqe->flags &= ~IO_WQE_FLAG_STALLED;
> @@ -910,7 +963,7 @@ static enum io_wq_cancel io_wqe_cancel_work(struct io_wqe *wqe,
>  		work = container_of(node, struct io_wq_work, list);
> 
>  		if (match->fn(work, match->data)) {
> -			wq_node_del(&wqe->work_list, node, prev);
> +			__wq_node_del(&wqe->work_list, node, prev);
>  			found = true;
>  			break;
>  		}
> diff --git a/fs/io-wq.h b/fs/io-wq.h
> index 298b21f4a4d2..9a194339bd9d 100644
> --- a/fs/io-wq.h
> +++ b/fs/io-wq.h
> @@ -40,9 +40,9 @@ static inline void wq_list_add_tail(struct io_wq_work_node *node,
>  	}
>  }
> 
> -static inline void wq_node_del(struct io_wq_work_list *list,
> -			       struct io_wq_work_node *node,
> -			       struct io_wq_work_node *prev)
> +static inline void __wq_node_del(struct io_wq_work_list *list,
> +				 struct io_wq_work_node *node,
> +				 struct io_wq_work_node *prev)
>  {
>  	if (node == list->first)
>  		WRITE_ONCE(list->first, node->next);
> @@ -53,6 +53,21 @@ static inline void wq_node_del(struct io_wq_work_list *list,
>  	node->next = NULL;
>  }
> 
> +
> +static inline void wq_node_del(struct io_wq_work_list *list,
> +			       struct io_wq_work_node *node)
> +{
> +	__wq_node_del(list, node, NULL);
> +}
> +
> +#define wq_first_entry(list, type, member)				\
> +({									\
> +	struct io_wq_work *__work = NULL;				\
> +	if (!wq_list_empty((list)))					\
> +		__work = container_of((list)->first, type, member);	\
> +	__work;								\
> +})
> +
>  #define wq_list_for_each(pos, prv, head)				\
>  	for (pos = (head)->first, prv = NULL; pos; prv = pos, pos = (pos)->next)
> 

-- 
Pavel Begunkov