On 19/03/2020 21:56, Jens Axboe wrote:
> We always punt async buffered writes to an io-wq helper, as the core
> kernel does not have IOCB_NOWAIT support for that. Most buffered async
> writes complete very quickly, as it's just a copy operation. This means
> that doing multiple locking roundtrips on the shared wqe lock for each
> buffered write is wasteful. Additionally, buffered writes are hashed
> work items, which means that any buffered write to a given file is
> serialized.
> 
> When looking for a new work item, build a chain of identically hashed
> work items, and then hand back that batch. Until the batch is done, the
> caller doesn't have to synchronize with the wqe or worker locks again.
> 
> Signed-off-by: Jens Axboe
> 
> ---
> 
> Changes:
> - Don't overwrite passed back work
> 
> 
> diff --git a/fs/io-wq.c b/fs/io-wq.c
> index 9541df2729de..8402c6e417e1 100644
> --- a/fs/io-wq.c
> +++ b/fs/io-wq.c
> @@ -380,32 +380,65 @@ static inline unsigned int io_get_work_hash(struct io_wq_work *work)
>  	return work->flags >> IO_WQ_HASH_SHIFT;
>  }
> 
> -static struct io_wq_work *io_get_next_work(struct io_wqe *wqe)
> +/*
> + * Returns the next work item to process, if any. For hashed work that hashes
> + * to the same key, we can't process N+1 before N is done. To make the
> + * processing more efficient, return N+1 and later identically hashed work
> + * in the passed in list. This avoids repeated hammering on the wqe lock,
> + * as the caller can just process items in the on-stack list.
> + */
> +static struct io_wq_work *io_get_next_work(struct io_wqe *wqe,
> +					   struct io_wq_work_list *list)
>  	__must_hold(wqe->lock)
>  {
> -	struct io_wq_work_node *node, *prev;
> -	struct io_wq_work *work;
> -	unsigned int hash;
> +	struct io_wq_work *ret = NULL;
> 
> -	wq_list_for_each(node, prev, &wqe->work_list) {
> -		work = container_of(node, struct io_wq_work, list);
> +	do {
> +		unsigned int new_hash, hash;
> +		struct io_wq_work *work;
> +
> +		work = wq_first_entry(&wqe->work_list, struct io_wq_work, list);
> +		if (!work)
> +			break;
> 
>  		/* not hashed, can run anytime */
>  		if (!io_wq_is_hashed(work)) {
> -			wq_node_del(&wqe->work_list, node, prev);
> -			return work;
> +			/* already have hashed work, let new worker get this */
> +			if (ret) {
> +				struct io_wqe_acct *acct;
> +
> +				/* get new worker for unhashed, if none now */
> +				acct = io_work_get_acct(wqe, work);
> +				if (!atomic_read(&acct->nr_running))
> +					io_wqe_wake_worker(wqe, acct);
> +				break;
> +			}
> +			wq_node_del(&wqe->work_list, &work->list);
> +			ret = work;
> +			break;
>  		}
> 
>  		/* hashed, can run if not already running */
> -		hash = io_get_work_hash(work);
> -		if (!(wqe->hash_map & BIT(hash))) {
> +		new_hash = io_get_work_hash(work);
> +		if (wqe->hash_map & BIT(new_hash))
> +			break;

This will always break for subsequent hashed work, as the @hash_map bit is
set, won't it? And anyway, it doesn't seem to optimise non-contiguous
same-hashed requests, e.g.

0: Link(hash=0)
1: Link(hash=1)
2: Link(hash=0)
3: Link(not_hashed)
4: Link(hash=0)
...
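
To make that concrete, here is a toy userspace sketch of the scan (my own
simplification, not the kernel code: no locking, no per-wqe hash_map, and an
arbitrary example hash sequence). With the queue above, only the leading run
of identically hashed items ends up in the batch:

#include <stdio.h>

#define NOT_HASHED	(-1)

int main(void)
{
	/* pending work, oldest first: hashes 0, 1, 0, unhashed, 0 */
	int hashes[] = { 0, 1, 0, NOT_HASHED, 0 };
	int nr = sizeof(hashes) / sizeof(hashes[0]);
	int batch_hash = NOT_HASHED;

	for (int i = 0; i < nr; i++) {
		if (hashes[i] == NOT_HASHED) {
			/* unhashed: taken alone, or left for another worker */
			if (batch_hash == NOT_HASHED)
				printf("item %d: unhashed, run alone\n", i);
			break;
		}
		if (batch_hash == NOT_HASHED)
			batch_hash = hashes[i];	/* first item picks the hash */
		else if (hashes[i] != batch_hash)
			break;			/* different hash stops the scan */
		printf("item %d: hash %d, batched\n", i, hashes[i]);
	}
	return 0;
}

In this example only item 0 gets batched; items 2 and 4 stay queued and each
needs another trip through the locked path later.
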
> +
> +		if (!ret) {
> +			hash = new_hash;
>  			wqe->hash_map |= BIT(hash);
> -			wq_node_del(&wqe->work_list, node, prev);
> -			return work;
> +		} else if (hash != new_hash) {
> +			break;
>  		}
> -	}
> 
> -	return NULL;
> +		wq_node_del(&wqe->work_list, &work->list);
> +		/* return first node, add subsequent same hash to the list */
> +		if (!ret)
> +			ret = work;
> +		else
> +			wq_list_add_tail(&work->list, list);
> +	} while (1);
> +
> +	return ret;
>  }
> 
>  static void io_wq_switch_mm(struct io_worker *worker, struct io_wq_work *work)
> @@ -481,6 +514,7 @@ static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work);
>  static void io_worker_handle_work(struct io_worker *worker)
>  	__releases(wqe->lock)
>  {
> +	struct io_wq_work_list list = { .first = NULL, .last = NULL };
>  	struct io_wqe *wqe = worker->wqe;
>  	struct io_wq *wq = wqe->wq;
> 
> @@ -495,7 +529,7 @@ static void io_worker_handle_work(struct io_worker *worker)
>  		 * can't make progress, any work completion or insertion will
>  		 * clear the stalled flag.
>  		 */
> -		work = io_get_next_work(wqe);
> +		work = io_get_next_work(wqe, &list);
>  		if (work)
>  			__io_worker_busy(wqe, worker, work);
>  		else if (!wq_list_empty(&wqe->work_list))
> @@ -504,6 +538,7 @@ static void io_worker_handle_work(struct io_worker *worker)
>  		spin_unlock_irq(&wqe->lock);
>  		if (!work)
>  			break;
> +got_work:
>  		io_assign_current_work(worker, work);
> 
>  		/* handle a whole dependent link */
> @@ -530,6 +565,24 @@ static void io_worker_handle_work(struct io_worker *worker)
>  				work = NULL;
>  			}
>  			if (hash != -1U) {
> +				/*
> +				 * If the local list is non-empty, then we
> +				 * have work that hashed to the same key.
> +				 * No need for a lock round-trip, or fiddling
> +				 * the free/busy state of the worker, or
> +				 * clearing the hashed state. Just process the
> +				 * next one.
> +				 */
> +				if (!work) {
> +					work = wq_first_entry(&list,
> +							      struct io_wq_work,
> +							      list);

Wouldn't it just drop a linked request? It probably works because of the
comment above.

> +					if (work) {
> +						wq_node_del(&list, &work->list);

There is a bug, apparently from one of my commits, where it does
io_assign_current_work() but then re-enqueues and reassigns new work, and
there is a gap in between for a cancel to happen, which would screw
everything up. I'll send a patch, so it'll be clearer. However, this is a
good point to watch out for here as well.
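
As an aside, the goto-based fast path above boils down to a plain loop. This
is a rough userspace sketch of that shape, with made-up types and helpers
(struct work, batch_pop) standing in for io_wq_work and the wq_* helpers, and
the locking only marked by comments; it is not the kernel code:

#include <stdio.h>
#include <stddef.h>

struct work {
	int id;
	struct work *next;
};

/* pop the head of the on-stack batch, like wq_first_entry() + wq_node_del() */
static struct work *batch_pop(struct work **head)
{
	struct work *w = *head;

	if (w)
		*head = w->next;
	return w;
}

static void handle_batch(struct work *batch)
{
	struct work *work = batch_pop(&batch);

	while (work) {
		/* roughly where the assign/cancel gap mentioned above sits */
		printf("assign and run work %d (wqe->lock not taken)\n", work->id);
		work = batch_pop(&batch);
	}
	/* only now: take wqe->lock, clear the hash bit, find more work */
	printf("batch done, back to the locked path\n");
}

int main(void)
{
	struct work w2 = { .id = 2, .next = NULL };
	struct work w1 = { .id = 1, .next = &w2 };

	handle_batch(&w1);
	return 0;
}

The whole batch is run back to back without retaking the wqe lock, which is
exactly why anything racing with the current-work assignment matters here.
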
> +						goto got_work;
> +					}
> +				}
> +
>  				spin_lock_irq(&wqe->lock);
>  				wqe->hash_map &= ~BIT_ULL(hash);
>  				wqe->flags &= ~IO_WQE_FLAG_STALLED;
> @@ -910,7 +963,7 @@ static enum io_wq_cancel io_wqe_cancel_work(struct io_wqe *wqe,
>  		work = container_of(node, struct io_wq_work, list);
> 
>  		if (match->fn(work, match->data)) {
> -			wq_node_del(&wqe->work_list, node, prev);
> +			__wq_node_del(&wqe->work_list, node, prev);
>  			found = true;
>  			break;
>  		}
> diff --git a/fs/io-wq.h b/fs/io-wq.h
> index 298b21f4a4d2..9a194339bd9d 100644
> --- a/fs/io-wq.h
> +++ b/fs/io-wq.h
> @@ -40,9 +40,9 @@ static inline void wq_list_add_tail(struct io_wq_work_node *node,
>  	}
>  }
> 
> -static inline void wq_node_del(struct io_wq_work_list *list,
> -			       struct io_wq_work_node *node,
> -			       struct io_wq_work_node *prev)
> +static inline void __wq_node_del(struct io_wq_work_list *list,
> +				 struct io_wq_work_node *node,
> +				 struct io_wq_work_node *prev)
>  {
>  	if (node == list->first)
>  		WRITE_ONCE(list->first, node->next);
> @@ -53,6 +53,21 @@ static inline void wq_node_del(struct io_wq_work_list *list,
>  	node->next = NULL;
>  }
> 
> +
> +static inline void wq_node_del(struct io_wq_work_list *list,
> +			       struct io_wq_work_node *node)
> +{
> +	__wq_node_del(list, node, NULL);
> +}
> +
> +#define wq_first_entry(list, type, member)				\
> +({									\
> +	struct io_wq_work *__work = NULL;				\
> +	if (!wq_list_empty((list)))					\
> +		__work = container_of((list)->first, type, member);	\
> +	__work;								\
> +})
> +
>  #define wq_list_for_each(pos, prv, head)				\
>  	for (pos = (head)->first, prv = NULL; pos; prv = pos, pos = (pos)->next)
> 

-- 
Pavel Begunkov