On 22/03/2020 19:24, Pavel Begunkov wrote:
> On 22/03/2020 19:09, Pavel Begunkov wrote:
>> On 19/03/2020 21:56, Jens Axboe wrote:
>>> We always punt async buffered writes to an io-wq helper, as the core
>>> kernel does not have IOCB_NOWAIT support for that. Most buffered async
>>> writes complete very quickly, as it's just a copy operation. This means
>>> that doing multiple locking roundtrips on the shared wqe lock for each
>>> buffered write is wasteful. Additionally, buffered writes are hashed
>>> work items, which means that any buffered write to a given file is
>>> serialized.
>>>
>>> When looking for a new work item, build a chain of identically hashed
>>> work items, and then hand back that batch. Until the batch is done, the
>>> caller doesn't have to synchronize with the wqe or worker locks again.
>
> I have an idea how to do it a bit better. Let me try it.

The diff below is buggy (it oopses somewhere in blk-mq for
read-write.c:read_poll_link), I'll double check it on a fresh head.

The idea is to keep same-hashed works contiguous in @work_list, so they can
be spliced out in one go. For each hash bucket, I keep the last added work:
- on enqueue, a new work is inserted right after that last one
- on dequeue, the whole range [first, last] is spliced, where @first is the
  current node (because of how @work_list is traversed)

It costs a bit of extra memory, but
- it removes the extra looping
- it also grabs all hashed requests, not only the sequentially submitted
  ones; e.g. for the submission sequence below it takes all (hash=0)
  requests (there is a small stand-alone sketch of this case after the diff)

REQ(hash=0)
REQ(hash=1)
REQ(hash=0)
REQ()
REQ(hash=0)
...

Please tell me if you see a hole in the concept. And, as said, there is
still a bug somewhere.

---
 fs/io-wq.c | 49 +++++++++++++++++++++++++++++++++++++++++++------
 fs/io-wq.h | 34 +++++++++++++++++++++++++++-------
 2 files changed, 70 insertions(+), 13 deletions(-)

diff --git a/fs/io-wq.c b/fs/io-wq.c
index b3fb61ec0870..00d8f8b12df3 100644
--- a/fs/io-wq.c
+++ b/fs/io-wq.c
@@ -69,6 +69,8 @@ struct io_worker {
 #define IO_WQ_HASH_ORDER	5
 #endif
 
+#define IO_WQ_HASH_NR_BUCKETS	(1u << IO_WQ_HASH_ORDER)
+
 struct io_wqe_acct {
 	unsigned nr_workers;
 	unsigned max_workers;
@@ -98,6 +100,7 @@ struct io_wqe {
 	struct list_head all_list;
 
 	struct io_wq *wq;
+	struct io_wq_work *last_hashed[IO_WQ_HASH_NR_BUCKETS];
 };
 
 /*
@@ -400,7 +403,9 @@ static struct io_wq_work *io_get_next_work(struct io_wqe *wqe)
 		hash = io_get_work_hash(work);
 		if (!(wqe->hash_map & BIT(hash))) {
 			wqe->hash_map |= BIT(hash);
-			wq_node_del(&wqe->work_list, node, prev);
+			wq_del_node_range(&wqe->work_list,
+					  &wqe->last_hashed[hash]->list, prev);
+			wqe->last_hashed[hash] = NULL;
 			return work;
 		}
 	}
@@ -508,7 +513,11 @@ static void io_worker_handle_work(struct io_worker *worker)
 
 		/* handle a whole dependent link */
 		do {
-			struct io_wq_work *old_work;
+			struct io_wq_work *old_work, *next_hashed = NULL;
+
+			if (work->list.next)
+				next_hashed = container_of(work->list.next,
+						struct io_wq_work, list);
 
 			io_impersonate_work(worker, work);
 			/*
@@ -523,12 +532,20 @@ static void io_worker_handle_work(struct io_worker *worker)
 			work->func(&work);
 			work = (old_work == work) ? NULL : work;
 
-			assign_work = work;
-			if (work && io_wq_is_hashed(work))
-				assign_work = NULL;
+			assign_work = next_hashed;
+			if (!next_hashed && work && !io_wq_is_hashed(work))
+				assign_work = work;
+
 			io_assign_current_work(worker, assign_work);
 			wq->free_work(old_work);
 
+			if (next_hashed) {
+				if (work)
+					io_wqe_enqueue(wqe, work);
+				work = next_hashed;
+				continue;
+			}
+
 			if (work && !assign_work) {
 				io_wqe_enqueue(wqe, work);
 				work = NULL;
@@ -776,6 +793,26 @@ static void io_run_cancel(struct io_wq_work *work, struct io_wqe *wqe)
 	} while (work);
 }
 
+static void io_wqe_insert_work(struct io_wqe *wqe, struct io_wq_work *work)
+{
+	int hash;
+	struct io_wq_work *last_hashed;
+
+	if (!io_wq_is_hashed(work)) {
+append:
+		wq_list_add_tail(&work->list, &wqe->work_list);
+		return;
+	}
+
+	hash = io_get_work_hash(work);
+	last_hashed = wqe->last_hashed[hash];
+	wqe->last_hashed[hash] = work;
+	if (!last_hashed)
+		goto append;
+
+	wq_list_add_after(&work->list, &last_hashed->list, &wqe->work_list);
+}
+
 static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work)
 {
 	struct io_wqe_acct *acct = io_work_get_acct(wqe, work);
@@ -795,7 +832,7 @@ static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work)
 
 	work_flags = work->flags;
 	spin_lock_irqsave(&wqe->lock, flags);
-	wq_list_add_tail(&work->list, &wqe->work_list);
+	io_wqe_insert_work(wqe, work);
 	wqe->flags &= ~IO_WQE_FLAG_STALLED;
 	spin_unlock_irqrestore(&wqe->lock, flags);
 
diff --git a/fs/io-wq.h b/fs/io-wq.h
index 298b21f4a4d2..10367b6238da 100644
--- a/fs/io-wq.h
+++ b/fs/io-wq.h
@@ -40,17 +40,37 @@ static inline void wq_list_add_tail(struct io_wq_work_node *node,
 	}
 }
 
+static inline void wq_list_add_after(struct io_wq_work_node *node,
+				     struct io_wq_work_node *pos,
+				     struct io_wq_work_list *list)
+{
+	struct io_wq_work_node *next = pos->next;
+
+	pos->next = node;
+	node->next = next;
+	if (!next)
+		list->last = node;
+}
+
+static inline void wq_del_node_range(struct io_wq_work_list *list,
+				     struct io_wq_work_node *last,
+				     struct io_wq_work_node *prev)
+{
+	if (!prev)
+		WRITE_ONCE(list->first, last->next);
+	else
+		prev->next = last->next;
+
+	if (last == list->last)
+		list->last = prev;
+	last->next = NULL;
+}
+
 static inline void wq_node_del(struct io_wq_work_list *list,
 			       struct io_wq_work_node *node,
 			       struct io_wq_work_node *prev)
 {
-	if (node == list->first)
-		WRITE_ONCE(list->first, node->next);
-	if (node == list->last)
-		list->last = prev;
-	if (prev)
-		prev->next = node->next;
-	node->next = NULL;
+	wq_del_node_range(list, node, prev);
 }
 
 #define wq_list_for_each(pos, prv, head)				\
-- 
2.24.0
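
To make the list manipulation easier to eyeball outside of io-wq, here is a
stand-alone userspace sketch of the same idea, not the patch itself: a toy
singly-linked work list plus a per-bucket "last hashed" pointer. All names
here (node, work_list, insert_work, grab_hashed, NR_BUCKETS) are made up for
illustration; only the enqueue/splice logic mirrors io_wqe_insert_work(),
wq_list_add_after() and wq_del_node_range() from the diff. It runs the
REQ(hash=0), REQ(hash=1), REQ(hash=0), REQ(), REQ(hash=0) sequence from
above.

/* toy_splice.c: stand-alone illustration, not kernel code */
#include <stdio.h>

struct node {
	int hash;		/* -1 means "not hashed" */
	struct node *next;
};

struct work_list {
	struct node *first;
	struct node *last;
};

#define NR_BUCKETS	32

static struct node *last_hashed[NR_BUCKETS];

/* plain tail insert, like wq_list_add_tail() */
static void list_add_tail(struct work_list *list, struct node *n)
{
	n->next = NULL;
	if (!list->first)
		list->first = n;
	else
		list->last->next = n;
	list->last = n;
}

/* insert @n right after @pos, like wq_list_add_after() in the diff */
static void list_add_after(struct work_list *list, struct node *pos,
			   struct node *n)
{
	n->next = pos->next;
	pos->next = n;
	if (!n->next)
		list->last = n;
}

/* enqueue keeping same-hashed works adjacent, like io_wqe_insert_work() */
static void insert_work(struct work_list *list, struct node *n)
{
	struct node *last;

	if (n->hash < 0) {
		list_add_tail(list, n);
		return;
	}
	last = last_hashed[n->hash];
	last_hashed[n->hash] = n;
	if (!last)
		list_add_tail(list, n);
	else
		list_add_after(list, last, n);
}

/*
 * Splice the whole [first, last_hashed[hash]] range out in one go, like
 * wq_del_node_range().  @prev is the node before @first, or NULL when
 * @first is the list head.
 */
static struct node *grab_hashed(struct work_list *list, struct node *first,
				struct node *prev)
{
	struct node *last = last_hashed[first->hash];

	if (!prev)
		list->first = last->next;
	else
		prev->next = last->next;
	if (last == list->last)
		list->last = prev;
	last->next = NULL;
	last_hashed[first->hash] = NULL;
	return first;		/* head of the spliced chain */
}

int main(void)
{
	/* the submission sequence from above: hash=0, 1, 0, none, 0 */
	struct node reqs[5] = { { 0 }, { 1 }, { 0 }, { -1 }, { 0 } };
	struct work_list list = { NULL, NULL };
	struct node *n;
	int i;

	for (i = 0; i < 5; i++)
		insert_work(&list, &reqs[i]);

	/* the list head is hashed, so take its whole bucket at once */
	for (n = grab_hashed(&list, list.first, NULL); n; n = n->next)
		printf("spliced: hash=%d\n", n->hash);	/* 0, 0, 0 */
	for (n = list.first; n; n = n->next)
		printf("left:    hash=%d\n", n->hash);	/* 1, -1 */
	return 0;
}

With the last_hashed[] bookkeeping the three hash=0 works end up adjacent in
the list even though other requests were submitted in between, so a single
splice hands the whole batch over, which is the behaviour the diff is after.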