We always punt async buffered writes to an io-wq helper, as the core kernel does not have IOCB_NOWAIT support for that. Most buffered async writes complete very quickly, as it's just a copy operation. This means that doing multiple locking roundtrips on the shared wqe lock for each buffered write is wasteful. Additionally, buffered writes are hashed work items, which means that any buffered write to a given file is serialized. When looking for a new work item, build a chain of identicaly hashed work items, and then hand back that batch. Until the batch is done, the caller doesn't have to synchronize with the wqe or worker locks again. Signed-off-by: Jens Axboe <axboe@kernel.dk> --- Changes: - Don't overwrite passed back work diff --git a/fs/io-wq.c b/fs/io-wq.c index 9541df2729de..8402c6e417e1 100644 --- a/fs/io-wq.c +++ b/fs/io-wq.c @@ -380,32 +380,65 @@ static inline unsigned int io_get_work_hash(struct io_wq_work *work) return work->flags >> IO_WQ_HASH_SHIFT; } -static struct io_wq_work *io_get_next_work(struct io_wqe *wqe) +/* + * Returns the next work item to process, if any. For hashed work that hash + * to the same key, we can't process N+1 before N is done. To make the + * processing more efficient, return N+1 and later identically hashed work + * in the passed in list. This avoids repeated hammering on the wqe lock for, + * as the caller can just process items in the on-stack list. 
+ */ +static struct io_wq_work *io_get_next_work(struct io_wqe *wqe, + struct io_wq_work_list *list) __must_hold(wqe->lock) { - struct io_wq_work_node *node, *prev; - struct io_wq_work *work; - unsigned int hash; + struct io_wq_work *ret = NULL; - wq_list_for_each(node, prev, &wqe->work_list) { - work = container_of(node, struct io_wq_work, list); + do { + unsigned int new_hash, hash; + struct io_wq_work *work; + + work = wq_first_entry(&wqe->work_list, struct io_wq_work, list); + if (!work) + break; /* not hashed, can run anytime */ if (!io_wq_is_hashed(work)) { - wq_node_del(&wqe->work_list, node, prev); - return work; + /* already have hashed work, let new worker get this */ + if (ret) { + struct io_wqe_acct *acct; + + /* get new worker for unhashed, if none now */ + acct = io_work_get_acct(wqe, work); + if (!atomic_read(&acct->nr_running)) + io_wqe_wake_worker(wqe, acct); + break; + } + wq_node_del(&wqe->work_list, &work->list); + ret = work; + break; } /* hashed, can run if not already running */ - hash = io_get_work_hash(work); - if (!(wqe->hash_map & BIT(hash))) { + new_hash = io_get_work_hash(work); + if (wqe->hash_map & BIT(new_hash)) + break; + + if (!ret) { + hash = new_hash; wqe->hash_map |= BIT(hash); - wq_node_del(&wqe->work_list, node, prev); - return work; + } else if (hash != new_hash) { + break; } - } - return NULL; + wq_node_del(&wqe->work_list, &work->list); + /* return first node, add subsequent same hash to the list */ + if (!ret) + ret = work; + else + wq_list_add_tail(&work->list, list); + } while (1); + + return ret; } static void io_wq_switch_mm(struct io_worker *worker, struct io_wq_work *work) @@ -481,6 +514,7 @@ static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work); static void io_worker_handle_work(struct io_worker *worker) __releases(wqe->lock) { + struct io_wq_work_list list = { .first = NULL, .last = NULL }; struct io_wqe *wqe = worker->wqe; struct io_wq *wq = wqe->wq; @@ -495,7 +529,7 @@ static void 
io_worker_handle_work(struct io_worker *worker) * can't make progress, any work completion or insertion will * clear the stalled flag. */ - work = io_get_next_work(wqe); + work = io_get_next_work(wqe, &list); if (work) __io_worker_busy(wqe, worker, work); else if (!wq_list_empty(&wqe->work_list)) @@ -504,6 +538,7 @@ static void io_worker_handle_work(struct io_worker *worker) spin_unlock_irq(&wqe->lock); if (!work) break; +got_work: io_assign_current_work(worker, work); /* handle a whole dependent link */ @@ -530,6 +565,24 @@ static void io_worker_handle_work(struct io_worker *worker) work = NULL; } if (hash != -1U) { + /* + * If the local list is non-empty, then we + * have work that hashed to the same key. + * No need for a lock round-trip, or fiddling + * the the free/busy state of the worker, or + * clearing the hashed state. Just process the + * next one. + */ + if (!work) { + work = wq_first_entry(&list, + struct io_wq_work, + list); + if (work) { + wq_node_del(&list, &work->list); + goto got_work; + } + } + spin_lock_irq(&wqe->lock); wqe->hash_map &= ~BIT_ULL(hash); wqe->flags &= ~IO_WQE_FLAG_STALLED; @@ -910,7 +963,7 @@ static enum io_wq_cancel io_wqe_cancel_work(struct io_wqe *wqe, work = container_of(node, struct io_wq_work, list); if (match->fn(work, match->data)) { - wq_node_del(&wqe->work_list, node, prev); + __wq_node_del(&wqe->work_list, node, prev); found = true; break; } diff --git a/fs/io-wq.h b/fs/io-wq.h index 298b21f4a4d2..9a194339bd9d 100644 --- a/fs/io-wq.h +++ b/fs/io-wq.h @@ -40,9 +40,9 @@ static inline void wq_list_add_tail(struct io_wq_work_node *node, } } -static inline void wq_node_del(struct io_wq_work_list *list, - struct io_wq_work_node *node, - struct io_wq_work_node *prev) +static inline void __wq_node_del(struct io_wq_work_list *list, + struct io_wq_work_node *node, + struct io_wq_work_node *prev) { if (node == list->first) WRITE_ONCE(list->first, node->next); @@ -53,6 +53,21 @@ static inline void wq_node_del(struct io_wq_work_list 
*list, node->next = NULL; } + +static inline void wq_node_del(struct io_wq_work_list *list, + struct io_wq_work_node *node) +{ + __wq_node_del(list, node, NULL); +} + +#define wq_first_entry(list, type, member) \ +({ \ + struct io_wq_work *__work = NULL; \ + if (!wq_list_empty((list))) \ + __work = container_of((list)->first, type, member); \ + __work; \ +}) + #define wq_list_for_each(pos, prv, head) \ for (pos = (head)->first, prv = NULL; pos; prv = pos, pos = (pos)->next) -- Jens Axboe
[-- Attachment #1.1: Type: text/plain, Size: 7627 bytes --] On 19/03/2020 21:56, Jens Axboe wrote: > We always punt async buffered writes to an io-wq helper, as the core > kernel does not have IOCB_NOWAIT support for that. Most buffered async > writes complete very quickly, as it's just a copy operation. This means > that doing multiple locking roundtrips on the shared wqe lock for each > buffered write is wasteful. Additionally, buffered writes are hashed > work items, which means that any buffered write to a given file is > serialized. > > When looking for a new work item, build a chain of identicaly hashed > work items, and then hand back that batch. Until the batch is done, the > caller doesn't have to synchronize with the wqe or worker locks again. > > Signed-off-by: Jens Axboe <axboe@kernel.dk> > > --- > > Changes: > - Don't overwrite passed back work > > > diff --git a/fs/io-wq.c b/fs/io-wq.c > index 9541df2729de..8402c6e417e1 100644 > --- a/fs/io-wq.c > +++ b/fs/io-wq.c > @@ -380,32 +380,65 @@ static inline unsigned int io_get_work_hash(struct io_wq_work *work) > return work->flags >> IO_WQ_HASH_SHIFT; > } > > -static struct io_wq_work *io_get_next_work(struct io_wqe *wqe) > +/* > + * Returns the next work item to process, if any. For hashed work that hash > + * to the same key, we can't process N+1 before N is done. To make the > + * processing more efficient, return N+1 and later identically hashed work > + * in the passed in list. This avoids repeated hammering on the wqe lock for, > + * as the caller can just process items in the on-stack list. 
> + */ > +static struct io_wq_work *io_get_next_work(struct io_wqe *wqe, > + struct io_wq_work_list *list) > __must_hold(wqe->lock) > { > - struct io_wq_work_node *node, *prev; > - struct io_wq_work *work; > - unsigned int hash; > + struct io_wq_work *ret = NULL; > > - wq_list_for_each(node, prev, &wqe->work_list) { > - work = container_of(node, struct io_wq_work, list); > + do { > + unsigned int new_hash, hash; > + struct io_wq_work *work; > + > + work = wq_first_entry(&wqe->work_list, struct io_wq_work, list); > + if (!work) > + break; > > /* not hashed, can run anytime */ > if (!io_wq_is_hashed(work)) { > - wq_node_del(&wqe->work_list, node, prev); > - return work; > + /* already have hashed work, let new worker get this */ > + if (ret) { > + struct io_wqe_acct *acct; > + > + /* get new worker for unhashed, if none now */ > + acct = io_work_get_acct(wqe, work); > + if (!atomic_read(&acct->nr_running)) > + io_wqe_wake_worker(wqe, acct); > + break; > + } > + wq_node_del(&wqe->work_list, &work->list); > + ret = work; > + break; > } > > /* hashed, can run if not already running */ > - hash = io_get_work_hash(work); > - if (!(wqe->hash_map & BIT(hash))) { > + new_hash = io_get_work_hash(work); > + if (wqe->hash_map & BIT(new_hash)) > + break; This will always break for subsequent hashed, as the @hash_map bit is set. Isn't it? And anyway, it seems it doesn't optimise not-contiguous same-hashed requests, e.g. 0: Link(hash=0) 1: Link(hash=1) 2: Link(hash=0) 3: Link(not_hashed) 4: Link(hash=0) ... 
> + > + if (!ret) { > + hash = new_hash; > wqe->hash_map |= BIT(hash); > - wq_node_del(&wqe->work_list, node, prev); > - return work; > + } else if (hash != new_hash) { > + break; > } > - } > > - return NULL; > + wq_node_del(&wqe->work_list, &work->list); > + /* return first node, add subsequent same hash to the list */ > + if (!ret) > + ret = work; > + else > + wq_list_add_tail(&work->list, list); > + } while (1); > + > + return ret; > } > > static void io_wq_switch_mm(struct io_worker *worker, struct io_wq_work *work) > @@ -481,6 +514,7 @@ static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work); > static void io_worker_handle_work(struct io_worker *worker) > __releases(wqe->lock) > { > + struct io_wq_work_list list = { .first = NULL, .last = NULL }; > struct io_wqe *wqe = worker->wqe; > struct io_wq *wq = wqe->wq; > > @@ -495,7 +529,7 @@ static void io_worker_handle_work(struct io_worker *worker) > * can't make progress, any work completion or insertion will > * clear the stalled flag. > */ > - work = io_get_next_work(wqe); > + work = io_get_next_work(wqe, &list); > if (work) > __io_worker_busy(wqe, worker, work); > else if (!wq_list_empty(&wqe->work_list)) > @@ -504,6 +538,7 @@ static void io_worker_handle_work(struct io_worker *worker) > spin_unlock_irq(&wqe->lock); > if (!work) > break; > +got_work: > io_assign_current_work(worker, work); > > /* handle a whole dependent link */ > @@ -530,6 +565,24 @@ static void io_worker_handle_work(struct io_worker *worker) > work = NULL; > } > if (hash != -1U) { > + /* > + * If the local list is non-empty, then we > + * have work that hashed to the same key. > + * No need for a lock round-trip, or fiddling > + * the the free/busy state of the worker, or > + * clearing the hashed state. Just process the > + * next one. > + */ > + if (!work) { > + work = wq_first_entry(&list, > + struct io_wq_work, > + list); Wouldn't it just drop a linked request? Probably works because of the comment above. 
> + if (work) { > + wq_node_del(&list, &work->list); There is a bug, apparently from one of my commits, where it do io_assign_current_work() but then re-enqueue and reassign new work, though there is a gap for cancel to happen, which would screw everything up. I'll send a patch, so it'd be more clear. However, this is a good point to look after for this as well. > + goto got_work; > + } > + } > + > spin_lock_irq(&wqe->lock); > wqe->hash_map &= ~BIT_ULL(hash); > wqe->flags &= ~IO_WQE_FLAG_STALLED; > @@ -910,7 +963,7 @@ static enum io_wq_cancel io_wqe_cancel_work(struct io_wqe *wqe, > work = container_of(node, struct io_wq_work, list); > > if (match->fn(work, match->data)) { > - wq_node_del(&wqe->work_list, node, prev); > + __wq_node_del(&wqe->work_list, node, prev); > found = true; > break; > } > diff --git a/fs/io-wq.h b/fs/io-wq.h > index 298b21f4a4d2..9a194339bd9d 100644 > --- a/fs/io-wq.h > +++ b/fs/io-wq.h > @@ -40,9 +40,9 @@ static inline void wq_list_add_tail(struct io_wq_work_node *node, > } > } > > -static inline void wq_node_del(struct io_wq_work_list *list, > - struct io_wq_work_node *node, > - struct io_wq_work_node *prev) > +static inline void __wq_node_del(struct io_wq_work_list *list, > + struct io_wq_work_node *node, > + struct io_wq_work_node *prev) > { > if (node == list->first) > WRITE_ONCE(list->first, node->next); > @@ -53,6 +53,21 @@ static inline void wq_node_del(struct io_wq_work_list *list, > node->next = NULL; > } > > + > +static inline void wq_node_del(struct io_wq_work_list *list, > + struct io_wq_work_node *node) > +{ > + __wq_node_del(list, node, NULL); > +} > + > +#define wq_first_entry(list, type, member) \ > +({ \ > + struct io_wq_work *__work = NULL; \ > + if (!wq_list_empty((list))) \ > + __work = container_of((list)->first, type, member); \ > + __work; \ > +}) > + > #define wq_list_for_each(pos, prv, head) \ > for (pos = (head)->first, prv = NULL; pos; prv = pos, pos = (pos)->next) > > -- Pavel Begunkov [-- Attachment #2: OpenPGP 
digital signature --] [-- Type: application/pgp-signature, Size: 833 bytes --]
[-- Attachment #1.1: Type: text/plain, Size: 861 bytes --] On 22/03/2020 19:09, Pavel Begunkov wrote: > On 19/03/2020 21:56, Jens Axboe wrote: >> We always punt async buffered writes to an io-wq helper, as the core >> kernel does not have IOCB_NOWAIT support for that. Most buffered async >> writes complete very quickly, as it's just a copy operation. This means >> that doing multiple locking roundtrips on the shared wqe lock for each >> buffered write is wasteful. Additionally, buffered writes are hashed >> work items, which means that any buffered write to a given file is >> serialized. >> >> When looking for a new work item, build a chain of identicaly hashed >> work items, and then hand back that batch. Until the batch is done, the >> caller doesn't have to synchronize with the wqe or worker locks again. I have an idea, how to do it a bit better. Let me try it. -- Pavel Begunkov [-- Attachment #2: OpenPGP digital signature --] [-- Type: application/pgp-signature, Size: 833 bytes --]
On 3/22/20 10:09 AM, Pavel Begunkov wrote: >> /* not hashed, can run anytime */ >> if (!io_wq_is_hashed(work)) { >> - wq_node_del(&wqe->work_list, node, prev); >> - return work; >> + /* already have hashed work, let new worker get this */ >> + if (ret) { >> + struct io_wqe_acct *acct; >> + >> + /* get new worker for unhashed, if none now */ >> + acct = io_work_get_acct(wqe, work); >> + if (!atomic_read(&acct->nr_running)) >> + io_wqe_wake_worker(wqe, acct); >> + break; >> + } >> + wq_node_del(&wqe->work_list, &work->list); >> + ret = work; >> + break; >> } >> >> /* hashed, can run if not already running */ >> - hash = io_get_work_hash(work); >> - if (!(wqe->hash_map & BIT(hash))) { >> + new_hash = io_get_work_hash(work); >> + if (wqe->hash_map & BIT(new_hash)) >> + break; > > This will always break for subsequent hashed, as the @hash_map bit is set. > Isn't it? And anyway, it seems it doesn't optimise not-contiguous same-hashed > requests, e.g. > > 0: Link(hash=0) > 1: Link(hash=1) > 2: Link(hash=0) > 3: Link(not_hashed) > 4: Link(hash=0) > ... Yeah, I think I shuffled a bit too much there, should only break on that if hash != new_hash. Will fix! >> @@ -530,6 +565,24 @@ static void io_worker_handle_work(struct io_worker *worker) >> work = NULL; >> } >> if (hash != -1U) { >> + /* >> + * If the local list is non-empty, then we >> + * have work that hashed to the same key. >> + * No need for a lock round-trip, or fiddling >> + * the the free/busy state of the worker, or >> + * clearing the hashed state. Just process the >> + * next one. >> + */ >> + if (!work) { >> + work = wq_first_entry(&list, >> + struct io_wq_work, >> + list); > > Wouldn't it just drop a linked request? Probably works because of the > comment above. Only drops if if we always override 'work', if it's already set we use 'work' and don't grab from the list. So that should work for links. 
>> + if (work) { >> + wq_node_del(&list, &work->list); > > There is a bug, apparently from one of my commits, where it do > io_assign_current_work() but then re-enqueue and reassign new work, > though there is a gap for cancel to happen, which would screw > everything up. > > I'll send a patch, so it'd be more clear. However, this is a good > point to look after for this as well. Saw it, I'll apply when you have the Fixes line. I'll respin this one after. -- Jens Axboe
On 3/22/20 10:24 AM, Pavel Begunkov wrote:
> On 22/03/2020 19:09, Pavel Begunkov wrote:
>> On 19/03/2020 21:56, Jens Axboe wrote:
>>> We always punt async buffered writes to an io-wq helper, as the core
>>> kernel does not have IOCB_NOWAIT support for that. Most buffered async
>>> writes complete very quickly, as it's just a copy operation. This means
>>> that doing multiple locking roundtrips on the shared wqe lock for each
>>> buffered write is wasteful. Additionally, buffered writes are hashed
>>> work items, which means that any buffered write to a given file is
>>> serialized.
>>>
>>> When looking for a new work item, build a chain of identically hashed
>>> work items, and then hand back that batch. Until the batch is done, the
>>> caller doesn't have to synchronize with the wqe or worker locks again.
>
> I have an idea, how to do it a bit better. Let me try it.
Sure, go ahead!
--
Jens Axboe
On 22/03/2020 20:08, Jens Axboe wrote: >>> + */ >>> + if (!work) { >>> + work = wq_first_entry(&list, >>> + struct io_wq_work, >>> + list); >> >> Wouldn't it just drop a linked request? Probably works because of the >> comment above. > > Only drops if if we always override 'work', if it's already set we use > 'work' and don't grab from the list. So that should work for links. > Oh, right, the NULL check is just above. >>> + if (work) { >>> + wq_node_del(&list, &work->list); >> >> There is a bug, apparently from one of my commits, where it do >> io_assign_current_work() but then re-enqueue and reassign new work, >> though there is a gap for cancel to happen, which would screw >> everything up. >> >> I'll send a patch, so it'd be more clear. However, this is a good >> point to look after for this as well. > > Saw it, I'll apply when you have the Fixes line. I'll respin this one > after. > -- Pavel Begunkov
[-- Attachment #1.1: Type: text/plain, Size: 6171 bytes --] On 22/03/2020 19:24, Pavel Begunkov wrote: > On 22/03/2020 19:09, Pavel Begunkov wrote: >> On 19/03/2020 21:56, Jens Axboe wrote: >>> We always punt async buffered writes to an io-wq helper, as the core >>> kernel does not have IOCB_NOWAIT support for that. Most buffered async >>> writes complete very quickly, as it's just a copy operation. This means >>> that doing multiple locking roundtrips on the shared wqe lock for each >>> buffered write is wasteful. Additionally, buffered writes are hashed >>> work items, which means that any buffered write to a given file is >>> serialized. >>> >>> When looking for a new work item, build a chain of identicaly hashed >>> work items, and then hand back that batch. Until the batch is done, the >>> caller doesn't have to synchronize with the wqe or worker locks again. > > I have an idea, how to do it a bit better. Let me try it. The diff below is buggy (Ooopses somewhere in blk-mq for read-write.c:read_poll_link), I'll double check it on a fresh head. The idea is to keep same-hashed works continuously in @work_list, so they can be spliced in one go. For each hash bucket, I keep last added work - on enqueue, it adds a work after the last one - on dequeue it splices [first, last]. Where @first is the current one, because of how we traverse @work_list. It throws a bit of extra memory, but - removes extra looping - and also takes all hashed requests, but not only sequentially submitted e.g. for the following submission sequence, it will take all (hash=0) requests. REQ(hash=0) REQ(hash=1) REQ(hash=0) REQ() REQ(hash=0) ... Please, tell if you see a hole in the concept. And as said, there is still a bug somewhere. 
--- fs/io-wq.c | 49 +++++++++++++++++++++++++++++++++++++++++++------ fs/io-wq.h | 34 +++++++++++++++++++++++++++------- 2 files changed, 70 insertions(+), 13 deletions(-) diff --git a/fs/io-wq.c b/fs/io-wq.c index b3fb61ec0870..00d8f8b12df3 100644 --- a/fs/io-wq.c +++ b/fs/io-wq.c @@ -69,6 +69,8 @@ struct io_worker { #define IO_WQ_HASH_ORDER 5 #endif +#define IO_WQ_HASH_NR_BUCKETS (1u << IO_WQ_HASH_ORDER) + struct io_wqe_acct { unsigned nr_workers; unsigned max_workers; @@ -98,6 +100,7 @@ struct io_wqe { struct list_head all_list; struct io_wq *wq; + struct io_wq_work *last_hashed[IO_WQ_HASH_NR_BUCKETS]; }; /* @@ -400,7 +403,9 @@ static struct io_wq_work *io_get_next_work(struct io_wqe *wqe) hash = io_get_work_hash(work); if (!(wqe->hash_map & BIT(hash))) { wqe->hash_map |= BIT(hash); - wq_node_del(&wqe->work_list, node, prev); + wq_del_node_range(&wqe->work_list, + &wqe->last_hashed[hash]->list, prev); + wqe->last_hashed[hash] = NULL; return work; } } @@ -508,7 +513,11 @@ static void io_worker_handle_work(struct io_worker *worker) /* handle a whole dependent link */ do { - struct io_wq_work *old_work; + struct io_wq_work *old_work, *next_hashed = NULL; + + if (work->list.next) + next_hashed = container_of(work->list.next, + struct io_wq_work, list); io_impersonate_work(worker, work); /* @@ -523,12 +532,20 @@ static void io_worker_handle_work(struct io_worker *worker) work->func(&work); work = (old_work == work) ? 
NULL : work; - assign_work = work; - if (work && io_wq_is_hashed(work)) - assign_work = NULL; + assign_work = next_hashed; + if (!next_hashed && work && !io_wq_is_hashed(work)) + assign_work = work; + io_assign_current_work(worker, assign_work); wq->free_work(old_work); + if (next_hashed) { + if (work) + io_wqe_enqueue(wqe, work); + work = next_hashed; + continue; + } + if (work && !assign_work) { io_wqe_enqueue(wqe, work); work = NULL; @@ -776,6 +793,26 @@ static void io_run_cancel(struct io_wq_work *work, struct io_wqe *wqe) } while (work); } +static void io_wqe_insert_work(struct io_wqe *wqe, struct io_wq_work *work) +{ + int hash; + struct io_wq_work *last_hashed; + + if (!io_wq_is_hashed(work)) { +append: + wq_list_add_tail(&work->list, &wqe->work_list); + return; + } + + hash = io_get_work_hash(work); + last_hashed = wqe->last_hashed[hash]; + wqe->last_hashed[hash] = work; + if (!last_hashed) + goto append; + + wq_list_add_after(&work->list, &last_hashed->list, &wqe->work_list); +} + static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work) { struct io_wqe_acct *acct = io_work_get_acct(wqe, work); @@ -795,7 +832,7 @@ static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work) work_flags = work->flags; spin_lock_irqsave(&wqe->lock, flags); - wq_list_add_tail(&work->list, &wqe->work_list); + io_wqe_insert_work(wqe, work); wqe->flags &= ~IO_WQE_FLAG_STALLED; spin_unlock_irqrestore(&wqe->lock, flags); diff --git a/fs/io-wq.h b/fs/io-wq.h index 298b21f4a4d2..10367b6238da 100644 --- a/fs/io-wq.h +++ b/fs/io-wq.h @@ -40,17 +40,37 @@ static inline void wq_list_add_tail(struct io_wq_work_node *node, } } +static inline void wq_list_add_after(struct io_wq_work_node *node, + struct io_wq_work_node *pos, + struct io_wq_work_list *list) +{ + struct io_wq_work_node *next = pos->next; + + pos->next = node; + node->next = next; + if (!next) + list->last = node; +} + +static inline void wq_del_node_range(struct io_wq_work_list *list, + struct 
io_wq_work_node *last, + struct io_wq_work_node *prev) +{ + if (!prev) + WRITE_ONCE(list->first, last->next); + else + prev->next = last->next; + + if (last == list->last) + list->last = prev; + last->next = NULL; +} + static inline void wq_node_del(struct io_wq_work_list *list, struct io_wq_work_node *node, struct io_wq_work_node *prev) { - if (node == list->first) - WRITE_ONCE(list->first, node->next); - if (node == list->last) - list->last = prev; - if (prev) - prev->next = node->next; - node->next = NULL; + wq_del_node_range(list, node, prev); } #define wq_list_for_each(pos, prv, head) \ -- 2.24.0 [-- Attachment #2: OpenPGP digital signature --] [-- Type: application/pgp-signature, Size: 833 bytes --]
On 3/22/20 12:54 PM, Pavel Begunkov wrote: > On 22/03/2020 19:24, Pavel Begunkov wrote: >> On 22/03/2020 19:09, Pavel Begunkov wrote: >>> On 19/03/2020 21:56, Jens Axboe wrote: >>>> We always punt async buffered writes to an io-wq helper, as the core >>>> kernel does not have IOCB_NOWAIT support for that. Most buffered async >>>> writes complete very quickly, as it's just a copy operation. This means >>>> that doing multiple locking roundtrips on the shared wqe lock for each >>>> buffered write is wasteful. Additionally, buffered writes are hashed >>>> work items, which means that any buffered write to a given file is >>>> serialized. >>>> >>>> When looking for a new work item, build a chain of identicaly hashed >>>> work items, and then hand back that batch. Until the batch is done, the >>>> caller doesn't have to synchronize with the wqe or worker locks again. >> >> I have an idea, how to do it a bit better. Let me try it. > > The diff below is buggy (Ooopses somewhere in blk-mq for > read-write.c:read_poll_link), I'll double check it on a fresh head. Are you running for-5.7/io_uring merged with master? If not you're missing: commit f1d96a8fcbbbb22d4fbc1d69eaaa678bbb0ff6e2 Author: Pavel Begunkov <asml.silence@gmail.com> Date: Fri Mar 13 22:29:14 2020 +0300 io_uring: NULL-deref for IOSQE_{ASYNC,DRAIN} which is what I ran into as well last week... > The idea is to keep same-hashed works continuously in @work_list, so > they can be spliced in one go. For each hash bucket, I keep last added > work > - on enqueue, it adds a work after the last one > - on dequeue it splices [first, last]. Where @first is the current > one, because > > of how we traverse @work_list. > > > It throws a bit of extra memory, but > - removes extra looping > - and also takes all hashed requests, but not only sequentially > submitted > > e.g. for the following submission sequence, it will take all (hash=0) > requests. > REQ(hash=0) > REQ(hash=1) > REQ(hash=0) > REQ() > REQ(hash=0) > ... 
> > > Please, tell if you see a hole in the concept. And as said, there is > still a bug somewhere. The extra memory isn't a bit deal, it's very minor. My main concern would be fairness, since we'd then be grabbing non-contig hashed chunks, before we did not. May not be a concern as long as we ensure the non-hasned (and differently hashed) work can proceed in parallel. For my end, I deliberately added: + /* already have hashed work, let new worker get this */ + if (ret) { + struct io_wqe_acct *acct; + + /* get new worker for unhashed, if none now */ + acct = io_work_get_acct(wqe, work); + if (!atomic_read(&acct->nr_running)) + io_wqe_wake_worker(wqe, acct); + break; + } to try and improve that. I'll run a quick test with yours. -- Jens Axboe
On 3/22/20 1:51 PM, Jens Axboe wrote:
>> Please, tell if you see a hole in the concept. And as said, there is
>> still a bug somewhere.
One quick guess would be that you're wanting to use both work->list and
work->data, the latter is used by links which is where you are crashing.
Didn't check your list management yet so don't know if that's the case,
but if you're still on the list (or manipulating it after), then that
would be a bug.
--
Jens Axboe
On 3/22/20 2:05 PM, Jens Axboe wrote:
> On 3/22/20 1:51 PM, Jens Axboe wrote:
>>> Please, tell if you see a hole in the concept. And as said, there is
>>> still a bug somewhere.
>
> One quick guess would be that you're wanting to use both work->list and
> work->data, the latter is used by links which is where you are crashing.
> Didn't check your list management yet so don't know if that's the case,
> but if you're still on the list (or manipulating it after), then that
> would be a bug.
IOW, by the time you do work->func(&work), the item must be off the
list. Does indeed look like that's exactly the bug you have.
--
Jens Axboe
[-- Attachment #1.1: Type: text/plain, Size: 944 bytes --] On 22/03/2020 23:15, Jens Axboe wrote: > On 3/22/20 2:05 PM, Jens Axboe wrote: >> On 3/22/20 1:51 PM, Jens Axboe wrote: >>>> Please, tell if you see a hole in the concept. And as said, there is >>>> still a bug somewhere. >> >> One quick guess would be that you're wanting to use both work->list and >> work->data, the latter is used by links which is where you are crashing. >> Didn't check your list management yet so don't know if that's the case, >> but if you're still on the list (or manipulating it after), then that >> would be a bug. > > IOW, by the time you do work->func(&work), the item must be off the > list. Does indeed look like that's exactly the bug you have. > Good guess. I made sure to grab next before ->func(), see next_hashed. And it's not in @work_list, because io_get_next_work() removes it. However, somebody may expect it to be NULL or something. Thanks! I'll check it -- Pavel Begunkov [-- Attachment #2: OpenPGP digital signature --] [-- Type: application/pgp-signature, Size: 833 bytes --]
[-- Attachment #1.1: Type: text/plain, Size: 1432 bytes --] On 22/03/2020 22:51, Jens Axboe wrote: > commit f1d96a8fcbbbb22d4fbc1d69eaaa678bbb0ff6e2 > Author: Pavel Begunkov <asml.silence@gmail.com> > Date: Fri Mar 13 22:29:14 2020 +0300 > > io_uring: NULL-deref for IOSQE_{ASYNC,DRAIN} > > which is what I ran into as well last week... I picked it before testing > The extra memory isn't a bit deal, it's very minor. My main concern > would be fairness, since we'd then be grabbing non-contig hashed chunks, > before we did not. May not be a concern as long as we ensure the > non-hasned (and differently hashed) work can proceed in parallel. For my > end, I deliberately added: Don't think it's really a problem, all ordering/scheduling is up to users (i.e. io_uring), and it can't infinitely postpone a work, because it's processing spliced requests without taking more, even if new ones hash to the same bit. > + /* already have hashed work, let new worker get this */ > + if (ret) { > + struct io_wqe_acct *acct; > + > + /* get new worker for unhashed, if none now */ > + acct = io_work_get_acct(wqe, work); > + if (!atomic_read(&acct->nr_running)) > + io_wqe_wake_worker(wqe, acct); > + break; > + } > > to try and improve that. Is there performance problems with your patch without this chunk? I may see another problem with yours, I need to think it through. > > I'll run a quick test with yours. -- Pavel Begunkov [-- Attachment #2: OpenPGP digital signature --] [-- Type: application/pgp-signature, Size: 833 bytes --]
[-- Attachment #1.1: Type: text/plain, Size: 1919 bytes --] On 19/03/2020 21:56, Jens Axboe wrote: > if (hash != -1U) { > + /* > + * If the local list is non-empty, then we > + * have work that hashed to the same key. > + * No need for a lock round-trip, or fiddling > + * the the free/busy state of the worker, or > + * clearing the hashed state. Just process the > + * next one. > + */ > + if (!work) { > + work = wq_first_entry(&list, > + struct io_wq_work, > + list); > + if (work) { > + wq_node_del(&list, &work->list); > + goto got_work; > + } > + } > + > spin_lock_irq(&wqe->lock); > wqe->hash_map &= ~BIT_ULL(hash); > wqe->flags &= ~IO_WQE_FLAG_STALLED; Let's have an example, where "->" is a link req0(hash=0) -> req1(not_hashed) req2(hash=0) 1. it grabs @req0 (@work=@req0) and @req1 (in the local @list) 2. it do @req0->func(), sets @work=@req1 and goes to the hash updating code (see above). 3. ```if (!work)``` check fails, and it clears @hash_map, even though there is one of the same hash in the list. It messes up @hash_map accounting, but probably even can continue working fine. 4. Next, it goes for the second iteration (work == req1), do ->func(). Then checks @hash, which is -1 for non-hashed req1, and exits leaving req2 in the @list. Can miss something by looking at diff only, but there are 2 potential points to fix. BTW, yours patch executes all linked requests first and then goes to the next hashed. Mine do hashed first, and re-enqueue linked requests. The downside in my version is the extra re-enqueue. And your approach can't do some cases in parallel, e.g. the following is purely sequential: req0(hash0) -> ... long link -> req1(hash0) -> ... long link -> req2(hash0) -> ... long link -> req3(hash0) -> ... long link -> It's not hard to convert, though -- Pavel Begunkov [-- Attachment #2: OpenPGP digital signature --] [-- Type: application/pgp-signature, Size: 833 bytes --]
[-- Attachment #1.1: Type: text/plain, Size: 1118 bytes --] On 22/03/2020 23:20, Pavel Begunkov wrote: > On 22/03/2020 23:15, Jens Axboe wrote: >> On 3/22/20 2:05 PM, Jens Axboe wrote: >>> On 3/22/20 1:51 PM, Jens Axboe wrote: >>>>> Please, tell if you see a hole in the concept. And as said, there is >>>>> still a bug somewhere. >>> >>> One quick guess would be that you're wanting to use both work->list and >>> work->data, the latter is used by links which is where you are crashing. >>> Didn't check your list management yet so don't know if that's the case, >>> but if you're still on the list (or manipulating it after), then that >>> would be a bug. >> >> IOW, by the time you do work->func(&work), the item must be off the >> list. Does indeed look like that's exactly the bug you have. >> > > Good guess. I made sure to grab next before ->func(), see next_hashed. And it's > not in @work_list, because io_get_next_work() removes it. However, somebody may > expect it to be NULL or something. Thanks! I'll check it You're right. Re-enqueue corrupts ->data, so should be fixed for previous patches as well. I'll send it -- Pavel Begunkov [-- Attachment #2: OpenPGP digital signature --] [-- Type: application/pgp-signature, Size: 833 bytes --]
[-- Attachment #1.1: Type: text/plain, Size: 1245 bytes --] On 23/03/2020 00:16, Pavel Begunkov wrote: > On 22/03/2020 23:20, Pavel Begunkov wrote: >> On 22/03/2020 23:15, Jens Axboe wrote: >>> On 3/22/20 2:05 PM, Jens Axboe wrote: >>>> On 3/22/20 1:51 PM, Jens Axboe wrote: >>>>>> Please, tell if you see a hole in the concept. And as said, there is >>>>>> still a bug somewhere. >>>> >>>> One quick guess would be that you're wanting to use both work->list and >>>> work->data, the latter is used by links which is where you are crashing. >>>> Didn't check your list management yet so don't know if that's the case, >>>> but if you're still on the list (or manipulating it after), then that >>>> would be a bug. >>> >>> IOW, by the time you do work->func(&work), the item must be off the >>> list. Does indeed look like that's exactly the bug you have. >>> >> >> Good guess. I made sure to grab next before ->func(), see next_hashed. And it's >> not in @work_list, because io_get_next_work() removes it. However, somebody may >> expect it to be NULL or something. Thanks! I'll check it > > You're right. Re-enqueue corrupts ->data, so should be fixed for previous > patches as well. I'll send it > And the diff works fine if put on top of the fix! -- Pavel Begunkov [-- Attachment #2: OpenPGP digital signature --] [-- Type: application/pgp-signature, Size: 833 bytes --]
On 3/22/20 2:25 PM, Pavel Begunkov wrote: > On 22/03/2020 22:51, Jens Axboe wrote: >> commit f1d96a8fcbbbb22d4fbc1d69eaaa678bbb0ff6e2 >> Author: Pavel Begunkov <asml.silence@gmail.com> >> Date: Fri Mar 13 22:29:14 2020 +0300 >> >> io_uring: NULL-deref for IOSQE_{ASYNC,DRAIN} >> >> which is what I ran into as well last week... > > I picked it before testing > >> The extra memory isn't a big deal, it's very minor. My main concern >> would be fairness, since we'd then be grabbing non-contig hashed chunks, >> before we did not. May not be a concern as long as we ensure the >> non-hashed (and differently hashed) work can proceed in parallel. For my >> end, I deliberately added: > > Don't think it's really a problem, all ordering/scheduling is up to > users (i.e. io_uring), and it can't infinitely postpone a work, > because it's processing spliced requests without taking more, even if > new ones hash to the same bit. I don't disagree with you, just wanted to bring it up! >> + /* already have hashed work, let new worker get this */ >> + if (ret) { >> + struct io_wqe_acct *acct; >> + >> + /* get new worker for unhashed, if none now */ >> + acct = io_work_get_acct(wqe, work); >> + if (!atomic_read(&acct->nr_running)) >> + io_wqe_wake_worker(wqe, acct); >> + break; >> + } >> >> to try and improve that. > > Are there performance problems with your patch without this chunk? I > may see another problem with yours, I need to think it through. No, and in fact it probably should be a separate thing, but I kind of like your approach so not moving forward with mine. I do think it's worth looking into separately, as there's no reason why we can't wake a non-hashed worker if we're just doing hashed work from the existing thread. If that thread is just doing copies and not blocking, the unhashed (or next hashed) work is just sitting idle while it could be running instead. Hence I added that hunk, to kick a new worker to proceed in parallel. -- Jens Axboe
[-- Attachment #1.1: Type: text/plain, Size: 2508 bytes --] On 23/03/2020 04:37, Jens Axboe wrote: > On 3/22/20 2:25 PM, Pavel Begunkov wrote: >> On 22/03/2020 22:51, Jens Axboe wrote: >>> commit f1d96a8fcbbbb22d4fbc1d69eaaa678bbb0ff6e2 >>> Author: Pavel Begunkov <asml.silence@gmail.com> >>> Date: Fri Mar 13 22:29:14 2020 +0300 >>> >>> io_uring: NULL-deref for IOSQE_{ASYNC,DRAIN} >>> >>> which is what I ran into as well last week... >> >> I picked it before testing >> >>> The extra memory isn't a big deal, it's very minor. My main concern >>> would be fairness, since we'd then be grabbing non-contig hashed chunks, >>> before we did not. May not be a concern as long as we ensure the >>> non-hashed (and differently hashed) work can proceed in parallel. For my >>> end, I deliberately added: >> >> Don't think it's really a problem, all ordering/scheduling is up to >> users (i.e. io_uring), and it can't infinitely postpone a work, >> because it's processing spliced requests without taking more, even if >> new ones hash to the same bit. > > I don't disagree with you, just wanted to bring it up! Sure, there is a lot to think about. E.g. I don't like this reenqueueing, and if all other threads have enough work to do, then it can be avoided, but don't want to over-complicate. > >>> + /* already have hashed work, let new worker get this */ >>> + if (ret) { >>> + struct io_wqe_acct *acct; >>> + >>> + /* get new worker for unhashed, if none now */ >>> + acct = io_work_get_acct(wqe, work); >>> + if (!atomic_read(&acct->nr_running)) >>> + io_wqe_wake_worker(wqe, acct); >>> + break; >>> + } >>> >>> to try and improve that. >> >> Are there performance problems with your patch without this chunk? I >> may see another problem with yours, I need to think it through. > > No, and in fact it probably should be a separate thing, but I kind of > like your approach so not moving forward with mine. 
I do think it's > worth looking into separately, as there's no reason why we can't wake a > non-hashed worker if we're just doing hashed work from the existing > thread. If that thread is just doing copies and not blocking, the > unhashed (or next hashed) work is just sitting idle while it could be > running instead. Then, I'll clean the diff, hopefully soon. Could I steal parts of your patch description? > > Hence I added that hunk, to kick a new worker to proceed in parallel. It seems, I need to take a closer look at this accounting in general. -- Pavel Begunkov [-- Attachment #2: OpenPGP digital signature --] [-- Type: application/pgp-signature, Size: 833 bytes --]
On 3/23/20 2:38 AM, Pavel Begunkov wrote: >> No, and in fact it probably should be a separate thing, but I kind of >> like your approach so not moving forward with mine. I do think it's >> worth looking into separately, as there's no reason why we can't wake a >> non-hashed worker if we're just doing hashed work from the existing >> thread. If that thread is just doing copies and not blocking, the >> unhashed (or next hashed) work is just sitting idle while it could be >> running instead. > > Then, I'll clean the diff, hopefully soon. Could I steal parts of your patch > description? Of course, go ahead. >> Hence I added that hunk, to kick a new worker to proceed in parallel. > > It seems, I need to take a closer look at this accounting in general. Agree, I think we have some room for improvement there. -- Jens Axboe
We always punt async buffered writes to an io-wq helper, as the core kernel does not have IOCB_NOWAIT support for that. Most buffered async writes complete very quickly, as it's just a copy operation. This means that doing multiple locking roundtrips on the shared wqe lock for each buffered write is wasteful. Additionally, buffered writes are hashed work items, which means that any buffered write to a given file is serialized. Keep identicaly hashed work items contiguously in @wqe->work_list, and track a tail for each hash bucket. On dequeue of a hashed item, splice all of the same hash in one go using the tracked tail. Until the batch is done, the caller doesn't have to synchronize with the wqe or worker locks again. Signed-off-by: Pavel Begunkov <asml.silence@gmail.com> --- v2 (since diff): - add wq_next_work() helper - rename list helpers to match list.h naming - reshuffle io_worker_handle_work() bits, but keeping functionally identical fs/io-wq.c | 68 ++++++++++++++++++++++++++++++++++++++---------------- fs/io-wq.h | 45 +++++++++++++++++++++++++++++------- 2 files changed, 85 insertions(+), 28 deletions(-) diff --git a/fs/io-wq.c b/fs/io-wq.c index b3fb61ec0870..cc5cf2209fb0 100644 --- a/fs/io-wq.c +++ b/fs/io-wq.c @@ -69,6 +69,8 @@ struct io_worker { #define IO_WQ_HASH_ORDER 5 #endif +#define IO_WQ_NR_HASH_BUCKETS (1u << IO_WQ_HASH_ORDER) + struct io_wqe_acct { unsigned nr_workers; unsigned max_workers; @@ -98,6 +100,7 @@ struct io_wqe { struct list_head all_list; struct io_wq *wq; + struct io_wq_work *hash_tail[IO_WQ_NR_HASH_BUCKETS]; }; /* @@ -384,7 +387,7 @@ static struct io_wq_work *io_get_next_work(struct io_wqe *wqe) __must_hold(wqe->lock) { struct io_wq_work_node *node, *prev; - struct io_wq_work *work; + struct io_wq_work *work, *tail; unsigned int hash; wq_list_for_each(node, prev, &wqe->work_list) { @@ -392,7 +395,7 @@ static struct io_wq_work *io_get_next_work(struct io_wqe *wqe) /* not hashed, can run anytime */ if (!io_wq_is_hashed(work)) { - 
wq_node_del(&wqe->work_list, node, prev); + wq_list_del(&wqe->work_list, node, prev); return work; } @@ -400,7 +403,10 @@ static struct io_wq_work *io_get_next_work(struct io_wqe *wqe) hash = io_get_work_hash(work); if (!(wqe->hash_map & BIT(hash))) { wqe->hash_map |= BIT(hash); - wq_node_del(&wqe->work_list, node, prev); + /* all items with this hash lie in [work, tail] */ + tail = wqe->hash_tail[hash]; + wqe->hash_tail[hash] = NULL; + wq_list_cut(&wqe->work_list, &tail->list, prev); return work; } } @@ -485,7 +491,7 @@ static void io_worker_handle_work(struct io_worker *worker) struct io_wq *wq = wqe->wq; do { - struct io_wq_work *work, *assign_work; + struct io_wq_work *work; unsigned int hash; get_next: /* @@ -508,8 +514,9 @@ static void io_worker_handle_work(struct io_worker *worker) /* handle a whole dependent link */ do { - struct io_wq_work *old_work; + struct io_wq_work *old_work, *next_hashed, *linked; + next_hashed = wq_next_work(work); io_impersonate_work(worker, work); /* * OK to set IO_WQ_WORK_CANCEL even for uncancellable @@ -518,22 +525,23 @@ static void io_worker_handle_work(struct io_worker *worker) if (test_bit(IO_WQ_BIT_CANCEL, &wq->state)) work->flags |= IO_WQ_WORK_CANCEL; - old_work = work; hash = io_get_work_hash(work); - work->func(&work); - work = (old_work == work) ? NULL : work; - - assign_work = work; - if (work && io_wq_is_hashed(work)) - assign_work = NULL; - io_assign_current_work(worker, assign_work); + linked = old_work = work; + linked->func(&linked); + linked = (old_work == linked) ? 
NULL : linked; + + work = next_hashed; + if (!work && linked && !io_wq_is_hashed(linked)) { + work = linked; + linked = NULL; + } + io_assign_current_work(worker, work); wq->free_work(old_work); - if (work && !assign_work) { - io_wqe_enqueue(wqe, work); - work = NULL; - } - if (hash != -1U) { + if (linked) + io_wqe_enqueue(wqe, linked); + + if (hash != -1U && !next_hashed) { spin_lock_irq(&wqe->lock); wqe->hash_map &= ~BIT_ULL(hash); wqe->flags &= ~IO_WQE_FLAG_STALLED; @@ -776,6 +784,26 @@ static void io_run_cancel(struct io_wq_work *work, struct io_wqe *wqe) } while (work); } +static void io_wqe_insert_work(struct io_wqe *wqe, struct io_wq_work *work) +{ + unsigned int hash; + struct io_wq_work *tail; + + if (!io_wq_is_hashed(work)) { +append: + wq_list_add_tail(&work->list, &wqe->work_list); + return; + } + + hash = io_get_work_hash(work); + tail = wqe->hash_tail[hash]; + wqe->hash_tail[hash] = work; + if (!tail) + goto append; + + wq_list_add_after(&work->list, &tail->list, &wqe->work_list); +} + static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work) { struct io_wqe_acct *acct = io_work_get_acct(wqe, work); @@ -795,7 +823,7 @@ static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work) work_flags = work->flags; spin_lock_irqsave(&wqe->lock, flags); - wq_list_add_tail(&work->list, &wqe->work_list); + io_wqe_insert_work(wqe, work); wqe->flags &= ~IO_WQE_FLAG_STALLED; spin_unlock_irqrestore(&wqe->lock, flags); @@ -914,7 +942,7 @@ static enum io_wq_cancel io_wqe_cancel_work(struct io_wqe *wqe, work = container_of(node, struct io_wq_work, list); if (match->fn(work, match->data)) { - wq_node_del(&wqe->work_list, node, prev); + wq_list_del(&wqe->work_list, node, prev); found = true; break; } diff --git a/fs/io-wq.h b/fs/io-wq.h index d2a5684bf673..3ee7356d6be5 100644 --- a/fs/io-wq.h +++ b/fs/io-wq.h @@ -28,6 +28,18 @@ struct io_wq_work_list { struct io_wq_work_node *last; }; +static inline void wq_list_add_after(struct io_wq_work_node 
*node, + struct io_wq_work_node *pos, + struct io_wq_work_list *list) +{ + struct io_wq_work_node *next = pos->next; + + pos->next = node; + node->next = next; + if (!next) + list->last = node; +} + static inline void wq_list_add_tail(struct io_wq_work_node *node, struct io_wq_work_list *list) { @@ -40,17 +52,26 @@ static inline void wq_list_add_tail(struct io_wq_work_node *node, } } -static inline void wq_node_del(struct io_wq_work_list *list, - struct io_wq_work_node *node, +static inline void wq_list_cut(struct io_wq_work_list *list, + struct io_wq_work_node *last, struct io_wq_work_node *prev) { - if (node == list->first) - WRITE_ONCE(list->first, node->next); - if (node == list->last) + /* first in the list, if prev==NULL */ + if (!prev) + WRITE_ONCE(list->first, last->next); + else + prev->next = last->next; + + if (last == list->last) list->last = prev; - if (prev) - prev->next = node->next; - node->next = NULL; + last->next = NULL; +} + +static inline void wq_list_del(struct io_wq_work_list *list, + struct io_wq_work_node *node, + struct io_wq_work_node *prev) +{ + wq_list_cut(list, node, prev); } #define wq_list_for_each(pos, prv, head) \ @@ -78,6 +99,14 @@ struct io_wq_work { *(work) = (struct io_wq_work){ .func = _func }; \ } while (0) \ +static inline struct io_wq_work *wq_next_work(struct io_wq_work *work) +{ + if (!work->list.next) + return NULL; + + return container_of(work->list.next, struct io_wq_work, list); +} + typedef void (free_work_fn)(struct io_wq_work *); struct io_wq_data { -- 2.24.0
On 3/23/20 1:57 PM, Pavel Begunkov wrote:
> We always punt async buffered writes to an io-wq helper, as the core
> kernel does not have IOCB_NOWAIT support for that. Most buffered async
> writes complete very quickly, as it's just a copy operation. This means
> that doing multiple locking roundtrips on the shared wqe lock for each
> buffered write is wasteful. Additionally, buffered writes are hashed
> work items, which means that any buffered write to a given file is
> serialized.
>
> Keep identically hashed work items contiguously in @wqe->work_list, and
> track a tail for each hash bucket. On dequeue of a hashed item, splice
> all of the same hash in one go using the tracked tail. Until the batch
> is done, the caller doesn't have to synchronize with the wqe or worker
> locks again.
Looks good to me, and also passes testing. I've applied this for 5.7.
Next we can start looking into cases where it'd be an improvement
to kick off another worker. Say if we still have work after grabbing
a chain, would probably not be a bad idea.
--
Jens Axboe