* [PATCH v2] io-wq: handle hashed writes in chains
@ 2020-03-19 18:56 Jens Axboe
  2020-03-22 16:09 ` Pavel Begunkov
  2020-03-22 20:56 ` Pavel Begunkov
  0 siblings, 2 replies; 20+ messages in thread
From: Jens Axboe @ 2020-03-19 18:56 UTC (permalink / raw)
  To: io-uring

We always punt async buffered writes to an io-wq helper, as the core
kernel does not have IOCB_NOWAIT support for that. Most buffered async
writes complete very quickly, as it's just a copy operation. This means
that doing multiple locking roundtrips on the shared wqe lock for each
buffered write is wasteful. Additionally, buffered writes are hashed
work items, which means that any buffered write to a given file is
serialized.

When looking for a new work item, build a chain of identically hashed
work items, and then hand back that batch. Until the batch is done, the
caller doesn't have to synchronize with the wqe or worker locks again.

Signed-off-by: Jens Axboe <axboe@kernel.dk>

---

Changes:
- Don't overwrite passed back work
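
For illustration only, here is a minimal userspace sketch of the batching
idea from the commit message, with toy types rather than the io-wq
structures (grab_batch() stands in for what io_get_next_work() does under
wqe->lock in the patch below; the names are made up for the example):

/*
 * One pass over the shared queue detaches the first hashed item plus any
 * directly following items with the same hash, so the worker only takes
 * the shared lock once per batch.
 */
#include <stdio.h>

struct work {
	int hash;		/* -1 means "not hashed" */
	struct work *next;
};

/* Pop the head and every directly following work with the same hash. */
static struct work *grab_batch(struct work **queue)
{
	struct work *head = *queue, *tail;

	if (!head)
		return NULL;
	tail = head;
	/* unhashed work is taken one item at a time in this sketch */
	while (head->hash != -1 && tail->next && tail->next->hash == head->hash)
		tail = tail->next;
	*queue = tail->next;	/* detach [head, tail] from the shared queue */
	tail->next = NULL;
	return head;
}

int main(void)
{
	struct work items[4] = {
		{ .hash = 3 }, { .hash = 3 }, { .hash = 3 }, { .hash = 7 },
	};
	struct work *queue = &items[0], *batch;

	items[0].next = &items[1];
	items[1].next = &items[2];
	items[2].next = &items[3];

	batch = grab_batch(&queue);		/* would run under wqe->lock */
	for (; batch; batch = batch->next)	/* processed without the lock */
		printf("running work, hash %d\n", batch->hash);
	printf("queue head is now hash %d\n", queue->hash);
	return 0;
}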


diff --git a/fs/io-wq.c b/fs/io-wq.c
index 9541df2729de..8402c6e417e1 100644
--- a/fs/io-wq.c
+++ b/fs/io-wq.c
@@ -380,32 +380,65 @@ static inline unsigned int io_get_work_hash(struct io_wq_work *work)
 	return work->flags >> IO_WQ_HASH_SHIFT;
 }
 
-static struct io_wq_work *io_get_next_work(struct io_wqe *wqe)
+/*
+ * Returns the next work item to process, if any. For hashed work that hashes
+ * to the same key, we can't process N+1 before N is done. To make the
+ * processing more efficient, return N+1 and later identically hashed work
+ * in the passed in list. This avoids repeated hammering on the wqe lock, as
+ * the caller can just process items in the on-stack list.
+ */
+static struct io_wq_work *io_get_next_work(struct io_wqe *wqe,
+					   struct io_wq_work_list *list)
 	__must_hold(wqe->lock)
 {
-	struct io_wq_work_node *node, *prev;
-	struct io_wq_work *work;
-	unsigned int hash;
+	struct io_wq_work *ret = NULL;
 
-	wq_list_for_each(node, prev, &wqe->work_list) {
-		work = container_of(node, struct io_wq_work, list);
+	do {
+		unsigned int new_hash, hash;
+		struct io_wq_work *work;
+
+		work = wq_first_entry(&wqe->work_list, struct io_wq_work, list);
+		if (!work)
+			break;
 
 		/* not hashed, can run anytime */
 		if (!io_wq_is_hashed(work)) {
-			wq_node_del(&wqe->work_list, node, prev);
-			return work;
+			/* already have hashed work, let new worker get this */
+			if (ret) {
+				struct io_wqe_acct *acct;
+
+				/* get new worker for unhashed, if none now */
+				acct = io_work_get_acct(wqe, work);
+				if (!atomic_read(&acct->nr_running))
+					io_wqe_wake_worker(wqe, acct);
+				break;
+			}
+			wq_node_del(&wqe->work_list, &work->list);
+			ret = work;
+			break;
 		}
 
 		/* hashed, can run if not already running */
-		hash = io_get_work_hash(work);
-		if (!(wqe->hash_map & BIT(hash))) {
+		new_hash = io_get_work_hash(work);
+		if (wqe->hash_map & BIT(new_hash))
+			break;
+
+		if (!ret) {
+			hash = new_hash;
 			wqe->hash_map |= BIT(hash);
-			wq_node_del(&wqe->work_list, node, prev);
-			return work;
+		} else if (hash != new_hash) {
+			break;
 		}
-	}
 
-	return NULL;
+		wq_node_del(&wqe->work_list, &work->list);
+		/* return first node, add subsequent same hash to the list */
+		if (!ret)
+			ret = work;
+		else
+			wq_list_add_tail(&work->list, list);
+	} while (1);
+
+	return ret;
 }
 
 static void io_wq_switch_mm(struct io_worker *worker, struct io_wq_work *work)
@@ -481,6 +514,7 @@ static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work);
 static void io_worker_handle_work(struct io_worker *worker)
 	__releases(wqe->lock)
 {
+	struct io_wq_work_list list = { .first = NULL, .last = NULL };
 	struct io_wqe *wqe = worker->wqe;
 	struct io_wq *wq = wqe->wq;
 
@@ -495,7 +529,7 @@ static void io_worker_handle_work(struct io_worker *worker)
 		 * can't make progress, any work completion or insertion will
 		 * clear the stalled flag.
 		 */
-		work = io_get_next_work(wqe);
+		work = io_get_next_work(wqe, &list);
 		if (work)
 			__io_worker_busy(wqe, worker, work);
 		else if (!wq_list_empty(&wqe->work_list))
@@ -504,6 +538,7 @@ static void io_worker_handle_work(struct io_worker *worker)
 		spin_unlock_irq(&wqe->lock);
 		if (!work)
 			break;
+got_work:
 		io_assign_current_work(worker, work);
 
 		/* handle a whole dependent link */
@@ -530,6 +565,24 @@ static void io_worker_handle_work(struct io_worker *worker)
 				work = NULL;
 			}
 			if (hash != -1U) {
+				/*
+				 * If the local list is non-empty, then we
+				 * have work that hashed to the same key.
+				 * No need for a lock round-trip, or fiddling
+				 * the free/busy state of the worker, or
+				 * clearing the hashed state. Just process the
+				 * next one.
+				 */
+				if (!work) {
+					work = wq_first_entry(&list,
+							      struct io_wq_work,
+							      list);
+					if (work) {
+						wq_node_del(&list, &work->list);
+						goto got_work;
+					}
+				}
+
 				spin_lock_irq(&wqe->lock);
 				wqe->hash_map &= ~BIT_ULL(hash);
 				wqe->flags &= ~IO_WQE_FLAG_STALLED;
@@ -910,7 +963,7 @@ static enum io_wq_cancel io_wqe_cancel_work(struct io_wqe *wqe,
 		work = container_of(node, struct io_wq_work, list);
 
 		if (match->fn(work, match->data)) {
-			wq_node_del(&wqe->work_list, node, prev);
+			__wq_node_del(&wqe->work_list, node, prev);
 			found = true;
 			break;
 		}
diff --git a/fs/io-wq.h b/fs/io-wq.h
index 298b21f4a4d2..9a194339bd9d 100644
--- a/fs/io-wq.h
+++ b/fs/io-wq.h
@@ -40,9 +40,9 @@ static inline void wq_list_add_tail(struct io_wq_work_node *node,
 	}
 }
 
-static inline void wq_node_del(struct io_wq_work_list *list,
-			       struct io_wq_work_node *node,
-			       struct io_wq_work_node *prev)
+static inline void __wq_node_del(struct io_wq_work_list *list,
+				struct io_wq_work_node *node,
+				struct io_wq_work_node *prev)
 {
 	if (node == list->first)
 		WRITE_ONCE(list->first, node->next);
@@ -53,6 +53,21 @@ static inline void wq_node_del(struct io_wq_work_list *list,
 	node->next = NULL;
 }
 
+
+static inline void wq_node_del(struct io_wq_work_list *list,
+			       struct io_wq_work_node *node)
+{
+	__wq_node_del(list, node, NULL);
+}
+
+#define wq_first_entry(list, type, member)				\
+({									\
+	struct io_wq_work *__work = NULL;				\
+	if (!wq_list_empty((list)))					\
+		__work = container_of((list)->first, type, member);	\
+	__work;								\
+})
+
 #define wq_list_for_each(pos, prv, head)			\
 	for (pos = (head)->first, prv = NULL; pos; prv = pos, pos = (pos)->next)
 
-- 
Jens Axboe



* Re: [PATCH v2] io-wq: handle hashed writes in chains
  2020-03-19 18:56 [PATCH v2] io-wq: handle hashed writes in chains Jens Axboe
@ 2020-03-22 16:09 ` Pavel Begunkov
  2020-03-22 16:24   ` Pavel Begunkov
  2020-03-22 17:08   ` Jens Axboe
  2020-03-22 20:56 ` Pavel Begunkov
  1 sibling, 2 replies; 20+ messages in thread
From: Pavel Begunkov @ 2020-03-22 16:09 UTC (permalink / raw)
  To: Jens Axboe, io-uring



On 19/03/2020 21:56, Jens Axboe wrote:
> We always punt async buffered writes to an io-wq helper, as the core
> kernel does not have IOCB_NOWAIT support for that. Most buffered async
> writes complete very quickly, as it's just a copy operation. This means
> that doing multiple locking roundtrips on the shared wqe lock for each
> buffered write is wasteful. Additionally, buffered writes are hashed
> work items, which means that any buffered write to a given file is
> serialized.
> 
> When looking for a new work item, build a chain of identicaly hashed
> work items, and then hand back that batch. Until the batch is done, the
> caller doesn't have to synchronize with the wqe or worker locks again.
> 
> Signed-off-by: Jens Axboe <axboe@kernel.dk>
> 
> ---
> 
> Changes:
> - Don't overwrite passed back work
> 
> 
> diff --git a/fs/io-wq.c b/fs/io-wq.c
> index 9541df2729de..8402c6e417e1 100644
> --- a/fs/io-wq.c
> +++ b/fs/io-wq.c
> @@ -380,32 +380,65 @@ static inline unsigned int io_get_work_hash(struct io_wq_work *work)
>  	return work->flags >> IO_WQ_HASH_SHIFT;
>  }
>  
> -static struct io_wq_work *io_get_next_work(struct io_wqe *wqe)
> +/*
> + * Returns the next work item to process, if any. For hashed work that hash
> + * to the same key, we can't process N+1 before N is done. To make the
> + * processing more efficient, return N+1 and later identically hashed work
> + * in the passed in list. This avoids repeated hammering on the wqe lock for,
> + * as the caller can just process items in the on-stack list.
> + */
> +static struct io_wq_work *io_get_next_work(struct io_wqe *wqe,
> +					   struct io_wq_work_list *list)
>  	__must_hold(wqe->lock)
>  {
> -	struct io_wq_work_node *node, *prev;
> -	struct io_wq_work *work;
> -	unsigned int hash;
> +	struct io_wq_work *ret = NULL;
>  
> -	wq_list_for_each(node, prev, &wqe->work_list) {
> -		work = container_of(node, struct io_wq_work, list);
> +	do {
> +		unsigned int new_hash, hash;
> +		struct io_wq_work *work;
> +
> +		work = wq_first_entry(&wqe->work_list, struct io_wq_work, list);
> +		if (!work)
> +			break;
>  
>  		/* not hashed, can run anytime */
>  		if (!io_wq_is_hashed(work)) {
> -			wq_node_del(&wqe->work_list, node, prev);
> -			return work;
> +			/* already have hashed work, let new worker get this */
> +			if (ret) {
> +				struct io_wqe_acct *acct;
> +
> +				/* get new worker for unhashed, if none now */
> +				acct = io_work_get_acct(wqe, work);
> +				if (!atomic_read(&acct->nr_running))
> +					io_wqe_wake_worker(wqe, acct);
> +				break;
> +			}
> +			wq_node_del(&wqe->work_list, &work->list);
> +			ret = work;
> +			break;
>  		}
>  
>  		/* hashed, can run if not already running */
> -		hash = io_get_work_hash(work);
> -		if (!(wqe->hash_map & BIT(hash))) {
> +		new_hash = io_get_work_hash(work);
> +		if (wqe->hash_map & BIT(new_hash))
> +			break;

This will always break for subsequent hashed work, as the @hash_map bit is set,
isn't it? And anyway, it seems it doesn't optimise non-contiguous same-hashed
requests, e.g.

0: Link(hash=0)
1: Link(hash=1)
2: Link(hash=0)
3: Link(not_hashed)
4: Link(hash=0)
...

> +
> +		if (!ret) {
> +			hash = new_hash;
>  			wqe->hash_map |= BIT(hash);
> -			wq_node_del(&wqe->work_list, node, prev);
> -			return work;
> +		} else if (hash != new_hash) {
> +			break;
>  		}
> -	}
>  
> -	return NULL;
> +		wq_node_del(&wqe->work_list, &work->list);
> +		/* return first node, add subsequent same hash to the list */
> +		if (!ret)
> +			ret = work;
> +		else
> +			wq_list_add_tail(&work->list, list);
> +	} while (1);
> +
> +	return ret;
>  }
>  
>  static void io_wq_switch_mm(struct io_worker *worker, struct io_wq_work *work)
> @@ -481,6 +514,7 @@ static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work);
>  static void io_worker_handle_work(struct io_worker *worker)
>  	__releases(wqe->lock)
>  {
> +	struct io_wq_work_list list = { .first = NULL, .last = NULL };
>  	struct io_wqe *wqe = worker->wqe;
>  	struct io_wq *wq = wqe->wq;
>  
> @@ -495,7 +529,7 @@ static void io_worker_handle_work(struct io_worker *worker)
>  		 * can't make progress, any work completion or insertion will
>  		 * clear the stalled flag.
>  		 */
> -		work = io_get_next_work(wqe);
> +		work = io_get_next_work(wqe, &list);
>  		if (work)
>  			__io_worker_busy(wqe, worker, work);
>  		else if (!wq_list_empty(&wqe->work_list))
> @@ -504,6 +538,7 @@ static void io_worker_handle_work(struct io_worker *worker)
>  		spin_unlock_irq(&wqe->lock);
>  		if (!work)
>  			break;
> +got_work:
>  		io_assign_current_work(worker, work);
>  
>  		/* handle a whole dependent link */
> @@ -530,6 +565,24 @@ static void io_worker_handle_work(struct io_worker *worker)
>  				work = NULL;
>  			}
>  			if (hash != -1U) {
> +				/*
> +				 * If the local list is non-empty, then we
> +				 * have work that hashed to the same key.
> +				 * No need for a lock round-trip, or fiddling
> +				 * the the free/busy state of the worker, or
> +				 * clearing the hashed state. Just process the
> +				 * next one.
> +				 */
> +				if (!work) {
> +					work = wq_first_entry(&list,
> +							      struct io_wq_work,
> +							      list);

Wouldn't it just drop a linked request? Probably works because of the comment above.

> +					if (work) {
> +						wq_node_del(&list, &work->list);

There is a bug, apparently from one of my commits, where it does
io_assign_current_work() but then re-enqueues and reassigns new work, though there
is a gap for cancel to happen, which would screw everything up.

I'll send a patch, so it'd be clearer. However, this is a good point to look
out for here as well.

> +						goto got_work;
> +					}
> +				}
> +
>  				spin_lock_irq(&wqe->lock);
>  				wqe->hash_map &= ~BIT_ULL(hash);
>  				wqe->flags &= ~IO_WQE_FLAG_STALLED;
> @@ -910,7 +963,7 @@ static enum io_wq_cancel io_wqe_cancel_work(struct io_wqe *wqe,
>  		work = container_of(node, struct io_wq_work, list);
>  
>  		if (match->fn(work, match->data)) {
> -			wq_node_del(&wqe->work_list, node, prev);
> +			__wq_node_del(&wqe->work_list, node, prev);
>  			found = true;
>  			break;
>  		}
> diff --git a/fs/io-wq.h b/fs/io-wq.h
> index 298b21f4a4d2..9a194339bd9d 100644
> --- a/fs/io-wq.h
> +++ b/fs/io-wq.h
> @@ -40,9 +40,9 @@ static inline void wq_list_add_tail(struct io_wq_work_node *node,
>  	}
>  }
>  
> -static inline void wq_node_del(struct io_wq_work_list *list,
> -			       struct io_wq_work_node *node,
> -			       struct io_wq_work_node *prev)
> +static inline void __wq_node_del(struct io_wq_work_list *list,
> +				struct io_wq_work_node *node,
> +				struct io_wq_work_node *prev)
>  {
>  	if (node == list->first)
>  		WRITE_ONCE(list->first, node->next);
> @@ -53,6 +53,21 @@ static inline void wq_node_del(struct io_wq_work_list *list,
>  	node->next = NULL;
>  }
>  
> +
> +static inline void wq_node_del(struct io_wq_work_list *list,
> +			       struct io_wq_work_node *node)
> +{
> +	__wq_node_del(list, node, NULL);
> +}
> +
> +#define wq_first_entry(list, type, member)				\
> +({									\
> +	struct io_wq_work *__work = NULL;				\
> +	if (!wq_list_empty((list)))					\
> +		__work = container_of((list)->first, type, member);	\
> +	__work;								\
> +})
> +
>  #define wq_list_for_each(pos, prv, head)			\
>  	for (pos = (head)->first, prv = NULL; pos; prv = pos, pos = (pos)->next)
>  
> 

-- 
Pavel Begunkov





* Re: [PATCH v2] io-wq: handle hashed writes in chains
  2020-03-22 16:09 ` Pavel Begunkov
@ 2020-03-22 16:24   ` Pavel Begunkov
  2020-03-22 17:08     ` Jens Axboe
  2020-03-22 18:54     ` Pavel Begunkov
  2020-03-22 17:08   ` Jens Axboe
  1 sibling, 2 replies; 20+ messages in thread
From: Pavel Begunkov @ 2020-03-22 16:24 UTC (permalink / raw)
  To: Jens Axboe, io-uring



On 22/03/2020 19:09, Pavel Begunkov wrote:
> On 19/03/2020 21:56, Jens Axboe wrote:
>> We always punt async buffered writes to an io-wq helper, as the core
>> kernel does not have IOCB_NOWAIT support for that. Most buffered async
>> writes complete very quickly, as it's just a copy operation. This means
>> that doing multiple locking roundtrips on the shared wqe lock for each
>> buffered write is wasteful. Additionally, buffered writes are hashed
>> work items, which means that any buffered write to a given file is
>> serialized.
>>
>> When looking for a new work item, build a chain of identicaly hashed
>> work items, and then hand back that batch. Until the batch is done, the
>> caller doesn't have to synchronize with the wqe or worker locks again.

I have an idea of how to do it a bit better. Let me try it.


-- 
Pavel Begunkov



* Re: [PATCH v2] io-wq: handle hashed writes in chains
  2020-03-22 16:09 ` Pavel Begunkov
  2020-03-22 16:24   ` Pavel Begunkov
@ 2020-03-22 17:08   ` Jens Axboe
  2020-03-22 17:37     ` Pavel Begunkov
  1 sibling, 1 reply; 20+ messages in thread
From: Jens Axboe @ 2020-03-22 17:08 UTC (permalink / raw)
  To: Pavel Begunkov, io-uring

On 3/22/20 10:09 AM, Pavel Begunkov wrote:
>>  		/* not hashed, can run anytime */
>>  		if (!io_wq_is_hashed(work)) {
>> -			wq_node_del(&wqe->work_list, node, prev);
>> -			return work;
>> +			/* already have hashed work, let new worker get this */
>> +			if (ret) {
>> +				struct io_wqe_acct *acct;
>> +
>> +				/* get new worker for unhashed, if none now */
>> +				acct = io_work_get_acct(wqe, work);
>> +				if (!atomic_read(&acct->nr_running))
>> +					io_wqe_wake_worker(wqe, acct);
>> +				break;
>> +			}
>> +			wq_node_del(&wqe->work_list, &work->list);
>> +			ret = work;
>> +			break;
>>  		}
>>  
>>  		/* hashed, can run if not already running */
>> -		hash = io_get_work_hash(work);
>> -		if (!(wqe->hash_map & BIT(hash))) {
>> +		new_hash = io_get_work_hash(work);
>> +		if (wqe->hash_map & BIT(new_hash))
>> +			break;
> 
> This will always break for subsequent hashed, as the @hash_map bit is set.
> Isn't it? And anyway, it seems it doesn't optimise not-contiguous same-hashed
> requests, e.g.
> 
> 0: Link(hash=0)
> 1: Link(hash=1)
> 2: Link(hash=0)
> 3: Link(not_hashed)
> 4: Link(hash=0)
> ...

Yeah, I think I shuffled a bit too much there, should only break on that
if hash != new_hash. Will fix!

>> @@ -530,6 +565,24 @@ static void io_worker_handle_work(struct io_worker *worker)
>>  				work = NULL;
>>  			}
>>  			if (hash != -1U) {
>> +				/*
>> +				 * If the local list is non-empty, then we
>> +				 * have work that hashed to the same key.
>> +				 * No need for a lock round-trip, or fiddling
>> +				 * the the free/busy state of the worker, or
>> +				 * clearing the hashed state. Just process the
>> +				 * next one.
>> +				 */
>> +				if (!work) {
>> +					work = wq_first_entry(&list,
>> +							      struct io_wq_work,
>> +							      list);
> 
> Wouldn't it just drop a linked request? Probably works because of the
> comment above.

Only drops if we always override 'work'; if it's already set we use
'work' and don't grab from the list. So that should work for links.

>> +					if (work) {
>> +						wq_node_del(&list, &work->list);
> 
> There is a bug, apparently from one of my commits, where it do
> io_assign_current_work() but then re-enqueue and reassign new work,
> though there is a gap for cancel to happen, which would screw
> everything up.
> 
> I'll send a patch, so it'd be more clear. However, this is a good
> point to look after for this as well.

Saw it, I'll apply when you have the Fixes line. I'll respin this one
after.

-- 
Jens Axboe



* Re: [PATCH v2] io-wq: handle hashed writes in chains
  2020-03-22 16:24   ` Pavel Begunkov
@ 2020-03-22 17:08     ` Jens Axboe
  2020-03-22 18:54     ` Pavel Begunkov
  1 sibling, 0 replies; 20+ messages in thread
From: Jens Axboe @ 2020-03-22 17:08 UTC (permalink / raw)
  To: Pavel Begunkov, io-uring

On 3/22/20 10:24 AM, Pavel Begunkov wrote:
> On 22/03/2020 19:09, Pavel Begunkov wrote:
>> On 19/03/2020 21:56, Jens Axboe wrote:
>>> We always punt async buffered writes to an io-wq helper, as the core
>>> kernel does not have IOCB_NOWAIT support for that. Most buffered async
>>> writes complete very quickly, as it's just a copy operation. This means
>>> that doing multiple locking roundtrips on the shared wqe lock for each
>>> buffered write is wasteful. Additionally, buffered writes are hashed
>>> work items, which means that any buffered write to a given file is
>>> serialized.
>>>
>>> When looking for a new work item, build a chain of identicaly hashed
>>> work items, and then hand back that batch. Until the batch is done, the
>>> caller doesn't have to synchronize with the wqe or worker locks again.
> 
> I have an idea, how to do it a bit better. Let me try it.

Sure, go ahead!

-- 
Jens Axboe



* Re: [PATCH v2] io-wq: handle hashed writes in chains
  2020-03-22 17:08   ` Jens Axboe
@ 2020-03-22 17:37     ` Pavel Begunkov
  0 siblings, 0 replies; 20+ messages in thread
From: Pavel Begunkov @ 2020-03-22 17:37 UTC (permalink / raw)
  To: Jens Axboe, io-uring

On 22/03/2020 20:08, Jens Axboe wrote:
>>> +				 */
>>> +				if (!work) {
>>> +					work = wq_first_entry(&list,
>>> +							      struct io_wq_work,
>>> +							      list);
>>
>> Wouldn't it just drop a linked request? Probably works because of the
>> comment above.
> 
> Only drops if if we always override 'work', if it's already set we use
> 'work' and don't grab from the list. So that should work for links.
> 

Oh, right, the NULL check is just above.

>>> +					if (work) {
>>> +						wq_node_del(&list, &work->list);
>>
>> There is a bug, apparently from one of my commits, where it do
>> io_assign_current_work() but then re-enqueue and reassign new work,
>> though there is a gap for cancel to happen, which would screw
>> everything up.
>>
>> I'll send a patch, so it'd be more clear. However, this is a good
>> point to look after for this as well.
> 
> Saw it, I'll apply when you have the Fixes line. I'll respin this one
> after.
> 

-- 
Pavel Begunkov


* Re: [PATCH v2] io-wq: handle hashed writes in chains
  2020-03-22 16:24   ` Pavel Begunkov
  2020-03-22 17:08     ` Jens Axboe
@ 2020-03-22 18:54     ` Pavel Begunkov
  2020-03-22 19:51       ` Jens Axboe
  1 sibling, 1 reply; 20+ messages in thread
From: Pavel Begunkov @ 2020-03-22 18:54 UTC (permalink / raw)
  To: Jens Axboe, io-uring



On 22/03/2020 19:24, Pavel Begunkov wrote:
> On 22/03/2020 19:09, Pavel Begunkov wrote:
>> On 19/03/2020 21:56, Jens Axboe wrote:
>>> We always punt async buffered writes to an io-wq helper, as the core
>>> kernel does not have IOCB_NOWAIT support for that. Most buffered async
>>> writes complete very quickly, as it's just a copy operation. This means
>>> that doing multiple locking roundtrips on the shared wqe lock for each
>>> buffered write is wasteful. Additionally, buffered writes are hashed
>>> work items, which means that any buffered write to a given file is
>>> serialized.
>>>
>>> When looking for a new work item, build a chain of identicaly hashed
>>> work items, and then hand back that batch. Until the batch is done, the
>>> caller doesn't have to synchronize with the wqe or worker locks again.
> 
> I have an idea, how to do it a bit better. Let me try it.

The diff below is buggy (it oopses somewhere in blk-mq for
read-write.c:read_poll_link), I'll double check it on a fresh head.

The idea is to keep same-hashed works contiguous in @work_list, so they
can be spliced in one go. For each hash bucket, I keep the last added work:
- on enqueue, a work is added right after the last one
- on dequeue, it splices [first, last], where @first is the current one because
of how we traverse @work_list.

It costs a bit of extra memory, but
- removes the extra looping
- and also takes all same-hashed requests, not only the sequentially submitted ones

e.g. for the following submission sequence, it will take all (hash=0) requests.
REQ(hash=0)
REQ(hash=1)
REQ(hash=0)
REQ()
REQ(hash=0)
...


Please, tell if you see a hole in the concept. And as said, there is still a bug
somewhere.
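
To make the layout concrete, here is a minimal userspace sketch of the
per-bucket tail bookkeeping (toy types, with the raw hash used as the
bucket index; the real code derives the bucket via io_get_work_hash() and
does all of this under wqe->lock). It replays the submission sequence
above and ends up with all of the hash=0 works adjacent:

#include <stdio.h>

#define NR_BUCKETS 4

struct work {
	int hash;				/* -1 means "not hashed" */
	struct work *next;
};

struct queue {
	struct work *first, *last;
	struct work *hash_tail[NR_BUCKETS];	/* last queued work per hash */
};

static void enqueue(struct queue *q, struct work *w)
{
	struct work *tail = (w->hash >= 0) ? q->hash_tail[w->hash] : NULL;

	w->next = NULL;
	if (w->hash >= 0)
		q->hash_tail[w->hash] = w;

	if (!tail) {			/* first of its hash (or unhashed): append */
		if (q->last)
			q->last->next = w;
		else
			q->first = w;
		q->last = w;
	} else {			/* insert right after the same-hash tail */
		w->next = tail->next;
		tail->next = w;
		if (q->last == tail)
			q->last = w;
	}
}

int main(void)
{
	struct work w[5] = {
		{ .hash = 0 }, { .hash = 1 }, { .hash = 0 }, { .hash = -1 }, { .hash = 0 },
	};
	struct queue q = { 0 };

	for (int i = 0; i < 5; i++)
		enqueue(&q, &w[i]);

	/* prints: 0 0 0 1 -1 -- the same-hash run can be spliced in one go */
	for (struct work *it = q.first; it; it = it->next)
		printf("%d ", it->hash);
	printf("\n");
	return 0;
}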

---
 fs/io-wq.c | 49 +++++++++++++++++++++++++++++++++++++++++++------
 fs/io-wq.h | 34 +++++++++++++++++++++++++++-------
 2 files changed, 70 insertions(+), 13 deletions(-)

diff --git a/fs/io-wq.c b/fs/io-wq.c
index b3fb61ec0870..00d8f8b12df3 100644
--- a/fs/io-wq.c
+++ b/fs/io-wq.c
@@ -69,6 +69,8 @@ struct io_worker {
 #define IO_WQ_HASH_ORDER	5
 #endif

+#define IO_WQ_HASH_NR_BUCKETS	(1u << IO_WQ_HASH_ORDER)
+
 struct io_wqe_acct {
 	unsigned nr_workers;
 	unsigned max_workers;
@@ -98,6 +100,7 @@ struct io_wqe {
 	struct list_head all_list;

 	struct io_wq *wq;
+	struct io_wq_work *last_hashed[IO_WQ_HASH_NR_BUCKETS];
 };

 /*
@@ -400,7 +403,9 @@ static struct io_wq_work *io_get_next_work(struct io_wqe *wqe)
 		hash = io_get_work_hash(work);
 		if (!(wqe->hash_map & BIT(hash))) {
 			wqe->hash_map |= BIT(hash);
-			wq_node_del(&wqe->work_list, node, prev);
+			wq_del_node_range(&wqe->work_list,
+					  &wqe->last_hashed[hash]->list, prev);
+			wqe->last_hashed[hash] = NULL;
 			return work;
 		}
 	}
@@ -508,7 +513,11 @@ static void io_worker_handle_work(struct io_worker *worker)

 		/* handle a whole dependent link */
 		do {
-			struct io_wq_work *old_work;
+			struct io_wq_work *old_work, *next_hashed = NULL;
+
+			if (work->list.next)
+				next_hashed = container_of(work->list.next,
+						       struct io_wq_work, list);

 			io_impersonate_work(worker, work);
 			/*
@@ -523,12 +532,20 @@ static void io_worker_handle_work(struct io_worker *worker)
 			work->func(&work);
 			work = (old_work == work) ? NULL : work;

-			assign_work = work;
-			if (work && io_wq_is_hashed(work))
-				assign_work = NULL;
+			assign_work = next_hashed;
+			if (!next_hashed && work && !io_wq_is_hashed(work))
+				assign_work = work;
+
 			io_assign_current_work(worker, assign_work);
 			wq->free_work(old_work);

+			if (next_hashed) {
+				if (work)
+					io_wqe_enqueue(wqe, work);
+				work = next_hashed;
+				continue;
+			}
+
 			if (work && !assign_work) {
 				io_wqe_enqueue(wqe, work);
 				work = NULL;
@@ -776,6 +793,26 @@ static void io_run_cancel(struct io_wq_work *work, struct io_wqe *wqe)
 	} while (work);
 }

+static void io_wqe_insert_work(struct io_wqe *wqe, struct io_wq_work *work)
+{
+	int hash;
+	struct io_wq_work *last_hashed;
+
+	if (!io_wq_is_hashed(work)) {
+append:
+		wq_list_add_tail(&work->list, &wqe->work_list);
+		return;
+	}
+
+	hash = io_get_work_hash(work);
+	last_hashed = wqe->last_hashed[hash];
+	wqe->last_hashed[hash] = work;
+	if (!last_hashed)
+		goto append;
+
+	wq_list_add_after(&work->list, &last_hashed->list, &wqe->work_list);
+}
+
 static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work)
 {
 	struct io_wqe_acct *acct = io_work_get_acct(wqe, work);
@@ -795,7 +832,7 @@ static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work)

 	work_flags = work->flags;
 	spin_lock_irqsave(&wqe->lock, flags);
-	wq_list_add_tail(&work->list, &wqe->work_list);
+	io_wqe_insert_work(wqe, work);
 	wqe->flags &= ~IO_WQE_FLAG_STALLED;
 	spin_unlock_irqrestore(&wqe->lock, flags);

diff --git a/fs/io-wq.h b/fs/io-wq.h
index 298b21f4a4d2..10367b6238da 100644
--- a/fs/io-wq.h
+++ b/fs/io-wq.h
@@ -40,17 +40,37 @@ static inline void wq_list_add_tail(struct io_wq_work_node *node,
 	}
 }

+static inline void wq_list_add_after(struct io_wq_work_node *node,
+				    struct io_wq_work_node *pos,
+				    struct io_wq_work_list *list)
+{
+	struct io_wq_work_node *next = pos->next;
+
+	pos->next = node;
+	node->next = next;
+	if (!next)
+		list->last = node;
+}
+
+static inline void wq_del_node_range(struct io_wq_work_list *list,
+				     struct io_wq_work_node *last,
+				     struct io_wq_work_node *prev)
+{
+	if (!prev)
+		WRITE_ONCE(list->first, last->next);
+	else
+		prev->next = last->next;
+
+	if (last == list->last)
+		list->last = prev;
+	last->next = NULL;
+}
+
 static inline void wq_node_del(struct io_wq_work_list *list,
 			       struct io_wq_work_node *node,
 			       struct io_wq_work_node *prev)
 {
-	if (node == list->first)
-		WRITE_ONCE(list->first, node->next);
-	if (node == list->last)
-		list->last = prev;
-	if (prev)
-		prev->next = node->next;
-	node->next = NULL;
+	wq_del_node_range(list, node, prev);
 }

 #define wq_list_for_each(pos, prv, head)			\
-- 
2.24.0



* Re: [PATCH v2] io-wq: handle hashed writes in chains
  2020-03-22 18:54     ` Pavel Begunkov
@ 2020-03-22 19:51       ` Jens Axboe
  2020-03-22 20:05         ` Jens Axboe
  2020-03-22 20:25         ` Pavel Begunkov
  0 siblings, 2 replies; 20+ messages in thread
From: Jens Axboe @ 2020-03-22 19:51 UTC (permalink / raw)
  To: Pavel Begunkov, io-uring

On 3/22/20 12:54 PM, Pavel Begunkov wrote:
> On 22/03/2020 19:24, Pavel Begunkov wrote:
>> On 22/03/2020 19:09, Pavel Begunkov wrote:
>>> On 19/03/2020 21:56, Jens Axboe wrote:
>>>> We always punt async buffered writes to an io-wq helper, as the core
>>>> kernel does not have IOCB_NOWAIT support for that. Most buffered async
>>>> writes complete very quickly, as it's just a copy operation. This means
>>>> that doing multiple locking roundtrips on the shared wqe lock for each
>>>> buffered write is wasteful. Additionally, buffered writes are hashed
>>>> work items, which means that any buffered write to a given file is
>>>> serialized.
>>>>
>>>> When looking for a new work item, build a chain of identicaly hashed
>>>> work items, and then hand back that batch. Until the batch is done, the
>>>> caller doesn't have to synchronize with the wqe or worker locks again.
>>
>> I have an idea, how to do it a bit better. Let me try it.
> 
> The diff below is buggy (Ooopses somewhere in blk-mq for
> read-write.c:read_poll_link), I'll double check it on a fresh head.

Are you running for-5.7/io_uring merged with master? If not you're
missing:

commit f1d96a8fcbbbb22d4fbc1d69eaaa678bbb0ff6e2
Author: Pavel Begunkov <asml.silence@gmail.com>
Date:   Fri Mar 13 22:29:14 2020 +0300

    io_uring: NULL-deref for IOSQE_{ASYNC,DRAIN}

which is what I ran into as well last week...

> The idea is to keep same-hashed works continuously in @work_list, so
> they can be spliced in one go. For each hash bucket, I keep last added
> work
> - on enqueue, it adds a work after the last one
> - on dequeue it splices [first, last]. Where @first is the current
>   one, because
>
> of how we traverse @work_list.
>
> 
> It throws a bit of extra memory, but
> - removes extra looping
> - and also takes all hashed requests, but not only sequentially
>   submitted
> 
> e.g. for the following submission sequence, it will take all (hash=0)
> requests.

> REQ(hash=0)
> REQ(hash=1)
> REQ(hash=0)
> REQ()
> REQ(hash=0)
> ...
> 
> 
> Please, tell if you see a hole in the concept. And as said, there is
> still a bug somewhere.

The extra memory isn't a big deal, it's very minor. My main concern
would be fairness, since we'd then be grabbing non-contiguous hashed chunks,
where before we did not. May not be a concern as long as we ensure the
non-hashed (and differently hashed) work can proceed in parallel. On my
end, I deliberately added:

+	/* already have hashed work, let new worker get this */
+	if (ret) {
+		struct io_wqe_acct *acct;
+
+		/* get new worker for unhashed, if none now */
+		acct = io_work_get_acct(wqe, work);
+		if (!atomic_read(&acct->nr_running))
+			io_wqe_wake_worker(wqe, acct);
+		break;
+	}

to try and improve that.

I'll run a quick test with yours.

-- 
Jens Axboe



* Re: [PATCH v2] io-wq: handle hashed writes in chains
  2020-03-22 19:51       ` Jens Axboe
@ 2020-03-22 20:05         ` Jens Axboe
  2020-03-22 20:15           ` Jens Axboe
  2020-03-22 20:25         ` Pavel Begunkov
  1 sibling, 1 reply; 20+ messages in thread
From: Jens Axboe @ 2020-03-22 20:05 UTC (permalink / raw)
  To: Pavel Begunkov, io-uring

On 3/22/20 1:51 PM, Jens Axboe wrote:
>> Please, tell if you see a hole in the concept. And as said, there is
>> still a bug somewhere.

One quick guess would be that you're wanting to use both work->list and
work->data, the latter is used by links which is where you are crashing.
Didn't check your list management yet so don't know if that's the case,
but if you're still on the list (or manipulating it after), then that
would be a bug.

-- 
Jens Axboe



* Re: [PATCH v2] io-wq: handle hashed writes in chains
  2020-03-22 20:05         ` Jens Axboe
@ 2020-03-22 20:15           ` Jens Axboe
  2020-03-22 20:20             ` Pavel Begunkov
  0 siblings, 1 reply; 20+ messages in thread
From: Jens Axboe @ 2020-03-22 20:15 UTC (permalink / raw)
  To: Pavel Begunkov, io-uring

On 3/22/20 2:05 PM, Jens Axboe wrote:
> On 3/22/20 1:51 PM, Jens Axboe wrote:
>>> Please, tell if you see a hole in the concept. And as said, there is
>>> still a bug somewhere.
> 
> One quick guess would be that you're wanting to use both work->list and
> work->data, the latter is used by links which is where you are crashing.
> Didn't check your list management yet so don't know if that's the case,
> but if you're still on the list (or manipulating it after), then that
> would be a bug.

IOW, by the time you do work->func(&work), the item must be off the
list. Does indeed look like that's exactly the bug you have.
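
A contrived userspace sketch of that failure mode, assuming only what the
exchange above implies -- that the queue link and the per-work data share
storage -- with toy types standing in for io_wq_work:

#include <stdio.h>

struct work {
	union {
		struct work *next;	/* valid while the work is queued */
		void *data;		/* reused once the work executes */
	};
	int id;
};

static void run(struct work *w)
{
	w->data = (void *)0x1;		/* the handler reuses the shared field */
}

int main(void)
{
	struct work a = { .id = 1 }, b = { .id = 2 };
	struct work *next;

	/* WRONG: run the work while it is still linked to its successor */
	a.next = &b;
	run(&a);
	printf("a.next after premature run: %p (no longer &b)\n", (void *)a.next);

	/* RIGHT: detach first, then run -- the successor stays reachable */
	a.next = &b;
	next = a.next;			/* dequeue 'a' before calling run() */
	a.next = NULL;
	run(&a);
	printf("successor after correct dequeue: id=%d\n", next->id);
	return 0;
}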

-- 
Jens Axboe



* Re: [PATCH v2] io-wq: handle hashed writes in chains
  2020-03-22 20:15           ` Jens Axboe
@ 2020-03-22 20:20             ` Pavel Begunkov
  2020-03-22 21:16               ` Pavel Begunkov
  0 siblings, 1 reply; 20+ messages in thread
From: Pavel Begunkov @ 2020-03-22 20:20 UTC (permalink / raw)
  To: Jens Axboe, io-uring



On 22/03/2020 23:15, Jens Axboe wrote:
> On 3/22/20 2:05 PM, Jens Axboe wrote:
>> On 3/22/20 1:51 PM, Jens Axboe wrote:
>>>> Please, tell if you see a hole in the concept. And as said, there is
>>>> still a bug somewhere.
>>
>> One quick guess would be that you're wanting to use both work->list and
>> work->data, the latter is used by links which is where you are crashing.
>> Didn't check your list management yet so don't know if that's the case,
>> but if you're still on the list (or manipulating it after), then that
>> would be a bug.
> 
> IOW, by the time you do work->func(&work), the item must be off the
> list. Does indeed look like that's exactly the bug you have.
> 

Good guess. I made sure to grab next before ->func(), see next_hashed. And it's
not in @work_list, because io_get_next_work() removes it. However, somebody may
expect it to be NULL or something. Thanks! I'll check it

-- 
Pavel Begunkov



* Re: [PATCH v2] io-wq: handle hashed writes in chains
  2020-03-22 19:51       ` Jens Axboe
  2020-03-22 20:05         ` Jens Axboe
@ 2020-03-22 20:25         ` Pavel Begunkov
  2020-03-23  1:37           ` Jens Axboe
  1 sibling, 1 reply; 20+ messages in thread
From: Pavel Begunkov @ 2020-03-22 20:25 UTC (permalink / raw)
  To: Jens Axboe, io-uring



On 22/03/2020 22:51, Jens Axboe wrote:
> commit f1d96a8fcbbbb22d4fbc1d69eaaa678bbb0ff6e2
> Author: Pavel Begunkov <asml.silence@gmail.com>
> Date:   Fri Mar 13 22:29:14 2020 +0300
> 
>     io_uring: NULL-deref for IOSQE_{ASYNC,DRAIN}
> 
> which is what I ran into as well last week...

I picked it before testing

> The extra memory isn't a bit deal, it's very minor. My main concern
> would be fairness, since we'd then be grabbing non-contig hashed chunks,
> before we did not. May not be a concern as long as we ensure the
> non-hasned (and differently hashed) work can proceed in parallel. For my
> end, I deliberately added:

Don't think it's really a problem, all ordering/scheduling is up to users (i.e.
io_uring), and it can't infinitely postpone a work, because it's processing
spliced requests without taking more, even if new ones hash to the same bit.

> +	/* already have hashed work, let new worker get this */
> +	if (ret) {
> +		struct io_wqe_acct *acct;
> +
> +		/* get new worker for unhashed, if none now */
> +		acct = io_work_get_acct(wqe, work);
> +		if (!atomic_read(&acct->nr_running))
> +			io_wqe_wake_worker(wqe, acct);
> +		break;
> +	}
> 
> to try and improve that.

Are there performance problems with your patch without this chunk? I may see
another problem with yours; I need to think it through.

> 
> I'll run a quick test with yours.

-- 
Pavel Begunkov



* Re: [PATCH v2] io-wq: handle hashed writes in chains
  2020-03-19 18:56 [PATCH v2] io-wq: handle hashed writes in chains Jens Axboe
  2020-03-22 16:09 ` Pavel Begunkov
@ 2020-03-22 20:56 ` Pavel Begunkov
  1 sibling, 0 replies; 20+ messages in thread
From: Pavel Begunkov @ 2020-03-22 20:56 UTC (permalink / raw)
  To: Jens Axboe, io-uring



On 19/03/2020 21:56, Jens Axboe wrote:

>  			if (hash != -1U) {
> +				/*
> +				 * If the local list is non-empty, then we
> +				 * have work that hashed to the same key.
> +				 * No need for a lock round-trip, or fiddling
> +				 * the the free/busy state of the worker, or
> +				 * clearing the hashed state. Just process the
> +				 * next one.
> +				 */
> +				if (!work) {
> +					work = wq_first_entry(&list,
> +							      struct io_wq_work,
> +							      list);
> +					if (work) {
> +						wq_node_del(&list, &work->list);
> +						goto got_work;
> +					}
> +				}
> +
>  				spin_lock_irq(&wqe->lock);
>  				wqe->hash_map &= ~BIT_ULL(hash);
>  				wqe->flags &= ~IO_WQE_FLAG_STALLED;

Let's have an example, where "->" is a link
req0(hash=0) -> req1(not_hashed)
req2(hash=0)

1. it grabs @req0 (@work=@req0) and @req1 (in the local @list)
2. it does @req0->func(), sets @work=@req1 and goes to the hash updating code (see
above).

3. the ```if (!work)``` check fails, and it clears @hash_map, even though there is
one of the same hash in the list. It messes up @hash_map accounting, but it can
probably even continue working fine.

4. Next, it goes for the second iteration (work == req1) and does ->func().
Then it checks @hash, which is -1 for non-hashed req1, and exits, leaving req2 in
the @list.

I may be missing something by looking at the diff only, but there are 2 potential
points to fix.

BTW, your patch executes all linked requests first and then goes to the next
hashed one. Mine does hashed first, and re-enqueues linked requests. The downside
in my version is the extra re-enqueue. And your approach can't do some cases in
parallel, e.g. the following is purely sequential:

req0(hash0) -> ... long link ->
req1(hash0) -> ... long link ->
req2(hash0) -> ... long link ->
req3(hash0) -> ... long link ->

It's not hard to convert, though

-- 
Pavel Begunkov



* Re: [PATCH v2] io-wq: handle hashed writes in chains
  2020-03-22 20:20             ` Pavel Begunkov
@ 2020-03-22 21:16               ` Pavel Begunkov
  2020-03-22 21:31                 ` Pavel Begunkov
  0 siblings, 1 reply; 20+ messages in thread
From: Pavel Begunkov @ 2020-03-22 21:16 UTC (permalink / raw)
  To: Jens Axboe, io-uring



On 22/03/2020 23:20, Pavel Begunkov wrote:
> On 22/03/2020 23:15, Jens Axboe wrote:
>> On 3/22/20 2:05 PM, Jens Axboe wrote:
>>> On 3/22/20 1:51 PM, Jens Axboe wrote:
>>>>> Please, tell if you see a hole in the concept. And as said, there is
>>>>> still a bug somewhere.
>>>
>>> One quick guess would be that you're wanting to use both work->list and
>>> work->data, the latter is used by links which is where you are crashing.
>>> Didn't check your list management yet so don't know if that's the case,
>>> but if you're still on the list (or manipulating it after), then that
>>> would be a bug.
>>
>> IOW, by the time you do work->func(&work), the item must be off the
>> list. Does indeed look like that's exactly the bug you have.
>>
> 
> Good guess. I made sure to grab next before ->func(), see next_hashed. And it's
> not in @work_list, because io_get_next_work() removes it. However, somebody may
> expect it to be NULL or something. Thanks! I'll check it

You're right. Re-enqueue corrupts ->data, so it should be fixed for the previous
patches as well. I'll send it.

-- 
Pavel Begunkov



* Re: [PATCH v2] io-wq: handle hashed writes in chains
  2020-03-22 21:16               ` Pavel Begunkov
@ 2020-03-22 21:31                 ` Pavel Begunkov
  0 siblings, 0 replies; 20+ messages in thread
From: Pavel Begunkov @ 2020-03-22 21:31 UTC (permalink / raw)
  To: Jens Axboe, io-uring



On 23/03/2020 00:16, Pavel Begunkov wrote:
> On 22/03/2020 23:20, Pavel Begunkov wrote:
>> On 22/03/2020 23:15, Jens Axboe wrote:
>>> On 3/22/20 2:05 PM, Jens Axboe wrote:
>>>> On 3/22/20 1:51 PM, Jens Axboe wrote:
>>>>>> Please, tell if you see a hole in the concept. And as said, there is
>>>>>> still a bug somewhere.
>>>>
>>>> One quick guess would be that you're wanting to use both work->list and
>>>> work->data, the latter is used by links which is where you are crashing.
>>>> Didn't check your list management yet so don't know if that's the case,
>>>> but if you're still on the list (or manipulating it after), then that
>>>> would be a bug.
>>>
>>> IOW, by the time you do work->func(&work), the item must be off the
>>> list. Does indeed look like that's exactly the bug you have.
>>>
>>
>> Good guess. I made sure to grab next before ->func(), see next_hashed. And it's
>> not in @work_list, because io_get_next_work() removes it. However, somebody may
>> expect it to be NULL or something. Thanks! I'll check it
> 
> You're right. Re-enqueue corrupts ->data, so should be fixed for previous
> patches as well. I'll send it
> 

And the diff works fine if put on top of the fix!

-- 
Pavel Begunkov



* Re: [PATCH v2] io-wq: handle hashed writes in chains
  2020-03-22 20:25         ` Pavel Begunkov
@ 2020-03-23  1:37           ` Jens Axboe
  2020-03-23  8:38             ` Pavel Begunkov
  0 siblings, 1 reply; 20+ messages in thread
From: Jens Axboe @ 2020-03-23  1:37 UTC (permalink / raw)
  To: Pavel Begunkov, io-uring

On 3/22/20 2:25 PM, Pavel Begunkov wrote:
> On 22/03/2020 22:51, Jens Axboe wrote:
>> commit f1d96a8fcbbbb22d4fbc1d69eaaa678bbb0ff6e2
>> Author: Pavel Begunkov <asml.silence@gmail.com>
>> Date:   Fri Mar 13 22:29:14 2020 +0300
>>
>>     io_uring: NULL-deref for IOSQE_{ASYNC,DRAIN}
>>
>> which is what I ran into as well last week...
> 
> I picked it before testing
> 
>> The extra memory isn't a bit deal, it's very minor. My main concern
>> would be fairness, since we'd then be grabbing non-contig hashed chunks,
>> before we did not. May not be a concern as long as we ensure the
>> non-hasned (and differently hashed) work can proceed in parallel. For my
>> end, I deliberately added:
> 
> Don't think it's really a problem, all ordering/scheduling is up to
> users (i.e.  io_uring), and it can't infinitely postpone a work,
> because it's processing spliced requests without taking more, even if
> new ones hash to the same bit.

I don't disagree with you, just wanted to bring it up!

>> +	/* already have hashed work, let new worker get this */
>> +	if (ret) {
>> +		struct io_wqe_acct *acct;
>> +
>> +		/* get new worker for unhashed, if none now */
>> +		acct = io_work_get_acct(wqe, work);
>> +		if (!atomic_read(&acct->nr_running))
>> +			io_wqe_wake_worker(wqe, acct);
>> +		break;
>> +	}
>>
>> to try and improve that.
> 
> Is there performance problems with your patch without this chunk? I
> may see another problem with yours, I need to think it through.

No, and in fact it probably should be a separate thing, but I kind of
like your approach, so I'm not moving forward with mine. I do think it's
worth looking into separately, as there's no reason why we can't wake a
non-hashed worker if we're just doing hashed work from the existing
thread. If that thread is just doing copies and not blocking, the
unhashed (or next hashed) work is just sitting idle while it could be
running instead.

Hence I added that hunk, to kick a new worker to proceed in parallel.

-- 
Jens Axboe



* Re: [PATCH v2] io-wq: handle hashed writes in chains
  2020-03-23  1:37           ` Jens Axboe
@ 2020-03-23  8:38             ` Pavel Begunkov
  2020-03-23 14:26               ` Jens Axboe
  0 siblings, 1 reply; 20+ messages in thread
From: Pavel Begunkov @ 2020-03-23  8:38 UTC (permalink / raw)
  To: Jens Axboe, io-uring



On 23/03/2020 04:37, Jens Axboe wrote:
> On 3/22/20 2:25 PM, Pavel Begunkov wrote:
>> On 22/03/2020 22:51, Jens Axboe wrote:
>>> commit f1d96a8fcbbbb22d4fbc1d69eaaa678bbb0ff6e2
>>> Author: Pavel Begunkov <asml.silence@gmail.com>
>>> Date:   Fri Mar 13 22:29:14 2020 +0300
>>>
>>>     io_uring: NULL-deref for IOSQE_{ASYNC,DRAIN}
>>>
>>> which is what I ran into as well last week...
>>
>> I picked it before testing
>>
>>> The extra memory isn't a bit deal, it's very minor. My main concern
>>> would be fairness, since we'd then be grabbing non-contig hashed chunks,
>>> before we did not. May not be a concern as long as we ensure the
>>> non-hasned (and differently hashed) work can proceed in parallel. For my
>>> end, I deliberately added:
>>
>> Don't think it's really a problem, all ordering/scheduling is up to
>> users (i.e.  io_uring), and it can't infinitely postpone a work,
>> because it's processing spliced requests without taking more, even if
>> new ones hash to the same bit.
> 
> I don't disagree with you, just wanted to bring it up!

Sure, there is a lot to think about. E.g. I don't like this re-enqueueing,
and if all other threads have enough work to do, then it can be avoided, but I
don't want to over-complicate it.


> 
>>> +	/* already have hashed work, let new worker get this */
>>> +	if (ret) {
>>> +		struct io_wqe_acct *acct;
>>> +
>>> +		/* get new worker for unhashed, if none now */
>>> +		acct = io_work_get_acct(wqe, work);
>>> +		if (!atomic_read(&acct->nr_running))
>>> +			io_wqe_wake_worker(wqe, acct);
>>> +		break;
>>> +	}
>>>
>>> to try and improve that.
>>
>> Is there performance problems with your patch without this chunk? I
>> may see another problem with yours, I need to think it through.
> 
> No, and in fact it probably should be a separate thing, but I kind of
> like your approach so not moving forward with mine. I do think it's
> worth looking into separately, as there's no reason why we can't wake a
> non-hashed worker if we're just doing hashed work from the existing
> thread. If that thread is just doing copies and not blocking, the
> unhashed (or next hashed) work is just sitting idle while it could be
> running instead.

Then, I'll clean the diff, hopefully soon. Could I steal parts of your patch
description?

> 
> Hence I added that hunk, to kick a new worker to proceed in parallel.

It seems I need to take a closer look at this accounting in general.


-- 
Pavel Begunkov



* Re: [PATCH v2] io-wq: handle hashed writes in chains
  2020-03-23  8:38             ` Pavel Begunkov
@ 2020-03-23 14:26               ` Jens Axboe
  0 siblings, 0 replies; 20+ messages in thread
From: Jens Axboe @ 2020-03-23 14:26 UTC (permalink / raw)
  To: Pavel Begunkov, io-uring

On 3/23/20 2:38 AM, Pavel Begunkov wrote:
>> No, and in fact it probably should be a separate thing, but I kind of
>> like your approach so not moving forward with mine. I do think it's
>> worth looking into separately, as there's no reason why we can't wake a
>> non-hashed worker if we're just doing hashed work from the existing
>> thread. If that thread is just doing copies and not blocking, the
>> unhashed (or next hashed) work is just sitting idle while it could be
>> running instead.
> 
> Then, I'll clean the diff, hopefully soon. Could I steal parts of your patch
> description?

Of course, go ahead.

>> Hence I added that hunk, to kick a new worker to proceed in parallel.
> 
> It seems, I need to take a closer look at this accounting in general.

Agree, I think we have some room for improvement there.

-- 
Jens Axboe



* Re: [PATCH v2] io-wq: handle hashed writes in chains
  2020-03-23 19:57 Pavel Begunkov
@ 2020-03-24  2:31 ` Jens Axboe
  0 siblings, 0 replies; 20+ messages in thread
From: Jens Axboe @ 2020-03-24  2:31 UTC (permalink / raw)
  To: Pavel Begunkov, io-uring, linux-kernel

On 3/23/20 1:57 PM, Pavel Begunkov wrote:
> We always punt async buffered writes to an io-wq helper, as the core
> kernel does not have IOCB_NOWAIT support for that. Most buffered async
> writes complete very quickly, as it's just a copy operation. This means
> that doing multiple locking roundtrips on the shared wqe lock for each
> buffered write is wasteful. Additionally, buffered writes are hashed
> work items, which means that any buffered write to a given file is
> serialized.
> 
> Keep identicaly hashed work items contiguously in @wqe->work_list, and
> track a tail for each hash bucket. On dequeue of a hashed item, splice
> all of the same hash in one go using the tracked tail. Until the batch
> is done, the caller doesn't have to synchronize with the wqe or worker
> locks again.

Looks good to me, and also passes testing. I've applied this for 5.7.
Next we can start looking into cases where it'd be an improvement
to kick off another worker. Say, if we still have work after grabbing
a chain, it would probably not be a bad idea.

-- 
Jens Axboe



* [PATCH v2] io-wq: handle hashed writes in chains
@ 2020-03-23 19:57 Pavel Begunkov
  2020-03-24  2:31 ` Jens Axboe
  0 siblings, 1 reply; 20+ messages in thread
From: Pavel Begunkov @ 2020-03-23 19:57 UTC (permalink / raw)
  To: Jens Axboe, io-uring, linux-kernel

We always punt async buffered writes to an io-wq helper, as the core
kernel does not have IOCB_NOWAIT support for that. Most buffered async
writes complete very quickly, as it's just a copy operation. This means
that doing multiple locking roundtrips on the shared wqe lock for each
buffered write is wasteful. Additionally, buffered writes are hashed
work items, which means that any buffered write to a given file is
serialized.

Keep identically hashed work items contiguously in @wqe->work_list, and
track a tail for each hash bucket. On dequeue of a hashed item, splice
all of the same hash in one go using the tracked tail. Until the batch
is done, the caller doesn't have to synchronize with the wqe or worker
locks again.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
---

v2 (since diff):
- add wq_next_work() helper
- rename list helpers to match list.h naming
- reshuffle io_worker_handle_work() bits, keeping it
  functionally identical

 fs/io-wq.c | 68 ++++++++++++++++++++++++++++++++++++++----------------
 fs/io-wq.h | 45 +++++++++++++++++++++++++++++-------
 2 files changed, 85 insertions(+), 28 deletions(-)
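
For illustration, a standalone userspace sketch of the dequeue-side splice
(toy types; list_cut() mirrors the wq_list_cut() helper added below, with
WRITE_ONCE dropped since this runs in plain userspace, and the tracked
tail pointer plays the role of hash_tail[]):

#include <stdio.h>

struct node { struct node *next; int hash; };
struct list { struct node *first, *last; };

/* cut everything from the head (prev == NULL) up to @last out of @l */
static void list_cut(struct list *l, struct node *last, struct node *prev)
{
	if (!prev)
		l->first = last->next;
	else
		prev->next = last->next;
	if (last == l->last)
		l->last = prev;
	last->next = NULL;
}

int main(void)
{
	struct node n[5] = {
		{ .hash = 0 }, { .hash = 0 }, { .hash = 0 }, { .hash = 1 }, { .hash = -1 },
	};
	struct list l = { .first = &n[0], .last = &n[4] };
	struct node *batch, *it;

	for (int i = 0; i < 4; i++)
		n[i].next = &n[i + 1];

	/* splice the whole hash=0 run [n0..n2] out with a single cut */
	batch = l.first;
	list_cut(&l, &n[2], NULL);	/* &n[2] is the tracked tail for hash 0 */

	for (it = batch; it; it = it->next)
		printf("spliced: hash %d\n", it->hash);
	for (it = l.first; it; it = it->next)
		printf("left in queue: hash %d\n", it->hash);
	return 0;
}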

diff --git a/fs/io-wq.c b/fs/io-wq.c
index b3fb61ec0870..cc5cf2209fb0 100644
--- a/fs/io-wq.c
+++ b/fs/io-wq.c
@@ -69,6 +69,8 @@ struct io_worker {
 #define IO_WQ_HASH_ORDER	5
 #endif
 
+#define IO_WQ_NR_HASH_BUCKETS	(1u << IO_WQ_HASH_ORDER)
+
 struct io_wqe_acct {
 	unsigned nr_workers;
 	unsigned max_workers;
@@ -98,6 +100,7 @@ struct io_wqe {
 	struct list_head all_list;
 
 	struct io_wq *wq;
+	struct io_wq_work *hash_tail[IO_WQ_NR_HASH_BUCKETS];
 };
 
 /*
@@ -384,7 +387,7 @@ static struct io_wq_work *io_get_next_work(struct io_wqe *wqe)
 	__must_hold(wqe->lock)
 {
 	struct io_wq_work_node *node, *prev;
-	struct io_wq_work *work;
+	struct io_wq_work *work, *tail;
 	unsigned int hash;
 
 	wq_list_for_each(node, prev, &wqe->work_list) {
@@ -392,7 +395,7 @@ static struct io_wq_work *io_get_next_work(struct io_wqe *wqe)
 
 		/* not hashed, can run anytime */
 		if (!io_wq_is_hashed(work)) {
-			wq_node_del(&wqe->work_list, node, prev);
+			wq_list_del(&wqe->work_list, node, prev);
 			return work;
 		}
 
@@ -400,7 +403,10 @@ static struct io_wq_work *io_get_next_work(struct io_wqe *wqe)
 		hash = io_get_work_hash(work);
 		if (!(wqe->hash_map & BIT(hash))) {
 			wqe->hash_map |= BIT(hash);
-			wq_node_del(&wqe->work_list, node, prev);
+			/* all items with this hash lie in [work, tail] */
+			tail = wqe->hash_tail[hash];
+			wqe->hash_tail[hash] = NULL;
+			wq_list_cut(&wqe->work_list, &tail->list, prev);
 			return work;
 		}
 	}
@@ -485,7 +491,7 @@ static void io_worker_handle_work(struct io_worker *worker)
 	struct io_wq *wq = wqe->wq;
 
 	do {
-		struct io_wq_work *work, *assign_work;
+		struct io_wq_work *work;
 		unsigned int hash;
 get_next:
 		/*
@@ -508,8 +514,9 @@ static void io_worker_handle_work(struct io_worker *worker)
 
 		/* handle a whole dependent link */
 		do {
-			struct io_wq_work *old_work;
+			struct io_wq_work *old_work, *next_hashed, *linked;
 
+			next_hashed = wq_next_work(work);
 			io_impersonate_work(worker, work);
 			/*
 			 * OK to set IO_WQ_WORK_CANCEL even for uncancellable
@@ -518,22 +525,23 @@ static void io_worker_handle_work(struct io_worker *worker)
 			if (test_bit(IO_WQ_BIT_CANCEL, &wq->state))
 				work->flags |= IO_WQ_WORK_CANCEL;
 
-			old_work = work;
 			hash = io_get_work_hash(work);
-			work->func(&work);
-			work = (old_work == work) ? NULL : work;
-
-			assign_work = work;
-			if (work && io_wq_is_hashed(work))
-				assign_work = NULL;
-			io_assign_current_work(worker, assign_work);
+			linked = old_work = work;
+			linked->func(&linked);
+			linked = (old_work == linked) ? NULL : linked;
+
+			work = next_hashed;
+			if (!work && linked && !io_wq_is_hashed(linked)) {
+				work = linked;
+				linked = NULL;
+			}
+			io_assign_current_work(worker, work);
 			wq->free_work(old_work);
 
-			if (work && !assign_work) {
-				io_wqe_enqueue(wqe, work);
-				work = NULL;
-			}
-			if (hash != -1U) {
+			if (linked)
+				io_wqe_enqueue(wqe, linked);
+
+			if (hash != -1U && !next_hashed) {
 				spin_lock_irq(&wqe->lock);
 				wqe->hash_map &= ~BIT_ULL(hash);
 				wqe->flags &= ~IO_WQE_FLAG_STALLED;
@@ -776,6 +784,26 @@ static void io_run_cancel(struct io_wq_work *work, struct io_wqe *wqe)
 	} while (work);
 }
 
+static void io_wqe_insert_work(struct io_wqe *wqe, struct io_wq_work *work)
+{
+	unsigned int hash;
+	struct io_wq_work *tail;
+
+	if (!io_wq_is_hashed(work)) {
+append:
+		wq_list_add_tail(&work->list, &wqe->work_list);
+		return;
+	}
+
+	hash = io_get_work_hash(work);
+	tail = wqe->hash_tail[hash];
+	wqe->hash_tail[hash] = work;
+	if (!tail)
+		goto append;
+
+	wq_list_add_after(&work->list, &tail->list, &wqe->work_list);
+}
+
 static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work)
 {
 	struct io_wqe_acct *acct = io_work_get_acct(wqe, work);
@@ -795,7 +823,7 @@ static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work)
 
 	work_flags = work->flags;
 	spin_lock_irqsave(&wqe->lock, flags);
-	wq_list_add_tail(&work->list, &wqe->work_list);
+	io_wqe_insert_work(wqe, work);
 	wqe->flags &= ~IO_WQE_FLAG_STALLED;
 	spin_unlock_irqrestore(&wqe->lock, flags);
 
@@ -914,7 +942,7 @@ static enum io_wq_cancel io_wqe_cancel_work(struct io_wqe *wqe,
 		work = container_of(node, struct io_wq_work, list);
 
 		if (match->fn(work, match->data)) {
-			wq_node_del(&wqe->work_list, node, prev);
+			wq_list_del(&wqe->work_list, node, prev);
 			found = true;
 			break;
 		}
diff --git a/fs/io-wq.h b/fs/io-wq.h
index d2a5684bf673..3ee7356d6be5 100644
--- a/fs/io-wq.h
+++ b/fs/io-wq.h
@@ -28,6 +28,18 @@ struct io_wq_work_list {
 	struct io_wq_work_node *last;
 };
 
+static inline void wq_list_add_after(struct io_wq_work_node *node,
+				     struct io_wq_work_node *pos,
+				     struct io_wq_work_list *list)
+{
+	struct io_wq_work_node *next = pos->next;
+
+	pos->next = node;
+	node->next = next;
+	if (!next)
+		list->last = node;
+}
+
 static inline void wq_list_add_tail(struct io_wq_work_node *node,
 				    struct io_wq_work_list *list)
 {
@@ -40,17 +52,26 @@ static inline void wq_list_add_tail(struct io_wq_work_node *node,
 	}
 }
 
-static inline void wq_node_del(struct io_wq_work_list *list,
-			       struct io_wq_work_node *node,
+static inline void wq_list_cut(struct io_wq_work_list *list,
+			       struct io_wq_work_node *last,
 			       struct io_wq_work_node *prev)
 {
-	if (node == list->first)
-		WRITE_ONCE(list->first, node->next);
-	if (node == list->last)
+	/* first in the list, if prev==NULL */
+	if (!prev)
+		WRITE_ONCE(list->first, last->next);
+	else
+		prev->next = last->next;
+
+	if (last == list->last)
 		list->last = prev;
-	if (prev)
-		prev->next = node->next;
-	node->next = NULL;
+	last->next = NULL;
+}
+
+static inline void wq_list_del(struct io_wq_work_list *list,
+			       struct io_wq_work_node *node,
+			       struct io_wq_work_node *prev)
+{
+	wq_list_cut(list, node, prev);
 }
 
 #define wq_list_for_each(pos, prv, head)			\
@@ -78,6 +99,14 @@ struct io_wq_work {
 		*(work) = (struct io_wq_work){ .func = _func };	\
 	} while (0)						\
 
+static inline struct io_wq_work *wq_next_work(struct io_wq_work *work)
+{
+	if (!work->list.next)
+		return NULL;
+
+	return container_of(work->list.next, struct io_wq_work, list);
+}
+
 typedef void (free_work_fn)(struct io_wq_work *);
 
 struct io_wq_data {
-- 
2.24.0



Thread overview: 20+ messages
2020-03-19 18:56 [PATCH v2] io-wq: handle hashed writes in chains Jens Axboe
2020-03-22 16:09 ` Pavel Begunkov
2020-03-22 16:24   ` Pavel Begunkov
2020-03-22 17:08     ` Jens Axboe
2020-03-22 18:54     ` Pavel Begunkov
2020-03-22 19:51       ` Jens Axboe
2020-03-22 20:05         ` Jens Axboe
2020-03-22 20:15           ` Jens Axboe
2020-03-22 20:20             ` Pavel Begunkov
2020-03-22 21:16               ` Pavel Begunkov
2020-03-22 21:31                 ` Pavel Begunkov
2020-03-22 20:25         ` Pavel Begunkov
2020-03-23  1:37           ` Jens Axboe
2020-03-23  8:38             ` Pavel Begunkov
2020-03-23 14:26               ` Jens Axboe
2020-03-22 17:08   ` Jens Axboe
2020-03-22 17:37     ` Pavel Begunkov
2020-03-22 20:56 ` Pavel Begunkov
2020-03-23 19:57 Pavel Begunkov
2020-03-24  2:31 ` Jens Axboe
