io-uring.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
From: Jens Axboe <axboe@kernel.dk>
To: io-uring <io-uring@vger.kernel.org>
Subject: [PATCH v2] io-wq: handle hashed writes in chains
Date: Thu, 19 Mar 2020 12:56:41 -0600	[thread overview]
Message-ID: <ab2e967f-754f-6dcf-95a0-4f24c47a9d5e@kernel.dk> (raw)

We always punt async buffered writes to an io-wq helper, as the core
kernel does not have IOCB_NOWAIT support for that. Most buffered async
writes complete very quickly, as it's just a copy operation. This means
that doing multiple locking roundtrips on the shared wqe lock for each
buffered write is wasteful. Additionally, buffered writes are hashed
work items, which means that any buffered write to a given file is
serialized.

When looking for a new work item, build a chain of identicaly hashed
work items, and then hand back that batch. Until the batch is done, the
caller doesn't have to synchronize with the wqe or worker locks again.

Signed-off-by: Jens Axboe <axboe@kernel.dk>

---

Changes:
- Don't overwrite passed back work


diff --git a/fs/io-wq.c b/fs/io-wq.c
index 9541df2729de..8402c6e417e1 100644
--- a/fs/io-wq.c
+++ b/fs/io-wq.c
@@ -380,32 +380,65 @@ static inline unsigned int io_get_work_hash(struct io_wq_work *work)
 	return work->flags >> IO_WQ_HASH_SHIFT;
 }
 
-static struct io_wq_work *io_get_next_work(struct io_wqe *wqe)
+/*
+ * Returns the next work item to process, if any. For hashed work that hash
+ * to the same key, we can't process N+1 before N is done. To make the
+ * processing more efficient, return N+1 and later identically hashed work
+ * in the passed in list. This avoids repeated hammering on the wqe lock for,
+ * as the caller can just process items in the on-stack list.
+ */
+static struct io_wq_work *io_get_next_work(struct io_wqe *wqe,
+					   struct io_wq_work_list *list)
 	__must_hold(wqe->lock)
 {
-	struct io_wq_work_node *node, *prev;
-	struct io_wq_work *work;
-	unsigned int hash;
+	struct io_wq_work *ret = NULL;
 
-	wq_list_for_each(node, prev, &wqe->work_list) {
-		work = container_of(node, struct io_wq_work, list);
+	do {
+		unsigned int new_hash, hash;
+		struct io_wq_work *work;
+
+		work = wq_first_entry(&wqe->work_list, struct io_wq_work, list);
+		if (!work)
+			break;
 
 		/* not hashed, can run anytime */
 		if (!io_wq_is_hashed(work)) {
-			wq_node_del(&wqe->work_list, node, prev);
-			return work;
+			/* already have hashed work, let new worker get this */
+			if (ret) {
+				struct io_wqe_acct *acct;
+
+				/* get new worker for unhashed, if none now */
+				acct = io_work_get_acct(wqe, work);
+				if (!atomic_read(&acct->nr_running))
+					io_wqe_wake_worker(wqe, acct);
+				break;
+			}
+			wq_node_del(&wqe->work_list, &work->list);
+			ret = work;
+			break;
 		}
 
 		/* hashed, can run if not already running */
-		hash = io_get_work_hash(work);
-		if (!(wqe->hash_map & BIT(hash))) {
+		new_hash = io_get_work_hash(work);
+		if (wqe->hash_map & BIT(new_hash))
+			break;
+
+		if (!ret) {
+			hash = new_hash;
 			wqe->hash_map |= BIT(hash);
-			wq_node_del(&wqe->work_list, node, prev);
-			return work;
+		} else if (hash != new_hash) {
+			break;
 		}
-	}
 
-	return NULL;
+		wq_node_del(&wqe->work_list, &work->list);
+		/* return first node, add subsequent same hash to the list */
+		if (!ret)
+			ret = work;
+		else
+			wq_list_add_tail(&work->list, list);
+	} while (1);
+
+	return ret;
 }
 
 static void io_wq_switch_mm(struct io_worker *worker, struct io_wq_work *work)
@@ -481,6 +514,7 @@ static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work);
 static void io_worker_handle_work(struct io_worker *worker)
 	__releases(wqe->lock)
 {
+	struct io_wq_work_list list = { .first = NULL, .last = NULL };
 	struct io_wqe *wqe = worker->wqe;
 	struct io_wq *wq = wqe->wq;
 
@@ -495,7 +529,7 @@ static void io_worker_handle_work(struct io_worker *worker)
 		 * can't make progress, any work completion or insertion will
 		 * clear the stalled flag.
 		 */
-		work = io_get_next_work(wqe);
+		work = io_get_next_work(wqe, &list);
 		if (work)
 			__io_worker_busy(wqe, worker, work);
 		else if (!wq_list_empty(&wqe->work_list))
@@ -504,6 +538,7 @@ static void io_worker_handle_work(struct io_worker *worker)
 		spin_unlock_irq(&wqe->lock);
 		if (!work)
 			break;
+got_work:
 		io_assign_current_work(worker, work);
 
 		/* handle a whole dependent link */
@@ -530,6 +565,24 @@ static void io_worker_handle_work(struct io_worker *worker)
 				work = NULL;
 			}
 			if (hash != -1U) {
+				/*
+				 * If the local list is non-empty, then we
+				 * have work that hashed to the same key.
+				 * No need for a lock round-trip, or fiddling
+				 * the the free/busy state of the worker, or
+				 * clearing the hashed state. Just process the
+				 * next one.
+				 */
+				if (!work) {
+					work = wq_first_entry(&list,
+							      struct io_wq_work,
+							      list);
+					if (work) {
+						wq_node_del(&list, &work->list);
+						goto got_work;
+					}
+				}
+
 				spin_lock_irq(&wqe->lock);
 				wqe->hash_map &= ~BIT_ULL(hash);
 				wqe->flags &= ~IO_WQE_FLAG_STALLED;
@@ -910,7 +963,7 @@ static enum io_wq_cancel io_wqe_cancel_work(struct io_wqe *wqe,
 		work = container_of(node, struct io_wq_work, list);
 
 		if (match->fn(work, match->data)) {
-			wq_node_del(&wqe->work_list, node, prev);
+			__wq_node_del(&wqe->work_list, node, prev);
 			found = true;
 			break;
 		}
diff --git a/fs/io-wq.h b/fs/io-wq.h
index 298b21f4a4d2..9a194339bd9d 100644
--- a/fs/io-wq.h
+++ b/fs/io-wq.h
@@ -40,9 +40,9 @@ static inline void wq_list_add_tail(struct io_wq_work_node *node,
 	}
 }
 
-static inline void wq_node_del(struct io_wq_work_list *list,
-			       struct io_wq_work_node *node,
-			       struct io_wq_work_node *prev)
+static inline void __wq_node_del(struct io_wq_work_list *list,
+				struct io_wq_work_node *node,
+				struct io_wq_work_node *prev)
 {
 	if (node == list->first)
 		WRITE_ONCE(list->first, node->next);
@@ -53,6 +53,21 @@ static inline void wq_node_del(struct io_wq_work_list *list,
 	node->next = NULL;
 }
 
+
+static inline void wq_node_del(struct io_wq_work_list *list,
+			       struct io_wq_work_node *node)
+{
+	__wq_node_del(list, node, NULL);
+}
+
+#define wq_first_entry(list, type, member)				\
+({									\
+	struct io_wq_work *__work = NULL;				\
+	if (!wq_list_empty((list)))					\
+		__work = container_of((list)->first, type, member);	\
+	__work;								\
+})
+
 #define wq_list_for_each(pos, prv, head)			\
 	for (pos = (head)->first, prv = NULL; pos; prv = pos, pos = (pos)->next)
 
-- 
Jens Axboe


             reply	other threads:[~2020-03-19 18:56 UTC|newest]

Thread overview: 20+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2020-03-19 18:56 Jens Axboe [this message]
2020-03-22 16:09 ` [PATCH v2] io-wq: handle hashed writes in chains Pavel Begunkov
2020-03-22 16:24   ` Pavel Begunkov
2020-03-22 17:08     ` Jens Axboe
2020-03-22 18:54     ` Pavel Begunkov
2020-03-22 19:51       ` Jens Axboe
2020-03-22 20:05         ` Jens Axboe
2020-03-22 20:15           ` Jens Axboe
2020-03-22 20:20             ` Pavel Begunkov
2020-03-22 21:16               ` Pavel Begunkov
2020-03-22 21:31                 ` Pavel Begunkov
2020-03-22 20:25         ` Pavel Begunkov
2020-03-23  1:37           ` Jens Axboe
2020-03-23  8:38             ` Pavel Begunkov
2020-03-23 14:26               ` Jens Axboe
2020-03-22 17:08   ` Jens Axboe
2020-03-22 17:37     ` Pavel Begunkov
2020-03-22 20:56 ` Pavel Begunkov
2020-03-23 19:57 Pavel Begunkov
2020-03-24  2:31 ` Jens Axboe

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=ab2e967f-754f-6dcf-95a0-4f24c47a9d5e@kernel.dk \
    --to=axboe@kernel.dk \
    --cc=io-uring@vger.kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).