On 22/03/2020 19:24, Pavel Begunkov wrote:
> On 22/03/2020 19:09, Pavel Begunkov wrote:
>> On 19/03/2020 21:56, Jens Axboe wrote:
>>> We always punt async buffered writes to an io-wq helper, as the core
>>> kernel does not have IOCB_NOWAIT support for that. Most buffered async
>>> writes complete very quickly, as it's just a copy operation. This means
>>> that doing multiple locking roundtrips on the shared wqe lock for each
>>> buffered write is wasteful. Additionally, buffered writes are hashed
>>> work items, which means that any buffered write to a given file is
>>> serialized.
>>>
>>> When looking for a new work item, build a chain of identically hashed
>>> work items, and then hand back that batch. Until the batch is done, the
>>> caller doesn't have to synchronize with the wqe or worker locks again.
>
> I have an idea how to do it a bit better. Let me try it.

The diff below is buggy (it oopses somewhere in blk-mq for
read-write.c:read_poll_link), I'll double check it on a fresh head.

The idea is to keep same-hashed works contiguous in @work_list, so they can
be spliced out in one go. For each hash bucket, I keep the last added work:
- on enqueue, a new work is inserted right after that last one
- on dequeue, the whole range [first, last] is spliced, where @first is the
  current node (because of how @work_list is traversed)

It costs a bit of extra memory, but
- it removes the extra looping
- it also grabs all hashed requests, not only the sequentially submitted
  ones; e.g. for the submission sequence below it takes all (hash=0)
  requests (there is a small stand-alone sketch of this case after the diff)

REQ(hash=0)
REQ(hash=1)
REQ(hash=0)
REQ()
REQ(hash=0)
...

Please tell me if you see a hole in the concept. And, as said, there is
still a bug somewhere.

---
 fs/io-wq.c | 49 +++++++++++++++++++++++++++++++++++++++++++------
 fs/io-wq.h | 34 +++++++++++++++++++++++++++-------
 2 files changed, 70 insertions(+), 13 deletions(-)

diff --git a/fs/io-wq.c b/fs/io-wq.c
index b3fb61ec0870..00d8f8b12df3 100644
--- a/fs/io-wq.c
+++ b/fs/io-wq.c
@@ -69,6 +69,8 @@ struct io_worker {
 #define IO_WQ_HASH_ORDER	5
 #endif
 
+#define IO_WQ_HASH_NR_BUCKETS	(1u << IO_WQ_HASH_ORDER)
+
 struct io_wqe_acct {
 	unsigned nr_workers;
 	unsigned max_workers;
@@ -98,6 +100,7 @@ struct io_wqe {
 	struct list_head all_list;
 
 	struct io_wq *wq;
+	struct io_wq_work *last_hashed[IO_WQ_HASH_NR_BUCKETS];
 };
 
 /*
@@ -400,7 +403,9 @@ static struct io_wq_work *io_get_next_work(struct io_wqe *wqe)
 		hash = io_get_work_hash(work);
 		if (!(wqe->hash_map & BIT(hash))) {
 			wqe->hash_map |= BIT(hash);
-			wq_node_del(&wqe->work_list, node, prev);
+			wq_del_node_range(&wqe->work_list,
+					  &wqe->last_hashed[hash]->list, prev);
+			wqe->last_hashed[hash] = NULL;
 			return work;
 		}
 	}
@@ -508,7 +513,11 @@ static void io_worker_handle_work(struct io_worker *worker)
 
 		/* handle a whole dependent link */
 		do {
-			struct io_wq_work *old_work;
+			struct io_wq_work *old_work, *next_hashed = NULL;
+
+			if (work->list.next)
+				next_hashed = container_of(work->list.next,
+						struct io_wq_work, list);
 
 			io_impersonate_work(worker, work);
 			/*
@@ -523,12 +532,20 @@ static void io_worker_handle_work(struct io_worker *worker)
 			work->func(&work);
 			work = (old_work == work) ? NULL : work;
 
-			assign_work = work;
-			if (work && io_wq_is_hashed(work))
-				assign_work = NULL;
+			assign_work = next_hashed;
+			if (!next_hashed && work && !io_wq_is_hashed(work))
+				assign_work = work;
+
 			io_assign_current_work(worker, assign_work);
 			wq->free_work(old_work);
 
+			if (next_hashed) {
+				if (work)
+					io_wqe_enqueue(wqe, work);
+				work = next_hashed;
+				continue;
+			}
+
 			if (work && !assign_work) {
 				io_wqe_enqueue(wqe, work);
 				work = NULL;
@@ -776,6 +793,26 @@ static void io_run_cancel(struct io_wq_work *work, struct io_wqe *wqe)
 	} while (work);
 }
 
+static void io_wqe_insert_work(struct io_wqe *wqe, struct io_wq_work *work)
+{
+	int hash;
+	struct io_wq_work *last_hashed;
+
+	if (!io_wq_is_hashed(work)) {
+append:
+		wq_list_add_tail(&work->list, &wqe->work_list);
+		return;
+	}
+
+	hash = io_get_work_hash(work);
+	last_hashed = wqe->last_hashed[hash];
+	wqe->last_hashed[hash] = work;
+	if (!last_hashed)
+		goto append;
+
+	wq_list_add_after(&work->list, &last_hashed->list, &wqe->work_list);
+}
+
 static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work)
 {
 	struct io_wqe_acct *acct = io_work_get_acct(wqe, work);
@@ -795,7 +832,7 @@ static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work)
 
 	work_flags = work->flags;
 	spin_lock_irqsave(&wqe->lock, flags);
-	wq_list_add_tail(&work->list, &wqe->work_list);
+	io_wqe_insert_work(wqe, work);
 	wqe->flags &= ~IO_WQE_FLAG_STALLED;
 	spin_unlock_irqrestore(&wqe->lock, flags);
 
diff --git a/fs/io-wq.h b/fs/io-wq.h
index 298b21f4a4d2..10367b6238da 100644
--- a/fs/io-wq.h
+++ b/fs/io-wq.h
@@ -40,17 +40,37 @@ static inline void wq_list_add_tail(struct io_wq_work_node *node,
 	}
 }
 
+static inline void wq_list_add_after(struct io_wq_work_node *node,
+				     struct io_wq_work_node *pos,
+				     struct io_wq_work_list *list)
+{
+	struct io_wq_work_node *next = pos->next;
+
+	pos->next = node;
+	node->next = next;
+	if (!next)
+		list->last = node;
+}
+
+static inline void wq_del_node_range(struct io_wq_work_list *list,
+				     struct io_wq_work_node *last,
+				     struct io_wq_work_node *prev)
+{
+	if (!prev)
+		WRITE_ONCE(list->first, last->next);
+	else
+		prev->next = last->next;
+
+	if (last == list->last)
+		list->last = prev;
+	last->next = NULL;
+}
+
 static inline void wq_node_del(struct io_wq_work_list *list,
 			       struct io_wq_work_node *node,
 			       struct io_wq_work_node *prev)
 {
-	if (node == list->first)
-		WRITE_ONCE(list->first, node->next);
-	if (node == list->last)
-		list->last = prev;
-	if (prev)
-		prev->next = node->next;
-	node->next = NULL;
+	wq_del_node_range(list, node, prev);
 }
 
 #define wq_list_for_each(pos, prv, head)				\
-- 
2.24.0
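
To make the list manipulation easier to eyeball outside of io-wq, here is a
stand-alone userspace sketch of the same idea, not the patch itself: a toy
singly-linked work list plus a per-bucket "last hashed" pointer. All names
here (node, work_list, insert_work, grab_hashed, NR_BUCKETS) are made up for
illustration; only the enqueue/splice logic mirrors io_wqe_insert_work(),
wq_list_add_after() and wq_del_node_range() from the diff. It runs the
REQ(hash=0), REQ(hash=1), REQ(hash=0), REQ(), REQ(hash=0) sequence from
above.

/* toy_splice.c: stand-alone illustration, not kernel code */
#include <stdio.h>

struct node {
	int hash;		/* -1 means "not hashed" */
	struct node *next;
};

struct work_list {
	struct node *first;
	struct node *last;
};

#define NR_BUCKETS	32

static struct node *last_hashed[NR_BUCKETS];

/* plain tail insert, like wq_list_add_tail() */
static void list_add_tail(struct work_list *list, struct node *n)
{
	n->next = NULL;
	if (!list->first)
		list->first = n;
	else
		list->last->next = n;
	list->last = n;
}

/* insert @n right after @pos, like wq_list_add_after() in the diff */
static void list_add_after(struct work_list *list, struct node *pos,
			   struct node *n)
{
	n->next = pos->next;
	pos->next = n;
	if (!n->next)
		list->last = n;
}

/* enqueue keeping same-hashed works adjacent, like io_wqe_insert_work() */
static void insert_work(struct work_list *list, struct node *n)
{
	struct node *last;

	if (n->hash < 0) {
		list_add_tail(list, n);
		return;
	}
	last = last_hashed[n->hash];
	last_hashed[n->hash] = n;
	if (!last)
		list_add_tail(list, n);
	else
		list_add_after(list, last, n);
}

/*
 * Splice the whole [first, last_hashed[hash]] range out in one go, like
 * wq_del_node_range().  @prev is the node before @first, or NULL when
 * @first is the list head.
 */
static struct node *grab_hashed(struct work_list *list, struct node *first,
				struct node *prev)
{
	struct node *last = last_hashed[first->hash];

	if (!prev)
		list->first = last->next;
	else
		prev->next = last->next;
	if (last == list->last)
		list->last = prev;
	last->next = NULL;
	last_hashed[first->hash] = NULL;
	return first;		/* head of the spliced chain */
}

int main(void)
{
	/* the submission sequence from above: hash=0, 1, 0, none, 0 */
	struct node reqs[5] = { { 0 }, { 1 }, { 0 }, { -1 }, { 0 } };
	struct work_list list = { NULL, NULL };
	struct node *n;
	int i;

	for (i = 0; i < 5; i++)
		insert_work(&list, &reqs[i]);

	/* the list head is hashed, so take its whole bucket at once */
	for (n = grab_hashed(&list, list.first, NULL); n; n = n->next)
		printf("spliced: hash=%d\n", n->hash);	/* 0, 0, 0 */
	for (n = list.first; n; n = n->next)
		printf("left:    hash=%d\n", n->hash);	/* 1, -1 */
	return 0;
}

With the last_hashed[] bookkeeping the three hash=0 works end up adjacent in
the list even though other requests were submitted in between, so a single
splice hands the whole batch over, which is the behaviour the diff is after.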