All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH RFC -next] sbitmap: fix possible io hung due to lost wakeups
@ 2022-06-17 14:11 Yu Kuai
  2022-06-17 14:04 ` Keith Busch
  2022-06-20 12:24 ` Jan Kara
  0 siblings, 2 replies; 8+ messages in thread
From: Yu Kuai @ 2022-06-17 14:11 UTC (permalink / raw)
  To: axboe, ming.lei; +Cc: jack, linux-block, linux-kernel, yukuai3, yi.zhang

Currently, same waitqueue might be woken up continuously:

__sbq_wake_up		__sbq_wake_up
 sbq_wake_ptr -> assume	0
			 sbq_wake_ptr -> 0
 atomic_dec_return
			atomic_dec_return
 atomic_cmpxchg -> succeed
			 atomic_cmpxchg -> failed
			  return true

			__sbq_wake_up
			 sbq_wake_ptr
			  atomic_read(&sbq->wake_index) -> still 0
 sbq_index_atomic_inc -> inc to 1
			  if (waitqueue_active(&ws->wait))
			   if (wake_index != atomic_read(&sbq->wake_index))
			    atomic_set -> reset from 1 to 0
 wake_up_nr -> wake up first waitqueue
			    // continue to wake up in first waitqueue

What's worse, io hung is possible in theory because wakeups might be
missed. For example, 2 * wake_batch tags are put, while only wake_batch
threads are worken:

__sbq_wake_up
 atomic_cmpxchg -> reset wait_cnt
			__sbq_wake_up -> decrease wait_cnt
			...
			__sbq_wake_up -> wait_cnt is decreased to 0 again
			 atomic_cmpxchg
			 sbq_index_atomic_inc -> increase wake_index
			 wake_up_nr -> wake up and waitqueue might be empty
 sbq_index_atomic_inc -> increase again, one waitqueue is skipped
 wake_up_nr -> invalid wake up because old wakequeue might be empty

To fix the problem, refactor to make sure waitqueues will be woken up
one by one,

Signed-off-by: Yu Kuai <yukuai3@huawei.com>
---
 lib/sbitmap.c | 83 ++++++++++++++++++++++++++-------------------------
 1 file changed, 43 insertions(+), 40 deletions(-)

diff --git a/lib/sbitmap.c b/lib/sbitmap.c
index ae4fd4de9ebe..dc2959cb188c 100644
--- a/lib/sbitmap.c
+++ b/lib/sbitmap.c
@@ -574,66 +574,69 @@ void sbitmap_queue_min_shallow_depth(struct sbitmap_queue *sbq,
 }
 EXPORT_SYMBOL_GPL(sbitmap_queue_min_shallow_depth);
 
-static struct sbq_wait_state *sbq_wake_ptr(struct sbitmap_queue *sbq)
+static void sbq_update_wake_index(struct sbitmap_queue *sbq,
+				  int old_wake_index)
 {
 	int i, wake_index;
-
-	if (!atomic_read(&sbq->ws_active))
-		return NULL;
+	struct sbq_wait_state *ws;
 
 	wake_index = atomic_read(&sbq->wake_index);
-	for (i = 0; i < SBQ_WAIT_QUEUES; i++) {
-		struct sbq_wait_state *ws = &sbq->ws[wake_index];
+	if (old_wake_index != wake_index)
+		return;
 
+	for (i = 1; i < SBQ_WAIT_QUEUES; i++) {
+		wake_index = sbq_index_inc(wake_index);
+		ws = &sbq->ws[wake_index];
+		/* Find the next active waitqueue in round robin manner */
 		if (waitqueue_active(&ws->wait)) {
-			if (wake_index != atomic_read(&sbq->wake_index))
-				atomic_set(&sbq->wake_index, wake_index);
-			return ws;
+			atomic_cmpxchg(&sbq->wake_index, old_wake_index,
+				       wake_index);
+			return;
 		}
-
-		wake_index = sbq_index_inc(wake_index);
 	}
-
-	return NULL;
 }
 
 static bool __sbq_wake_up(struct sbitmap_queue *sbq)
 {
 	struct sbq_wait_state *ws;
 	unsigned int wake_batch;
-	int wait_cnt;
+	int wait_cnt, wake_index;
 
-	ws = sbq_wake_ptr(sbq);
-	if (!ws)
+	if (!atomic_read(&sbq->ws_active))
 		return false;
 
-	wait_cnt = atomic_dec_return(&ws->wait_cnt);
-	if (wait_cnt <= 0) {
-		int ret;
-
-		wake_batch = READ_ONCE(sbq->wake_batch);
-
-		/*
-		 * Pairs with the memory barrier in sbitmap_queue_resize() to
-		 * ensure that we see the batch size update before the wait
-		 * count is reset.
-		 */
-		smp_mb__before_atomic();
+	wake_index = atomic_read(&sbq->wake_index);
+	ws = &sbq->ws[wake_index];
+	/*
+	 * This can only happen in the first wakeup when sbitmap waitqueues
+	 * are no longer idle.
+	 */
+	if (!waitqueue_active(&ws->wait)) {
+		sbq_update_wake_index(sbq, wake_index);
+		return true;
+	}
 
-		/*
-		 * For concurrent callers of this, the one that failed the
-		 * atomic_cmpxhcg() race should call this function again
-		 * to wakeup a new batch on a different 'ws'.
-		 */
-		ret = atomic_cmpxchg(&ws->wait_cnt, wait_cnt, wake_batch);
-		if (ret == wait_cnt) {
-			sbq_index_atomic_inc(&sbq->wake_index);
-			wake_up_nr(&ws->wait, wake_batch);
-			return false;
-		}
+	wait_cnt = atomic_dec_return(&ws->wait_cnt);
+	if (wait_cnt > 0)
+		return false;
 
+	sbq_update_wake_index(sbq, wake_index);
+	/*
+	 * Concurrent callers should call this function again
+	 * to wakeup a new batch on a different 'ws'.
+	 */
+	if (wait_cnt < 0)
 		return true;
-	}
+
+	wake_batch = READ_ONCE(sbq->wake_batch);
+	/*
+	 * Pairs with the memory barrier in sbitmap_queue_resize() to
+	 * ensure that we see the batch size update before the wait
+	 * count is reset.
+	 */
+	smp_mb__before_atomic();
+	atomic_set(&ws->wait_cnt, wake_batch);
+	wake_up_nr(&ws->wait, wake_batch);
 
 	return false;
 }
-- 
2.31.1


^ permalink raw reply related	[flat|nested] 8+ messages in thread

end of thread, other threads:[~2022-06-22  6:41 UTC | newest]

Thread overview: 8+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-06-17 14:11 [PATCH RFC -next] sbitmap: fix possible io hung due to lost wakeups Yu Kuai
2022-06-17 14:04 ` Keith Busch
2022-06-20 12:24 ` Jan Kara
2022-06-20 12:48   ` Jan Kara
2022-06-20 13:44     ` Yu Kuai
2022-06-20 17:02       ` Jan Kara
2022-06-22  3:58         ` Yu Kuai
2022-06-22  6:41           ` Yu Kuai

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.