linux-mm.kvack.org archive mirror
 help / color / mirror / Atom feed
From: Bin Lai <sclaibin@gmail.com>
To: mingo@redhat.com, peterz@infradead.org, juri.lelli@redhat.com,
	vincent.guittot@linaro.org, akpm@linux-foundation.org
Cc: dietmar.eggemann@arm.com, rostedt@goodmis.org,
	bsegall@google.com, mgorman@suse.de, bristot@redhat.com,
	vschneid@redhat.com, willy@infradead.org,
	linux-kernel@vger.kernel.org, linux-fsdevel@vger.kernel.org,
	linux-mm@kvack.org, Bin Lai <robinlai@tencent.com>
Subject: [PATCH] sched/wait: introduce endmark in __wake_up_common
Date: Tue, 10 Oct 2023 11:28:33 +0800	[thread overview]
Message-ID: <20231010032833.398033-1-robinlai@tencent.com> (raw)

Without this patch, the waker can fall into an infinite loop in some
cases. Commit 2554db916586 ("sched/wait: Break up long wake list walk")
introduced WQ_FLAG_BOOKMARK to break up long wake list walks. When the
number of walked entries reaches 64, the waker records the scan position
and releases the queue lock, which reduces interrupt and rescheduling
latency.

As an example, the ltp-aiodio case of runltp-ng creates 100 processes
for I/O testing. These processes write to the same test file and wait for
page writeback to complete. Because these processes are all writing to
the same file, they may all be waiting for the writeback bit of the same
page to be cleared. When the page writeback is completed,
end_page_writeback will clear the writeback bit of the page and wake up
all processes on the wake list for the page. At the same time, another
process could submit the page and set the writeback bit again. When the
awakened processes find that the writeback bit has not been cleared, they
will try to add themselves to the wake list again immediately. Because of
the WQ_FLAG_BOOKMARK feature, the awakened processes will be added to the
tail of the wake list again after the waker releases the queue lock. This
causes the waker to fall into an infinite loop.

Therefore, we introduce an endmark entry to indicate the end of the
bookmark state. Once the walk reaches the endmark entry, the waker stops
placing the bookmark and holds the lock until all remaining entries have
been walked.

Signed-off-by: Bin Lai <robinlai@tencent.com>

diff --git a/include/linux/wait.h b/include/linux/wait.h
index 5ec7739400f4..3413babd2db4 100644
--- a/include/linux/wait.h
+++ b/include/linux/wait.h
@@ -213,7 +213,8 @@ int __wake_up(struct wait_queue_head *wq_head, unsigned int mode, int nr, void *
 void __wake_up_on_current_cpu(struct wait_queue_head *wq_head, unsigned int mode, void *key);
 void __wake_up_locked_key(struct wait_queue_head *wq_head, unsigned int mode, void *key);
 void __wake_up_locked_key_bookmark(struct wait_queue_head *wq_head,
-		unsigned int mode, void *key, wait_queue_entry_t *bookmark);
+		unsigned int mode, void *key, wait_queue_entry_t *bookmark,
+		wait_queue_entry_t *endmark);
 void __wake_up_sync_key(struct wait_queue_head *wq_head, unsigned int mode, void *key);
 void __wake_up_locked_sync_key(struct wait_queue_head *wq_head, unsigned int mode, void *key);
 void __wake_up_locked(struct wait_queue_head *wq_head, unsigned int mode, int nr);
diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c
index 802d98cf2de3..9ecb59193710 100644
--- a/kernel/sched/wait.c
+++ b/kernel/sched/wait.c
@@ -79,10 +79,11 @@ EXPORT_SYMBOL(remove_wait_queue);
  */
 static int __wake_up_common(struct wait_queue_head *wq_head, unsigned int mode,
 			int nr_exclusive, int wake_flags, void *key,
-			wait_queue_entry_t *bookmark)
+			wait_queue_entry_t *bookmark,
+			wait_queue_entry_t *endmark)
 {
 	wait_queue_entry_t *curr, *next;
-	int cnt = 0;
+	int cnt = 0, touch_endmark = 0;
 
 	lockdep_assert_held(&wq_head->lock);
 
@@ -95,12 +96,17 @@ static int __wake_up_common(struct wait_queue_head *wq_head, unsigned int mode,
 		curr = list_first_entry(&wq_head->head, wait_queue_entry_t, entry);
 
 	if (&curr->entry == &wq_head->head)
-		return nr_exclusive;
+		goto out;
 
 	list_for_each_entry_safe_from(curr, next, &wq_head->head, entry) {
 		unsigned flags = curr->flags;
 		int ret;
 
+		if (curr == endmark) {
+			touch_endmark = 1;
+			continue;
+		}
+
 		if (flags & WQ_FLAG_BOOKMARK)
 			continue;
 
@@ -110,14 +116,24 @@ static int __wake_up_common(struct wait_queue_head *wq_head, unsigned int mode,
 		if (ret && (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
 			break;
 
-		if (bookmark && (++cnt > WAITQUEUE_WALK_BREAK_CNT) &&
+		if (bookmark && !touch_endmark && (++cnt > WAITQUEUE_WALK_BREAK_CNT) &&
 				(&next->entry != &wq_head->head)) {
 			bookmark->flags = WQ_FLAG_BOOKMARK;
 			list_add_tail(&bookmark->entry, &next->entry);
-			break;
+
+			if (endmark && !(endmark->flags & WQ_FLAG_BOOKMARK)) {
+				endmark->flags = WQ_FLAG_BOOKMARK;
+				list_add_tail(&endmark->entry, &wq_head->head);
+			}
+
+			return nr_exclusive;
 		}
 	}
 
+out:
+	if (endmark && (endmark->flags & WQ_FLAG_BOOKMARK))
+		list_del(&endmark->entry);
+
 	return nr_exclusive;
 }
 
@@ -125,7 +141,7 @@ static int __wake_up_common_lock(struct wait_queue_head *wq_head, unsigned int m
 			int nr_exclusive, int wake_flags, void *key)
 {
 	unsigned long flags;
-	wait_queue_entry_t bookmark;
+	wait_queue_entry_t bookmark, endmark;
 	int remaining = nr_exclusive;
 
 	bookmark.flags = 0;
@@ -133,10 +149,15 @@ static int __wake_up_common_lock(struct wait_queue_head *wq_head, unsigned int m
 	bookmark.func = NULL;
 	INIT_LIST_HEAD(&bookmark.entry);
 
+	endmark.flags = 0;
+	endmark.private = NULL;
+	endmark.func = NULL;
+	INIT_LIST_HEAD(&endmark.entry);
+
 	do {
 		spin_lock_irqsave(&wq_head->lock, flags);
 		remaining = __wake_up_common(wq_head, mode, remaining,
-						wake_flags, key, &bookmark);
+					     wake_flags, key, &bookmark, &endmark);
 		spin_unlock_irqrestore(&wq_head->lock, flags);
 	} while (bookmark.flags & WQ_FLAG_BOOKMARK);
 
@@ -171,20 +192,21 @@ void __wake_up_on_current_cpu(struct wait_queue_head *wq_head, unsigned int mode
  */
 void __wake_up_locked(struct wait_queue_head *wq_head, unsigned int mode, int nr)
 {
-	__wake_up_common(wq_head, mode, nr, 0, NULL, NULL);
+	__wake_up_common(wq_head, mode, nr, 0, NULL, NULL, NULL);
 }
 EXPORT_SYMBOL_GPL(__wake_up_locked);
 
 void __wake_up_locked_key(struct wait_queue_head *wq_head, unsigned int mode, void *key)
 {
-	__wake_up_common(wq_head, mode, 1, 0, key, NULL);
+	__wake_up_common(wq_head, mode, 1, 0, key, NULL, NULL);
 }
 EXPORT_SYMBOL_GPL(__wake_up_locked_key);
 
 void __wake_up_locked_key_bookmark(struct wait_queue_head *wq_head,
-		unsigned int mode, void *key, wait_queue_entry_t *bookmark)
+		unsigned int mode, void *key, wait_queue_entry_t *bookmark,
+		wait_queue_entry_t *endmark)
 {
-	__wake_up_common(wq_head, mode, 1, 0, key, bookmark);
+	__wake_up_common(wq_head, mode, 1, 0, key, bookmark, endmark);
 }
 EXPORT_SYMBOL_GPL(__wake_up_locked_key_bookmark);
 
@@ -233,7 +255,7 @@ EXPORT_SYMBOL_GPL(__wake_up_sync_key);
 void __wake_up_locked_sync_key(struct wait_queue_head *wq_head,
 			       unsigned int mode, void *key)
 {
-        __wake_up_common(wq_head, mode, 1, WF_SYNC, key, NULL);
+	__wake_up_common(wq_head, mode, 1, WF_SYNC, key, NULL, NULL);
 }
 EXPORT_SYMBOL_GPL(__wake_up_locked_sync_key);
 
diff --git a/mm/filemap.c b/mm/filemap.c
index 4ea4387053e8..49dc8620271d 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1135,7 +1135,7 @@ static void folio_wake_bit(struct folio *folio, int bit_nr)
 	wait_queue_head_t *q = folio_waitqueue(folio);
 	struct wait_page_key key;
 	unsigned long flags;
-	wait_queue_entry_t bookmark;
+	wait_queue_entry_t bookmark, endmark;
 
 	key.folio = folio;
 	key.bit_nr = bit_nr;
@@ -1146,8 +1146,13 @@ static void folio_wake_bit(struct folio *folio, int bit_nr)
 	bookmark.func = NULL;
 	INIT_LIST_HEAD(&bookmark.entry);
 
+	endmark.flags = 0;
+	endmark.private = NULL;
+	endmark.func = NULL;
+	INIT_LIST_HEAD(&endmark.entry);
+
 	spin_lock_irqsave(&q->lock, flags);
-	__wake_up_locked_key_bookmark(q, TASK_NORMAL, &key, &bookmark);
+	__wake_up_locked_key_bookmark(q, TASK_NORMAL, &key, &bookmark, &endmark);
 
 	while (bookmark.flags & WQ_FLAG_BOOKMARK) {
 		/*
@@ -1159,7 +1164,7 @@ static void folio_wake_bit(struct folio *folio, int bit_nr)
 		spin_unlock_irqrestore(&q->lock, flags);
 		cpu_relax();
 		spin_lock_irqsave(&q->lock, flags);
-		__wake_up_locked_key_bookmark(q, TASK_NORMAL, &key, &bookmark);
+		__wake_up_locked_key_bookmark(q, TASK_NORMAL, &key, &bookmark, &endmark);
 	}
 
 	/*
-- 
2.39.3



             reply	other threads:[~2023-10-10  3:28 UTC|newest]

Thread overview: 6+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2023-10-10  3:28 Bin Lai [this message]
2023-10-10  3:57 ` [PATCH] sched/wait: introduce endmark in __wake_up_common Matthew Wilcox
2023-10-10  7:04   ` lai bin
2023-10-10  3:58 ` [PATCH 1/2] filemap: Remove use of wait bookmarks Matthew Wilcox (Oracle)
2023-10-10  3:58   ` [PATCH 2/2] sched: Remove " Matthew Wilcox (Oracle)
2023-10-10  7:13     ` Ingo Molnar

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20231010032833.398033-1-robinlai@tencent.com \
    --to=sclaibin@gmail.com \
    --cc=akpm@linux-foundation.org \
    --cc=bristot@redhat.com \
    --cc=bsegall@google.com \
    --cc=dietmar.eggemann@arm.com \
    --cc=juri.lelli@redhat.com \
    --cc=linux-fsdevel@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-mm@kvack.org \
    --cc=mgorman@suse.de \
    --cc=mingo@redhat.com \
    --cc=peterz@infradead.org \
    --cc=robinlai@tencent.com \
    --cc=rostedt@goodmis.org \
    --cc=vincent.guittot@linaro.org \
    --cc=vschneid@redhat.com \
    --cc=willy@infradead.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).