From: Linus Torvalds <torvalds@linux-foundation.org>
To: Hugh Dickins <hughd@google.com>
Cc: Oleg Nesterov <oleg@redhat.com>, Michal Hocko <mhocko@kernel.org>,
Linux-MM <linux-mm@kvack.org>,
LKML <linux-kernel@vger.kernel.org>,
Andrew Morton <akpm@linux-foundation.org>,
Tim Chen <tim.c.chen@linux.intel.com>,
Michal Hocko <mhocko@suse.com>,
Greg KH <gregkh@linuxfoundation.org>
Subject: Re: [RFC PATCH] mm: silence soft lockups from unlock_page
Date: Thu, 6 Aug 2020 10:07:07 -0700 [thread overview]
Message-ID: <CAHk-=whf7wCUV2oTDUg0eeNafhhk_OhJBT2SbHZXwgtmAzNeTg@mail.gmail.com> (raw)
In-Reply-To: <alpine.LSU.2.11.2008052105040.8716@eggly.anvils>
[-- Attachment #1: Type: text/plain, Size: 1241 bytes --]
On Wed, Aug 5, 2020 at 10:21 PM Hugh Dickins <hughd@google.com> wrote:
>
> Something I was interested to realize in looking at this: trylock_page()
> on a contended lock is now much less likely to jump the queue and
> succeed than before, since your lock holder hands off the page lock to
> the next holder: much smaller window than waiting for the next to wake
> to take it. Nothing wrong with that, but effect might be seen somewhere.
Yeah, the window is smaller, but it's not gone.
It *might* be interesting to actually do the handover directly from
"unlock_page()", and avoid clearing (and then setting) the bit
entirely.
Something like the attached TOTALLY UNTESTED patch.
NOTE! Sometimes when I say something is untested, I think the patch is
fine because it's simple and straightforward, I just didn't test it.
This time it's both untested and very very subtle indeed. Did I get
the hand-over case SMP memory barriers right? Did I screw something
else up?
So this might be complete garbage. I really don't know. But it might
close the window for an unfair trylock (or lucky page_lock())
entirely.
Or maybe it just makes page locking break entirely. It's a very real risk.
The upside may not be worth it.
Linus
[-- Attachment #2: patch --]
[-- Type: application/octet-stream, Size: 5218 bytes --]
include/linux/wait.h | 2 +-
kernel/sched/wait.c | 9 +++++++--
mm/filemap.c | 51 ++++++++++++++++++++++++++++++++++++++-------------
3 files changed, 46 insertions(+), 16 deletions(-)
diff --git a/include/linux/wait.h b/include/linux/wait.h
index 898c890fc153..5ab3df535f39 100644
--- a/include/linux/wait.h
+++ b/include/linux/wait.h
@@ -200,7 +200,7 @@ __remove_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq
void __wake_up(struct wait_queue_head *wq_head, unsigned int mode, int nr, void *key);
void __wake_up_locked_key(struct wait_queue_head *wq_head, unsigned int mode, void *key);
-void __wake_up_locked_key_bookmark(struct wait_queue_head *wq_head,
+int __wake_up_locked_key_bookmark(struct wait_queue_head *wq_head,
unsigned int mode, void *key, wait_queue_entry_t *bookmark);
void __wake_up_sync_key(struct wait_queue_head *wq_head, unsigned int mode, void *key);
void __wake_up_locked_sync_key(struct wait_queue_head *wq_head, unsigned int mode, void *key);
diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c
index 01f5d3020589..578f4f4a400d 100644
--- a/kernel/sched/wait.c
+++ b/kernel/sched/wait.c
@@ -158,10 +158,15 @@ void __wake_up_locked_key(struct wait_queue_head *wq_head, unsigned int mode, vo
}
EXPORT_SYMBOL_GPL(__wake_up_locked_key);
-void __wake_up_locked_key_bookmark(struct wait_queue_head *wq_head,
+/*
+ * This returns true if it woke up an exclusive waiter (ie
+ * 'nr_exclusive' dropped from 1 to 0). May be useful for
+ * lock handoff.
+ */
+int __wake_up_locked_key_bookmark(struct wait_queue_head *wq_head,
unsigned int mode, void *key, wait_queue_entry_t *bookmark)
{
- __wake_up_common(wq_head, mode, 1, 0, key, bookmark);
+ return !__wake_up_common(wq_head, mode, 1, 0, key, bookmark);
}
EXPORT_SYMBOL_GPL(__wake_up_locked_key_bookmark);
diff --git a/mm/filemap.c b/mm/filemap.c
index 9f131f1cfde3..1e2536b98000 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -998,8 +998,9 @@ static int wake_page_function(wait_queue_entry_t *wait, unsigned mode, int sync,
return 0;
/*
- * If it's an exclusive wait, we get the bit for it, and
- * stop walking if we can't.
+ * If it's an exclusive wait, we just tell the waker that
+ * we have done the exclusive wait. It will know never to
+ * actually even clear the bit.
*
* If it's a non-exclusive wait, then the fact that this
* wake function was called means that the bit already
@@ -1007,11 +1008,8 @@ static int wake_page_function(wait_queue_entry_t *wait, unsigned mode, int sync,
* re-took it.
*/
ret = 0;
- if (wait->flags & WQ_FLAG_EXCLUSIVE) {
- if (test_and_set_bit(key->bit_nr, &key->page->flags))
- return -1;
+ if (wait->flags & WQ_FLAG_EXCLUSIVE)
ret = 1;
- }
wait->flags |= WQ_FLAG_WOKEN;
wake_up_state(wait->private, mode);
@@ -1029,12 +1027,13 @@ static int wake_page_function(wait_queue_entry_t *wait, unsigned mode, int sync,
return ret;
}
-static void wake_up_page_bit(struct page *page, int bit_nr)
+static int wake_up_page_bit(struct page *page, int bit_nr)
{
wait_queue_head_t *q = page_waitqueue(page);
struct wait_page_key key;
unsigned long flags;
wait_queue_entry_t bookmark;
+ int exclusive;
key.page = page;
key.bit_nr = bit_nr;
@@ -1046,7 +1045,7 @@ static void wake_up_page_bit(struct page *page, int bit_nr)
INIT_LIST_HEAD(&bookmark.entry);
spin_lock_irqsave(&q->lock, flags);
- __wake_up_locked_key_bookmark(q, TASK_NORMAL, &key, &bookmark);
+ exclusive = __wake_up_locked_key_bookmark(q, TASK_NORMAL, &key, &bookmark);
while (bookmark.flags & WQ_FLAG_BOOKMARK) {
/*
@@ -1058,7 +1057,7 @@ static void wake_up_page_bit(struct page *page, int bit_nr)
spin_unlock_irqrestore(&q->lock, flags);
cpu_relax();
spin_lock_irqsave(&q->lock, flags);
- __wake_up_locked_key_bookmark(q, TASK_NORMAL, &key, &bookmark);
+ exclusive |= __wake_up_locked_key_bookmark(q, TASK_NORMAL, &key, &bookmark);
}
/*
@@ -1081,6 +1080,7 @@ static void wake_up_page_bit(struct page *page, int bit_nr)
*/
}
spin_unlock_irqrestore(&q->lock, flags);
+ return exclusive;
}
static void wake_up_page(struct page *page, int bit)
@@ -1339,11 +1339,36 @@ static inline bool clear_bit_unlock_is_negative_byte(long nr, volatile void *mem
*/
void unlock_page(struct page *page)
{
- BUILD_BUG_ON(PG_waiters != 7);
+ unsigned long flags;
+
page = compound_head(page);
- VM_BUG_ON_PAGE(!PageLocked(page), page);
- if (clear_bit_unlock_is_negative_byte(PG_locked, &page->flags))
- wake_up_page_bit(page, PG_locked);
+
+ flags = READ_ONCE(page->flags);
+ VM_BUG_ON_PAGE(!(flags & (1 << PG_locked)), page);
+
+ for (;;) {
+ unsigned long new;
+
+ /*
+ * If wake_up_page_bit() wakes an exclusive
+ * waiter, it will have handed the lock over
+ * directly.
+ */
+ if (flags & (1 << PG_waiters)) {
+ /*
+ * Lock hand-over serialization. The atomic is the
+ * spinlock wake_up_page_bit() will do.
+ */
+ smp_mb__before_atomic();
+ if (wake_up_page_bit(page, PG_locked))
+ return;
+ }
+ new = cmpxchg_release(&page->flags, flags, flags & ~(1 << PG_locked));
+ if (likely(new == flags))
+ return;
+
+ flags = new;
+ }
}
EXPORT_SYMBOL(unlock_page);