* [PATCH 1/2 v2] sched/wait: Break up long wake list walk
@ 2017-08-25 16:13 ` Tim Chen
  0 siblings, 0 replies; 57+ messages in thread
From: Tim Chen @ 2017-08-25 16:13 UTC (permalink / raw)
  To: Linus Torvalds, Mel Gorman, Peter Zijlstra
  Cc: Tim Chen, Ingo Molnar, Andi Kleen, Kan Liang, Andrew Morton,
	Johannes Weiner, Jan Kara, Christopher Lameter,
	Eric W . Biederman, Davidlohr Bueso, linux-mm, linux-kernel

We encountered workloads that have very long wake up lists on large
systems. A waker takes a long time to traverse the entire wake list and
execute all the wake functions.

We saw page wait lists that are up to 3700+ entries long in tests of large
4 and 8 socket systems.  It took 0.8 sec to traverse such a list during
wake up.  Any other CPU that contends for the list spin lock will spin
for a long time.  This is a result of the NUMA balancing migration of hot
pages that are shared by many threads.

Multiple CPUs doing wake ups are queued up behind the lock, and the last one
queued has to wait until all the other CPUs have finished their wakeups.

The page wait list is traversed with interrupts disabled, which caused
various problems. This was the original cause that triggered the NMI
watchdog timer in: https://patchwork.kernel.org/patch/9800303/ . Only
extending the NMI watchdog timeout helped there.

This patch bookmarks the waker's scan position in the wake list and breaks
up the wake up walk, to allow access to the list before the waker resumes
its walk down the rest of the wait list.  It lowers the interrupt and
rescheduling latency.

This patch also provides a performance boost when combined with the next
patch, which breaks up the page wakeup list walk. We saw a 22% improvement
in the will-it-scale file pread2 test on a Xeon Phi system running 256 threads.
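
In essence, the waker's loop becomes the following (condensed from the
kernel/sched/wait.c hunk below):

    wait_queue_entry_t bookmark;
    unsigned long flags;

    bookmark.flags = 0;
    bookmark.private = NULL;
    bookmark.func = NULL;
    INIT_LIST_HEAD(&bookmark.entry);

    do {
        spin_lock_irqsave(&wq_head->lock, flags);
        /* wakes at most WAITQUEUE_WALK_BREAK_CNT entries, then parks
         * the bookmark at the break point and sets WQ_FLAG_BOOKMARK */
        nr_exclusive = __wake_up_common(wq_head, mode, nr_exclusive,
                                        wake_flags, key, &bookmark);
        spin_unlock_irqrestore(&wq_head->lock, flags);
    } while (bookmark.flags & WQ_FLAG_BOOKMARK);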

Thanks.

Tim

v2:
Merged in Linus' changes to remove the bookmark_wake_function, and
simplify access to flags.

Reported-by: Kan Liang <kan.liang@intel.com>
Tested-by: Kan Liang <kan.liang@intel.com>
Signed-off-by: Tim Chen <tim.c.chen@linux.intel.com>
---
 include/linux/wait.h |  1 +
 kernel/sched/wait.c  | 74 ++++++++++++++++++++++++++++++++++++++++++----------
 2 files changed, 61 insertions(+), 14 deletions(-)

diff --git a/include/linux/wait.h b/include/linux/wait.h
index 5b74e36..80034e8 100644
--- a/include/linux/wait.h
+++ b/include/linux/wait.h
@@ -18,6 +18,7 @@ int default_wake_function(struct wait_queue_entry *wq_entry, unsigned mode, int
 /* wait_queue_entry::flags */
 #define WQ_FLAG_EXCLUSIVE	0x01
 #define WQ_FLAG_WOKEN		0x02
+#define WQ_FLAG_BOOKMARK	0x04
 
 /*
  * A single wait-queue entry structure:
diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c
index 17f11c6..789dc24 100644
--- a/kernel/sched/wait.c
+++ b/kernel/sched/wait.c
@@ -53,6 +53,12 @@ void remove_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry
 }
 EXPORT_SYMBOL(remove_wait_queue);
 
+/*
+ * Scan threshold to break wait queue walk.
+ * This allows a waker to take a break from holding the
+ * wait queue lock during the wait queue walk.
+ */
+#define WAITQUEUE_WALK_BREAK_CNT 64
 
 /*
  * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just
@@ -63,17 +69,64 @@ EXPORT_SYMBOL(remove_wait_queue);
  * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns
  * zero in this (rare) case, and we handle it by continuing to scan the queue.
  */
-static void __wake_up_common(struct wait_queue_head *wq_head, unsigned int mode,
-			int nr_exclusive, int wake_flags, void *key)
+static int __wake_up_common(struct wait_queue_head *wq_head, unsigned int mode,
+			int nr_exclusive, int wake_flags, void *key,
+			wait_queue_entry_t *bookmark)
 {
 	wait_queue_entry_t *curr, *next;
+	int cnt = 0;
+
+	if (bookmark && (bookmark->flags & WQ_FLAG_BOOKMARK)) {
+		curr = list_next_entry(bookmark, entry);
 
-	list_for_each_entry_safe(curr, next, &wq_head->head, entry) {
+		list_del(&bookmark->entry);
+		bookmark->flags = 0;
+	} else
+		curr = list_first_entry(&wq_head->head, wait_queue_entry_t, entry);
+
+	if (&curr->entry == &wq_head->head)
+		return nr_exclusive;
+
+	list_for_each_entry_safe_from(curr, next, &wq_head->head, entry) {
 		unsigned flags = curr->flags;
 
+		if (flags & WQ_FLAG_BOOKMARK)
+			continue;
+
 		if (curr->func(curr, mode, wake_flags, key) &&
 				(flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
 			break;
+
+		if (bookmark && (++cnt > WAITQUEUE_WALK_BREAK_CNT) &&
+				(&next->entry != &wq_head->head)) {
+			bookmark->flags = WQ_FLAG_BOOKMARK;
+			list_add_tail(&bookmark->entry, &next->entry);
+			break;
+		}
+	}
+	return nr_exclusive;
+}
+
+static void __wake_up_common_lock(struct wait_queue_head *wq_head, unsigned int mode,
+			int nr_exclusive, int wake_flags, void *key)
+{
+	unsigned long flags;
+	wait_queue_entry_t bookmark;
+
+	bookmark.flags = 0;
+	bookmark.private = NULL;
+	bookmark.func = NULL;
+	INIT_LIST_HEAD(&bookmark.entry);
+
+	spin_lock_irqsave(&wq_head->lock, flags);
+	nr_exclusive = __wake_up_common(wq_head, mode, nr_exclusive, wake_flags, key, &bookmark);
+	spin_unlock_irqrestore(&wq_head->lock, flags);
+
+	while (bookmark.flags & WQ_FLAG_BOOKMARK) {
+		spin_lock_irqsave(&wq_head->lock, flags);
+		nr_exclusive = __wake_up_common(wq_head, mode, nr_exclusive,
+						wake_flags, key, &bookmark);
+		spin_unlock_irqrestore(&wq_head->lock, flags);
 	}
 }
 
@@ -90,11 +143,7 @@ static void __wake_up_common(struct wait_queue_head *wq_head, unsigned int mode,
 void __wake_up(struct wait_queue_head *wq_head, unsigned int mode,
 			int nr_exclusive, void *key)
 {
-	unsigned long flags;
-
-	spin_lock_irqsave(&wq_head->lock, flags);
-	__wake_up_common(wq_head, mode, nr_exclusive, 0, key);
-	spin_unlock_irqrestore(&wq_head->lock, flags);
+	__wake_up_common_lock(wq_head, mode, nr_exclusive, 0, key);
 }
 EXPORT_SYMBOL(__wake_up);
 
@@ -103,13 +152,13 @@ EXPORT_SYMBOL(__wake_up);
  */
 void __wake_up_locked(struct wait_queue_head *wq_head, unsigned int mode, int nr)
 {
-	__wake_up_common(wq_head, mode, nr, 0, NULL);
+	__wake_up_common(wq_head, mode, nr, 0, NULL, NULL);
 }
 EXPORT_SYMBOL_GPL(__wake_up_locked);
 
 void __wake_up_locked_key(struct wait_queue_head *wq_head, unsigned int mode, void *key)
 {
-	__wake_up_common(wq_head, mode, 1, 0, key);
+	__wake_up_common(wq_head, mode, 1, 0, key, NULL);
 }
 EXPORT_SYMBOL_GPL(__wake_up_locked_key);
 
@@ -133,7 +182,6 @@ EXPORT_SYMBOL_GPL(__wake_up_locked_key);
 void __wake_up_sync_key(struct wait_queue_head *wq_head, unsigned int mode,
 			int nr_exclusive, void *key)
 {
-	unsigned long flags;
 	int wake_flags = 1; /* XXX WF_SYNC */
 
 	if (unlikely(!wq_head))
@@ -142,9 +190,7 @@ void __wake_up_sync_key(struct wait_queue_head *wq_head, unsigned int mode,
 	if (unlikely(nr_exclusive != 1))
 		wake_flags = 0;
 
-	spin_lock_irqsave(&wq_head->lock, flags);
-	__wake_up_common(wq_head, mode, nr_exclusive, wake_flags, key);
-	spin_unlock_irqrestore(&wq_head->lock, flags);
+	__wake_up_common_lock(wq_head, mode, nr_exclusive, wake_flags, key);
 }
 EXPORT_SYMBOL_GPL(__wake_up_sync_key);
 
-- 
2.9.4

* [PATCH 2/2 v2] sched/wait: Introduce lock breaker in wake_up_page_bit
  2017-08-25 16:13 ` Tim Chen
@ 2017-08-25 16:13   ` Tim Chen
  -1 siblings, 0 replies; 57+ messages in thread
From: Tim Chen @ 2017-08-25 16:13 UTC (permalink / raw)
  To: Linus Torvalds, Mel Gorman, Peter Zijlstra
  Cc: Tim Chen, Ingo Molnar, Andi Kleen, Kan Liang, Andrew Morton,
	Johannes Weiner, Jan Kara, Christopher Lameter,
	Eric W . Biederman, Davidlohr Bueso, linux-mm, linux-kernel

Now that we have added breaks in the wait queue scan and allow bookmarking
of the scan position, we put this logic in the wake_up_page_bit function.

We can have very long page wait lists on large systems where multiple
pages share the same wait list. We break up the wake up walk here to allow
other CPUs a chance to access the list, and to avoid disabling interrupts
while traversing the list for too long.  This reduces the interrupt and
rescheduling latency, and excessive page wait queue lock hold time.

We have to add logic to detect any new arrivals, so that we clear the wait
bit on the page only when there are no new waiters for the page.  Breaking
up the wait list walk opens a window for new waiters to arrive on the wait
list during the wake ups. They could be added at the head or tail of the
wait queue, depending on whether they are exclusive, in
prepare_to_wait_event. So we can't clear the PageWaiters flag if there
are new arrivals during the wake up process.  Otherwise we will skip
wake_up_page while there are still entries to be woken up.
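
Condensed, the bookkeeping added to wake_up_page_bit() looks roughly like
this (see the mm/filemap.c hunk below for the full logic):

    spin_lock_irqsave(&q->lock, flags);
    if (!q->waker)                  /* first waker resets the arrivals flag */
        q->flags &= ~WQ_FLAG_ARRIVALS;
    q->waker++;

    /* ... bookmarked wake up walk, dropping q->lock between passes ... */

    /*
     * Clear PageWaiters only if we are the sole waker and no new
     * waiter arrived while q->lock was dropped.
     */
    if (!waitqueue_active(q) ||
        (!key.page_match && q->waker == 1 && !(q->flags & WQ_FLAG_ARRIVALS)))
        ClearPageWaiters(page);
    q->waker--;
    spin_unlock_irqrestore(&q->lock, flags);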

v2:
Remove bookmark_wake_function

Signed-off-by: Tim Chen <tim.c.chen@linux.intel.com>
---
 include/linux/wait.h |  7 +++++++
 kernel/sched/wait.c  |  7 +++++++
 mm/filemap.c         | 36 ++++++++++++++++++++++++++++++++++--
 3 files changed, 48 insertions(+), 2 deletions(-)

diff --git a/include/linux/wait.h b/include/linux/wait.h
index 80034e8..b926960 100644
--- a/include/linux/wait.h
+++ b/include/linux/wait.h
@@ -19,6 +19,7 @@ int default_wake_function(struct wait_queue_entry *wq_entry, unsigned mode, int
 #define WQ_FLAG_EXCLUSIVE	0x01
 #define WQ_FLAG_WOKEN		0x02
 #define WQ_FLAG_BOOKMARK	0x04
+#define WQ_FLAG_ARRIVALS	0x08
 
 /*
  * A single wait-queue entry structure:
@@ -32,6 +33,8 @@ struct wait_queue_entry {
 
 struct wait_queue_head {
 	spinlock_t		lock;
+	unsigned int		waker;
+	unsigned int		flags;
 	struct list_head	head;
 };
 typedef struct wait_queue_head wait_queue_head_t;
@@ -52,6 +55,8 @@ struct task_struct;
 
 #define __WAIT_QUEUE_HEAD_INITIALIZER(name) {					\
 	.lock		= __SPIN_LOCK_UNLOCKED(name.lock),			\
+	.waker		= 0,							\
+	.flags		= 0,							\
 	.head		= { &(name).head, &(name).head } }
 
 #define DECLARE_WAIT_QUEUE_HEAD(name) \
@@ -185,6 +190,8 @@ __remove_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq
 
 void __wake_up(struct wait_queue_head *wq_head, unsigned int mode, int nr, void *key);
 void __wake_up_locked_key(struct wait_queue_head *wq_head, unsigned int mode, void *key);
+void __wake_up_locked_key_bookmark(struct wait_queue_head *wq_head,
+		unsigned int mode, void *key, wait_queue_entry_t *bookmark);
 void __wake_up_sync_key(struct wait_queue_head *wq_head, unsigned int mode, int nr, void *key);
 void __wake_up_locked(struct wait_queue_head *wq_head, unsigned int mode, int nr);
 void __wake_up_sync(struct wait_queue_head *wq_head, unsigned int mode, int nr);
diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c
index 789dc24..81e7e55 100644
--- a/kernel/sched/wait.c
+++ b/kernel/sched/wait.c
@@ -162,6 +162,13 @@ void __wake_up_locked_key(struct wait_queue_head *wq_head, unsigned int mode, vo
 }
 EXPORT_SYMBOL_GPL(__wake_up_locked_key);
 
+void __wake_up_locked_key_bookmark(struct wait_queue_head *wq_head,
+		unsigned int mode, void *key, wait_queue_entry_t *bookmark)
+{
+	__wake_up_common(wq_head, mode, 1, 0, key, bookmark);
+}
+EXPORT_SYMBOL_GPL(__wake_up_locked_key_bookmark);
+
 /**
  * __wake_up_sync_key - wake up threads blocked on a waitqueue.
  * @wq_head: the waitqueue
diff --git a/mm/filemap.c b/mm/filemap.c
index a497024..a6c7917 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -920,13 +920,41 @@ static void wake_up_page_bit(struct page *page, int bit_nr)
 	wait_queue_head_t *q = page_waitqueue(page);
 	struct wait_page_key key;
 	unsigned long flags;
+	wait_queue_entry_t bookmark;
 
 	key.page = page;
 	key.bit_nr = bit_nr;
 	key.page_match = 0;
 
+	bookmark.flags = 0;
+	bookmark.private = NULL;
+	bookmark.func = NULL;
+	INIT_LIST_HEAD(&bookmark.entry);
+
+	spin_lock_irqsave(&q->lock, flags);
+	/* q->flags will be set to WQ_FLAG_ARRIVALS if items added to wait queue */
+	if (!q->waker)
+		q->flags &= ~WQ_FLAG_ARRIVALS;
+	++ q->waker;
+	__wake_up_locked_key_bookmark(q, TASK_NORMAL, &key, &bookmark);
+	if (!(bookmark.flags & WQ_FLAG_BOOKMARK))
+		goto finish;
+	/*
+	 * Take a breather from holding the lock,
+	 * allow pages that finish wake up asynchronously
+	 * to acquire the lock and remove themselves
+	 * from wait queue
+	 */
+	spin_unlock_irqrestore(&q->lock, flags);
+
+again:
 	spin_lock_irqsave(&q->lock, flags);
-	__wake_up_locked_key(q, TASK_NORMAL, &key);
+	__wake_up_locked_key_bookmark(q, TASK_NORMAL, &key, &bookmark);
+	if (bookmark.flags & WQ_FLAG_BOOKMARK) {
+		spin_unlock_irqrestore(&q->lock, flags);
+		goto again;
+	}
+finish:
 	/*
 	 * It is possible for other pages to have collided on the waitqueue
 	 * hash, so in that case check for a page match. That prevents a long-
@@ -936,7 +964,8 @@ static void wake_up_page_bit(struct page *page, int bit_nr)
 	 * and removed them from the waitqueue, but there are still other
 	 * page waiters.
 	 */
-	if (!waitqueue_active(q) || !key.page_match) {
+	if (!waitqueue_active(q) ||
+	    (!key.page_match && (q->waker == 1) && !(q->flags & WQ_FLAG_ARRIVALS))) {
 		ClearPageWaiters(page);
 		/*
 		 * It's possible to miss clearing Waiters here, when we woke
@@ -946,6 +975,7 @@ static void wake_up_page_bit(struct page *page, int bit_nr)
 		 * That's okay, it's a rare case. The next waker will clear it.
 		 */
 	}
+	-- q->waker;
 	spin_unlock_irqrestore(&q->lock, flags);
 }
 
@@ -976,6 +1006,7 @@ static inline int wait_on_page_bit_common(wait_queue_head_t *q,
 				__add_wait_queue_entry_tail_exclusive(q, wait);
 			else
 				__add_wait_queue(q, wait);
+			q->flags = WQ_FLAG_ARRIVALS;
 			SetPageWaiters(page);
 		}
 
@@ -1041,6 +1072,7 @@ void add_page_wait_queue(struct page *page, wait_queue_entry_t *waiter)
 	spin_lock_irqsave(&q->lock, flags);
 	__add_wait_queue(q, waiter);
 	SetPageWaiters(page);
+	q->flags = WQ_FLAG_ARRIVALS;
 	spin_unlock_irqrestore(&q->lock, flags);
 }
 EXPORT_SYMBOL_GPL(add_page_wait_queue);
-- 
2.9.4

* Re: [PATCH 1/2 v2] sched/wait: Break up long wake list walk
  2017-08-25 16:13 ` Tim Chen
@ 2017-08-25 17:46   ` Christopher Lameter
  -1 siblings, 0 replies; 57+ messages in thread
From: Christopher Lameter @ 2017-08-25 17:46 UTC (permalink / raw)
  To: Tim Chen
  Cc: Linus Torvalds, Mel Gorman, Peter Zijlstra, Ingo Molnar,
	Andi Kleen, Kan Liang, Andrew Morton, Johannes Weiner, Jan Kara,
	Eric W . Biederman, Davidlohr Bueso, linux-mm, linux-kernel

On Fri, 25 Aug 2017, Tim Chen wrote:

> for a long time.  It is a result of the numa balancing migration of hot
> pages that are shared by many threads.

I think that would also call for some work to limit NUMA balancing of hot
shared pages. The cache lines of hot pages are likely already present in the
low level processor caches anyway, so moving them would not give a
performance benefit. Limiting the migration there could stop wasting a lot
of effort.

* Re: [PATCH 2/2 v2] sched/wait: Introduce lock breaker in wake_up_page_bit
  2017-08-25 16:13   ` Tim Chen
@ 2017-08-25 19:58     ` Linus Torvalds
  -1 siblings, 0 replies; 57+ messages in thread
From: Linus Torvalds @ 2017-08-25 19:58 UTC (permalink / raw)
  To: Tim Chen
  Cc: Mel Gorman, Peter Zijlstra, Ingo Molnar, Andi Kleen, Kan Liang,
	Andrew Morton, Johannes Weiner, Jan Kara, Christopher Lameter,
	Eric W . Biederman, Davidlohr Bueso, linux-mm,
	Linux Kernel Mailing List

On Fri, Aug 25, 2017 at 9:13 AM, Tim Chen <tim.c.chen@linux.intel.com> wrote:
> Now that we have added breaks in the wait queue scan and allow bookmark
> on scan position, we put this logic in the wake_up_page_bit function.

Oh, _this_ is the other patch you were talking about. I thought it was
the NUMA counter threshold that was discussed around the same time,
and that's why I referred to Mel.

Gods, _this_ patch is ugly.  No, I'm not happy with it at all. It
makes that wait_queue_head much bigger, for this disgusting one use.

So no, this is no good.

Now, maybe the page_wait_table[] itself could be changed to be
something that is *not* just the wait-queue head.

But if we need to change the page_wait_table[] itself to have more
information, then we should make it not be a wait-queue at all, we
should make it be a list of much more specialized entries, indexed by
the {page,bit} tuple.

And once we do that, we probably *could* use something like two
specialized lists: one that is wake-all, and one that is wake-one.

So you'd have something like

    struct page_wait_struct {
        struct list_node list;
        struct page *page;
        int bit;
        struct llist_head all;
        struct llist_head exclusive;
    };

and then the "page_wait_table[]" would be just an array of

    struct page_wait_head {
        spinlock_t lock;
        struct hlist_head list;
    };

and then the rule is:

 - each page/bit combination allocates one of these page_wait_struct
entries when somebody starts waiting for it for the first time (and we
use the spinlock in the page_wait_head to serialize that list).

 - an exclusive wait (ie somebody who waits to get the bit for
locking) adds itself to the 'exclusive' llist

 - non-locking waiters add themselves to the 'all' list

 - we can use llist_del_all() to remove the 'all' list and then walk
it and wake them up without any locks

 - we can use llist_del_first() to remove the first exclusive waiter
and wake _it_ up without any locks.

Hmm? How does that sound? That means that we wouldn't use the normal
wait-queues at all for the page hash waiting. We'd use this two-level
list: one list to find the page/bit thing, and then two lists within
that for the wait-queues for just *that* page/bit.

So no need for the 'key' stuff at all, because the page/bit data would
be in the first data list, and the second list wouldn't have any of
these traversal issues where you have to be careful and do it one
entry at a time.
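
A minimal sketch of the two wake up paths under this scheme (the
page_wait_struct layout is the one above; the waiter entry type and the
helper names are just assumptions for illustration):

    struct page_waiter {
        struct llist_node node;
        struct task_struct *task;
    };

    /* Wake every non-exclusive waiter, without holding any lock. */
    static void wake_all_waiters(struct page_wait_struct *w)
    {
        struct llist_node *list = llist_del_all(&w->all);
        struct page_waiter *entry, *tmp;

        llist_for_each_entry_safe(entry, tmp, list, node)
            wake_up_process(entry->task);
    }

    /* Hand the bit to exactly one exclusive (locking) waiter. */
    static void wake_one_exclusive(struct page_wait_struct *w)
    {
        struct llist_node *first = llist_del_first(&w->exclusive);

        if (first)
            wake_up_process(llist_entry(first, struct page_waiter, node)->task);
    }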

                 Linus

* Re: [PATCH 2/2 v2] sched/wait: Introduce lock breaker in wake_up_page_bit
  2017-08-25 19:58     ` Linus Torvalds
@ 2017-08-25 22:19       ` Tim Chen
  -1 siblings, 0 replies; 57+ messages in thread
From: Tim Chen @ 2017-08-25 22:19 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Mel Gorman, Peter Zijlstra, Ingo Molnar, Andi Kleen, Kan Liang,
	Andrew Morton, Johannes Weiner, Jan Kara, Christopher Lameter,
	Eric W . Biederman, Davidlohr Bueso, linux-mm,
	Linux Kernel Mailing List

On 08/25/2017 12:58 PM, Linus Torvalds wrote:
> On Fri, Aug 25, 2017 at 9:13 AM, Tim Chen <tim.c.chen@linux.intel.com> wrote:
>> Now that we have added breaks in the wait queue scan and allow bookmark
>> on scan position, we put this logic in the wake_up_page_bit function.
> 
> Oh, _this_ is the other patch you were talking about. I thought it was
> the NUMA counter threshold that was discussed around the same time,
> and that's why I referred to Mel.
> 
> Gods, _this_ patch is ugly.  No, I'm not happy with it at all. It
> makes that wait_queue_head much bigger, for this disgusting one use.
> 
> So no, this is no good.
> 
> Now, maybe the page_wait_table[] itself could be changed to be
> something that is *not* just the wait-queue head.
> 
> But if we need to change the page_wait_table[] itself to have more
> information, then we should make it not be a wait-queue at all, we
> should make it be a list of much more specialized entries, indexed by
> the {page,bit} tuple.
> 
> And once we do that, we probably *could* use something like two
> specialized lists: one that is wake-all, and one that is wake-one.
> 
> So you'd have something like
> 
>     struct page_wait_struct {
>         struct list_node list;
>         struct page *page;
>         int bit;
>         struct llist_head all;
>         struct llist_head exclusive;
>     };
> 
> and then the "page_wait_table[]" would be just an array of
> 
>     struct page_wait_head {
>         spinlock_t lock;
>         struct hlist_head list;
>     };
> 
> and then the rule is:
> 
>  - each page/bit combination allocates one of these page_wait_struct
> entries when somebody starts waiting for it for the first time (and we
> use the spinlock in the page_wait_head to serialize that list).
> 
>  - an exclusive wait (ie somebody who waits to get the bit for
> locking) adds itself to the 'exclusive' llist
> 
>  - non-locking waiters add themselves to the 'all' list
> 
>  - we can use llist_del_all() to remove the 'all' list and then walk
> it and wake them up without any locks
> 
>  - we can use llist_del_first() to remove the first exclusive waiter
> and wake _it_ up without any locks.
> 
> Hmm? How does that sound? That means that we wouldn't use the normal
> wait-queues at all for the page hash waiting. We'd use this two-level
> list: one list to find the page/bit thing, and then two lists within
> that for the wait-queues for just *that* page/bit.
> 
> So no need for the 'key' stuff at all, because the page/bit data would
> be in the first data list, and the second list wouldn't have any of
> these traversal issues where you have to be careful and do it one
> entry at a time.

If I have the waker count and flags on the wait_page_queue structure
instead, would that be more acceptable?

Also I think patch 1 is still a good idea as a fail safe mechanism
in case there are other long wait lists.

That said, I do think your suggested approach is cleaner.  However, it
is a much more substantial change.  Let me take a look and see if I
have any issues implementing it.

Tim



> 
>                  Linus
> 

* Re: [PATCH 2/2 v2] sched/wait: Introduce lock breaker in wake_up_page_bit
  2017-08-25 22:19       ` Tim Chen
  (?)
@ 2017-08-25 22:51       ` Linus Torvalds
  2017-08-25 23:03         ` Linus Torvalds
  -1 siblings, 1 reply; 57+ messages in thread
From: Linus Torvalds @ 2017-08-25 22:51 UTC (permalink / raw)
  To: Tim Chen
  Cc: Mel Gorman, Peter Zijlstra, Ingo Molnar, Andi Kleen, Kan Liang,
	Andrew Morton, Johannes Weiner, Jan Kara, Christopher Lameter,
	Eric W . Biederman, Davidlohr Bueso, linux-mm,
	Linux Kernel Mailing List

[-- Attachment #1: Type: text/plain, Size: 2384 bytes --]

On Fri, Aug 25, 2017 at 3:19 PM, Tim Chen <tim.c.chen@linux.intel.com> wrote:
>
> Also I think patch 1 is still a good idea for a fail safe mechanism
> in case there are other long wait list.

Yeah, I don't hate patch #1.

But that other patch is just nasty.

> That said, I do think your suggested approach is cleaner.  However, it
> is a much more substantial change.  Let me take a look and see if I
> have any issues implementing it.

Actually, I tried it myself.

It was painful. But I actually have a TOTALLY UNTESTED set of two
patches that implements the idea.

And by "implements the idea" I mean "it kind of compiles, and it kind
of looks like it might work".

But by "kind of compiles" I mean that I didn't implement the nasty
add_page_wait_queue() thing that the cachefiles interface wants.
Honestly, I think the sanest way to do that is to just have a hashed
wait queue *just* for cachefiles.

And by "kind of looks like it might work" I really mean just that.
It's entirely untested. It's more of a "let's take that description of
mine and turn it into code". I really have not tested this AT ALL.

And it's subtle enough that I suspect it really is majorly buggy. It
uses the locking hash list code (hlist_bl_node) to keep the hash table
fairly small and hide the lock in the hash table itself.
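
For reference, a bare-bones sketch of that hlist_bl idea (illustrative
only; the real structures are in the attached second patch, and the helper
name here is made up):

    #include <linux/list_bl.h>

    /* One bucket per hash slot; bit 0 of the head pointer is the lock. */
    static struct hlist_bl_head page_wait_table[PAGE_WAIT_TABLE_SIZE];

    static void add_to_bucket(struct hlist_bl_head *head,
                              struct hlist_bl_node *node)
    {
        hlist_bl_lock(head);          /* bit spinlock in the head itself */
        hlist_bl_add_head(node, head);
        hlist_bl_unlock(head);
    }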

And then it plays horrible games with linked lists. Yeah, that may
have been a mistake, but I thought I should try to avoid the doubly
linked lists in that "struct page_wait_struct" because it's allocated
on the stack, and each list_head is 16 bytes on 64-bit architectures.

But that "let's save 24 bytes in the structure" made it much nastier
to remove entries, so it was probably a bad trade-off.

But I'm attaching the two patches because I have no shame. If somebody
is willing to look at my completely untested crap code.

I *really* want to emphasize that "untested crap".

This is meant to be example code of the *concept* rather than anything
that actually works.

So take it as that: example pseudo-code that happens to pass a
compiler, but is meant as a RFD rather than actually working.

The first patch just moves code around because I wanted to experiment
with the new code in a new file. That first patch is probably fine. It
shouldn't change any code, just move it.

The second patch is the "broken patch to illustrate the idea".

                    Linus

[-- Attachment #2: 0001-Split-page-bit-waiting-functions-into-their-own-file.patch --]
[-- Type: text/x-patch, Size: 17127 bytes --]

From b0bec1de8a7dba69b223dd30dd2a734401f335aa Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Fri, 25 Aug 2017 13:25:43 -0700
Subject: [PATCH 1/2] Split page bit waiting functions into their own file

It turns out that the page-bit waiting logic is very special indeed, and
will get even more so.  Split it up into its own file to make this clear.

I'll rewrite the logic of the page wait queue completely, but this is
pure prep-work with no actual code changes, just code movement.

Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/Makefile        |   2 +-
 mm/filemap.c       | 260 --------------------------------------------------
 mm/page_wait_bit.c | 274 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 275 insertions(+), 261 deletions(-)
 create mode 100644 mm/page_wait_bit.c

diff --git a/mm/Makefile b/mm/Makefile
index 411bd24d4a7c..8bc0194207a3 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -32,7 +32,7 @@ ifdef CONFIG_CROSS_MEMORY_ATTACH
 mmu-$(CONFIG_MMU)	+= process_vm_access.o
 endif
 
-obj-y			:= filemap.o mempool.o oom_kill.o \
+obj-y			:= filemap.o page_wait_bit.o mempool.o oom_kill.o \
 			   maccess.o page_alloc.o page-writeback.o \
 			   readahead.o swap.o truncate.o vmscan.o shmem.o \
 			   util.o mmzone.o vmstat.o backing-dev.o \
diff --git a/mm/filemap.c b/mm/filemap.c
index a49702445ce0..f9b710e8f76e 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -856,195 +856,6 @@ struct page *__page_cache_alloc(gfp_t gfp)
 EXPORT_SYMBOL(__page_cache_alloc);
 #endif
 
-/*
- * In order to wait for pages to become available there must be
- * waitqueues associated with pages. By using a hash table of
- * waitqueues where the bucket discipline is to maintain all
- * waiters on the same queue and wake all when any of the pages
- * become available, and for the woken contexts to check to be
- * sure the appropriate page became available, this saves space
- * at a cost of "thundering herd" phenomena during rare hash
- * collisions.
- */
-#define PAGE_WAIT_TABLE_BITS 8
-#define PAGE_WAIT_TABLE_SIZE (1 << PAGE_WAIT_TABLE_BITS)
-static wait_queue_head_t page_wait_table[PAGE_WAIT_TABLE_SIZE] __cacheline_aligned;
-
-static wait_queue_head_t *page_waitqueue(struct page *page)
-{
-	return &page_wait_table[hash_ptr(page, PAGE_WAIT_TABLE_BITS)];
-}
-
-void __init pagecache_init(void)
-{
-	int i;
-
-	for (i = 0; i < PAGE_WAIT_TABLE_SIZE; i++)
-		init_waitqueue_head(&page_wait_table[i]);
-
-	page_writeback_init();
-}
-
-struct wait_page_key {
-	struct page *page;
-	int bit_nr;
-	int page_match;
-};
-
-struct wait_page_queue {
-	struct page *page;
-	int bit_nr;
-	wait_queue_entry_t wait;
-};
-
-static int wake_page_function(wait_queue_entry_t *wait, unsigned mode, int sync, void *arg)
-{
-	struct wait_page_key *key = arg;
-	struct wait_page_queue *wait_page
-		= container_of(wait, struct wait_page_queue, wait);
-
-	if (wait_page->page != key->page)
-	       return 0;
-	key->page_match = 1;
-
-	if (wait_page->bit_nr != key->bit_nr)
-		return 0;
-	if (test_bit(key->bit_nr, &key->page->flags))
-		return 0;
-
-	return autoremove_wake_function(wait, mode, sync, key);
-}
-
-static void wake_up_page_bit(struct page *page, int bit_nr)
-{
-	wait_queue_head_t *q = page_waitqueue(page);
-	struct wait_page_key key;
-	unsigned long flags;
-
-	key.page = page;
-	key.bit_nr = bit_nr;
-	key.page_match = 0;
-
-	spin_lock_irqsave(&q->lock, flags);
-	__wake_up_locked_key(q, TASK_NORMAL, &key);
-	/*
-	 * It is possible for other pages to have collided on the waitqueue
-	 * hash, so in that case check for a page match. That prevents a long-
-	 * term waiter
-	 *
-	 * It is still possible to miss a case here, when we woke page waiters
-	 * and removed them from the waitqueue, but there are still other
-	 * page waiters.
-	 */
-	if (!waitqueue_active(q) || !key.page_match) {
-		ClearPageWaiters(page);
-		/*
-		 * It's possible to miss clearing Waiters here, when we woke
-		 * our page waiters, but the hashed waitqueue has waiters for
-		 * other pages on it.
-		 *
-		 * That's okay, it's a rare case. The next waker will clear it.
-		 */
-	}
-	spin_unlock_irqrestore(&q->lock, flags);
-}
-
-static void wake_up_page(struct page *page, int bit)
-{
-	if (!PageWaiters(page))
-		return;
-	wake_up_page_bit(page, bit);
-}
-
-static inline int wait_on_page_bit_common(wait_queue_head_t *q,
-		struct page *page, int bit_nr, int state, bool lock)
-{
-	struct wait_page_queue wait_page;
-	wait_queue_entry_t *wait = &wait_page.wait;
-	int ret = 0;
-
-	init_wait(wait);
-	wait->func = wake_page_function;
-	wait_page.page = page;
-	wait_page.bit_nr = bit_nr;
-
-	for (;;) {
-		spin_lock_irq(&q->lock);
-
-		if (likely(list_empty(&wait->entry))) {
-			if (lock)
-				__add_wait_queue_entry_tail_exclusive(q, wait);
-			else
-				__add_wait_queue(q, wait);
-			SetPageWaiters(page);
-		}
-
-		set_current_state(state);
-
-		spin_unlock_irq(&q->lock);
-
-		if (likely(test_bit(bit_nr, &page->flags))) {
-			io_schedule();
-			if (unlikely(signal_pending_state(state, current))) {
-				ret = -EINTR;
-				break;
-			}
-		}
-
-		if (lock) {
-			if (!test_and_set_bit_lock(bit_nr, &page->flags))
-				break;
-		} else {
-			if (!test_bit(bit_nr, &page->flags))
-				break;
-		}
-	}
-
-	finish_wait(q, wait);
-
-	/*
-	 * A signal could leave PageWaiters set. Clearing it here if
-	 * !waitqueue_active would be possible (by open-coding finish_wait),
-	 * but still fail to catch it in the case of wait hash collision. We
-	 * already can fail to clear wait hash collision cases, so don't
-	 * bother with signals either.
-	 */
-
-	return ret;
-}
-
-void wait_on_page_bit(struct page *page, int bit_nr)
-{
-	wait_queue_head_t *q = page_waitqueue(page);
-	wait_on_page_bit_common(q, page, bit_nr, TASK_UNINTERRUPTIBLE, false);
-}
-EXPORT_SYMBOL(wait_on_page_bit);
-
-int wait_on_page_bit_killable(struct page *page, int bit_nr)
-{
-	wait_queue_head_t *q = page_waitqueue(page);
-	return wait_on_page_bit_common(q, page, bit_nr, TASK_KILLABLE, false);
-}
-
-/**
- * add_page_wait_queue - Add an arbitrary waiter to a page's wait queue
- * @page: Page defining the wait queue of interest
- * @waiter: Waiter to add to the queue
- *
- * Add an arbitrary @waiter to the wait queue for the nominated @page.
- */
-void add_page_wait_queue(struct page *page, wait_queue_entry_t *waiter)
-{
-	wait_queue_head_t *q = page_waitqueue(page);
-	unsigned long flags;
-
-	spin_lock_irqsave(&q->lock, flags);
-	__add_wait_queue(q, waiter);
-	SetPageWaiters(page);
-	spin_unlock_irqrestore(&q->lock, flags);
-}
-EXPORT_SYMBOL_GPL(add_page_wait_queue);
-
 #ifndef clear_bit_unlock_is_negative_byte
 
 /*
@@ -1068,57 +879,6 @@ static inline bool clear_bit_unlock_is_negative_byte(long nr, volatile void *mem
 
 #endif
 
-/**
- * unlock_page - unlock a locked page
- * @page: the page
- *
- * Unlocks the page and wakes up sleepers in ___wait_on_page_locked().
- * Also wakes sleepers in wait_on_page_writeback() because the wakeup
- * mechanism between PageLocked pages and PageWriteback pages is shared.
- * But that's OK - sleepers in wait_on_page_writeback() just go back to sleep.
- *
- * Note that this depends on PG_waiters being the sign bit in the byte
- * that contains PG_locked - thus the BUILD_BUG_ON(). That allows us to
- * clear the PG_locked bit and test PG_waiters at the same time fairly
- * portably (architectures that do LL/SC can test any bit, while x86 can
- * test the sign bit).
- */
-void unlock_page(struct page *page)
-{
-	BUILD_BUG_ON(PG_waiters != 7);
-	page = compound_head(page);
-	VM_BUG_ON_PAGE(!PageLocked(page), page);
-	if (clear_bit_unlock_is_negative_byte(PG_locked, &page->flags))
-		wake_up_page_bit(page, PG_locked);
-}
-EXPORT_SYMBOL(unlock_page);
-
-/**
- * end_page_writeback - end writeback against a page
- * @page: the page
- */
-void end_page_writeback(struct page *page)
-{
-	/*
-	 * TestClearPageReclaim could be used here but it is an atomic
-	 * operation and overkill in this particular case. Failing to
-	 * shuffle a page marked for immediate reclaim is too mild to
-	 * justify taking an atomic operation penalty at the end of
-	 * ever page writeback.
-	 */
-	if (PageReclaim(page)) {
-		ClearPageReclaim(page);
-		rotate_reclaimable_page(page);
-	}
-
-	if (!test_clear_page_writeback(page))
-		BUG();
-
-	smp_mb__after_atomic();
-	wake_up_page(page, PG_writeback);
-}
-EXPORT_SYMBOL(end_page_writeback);
-
 /*
  * After completing I/O on a page, call this routine to update the page
  * flags appropriately
@@ -1147,26 +907,6 @@ void page_endio(struct page *page, bool is_write, int err)
 }
 EXPORT_SYMBOL_GPL(page_endio);
 
-/**
- * __lock_page - get a lock on the page, assuming we need to sleep to get it
- * @__page: the page to lock
- */
-void __lock_page(struct page *__page)
-{
-	struct page *page = compound_head(__page);
-	wait_queue_head_t *q = page_waitqueue(page);
-	wait_on_page_bit_common(q, page, PG_locked, TASK_UNINTERRUPTIBLE, true);
-}
-EXPORT_SYMBOL(__lock_page);
-
-int __lock_page_killable(struct page *__page)
-{
-	struct page *page = compound_head(__page);
-	wait_queue_head_t *q = page_waitqueue(page);
-	return wait_on_page_bit_common(q, page, PG_locked, TASK_KILLABLE, true);
-}
-EXPORT_SYMBOL_GPL(__lock_page_killable);
-
 /*
  * Return values:
  * 1 - page is locked; mmap_sem is still held.
diff --git a/mm/page_wait_bit.c b/mm/page_wait_bit.c
new file mode 100644
index 000000000000..7550b6d2715a
--- /dev/null
+++ b/mm/page_wait_bit.c
@@ -0,0 +1,274 @@
+/*
+ *	linux/mm/page_wait_bit.c
+ *
+ * Copyright (C) 2017  Linus Torvalds
+ */
+
+#include <linux/swap.h>
+#include <linux/pagemap.h>
+#include <linux/wait.h>
+#include <linux/export.h>
+#include <linux/sched/signal.h>
+
+#include "internal.h"
+
+/*
+ * In order to wait for pages to become available there must be
+ * waitqueues associated with pages. By using a hash table of
+ * waitqueues where the bucket discipline is to maintain all
+ * waiters on the same queue and wake all when any of the pages
+ * become available, and for the woken contexts to check to be
+ * sure the appropriate page became available, this saves space
+ * at a cost of "thundering herd" phenomena during rare hash
+ * collisions.
+ */
+#define PAGE_WAIT_TABLE_BITS 8
+#define PAGE_WAIT_TABLE_SIZE (1 << PAGE_WAIT_TABLE_BITS)
+static wait_queue_head_t page_wait_table[PAGE_WAIT_TABLE_SIZE] __cacheline_aligned;
+
+static wait_queue_head_t *page_waitqueue(struct page *page)
+{
+	return &page_wait_table[hash_ptr(page, PAGE_WAIT_TABLE_BITS)];
+}
+
+void __init pagecache_init(void)
+{
+	int i;
+
+	for (i = 0; i < PAGE_WAIT_TABLE_SIZE; i++)
+		init_waitqueue_head(&page_wait_table[i]);
+
+	page_writeback_init();
+}
+
+struct wait_page_key {
+	struct page *page;
+	int bit_nr;
+	int page_match;
+};
+
+struct wait_page_queue {
+	struct page *page;
+	int bit_nr;
+	wait_queue_entry_t wait;
+};
+
+static int wake_page_function(wait_queue_entry_t *wait, unsigned mode, int sync, void *arg)
+{
+	struct wait_page_key *key = arg;
+	struct wait_page_queue *wait_page
+		= container_of(wait, struct wait_page_queue, wait);
+
+	if (wait_page->page != key->page)
+	       return 0;
+	key->page_match = 1;
+
+	if (wait_page->bit_nr != key->bit_nr)
+		return 0;
+	if (test_bit(key->bit_nr, &key->page->flags))
+		return 0;
+
+	return autoremove_wake_function(wait, mode, sync, key);
+}
+
+static void wake_up_page_bit(struct page *page, int bit_nr)
+{
+	wait_queue_head_t *q = page_waitqueue(page);
+	struct wait_page_key key;
+	unsigned long flags;
+
+	key.page = page;
+	key.bit_nr = bit_nr;
+	key.page_match = 0;
+
+	spin_lock_irqsave(&q->lock, flags);
+	__wake_up_locked_key(q, TASK_NORMAL, &key);
+	/*
+	 * It is possible for other pages to have collided on the waitqueue
+	 * hash, so in that case check for a page match. That prevents a long-
+	 * term waiter
+	 *
+	 * It is still possible to miss a case here, when we woke page waiters
+	 * and removed them from the waitqueue, but there are still other
+	 * page waiters.
+	 */
+	if (!waitqueue_active(q) || !key.page_match) {
+		ClearPageWaiters(page);
+		/*
+		 * It's possible to miss clearing Waiters here, when we woke
+		 * our page waiters, but the hashed waitqueue has waiters for
+		 * other pages on it.
+		 *
+		 * That's okay, it's a rare case. The next waker will clear it.
+		 */
+	}
+	spin_unlock_irqrestore(&q->lock, flags);
+}
+
+static void wake_up_page(struct page *page, int bit)
+{
+	if (!PageWaiters(page))
+		return;
+	wake_up_page_bit(page, bit);
+}
+
+static inline int wait_on_page_bit_common(wait_queue_head_t *q,
+		struct page *page, int bit_nr, int state, bool lock)
+{
+	struct wait_page_queue wait_page;
+	wait_queue_entry_t *wait = &wait_page.wait;
+	int ret = 0;
+
+	init_wait(wait);
+	wait->func = wake_page_function;
+	wait_page.page = page;
+	wait_page.bit_nr = bit_nr;
+
+	for (;;) {
+		spin_lock_irq(&q->lock);
+
+		if (likely(list_empty(&wait->entry))) {
+			if (lock)
+				__add_wait_queue_entry_tail_exclusive(q, wait);
+			else
+				__add_wait_queue(q, wait);
+			SetPageWaiters(page);
+		}
+
+		set_current_state(state);
+
+		spin_unlock_irq(&q->lock);
+
+		if (likely(test_bit(bit_nr, &page->flags))) {
+			io_schedule();
+			if (unlikely(signal_pending_state(state, current))) {
+				ret = -EINTR;
+				break;
+			}
+		}
+
+		if (lock) {
+			if (!test_and_set_bit_lock(bit_nr, &page->flags))
+				break;
+		} else {
+			if (!test_bit(bit_nr, &page->flags))
+				break;
+		}
+	}
+
+	finish_wait(q, wait);
+
+	/*
+	 * A signal could leave PageWaiters set. Clearing it here if
+	 * !waitqueue_active would be possible (by open-coding finish_wait),
+	 * but still fail to catch it in the case of wait hash collision. We
+	 * already can fail to clear wait hash collision cases, so don't
+	 * bother with signals either.
+	 */
+
+	return ret;
+}
+
+void wait_on_page_bit(struct page *page, int bit_nr)
+{
+	wait_queue_head_t *q = page_waitqueue(page);
+	wait_on_page_bit_common(q, page, bit_nr, TASK_UNINTERRUPTIBLE, false);
+}
+EXPORT_SYMBOL(wait_on_page_bit);
+
+int wait_on_page_bit_killable(struct page *page, int bit_nr)
+{
+	wait_queue_head_t *q = page_waitqueue(page);
+	return wait_on_page_bit_common(q, page, bit_nr, TASK_KILLABLE, false);
+}
+
+/**
+ * add_page_wait_queue - Add an arbitrary waiter to a page's wait queue
+ * @page: Page defining the wait queue of interest
+ * @waiter: Waiter to add to the queue
+ *
+ * Add an arbitrary @waiter to the wait queue for the nominated @page.
+ */
+void add_page_wait_queue(struct page *page, wait_queue_entry_t *waiter)
+{
+	wait_queue_head_t *q = page_waitqueue(page);
+	unsigned long flags;
+
+	spin_lock_irqsave(&q->lock, flags);
+	__add_wait_queue(q, waiter);
+	SetPageWaiters(page);
+	spin_unlock_irqrestore(&q->lock, flags);
+}
+EXPORT_SYMBOL_GPL(add_page_wait_queue);
+
+
+/**
+ * __lock_page - get a lock on the page, assuming we need to sleep to get it
+ * @__page: the page to lock
+ */
+void __lock_page(struct page *__page)
+{
+	struct page *page = compound_head(__page);
+	wait_queue_head_t *q = page_waitqueue(page);
+	wait_on_page_bit_common(q, page, PG_locked, TASK_UNINTERRUPTIBLE, true);
+}
+EXPORT_SYMBOL(__lock_page);
+
+int __lock_page_killable(struct page *__page)
+{
+	struct page *page = compound_head(__page);
+	wait_queue_head_t *q = page_waitqueue(page);
+	return wait_on_page_bit_common(q, page, PG_locked, TASK_KILLABLE, true);
+}
+EXPORT_SYMBOL_GPL(__lock_page_killable);
+
+/**
+ * unlock_page - unlock a locked page
+ * @page: the page
+ *
+ * Unlocks the page and wakes up sleepers in ___wait_on_page_locked().
+ * Also wakes sleepers in wait_on_page_writeback() because the wakeup
+ * mechanism between PageLocked pages and PageWriteback pages is shared.
+ * But that's OK - sleepers in wait_on_page_writeback() just go back to sleep.
+ *
+ * Note that this depends on PG_waiters being the sign bit in the byte
+ * that contains PG_locked - thus the BUILD_BUG_ON(). That allows us to
+ * clear the PG_locked bit and test PG_waiters at the same time fairly
+ * portably (architectures that do LL/SC can test any bit, while x86 can
+ * test the sign bit).
+ */
+void unlock_page(struct page *page)
+{
+	BUILD_BUG_ON(PG_waiters != 7);
+	page = compound_head(page);
+	VM_BUG_ON_PAGE(!PageLocked(page), page);
+	if (clear_bit_unlock_is_negative_byte(PG_locked, &page->flags))
+		wake_up_page_bit(page, PG_locked);
+}
+EXPORT_SYMBOL(unlock_page);
+
+/**
+ * end_page_writeback - end writeback against a page
+ * @page: the page
+ */
+void end_page_writeback(struct page *page)
+{
+	/*
+	 * TestClearPageReclaim could be used here but it is an atomic
+	 * operation and overkill in this particular case. Failing to
+	 * shuffle a page marked for immediate reclaim is too mild to
+	 * justify taking an atomic operation penalty at the end of
+	 * ever page writeback.
+	 */
+	if (PageReclaim(page)) {
+		ClearPageReclaim(page);
+		rotate_reclaimable_page(page);
+	}
+
+	if (!test_clear_page_writeback(page))
+		BUG();
+
+	smp_mb__after_atomic();
+	wake_up_page(page, PG_writeback);
+}
+EXPORT_SYMBOL(end_page_writeback);
-- 
2.14.0.rc1.2.g4c8247ec3


[-- Attachment #3: 0002-Re-implement-the-page-bit-wait-code.patch --]
[-- Type: text/x-patch, Size: 12642 bytes --]

From 8cf9377c98d77b2a2de0dbc451e8ac283684a4e2 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Fri, 25 Aug 2017 15:45:43 -0700
Subject: [PATCH 2/2] Re-implement the page bit-wait code

The page wait-queues have some horrible scaling issues, and they seem to
be hard to fix.  And the way they use the regular wait-queues made that
really bad, with interrupts disabled for long times etc.

This tries to re-implement them with a totally different model.  It's
known broken, and the add_page_wait_queue() thing that the cachefiles
code wants to use is not implemented at all (it probably will just need
to have a parallel set of wait-queues that are *only* used for that).

The code is untested and probably horribly buggy, but I'm hoping others
will take a look at this.

Not-signed-off-yet-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/page_wait_bit.c | 309 ++++++++++++++++++++++++++++++++++++-----------------
 1 file changed, 209 insertions(+), 100 deletions(-)

diff --git a/mm/page_wait_bit.c b/mm/page_wait_bit.c
index 7550b6d2715a..47fdb305ddae 100644
--- a/mm/page_wait_bit.c
+++ b/mm/page_wait_bit.c
@@ -9,9 +9,38 @@
 #include <linux/wait.h>
 #include <linux/export.h>
 #include <linux/sched/signal.h>
+#include <linux/list_bl.h>
 
 #include "internal.h"
 
+/*
+ * Each waiter on a page will register this
+ * 'page_wait_struct' as part of waiting.
+ *
+ * Note that for any particular page, only one of the
+ * structs will actually be visible at the head of the
+ * wait queue hash table at any particular time, but
+ * everybody has one, because as one waiter is woken
+ * up we will need to pick another head for waiters.
+ *
+ * NOTE! All the list operations are protected by the
+ * hlist_bl_lock on the hash table.
+ */
+struct page_wait_struct {
+	// This is the hash queue head entry
+	// only used once per { page, bit }
+	struct hlist_bl_node list;
+	struct page *page;
+	int bit;
+
+	struct page_wait_struct *all;
+	struct page_wait_struct *exclusive;
+
+	// This is the waiter list
+	struct page_wait_struct *next;
+	struct task_struct *wake;
+};
+
 /*
  * In order to wait for pages to become available there must be
  * waitqueues associated with pages. By using a hash table of
@@ -22,11 +51,11 @@
  * at a cost of "thundering herd" phenomena during rare hash
  * collisions.
  */
-#define PAGE_WAIT_TABLE_BITS 8
+#define PAGE_WAIT_TABLE_BITS 12
 #define PAGE_WAIT_TABLE_SIZE (1 << PAGE_WAIT_TABLE_BITS)
-static wait_queue_head_t page_wait_table[PAGE_WAIT_TABLE_SIZE] __cacheline_aligned;
+static struct hlist_bl_head page_wait_table[PAGE_WAIT_TABLE_SIZE] __cacheline_aligned;
 
-static wait_queue_head_t *page_waitqueue(struct page *page)
+static struct hlist_bl_head *page_waitqueue(struct page *page)
 {
 	return &page_wait_table[hash_ptr(page, PAGE_WAIT_TABLE_BITS)];
 }
@@ -36,73 +65,129 @@ void __init pagecache_init(void)
 	int i;
 
 	for (i = 0; i < PAGE_WAIT_TABLE_SIZE; i++)
-		init_waitqueue_head(&page_wait_table[i]);
+		INIT_HLIST_BL_HEAD(&page_wait_table[i]);
 
 	page_writeback_init();
 }
 
-struct wait_page_key {
-	struct page *page;
-	int bit_nr;
-	int page_match;
-};
+/*
+ * We found a wait entry for the requested page and bit.
+ *
+ * We now need to create a wakeup list, which includes the
+ * first exclusive waiter (if any), and all the non-exclusive
+ * ones.
+ *
+ * If there is more than one exclusive waiter, we need to
+ * turn the next exclusive waiter into a wait entry, and
+ * add it back to the page wait list.
+ */
+static struct page_wait_struct *create_wake_up_list(struct page_wait_struct *entry, struct hlist_bl_head *head)
+{
+	struct page_wait_struct *all = entry->all;
+	struct page_wait_struct *exclusive = entry->exclusive;
 
-struct wait_page_queue {
-	struct page *page;
-	int bit_nr;
-	wait_queue_entry_t wait;
-};
+	if (exclusive) {
+		struct page_wait_struct *remain = exclusive->next;
 
-static int wake_page_function(wait_queue_entry_t *wait, unsigned mode, int sync, void *arg)
+		if (remain) {
+			remain->all = NULL;
+			remain->exclusive = remain;
+			hlist_bl_add_head(&remain->list, head);
+		}
+		exclusive->next = all;
+		all = exclusive;
+	}
+	return all;
+}
+
+static inline int remove_myself_from_one_list(struct page_wait_struct **p, struct page_wait_struct *entry)
+{
+	while (*p) {
+		struct page_wait_struct *n = *p;
+		if (n == entry) {
+			*p = n->next;
+			return 1;
+		}
+		p = &n->next;
+	}
+	return 0;
+}
+
+/*
+ * We got woken up, and we need to make sure there is no more
+ * access to us in the list.
+ */
+static void remove_myself_from(struct page_wait_struct *old, struct page_wait_struct *entry, struct hlist_bl_head *head)
 {
-	struct wait_page_key *key = arg;
-	struct wait_page_queue *wait_page
-		= container_of(wait, struct wait_page_queue, wait);
+	/* We can be on only one list */
+	if (!remove_myself_from_one_list(&old->all, entry))
+		remove_myself_from_one_list(&old->exclusive, entry);
 
-	if (wait_page->page != key->page)
-	       return 0;
-	key->page_match = 1;
+	hlist_bl_del_init(&entry->list);
+}
 
-	if (wait_page->bit_nr != key->bit_nr)
-		return 0;
-	if (test_bit(key->bit_nr, &key->page->flags))
-		return 0;
 
-	return autoremove_wake_function(wait, mode, sync, key);
+/*
+ * Find and remove the matching page/bit entry from the (locked) bl list
+ *
+ * Return ERR_PTR(-ESRCH) if no matching page at all, NULL if page found
+ * but not with matching bit.
+ */
+static struct page_wait_struct *find_del_entry(struct page *page, int bit_nr, struct hlist_bl_head *head)
+{
+	struct page_wait_struct *entry;
+	struct page_wait_struct *ret = ERR_PTR(-ESRCH);
+	struct hlist_bl_node *node;
+
+	hlist_bl_for_each_entry(entry, node, head, list) {
+		if (entry->page != page)
+			continue;
+		ret = NULL;
+		if (entry->bit != bit_nr)
+			continue;
+		__hlist_bl_del(node);
+		INIT_HLIST_BL_NODE(node);
+		ret = entry;
+		break;
+	}
+	return ret;
 }
 
 static void wake_up_page_bit(struct page *page, int bit_nr)
 {
-	wait_queue_head_t *q = page_waitqueue(page);
-	struct wait_page_key key;
+	struct hlist_bl_head *head = page_waitqueue(page);
+	struct page_wait_struct *wake;
 	unsigned long flags;
 
-	key.page = page;
-	key.bit_nr = bit_nr;
-	key.page_match = 0;
+	local_save_flags(flags);
+	hlist_bl_lock(head);
+
+	wake = find_del_entry(page, bit_nr, head);
+	if (IS_ERR(wake)) {
+		ClearPageWaiters(page);
+		wake = NULL;
+	} else if (wake) {
+		wake = create_wake_up_list(wake, head);
+	}
+
+	hlist_bl_unlock(head);
+	local_irq_restore(flags);
 
-	spin_lock_irqsave(&q->lock, flags);
-	__wake_up_locked_key(q, TASK_NORMAL, &key);
 	/*
-	 * It is possible for other pages to have collided on the waitqueue
-	 * hash, so in that case check for a page match. That prevents a long-
-	 * term waiter
+	 * Actually wake everybody up. Note that as we
+	 * wake them up, we can't use the 'wake_list'
+	 * entry any more, because it's on their stack.
 	 *
-	 * It is still possible to miss a case here, when we woke page waiters
-	 * and removed them from the waitqueue, but there are still other
-	 * page waiters.
+	 * We also clear the 'wake' field so that the
+	 * target process can see if they got woken up
+	 * by a page bit event.
 	 */
-	if (!waitqueue_active(q) || !key.page_match) {
-		ClearPageWaiters(page);
-		/*
-		 * It's possible to miss clearing Waiters here, when we woke
-		 * our page waiters, but the hashed waitqueue has waiters for
-		 * other pages on it.
-		 *
-		 * That's okay, it's a rare case. The next waker will clear it.
-		 */
-	}
-	spin_unlock_irqrestore(&q->lock, flags);
+	while (wake) {
+		struct task_struct *p = wake->wake;
+		wake = wake->next;
+		smp_store_release(&wake->wake, NULL);
+		wake_up_process(p);
+	};
 }
 
 static void wake_up_page(struct page *page, int bit)
@@ -112,76 +197,101 @@ static void wake_up_page(struct page *page, int bit)
 	wake_up_page_bit(page, bit);
 }
 
-static inline int wait_on_page_bit_common(wait_queue_head_t *q,
-		struct page *page, int bit_nr, int state, bool lock)
+/*
+ * Wait for the specific page bit to clear.
+ */
+static void wait_once_on_page_bit(struct page *page, int bit_nr, int state, bool lock)
 {
-	struct wait_page_queue wait_page;
-	wait_queue_entry_t *wait = &wait_page.wait;
-	int ret = 0;
+	struct page_wait_struct entry, *old;
+	struct hlist_bl_head *head;
+	unsigned long flags;
 
-	init_wait(wait);
-	wait->func = wake_page_function;
-	wait_page.page = page;
-	wait_page.bit_nr = bit_nr;
+	INIT_HLIST_BL_NODE(&entry.list);
+	entry.page = page;
+	entry.bit = bit_nr;
+	entry.all = entry.exclusive = NULL;
+	entry.next = NULL;
+	entry.wake = current;
+
+	head = page_waitqueue(page);
+	local_save_flags(flags);
+	hlist_bl_lock(head);
+
+	old = find_del_entry(page, bit_nr, head);
+	if (IS_ERR(old))
+		old = NULL;
+	if (old) {
+		entry.all = old->all;
+		entry.exclusive = old->exclusive;
+	}
 
-	for (;;) {
-		spin_lock_irq(&q->lock);
-
-		if (likely(list_empty(&wait->entry))) {
-			if (lock)
-				__add_wait_queue_entry_tail_exclusive(q, wait);
-			else
-				__add_wait_queue(q, wait);
-			SetPageWaiters(page);
-		}
+	if (lock) {
+		entry.next = entry.exclusive;
+		entry.exclusive = &entry;
+	} else {
+		entry.next = entry.all;
+		entry.all = &entry;
+	}
 
-		set_current_state(state);
+	hlist_bl_add_head(&entry.list, head);
+	current->state = state;
 
-		spin_unlock_irq(&q->lock);
+	hlist_bl_unlock(head);
+	local_irq_restore(flags);
 
-		if (likely(test_bit(bit_nr, &page->flags))) {
-			io_schedule();
-			if (unlikely(signal_pending_state(state, current))) {
-				ret = -EINTR;
-				break;
-			}
-		}
+	if (likely(test_bit(bit_nr, &page->flags)))
+		io_schedule();
+
+	/*
+	 * NOTE! If we were woken up by something else,
+	 * we have to remove ourselves from the hash list.
+	 *
+	 * But in order to avoid extra locking overhead in
+	 * the common case, we only do this if we can't
+	 * already tell that we were woken up (and thus
+	 * no longer on the lists).
+	 */
+	if (smp_load_acquire(&entry.wake) != NULL) {
+		local_save_flags(flags);
+		hlist_bl_lock(head);
+
+		old = find_del_entry(page, bit_nr, head);
+		if (old && !IS_ERR(old))
+			remove_myself_from(old, &entry, head);
 
+		hlist_bl_unlock(head);
+		local_irq_restore(flags);
+	}
+}
+
+static inline int wait_on_page_bit_common(struct page *page, int bit_nr, int state, bool lock)
+{
+	for (;;) {
+		wait_once_on_page_bit(page, bit_nr, state, lock);
 		if (lock) {
 			if (!test_and_set_bit_lock(bit_nr, &page->flags))
-				break;
+				return 0;
 		} else {
 			if (!test_bit(bit_nr, &page->flags))
-				break;
+				return 0;
 		}
+		if (unlikely(signal_pending_state(state, current)))
+			return -EINTR;
 	}
-
-	finish_wait(q, wait);
-
-	/*
-	 * A signal could leave PageWaiters set. Clearing it here if
-	 * !waitqueue_active would be possible (by open-coding finish_wait),
-	 * but still fail to catch it in the case of wait hash collision. We
-	 * already can fail to clear wait hash collision cases, so don't
-	 * bother with signals either.
-	 */
-
-	return ret;
 }
 
 void wait_on_page_bit(struct page *page, int bit_nr)
 {
-	wait_queue_head_t *q = page_waitqueue(page);
-	wait_on_page_bit_common(q, page, bit_nr, TASK_UNINTERRUPTIBLE, false);
+	wait_on_page_bit_common(page, bit_nr, TASK_UNINTERRUPTIBLE, false);
 }
 EXPORT_SYMBOL(wait_on_page_bit);
 
 int wait_on_page_bit_killable(struct page *page, int bit_nr)
 {
-	wait_queue_head_t *q = page_waitqueue(page);
-	return wait_on_page_bit_common(q, page, bit_nr, TASK_KILLABLE, false);
+	return wait_on_page_bit_common(page, bit_nr, TASK_KILLABLE, false);
 }
 
+#if 0
 /**
  * add_page_wait_queue - Add an arbitrary waiter to a page's wait queue
  * @page: Page defining the wait queue of interest
@@ -200,6 +310,7 @@ void add_page_wait_queue(struct page *page, wait_queue_entry_t *waiter)
 	spin_unlock_irqrestore(&q->lock, flags);
 }
 EXPORT_SYMBOL_GPL(add_page_wait_queue);
+#endif
 
 
 /**
@@ -209,16 +320,14 @@ EXPORT_SYMBOL_GPL(add_page_wait_queue);
 void __lock_page(struct page *__page)
 {
 	struct page *page = compound_head(__page);
-	wait_queue_head_t *q = page_waitqueue(page);
-	wait_on_page_bit_common(q, page, PG_locked, TASK_UNINTERRUPTIBLE, true);
+	wait_on_page_bit_common(page, PG_locked, TASK_UNINTERRUPTIBLE, true);
 }
 EXPORT_SYMBOL(__lock_page);
 
 int __lock_page_killable(struct page *__page)
 {
 	struct page *page = compound_head(__page);
-	wait_queue_head_t *q = page_waitqueue(page);
-	return wait_on_page_bit_common(q, page, PG_locked, TASK_KILLABLE, true);
+	return wait_on_page_bit_common(page, PG_locked, TASK_KILLABLE, true);
 }
 EXPORT_SYMBOL_GPL(__lock_page_killable);
 
-- 
2.14.0.rc1.2.g4c8247ec3


^ permalink raw reply related	[flat|nested] 57+ messages in thread

* Re: [PATCH 2/2 v2] sched/wait: Introduce lock breaker in wake_up_page_bit
  2017-08-25 22:51       ` Linus Torvalds
@ 2017-08-25 23:03         ` Linus Torvalds
  2017-08-26  0:31             ` Linus Torvalds
  0 siblings, 1 reply; 57+ messages in thread
From: Linus Torvalds @ 2017-08-25 23:03 UTC (permalink / raw)
  To: Tim Chen
  Cc: Mel Gorman, Peter Zijlstra, Ingo Molnar, Andi Kleen, Kan Liang,
	Andrew Morton, Johannes Weiner, Jan Kara, Christopher Lameter,
	Eric W . Biederman, Davidlohr Bueso, linux-mm,
	Linux Kernel Mailing List

[-- Attachment #1: Type: text/plain, Size: 654 bytes --]

On Fri, Aug 25, 2017 at 3:51 PM, Linus Torvalds
<torvalds@linux-foundation.org> wrote:
>
> So take it as that: example pseudo-code that happens to pass a
> compiler, but is meant as an RFD rather than actually working.

Oh, and after I sent it out, I wanted to look once again, and realized
that the "remove_myself_from()" function is entirely broken.

The caller has already removed the page entry; we don't want to remove
it again, we want to add a *new* one with us removed from it.

So here's an updated 2/2 patch with that fixed.

Let this be a lesson in just *how* little tested, and *how* crap that
patch probably still is.

                 Linus

[-- Attachment #2: 0002-Re-implement-the-page-bit-wait-code.patch --]
[-- Type: text/x-patch, Size: 13238 bytes --]

From 3f3355eab709e6fa418466e5487a30eb6ec80423 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Fri, 25 Aug 2017 16:01:36 -0700
Subject: [PATCH 2/2] Re-implement the page bit-wait code

The page wait-queues have some horrible scaling issues, and they seem to
be hard to fix.  And the way they use the regular wait-queues made that
really bad, with interrupts disabled for long times etc.

This tries to re-implement them with a totally different model.  It's
known broken, and the add_page_wait_queue() thing that the cachefiles
code wants to use is not implemented at all (it probably will just need
to have a parallel set of wait-queues that are *only* used for that).

The code is untested and probably horribly buggy, but I'm hoping others
will take a look at this.

Not-signed-off-yet-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/page_wait_bit.c | 335 +++++++++++++++++++++++++++++++++++++----------------
 1 file changed, 235 insertions(+), 100 deletions(-)

diff --git a/mm/page_wait_bit.c b/mm/page_wait_bit.c
index 7550b6d2715a..968bc9b1cf21 100644
--- a/mm/page_wait_bit.c
+++ b/mm/page_wait_bit.c
@@ -9,9 +9,38 @@
 #include <linux/wait.h>
 #include <linux/export.h>
 #include <linux/sched/signal.h>
+#include <linux/list_bl.h>
 
 #include "internal.h"
 
+/*
+ * Each waiter on a page will register this
+ * 'page_wait_struct' as part of waiting.
+ *
+ * Note that for any particular page, only one of the
+ * structs will actually be visible at the head of the
+ * wait queue hash table at any particular time, but
+ * everybody has one, because as one waiter is woken
+ * up we will need to pick another head for waiters.
+ *
+ * NOTE! All the list operations are protected by the
+ * hlist_bl_lock on the hash table.
+ */
+struct page_wait_struct {
+	// This is the hash queue head entry
+	// only used once per { page, bit }
+	struct hlist_bl_node list;
+	struct page *page;
+	int bit;
+
+	struct page_wait_struct *all;
+	struct page_wait_struct *exclusive;
+
+	// This is the waiter list
+	struct page_wait_struct *next;
+	struct task_struct *wake;
+};
+
 /*
  * In order to wait for pages to become available there must be
  * waitqueues associated with pages. By using a hash table of
@@ -22,11 +51,11 @@
  * at a cost of "thundering herd" phenomena during rare hash
  * collisions.
  */
-#define PAGE_WAIT_TABLE_BITS 8
+#define PAGE_WAIT_TABLE_BITS 12
 #define PAGE_WAIT_TABLE_SIZE (1 << PAGE_WAIT_TABLE_BITS)
-static wait_queue_head_t page_wait_table[PAGE_WAIT_TABLE_SIZE] __cacheline_aligned;
+static struct hlist_bl_head page_wait_table[PAGE_WAIT_TABLE_SIZE] __cacheline_aligned;
 
-static wait_queue_head_t *page_waitqueue(struct page *page)
+static struct hlist_bl_head *page_waitqueue(struct page *page)
 {
 	return &page_wait_table[hash_ptr(page, PAGE_WAIT_TABLE_BITS)];
 }
@@ -36,73 +65,155 @@ void __init pagecache_init(void)
 	int i;
 
 	for (i = 0; i < PAGE_WAIT_TABLE_SIZE; i++)
-		init_waitqueue_head(&page_wait_table[i]);
+		INIT_HLIST_BL_HEAD(&page_wait_table[i]);
 
 	page_writeback_init();
 }
 
-struct wait_page_key {
-	struct page *page;
-	int bit_nr;
-	int page_match;
-};
+/*
+ * We found a wait entry for the requested page and bit.
+ *
+ * We now need to create a wakeup list, which includes the
+ * first exclusive waiter (if any), and all the non-exclusive
+ * ones.
+ *
+ * If there is more than one exclusive waiter, we need to
+ * turn the next exclusive waiter into a wait entry, and
+ * add it back to the page wait list.
+ */
+static struct page_wait_struct *create_wake_up_list(struct page_wait_struct *entry, struct hlist_bl_head *head)
+{
+	struct page_wait_struct *all = entry->all;
+	struct page_wait_struct *exclusive = entry->exclusive;
 
-struct wait_page_queue {
-	struct page *page;
-	int bit_nr;
-	wait_queue_entry_t wait;
-};
+	if (exclusive) {
+		struct page_wait_struct *remain = exclusive->next;
+
+		if (remain) {
+			remain->all = NULL;
+			remain->exclusive = remain;
+			hlist_bl_add_head(&remain->list, head);
+		}
+		exclusive->next = all;
+		all = exclusive;
+	}
+	return all;
+}
 
-static int wake_page_function(wait_queue_entry_t *wait, unsigned mode, int sync, void *arg)
+static inline int remove_myself_from_one_list(struct page_wait_struct **p, struct page_wait_struct *entry)
 {
-	struct wait_page_key *key = arg;
-	struct wait_page_queue *wait_page
-		= container_of(wait, struct wait_page_queue, wait);
+	while (*p) {
+		struct page_wait_struct *n = *p;
+		if (n == entry) {
+			*p = n->next;
+			return 1;
+		}
+		p = &n->next;
+	}
+	return 0;
+}
 
-	if (wait_page->page != key->page)
-	       return 0;
-	key->page_match = 1;
+/*
+ * We got woken up, and we need to make sure there is no more
+ * access to us in the list.
+ */
+static void remove_myself_from(struct page_wait_struct *old, struct page_wait_struct *entry, struct hlist_bl_head *head)
+{
+	struct page_wait_struct *new;
 
-	if (wait_page->bit_nr != key->bit_nr)
-		return 0;
-	if (test_bit(key->bit_nr, &key->page->flags))
-		return 0;
+	/* We can be on only one list */
+	if (!remove_myself_from_one_list(&old->all, entry))
+		remove_myself_from_one_list(&old->exclusive, entry);
 
-	return autoremove_wake_function(wait, mode, sync, key);
+	/*
+	 * If we were the old entry for the page/bit on the hash list,
+	 * we need to create a new one from one of the existing other
+	 * ones.
+	 *
+	 * If the head entry was somebody else, or if there are no
+	 * other wait entries for this page, we're done.
+	 */
+	if (old != entry)
+		return;
+
+	new = entry->exclusive;
+	if (!new) {
+		new = entry->all;
+		if (!new)
+			return;
+	}
+
+	/*
+	 * We can just use our old lists - we already removed our own
+	 * entry from them above.
+	 */
+	new->exclusive = entry->exclusive;
+	new->all = entry->all;
+	hlist_bl_add_head(&new->list, head);
+}
+
+
+/*
+ * Find and remove the matching page/bit entry from the (locked) bl list
+ *
+ * Return ERR_PTR(-ESRCH) if no matching page at all, NULL if page found
+ * but not with matching bit.
+ */
+static struct page_wait_struct *find_del_entry(struct page *page, int bit_nr, struct hlist_bl_head *head)
+{
+	struct page_wait_struct *entry;
+	struct page_wait_struct *ret = ERR_PTR(-ESRCH);
+	struct hlist_bl_node *node;
+
+	hlist_bl_for_each_entry(entry, node, head, list) {
+		if (entry->page != page)
+			continue;
+		ret = NULL;
+		if (entry->bit != bit_nr)
+			continue;
+		__hlist_bl_del(node);
+		INIT_HLIST_BL_NODE(node);
+		ret = entry;
+		break;
+	}
+	return ret;
 }
 
 static void wake_up_page_bit(struct page *page, int bit_nr)
 {
-	wait_queue_head_t *q = page_waitqueue(page);
-	struct wait_page_key key;
+	struct hlist_bl_head *head = page_waitqueue(page);
+	struct page_wait_struct *wake;
 	unsigned long flags;
 
-	key.page = page;
-	key.bit_nr = bit_nr;
-	key.page_match = 0;
+	local_save_flags(flags);
+	hlist_bl_lock(head);
+
+	wake = find_del_entry(page, bit_nr, head);
+	if (IS_ERR(wake)) {
+		ClearPageWaiters(page);
+		wake = NULL;
+	} else if (wake) {
+		wake = create_wake_up_list(wake, head);
+	}
+
+	hlist_bl_unlock(head);
+	local_irq_restore(flags);
 
-	spin_lock_irqsave(&q->lock, flags);
-	__wake_up_locked_key(q, TASK_NORMAL, &key);
 	/*
-	 * It is possible for other pages to have collided on the waitqueue
-	 * hash, so in that case check for a page match. That prevents a long-
-	 * term waiter
+	 * Actually wake everybody up. Note that as we
+	 * wake them up, we can't use the 'wake_list'
+	 * entry any more, because it's on their stack.
 	 *
-	 * It is still possible to miss a case here, when we woke page waiters
-	 * and removed them from the waitqueue, but there are still other
-	 * page waiters.
+	 * We also clear the 'wake' field so that the
+	 * target process can see if they got woken up
+	 * by a page bit event.
 	 */
-	if (!waitqueue_active(q) || !key.page_match) {
-		ClearPageWaiters(page);
-		/*
-		 * It's possible to miss clearing Waiters here, when we woke
-		 * our page waiters, but the hashed waitqueue has waiters for
-		 * other pages on it.
-		 *
-		 * That's okay, it's a rare case. The next waker will clear it.
-		 */
-	}
-	spin_unlock_irqrestore(&q->lock, flags);
+	while (wake) {
+		struct task_struct *p = wake->wake;
+		wake = wake->next;
+		smp_store_release(&wake->wake, NULL);
+		wake_up_process(p);
+	};
 }
 
 static void wake_up_page(struct page *page, int bit)
@@ -112,76 +223,101 @@ static void wake_up_page(struct page *page, int bit)
 	wake_up_page_bit(page, bit);
 }
 
-static inline int wait_on_page_bit_common(wait_queue_head_t *q,
-		struct page *page, int bit_nr, int state, bool lock)
+/*
+ * Wait for the specific page bit to clear.
+ */
+static void wait_once_on_page_bit(struct page *page, int bit_nr, int state, bool lock)
 {
-	struct wait_page_queue wait_page;
-	wait_queue_entry_t *wait = &wait_page.wait;
-	int ret = 0;
+	struct page_wait_struct entry, *old;
+	struct hlist_bl_head *head;
+	unsigned long flags;
 
-	init_wait(wait);
-	wait->func = wake_page_function;
-	wait_page.page = page;
-	wait_page.bit_nr = bit_nr;
+	INIT_HLIST_BL_NODE(&entry.list);
+	entry.page = page;
+	entry.bit = bit_nr;
+	entry.all = entry.exclusive = NULL;
+	entry.next = NULL;
+	entry.wake = current;
+
+	head = page_waitqueue(page);
+	local_save_flags(flags);
+	hlist_bl_lock(head);
+
+	old = find_del_entry(page, bit_nr, head);
+	if (IS_ERR(old))
+		old = NULL;
+	if (old) {
+		entry.all = old->all;
+		entry.exclusive = old->exclusive;
+	}
 
-	for (;;) {
-		spin_lock_irq(&q->lock);
-
-		if (likely(list_empty(&wait->entry))) {
-			if (lock)
-				__add_wait_queue_entry_tail_exclusive(q, wait);
-			else
-				__add_wait_queue(q, wait);
-			SetPageWaiters(page);
-		}
+	if (lock) {
+		entry.next = entry.exclusive;
+		entry.exclusive = &entry;
+	} else {
+		entry.next = entry.all;
+		entry.all = &entry;
+	}
 
-		set_current_state(state);
+	hlist_bl_add_head(&entry.list, head);
+	current->state = state;
 
-		spin_unlock_irq(&q->lock);
+	hlist_bl_unlock(head);
+	local_irq_restore(flags);
 
-		if (likely(test_bit(bit_nr, &page->flags))) {
-			io_schedule();
-			if (unlikely(signal_pending_state(state, current))) {
-				ret = -EINTR;
-				break;
-			}
-		}
+	if (likely(test_bit(bit_nr, &page->flags)))
+		io_schedule();
 
+	/*
+	 * NOTE! If we were woken up by something else,
+	 * we have to remove ourselves from the hash list.
+	 *
+	 * But in order to avoid extra locking overhead in
+	 * the common case, we only do this if we can't
+	 * already tell that we were woken up (and thus
+	 * no longer on the lists).
+	 */
+	if (smp_load_acquire(&entry.wake) != NULL) {
+		local_save_flags(flags);
+		hlist_bl_lock(head);
+
+		old = find_del_entry(page, bit_nr, head);
+		if (old && !IS_ERR(old))
+			remove_myself_from(old, &entry, head);
+
+		hlist_bl_unlock(head);
+		local_irq_restore(flags);
+	}
+}
+
+static inline int wait_on_page_bit_common(struct page *page, int bit_nr, int state, bool lock)
+{
+	for (;;) {
+		wait_once_on_page_bit(page, bit_nr, state, lock);
 		if (lock) {
 			if (!test_and_set_bit_lock(bit_nr, &page->flags))
-				break;
+				return 0;
 		} else {
 			if (!test_bit(bit_nr, &page->flags))
-				break;
+				return 0;
 		}
+		if (unlikely(signal_pending_state(state, current)))
+			return -EINTR;
 	}
-
-	finish_wait(q, wait);
-
-	/*
-	 * A signal could leave PageWaiters set. Clearing it here if
-	 * !waitqueue_active would be possible (by open-coding finish_wait),
-	 * but still fail to catch it in the case of wait hash collision. We
-	 * already can fail to clear wait hash collision cases, so don't
-	 * bother with signals either.
-	 */
-
-	return ret;
 }
 
 void wait_on_page_bit(struct page *page, int bit_nr)
 {
-	wait_queue_head_t *q = page_waitqueue(page);
-	wait_on_page_bit_common(q, page, bit_nr, TASK_UNINTERRUPTIBLE, false);
+	wait_on_page_bit_common(page, bit_nr, TASK_UNINTERRUPTIBLE, false);
 }
 EXPORT_SYMBOL(wait_on_page_bit);
 
 int wait_on_page_bit_killable(struct page *page, int bit_nr)
 {
-	wait_queue_head_t *q = page_waitqueue(page);
-	return wait_on_page_bit_common(q, page, bit_nr, TASK_KILLABLE, false);
+	return wait_on_page_bit_common(page, bit_nr, TASK_KILLABLE, false);
 }
 
+#if 0
 /**
  * add_page_wait_queue - Add an arbitrary waiter to a page's wait queue
  * @page: Page defining the wait queue of interest
@@ -200,6 +336,7 @@ void add_page_wait_queue(struct page *page, wait_queue_entry_t *waiter)
 	spin_unlock_irqrestore(&q->lock, flags);
 }
 EXPORT_SYMBOL_GPL(add_page_wait_queue);
+#endif
 
 
 /**
@@ -209,16 +346,14 @@ EXPORT_SYMBOL_GPL(add_page_wait_queue);
 void __lock_page(struct page *__page)
 {
 	struct page *page = compound_head(__page);
-	wait_queue_head_t *q = page_waitqueue(page);
-	wait_on_page_bit_common(q, page, PG_locked, TASK_UNINTERRUPTIBLE, true);
+	wait_on_page_bit_common(page, PG_locked, TASK_UNINTERRUPTIBLE, true);
 }
 EXPORT_SYMBOL(__lock_page);
 
 int __lock_page_killable(struct page *__page)
 {
 	struct page *page = compound_head(__page);
-	wait_queue_head_t *q = page_waitqueue(page);
-	return wait_on_page_bit_common(q, page, PG_locked, TASK_KILLABLE, true);
+	return wait_on_page_bit_common(page, PG_locked, TASK_KILLABLE, true);
 }
 EXPORT_SYMBOL_GPL(__lock_page_killable);
 
-- 
2.14.0.rc1.2.g4c8247ec3


^ permalink raw reply related	[flat|nested] 57+ messages in thread

* Re: [PATCH 2/2 v2] sched/wait: Introduce lock breaker in wake_up_page_bit
  2017-08-25 23:03         ` Linus Torvalds
@ 2017-08-26  0:31             ` Linus Torvalds
  0 siblings, 0 replies; 57+ messages in thread
From: Linus Torvalds @ 2017-08-26  0:31 UTC (permalink / raw)
  To: Tim Chen
  Cc: Mel Gorman, Peter Zijlstra, Ingo Molnar, Andi Kleen, Kan Liang,
	Andrew Morton, Johannes Weiner, Jan Kara, Christopher Lameter,
	Eric W . Biederman, Davidlohr Bueso, linux-mm,
	Linux Kernel Mailing List

On Fri, Aug 25, 2017 at 4:03 PM, Linus Torvalds
<torvalds@linux-foundation.org> wrote:
>
> Let this be a lesson in just *how* little tested, and *how* crap that
> patch probably still is.

I haven't had time to look at it any more (trying to merge the pull
requests that came in today instead), but the more I think about it,
the more I think it was a mistake to do that page_wait_struct
allocation on the stack.

It made it way more fragile and complicated, having to rewrite things
so carefully. A simple slab cache would likely be a lot cleaner and
simpler.
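
(For illustration only, not code from this thread: a minimal sketch of
what that "simple slab cache" alternative could look like, reusing the
page_wait_struct from the patch above.)

    static struct kmem_cache *page_wait_cachep;

    void __init page_wait_cache_init(void)
    {
            /* one cache for all page waiters, instead of on-stack entries */
            page_wait_cachep = KMEM_CACHE(page_wait_struct, SLAB_PANIC);
    }

    /* waiter side: allocate before queueing, free after the wakeup */
    struct page_wait_struct *w = kmem_cache_zalloc(page_wait_cachep, GFP_KERNEL);
    /* ... queue it, sleep, get woken ... */
    kmem_cache_free(page_wait_cachep, w);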

So even if that thing can be made to work, it's probably not worth the pain.

               Linus

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [PATCH 2/2 v2] sched/wait: Introduce lock breaker in wake_up_page_bit
  2017-08-26  0:31             ` Linus Torvalds
@ 2017-08-26  2:54               ` Linus Torvalds
  -1 siblings, 0 replies; 57+ messages in thread
From: Linus Torvalds @ 2017-08-26  2:54 UTC (permalink / raw)
  To: Tim Chen
  Cc: Mel Gorman, Peter Zijlstra, Ingo Molnar, Andi Kleen, Kan Liang,
	Andrew Morton, Johannes Weiner, Jan Kara, Christopher Lameter,
	Eric W . Biederman, Davidlohr Bueso, linux-mm,
	Linux Kernel Mailing List

On Fri, Aug 25, 2017 at 5:31 PM, Linus Torvalds
<torvalds@linux-foundation.org> wrote:
>
> It made it way more fragile and complicated, having to rewrite things
> so carefully. A simple slab cache would likely be a lot cleaner and
> simpler.

It also turns out that despite all the interfaces, we only really ever
wait on two different bits: PG_locked and PG_writeback. Nothing else.

Even the add_page_wait_queue() thing, which looks oh-so-generic,
really only waits on PG_locked.

And the PG_writeback case never really cares for the "locked" case, so
this incredibly generic interface that allows you to wait on any bit
you want, and has the whole exclusive wait support for getting
exclusive access to the bit really only has three cases:

 - wait for locked exclusive (wake up first waiter when unlocked)

 - wait for locked (wake up all waiters when unlocked)

 - wait for writeback (wake up all waiters when no longer under writeback)

and those last two could probably even share the same queue.

But even without sharing the same queue, we could just do a per-page
allocation for the three queues - and probably that stupid
add_page_wait_queue() waitqueue too. So no "per-page and per-bit"
thing, just a per-page thing.
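
(Hypothetical sketch, not something posted in this thread: a per-page
wait node covering just those three cases might look roughly like this,
hashed the same way the per-page-and-per-bit entries were.)

    struct page_wait {
            struct hlist_bl_node    hash;           /* bucket in the hashed wait table */
            struct page             *page;
            struct list_head        lock_exclusive; /* lockers: wake one on unlock */
            struct list_head        lock_all;       /* non-lockers: wake all on unlock */
            struct list_head        writeback;      /* wake all when writeback clears */
    };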

I'll try writing that up.

Simplify, simplify, simplify.

               Linus

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [PATCH 2/2 v2] sched/wait: Introduce lock breaker in wake_up_page_bit
  2017-08-26  2:54               ` Linus Torvalds
  (?)
@ 2017-08-26 18:15               ` Linus Torvalds
  2017-08-27 21:40                   ` Linus Torvalds
  2017-08-28 14:51                 ` Liang, Kan
  -1 siblings, 2 replies; 57+ messages in thread
From: Linus Torvalds @ 2017-08-26 18:15 UTC (permalink / raw)
  To: Tim Chen
  Cc: Mel Gorman, Peter Zijlstra, Ingo Molnar, Andi Kleen, Kan Liang,
	Andrew Morton, Johannes Weiner, Jan Kara, Christopher Lameter,
	Eric W . Biederman, Davidlohr Bueso, linux-mm,
	Linux Kernel Mailing List

[-- Attachment #1: Type: text/plain, Size: 5970 bytes --]

On Fri, Aug 25, 2017 at 7:54 PM, Linus Torvalds
<torvalds@linux-foundation.org> wrote:
>
> Simplify, simplify, simplify.

I've now tried three different approaches (I stopped sending broken
patches after deciding the first one was unfixable), and they all
suck.

I was hoping for something lockless to avoid the whole issue with
latency over the long list, but anything that has the wait queue entry
allocated on the stack needs to synchronize the wakeup due to the
stack usage, so once you have lists that are thousands of entries,
either you hold the lock for long times (normal wait queues) or take
and release them constantly (the swait approach), or you batch things
up (Tim's wait-queue patches).
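
(As an aside on why the on-stack entry forces that synchronization,
here is an illustrative fragment using the stock waitqueue helpers;
it is not code from this thread, and wait_for_cond()/cond are made up.)

    static void wait_for_cond(wait_queue_head_t *q, bool *cond)
    {
            DEFINE_WAIT(wait);      /* the entry lives in this stack frame */

            for (;;) {
                    prepare_to_wait(q, &wait, TASK_UNINTERRUPTIBLE);
                    if (READ_ONCE(*cond))
                            break;
                    schedule();
            }
            /* takes q->lock to unlink the entry; only then may the frame go away */
            finish_wait(q, &wait);
    }

The waker walks the list and calls the wake functions with q->lock
held, so it never touches an entry after the waiter has unlinked it in
finish_wait() - and that is exactly the lock hold time that explodes
once the list is thousands of entries long.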

Of those three approaches, the batching does seem to be the best of
the lot. Allocating non-stack wait entries while waiting for pages
seems like a bad idea. We're probably waiting for IO in normal
circumstances, possibly because we're low on memory.

So I *am* ok with the batching (your patch #1), but I'm *not* ok with
the nasty horrible book-keeping to avoid new entries when you batch
and release the lock and that allows new entries on the list (your
patch #2).

That said, I have now stared *way* too much at the page wait code
after having unsuccessfully tried to replace that wait-queue with
other "clever" approaches (all of which ended up being crap).

And I'm starting to see a possible solution, or at least improvement.

Let's just assume I take the batching patch. It's not conceptually
ugly, it's fairly simple, and there are lots of independent arguments
for it (ie latency, but also possible performance from better
parallelism).

That patch doesn't make any data structures bigger or more
complicated, it's just that single "is this a bookmark entry" thing.
So that patch just looks fundamentally fine to me, and the only real
argument I ever had against it was that I would really really _really_
have wanted to root-cause the behavior.

So that leaves my fundamental dislike of your other patch.

And it turns out that all my looking at the page wait code wasn't
entirely unproductive. Yes, I went through three crap approaches
before I gave up on rewriting it, but in the meantime I did get way
too intimate with looking at that pile of crud.

And I think I found the reason why you needed that patch #2 in the first place.

Here's what is going on:

 - we're going to assume that the problem is all with a single page,
not due to hash collisions (as per your earlier reports)

 - we also know that the only bit that matters is the PG_locked bit

 - that means that the only way to get that "multiple concurrent
waker" situation that your patch #2 tries to handle better is because
you have multiple people unlocking the page - since that is what
causes the wakeups

 - that in turn means that you obviously had multiple threads *locking* it too.

 - and that means that among those new entries there are lockers
coming in in the middle and adding an exclusive entry.

 - the exclusive entry already stops the wakeup list walking

 - but we add non-exclusive entries TO THE BEGINNING of the page waiters list

And I really think that last thing is fundamentally wrong.

It's wrong for several reasons:

 - because it's unfair: threads that want to lock get put behind
threads that just want to see the unlocked state.

 - because it's stupid: our non-locking waiters will end up waiting
again if the page got locked, so waking up a locker *and* a lot of
non-locking waiters will just cause them to go back to sleep anyway

 - because it causes us to walk longer lists: we stop walking when we
wake up the exclusive waiter, but because we always put the
non-exclusive waiters in there first, we always walk the long list of
non-exclusive waiters even if we could just stop walking because we
woke up an exclusive one.

Now the reason we do this seems to be entirely random: for a *normal*
wait queue, you really want to always wake up all the non-exclusive
waiters, because exclusive waiters are only exclusive wrt each other.

But for a page lock, an exclusive waiter really is getting the lock,
and the other waiters are going to wait for the lock to clear anyway,
so the page wait thing is actually almost exactly the reverse
situation. We *could* put exclusive waiters at the beginning of the
list instead, and actively try to avoid walking the list at all if we
have pending lockers.

I'm not doing that, because I think the fair thing to do is to just do
things in the order they came in. Plus the code is actually simpler if
we just always add to the tail.

Now, the other thing to look at is how the wakeup function works. It
checks the aliasing information (page and bit number), but then it
*also* does:

        if (test_bit(key->bit_nr, &key->page->flags))
                return 0;

basically saying "continue walking if somebody else already got the bit".

That's *INSANE*. It's exactly the wrong thing to do. It's basically
saying "even if we had an exclusive waiter, let's not wake it up, but
do let us continue to walk the list even though the bit we're waiting
for to clear is set already".

What would be sane is to say "stop walking the list now".. So just do
that - by making "negative return from wake function" mean "stop
walking".

So how about just this fairly trivial patch?

In fact, I think this may help even *without* Tim's patch #1. So I
think this would be lovely to test on that problem load on its own,
and see if it makes the wait queues behave better.

It might not cut down on the total length of the wait-queue, but it
should hopefully cause it to walk it much less. We now hit the
exclusive waiters earlier and stop waking things up when we have a new
locker thread pending. And when the page ends up being locked again,
we stop walking the list entirely, so no unnecessary traversal.

This patch is small and at least minimally works (I'm running it right now).

                               Linus

[-- Attachment #2: patch.diff --]
[-- Type: text/plain, Size: 1782 bytes --]

 kernel/sched/wait.c |  7 ++++---
 mm/filemap.c        | 10 +++++-----
 2 files changed, 9 insertions(+), 8 deletions(-)

diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c
index 17f11c6b0a9f..d6afed6d0752 100644
--- a/kernel/sched/wait.c
+++ b/kernel/sched/wait.c
@@ -70,9 +70,10 @@ static void __wake_up_common(struct wait_queue_head *wq_head, unsigned int mode,
 
 	list_for_each_entry_safe(curr, next, &wq_head->head, entry) {
 		unsigned flags = curr->flags;
-
-		if (curr->func(curr, mode, wake_flags, key) &&
-				(flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
+		int ret = curr->func(curr, mode, wake_flags, key);
+		if (ret < 0)
+			break;
+		if (ret && (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
 			break;
 	}
 }
diff --git a/mm/filemap.c b/mm/filemap.c
index a49702445ce0..60705b760983 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -909,8 +909,10 @@ static int wake_page_function(wait_queue_entry_t *wait, unsigned mode, int sync,
 
 	if (wait_page->bit_nr != key->bit_nr)
 		return 0;
+
+	/* Stop walking if it's locked */
 	if (test_bit(key->bit_nr, &key->page->flags))
-		return 0;
+		return -1;
 
 	return autoremove_wake_function(wait, mode, sync, key);
 }
@@ -964,6 +966,7 @@ static inline int wait_on_page_bit_common(wait_queue_head_t *q,
 	int ret = 0;
 
 	init_wait(wait);
+	wait->flags = lock ? WQ_FLAG_EXCLUSIVE : 0;
 	wait->func = wake_page_function;
 	wait_page.page = page;
 	wait_page.bit_nr = bit_nr;
@@ -972,10 +975,7 @@ static inline int wait_on_page_bit_common(wait_queue_head_t *q,
 		spin_lock_irq(&q->lock);
 
 		if (likely(list_empty(&wait->entry))) {
-			if (lock)
-				__add_wait_queue_entry_tail_exclusive(q, wait);
-			else
-				__add_wait_queue(q, wait);
+			__add_wait_queue_entry_tail(q, wait);
 			SetPageWaiters(page);
 		}
 

^ permalink raw reply related	[flat|nested] 57+ messages in thread

* Re: [PATCH 2/2 v2] sched/wait: Introduce lock breaker in wake_up_page_bit
  2017-08-26 18:15               ` Linus Torvalds
@ 2017-08-27 21:40                   ` Linus Torvalds
  2017-08-28 14:51                 ` Liang, Kan
  1 sibling, 0 replies; 57+ messages in thread
From: Linus Torvalds @ 2017-08-27 21:40 UTC (permalink / raw)
  To: Tim Chen
  Cc: Mel Gorman, Peter Zijlstra, Ingo Molnar, Andi Kleen, Kan Liang,
	Andrew Morton, Johannes Weiner, Jan Kara, Christopher Lameter,
	Eric W . Biederman, Davidlohr Bueso, linux-mm,
	Linux Kernel Mailing List

On Sat, Aug 26, 2017 at 11:15 AM, Linus Torvalds
<torvalds@linux-foundation.org> wrote:
>
> So how about just this fairly trivial patch?

So I just committed that trivial patch, because I think it's right,
but more importantly because I think I found a real and non-trivial
fundamental problem.

The reason I found it is actually that I was thinking about this
patch, and how the WQ_FLAG_EXCLUSIVE ordering matters.

And I don't really think the WQ_FLAG_EXCLUSIVE ordering matters all
that much, but just *thinking* about it made me realize that the code
is broken.

In particular, this caller:

    int __lock_page_killable(struct page *__page)
    {
        struct page *page = compound_head(__page);
        wait_queue_head_t *q = page_waitqueue(page);
        return wait_on_page_bit_common(q, page, PG_locked, TASK_KILLABLE, true);
    }
    EXPORT_SYMBOL_GPL(__lock_page_killable);

is completely broken crap.

Why?

It's the combination of "TASK_KILLABLE" and "true" that is broken.
Always has been broken, afaik.

The "true" is that "bool lock" argument, and when it is set, we set
the WQ_FLAG_EXCLUSIVE bit.

But that bit - by definition, that's the whole point - means that the
waking side only wakes up *one* waiter.

So there's a race in anybody who uses __lock_page_killable().

The race goes like this:

  thread1       thread2         thread3
  ----          ----            ----

  .. CPU1 ...
  __lock_page_killable
    wait_on_page_bit_common()
      get wq lock
      __add_wait_queue_entry_tail_exclusive()
      set_current_state(TASK_KILLABLE);
      release wq lock
        io_schedule

                ... CPU2 ...
                __lock_page[_killable]()
                  wait_on_page_bit_common()
                    get wq lock
                    __add_wait_queue_entry_tail_exclusive()
                    set_current_state(TASK_KILLABLE);
                    release wq lock
                    io_schedule

                                ... CPU3 ...
                                unlock_page()
                                wake_up_page_bit(page, PG_locked)
                                  wakes up CPU1 _only_

  ... lethal signal for thread1 happens ...
     if (unlikely(signal_pending_state(state, current))) {
          ret = -EINTR;
          break;
     }


End result: page is unlocked, CPU3 is waiting, nothing will wake CPU3 up.

Of course, if we have multiple threads all locking that page
concurrently, we probably will have *another* thread lock it
(successfully), and then when it unlocks it thread3 does get woken up
eventually.

But the keyword is "eventually". It could be a long while,
particularly if we don't lock the page *all* the time, just
occasionally.

So it might be a while, and it might explain how some waiters might queue up.

And who does __lock_page_killable? Page faults.

And who does a lot of page faults and page locking? That NUMA load from hell.

Does it have lethal signals, though? Probably not. That lethal signal
case really is unusual.

So I'm not saying that this is actually THE BUG. In particular,
despite that race, the page actually *is* unlocked afterwards. It's
just that one of the threads that wanted the lock didn't get notified
of it. So it doesn't really explain how non-locking waiters (ie the
people who don't do migrations, just wait for the migration entry)
would queue up.

But it sure looks like a real bug to me.

Basically, if you ask for an exclusive wakeup, you *have* to take the
resource you are waiting for. You can't just say "never mind, I'll
return -EINTR".

I don't see a simple fix for this yet other than perhaps adding a
wakeup to the "ret = -EINTR; break" case.
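
(Untested sketch of that idea, phrased against the
wait_on_page_bit_common() loop quoted earlier; illustrative only, not
an actual patch from this thread:)

    if (unlikely(signal_pending_state(state, current))) {
            ret = -EINTR;
            /*
             * We may have consumed the one exclusive wakeup but are
             * bailing out without taking the lock, so pass the wakeup
             * on rather than swallowing it.
             */
            if (lock)
                    wake_up_page_bit(page, bit_nr);
            break;
    }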

Does anybody else see anything? Or maybe see a reason why this
wouldn't be a bug in the first place?

Anyway, I am officially starting to hate that page wait code.  I've
stared at it for days now, and I am not getting more fond of it.

                   Linus

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [PATCH 2/2 v2] sched/wait: Introduce lock breaker in wake_up_page_bit
  2017-08-27 21:40                   ` Linus Torvalds
@ 2017-08-27 21:42                     ` Linus Torvalds
  -1 siblings, 0 replies; 57+ messages in thread
From: Linus Torvalds @ 2017-08-27 21:42 UTC (permalink / raw)
  To: Tim Chen
  Cc: Mel Gorman, Peter Zijlstra, Ingo Molnar, Andi Kleen, Kan Liang,
	Andrew Morton, Johannes Weiner, Jan Kara, Christopher Lameter,
	Eric W . Biederman, Davidlohr Bueso, linux-mm,
	Linux Kernel Mailing List

On Sun, Aug 27, 2017 at 2:40 PM, Linus Torvalds
<torvalds@linux-foundation.org> wrote:
>
> End result: page is unlocked, CPU3 is waiting, nothing will wake CPU3 up.

Not CPU3. CPU3 was the waker. It's thread 2 that is waiting and never
got woken up, of course.

Other than that, the scenario still looks real to me.

Ideas?

                 Linus

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [PATCH 2/2 v2] sched/wait: Introduce lock breaker in wake_up_page_bit
  2017-08-27 21:40                   ` Linus Torvalds
@ 2017-08-27 23:12                     ` Linus Torvalds
  -1 siblings, 0 replies; 57+ messages in thread
From: Linus Torvalds @ 2017-08-27 23:12 UTC (permalink / raw)
  To: Tim Chen, Nick Piggin
  Cc: Mel Gorman, Peter Zijlstra, Ingo Molnar, Andi Kleen, Kan Liang,
	Andrew Morton, Johannes Weiner, Jan Kara, Christopher Lameter,
	Eric W . Biederman, Davidlohr Bueso, linux-mm,
	Linux Kernel Mailing List

On Sun, Aug 27, 2017 at 2:40 PM, Linus Torvalds
<torvalds@linux-foundation.org> wrote:
>
> The race goes like this:
>
>   thread1       thread2         thread3
>   ----          ----            ----
>
>   .. CPU1 ...
>   __lock_page_killable
>     wait_on_page_bit_common()
>       get wq lock
>       __add_wait_queue_entry_tail_exclusive()
>       set_current_state(TASK_KILLABLE);
>       release wq lock
>         io_schedule
>
>                 ... CPU2 ...
>                 __lock_page[_killable]()
>                   wait_on_page_bit_common()
>                     get wq lock
>                     __add_wait_queue_entry_tail_exclusive()
>                     set_current_state(TASK_KILLABLE);
>                     release wq lock
>                     io_schedule
>
>                                 ... CPU3 ...
>                                 unlock_page()
>                                 wake_up_page_bit(page, PG_Locked)
>                                   wakes up CPU1 _only_
>
>   ... lethal signal for thread1 happens ...
>      if (unlikely(signal_pending_state(state, current))) {
>           ret = -EINTR;
>           break;
>      }

With the race meaning that thread2 never gets woken up due to the
exclusive wakeup being caught by thread1 (which doesn't actually take
the lock).

I think that this bug was introduced by commit 62906027091f ("mm: add
PageWaiters indicating tasks are waiting for a page bit"), which
changed the page lock from using the wait_on_bit_lock() code to its
own _slightly_ different version.

Because it looks like _almost_ the same thing existed in the old
wait_on_bit_lock() code - and that is still used by a couple of
filesystems.

*Most* of the users seem to use TASK_UNINTERRUPTIBLE, which is fine.
But cifs and the sunrpc XPRT_LOCKED code both use the TASK_KILLABLE
form that would seem to have the exact same issue: wait_on_bit_lock()
uses exclusive wait-queues, but then may return with an error without
actually taking the lock.

Now, it turns out that I think the wait_on_bit_lock() code is actually
safe, for a subtle reason.

Why? Unlike the page lock code, the wait_on_bit_lock() code always
tries to get the lock bit before returning an error. So
wait_on_bit_lock() will prefer a successful lock taking over EINTR,
which means that if the bit really was unlocked, it would have been
taken.

And if something else locked the bit again under us and we didn't get
it, that "something else" presumably will then wake things up when it
unlocks.

So the wait_on_bit_lock() code could _also_ lose the wakeup event, but
it would only lose it in situations where somebody else would then
re-send it.

Do people agree with that analysis?

So I think the old wait_on_bit_lock() code ends up being safe -
despite having this same pattern of "exclusive wait but might error
out without taking the lock".
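(From memory, the __wait_on_bit_lock() loop is shaped roughly like this -
paraphrased, not the verbatim kernel code, and the wbq_entry/action names
are approximate:

    int ret = 0;

    for (;;) {
            prepare_to_wait_exclusive(wq_head, &wbq_entry->wq_entry, mode);
            if (test_bit(wbq_entry->key.bit_nr, wbq_entry->key.flags))
                    ret = action(&wbq_entry->key, mode);  /* may be -EINTR */
            /* always try to take the bit, even if the action errored out */
            if (!test_and_set_bit(wbq_entry->key.bit_nr, wbq_entry->key.flags)) {
                    finish_wait(wq_head, &wbq_entry->wq_entry);
                    return 0;       /* got the lock - the error is dropped */
            }
            if (ret) {
                    /* somebody else holds the bit and will redo the wakeup */
                    finish_wait(wq_head, &wbq_entry->wq_entry);
                    return ret;
            }
    }

so an error can only escape the loop while somebody else holds the bit.)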

Whether that "wait_on_bit_lock() is safe" was just a fluke or was
because people thought about it is unclear. It's old code. People
probably *did* think about it. I really can't remember.

But it does point to a fix for the page lock case: just move the
signal_pending_state() test to after the bit checking.

So the page locking code is racy because you could have this:

 - another cpu does the unlock_page() and wakes up the process (and
uses the exclusive event)

 - we then get a lethal signal before we get to the
"signal_pending_state()" state.

 - we end up prioritizing the lethal signal, because obviously we
don't care about locking the page any more.

 - so now the lock bit may be still clear and there's nobody who is
going to wake up the remaining waiter

Moving the signal_pending_state() down actually fixes the race,
because we know that in order for the exclusive thing to have
mattered, it *has* to actually wake us up. So the unlock_page() must
happen before the lethal signal (where before is well-defined because
of that "try_to_wake_up()" taking a lock and looking at the task
state). The exclusive accounting is only done if the process is
actually woken up, not if it was already running (see
"try_to_wake_up()").

And if the unlock_page() happened before the lethal signal, then we
know that test_and_set_bit_lock() will either work (in which case
we're ok), or another locker successfully came in later - in which
case we're _also_ ok, because that other locker will then do the
unlock again, and wake up subsequent waiters that might have been
blocked by our first exclusive waiter.

So I propose that the fix might be as simple as this:

    diff --git a/mm/filemap.c b/mm/filemap.c
    index baba290c276b..0b41c8cbeabc 100644
    --- a/mm/filemap.c
    +++ b/mm/filemap.c
    @@ -986,10 +986,6 @@ static inline int wait_on_page_bit_common(wait_queue_head_t *q,

                if (likely(test_bit(bit_nr, &page->flags))) {
                        io_schedule();
    -                   if (unlikely(signal_pending_state(state, current))) {
    -                           ret = -EINTR;
    -                           break;
    -                   }
                }

                if (lock) {
    @@ -999,6 +995,11 @@ static inline int wait_on_page_bit_common(wait_queue_head_t *q,
                        if (!test_bit(bit_nr, &page->flags))
                                break;
                }
    +
    +           if (unlikely(signal_pending_state(state, current))) {
    +                   ret = -EINTR;
    +                   break;
    +           }
        }

        finish_wait(q, wait);

but maybe I'm missing something.

Nick, comments?

                 Linus

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [PATCH 2/2 v2] sched/wait: Introduce lock breaker in wake_up_page_bit
  2017-08-27 23:12                     ` Linus Torvalds
@ 2017-08-28  1:16                       ` Nicholas Piggin
  -1 siblings, 0 replies; 57+ messages in thread
From: Nicholas Piggin @ 2017-08-28  1:16 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Tim Chen, Mel Gorman, Peter Zijlstra, Ingo Molnar, Andi Kleen,
	Kan Liang, Andrew Morton, Johannes Weiner, Jan Kara,
	Christopher Lameter, Eric W . Biederman, Davidlohr Bueso,
	linux-mm, Linux Kernel Mailing List

On Sun, 27 Aug 2017 16:12:19 -0700
Linus Torvalds <torvalds@linux-foundation.org> wrote:

> On Sun, Aug 27, 2017 at 2:40 PM, Linus Torvalds
> <torvalds@linux-foundation.org> wrote:
> >
> > The race goes like this:
> >
> >   thread1       thread2         thread3
> >   ----          ----            ----
> >
> >   .. CPU1 ...
> >   __lock_page_killable
> >     wait_on_page_bit_common()
> >       get wq lock
> >       __add_wait_queue_entry_tail_exclusive()
> >       set_current_state(TASK_KILLABLE);
> >       release wq lock
> >         io_schedule
> >
> >                 ... CPU2 ...
> >                 __lock_page[_killable]()
> >                   wait_on_page_bit_common()
> >                     get wq lock
> >                     __add_wait_queue_entry_tail_exclusive()
> >                     set_current_state(TASK_KILLABLE);
> >                     release wq lock
> >                     io_schedule
> >
> >                                 ... CPU3 ...
> >                                 unlock_page()
> >                                 wake_up_page_bit(page, PG_Locked)
> >                                   wakes up CPU1 _only_
> >
> >   ... lethal signal for thread1 happens ...
> >      if (unlikely(signal_pending_state(state, current))) {
> >           ret = -EINTR;
> >           break;
> >      }  
> 
> With the race meaning that thread2 never gets woken up due to the
> exclusive wakeup being caught by thread1 (which doesn't actually take
> the lock).
> 
> I think that this bug was introduced by commit 62906027091f ("mm: add
> PageWaiters indicating tasks are waiting for a page bit"), which
> changed the page lock from using the wait_on_bit_lock() code to its
> own _slightly_ different version.
> 
> Because it looks like _almost_ the same thing existed in the old
> wait_on_bit_lock() code - and that is still used by a couple of
> filesystems.
> 
> *Most* of the users seem to use TASK_UNINTERRUPTIBLE, which is fine.
> But cifs and the sunrpc XPRT_LOCKED code both use the TASK_KILLABLE
> form that would seem to have the exact same issue: wait_on_bit_lock()
> uses exclusive wait-queues, but then may return with an error without
> actually taking the lock.
> 
> Now, it turns out that I think the wait_on_bit_lock() code is actually
> safe, for a subtle reason.
> 
> Why? Unlike the page lock code, the wait_on_bit_lock() code always
> tries to get the lock bit before returning an error. So
> wait_on_bit_lock() will prefer a successful lock taking over EINTR,
> which means that if the bit really was unlocked, it would have been
> taken.
> 
> And if something else locked the bit again under us and we didn't get
> it, that "something else" presumably will then wake things up when it
> unlocks.
> 
> So the wait_on_bit_lock() code could _also_ lose the wakeup event, but
> it would only lose it in situations where somebody else would then
> re-send it.
> 
> Do people agree with that analysis?
> 
> So I think the old wait_on_bit_lock() code ends up being safe -
> despite having this same pattern of "exclusive wait but might error
> out without taking the lock".
> 
> Whether that "wait_on_bit_lock() is safe" was just a fluke or was
> because people thought about it is unclear. It's old code. People
> probably *did* think about it. I really can't remember.
> 
> But it does point to a fix for the page lock case: just move the
> signal_pending_state() test to after the bit checking.
> 
> So the page locking code is racy because you could have this:
> 
>  - another cpu does the unlock_page() and wakes up the process (and
> uses the exclusive event)
> 
>  - we then get a lethal signal before we get to the
> "signal_pending_state()" state.
> 
>  - we end up prioritizing the lethal signal, because obviously we
> don't care about locking the page any more.
> 
>  - so now the lock bit may be still clear and there's nobody who is
> going to wake up the remaining waiter
> 
> Moving the signal_pending_state() down actually fixes the race,
> because we know that in order for the exclusive thing to have
> mattered, it *has* to actually wake us up. So the unlock_page() must
> happen before the lethal signal (where before is well-defined because
> of that "try_to_wake_up()" taking a lock and looking at the task
> state). The exclusive accounting is only done if the process is
> actually woken up, not if it was already running (see
> "try_to_wake_up()").
> 
> And if the unlock_page() happened before the lethal signal, then we
> know that test_and_set_bit_lock() will either work (in which case
> we're ok), or another locker successfully came in later - in which
> case we're _also_ ok, because that other locker will then do the
> unlock again, and wake up subsequent waiters that might have been
> blocked by our first exclusive waiter.
> 
> So I propose that the fix might be as simple as this:
> 
>     diff --git a/mm/filemap.c b/mm/filemap.c
>     index baba290c276b..0b41c8cbeabc 100644
>     --- a/mm/filemap.c
>     +++ b/mm/filemap.c
>     @@ -986,10 +986,6 @@ static inline int
> wait_on_page_bit_common(wait_queue_head_t *q,
> 
>                 if (likely(test_bit(bit_nr, &page->flags))) {
>                         io_schedule();
>     -                   if (unlikely(signal_pending_state(state, current))) {
>     -                           ret = -EINTR;
>     -                           break;
>     -                   }
>                 }
> 
>                 if (lock) {
>     @@ -999,6 +995,11 @@ static inline int
> wait_on_page_bit_common(wait_queue_head_t *q,
>                         if (!test_bit(bit_nr, &page->flags))
>                                 break;
>                 }
>     +
>     +           if (unlikely(signal_pending_state(state, current))) {
>     +                   ret = -EINTR;
>     +                   break;
>     +           }
>         }
> 
>         finish_wait(q, wait);
> 
> but maybe I'm missing something.
> 
> Nick, comments?

No, I don't think you're missing something. We surely could lose our only
wakeup in this window. So an exclusive waiter always has to make sure
it propagates the wakeup (regardless of what it does with the contended
resource itself).

Seems like your fix should solve it. By the look of how wait_on_bit_lock
is structured, the author probably did think about this case a little
better than I did :\

Thanks,
Nick

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [PATCH 2/2 v2] sched/wait: Introduce lock breaker in wake_up_page_bit
  2017-08-28  1:16                       ` Nicholas Piggin
@ 2017-08-28  1:29                         ` Nicholas Piggin
  -1 siblings, 0 replies; 57+ messages in thread
From: Nicholas Piggin @ 2017-08-28  1:29 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Tim Chen, Mel Gorman, Peter Zijlstra, Ingo Molnar, Andi Kleen,
	Kan Liang, Andrew Morton, Johannes Weiner, Jan Kara,
	Christopher Lameter, Eric W . Biederman, Davidlohr Bueso,
	linux-mm, Linux Kernel Mailing List

On Mon, 28 Aug 2017 11:16:48 +1000
Nicholas Piggin <npiggin@gmail.com> wrote:

> On Sun, 27 Aug 2017 16:12:19 -0700
> Linus Torvalds <torvalds@linux-foundation.org> wrote:
> 

> >     diff --git a/mm/filemap.c b/mm/filemap.c
> >     index baba290c276b..0b41c8cbeabc 100644
> >     --- a/mm/filemap.c
> >     +++ b/mm/filemap.c
> >     @@ -986,10 +986,6 @@ static inline int
> > wait_on_page_bit_common(wait_queue_head_t *q,
> > 
> >                 if (likely(test_bit(bit_nr, &page->flags))) {
> >                         io_schedule();
> >     -                   if (unlikely(signal_pending_state(state, current))) {
> >     -                           ret = -EINTR;
> >     -                           break;
> >     -                   }
> >                 }
> > 
> >                 if (lock) {
> >     @@ -999,6 +995,11 @@ static inline int
> > wait_on_page_bit_common(wait_queue_head_t *q,
> >                         if (!test_bit(bit_nr, &page->flags))
> >                                 break;
> >                 }
> >     +
> >     +           if (unlikely(signal_pending_state(state, current))) {
> >     +                   ret = -EINTR;
> >     +                   break;
> >     +           }
> >         }
> > 
> >         finish_wait(q, wait);
> > 
> > but maybe I'm missing something.
> > 
> > Nick, comments?  
> 
> No I don't think you're missing something. We surely could lose our only
> wakeup in this window. So an exclusive waiter has to always make sure
> they propagate the wakeup (regardless of what they do with the contended
> resources itself).
> 
> Seems like your fix should solve it. By the look of how wait_on_bit_lock
> is structured, the author probably did think about this case a little
> better than I did :\

BTW, since you are looking at this stuff, one other small problem I remember
with exclusive waiters is that losing to a concurrent locker puts them at
the back of the queue. I think that could be fixed with some small change to
the wait loops (first add to tail, then retries add to head). Thoughts?
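
Something like this, conceptually (totally untested sketch against the
current wait_on_page_bit_common() loop, and "first_time" is just a made-up
local flag to show the idea):

    bool first_time = true;

    for (;;) {
            spin_lock_irq(&q->lock);
            if (likely(list_empty(&wait->entry))) {
                    if (first_time) {
                            /* first wait: queue up behind existing waiters */
                            __add_wait_queue_entry_tail(q, wait);
                            first_time = false;
                    } else {
                            /* lost to a concurrent locker: keep our place */
                            __add_wait_queue(q, wait);
                    }
                    SetPageWaiters(page);
            }
            set_current_state(state);
            spin_unlock_irq(&q->lock);
            ...
    }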

Thanks,
Nick

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [PATCH 2/2 v2] sched/wait: Introduce lock breaker in wake_up_page_bit
  2017-08-28  1:29                         ` Nicholas Piggin
@ 2017-08-28  5:17                           ` Linus Torvalds
  -1 siblings, 0 replies; 57+ messages in thread
From: Linus Torvalds @ 2017-08-28  5:17 UTC (permalink / raw)
  To: Nicholas Piggin
  Cc: Tim Chen, Mel Gorman, Peter Zijlstra, Ingo Molnar, Andi Kleen,
	Kan Liang, Andrew Morton, Johannes Weiner, Jan Kara,
	Christopher Lameter, Eric W . Biederman, Davidlohr Bueso,
	linux-mm, Linux Kernel Mailing List

On Sun, Aug 27, 2017 at 6:29 PM, Nicholas Piggin <npiggin@gmail.com> wrote:
>
> BTW. since you are looking at this stuff, one other small problem I remember
> with exclusive waiters is that losing to a concurrent locker puts them to
> the back of the queue. I think that could be fixed with some small change to
> the wait loops (first add to tail, then retries add to head). Thoughts?

No, not that way.

First off, it's oddly complicated, but more importantly, the real
unfairness you lose to is not other things on the wait queue, but to
other lockers that aren't on the wait-queue at all, but instead just
come in and do a "test-and-set" without ever even going through the
slow path.

So instead of playing queuing games, you'd need to just change the
unlock sequence. Right now we basically do:

 - clear lock bit and atomically test if contended (and we play games
with bit numbering to do that atomic test efficiently)

 - if contended, wake things up

and you'd change the logic to be

 - if contended, don't clear the lock bit at all, just transfer the
lock ownership directly to the waiters by walking the wait list

 - clear the lock bit only once there are no more wait entries (either
because there were no waiters at all, or because all the entries were
just waiting for the lock to be released)

which is certainly doable with a couple of small extensions to the
page wait key data structure.
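
(In very hand-wavy pseudo-code, just to pin the idea down - the
first_waiter_wants_lock(), transfer_page_lock_to_waiter() and
wake_up_nonexclusive_waiters() helpers are all made up:

    void unlock_page(struct page *page)
    {
            wait_queue_head_t *q = page_waitqueue(page);

            if (!PageWaiters(page)) {
                    /* uncontended: just clear the bit like today */
                    clear_bit_unlock(PG_locked, &page->flags);
                    return;
            }

            spin_lock_irq(&q->lock);
            if (first_waiter_wants_lock(q, page)) {
                    /*
                     * Don't clear PG_locked at all: hand the lock
                     * straight to that waiter, so a test-and-set
                     * locker can't barge in ahead of it.
                     */
                    transfer_page_lock_to_waiter(q, page);
            } else {
                    /* only non-locking waiters: clear the bit and wake them */
                    clear_bit_unlock(PG_locked, &page->flags);
                    wake_up_nonexclusive_waiters(q, page);
            }
            spin_unlock_irq(&q->lock);
    }

with the wait key extended so the woken waiter can tell it already owns
the lock.)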

But most of my clever schemes the last few days were abject failures,
and honestly, it's late in the rc.

In fact, this late in the game I probably wouldn't even have committed
the small cleanups I did if it wasn't for the fact that thinking of
the whole WQ_FLAG_EXCLUSIVE bit made me find the bug.

So the cleanups were actually what got me to look at the problem in
the first place, and then I went "I'm going to commit the cleanup, and
then I can think about the bug I just found".

I'm just happy that the fix seems to be trivial. I was afraid I'd have
to do something nastier (like have the EINTR case send another
explicit wakeup to make up for the lost one, or some ugly hack like
that).

It was only when I started looking at the history of that code, and I
saw the old bit_lock code, and I went "Hmm. That has the _same_ bug -
oh wait, no it doesn't!" that I realized that there was that simple
fix.

You weren't cc'd on the earlier part of the discussion, you only got
added when I realized what the history and simple fix was.

               Linus

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [PATCH 2/2 v2] sched/wait: Introduce lock breaker in wake_up_page_bit
  2017-08-28  5:17                           ` Linus Torvalds
@ 2017-08-28  7:18                             ` Nicholas Piggin
  -1 siblings, 0 replies; 57+ messages in thread
From: Nicholas Piggin @ 2017-08-28  7:18 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Tim Chen, Mel Gorman, Peter Zijlstra, Ingo Molnar, Andi Kleen,
	Kan Liang, Andrew Morton, Johannes Weiner, Jan Kara,
	Christopher Lameter, Eric W . Biederman, Davidlohr Bueso,
	linux-mm, Linux Kernel Mailing List

On Sun, 27 Aug 2017 22:17:55 -0700
Linus Torvalds <torvalds@linux-foundation.org> wrote:

> On Sun, Aug 27, 2017 at 6:29 PM, Nicholas Piggin <npiggin@gmail.com> wrote:
> >
> > BTW. since you are looking at this stuff, one other small problem I remember
> > with exclusive waiters is that losing to a concurrent locker puts them to
> > the back of the queue. I think that could be fixed with some small change to
> > the wait loops (first add to tail, then retries add to head). Thoughts?  
> 
> No, not that way.
> 
> First off, it's oddly complicated, but more importantly, the real
> unfairness you lose to is not other things on the wait queue, but to
> other lockers that aren't on the wait-queue at all, but instead just
> come in and do a "test-and-set" without ever even going through the
> slow path.

Right, there is that unfairness *as well*. The requeue-to-tail logic
seems to make that worse and I thought it seemed like a simple way
to improve it.

> 
> So instead of playing queuing games, you'd need to just change the
> unlock sequence. Right now we basically do:
> 
>  - clear lock bit and atomically test if contended (and we play games
> with bit numbering to do that atomic test efficiently)
> 
>  - if contended, wake things up
> 
> and you'd change the logic to be
> 
>  - if contended, don't clear the lock bit at all, just transfer the
> lock ownership directly to the waiters by walking the wait list
> 
>  - clear the lock bit only once there are no more wait entries (either
> because there were no waiters at all, or because all the entries were
> just waiting for the lock to be released)
> 
> which is certainly doable with a couple of small extensions to the
> page wait key data structure.

Yeah, that would be ideal. Conceptually trivial; I guess care has to
be taken to transfer the right memory ordering along with the lock
ownership. Could be a good concept to apply elsewhere too.

> 
> But most of my clever schemes the last few days were abject failures,
> and honestly, it's late in the rc.
> 
> In fact, this late in the game I probably wouldn't even have committed
> the small cleanups I did if it wasn't for the fact that thinking of
> the whole WQ_FLAG_EXCLUSIVE bit made me find the bug.
> 
> So the cleanups were actually what got me to look at the problem in
> the first place, and then I went "I'm going to commit the cleanup, and
> then I can think about the bug I just found".
> 
> I'm just happy that the fix seems to be trivial. I was afraid I'd have
> to do something nastier (like have the EINTR case send another
> explicit wakeup to make up for the lost one, or some ugly hack like
> that).
> 
> It was only when I started looking at the history of that code, and I
> saw the old bit_lock code, and I went "Hmm. That has the _same_ bug -
> oh wait, no it doesn't!" that I realized that there was that simple
> fix.
> 
> You weren't cc'd on the earlier part of the discussion, you only got
> added when I realized what the history and simple fix was.

You're right, no such improvement would be appropriate for 4.14.

Thanks,
Nick

^ permalink raw reply	[flat|nested] 57+ messages in thread

* RE: [PATCH 2/2 v2] sched/wait: Introduce lock breaker in wake_up_page_bit
  2017-08-26 18:15               ` Linus Torvalds
  2017-08-27 21:40                   ` Linus Torvalds
@ 2017-08-28 14:51                 ` Liang, Kan
  2017-08-28 16:48                   ` Linus Torvalds
  1 sibling, 1 reply; 57+ messages in thread
From: Liang, Kan @ 2017-08-28 14:51 UTC (permalink / raw)
  To: Linus Torvalds, Tim Chen
  Cc: Mel Gorman, Peter Zijlstra, Ingo Molnar, Andi Kleen,
	Andrew Morton, Johannes Weiner, Jan Kara, Christopher Lameter,
	Eric W . Biederman, Davidlohr Bueso, linux-mm,
	Linux Kernel Mailing List



> On Fri, Aug 25, 2017 at 7:54 PM, Linus Torvalds <torvalds@linux-
> foundation.org> wrote:
> >
> > Simplify, simplify, simplify.
> 
> I've now tried three different approaches (I stopped sending broken patches
> after deciding the first one was unfixable), and they all suck.
> 
> I was hoping for something lockless to avoid the whole issue with latency
> over the long list, but anything that has the wait queue entry allocated on the
> stack needs to synchronize the wakeup due to the stack usage, so once you
> have lists that are thousands of entries, either you hold the lock for long
> times (normal wait queues) or take and release them constantly (the swait
> approach), or you batch things up (Tim's wait-queue patches).
> 
> Of those three approaches, the batching does seem to be the best of the lot.
> Allocating non-stack wait entries while waiting for pages seems like a bad
> idea. We're probably waiting for IO in normal circumstances, possibly
> because we're low on memory.
> 
> So I *am* ok with the batching (your patch #1), but I'm *not* ok with the
> nasty horrible book-keeping to avoid new entries when you batch and
> release the lock and that allows new entries on the list (your patch #2).
> 
> That said, I have now stared *way* too much at the page wait code after
> having unsuccessfully tried to replace that wait-queue with other "clever"
> approaches (all of which ended up being crap).
> 
> And I'm starting to see a possible solution, or at least improvement.
> 
> Let's just assume I take the batching patch. It's not conceptually ugly, it's
> fairly simple, and there are lots of independent arguments for it (ie latency,
> but also possible performance from better parallelism).
> 
> That patch doesn't make any data structures bigger or more complicated, it's
> just that single "is this a bookmark entry" thing.
> So that patch just looks fundamentally fine to me, and the only real
> argument I ever had against it was that I would really really _really_ have
> wanted to root-cause the behavior.
> 
> So that leaves my fundamental dislike of your other patch.
> 
> And it turns out that all my looking at the page wait code wasn't entirely
> unproductive. Yes, I went through three crap approaches before I gave up on
> rewriting it, but in the meantime I did get way too intimate with looking at
> that pile of crud.
> 
> And I think I found the reason why you needed that patch #2 in the first place.
> 
> Here's what is going on:
> 
>  - we're going to assume that the problem is all with a single page, not due to
> hash collisions (as per your earlier reports)
> 
>  - we also know that the only bit that matters is the PG_locked bit
> 
>  - that means that the only way to get that "multiple concurrent waker"
> situation that your patch #2 tries to handle better is because you have
> multiple people unlocking the page - since that is what causes the wakeups
> 
>  - that in turn means that you obviously had multiple threads *locking* it too.
> 
>  - and that means that among those new entries there are lockers coming in
> in the middle and adding an exclusive entry.
> 
>  - the exclusive entry already stops the wakeup list walking
> 
>  - but we add non-exclusive entries TO THE BEGINNING of the page waiters
> list
> 
> And I really think that last thing is fundamentally wrong.
> 
> It's wrong for several reasons:
> 
>  - because it's unfair: threads that want to lock get put behind threads that
> just want to see the unlocked state.
> 
>  - because it's stupid: our non-locking waiters will end up waiting again if the
> page got locked, so waking up a locker *and* a lot of non-locking waiters will
> just cause them to go back to sleep anyway
> 
>  - because it causes us to walk longer lists: we stop walking when we wake up
> the exclusive waiter, but because we always put the non-exclusive waiters in
> there first, we always walk the long list of non-exclusive waiters even if we
> could just stop walking because we woke up an exclusive one.
> 
> Now the reason we do this seems to be entirely random: for a *normal* wait
> queue, you really want to always wake up all the non-exclusive waiters,
> because exclusive waiters are only exclusive wrt each other.
> 
> But for a page lock, an exclusive waiter really is getting the lock, and the
> other waiters are going to wait for the lock to clear anyway, so the page wait
> thing is actually almost exactly the reverse situation. We *could* put
> exclusive waiters at the beginning of the list instead, and actively try to avoid
> walking the list at all if we have pending lockers.
> 
> I'm not doing that, because I think the fair thing to do is to just do things in
> the order they came in. Plus the code is actually simpler if we just always add
> to the tail.
> 
> Now, the other thing to look at is how the wakeup function works. It checks
> the aliasing information (page and bit number), but then it
> *also* does:
> 
>         if (test_bit(key->bit_nr, &key->page->flags))
>                 return 0;
> 
> basically saying "continue walking if somebody else already got the bit".
> 
> That's *INSANE*. It's exactly the wrong thing to do. It's basically saying "even
> if we had an exclusive waiter, let's not wake it up, but do let us continue to
> walk the list even though the bit we're waiting for to clear is set already".
> 
> What would be sane is to say "stop walking the list now".. So just do that - by
> making "negative return from wake function" mean "stop walking".
> 
> So how about just this fairly trivial patch?

I tried this patch and https://lkml.org/lkml/2017/8/27/222 together,
but they don't fix the issue. I can still get a similar call stack.

Thanks,
Kan

> 
> In fact, I think this may help even *without* Tim's patch #1. So I think this
> would be lovely to test on that problem load on its own, and seeing if it
> makes the wait queues behave better.
> 
> It might not cut down on the total length of the wait-queue, but it should
> hopefully cause it to walk it much less. We now hit the exclusive waiters
> earlier and stop waking things up when we have a new locker thread pending.
> And when the page ends up being locked again, we stop walking the list
> entirely, so no unnecessary traversal.
> 
> This patch is small and at least minimally works (I'm running it right now).
> 
>                                Linus

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [PATCH 2/2 v2] sched/wait: Introduce lock breaker in wake_up_page_bit
  2017-08-28 14:51                 ` Liang, Kan
@ 2017-08-28 16:48                   ` Linus Torvalds
  2017-08-28 20:01                       ` Tim Chen
                                       ` (2 more replies)
  0 siblings, 3 replies; 57+ messages in thread
From: Linus Torvalds @ 2017-08-28 16:48 UTC (permalink / raw)
  To: Liang, Kan
  Cc: Tim Chen, Mel Gorman, Peter Zijlstra, Ingo Molnar, Andi Kleen,
	Andrew Morton, Johannes Weiner, Jan Kara, Christopher Lameter,
	Eric W . Biederman, Davidlohr Bueso, linux-mm,
	Linux Kernel Mailing List

[-- Attachment #1: Type: text/plain, Size: 2340 bytes --]

On Mon, Aug 28, 2017 at 7:51 AM, Liang, Kan <kan.liang@intel.com> wrote:
>
> I tried this patch and https://lkml.org/lkml/2017/8/27/222 together.
> But they don't fix the issue. I can still get the similar call stack.

So the main issue was that I *really* hated Tim's patch #2, and the
patch to clean up the page wait queue should now make his patch series
much more palatable.

Attached is an ALMOST COMPLETELY UNTESTED forward-port of those two
patches, now without that nasty WQ_FLAG_ARRIVALS logic, because we now
always put the new entries at the end of the waitqueue.

The attached patches just apply directly on top of plain 4.13-rc7.

That makes patch #2 much more palatable, since it now doesn't need to
play games and worry about new arrivals.

But note the lack of testing. I've actually booted this and am running
these two patches right now, but honestly, you should consider them
"untested" simply because I can't trigger the page waiters contention
case to begin with.

But it's really just Tim's patches, modified for the page waitqueue
cleanup which makes patch #2 become much simpler, and now it's
palatable: it's just using the same bookmark thing that the normal
wakeup uses, no extra hacks.

So Tim should look these over, and they should definitely be tested on
that load-from-hell that you guys have, but if this set works, at
least I'm ok with it now.

Tim - did I miss anything? I added a "cpu_relax()" in there between
releasing the lock and irq and re-taking them; I'm not convinced it makes
any difference, but I wanted to mark that "take a breather" thing.

Oh, there's one more case I only realized after the patches: the
stupid add_page_wait_queue() code still adds to the head of the list.
So technically you need this too:

    diff --git a/mm/filemap.c b/mm/filemap.c
    index 74123a298f53..598c3be57509 100644
    --- a/mm/filemap.c
    +++ b/mm/filemap.c
    @@ -1061,7 +1061,7 @@ void add_page_wait_queue(struct page *page,
wait_queue_entry_t *waiter)
        unsigned long flags;

        spin_lock_irqsave(&q->lock, flags);
    -   __add_wait_queue(q, waiter);
    +   __add_wait_queue_entry_tail(q, waiter);
        SetPageWaiters(page);
        spin_unlock_irqrestore(&q->lock, flags);
     }

but that only matters if you actually use the cachefiles thing, which
I hope/assume you don't.

       Linus

[-- Attachment #2: 0001-sched-wait-Break-up-long-wake-list-walk.patch --]
[-- Type: text/x-patch, Size: 7004 bytes --]

From 59e4341e041d7aa1f9339a03f876eee566768c84 Mon Sep 17 00:00:00 2001
From: Tim Chen <tim.c.chen@linux.intel.com>
Date: Fri, 25 Aug 2017 09:13:54 -0700
Subject: [PATCH 1/2] sched/wait: Break up long wake list walk

We encountered workloads that have very long wake up lists on large
systems. A waker takes a long time to traverse the entire wake list and
execute all the wake functions.

We saw page wait lists that are up to 3700+ entries long in tests of
large 4 and 8 socket systems. It took 0.8 sec to traverse such a list
during wake up. Any other CPU that contends for the list spin lock will
spin for a long time. It is a result of the numa balancing migration of
hot pages that are shared by many threads.

Multiple CPUs waking are queued up behind the lock, and the last one
queued has to wait until all CPUs did all the wakeups.

The page wait list is traversed with interrupt disabled, which caused
various problems. This was the original cause that triggered the NMI
watch dog timer in: https://patchwork.kernel.org/patch/9800303/ . Only
extending the NMI watch dog timer there helped.

This patch bookmarks the waker's scan position in the wake list and breaks
up the wake up walk, to allow access to the list before the waker resumes
its walk down the rest of the wait list. It lowers the interrupt and
rescheduling latency.

This patch also provides a performance boost when combined with the next
patch to break up the page wakeup list walk. We saw a 22% improvement in the
will-it-scale file pread2 test on a Xeon Phi system running 256 threads.

[ v2: Merged in Linus' changes to remove the bookmark_wake_function, and
  simplify access to flags. ]

Reported-by: Kan Liang <kan.liang@intel.com>
Tested-by: Kan Liang <kan.liang@intel.com>
Signed-off-by: Tim Chen <tim.c.chen@linux.intel.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/wait.h |  1 +
 kernel/sched/wait.c  | 78 ++++++++++++++++++++++++++++++++++++++++++----------
 2 files changed, 64 insertions(+), 15 deletions(-)

diff --git a/include/linux/wait.h b/include/linux/wait.h
index dc19880c02f5..78401ef02d29 100644
--- a/include/linux/wait.h
+++ b/include/linux/wait.h
@@ -18,6 +18,7 @@ int default_wake_function(struct wait_queue_entry *wq_entry, unsigned mode, int
 /* wait_queue_entry::flags */
 #define WQ_FLAG_EXCLUSIVE	0x01
 #define WQ_FLAG_WOKEN		0x02
+#define WQ_FLAG_BOOKMARK	0x04
 
 /*
  * A single wait-queue entry structure:
diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c
index d6afed6d0752..70701ef50465 100644
--- a/kernel/sched/wait.c
+++ b/kernel/sched/wait.c
@@ -53,6 +53,12 @@ void remove_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry
 }
 EXPORT_SYMBOL(remove_wait_queue);
 
+/*
+ * Scan threshold to break wait queue walk.
+ * This allows a waker to take a break from holding the
+ * wait queue lock during the wait queue walk.
+ */
+#define WAITQUEUE_WALK_BREAK_CNT 64
 
 /*
  * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just
@@ -63,18 +69,67 @@ EXPORT_SYMBOL(remove_wait_queue);
  * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns
  * zero in this (rare) case, and we handle it by continuing to scan the queue.
  */
-static void __wake_up_common(struct wait_queue_head *wq_head, unsigned int mode,
-			int nr_exclusive, int wake_flags, void *key)
+static int __wake_up_common(struct wait_queue_head *wq_head, unsigned int mode,
+			int nr_exclusive, int wake_flags, void *key,
+			wait_queue_entry_t *bookmark)
 {
 	wait_queue_entry_t *curr, *next;
+	int cnt = 0;
+
+	if (bookmark && (bookmark->flags & WQ_FLAG_BOOKMARK)) {
+		curr = list_next_entry(bookmark, entry);
 
-	list_for_each_entry_safe(curr, next, &wq_head->head, entry) {
+		list_del(&bookmark->entry);
+		bookmark->flags = 0;
+	} else
+		curr = list_first_entry(&wq_head->head, wait_queue_entry_t, entry);
+
+	if (&curr->entry == &wq_head->head)
+		return nr_exclusive;
+
+	list_for_each_entry_safe_from(curr, next, &wq_head->head, entry) {
 		unsigned flags = curr->flags;
-		int ret = curr->func(curr, mode, wake_flags, key);
+		int ret;
+
+		if (flags & WQ_FLAG_BOOKMARK)
+			continue;
+
+		ret = curr->func(curr, mode, wake_flags, key);
 		if (ret < 0)
 			break;
 		if (ret && (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
 			break;
+
+		if (bookmark && (++cnt > WAITQUEUE_WALK_BREAK_CNT) &&
+				(&next->entry != &wq_head->head)) {
+			bookmark->flags = WQ_FLAG_BOOKMARK;
+			list_add_tail(&bookmark->entry, &next->entry);
+			break;
+		}
+	}
+	return nr_exclusive;
+}
+
+static void __wake_up_common_lock(struct wait_queue_head *wq_head, unsigned int mode,
+			int nr_exclusive, int wake_flags, void *key)
+{
+	unsigned long flags;
+	wait_queue_entry_t bookmark;
+
+	bookmark.flags = 0;
+	bookmark.private = NULL;
+	bookmark.func = NULL;
+	INIT_LIST_HEAD(&bookmark.entry);
+
+	spin_lock_irqsave(&wq_head->lock, flags);
+	nr_exclusive = __wake_up_common(wq_head, mode, nr_exclusive, wake_flags, key, &bookmark);
+	spin_unlock_irqrestore(&wq_head->lock, flags);
+
+	while (bookmark.flags & WQ_FLAG_BOOKMARK) {
+		spin_lock_irqsave(&wq_head->lock, flags);
+		nr_exclusive = __wake_up_common(wq_head, mode, nr_exclusive,
+						wake_flags, key, &bookmark);
+		spin_unlock_irqrestore(&wq_head->lock, flags);
 	}
 }
 
@@ -91,11 +146,7 @@ static void __wake_up_common(struct wait_queue_head *wq_head, unsigned int mode,
 void __wake_up(struct wait_queue_head *wq_head, unsigned int mode,
 			int nr_exclusive, void *key)
 {
-	unsigned long flags;
-
-	spin_lock_irqsave(&wq_head->lock, flags);
-	__wake_up_common(wq_head, mode, nr_exclusive, 0, key);
-	spin_unlock_irqrestore(&wq_head->lock, flags);
+	__wake_up_common_lock(wq_head, mode, nr_exclusive, 0, key);
 }
 EXPORT_SYMBOL(__wake_up);
 
@@ -104,13 +155,13 @@ EXPORT_SYMBOL(__wake_up);
  */
 void __wake_up_locked(struct wait_queue_head *wq_head, unsigned int mode, int nr)
 {
-	__wake_up_common(wq_head, mode, nr, 0, NULL);
+	__wake_up_common(wq_head, mode, nr, 0, NULL, NULL);
 }
 EXPORT_SYMBOL_GPL(__wake_up_locked);
 
 void __wake_up_locked_key(struct wait_queue_head *wq_head, unsigned int mode, void *key)
 {
-	__wake_up_common(wq_head, mode, 1, 0, key);
+	__wake_up_common(wq_head, mode, 1, 0, key, NULL);
 }
 EXPORT_SYMBOL_GPL(__wake_up_locked_key);
 
@@ -134,7 +185,6 @@ EXPORT_SYMBOL_GPL(__wake_up_locked_key);
 void __wake_up_sync_key(struct wait_queue_head *wq_head, unsigned int mode,
 			int nr_exclusive, void *key)
 {
-	unsigned long flags;
 	int wake_flags = 1; /* XXX WF_SYNC */
 
 	if (unlikely(!wq_head))
@@ -143,9 +193,7 @@ void __wake_up_sync_key(struct wait_queue_head *wq_head, unsigned int mode,
 	if (unlikely(nr_exclusive != 1))
 		wake_flags = 0;
 
-	spin_lock_irqsave(&wq_head->lock, flags);
-	__wake_up_common(wq_head, mode, nr_exclusive, wake_flags, key);
-	spin_unlock_irqrestore(&wq_head->lock, flags);
+	__wake_up_common_lock(wq_head, mode, nr_exclusive, wake_flags, key);
 }
 EXPORT_SYMBOL_GPL(__wake_up_sync_key);
 
-- 
2.14.0.rc1.2.g4c8247ec3


[-- Attachment #3: 0002-sched-wait-Introduce-wakeup-boomark-in-wake_up_page_.patch --]
[-- Type: text/x-patch, Size: 3759 bytes --]

From 6a519e86f2042edcf878463ed19e37dfd774f28b Mon Sep 17 00:00:00 2001
From: Tim Chen <tim.c.chen@linux.intel.com>
Date: Fri, 25 Aug 2017 09:13:55 -0700
Subject: [PATCH 2/2] sched/wait: Introduce wakeup bookmark in wake_up_page_bit

Now that we have added breaks in the wait queue scan and allow bookmarking
of the scan position, we put this logic in the wake_up_page_bit function.

We can have very long page wait lists on large systems, where multiple
pages share the same wait list. We break up the wake up walk here to give
other CPUs a chance to access the list, and to avoid disabling interrupts
for too long while traversing the list.  This reduces the interrupt and
rescheduling latency, and excessive page wait queue lock hold time.

[ v2: Remove bookmark_wake_function ]

Signed-off-by: Tim Chen <tim.c.chen@linux.intel.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/wait.h |  2 ++
 kernel/sched/wait.c  |  7 +++++++
 mm/filemap.c         | 22 +++++++++++++++++++++-
 3 files changed, 30 insertions(+), 1 deletion(-)

diff --git a/include/linux/wait.h b/include/linux/wait.h
index 78401ef02d29..87c4641023fb 100644
--- a/include/linux/wait.h
+++ b/include/linux/wait.h
@@ -185,6 +185,8 @@ __remove_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq
 
 void __wake_up(struct wait_queue_head *wq_head, unsigned int mode, int nr, void *key);
 void __wake_up_locked_key(struct wait_queue_head *wq_head, unsigned int mode, void *key);
+void __wake_up_locked_key_bookmark(struct wait_queue_head *wq_head,
+		unsigned int mode, void *key, wait_queue_entry_t *bookmark);
 void __wake_up_sync_key(struct wait_queue_head *wq_head, unsigned int mode, int nr, void *key);
 void __wake_up_locked(struct wait_queue_head *wq_head, unsigned int mode, int nr);
 void __wake_up_sync(struct wait_queue_head *wq_head, unsigned int mode, int nr);
diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c
index 70701ef50465..98feab7933c7 100644
--- a/kernel/sched/wait.c
+++ b/kernel/sched/wait.c
@@ -165,6 +165,13 @@ void __wake_up_locked_key(struct wait_queue_head *wq_head, unsigned int mode, vo
 }
 EXPORT_SYMBOL_GPL(__wake_up_locked_key);
 
+void __wake_up_locked_key_bookmark(struct wait_queue_head *wq_head,
+		unsigned int mode, void *key, wait_queue_entry_t *bookmark)
+{
+	__wake_up_common(wq_head, mode, 1, 0, key, bookmark);
+}
+EXPORT_SYMBOL_GPL(__wake_up_locked_key_bookmark);
+
 /**
  * __wake_up_sync_key - wake up threads blocked on a waitqueue.
  * @wq_head: the waitqueue
diff --git a/mm/filemap.c b/mm/filemap.c
index 0b41c8cbeabc..74123a298f53 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -923,13 +923,33 @@ static void wake_up_page_bit(struct page *page, int bit_nr)
 	wait_queue_head_t *q = page_waitqueue(page);
 	struct wait_page_key key;
 	unsigned long flags;
+	wait_queue_entry_t bookmark;
 
 	key.page = page;
 	key.bit_nr = bit_nr;
 	key.page_match = 0;
 
+	bookmark.flags = 0;
+	bookmark.private = NULL;
+	bookmark.func = NULL;
+	INIT_LIST_HEAD(&bookmark.entry);
+
 	spin_lock_irqsave(&q->lock, flags);
-	__wake_up_locked_key(q, TASK_NORMAL, &key);
+	__wake_up_locked_key_bookmark(q, TASK_NORMAL, &key, &bookmark);
+
+	while (bookmark.flags & WQ_FLAG_BOOKMARK) {
+		/*
+		 * Take a breather from holding the lock,
+		 * allow pages that finish wake up asynchronously
+		 * to acquire the lock and remove themselves
+		 * from wait queue
+		 */
+		spin_unlock_irqrestore(&q->lock, flags);
+		cpu_relax();
+		spin_lock_irqsave(&q->lock, flags);
+		__wake_up_locked_key_bookmark(q, TASK_NORMAL, &key, &bookmark);
+	}
+
 	/*
 	 * It is possible for other pages to have collided on the waitqueue
 	 * hash, so in that case check for a page match. That prevents a long-
-- 
2.14.0.rc1.2.g4c8247ec3


^ permalink raw reply related	[flat|nested] 57+ messages in thread

* Re: [PATCH 2/2 v2] sched/wait: Introduce lock breaker in wake_up_page_bit
  2017-08-28 16:48                   ` Linus Torvalds
@ 2017-08-28 20:01                       ` Tim Chen
  2017-08-29 12:57                     ` Liang, Kan
  2017-08-29 16:17                       ` Tim Chen
  2 siblings, 0 replies; 57+ messages in thread
From: Tim Chen @ 2017-08-28 20:01 UTC (permalink / raw)
  To: Linus Torvalds, Liang, Kan
  Cc: Mel Gorman, Peter Zijlstra, Ingo Molnar, Andi Kleen,
	Andrew Morton, Johannes Weiner, Jan Kara, Christopher Lameter,
	Eric W . Biederman, Davidlohr Bueso, linux-mm,
	Linux Kernel Mailing List

On 08/28/2017 09:48 AM, Linus Torvalds wrote:
> On Mon, Aug 28, 2017 at 7:51 AM, Liang, Kan <kan.liang@intel.com> wrote:
>>
>> I tried this patch and https://lkml.org/lkml/2017/8/27/222 together.
>> But they don't fix the issue. I can still get the similar call stack.
> 
> So the main issue was that I *really* hated Tim's patch #2, and the
> patch to clean up the page wait queue should now make his patch series
> much more palatable.
> 
> Attached is an ALMOST COMPLETELY UNTESTED forward-port of those two
> patches, now without that nasty WQ_FLAG_ARRIVALS logic, because we now
> always put the new entries at the end of the waitqueue.
> 
> The attached patches just apply directly on top of plain 4.13-rc7.
> 
> That makes patch #2 much more palatable, since it now doesn't need to
> play games and worry about new arrivals.
> 
> But note the lack of testing. I've actually booted this and am running
> these two patches right now, but honestly, you should consider them
> "untested" simply because I can't trigger the page waiters contention
> case to begin with.
> 
> But it's really just Tim's patches, modified for the page waitqueue
> cleanup which makes patch #2 become much simpler, and now it's
> palatable: it's just using the same bookmark thing that the normal
> wakeup uses, no extra hacks.
> 
> So Tim should look these over, and they should definitely be tested on
> that load-from-hell that you guys have, but if this set works, at
> least I'm ok with it now.
> 
> Tim - did I miss anything? I added a "cpu_relax()" in there between
> the release lock and irq and re-take it, I'm not convinced it makes
> any difference, but I wanted to mark that "take a breather" thing.
> 
> Oh, there's one more case I only realized after the patches: the
> stupid add_page_wait_queue() code still adds to the head of the list.
> So technically you need this too:
> 
>     diff --git a/mm/filemap.c b/mm/filemap.c
>     index 74123a298f53..598c3be57509 100644
>     --- a/mm/filemap.c
>     +++ b/mm/filemap.c
>     @@ -1061,7 +1061,7 @@ void add_page_wait_queue(struct page *page,
> wait_queue_entry_t *waiter)
>         unsigned long flags;
> 
>         spin_lock_irqsave(&q->lock, flags);
>     -   __add_wait_queue(q, waiter);
>     +   __add_wait_queue_entry_tail(q, waiter);

I've also found it odd that this part of the code adds to the head, but
I wasn't sure enough about the history behind it to change it.

Adding to the tail makes things much cleaner.  I'm glad that
those ugly hacks that added flags and a counter
to track new arrivals can be discarded.

The modified patchset looks fine to me.  So pending Kan's test
on the new code I think we are good.

Thanks.

Tim

>         SetPageWaiters(page);
>         spin_unlock_irqrestore(&q->lock, flags);
>      }
> 
> but that only matters if you actually use the cachefiles thing, which
> I hope/assume you don't.
> 
>        Linus
> 

^ permalink raw reply	[flat|nested] 57+ messages in thread

* RE: [PATCH 2/2 v2] sched/wait: Introduce lock breaker in wake_up_page_bit
  2017-08-28 16:48                   ` Linus Torvalds
  2017-08-28 20:01                       ` Tim Chen
@ 2017-08-29 12:57                     ` Liang, Kan
  2017-08-29 16:01                         ` Linus Torvalds
  2017-08-29 16:17                       ` Tim Chen
  2 siblings, 1 reply; 57+ messages in thread
From: Liang, Kan @ 2017-08-29 12:57 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Tim Chen, Mel Gorman, Peter Zijlstra, Ingo Molnar, Andi Kleen,
	Andrew Morton, Johannes Weiner, Jan Kara, Christopher Lameter,
	Eric W . Biederman, Davidlohr Bueso, linux-mm,
	Linux Kernel Mailing List

> On Mon, Aug 28, 2017 at 7:51 AM, Liang, Kan <kan.liang@intel.com> wrote:
> >
> > I tried this patch and https://lkml.org/lkml/2017/8/27/222 together.
> > But they don't fix the issue. I can still get the similar call stack.
> 
> So the main issue was that I *really* hated Tim's patch #2, and the patch to
> clean up the page wait queue should now make his patch series much more
> palatable.
> 
> Attached is an ALMOST COMPLETELY UNTESTED forward-port of those two
> patches, now without that nasty WQ_FLAG_ARRIVALS logic, because we now
> always put the new entries at the end of the waitqueue.
> 

The patches fix the long wait issue.

Tested-by: Kan Liang <kan.liang@intel.com>

> The attached patches just apply directly on top of plain 4.13-rc7.
> 
> That makes patch #2 much more palatable, since it now doesn't need to play
> games and worry about new arrivals.
> 
> But note the lack of testing. I've actually booted this and am running these
> two patches right now, but honestly, you should consider them "untested"
> simply because I can't trigger the page waiters contention case to begin with.
> 
> But it's really just Tim's patches, modified for the page waitqueue cleanup
> which makes patch #2 become much simpler, and now it's
> palatable: it's just using the same bookmark thing that the normal wakeup
> uses, no extra hacks.
> 
> So Tim should look these over, and they should definitely be tested on that
> load-from-hell that you guys have, but if this set works, at least I'm ok with it
> now.
> 
> Tim - did I miss anything? I added a "cpu_relax()" in there between the
> release lock and irq and re-take it, I'm not convinced it makes any difference,
> but I wanted to mark that "take a breather" thing.
> 
> Oh, there's one more case I only realized after the patches: the stupid
> add_page_wait_queue() code still adds to the head of the list.
> So technically you need this too:
> 
>     diff --git a/mm/filemap.c b/mm/filemap.c
>     index 74123a298f53..598c3be57509 100644
>     --- a/mm/filemap.c
>     +++ b/mm/filemap.c
>     @@ -1061,7 +1061,7 @@ void add_page_wait_queue(struct page *page,
> wait_queue_entry_t *waiter)
>         unsigned long flags;
> 
>         spin_lock_irqsave(&q->lock, flags);
>     -   __add_wait_queue(q, waiter);
>     +   __add_wait_queue_entry_tail(q, waiter);
>         SetPageWaiters(page);
>         spin_unlock_irqrestore(&q->lock, flags);
>      }
> 
> but that only matters if you actually use the cachefiles thing, which I
> hope/assume you don't.
> 
>        Linus

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [PATCH 2/2 v2] sched/wait: Introduce lock breaker in wake_up_page_bit
  2017-08-29 12:57                     ` Liang, Kan
@ 2017-08-29 16:01                         ` Linus Torvalds
  0 siblings, 0 replies; 57+ messages in thread
From: Linus Torvalds @ 2017-08-29 16:01 UTC (permalink / raw)
  To: Liang, Kan
  Cc: Tim Chen, Mel Gorman, Peter Zijlstra, Ingo Molnar, Andi Kleen,
	Andrew Morton, Johannes Weiner, Jan Kara, Christopher Lameter,
	Eric W . Biederman, Davidlohr Bueso, linux-mm,
	Linux Kernel Mailing List

On Tue, Aug 29, 2017 at 5:57 AM, Liang, Kan <kan.liang@intel.com> wrote:
>>
>> Attached is an ALMOST COMPLETELY UNTESTED forward-port of those two
>> patches, now without that nasty WQ_FLAG_ARRIVALS logic, because we now
>> always put the new entries at the end of the waitqueue.
>
> The patches fix the long wait issue.
>
> Tested-by: Kan Liang <kan.liang@intel.com>

Ok. I'm not 100% comfortable applying them at rc7, so let me think
about it. There's only one known load triggering this, and by "known"
I mean "not really known" since we don't even know what the heck it
does outside of intel and whoever your customer is.

So I suspect I'll apply the patches next merge window, and we can
maybe mark them for stable if this actually ends up mattering.

Can you tell if the problem is actually hitting _production_ use or
was some kind of benchmark stress-test?

               Linus

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [PATCH 2/2 v2] sched/wait: Introduce lock breaker in wake_up_page_bit
  2017-08-29 16:01                         ` Linus Torvalds
@ 2017-08-29 16:13                           ` Tim Chen
  -1 siblings, 0 replies; 57+ messages in thread
From: Tim Chen @ 2017-08-29 16:13 UTC (permalink / raw)
  To: Linus Torvalds, Liang, Kan
  Cc: Mel Gorman, Peter Zijlstra, Ingo Molnar, Andi Kleen,
	Andrew Morton, Johannes Weiner, Jan Kara, Christopher Lameter,
	Eric W . Biederman, Davidlohr Bueso, linux-mm,
	Linux Kernel Mailing List

On 08/29/2017 09:01 AM, Linus Torvalds wrote:
> On Tue, Aug 29, 2017 at 5:57 AM, Liang, Kan <kan.liang@intel.com> wrote:
>>>
>>> Attached is an ALMOST COMPLETELY UNTESTED forward-port of those two
>>> patches, now without that nasty WQ_FLAG_ARRIVALS logic, because we now
>>> always put the new entries at the end of the waitqueue.
>>
>> The patches fix the long wait issue.
>>
>> Tested-by: Kan Liang <kan.liang@intel.com>
> 
> Ok. I'm not 100% comfortable applying them at rc7, so let me think
> about it. There's only one known load triggering this, and by "known"
> I mean "not really known" since we don't even know what the heck it
> does outside of intel and whoever your customer is.
> 
> So I suspect I'll apply the patches next merge window, and we can
> maybe mark them for stable if this actually ends up mattering.
> 
> Can you tell if the problem is actually hitting _production_ use or
> was some kind of benchmark stress-test?
> 
> 

It is not affecting production use, but the customer's acceptance
test for their systems.  So I suspect it is a stress test.

Tim

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [PATCH 2/2 v2] sched/wait: Introduce lock breaker in wake_up_page_bit
  2017-08-28 16:48                   ` Linus Torvalds
@ 2017-08-29 16:17                       ` Tim Chen
  2017-08-29 12:57                     ` Liang, Kan
  2017-08-29 16:17                       ` Tim Chen
  2 siblings, 0 replies; 57+ messages in thread
From: Tim Chen @ 2017-08-29 16:17 UTC (permalink / raw)
  To: Linus Torvalds, Liang, Kan
  Cc: Mel Gorman, Peter Zijlstra, Ingo Molnar, Andi Kleen,
	Andrew Morton, Johannes Weiner, Jan Kara, Christopher Lameter,
	Eric W . Biederman, Davidlohr Bueso, linux-mm,
	Linux Kernel Mailing List

On 08/28/2017 09:48 AM, Linus Torvalds wrote:
> On Mon, Aug 28, 2017 at 7:51 AM, Liang, Kan <kan.liang@intel.com> wrote:
>>
>> I tried this patch and https://lkml.org/lkml/2017/8/27/222 together.
>> But they don't fix the issue. I can still get the similar call stack.
> 
> So the main issue was that I *really* hated Tim's patch #2, and the
> patch to clean up the page wait queue should now make his patch series
> much more palatable.
> 
> Attached is an ALMOST COMPLETELY UNTESTED forward-port of those two
> patches, now without that nasty WQ_FLAG_ARRIVALS logic, because we now
> always put the new entries at the end of the waitqueue.
> 
> The attached patches just apply directly on top of plain 4.13-rc7.
> 
> That makes patch #2 much more palatable, since it now doesn't need to
> play games and worry about new arrivals.
> 
> But note the lack of testing. I've actually booted this and am running
> these two patches right now, but honestly, you should consider them
> "untested" simply because I can't trigger the page waiters contention
> case to begin with.
> 
> But it's really just Tim's patches, modified for the page waitqueue
> cleanup which makes patch #2 become much simpler, and now it's
> palatable: it's just using the same bookmark thing that the normal
> wakeup uses, no extra hacks.
> 
> So Tim should look these over, and they should definitely be tested on
> that load-from-hell that you guys have, but if this set works, at
> least I'm ok with it now.
> 
> Tim - did I miss anything? I added a "cpu_relax()" in there between
> the release lock and irq and re-take it, I'm not convinced it makes
> any difference, but I wanted to mark that "take a breather" thing.
> 
> Oh, there's one more case I only realized after the patches: the
> stupid add_page_wait_queue() code still adds to the head of the list.
> So technically you need this too:

BTW, are you going to add the chunk below separately as part of your
wait queue cleanup patch?

Tim

> 
>     diff --git a/mm/filemap.c b/mm/filemap.c
>     index 74123a298f53..598c3be57509 100644
>     --- a/mm/filemap.c
>     +++ b/mm/filemap.c
>     @@ -1061,7 +1061,7 @@ void add_page_wait_queue(struct page *page,
> wait_queue_entry_t *waiter)
>         unsigned long flags;
> 
>         spin_lock_irqsave(&q->lock, flags);
>     -   __add_wait_queue(q, waiter);
>     +   __add_wait_queue_entry_tail(q, waiter);
>         SetPageWaiters(page);
>         spin_unlock_irqrestore(&q->lock, flags);
>      }
> 
> but that only matters if you actually use the cachefiles thing, which
> I hope/assume you don't.
> 
>        Linus
> 

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [PATCH 2/2 v2] sched/wait: Introduce lock breaker in wake_up_page_bit
  2017-08-29 16:17                       ` Tim Chen
@ 2017-08-29 16:22                         ` Linus Torvalds
  -1 siblings, 0 replies; 57+ messages in thread
From: Linus Torvalds @ 2017-08-29 16:22 UTC (permalink / raw)
  To: Tim Chen
  Cc: Liang, Kan, Mel Gorman, Peter Zijlstra, Ingo Molnar, Andi Kleen,
	Andrew Morton, Johannes Weiner, Jan Kara, Christopher Lameter,
	Eric W . Biederman, Davidlohr Bueso, linux-mm,
	Linux Kernel Mailing List

On Tue, Aug 29, 2017 at 9:17 AM, Tim Chen <tim.c.chen@linux.intel.com> wrote:
>
> BTW, are you going to add the chunk below separately as part of your
> wait queue cleanup patch?

I did.

Commit 9c3a815f471a ("page waitqueue: always add new entries at the end")

               Linus

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [PATCH 2/2 v2] sched/wait: Introduce lock breaker in wake_up_page_bit
  2017-08-29 16:13                           ` Tim Chen
@ 2017-08-29 16:24                             ` Linus Torvalds
  -1 siblings, 0 replies; 57+ messages in thread
From: Linus Torvalds @ 2017-08-29 16:24 UTC (permalink / raw)
  To: Tim Chen
  Cc: Liang, Kan, Mel Gorman, Peter Zijlstra, Ingo Molnar, Andi Kleen,
	Andrew Morton, Johannes Weiner, Jan Kara, Christopher Lameter,
	Eric W . Biederman, Davidlohr Bueso, linux-mm,
	Linux Kernel Mailing List

On Tue, Aug 29, 2017 at 9:13 AM, Tim Chen <tim.c.chen@linux.intel.com> wrote:
>
> It is affecting not a production use, but the customer's acceptance
> test for their systems.  So I suspect it is a stress test.

Can you gently poke them and ask if they might make their stress test
code available?

Tell them that we have a fix, but right now it's delayed into 4.14
because we have no visibility into what it is that it actually fixes,
and whether it's all that critical or just some microbenchmark.

Thanks,

           Linus

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [PATCH 2/2 v2] sched/wait: Introduce lock breaker in wake_up_page_bit
  2017-08-29 16:24                             ` Linus Torvalds
@ 2017-08-29 16:57                               ` Tim Chen
  -1 siblings, 0 replies; 57+ messages in thread
From: Tim Chen @ 2017-08-29 16:57 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Liang, Kan, Mel Gorman, Peter Zijlstra, Ingo Molnar, Andi Kleen,
	Andrew Morton, Johannes Weiner, Jan Kara, Christopher Lameter,
	Eric W . Biederman, Davidlohr Bueso, linux-mm,
	Linux Kernel Mailing List

On 08/29/2017 09:24 AM, Linus Torvalds wrote:
> On Tue, Aug 29, 2017 at 9:13 AM, Tim Chen <tim.c.chen@linux.intel.com> wrote:
>>
>> It is affecting not a production use, but the customer's acceptance
>> test for their systems.  So I suspect it is a stress test.
> 
> Can you gently poke them and ask if they might make theie stress test
> code available?
> 
> Tell them that we have a fix, but right now it's delayed into 4.14
> because we have no visibility into what it is that it actually fixes,
> and whether it's all that critical or just some microbenchmark.
> 

Thanks. We'll do that.

Tim

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [PATCH 2/2 v2] sched/wait: Introduce lock breaker in wake_up_page_bit
  2017-08-29 16:24                             ` Linus Torvalds
@ 2017-09-14  2:12                               ` Tim Chen
  -1 siblings, 0 replies; 57+ messages in thread
From: Tim Chen @ 2017-09-14  2:12 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Liang, Kan, Mel Gorman, Peter Zijlstra, Ingo Molnar, Andi Kleen,
	Andrew Morton, Johannes Weiner, Jan Kara, Christopher Lameter,
	Eric W . Biederman, Davidlohr Bueso, linux-mm,
	Linux Kernel Mailing List

On 08/29/2017 09:24 AM, Linus Torvalds wrote:
> On Tue, Aug 29, 2017 at 9:13 AM, Tim Chen <tim.c.chen@linux.intel.com> wrote:
>>
>> It is affecting not a production use, but the customer's acceptance
>> test for their systems.  So I suspect it is a stress test.
> 
> Can you gently poke them and ask if they might make theie stress test
> code available?
> 
> Tell them that we have a fix, but right now it's delayed into 4.14
> because we have no visibility into what it is that it actually fixes,
> and whether it's all that critical or just some microbenchmark.
> 
>

Linus,

Here's what the customer thinks happened and is willing to tell us.
They have a parent process that spawns off 10 children per core and
kicks them off to run. The child processes all access a common library.
We have 384 cores, so 3840 child processes are running.  When migration occurs
on a page in the common library, the first child that accesses the page will
page fault and lock the page, with the other children also page faulting
quickly and piling up in the page wait list, till the first child is done.

Probably some kind of access pattern of the common library induces the
page migration to happen.
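[ To make the shape of that load concrete, a purely illustrative userspace
  sketch (not the customer's test; the library path, the 10-per-core count
  and the iteration counts are made up): a parent forks many children that
  all keep reading the same page of a shared file-backed mapping, so that a
  NUMA-balancing migration of that hot page sends them all through the same
  page lock at once:

	#include <fcntl.h>
	#include <stdio.h>
	#include <sys/mman.h>
	#include <sys/wait.h>
	#include <unistd.h>

	int main(int argc, char **argv)
	{
		/* Stand-in for the "common library": any file everyone maps. */
		const char *path = argc > 1 ? argv[1] : "/lib/libc.so.6";
		long nchildren = 10 * sysconf(_SC_NPROCESSORS_ONLN);
		volatile unsigned char *p;
		int fd = open(path, O_RDONLY);
		long i;

		if (fd < 0) {
			perror("open");
			return 1;
		}
		p = mmap(NULL, 4096, PROT_READ, MAP_SHARED, fd, 0);
		if (p == MAP_FAILED) {
			perror("mmap");
			return 1;
		}

		for (i = 0; i < nchildren; i++) {
			if (fork() == 0) {
				unsigned long sum = 0;
				int pass, off;

				/* Keep the first page hot; per the report
				 * above, NUMA balancing may then migrate it
				 * and every child faults on its page lock. */
				for (pass = 0; pass < 100000; pass++)
					for (off = 0; off < 4096; off++)
						sum += p[off];
				_exit(sum & 1);
			}
		}
		while (wait(NULL) > 0)
			;
		return 0;
	}
 ]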

BTW, will you be merging these 2 patches in 4.14?

Thanks.

Tim

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [PATCH 2/2 v2] sched/wait: Introduce lock breaker in wake_up_page_bit
  2017-09-14  2:12                               ` Tim Chen
@ 2017-09-14  2:27                                 ` Linus Torvalds
  -1 siblings, 0 replies; 57+ messages in thread
From: Linus Torvalds @ 2017-09-14  2:27 UTC (permalink / raw)
  To: Tim Chen
  Cc: Liang, Kan, Mel Gorman, Peter Zijlstra, Ingo Molnar, Andi Kleen,
	Andrew Morton, Johannes Weiner, Jan Kara, Christopher Lameter,
	Eric W . Biederman, Davidlohr Bueso, linux-mm,
	Linux Kernel Mailing List

On Wed, Sep 13, 2017 at 7:12 PM, Tim Chen <tim.c.chen@linux.intel.com> wrote:
>
> BTW, will you be merging these 2 patches in 4.14?

Yes, and thanks for reminding me.

In fact, would you mind sending me the latest versions, rather than me
digging them out of the disaster area that is my mailbox and possibly
picking an older version?

                 Linus

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [PATCH 2/2 v2] sched/wait: Introduce lock breaker in wake_up_page_bit
  2017-09-14  2:12                               ` Tim Chen
  (?)
  (?)
@ 2017-09-14 16:39                               ` Christopher Lameter
  -1 siblings, 0 replies; 57+ messages in thread
From: Christopher Lameter @ 2017-09-14 16:39 UTC (permalink / raw)
  To: Tim Chen
  Cc: Linus Torvalds, Liang, Kan, Mel Gorman, Peter Zijlstra,
	Ingo Molnar, Andi Kleen, Andrew Morton, Johannes Weiner,
	Jan Kara, Eric W . Biederman, Davidlohr Bueso, linux-mm,
	Linux Kernel Mailing List

On Wed, 13 Sep 2017, Tim Chen wrote:

> Here's what the customer think happened and is willing to tell us.
> They have a parent process that spawns off 10 children per core and
> kicked them to run. The child processes all access a common library.
> We have 384 cores so 3840 child processes running.  When migration occur on
> a page in the common library, the first child that access the page will
> page fault and lock the page, with the other children also page faulting
> quickly and pile up in the page wait list, till the first child is done.

I think we need some way to avoid migration in cases like this. This is
crazy. Page migration was not written to deal with something like this.

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [PATCH 2/2 v2] sched/wait: Introduce lock breaker in wake_up_page_bit
  2017-09-14  2:27                                 ` Linus Torvalds
@ 2017-09-14 16:50                                   ` Tim Chen
  -1 siblings, 0 replies; 57+ messages in thread
From: Tim Chen @ 2017-09-14 16:50 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Liang, Kan, Mel Gorman, Peter Zijlstra, Ingo Molnar, Andi Kleen,
	Andrew Morton, Johannes Weiner, Jan Kara, Christopher Lameter,
	Eric W . Biederman, Davidlohr Bueso, linux-mm,
	Linux Kernel Mailing List

[-- Attachment #1: Type: text/plain, Size: 724 bytes --]

On 09/13/2017 07:27 PM, Linus Torvalds wrote:
> On Wed, Sep 13, 2017 at 7:12 PM, Tim Chen <tim.c.chen@linux.intel.com> wrote:
>>
>> BTW, will you be merging these 2 patches in 4.14?
> 
> Yes, and thanks for reminding me.
> 
> In fact, would you mind sending me the latest versions, rather than me
> digging them out of the disaster area that is my mailbox and possibly
> picking an older version?
> 
>                  Linus
> 

Attached are the two patches that you updated to sync with your other
page wait queue cleanup, and sent to Kan and me:
https://marc.info/?l=linux-kernel&m=150393893927105&w=2

Kan tested these before, so they should still be good.
I checked that they apply cleanly on the latest master.

Thanks.

Tim

[-- Attachment #2: 0001-sched-wait-Break-up-long-wake-list-walk.patch --]
[-- Type: text/x-patch, Size: 7005 bytes --]

From 59e4341e041d7aa1f9339a03f876eee566768c84 Mon Sep 17 00:00:00 2001
From: Tim Chen <tim.c.chen@linux.intel.com>
Date: Fri, 25 Aug 2017 09:13:54 -0700
Subject: [PATCH 1/2] sched/wait: Break up long wake list walk

We encountered workloads that have very long wake up lists on large
systems. A waker takes a long time to traverse the entire wake list and
execute all the wake functions.

We saw page wait lists that are up to 3700+ entries long in tests of
large 4 and 8 socket systems. It took 0.8 sec to traverse such a list
during wake up. Any other CPU that contends for the list spin lock will
spin for a long time. It is a result of the numa balancing migration of
hot pages that are shared by many threads.

Multiple CPUs waking are queued up behind the lock, and the last one
queued has to wait until all CPUs did all the wakeups.

The page wait list is traversed with interrupt disabled, which caused
various problems. This was the original cause that triggered the NMI
watch dog timer in: https://patchwork.kernel.org/patch/9800303/ . Only
extending the NMI watch dog timer there helped.

This patch bookmarks the waker's scan position in the wake list and breaks
up the wake up walk, to allow access to the list before the waker resumes
its walk down the rest of the wait list. It lowers the interrupt and
rescheduling latency.

This patch also provides a performance boost when combined with the next
patch to break up the page wakeup list walk. We saw a 22% improvement in the
will-it-scale file pread2 test on a Xeon Phi system running 256 threads.

[ v2: Merged in Linus' changes to remove the bookmark_wake_function, and
  simplify access to flags. ]

Reported-by: Kan Liang <kan.liang@intel.com>
Tested-by: Kan Liang <kan.liang@intel.com>
Signed-off-by: Tim Chen <tim.c.chen@linux.intel.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/wait.h |  1 +
 kernel/sched/wait.c  | 78 ++++++++++++++++++++++++++++++++++++++++++----------
 2 files changed, 64 insertions(+), 15 deletions(-)

diff --git a/include/linux/wait.h b/include/linux/wait.h
index dc19880c02f5..78401ef02d29 100644
--- a/include/linux/wait.h
+++ b/include/linux/wait.h
@@ -18,6 +18,7 @@ int default_wake_function(struct wait_queue_entry *wq_entry, unsigned mode, int
 /* wait_queue_entry::flags */
 #define WQ_FLAG_EXCLUSIVE	0x01
 #define WQ_FLAG_WOKEN		0x02
+#define WQ_FLAG_BOOKMARK	0x04
 
 /*
  * A single wait-queue entry structure:
diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c
index d6afed6d0752..70701ef50465 100644
--- a/kernel/sched/wait.c
+++ b/kernel/sched/wait.c
@@ -53,6 +53,12 @@ void remove_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry
 }
 EXPORT_SYMBOL(remove_wait_queue);
 
+/*
+ * Scan threshold to break wait queue walk.
+ * This allows a waker to take a break from holding the
+ * wait queue lock during the wait queue walk.
+ */
+#define WAITQUEUE_WALK_BREAK_CNT 64
 
 /*
  * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just
@@ -63,18 +69,67 @@ EXPORT_SYMBOL(remove_wait_queue);
  * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns
  * zero in this (rare) case, and we handle it by continuing to scan the queue.
  */
-static void __wake_up_common(struct wait_queue_head *wq_head, unsigned int mode,
-			int nr_exclusive, int wake_flags, void *key)
+static int __wake_up_common(struct wait_queue_head *wq_head, unsigned int mode,
+			int nr_exclusive, int wake_flags, void *key,
+			wait_queue_entry_t *bookmark)
 {
 	wait_queue_entry_t *curr, *next;
+	int cnt = 0;
+
+	if (bookmark && (bookmark->flags & WQ_FLAG_BOOKMARK)) {
+		curr = list_next_entry(bookmark, entry);
 
-	list_for_each_entry_safe(curr, next, &wq_head->head, entry) {
+		list_del(&bookmark->entry);
+		bookmark->flags = 0;
+	} else
+		curr = list_first_entry(&wq_head->head, wait_queue_entry_t, entry);
+
+	if (&curr->entry == &wq_head->head)
+		return nr_exclusive;
+
+	list_for_each_entry_safe_from(curr, next, &wq_head->head, entry) {
 		unsigned flags = curr->flags;
-		int ret = curr->func(curr, mode, wake_flags, key);
+		int ret;
+
+		if (flags & WQ_FLAG_BOOKMARK)
+			continue;
+
+		ret = curr->func(curr, mode, wake_flags, key);
 		if (ret < 0)
 			break;
 		if (ret && (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
 			break;
+
+		if (bookmark && (++cnt > WAITQUEUE_WALK_BREAK_CNT) &&
+				(&next->entry != &wq_head->head)) {
+			bookmark->flags = WQ_FLAG_BOOKMARK;
+			list_add_tail(&bookmark->entry, &next->entry);
+			break;
+		}
+	}
+	return nr_exclusive;
+}
+
+static void __wake_up_common_lock(struct wait_queue_head *wq_head, unsigned int mode,
+			int nr_exclusive, int wake_flags, void *key)
+{
+	unsigned long flags;
+	wait_queue_entry_t bookmark;
+
+	bookmark.flags = 0;
+	bookmark.private = NULL;
+	bookmark.func = NULL;
+	INIT_LIST_HEAD(&bookmark.entry);
+
+	spin_lock_irqsave(&wq_head->lock, flags);
+	nr_exclusive = __wake_up_common(wq_head, mode, nr_exclusive, wake_flags, key, &bookmark);
+	spin_unlock_irqrestore(&wq_head->lock, flags);
+
+	while (bookmark.flags & WQ_FLAG_BOOKMARK) {
+		spin_lock_irqsave(&wq_head->lock, flags);
+		nr_exclusive = __wake_up_common(wq_head, mode, nr_exclusive,
+						wake_flags, key, &bookmark);
+		spin_unlock_irqrestore(&wq_head->lock, flags);
 	}
 }
 
@@ -91,11 +146,7 @@ static void __wake_up_common(struct wait_queue_head *wq_head, unsigned int mode,
 void __wake_up(struct wait_queue_head *wq_head, unsigned int mode,
 			int nr_exclusive, void *key)
 {
-	unsigned long flags;
-
-	spin_lock_irqsave(&wq_head->lock, flags);
-	__wake_up_common(wq_head, mode, nr_exclusive, 0, key);
-	spin_unlock_irqrestore(&wq_head->lock, flags);
+	__wake_up_common_lock(wq_head, mode, nr_exclusive, 0, key);
 }
 EXPORT_SYMBOL(__wake_up);
 
@@ -104,13 +155,13 @@ EXPORT_SYMBOL(__wake_up);
  */
 void __wake_up_locked(struct wait_queue_head *wq_head, unsigned int mode, int nr)
 {
-	__wake_up_common(wq_head, mode, nr, 0, NULL);
+	__wake_up_common(wq_head, mode, nr, 0, NULL, NULL);
 }
 EXPORT_SYMBOL_GPL(__wake_up_locked);
 
 void __wake_up_locked_key(struct wait_queue_head *wq_head, unsigned int mode, void *key)
 {
-	__wake_up_common(wq_head, mode, 1, 0, key);
+	__wake_up_common(wq_head, mode, 1, 0, key, NULL);
 }
 EXPORT_SYMBOL_GPL(__wake_up_locked_key);
 
@@ -134,7 +185,6 @@ EXPORT_SYMBOL_GPL(__wake_up_locked_key);
 void __wake_up_sync_key(struct wait_queue_head *wq_head, unsigned int mode,
 			int nr_exclusive, void *key)
 {
-	unsigned long flags;
 	int wake_flags = 1; /* XXX WF_SYNC */
 
 	if (unlikely(!wq_head))
@@ -143,9 +193,7 @@ void __wake_up_sync_key(struct wait_queue_head *wq_head, unsigned int mode,
 	if (unlikely(nr_exclusive != 1))
 		wake_flags = 0;
 
-	spin_lock_irqsave(&wq_head->lock, flags);
-	__wake_up_common(wq_head, mode, nr_exclusive, wake_flags, key);
-	spin_unlock_irqrestore(&wq_head->lock, flags);
+	__wake_up_common_lock(wq_head, mode, nr_exclusive, wake_flags, key);
 }
 EXPORT_SYMBOL_GPL(__wake_up_sync_key);
 
-- 
2.14.0.rc1.2.g4c8247ec3


[-- Attachment #3: 0002-sched-wait-Introduce-wakeup-boomark-in-wake_up_page_.patch --]
[-- Type: text/x-patch, Size: 3760 bytes --]

From 6a519e86f2042edcf878463ed19e37dfd774f28b Mon Sep 17 00:00:00 2001
From: Tim Chen <tim.c.chen@linux.intel.com>
Date: Fri, 25 Aug 2017 09:13:55 -0700
Subject: [PATCH 2/2] sched/wait: Introduce wakeup bookmark in wake_up_page_bit

Now that we have added breaks in the wait queue scan and allow a
bookmark to record the scan position, we put this logic in the
wake_up_page_bit function.

We can have very long page wait lists on large systems where multiple
pages share the same wait list. We break up the wake-up walk here to
give other CPUs a chance to access the list, and to avoid disabling
interrupts for too long while traversing the list. This reduces the
interrupt and rescheduling latency, and excessive page wait queue lock
hold time.
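Concretely, the loop added to wake_up_page_bit() by the diff below looks
roughly like this (simplified from the hunk further down; the bookmark is
a plain on-stack wait queue entry with a NULL wake function, and the
existing page_match check and final unlock at the end of the function are
unchanged):

	wait_queue_entry_t bookmark;

	/* On-stack bookmark; never woken itself, it only marks a position. */
	bookmark.flags = 0;
	bookmark.private = NULL;
	bookmark.func = NULL;
	INIT_LIST_HEAD(&bookmark.entry);

	spin_lock_irqsave(&q->lock, flags);
	__wake_up_locked_key_bookmark(q, TASK_NORMAL, &key, &bookmark);

	while (bookmark.flags & WQ_FLAG_BOOKMARK) {
		/*
		 * Take a breather from holding the lock, so waiters that
		 * have finished waking up can grab it and remove themselves
		 * from the wait queue, then resume from the bookmark.
		 */
		spin_unlock_irqrestore(&q->lock, flags);
		cpu_relax();
		spin_lock_irqsave(&q->lock, flags);
		__wake_up_locked_key_bookmark(q, TASK_NORMAL, &key, &bookmark);
	}
	/* ... existing page_match handling and unlock follow ... */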

[ v2: Remove bookmark_wake_function ]

Signed-off-by: Tim Chen <tim.c.chen@linux.intel.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/wait.h |  2 ++
 kernel/sched/wait.c  |  7 +++++++
 mm/filemap.c         | 22 +++++++++++++++++++++-
 3 files changed, 30 insertions(+), 1 deletion(-)

diff --git a/include/linux/wait.h b/include/linux/wait.h
index 78401ef02d29..87c4641023fb 100644
--- a/include/linux/wait.h
+++ b/include/linux/wait.h
@@ -185,6 +185,8 @@ __remove_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq
 
 void __wake_up(struct wait_queue_head *wq_head, unsigned int mode, int nr, void *key);
 void __wake_up_locked_key(struct wait_queue_head *wq_head, unsigned int mode, void *key);
+void __wake_up_locked_key_bookmark(struct wait_queue_head *wq_head,
+		unsigned int mode, void *key, wait_queue_entry_t *bookmark);
 void __wake_up_sync_key(struct wait_queue_head *wq_head, unsigned int mode, int nr, void *key);
 void __wake_up_locked(struct wait_queue_head *wq_head, unsigned int mode, int nr);
 void __wake_up_sync(struct wait_queue_head *wq_head, unsigned int mode, int nr);
diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c
index 70701ef50465..98feab7933c7 100644
--- a/kernel/sched/wait.c
+++ b/kernel/sched/wait.c
@@ -165,6 +165,13 @@ void __wake_up_locked_key(struct wait_queue_head *wq_head, unsigned int mode, vo
 }
 EXPORT_SYMBOL_GPL(__wake_up_locked_key);
 
+void __wake_up_locked_key_bookmark(struct wait_queue_head *wq_head,
+		unsigned int mode, void *key, wait_queue_entry_t *bookmark)
+{
+	__wake_up_common(wq_head, mode, 1, 0, key, bookmark);
+}
+EXPORT_SYMBOL_GPL(__wake_up_locked_key_bookmark);
+
 /**
  * __wake_up_sync_key - wake up threads blocked on a waitqueue.
  * @wq_head: the waitqueue
diff --git a/mm/filemap.c b/mm/filemap.c
index 0b41c8cbeabc..74123a298f53 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -923,13 +923,33 @@ static void wake_up_page_bit(struct page *page, int bit_nr)
 	wait_queue_head_t *q = page_waitqueue(page);
 	struct wait_page_key key;
 	unsigned long flags;
+	wait_queue_entry_t bookmark;
 
 	key.page = page;
 	key.bit_nr = bit_nr;
 	key.page_match = 0;
 
+	bookmark.flags = 0;
+	bookmark.private = NULL;
+	bookmark.func = NULL;
+	INIT_LIST_HEAD(&bookmark.entry);
+
 	spin_lock_irqsave(&q->lock, flags);
-	__wake_up_locked_key(q, TASK_NORMAL, &key);
+	__wake_up_locked_key_bookmark(q, TASK_NORMAL, &key, &bookmark);
+
+	while (bookmark.flags & WQ_FLAG_BOOKMARK) {
+		/*
+		 * Take a breather from holding the lock,
+		 * allow pages that finish wake up asynchronously
+		 * to acquire the lock and remove themselves
+		 * from wait queue
+		 */
+		spin_unlock_irqrestore(&q->lock, flags);
+		cpu_relax();
+		spin_lock_irqsave(&q->lock, flags);
+		__wake_up_locked_key_bookmark(q, TASK_NORMAL, &key, &bookmark);
+	}
+
 	/*
 	 * It is possible for other pages to have collided on the waitqueue
 	 * hash, so in that case check for a page match. That prevents a long-
-- 
2.14.0.rc1.2.g4c8247ec3


^ permalink raw reply related	[flat|nested] 57+ messages in thread


* Re: [PATCH 2/2 v2] sched/wait: Introduce lock breaker in wake_up_page_bit
  2017-09-14 16:50                                   ` Tim Chen
@ 2017-09-14 17:00                                     ` Linus Torvalds
  -1 siblings, 0 replies; 57+ messages in thread
From: Linus Torvalds @ 2017-09-14 17:00 UTC (permalink / raw)
  To: Tim Chen
  Cc: Liang, Kan, Mel Gorman, Peter Zijlstra, Ingo Molnar, Andi Kleen,
	Andrew Morton, Johannes Weiner, Jan Kara, Christopher Lameter,
	Eric W . Biederman, Davidlohr Bueso, linux-mm,
	Linux Kernel Mailing List

On Thu, Sep 14, 2017 at 9:50 AM, Tim Chen <tim.c.chen@linux.intel.com> wrote:
>
> Kan tested this before so it should be still good.
> I checked that it applied cleanly on latest master.

Thanks, applied.

I really hope we end up fixing the migration thing too, but at least
4.14 will have the mitigation for the long wait queues.

               Linus

^ permalink raw reply	[flat|nested] 57+ messages in thread


end of thread, other threads:[~2017-09-14 17:01 UTC | newest]

Thread overview: 57+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2017-08-25 16:13 [PATCH 1/2 v2] sched/wait: Break up long wake list walk Tim Chen
2017-08-25 16:13 ` Tim Chen
2017-08-25 16:13 ` [PATCH 2/2 v2] sched/wait: Introduce lock breaker in wake_up_page_bit Tim Chen
2017-08-25 16:13   ` Tim Chen
2017-08-25 19:58   ` Linus Torvalds
2017-08-25 19:58     ` Linus Torvalds
2017-08-25 22:19     ` Tim Chen
2017-08-25 22:19       ` Tim Chen
2017-08-25 22:51       ` Linus Torvalds
2017-08-25 23:03         ` Linus Torvalds
2017-08-26  0:31           ` Linus Torvalds
2017-08-26  0:31             ` Linus Torvalds
2017-08-26  2:54             ` Linus Torvalds
2017-08-26  2:54               ` Linus Torvalds
2017-08-26 18:15               ` Linus Torvalds
2017-08-27 21:40                 ` Linus Torvalds
2017-08-27 21:40                   ` Linus Torvalds
2017-08-27 21:42                   ` Linus Torvalds
2017-08-27 21:42                     ` Linus Torvalds
2017-08-27 23:12                   ` Linus Torvalds
2017-08-27 23:12                     ` Linus Torvalds
2017-08-28  1:16                     ` Nicholas Piggin
2017-08-28  1:16                       ` Nicholas Piggin
2017-08-28  1:29                       ` Nicholas Piggin
2017-08-28  1:29                         ` Nicholas Piggin
2017-08-28  5:17                         ` Linus Torvalds
2017-08-28  5:17                           ` Linus Torvalds
2017-08-28  7:18                           ` Nicholas Piggin
2017-08-28  7:18                             ` Nicholas Piggin
2017-08-28 14:51                 ` Liang, Kan
2017-08-28 16:48                   ` Linus Torvalds
2017-08-28 20:01                     ` Tim Chen
2017-08-28 20:01                       ` Tim Chen
2017-08-29 12:57                     ` Liang, Kan
2017-08-29 16:01                       ` Linus Torvalds
2017-08-29 16:01                         ` Linus Torvalds
2017-08-29 16:13                         ` Tim Chen
2017-08-29 16:13                           ` Tim Chen
2017-08-29 16:24                           ` Linus Torvalds
2017-08-29 16:24                             ` Linus Torvalds
2017-08-29 16:57                             ` Tim Chen
2017-08-29 16:57                               ` Tim Chen
2017-09-14  2:12                             ` Tim Chen
2017-09-14  2:12                               ` Tim Chen
2017-09-14  2:27                               ` Linus Torvalds
2017-09-14  2:27                                 ` Linus Torvalds
2017-09-14 16:50                                 ` Tim Chen
2017-09-14 16:50                                   ` Tim Chen
2017-09-14 17:00                                   ` Linus Torvalds
2017-09-14 17:00                                     ` Linus Torvalds
2017-09-14 16:39                               ` Christopher Lameter
2017-08-29 16:17                     ` Tim Chen
2017-08-29 16:17                       ` Tim Chen
2017-08-29 16:22                       ` Linus Torvalds
2017-08-29 16:22                         ` Linus Torvalds
2017-08-25 17:46 ` [PATCH 1/2 v2] sched/wait: Break up long wake list walk Christopher Lameter
2017-08-25 17:46   ` Christopher Lameter
