linux-kernel.vger.kernel.org archive mirror
* [PATCH 0/5] writeback livelock fixes
@ 2010-11-08 23:09 Wu Fengguang
  2010-11-08 23:09 ` [PATCH 1/5] writeback: integrated background writeback work Wu Fengguang
                   ` (5 more replies)
  0 siblings, 6 replies; 23+ messages in thread
From: Wu Fengguang @ 2010-11-08 23:09 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Jan Kara, linux-fsdevel, linux-mm, Johannes Weiner,
	Christoph Hellwig, Jan Engelhardt, Wu Fengguang, LKML

Andrew,

Here are the two writeback livelock fixes (patches 3 and 4) from Jan Kara.
The series starts with a preparation patch and carries two more debugging
patches.

Thanks,
Fengguang



* [PATCH 1/5] writeback: integrated background writeback work
  2010-11-08 23:09 [PATCH 0/5] writeback livelock fixes Wu Fengguang
@ 2010-11-08 23:09 ` Wu Fengguang
  2010-11-08 23:09 ` [PATCH 2/5] writeback: trace wakeup event for background writeback Wu Fengguang
                   ` (4 subsequent siblings)
  5 siblings, 0 replies; 23+ messages in thread
From: Wu Fengguang @ 2010-11-08 23:09 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Jan Kara, linux-fsdevel, linux-mm, Johannes Weiner, Wu Fengguang,
	Christoph Hellwig, Jan Engelhardt, LKML

[-- Attachment #1: writeback-integrated-background-work.patch --]
[-- Type: text/plain, Size: 4806 bytes --]

From: Jan Kara <jack@suse.cz>

Check whether background writeback is needed after finishing each work.

When the bdi flusher thread finishes some work, check whether any kind
of background writeback needs to be done (either because
dirty_background_ratio is exceeded or because we need to start flushing
old inodes). If so, just do background writeback.

This way, bdi_start_background_writeback() just needs to wake up the
flusher thread. It will do background writeback as soon as there is no
other work.

This is a preparatory patch for the next patch which stops background
writeback as soon as there is other work to do.

Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Wu Fengguang <fengguang.wu@intel.com>
---
 fs/fs-writeback.c |   61 +++++++++++++++++++++++++++++++++-----------
 1 file changed, 46 insertions(+), 15 deletions(-)

--- linux-next.orig/fs/fs-writeback.c	2010-11-06 23:55:25.000000000 +0800
+++ linux-next/fs/fs-writeback.c	2010-11-07 21:54:19.000000000 +0800
@@ -84,13 +84,9 @@ static inline struct inode *wb_inode(str
 	return list_entry(head, struct inode, i_wb_list);
 }
 
-static void bdi_queue_work(struct backing_dev_info *bdi,
-		struct wb_writeback_work *work)
+/* Wakeup flusher thread or forker thread to fork it. Requires bdi->wb_lock. */
+static void bdi_wakeup_flusher(struct backing_dev_info *bdi)
 {
-	trace_writeback_queue(bdi, work);
-
-	spin_lock_bh(&bdi->wb_lock);
-	list_add_tail(&work->list, &bdi->work_list);
 	if (bdi->wb.task) {
 		wake_up_process(bdi->wb.task);
 	} else {
@@ -98,15 +94,26 @@ static void bdi_queue_work(struct backin
 		 * The bdi thread isn't there, wake up the forker thread which
 		 * will create and run it.
 		 */
-		trace_writeback_nothread(bdi, work);
 		wake_up_process(default_backing_dev_info.wb.task);
 	}
+}
+
+static void bdi_queue_work(struct backing_dev_info *bdi,
+			   struct wb_writeback_work *work)
+{
+	trace_writeback_queue(bdi, work);
+
+	spin_lock_bh(&bdi->wb_lock);
+	list_add_tail(&work->list, &bdi->work_list);
+	if (!bdi->wb.task)
+		trace_writeback_nothread(bdi, work);
+	bdi_wakeup_flusher(bdi);
 	spin_unlock_bh(&bdi->wb_lock);
 }
 
 static void
 __bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages,
-		bool range_cyclic, bool for_background)
+		      bool range_cyclic)
 {
 	struct wb_writeback_work *work;
 
@@ -126,7 +133,6 @@ __bdi_start_writeback(struct backing_dev
 	work->sync_mode	= WB_SYNC_NONE;
 	work->nr_pages	= nr_pages;
 	work->range_cyclic = range_cyclic;
-	work->for_background = for_background;
 
 	bdi_queue_work(bdi, work);
 }
@@ -144,7 +150,7 @@ __bdi_start_writeback(struct backing_dev
  */
 void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages)
 {
-	__bdi_start_writeback(bdi, nr_pages, true, false);
+	__bdi_start_writeback(bdi, nr_pages, true);
 }
 
 /**
@@ -152,13 +158,20 @@ void bdi_start_writeback(struct backing_
  * @bdi: the backing device to write from
  *
  * Description:
- *   This does WB_SYNC_NONE background writeback. The IO is only
- *   started when this function returns, we make no guarentees on
- *   completion. Caller need not hold sb s_umount semaphore.
+ *   This makes sure WB_SYNC_NONE background writeback happens. When
+ *   this function returns, it is only guaranteed that for given BDI
+ *   some IO is happening if we are over background dirty threshold.
+ *   Caller need not hold sb s_umount semaphore.
  */
 void bdi_start_background_writeback(struct backing_dev_info *bdi)
 {
-	__bdi_start_writeback(bdi, LONG_MAX, true, true);
+	/*
+	 * We just wake up the flusher thread. It will perform background
+	 * writeback as soon as there is no other work to do.
+	 */
+	spin_lock_bh(&bdi->wb_lock);
+	bdi_wakeup_flusher(bdi);
+	spin_unlock_bh(&bdi->wb_lock);
 }
 
 /*
@@ -707,6 +720,23 @@ get_next_work_item(struct backing_dev_in
 	return work;
 }
 
+static long wb_check_background_flush(struct bdi_writeback *wb)
+{
+	if (over_bground_thresh()) {
+
+		struct wb_writeback_work work = {
+			.nr_pages	= LONG_MAX,
+			.sync_mode	= WB_SYNC_NONE,
+			.for_background	= 1,
+			.range_cyclic	= 1,
+		};
+
+		return wb_writeback(wb, &work);
+	}
+
+	return 0;
+}
+
 static long wb_check_old_data_flush(struct bdi_writeback *wb)
 {
 	unsigned long expired;
@@ -782,6 +812,7 @@ long wb_do_writeback(struct bdi_writebac
 	 * Check for periodic writeback, kupdated() style
 	 */
 	wrote += wb_check_old_data_flush(wb);
+	wrote += wb_check_background_flush(wb);
 	clear_bit(BDI_writeback_running, &wb->bdi->state);
 
 	return wrote;
@@ -868,7 +899,7 @@ void wakeup_flusher_threads(long nr_page
 	list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) {
 		if (!bdi_has_dirty_io(bdi))
 			continue;
-		__bdi_start_writeback(bdi, nr_pages, false, false);
+		__bdi_start_writeback(bdi, nr_pages, false);
 	}
 	rcu_read_unlock();
 }




* [PATCH 2/5] writeback: trace wakeup event for background writeback
  2010-11-08 23:09 [PATCH 0/5] writeback livelock fixes Wu Fengguang
  2010-11-08 23:09 ` [PATCH 1/5] writeback: integrated background writeback work Wu Fengguang
@ 2010-11-08 23:09 ` Wu Fengguang
  2010-11-08 23:09 ` [PATCH 3/5] writeback: stop background/kupdate works from livelocking other works Wu Fengguang
                   ` (3 subsequent siblings)
  5 siblings, 0 replies; 23+ messages in thread
From: Wu Fengguang @ 2010-11-08 23:09 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Jan Kara, linux-fsdevel, linux-mm, Johannes Weiner, Wu Fengguang,
	Christoph Hellwig, Jan Engelhardt, LKML

[-- Attachment #1: mutt-wfg-t61-1000-20494-6680a464460b57571 --]
[-- Type: text/plain, Size: 1325 bytes --]

This tracks when balance_dirty_pages() tries to wake up the flusher
thread for background writeback (if it was not started already).

Suggested-by: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Wu Fengguang <fengguang.wu@intel.com>
---

 fs/fs-writeback.c                |    1 +
 include/trace/events/writeback.h |    1 +
 2 files changed, 2 insertions(+)

--- linux-next.orig/include/trace/events/writeback.h	2010-11-07 21:54:19.000000000 +0800
+++ linux-next/include/trace/events/writeback.h	2010-11-07 21:56:42.000000000 +0800
@@ -81,6 +81,7 @@ DEFINE_EVENT(writeback_class, name, \
 	TP_ARGS(bdi))
 
 DEFINE_WRITEBACK_EVENT(writeback_nowork);
+DEFINE_WRITEBACK_EVENT(writeback_wake_background);
 DEFINE_WRITEBACK_EVENT(writeback_wake_thread);
 DEFINE_WRITEBACK_EVENT(writeback_wake_forker_thread);
 DEFINE_WRITEBACK_EVENT(writeback_bdi_register);
--- linux-next.orig/fs/fs-writeback.c	2010-11-07 21:54:19.000000000 +0800
+++ linux-next/fs/fs-writeback.c	2010-11-07 21:56:42.000000000 +0800
@@ -169,6 +169,7 @@ void bdi_start_background_writeback(stru
 	 * We just wake up the flusher thread. It will perform background
 	 * writeback as soon as there is no other work to do.
 	 */
+	trace_writeback_wake_background(bdi);
 	spin_lock_bh(&bdi->wb_lock);
 	bdi_wakeup_flusher(bdi);
 	spin_unlock_bh(&bdi->wb_lock);




* [PATCH 3/5] writeback: stop background/kupdate works from livelocking other works
  2010-11-08 23:09 [PATCH 0/5] writeback livelock fixes Wu Fengguang
  2010-11-08 23:09 ` [PATCH 1/5] writeback: integrated background writeback work Wu Fengguang
  2010-11-08 23:09 ` [PATCH 2/5] writeback: trace wakeup event for background writeback Wu Fengguang
@ 2010-11-08 23:09 ` Wu Fengguang
  2010-11-09 21:13   ` Andrew Morton
  2010-11-08 23:09 ` [PATCH 4/5] writeback: avoid livelocking WB_SYNC_ALL writeback Wu Fengguang
                   ` (2 subsequent siblings)
  5 siblings, 1 reply; 23+ messages in thread
From: Wu Fengguang @ 2010-11-08 23:09 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Jan Kara, linux-fsdevel, linux-mm, Johannes Weiner, Wu Fengguang,
	Christoph Hellwig, Jan Engelhardt, LKML

[-- Attachment #1: 0002-mm-Stop-background-writeback-if-there-is-other-work-.patch --]
[-- Type: text/plain, Size: 1364 bytes --]

From: Jan Kara <jack@suse.cz>

Background writeback is easily livelockable (by the definition of its
target). This is inconvenient because it can make sync(1) stall forever waiting
on its queued work to be finished. Generally, when a flusher thread has
some work queued, someone submitted the work to achieve a goal more specific
than what background writeback does. So it makes sense to give it priority
over generic page cleaning.

Thus we interrupt background writeback if there is some other work to do. We
return to the background writeback after completing all the queued work.

Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Wu Fengguang <fengguang.wu@intel.com>
---
 fs/fs-writeback.c |    9 +++++++++
 1 file changed, 9 insertions(+)

--- linux-next.orig/fs/fs-writeback.c	2010-11-07 21:56:42.000000000 +0800
+++ linux-next/fs/fs-writeback.c	2010-11-07 22:00:51.000000000 +0800
@@ -651,6 +651,15 @@ static long wb_writeback(struct bdi_writ
 			break;
 
 		/*
+		 * Background writeout and kupdate-style writeback are
+		 * easily livelockable. Stop them if there is other work
+		 * to do so that e.g. sync can proceed.
+		 */
+		if ((work->for_background || work->for_kupdate) &&
+		    !list_empty(&wb->bdi->work_list))
+			break;
+
+		/*
 		 * For background writeout, stop when we are below the
 		 * background dirty threshold
 		 */




* [PATCH 4/5] writeback: avoid livelocking WB_SYNC_ALL writeback
  2010-11-08 23:09 [PATCH 0/5] writeback livelock fixes Wu Fengguang
                   ` (2 preceding siblings ...)
  2010-11-08 23:09 ` [PATCH 3/5] writeback: stop background/kupdate works from livelocking other works Wu Fengguang
@ 2010-11-08 23:09 ` Wu Fengguang
  2010-11-09 22:43   ` Andrew Morton
  2010-11-08 23:09 ` [PATCH 5/5] writeback: check skipped pages on WB_SYNC_ALL Wu Fengguang
  2010-11-08 23:23 ` [PATCH 0/5] writeback livelock fixes Wu Fengguang
  5 siblings, 1 reply; 23+ messages in thread
From: Wu Fengguang @ 2010-11-08 23:09 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Jan Kara, linux-fsdevel, linux-mm, Johannes Weiner, Wu Fengguang,
	Christoph Hellwig, Jan Engelhardt, LKML

[-- Attachment #1: mutt-wfg-t61-1000-14567-7029f8c2a01ad3a473 --]
[-- Type: text/plain, Size: 3745 bytes --]

From: Jan Kara <jack@suse.cz>

When wb_writeback() is called in WB_SYNC_ALL mode, work->nr_to_write is
usually set to LONG_MAX. The logic in wb_writeback() then calls
__writeback_inodes_sb() with nr_to_write == MAX_WRITEBACK_PAGES and thus
we easily end up with negative nr_to_write after the function returns.
wb_writeback() then decides we need another round of writeback but this
is wrong in some cases! For example, when a single large file is
continuously dirtied, we would never finish syncing it because each pass
would be able to write MAX_WRITEBACK_PAGES and the inode dirty timestamp
never gets updated (as the inode is never completely clean).

Fix the issue by setting nr_to_write to LONG_MAX in WB_SYNC_ALL mode. We
do not need nr_to_write in WB_SYNC_ALL mode anyway since livelock
avoidance is done differently for it.

After this patch, the program from http://lkml.org/lkml/2010/10/24/154 is no
longer able to stall sync forever.

Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Wu Fengguang <fengguang.wu@intel.com>
---
 fs/fs-writeback.c |   27 +++++++++++++++++++++++----
 1 file changed, 23 insertions(+), 4 deletions(-)

  Fengguang, I've been testing with those writeback fixes you reposted
a few days ago and I've still been able to reproduce livelocks with
Jan Engelhardt's test case. Using writeback tracing I've tracked the
problem down to the above, and with this patch sync finishes OK (well, it
still takes about 15 minutes but that's about the expected time given the
throughput I see to the disk - the test case randomly dirties pages in a
huge file). So could you please add this patch to the previous two and
send them to Jens for inclusion?

--- linux-next.orig/fs/fs-writeback.c	2010-11-07 22:00:51.000000000 +0800
+++ linux-next/fs/fs-writeback.c	2010-11-07 22:01:06.000000000 +0800
@@ -630,6 +630,7 @@ static long wb_writeback(struct bdi_writ
 	};
 	unsigned long oldest_jif;
 	long wrote = 0;
+	long write_chunk;
 	struct inode *inode;
 
 	if (wbc.for_kupdate) {
@@ -642,6 +643,24 @@ static long wb_writeback(struct bdi_writ
 		wbc.range_end = LLONG_MAX;
 	}
 
+	/*
+	 * WB_SYNC_ALL mode does livelock avoidance by syncing dirty
+	 * inodes/pages in one big loop. Setting wbc.nr_to_write=LONG_MAX
+	 * here avoids calling into writeback_inodes_wb() more than once.
+	 *
+	 * The intended call sequence for WB_SYNC_ALL writeback is:
+	 *
+	 *      wb_writeback()
+	 *          writeback_inodes_wb()       <== called only once
+	 *              write_cache_pages()     <== called once for each inode
+	 *                   (quickly) tag currently dirty pages
+	 *                   (maybe slowly) sync all tagged pages
+	 */
+	if (wbc.sync_mode == WB_SYNC_NONE)
+		write_chunk = MAX_WRITEBACK_PAGES;
+	else
+		write_chunk = LONG_MAX;
+
 	wbc.wb_start = jiffies; /* livelock avoidance */
 	for (;;) {
 		/*
@@ -667,7 +686,7 @@ static long wb_writeback(struct bdi_writ
 			break;
 
 		wbc.more_io = 0;
-		wbc.nr_to_write = MAX_WRITEBACK_PAGES;
+		wbc.nr_to_write = write_chunk;
 		wbc.pages_skipped = 0;
 
 		trace_wbc_writeback_start(&wbc, wb->bdi);
@@ -677,8 +696,8 @@ static long wb_writeback(struct bdi_writ
 			writeback_inodes_wb(wb, &wbc);
 		trace_wbc_writeback_written(&wbc, wb->bdi);
 
-		work->nr_pages -= MAX_WRITEBACK_PAGES - wbc.nr_to_write;
-		wrote += MAX_WRITEBACK_PAGES - wbc.nr_to_write;
+		work->nr_pages -= write_chunk - wbc.nr_to_write;
+		wrote += write_chunk - wbc.nr_to_write;
 
 		/*
 		 * If we consumed everything, see if we have more
@@ -693,7 +712,7 @@ static long wb_writeback(struct bdi_writ
 		/*
 		 * Did we write something? Try for more
 		 */
-		if (wbc.nr_to_write < MAX_WRITEBACK_PAGES)
+		if (wbc.nr_to_write < write_chunk)
 			continue;
 		/*
 		 * Nothing written. Wait for some inode to




* [PATCH 5/5] writeback: check skipped pages on WB_SYNC_ALL
  2010-11-08 23:09 [PATCH 0/5] writeback livelock fixes Wu Fengguang
                   ` (3 preceding siblings ...)
  2010-11-08 23:09 ` [PATCH 4/5] writeback: avoid livelocking WB_SYNC_ALL writeback Wu Fengguang
@ 2010-11-08 23:09 ` Wu Fengguang
  2010-11-09 22:47   ` Andrew Morton
  2010-11-08 23:23 ` [PATCH 0/5] writeback livelock fixes Wu Fengguang
  5 siblings, 1 reply; 23+ messages in thread
From: Wu Fengguang @ 2010-11-08 23:09 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Jan Kara, linux-fsdevel, linux-mm, Johannes Weiner, Wu Fengguang,
	Christoph Hellwig, Jan Engelhardt, LKML

[-- Attachment #1: writeback-warn-sync-skipped_pages.patch --]
[-- Type: text/plain, Size: 913 bytes --]

In WB_SYNC_ALL mode, filesystems are not expected to skip dirty pages on
temporary lock contention or non-fatal errors, otherwise sync() will
return without actually syncing the skipped pages. Add a check to
catch possible redirty_page_for_writepage() callers that violate this
expectation.

I'd recommend keeping this check in the -mm tree for some time and fixing
up the possible warnings before pushing it upstream.

Signed-off-by: Wu Fengguang <fengguang.wu@intel.com>
---
 fs/fs-writeback.c |    1 +
 1 file changed, 1 insertion(+)

--- linux-next.orig/fs/fs-writeback.c	2010-11-07 22:01:06.000000000 +0800
+++ linux-next/fs/fs-writeback.c	2010-11-07 22:01:15.000000000 +0800
@@ -527,6 +527,7 @@ static int writeback_sb_inodes(struct su
 			 * buffers.  Skip this inode for now.
 			 */
 			redirty_tail(inode);
+			WARN_ON_ONCE(wbc->sync_mode == WB_SYNC_ALL);
 		}
 		spin_unlock(&inode_lock);
 		iput(inode);




* Re: [PATCH 0/5] writeback livelock fixes
  2010-11-08 23:09 [PATCH 0/5] writeback livelock fixes Wu Fengguang
                   ` (4 preceding siblings ...)
  2010-11-08 23:09 ` [PATCH 5/5] writeback: check skipped pages on WB_SYNC_ALL Wu Fengguang
@ 2010-11-08 23:23 ` Wu Fengguang
  5 siblings, 0 replies; 23+ messages in thread
From: Wu Fengguang @ 2010-11-08 23:23 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Jan Kara, linux-fsdevel, linux-mm, Johannes Weiner,
	Christoph Hellwig, Jan Engelhardt, LKML

[-- Attachment #1: Type: text/plain, Size: 329 bytes --]

On Tue, Nov 09, 2010 at 07:09:16AM +0800, Wu, Fengguang wrote:
> Andrew,
> 
> Here are the two writeback livelock fixes (patches 3 and 4) from Jan Kara.
> The series starts with a preparation patch and carries two more debugging
> patches.

Jan: for your convenience, attached is the combined patch for linux-next.

Thanks,
Fengguang

[-- Attachment #2: wb-livelock-jan.patch --]
[-- Type: text/x-diff, Size: 7131 bytes --]

--- linux-next.orig/fs/fs-writeback.c	2010-11-06 23:55:25.000000000 +0800
+++ linux-next/fs/fs-writeback.c	2010-11-07 22:01:15.000000000 +0800
@@ -84,13 +84,9 @@ static inline struct inode *wb_inode(str
 	return list_entry(head, struct inode, i_wb_list);
 }
 
-static void bdi_queue_work(struct backing_dev_info *bdi,
-		struct wb_writeback_work *work)
+/* Wakeup flusher thread or forker thread to fork it. Requires bdi->wb_lock. */
+static void bdi_wakeup_flusher(struct backing_dev_info *bdi)
 {
-	trace_writeback_queue(bdi, work);
-
-	spin_lock_bh(&bdi->wb_lock);
-	list_add_tail(&work->list, &bdi->work_list);
 	if (bdi->wb.task) {
 		wake_up_process(bdi->wb.task);
 	} else {
@@ -98,15 +94,26 @@ static void bdi_queue_work(struct backin
 		 * The bdi thread isn't there, wake up the forker thread which
 		 * will create and run it.
 		 */
-		trace_writeback_nothread(bdi, work);
 		wake_up_process(default_backing_dev_info.wb.task);
 	}
+}
+
+static void bdi_queue_work(struct backing_dev_info *bdi,
+			   struct wb_writeback_work *work)
+{
+	trace_writeback_queue(bdi, work);
+
+	spin_lock_bh(&bdi->wb_lock);
+	list_add_tail(&work->list, &bdi->work_list);
+	if (!bdi->wb.task)
+		trace_writeback_nothread(bdi, work);
+	bdi_wakeup_flusher(bdi);
 	spin_unlock_bh(&bdi->wb_lock);
 }
 
 static void
 __bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages,
-		bool range_cyclic, bool for_background)
+		      bool range_cyclic)
 {
 	struct wb_writeback_work *work;
 
@@ -126,7 +133,6 @@ __bdi_start_writeback(struct backing_dev
 	work->sync_mode	= WB_SYNC_NONE;
 	work->nr_pages	= nr_pages;
 	work->range_cyclic = range_cyclic;
-	work->for_background = for_background;
 
 	bdi_queue_work(bdi, work);
 }
@@ -144,7 +150,7 @@ __bdi_start_writeback(struct backing_dev
  */
 void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages)
 {
-	__bdi_start_writeback(bdi, nr_pages, true, false);
+	__bdi_start_writeback(bdi, nr_pages, true);
 }
 
 /**
@@ -152,13 +158,21 @@ void bdi_start_writeback(struct backing_
  * @bdi: the backing device to write from
  *
  * Description:
- *   This does WB_SYNC_NONE background writeback. The IO is only
- *   started when this function returns, we make no guarentees on
- *   completion. Caller need not hold sb s_umount semaphore.
+ *   This makes sure WB_SYNC_NONE background writeback happens. When
+ *   this function returns, it is only guaranteed that for given BDI
+ *   some IO is happening if we are over background dirty threshold.
+ *   Caller need not hold sb s_umount semaphore.
  */
 void bdi_start_background_writeback(struct backing_dev_info *bdi)
 {
-	__bdi_start_writeback(bdi, LONG_MAX, true, true);
+	/*
+	 * We just wake up the flusher thread. It will perform background
+	 * writeback as soon as there is no other work to do.
+	 */
+	trace_writeback_wake_background(bdi);
+	spin_lock_bh(&bdi->wb_lock);
+	bdi_wakeup_flusher(bdi);
+	spin_unlock_bh(&bdi->wb_lock);
 }
 
 /*
@@ -513,6 +527,7 @@ static int writeback_sb_inodes(struct su
 			 * buffers.  Skip this inode for now.
 			 */
 			redirty_tail(inode);
+			WARN_ON_ONCE(wbc->sync_mode == WB_SYNC_ALL);
 		}
 		spin_unlock(&inode_lock);
 		iput(inode);
@@ -616,6 +631,7 @@ static long wb_writeback(struct bdi_writ
 	};
 	unsigned long oldest_jif;
 	long wrote = 0;
+	long write_chunk;
 	struct inode *inode;
 
 	if (wbc.for_kupdate) {
@@ -628,6 +644,24 @@ static long wb_writeback(struct bdi_writ
 		wbc.range_end = LLONG_MAX;
 	}
 
+	/*
+	 * WB_SYNC_ALL mode does livelock avoidance by syncing dirty
+	 * inodes/pages in one big loop. Setting wbc.nr_to_write=LONG_MAX
+	 * here avoids calling into writeback_inodes_wb() more than once.
+	 *
+	 * The intended call sequence for WB_SYNC_ALL writeback is:
+	 *
+	 *      wb_writeback()
+	 *          writeback_inodes_wb()       <== called only once
+	 *              write_cache_pages()     <== called once for each inode
+	 *                   (quickly) tag currently dirty pages
+	 *                   (maybe slowly) sync all tagged pages
+	 */
+	if (wbc.sync_mode == WB_SYNC_NONE)
+		write_chunk = MAX_WRITEBACK_PAGES;
+	else
+		write_chunk = LONG_MAX;
+
 	wbc.wb_start = jiffies; /* livelock avoidance */
 	for (;;) {
 		/*
@@ -637,6 +671,15 @@ static long wb_writeback(struct bdi_writ
 			break;
 
 		/*
+		 * Background writeout and kupdate-style writeback are
+		 * easily livelockable. Stop them if there is other work
+		 * to do so that e.g. sync can proceed.
+		 */
+		if ((work->for_background || work->for_kupdate) &&
+		    !list_empty(&wb->bdi->work_list))
+			break;
+
+		/*
 		 * For background writeout, stop when we are below the
 		 * background dirty threshold
 		 */
@@ -644,7 +687,7 @@ static long wb_writeback(struct bdi_writ
 			break;
 
 		wbc.more_io = 0;
-		wbc.nr_to_write = MAX_WRITEBACK_PAGES;
+		wbc.nr_to_write = write_chunk;
 		wbc.pages_skipped = 0;
 
 		trace_wbc_writeback_start(&wbc, wb->bdi);
@@ -654,8 +697,8 @@ static long wb_writeback(struct bdi_writ
 			writeback_inodes_wb(wb, &wbc);
 		trace_wbc_writeback_written(&wbc, wb->bdi);
 
-		work->nr_pages -= MAX_WRITEBACK_PAGES - wbc.nr_to_write;
-		wrote += MAX_WRITEBACK_PAGES - wbc.nr_to_write;
+		work->nr_pages -= write_chunk - wbc.nr_to_write;
+		wrote += write_chunk - wbc.nr_to_write;
 
 		/*
 		 * If we consumed everything, see if we have more
@@ -670,7 +713,7 @@ static long wb_writeback(struct bdi_writ
 		/*
 		 * Did we write something? Try for more
 		 */
-		if (wbc.nr_to_write < MAX_WRITEBACK_PAGES)
+		if (wbc.nr_to_write < write_chunk)
 			continue;
 		/*
 		 * Nothing written. Wait for some inode to
@@ -707,6 +750,23 @@ get_next_work_item(struct backing_dev_in
 	return work;
 }
 
+static long wb_check_background_flush(struct bdi_writeback *wb)
+{
+	if (over_bground_thresh()) {
+
+		struct wb_writeback_work work = {
+			.nr_pages	= LONG_MAX,
+			.sync_mode	= WB_SYNC_NONE,
+			.for_background	= 1,
+			.range_cyclic	= 1,
+		};
+
+		return wb_writeback(wb, &work);
+	}
+
+	return 0;
+}
+
 static long wb_check_old_data_flush(struct bdi_writeback *wb)
 {
 	unsigned long expired;
@@ -782,6 +842,7 @@ long wb_do_writeback(struct bdi_writebac
 	 * Check for periodic writeback, kupdated() style
 	 */
 	wrote += wb_check_old_data_flush(wb);
+	wrote += wb_check_background_flush(wb);
 	clear_bit(BDI_writeback_running, &wb->bdi->state);
 
 	return wrote;
@@ -868,7 +929,7 @@ void wakeup_flusher_threads(long nr_page
 	list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) {
 		if (!bdi_has_dirty_io(bdi))
 			continue;
-		__bdi_start_writeback(bdi, nr_pages, false, false);
+		__bdi_start_writeback(bdi, nr_pages, false);
 	}
 	rcu_read_unlock();
 }
--- linux-next.orig/include/trace/events/writeback.h	2010-11-07 21:54:19.000000000 +0800
+++ linux-next/include/trace/events/writeback.h	2010-11-07 21:56:42.000000000 +0800
@@ -81,6 +81,7 @@ DEFINE_EVENT(writeback_class, name, \
 	TP_ARGS(bdi))
 
 DEFINE_WRITEBACK_EVENT(writeback_nowork);
+DEFINE_WRITEBACK_EVENT(writeback_wake_background);
 DEFINE_WRITEBACK_EVENT(writeback_wake_thread);
 DEFINE_WRITEBACK_EVENT(writeback_wake_forker_thread);
 DEFINE_WRITEBACK_EVENT(writeback_bdi_register);


* Re: [PATCH 3/5] writeback: stop background/kupdate works from livelocking other works
  2010-11-08 23:09 ` [PATCH 3/5] writeback: stop background/kupdate works from livelocking other works Wu Fengguang
@ 2010-11-09 21:13   ` Andrew Morton
  2010-11-09 22:28     ` Jan Kara
  0 siblings, 1 reply; 23+ messages in thread
From: Andrew Morton @ 2010-11-09 21:13 UTC (permalink / raw)
  To: Wu Fengguang
  Cc: Jan Kara, linux-fsdevel, linux-mm, Johannes Weiner,
	Christoph Hellwig, Jan Engelhardt, LKML

On Tue, 09 Nov 2010 07:09:19 +0800
Wu Fengguang <fengguang.wu@intel.com> wrote:
>

I find the description to be somewhat incomplete...

> From: Jan Kara <jack@suse.cz>
> 
> Background writeback is easily livelockable (by the definition of its
> target).

*why* is background writeback easily livelockable?  Under which
circumstances does this happen and how does it come about?

> This is inconvenient because it can make sync(1) stall forever waiting
> on its queued work to be finished.

Again, why?  Because there are works queued from the flusher thread,
but that thread is stuck in a livelocked state in <unspecified code
location> so it is unable to service the other works?  But the process
which called sync() will as a last resort itself perform all the
required IO, will it not?  If so, how can it livelock?

> Generally, when a flusher thread has
> some work queued, someone submitted the work to achieve a goal more specific
> than what background writeback does. So it makes sense to give it priority
> over generic page cleaning.
> 
> Thus we interrupt background writeback if there is some other work to do. We
> return to the background writeback after completing all the queued work.
> 
> Signed-off-by: Jan Kara <jack@suse.cz>
> Signed-off-by: Wu Fengguang <fengguang.wu@intel.com>
> ---
>  fs/fs-writeback.c |    9 +++++++++
>  1 file changed, 9 insertions(+)
> 
> --- linux-next.orig/fs/fs-writeback.c	2010-11-07 21:56:42.000000000 +0800
> +++ linux-next/fs/fs-writeback.c	2010-11-07 22:00:51.000000000 +0800
> @@ -651,6 +651,15 @@ static long wb_writeback(struct bdi_writ
>  			break;
>  
>  		/*
> +		 * Background writeout and kupdate-style writeback are
> +		 * easily livelockable. Stop them if there is other work
> +		 * to do so that e.g. sync can proceed.
> +		 */
> +		if ((work->for_background || work->for_kupdate) &&
> +		    !list_empty(&wb->bdi->work_list))
> +			break;
> +
> +		/*
>  		 * For background writeout, stop when we are below the
>  		 * background dirty threshold
>  		 */

So...  what prevents higher priority works (eg, sync(1)) from
livelocking or seriously retarding background or kupdate writeout?



* Re: [PATCH 3/5] writeback: stop background/kupdate works from livelocking other works
  2010-11-09 21:13   ` Andrew Morton
@ 2010-11-09 22:28     ` Jan Kara
  2010-11-09 23:00       ` Andrew Morton
  0 siblings, 1 reply; 23+ messages in thread
From: Jan Kara @ 2010-11-09 22:28 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Wu Fengguang, Jan Kara, linux-fsdevel, linux-mm, Johannes Weiner,
	Christoph Hellwig, Jan Engelhardt, LKML

  Hi,

On Tue 09-11-10 13:13:10, Andrew Morton wrote:
> On Tue, 09 Nov 2010 07:09:19 +0800
> Wu Fengguang <fengguang.wu@intel.com> wrote:
> I find the description to be somewhat incomplete...
  OK, so let me fill in the gaps ;)

> > From: Jan Kara <jack@suse.cz>
> > 
> > Background writeback is easily livelockable (by the definition of its
> > target).
> 
> *why* is background writeback easily livelockable?  Under which
> circumstances does this happen and how does it come about?
> 
> > This is inconvenient because it can make sync(1) stall forever waiting
> > on its queued work to be finished.
> 
> Again, why?  Because there are works queued from the flusher thread,
> but that thread is stuck in a livelocked state in <unspecified code
> location> so it is unable to service the other works?  But the process
> which called sync() will as a last resort itself perform all the
> required IO, will it not?  If so, how can it livelock?
  New description which should address above questions:
Background writeback is easily livelockable in a loop in wb_writeback() by
a process continuously re-dirtying pages (or continuously appending to a
file). This is in fact intended, as the target of background writeback is to
write dirty pages it can find as long as we are over
dirty_background_threshold.

But the above behavior gets inconvenient at times because no other work
queued in the flusher thread's queue gets processed. In particular,
since e.g. sync(1) relies on flusher thread to do all the IO for it,
sync(1) can hang forever waiting for flusher thread to do the work.

Generally, when a flusher thread has some work queued, someone submitted
the work to achieve a goal more specific than what background writeback
does. Moreover, by working on the specific work, we also reduce the amount of
dirty pages, which is exactly the target of background writeout. So it makes
sense to give specific work priority over generic page cleaning.

Thus we interrupt background writeback if there is some other work to do. We
return to the background writeback after completing all the queued work.
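
(To make the livelock concrete, here is a minimal sketch of such a
re-dirtying workload - a hypothetical illustration, not the actual
reproducer referenced elsewhere in this thread:)

	/* toy dirtier: keeps re-dirtying random pages of one big file so
	 * the system stays over the background dirty threshold; assumes
	 * a 64-bit box, enough dirtyable memory, no error handling */
	#include <fcntl.h>
	#include <stdlib.h>
	#include <sys/mman.h>
	#include <unistd.h>

	int main(void)
	{
		size_t len = 1UL << 30;		/* 1GB file */
		int fd = open("bigfile", O_RDWR | O_CREAT, 0644);
		char *p;

		ftruncate(fd, len);
		p = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
		for (;;)	/* re-dirty one random page, forever */
			p[((size_t)rand() * 4096) % len]++;
	}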

Is it better now?

> > Generally, when a flusher thread has
> > some work queued, someone submitted the work to achieve a goal more specific
> > > than what background writeback does. So it makes sense to give it priority
> > > over generic page cleaning.
> > 
> > Thus we interrupt background writeback if there is some other work to do. We
> > return to the background writeback after completing all the queued work.
> > 
> > Signed-off-by: Jan Kara <jack@suse.cz>
> > Signed-off-by: Wu Fengguang <fengguang.wu@intel.com>
> > ---
> >  fs/fs-writeback.c |    9 +++++++++
> >  1 file changed, 9 insertions(+)
> > 
> > --- linux-next.orig/fs/fs-writeback.c	2010-11-07 21:56:42.000000000 +0800
> > +++ linux-next/fs/fs-writeback.c	2010-11-07 22:00:51.000000000 +0800
> > @@ -651,6 +651,15 @@ static long wb_writeback(struct bdi_writ
> >  			break;
> >  
> >  		/*
> > +		 * Background writeout and kupdate-style writeback are
> > +		 * easily livelockable. Stop them if there is other work
> > +		 * to do so that e.g. sync can proceed.
> > +		 */
> > +		if ((work->for_background || work->for_kupdate) &&
> > +		    !list_empty(&wb->bdi->work_list))
> > +			break;
> > +
> > +		/*
> >  		 * For background writeout, stop when we are below the
> >  		 * background dirty threshold
> >  		 */
> 
> So...  what prevents higher priority works (eg, sync(1)) from
> livelocking or seriously retarding background or kupdate writeout?
  If work other than background or kupdate writeout livelocks, it's a bug
which should be fixed (either by setting a sensible nr_to_write or by
tagging like we do for WB_SYNC_ALL writeback). Of course, higher priority
work can be running when background or kupdate writeout would need to run
as well. But the idea here is that the purpose of the background/kupdate
types of writeout is to get rid of dirty data, and any type of writeout
does this; so by working on the queued work we also work on
background/kupdate writeout, only possibly less efficiently.

								Honza
-- 
Jan Kara <jack@suse.cz>
SUSE Labs, CR


* Re: [PATCH 4/5] writeback: avoid livelocking WB_SYNC_ALL writeback
  2010-11-08 23:09 ` [PATCH 4/5] writeback: avoid livelocking WB_SYNC_ALL writeback Wu Fengguang
@ 2010-11-09 22:43   ` Andrew Morton
  2010-11-09 23:18     ` Jan Kara
  0 siblings, 1 reply; 23+ messages in thread
From: Andrew Morton @ 2010-11-09 22:43 UTC (permalink / raw)
  To: Wu Fengguang
  Cc: Jan Kara, linux-fsdevel, linux-mm, Johannes Weiner,
	Christoph Hellwig, Jan Engelhardt, LKML

On Tue, 09 Nov 2010 07:09:20 +0800
Wu Fengguang <fengguang.wu@intel.com> wrote:

> From: Jan Kara <jack@suse.cz>
> 
> When wb_writeback() is called in WB_SYNC_ALL mode, work->nr_to_write is
> usually set to LONG_MAX. The logic in wb_writeback() then calls
> __writeback_inodes_sb() with nr_to_write == MAX_WRITEBACK_PAGES and thus
> we easily end up with negative nr_to_write after the function returns.

No, nr_to_write can only be negative if the filesystem wrote back more
pages than requested.

> wb_writeback() then decides we need another round of writeback but this
> is wrong in some cases! For example, when a single large file is
> continuously dirtied, we would never finish syncing it because each pass
> would be able to write MAX_WRITEBACK_PAGES and the inode dirty timestamp
> never gets updated (as the inode is never completely clean).

Well we shouldn't have asked the function to write LONG_MAX pages then!

The way this used to work was to try to write back N=(total dirty pages
+ total unstable pages + various fudge factors) to each superblock.  So
each superblock will get fully written back unless someone is madly
writing to it.  If that _is_ happening then we'll write a large amount
of data to it and will then give up and move onto the next superblock.
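
For reference, a sketch of that bound roughly as it survives in this
era's wb_check_old_data_flush() in fs/fs-writeback.c:

	nr_pages = global_page_state(NR_FILE_DIRTY) +
			global_page_state(NR_UNSTABLE_NFS) +
			(inodes_stat.nr_inodes - inodes_stat.nr_unused);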

But the "large amount of data" is constrained to a sane upper limit:
total amount of dirty memory plus fudge factors.  Increasing that sane
upper limit to an insane 2^63-1 pages will *of course* cause sync() to
livelock.

Why was that sane->insane change made?

> Fix the issue by setting nr_to_write to LONG_MAX in WB_SYNC_ALL mode. We
> do not need nr_to_write in WB_SYNC_ALL mode anyway since livelock
> avoidance is done differently for it.

Here the changelog should spell out what "done differently" means.
Because I really am unsure what is being referred to.

I don't really see how this patch changes anything.  For WB_SYNC_ALL
requests the code will still try to write out 2^63 pages, only it does
it all in a single writeback_inodes_wb() call.  What prevents that call
itself from getting livelocked?

Perhaps the unmentioned problem here is that each call to
writeback_inodes_wb(MAX_WRITEBACK_PAGES) will restart its walk across
the inode lists.  So instead of giving up on a being-written-to-file,
we continuously revisit it again and again and again.

Correct?  If so, please add the description.  If incorrect, please add
the description as well ;)


Root cause time: it's those damn per-sb inode lists *again*.  They're
just awful.  We need some data structure there which is more amenable
to being iterated over.  Something against which we can store cursors,
for a start.




* Re: [PATCH 5/5] writeback: check skipped pages on WB_SYNC_ALL
  2010-11-08 23:09 ` [PATCH 5/5] writeback: check skipped pages on WB_SYNC_ALL Wu Fengguang
@ 2010-11-09 22:47   ` Andrew Morton
  2010-11-09 23:16     ` Wu Fengguang
  0 siblings, 1 reply; 23+ messages in thread
From: Andrew Morton @ 2010-11-09 22:47 UTC (permalink / raw)
  To: Wu Fengguang
  Cc: Jan Kara, linux-fsdevel, linux-mm, Johannes Weiner,
	Christoph Hellwig, Jan Engelhardt, LKML

On Tue, 09 Nov 2010 07:09:21 +0800
Wu Fengguang <fengguang.wu@intel.com> wrote:

> In WB_SYNC_ALL mode, filesystems are not expected to skip dirty pages on
> temporary lock contention or non-fatal errors, otherwise sync() will
> return without actually syncing the skipped pages. Add a check to
> catch possible redirty_page_for_writepage() callers that violate this
> expectation.
> 
> I'd recommend keeping this check in the -mm tree for some time and fixing
> up the possible warnings before pushing it upstream.
> 
> Signed-off-by: Wu Fengguang <fengguang.wu@intel.com>
> ---
>  fs/fs-writeback.c |    1 +
>  1 file changed, 1 insertion(+)
> 
> --- linux-next.orig/fs/fs-writeback.c	2010-11-07 22:01:06.000000000 +0800
> +++ linux-next/fs/fs-writeback.c	2010-11-07 22:01:15.000000000 +0800
> @@ -527,6 +527,7 @@ static int writeback_sb_inodes(struct su
>  			 * buffers.  Skip this inode for now.
>  			 */
>  			redirty_tail(inode);
> +			WARN_ON_ONCE(wbc->sync_mode == WB_SYNC_ALL);
>  		}
>  		spin_unlock(&inode_lock);
>  		iput(inode);

This is quite kernel-developer-unfriendly.

Suppose the warning triggers.  Now some poor schmuck looks at the
warning and doesn't have a *clue* why it was added.  He has to run off
and grovel through git trees finding changelogs, which is a real pain
if the code has been trivially altered since it was first added.

As a general rule, a kernel developer should be able to look at a
warning callsite and then work out why the warning was emitted!


IOW, you owe us a code comment, please.


* Re: [PATCH 3/5] writeback: stop background/kupdate works from livelocking other works
  2010-11-09 22:28     ` Jan Kara
@ 2010-11-09 23:00       ` Andrew Morton
  2010-11-09 23:56         ` Jan Kara
  0 siblings, 1 reply; 23+ messages in thread
From: Andrew Morton @ 2010-11-09 23:00 UTC (permalink / raw)
  To: Jan Kara
  Cc: Wu Fengguang, linux-fsdevel, linux-mm, Johannes Weiner,
	Christoph Hellwig, Jan Engelhardt, LKML

On Tue, 9 Nov 2010 23:28:27 +0100
Jan Kara <jack@suse.cz> wrote:

>   Hi,
> 
> On Tue 09-11-10 13:13:10, Andrew Morton wrote:
> > On Tue, 09 Nov 2010 07:09:19 +0800
> > Wu Fengguang <fengguang.wu@intel.com> wrote:
> > I find the description to be somewhat incomplete...
>   OK, so let me fill in the gaps ;)
> 
> > > From: Jan Kara <jack@suse.cz>
> > > 
> > > Background writeback is easily livelockable (by the definition of its
> > > target).
> > 
> > *why* is background writeback easily livelockable?  Under which
> > circumstances does this happen and how does it come about?
> > 
> > > This is inconvenient because it can make sync(1) stall forever waiting
> > > on its queued work to be finished.
> > 
> > Again, why?  Because there are works queued from the flusher thread,
> > but that thread is stuck in a livelocked state in <unspecified code
> > location> so it is unable to service the other works?  But the process
> > which called sync() will as a last resort itself perform all the
> > required IO, will it not?  If so, how can it livelock?
>   New description which should address above questions:
> Background writeback is easily livelockable in a loop in wb_writeback() by
> a process continuously re-dirtying pages (or continuously appending to a
> > file). This is in fact intended, as the target of background writeback is to
> write dirty pages it can find as long as we are over
> dirty_background_threshold.

Well.  The objective of the kupdate function is utterly different.

> But the above behavior gets inconvenient at times because no other work
> queued in the flusher thread's queue gets processed. In particular,
> since e.g. sync(1) relies on flusher thread to do all the IO for it,

That's fixable by doing the work synchronously within sync_inodes_sb(),
rather than twiddling thumbs wasting a thread resource while waiting
for kernel threads to do it.  As an added bonus, this even makes cpu
time accounting more accurate ;)

Please remind me why we decided to hand the sync_inodes_sb() work off
to other threads?

> sync(1) can hang forever waiting for flusher thread to do the work.
> 
> Generally, when a flusher thread has some work queued, someone submitted
> the work to achieve a goal more specific than what background writeback
> > does. Moreover, by working on the specific work, we also reduce the amount of
> > dirty pages, which is exactly the target of background writeout. So it makes
> > sense to give specific work priority over generic page cleaning.
> 
> Thus we interrupt background writeback if there is some other work to do. We
> return to the background writeback after completing all the queued work.
> 
> Is it better now?
> 
> > > Generally, when a flusher thread has
> > > some work queued, someone submitted the work to achieve a goal more specific
> > > than what background writeback does. So it makes sense to give it priority
> > > over generic page cleaning.
> > > 
> > > Thus we interrupt background writeback if there is some other work to do. We
> > > return to the background writeback after completing all the queued work.
> > > 
> > > Signed-off-by: Jan Kara <jack@suse.cz>
> > > Signed-off-by: Wu Fengguang <fengguang.wu@intel.com>
> > > ---
> > >  fs/fs-writeback.c |    9 +++++++++
> > >  1 file changed, 9 insertions(+)
> > > 
> > > --- linux-next.orig/fs/fs-writeback.c	2010-11-07 21:56:42.000000000 +0800
> > > +++ linux-next/fs/fs-writeback.c	2010-11-07 22:00:51.000000000 +0800
> > > @@ -651,6 +651,15 @@ static long wb_writeback(struct bdi_writ
> > >  			break;
> > >  
> > >  		/*
> > > +		 * Background writeout and kupdate-style writeback are
> > > +		 * easily livelockable. Stop them if there is other work
> > > +		 * to do so that e.g. sync can proceed.
> > > +		 */
> > > +		if ((work->for_background || work->for_kupdate) &&
> > > +		    !list_empty(&wb->bdi->work_list))
> > > +			break;
> > > +
> > > +		/*
> > >  		 * For background writeout, stop when we are below the
> > >  		 * background dirty threshold
> > >  		 */
> > 
> > So...  what prevents higher priority works (eg, sync(1)) from
> > livelocking or seriously retarding background or kupdate writeout?
>   If work other than background or kupdate writeout livelocks, it's a bug
> which should be fixed (either by setting a sensible nr_to_write or by
> tagging like we do for WB_SYNC_ALL writeback). Of course, higher priority
> work can be running when background or kupdate writeout would need to run
> as well. But the idea here is that the purpose of the background/kupdate
> types of writeout is to get rid of dirty data, and any type of writeout
> does this; so by working on the queued work we also work on
> background/kupdate writeout, only possibly less efficiently.

The kupdate function is a data-integrity/quality-of-service sort of
thing.

And what I'm asking is whether this change enables scenarios in which
these threads can be kept so busy that the kupdate function gets
interrupted so frequently that we can have dirty memory not being
written back for arbitrarily long periods of time?



* Re: [PATCH 5/5] writeback: check skipped pages on WB_SYNC_ALL
  2010-11-09 22:47   ` Andrew Morton
@ 2010-11-09 23:16     ` Wu Fengguang
  0 siblings, 0 replies; 23+ messages in thread
From: Wu Fengguang @ 2010-11-09 23:16 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Jan Kara, linux-fsdevel, linux-mm, Johannes Weiner,
	Christoph Hellwig, Jan Engelhardt, LKML

On Wed, Nov 10, 2010 at 06:47:28AM +0800, Andrew Morton wrote:
> On Tue, 09 Nov 2010 07:09:21 +0800
> Wu Fengguang <fengguang.wu@intel.com> wrote:
> 
> > In WB_SYNC_ALL mode, filesystems are not expected to skip dirty pages on
> > temporary lock contention or non-fatal errors, otherwise sync() will
> > return without actually syncing the skipped pages. Add a check to
> > catch possible redirty_page_for_writepage() callers that violate this
> > expectation.
> > 
> > I'd recommend keeping this check in the -mm tree for some time and fixing
> > up the possible warnings before pushing it upstream.
> > 
> > Signed-off-by: Wu Fengguang <fengguang.wu@intel.com>
> > ---
> >  fs/fs-writeback.c |    1 +
> >  1 file changed, 1 insertion(+)
> > 
> > --- linux-next.orig/fs/fs-writeback.c	2010-11-07 22:01:06.000000000 +0800
> > +++ linux-next/fs/fs-writeback.c	2010-11-07 22:01:15.000000000 +0800
> > @@ -527,6 +527,7 @@ static int writeback_sb_inodes(struct su
> >  			 * buffers.  Skip this inode for now.
> >  			 */
> >  			redirty_tail(inode);
> > +			WARN_ON_ONCE(wbc->sync_mode == WB_SYNC_ALL);
> >  		}
> >  		spin_unlock(&inode_lock);
> >  		iput(inode);
> 
> This is quite kernel-developer-unfriendly.
> 
> Suppose the warning triggers.  Now some poor schmuck looks at the
> warning and doesn't have a *clue* why it was added.  He has to run off
> and grovel through git trees finding changelogs, which is a real pain
> if the code has been trivially altered since it was first added.
> 
> As a general rule, a kernel developer should be able to look at a
> warning callsite and then work out why the warning was emitted!
> 
> 
> IOW, you owe us a code comment, please.

Good point!

I'll add this comment.

+			/*
+			 * There's no logic to retry skipped pages for sync(),
+			 * filesystems are assumed not to skip dirty pages on
+			 * temporary lock contention or non-fatal errors.
+			 */
+			WARN_ON_ONCE(wbc->sync_mode == WB_SYNC_ALL);

IOW, if some FS triggers this warning and it's non-trivial to fix the
FS, we'll have to work out a sync retry scheme for skipped pages.

Thanks,
Fengguang


* Re: [PATCH 4/5] writeback: avoid livelocking WB_SYNC_ALL writeback
  2010-11-09 22:43   ` Andrew Morton
@ 2010-11-09 23:18     ` Jan Kara
  2010-11-10  2:26       ` Wu Fengguang
  0 siblings, 1 reply; 23+ messages in thread
From: Jan Kara @ 2010-11-09 23:18 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Wu Fengguang, Jan Kara, linux-fsdevel, linux-mm, Johannes Weiner,
	Christoph Hellwig, Jan Engelhardt, LKML

On Tue 09-11-10 14:43:46, Andrew Morton wrote:
> On Tue, 09 Nov 2010 07:09:20 +0800
> Wu Fengguang <fengguang.wu@intel.com> wrote:
> 
> > From: Jan Kara <jack@suse.cz>
> > 
> > When wb_writeback() is called in WB_SYNC_ALL mode, work->nr_to_write is
> > usually set to LONG_MAX. The logic in wb_writeback() then calls
> > __writeback_inodes_sb() with nr_to_write == MAX_WRITEBACK_PAGES and thus
> > we easily end up with negative nr_to_write after the function returns.
> 
> No, nr_to_write can only be negative if the filesystem wrote back more
> pages than requested.
  For some time now, write_cache_pages() does not stop when nr_to_write
<= 0 in WB_SYNC_ALL mode, as stopping there would be a possible
data-integrity issue (we could have written newly created pages but not
the ones dirtied before sync was called). So nr_to_write goes negative
rather easily in WB_SYNC_ALL mode.
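
The relevant check in write_cache_pages() reads roughly like this in this
era's mm/page-writeback.c:

	/*
	 * We stop writing back only if we are not doing integrity
	 * sync; for integrity sync we have to keep going until we
	 * have written all the pages we tagged for writeback prior
	 * to entering this loop.
	 */
	if (--wbc->nr_to_write <= 0 &&
	    wbc->sync_mode == WB_SYNC_NONE) {
		done = 1;
		break;
	}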
 
> > wb_writeback() then decides we need another round of writeback but this
> > is wrong in some cases! For example, when a single large file is
> > continuously dirtied, we would never finish syncing it because each pass
> > would be able to write MAX_WRITEBACK_PAGES and the inode dirty timestamp
> > never gets updated (as the inode is never completely clean).
> 
> Well we shouldn't have asked the function to write LONG_MAX pages then!
> 
> The way this used to work was to try to write back N=(total dirty pages
> + total unstable pages + various fudge factors) to each superblock.  So
> each superblock will get fully written back unless someone is madly
> writing to it.  If that _is_ happening then we'll write a large amount
> of data to it and will then give up and move onto the next superblock.
> 
> But the "large amount of data" is constrained to a sane upper limit:
> total amount of dirty memory plus fudge factors.  Increasing that sane
> upper limit to an insane 2^63-1 pages will *of course* cause sync() to
> livelock.
> 
> Why was that sane->insane change made?
  Note that we are speaking about WB_SYNC_ALL mode, and for the
above-mentioned data integrity reason any finite nr_to_write is just
wrong... That's why we do all that complex page-tagging livelock
avoidance in write_cache_pages().
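
(A simplified sketch of that tagging scheme, as write_cache_pages() sets
it up - index/end as computed earlier in that function, details elided:)

	int tag;

	if (wbc->sync_mode == WB_SYNC_ALL) {
		/* quickly re-tag the pages that are dirty right now */
		tag_pages_for_writeback(mapping, index, end);
		tag = PAGECACHE_TAG_TOWRITE;
	} else {
		tag = PAGECACHE_TAG_DIRTY;
	}
	/*
	 * The page lookup loop then takes only pages carrying @tag, so
	 * pages dirtied after the tagging pass are not revisited and
	 * cannot livelock a WB_SYNC_ALL sweep.
	 */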

> > Fix the issue by setting nr_to_write to LONG_MAX in WB_SYNC_ALL mode. We
> > do not need nr_to_write in WB_SYNC_ALL mode anyway since livelock
> > avoidance is done differently for it.
> 
> Here the changelog should spell out what "done differently" means. 
> > Because I really am unsure what is being referred to.
> 
> I don't really see how this patch changes anything.  For WB_SYNC_ALL
> requests the code will still try to write out 2^63 pages, only it does
> it all in a single writeback_inodes_wb() call.  What prevents that call
> itself from getting livelocked?
  I'm referring to the livelock avoidance using page tagging. Fengguang
actually added a note about this into a comment in the code but it's not
in the changelog. And you're right it should be here.

> Perhaps the unmentioned problem here is that each call to
> writeback_inodes_wb(MAX_WRITEBACK_PAGES) will restart its walk across
> the inode lists.  So instead of giving up on a being-written-to-file,
> we continuously revisit it again and again and again.
> 
> Correct?  If so, please add the description.  If incorrect, please add
> the description as well ;)
  Yes, that's the problem.

> Root cause time: it's those damn per-sb inode lists *again*.  They're
> just awful.  We need some data structure there which is more amenable
> to being iterated over.  Something against which we can store cursors,
> for a start.
  This would definitely be nice. But in this particular case, since we have
that page tagging livelock avoidance, we can just do all we need in one
big sweep, so we are OK.

Suggestion for the new changelog:
When wb_writeback() is called in WB_SYNC_ALL mode, work->nr_to_write is
usually set to LONG_MAX. The logic in wb_writeback() then calls
__writeback_inodes_sb() with nr_to_write == MAX_WRITEBACK_PAGES and
we easily end up with negative nr_to_write after the function returns.
This is because write_cache_pages() does not stop writing when
nr_to_write drops to zero in WB_SYNC_ALL mode.

When nr_to_write is <= 0, wb_writeback() decides we need another round of
writeback but this is wrong in some cases! For example, when a single large
file is continuously dirtied, we would never finish syncing it because each
pass would be able to write MAX_WRITEBACK_PAGES and the inode dirty
timestamp never gets updated (as the inode is never completely clean). Thus
__writeback_inodes_sb() would write the redirtied inode again and again.

Fix the issue by setting nr_to_write to LONG_MAX in WB_SYNC_ALL mode.  We
do not need nr_to_write in WB_SYNC_ALL mode anyway since
write_cache_pages() does livelock avoidance using page tagging in
WB_SYNC_ALL mode.

After this patch, the program from http://lkml.org/lkml/2010/10/24/154 is no
longer able to stall sync forever.
-

Is this better?

								Honza

-- 
Jan Kara <jack@suse.cz>
SUSE Labs, CR


* Re: [PATCH 3/5] writeback: stop background/kupdate works from livelocking other works
  2010-11-09 23:00       ` Andrew Morton
@ 2010-11-09 23:56         ` Jan Kara
  2010-11-10 23:37           ` Andrew Morton
  0 siblings, 1 reply; 23+ messages in thread
From: Jan Kara @ 2010-11-09 23:56 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Jan Kara, Wu Fengguang, linux-fsdevel, linux-mm, Johannes Weiner,
	Christoph Hellwig, Jan Engelhardt, LKML

On Tue 09-11-10 15:00:06, Andrew Morton wrote:
> On Tue, 9 Nov 2010 23:28:27 +0100
> Jan Kara <jack@suse.cz> wrote:
> >   New description which should address above questions:
> > Background writeback is easily livelockable in a loop in wb_writeback() by
> > a process continuously re-dirtying pages (or continuously appending to a
> > file). This is in fact intended, as the target of background writeback is to
> > write dirty pages it can find as long as we are over
> > dirty_background_threshold.
> 
> Well.  The objective of the kupdate function is utterly different.
> 
> > But the above behavior gets inconvenient at times because no other work
> > queued in the flusher thread's queue gets processed. In particular,
> > since e.g. sync(1) relies on flusher thread to do all the IO for it,
> 
> That's fixable by doing the work synchronously within sync_inodes_sb(),
> rather than twiddling thumbs wasting a thread resource while waiting
> for kernel threads to do it.  As an added bonus, this even makes cpu
> time accounting more accurate ;)
> 
> Please remind me why we decided to hand the sync_inodes_sb() work off
> to other threads?
  Because when sync(1) does IO on its own, it competes for the device with
the flusher thread running in parallel, resulting in more seeks.

> > sync(1) can hang forever waiting for flusher thread to do the work.
> > 
> > Generally, when a flusher thread has some work queued, someone submitted
> > the work to achieve a goal more specific than what background writeback
> > does. Moreover, by working on the specific work, we also reduce the amount of
> > dirty pages, which is exactly the target of background writeout. So it makes
> > sense to give specific work priority over generic page cleaning.
> > 
> > Thus we interrupt background writeback if there is some other work to do. We
> > return to the background writeback after completing all the queued work.
> > 
...
> > > So...  what prevents higher priority works (eg, sync(1)) from
> > > livelocking or seriously retarding background or kupdate writeout?
> >   If work other than background or kupdate writeout livelocks, it's a bug
> > which should be fixed (either by setting a sensible nr_to_write or by
> > tagging like we do for WB_SYNC_ALL writeback). Of course, higher priority
> > work can be running when background or kupdate writeout would need to run
> > as well. But the idea here is that the purpose of the background/kupdate
> > types of writeout is to get rid of dirty data, and any type of writeout
> > does this; so by working on the queued work we also work on
> > background/kupdate writeout, only possibly less efficiently.
> 
> The kupdate function is a data-integrity/quality-of-service sort of
> thing.
> 
> And what I'm asking is whether this change enables scenarios in which
> these threads can be kept so busy that the kupdate function gets
> interrupted so frequently that we can have dirty memory not being
> written back for arbitrarily long periods of time?
  So let me compare:
What kupdate writeback does:
  queue inodes older than dirty_expire_centisecs
  while some inode in the queue
    write MAX_WRITEBACK_PAGES from each inode queued
    break if nr_to_write <= 0

What any other WB_SYNC_NONE writeback (let me call it "normal WB_SYNC_NONE
writeback") does:
  queue all dirty inodes 
  while some inode in the queue
    write MAX_WRITEBACK_PAGES from each inode queued
    break if nr_to_write <= 0


There is only one kind of WB_SYNC_ALL writeback - the one which writes
everything.

So after WB_SYNC_ALL writeback (provided all livelocks are fixed ;)
obviously no old data should be left unwritten in memory. Normal
WB_SYNC_NONE writeback differs from a kupdate one *only* in the fact that
we queue all inodes instead of only the old ones. We start writing old
inodes first and go inode by inode writing MAX_WRITEBACK_PAGES from each.
Now because the queue can be longer for normal WB_SYNC_NONE writeback, it
can take longer before we return to the old inodes. So if normal writeback
interrupts a kupdate one, it can take longer before all data of the old
inodes gets to disk. But we always get the old data to disk - essentially
at the same time at which kupdate writeback would get it to disk if
dirty_expire_centisecs was 0.

Is this enough? Do you want any of this in the changelog?

Thanks for the inquiry btw. It made me clean up my thoughts on the subject ;)

								Honza
-- 
Jan Kara <jack@suse.cz>
SUSE Labs, CR


* Re: [PATCH 4/5] writeback: avoid livelocking WB_SYNC_ALL writeback
  2010-11-09 23:18     ` Jan Kara
@ 2010-11-10  2:26       ` Wu Fengguang
  0 siblings, 0 replies; 23+ messages in thread
From: Wu Fengguang @ 2010-11-10  2:26 UTC (permalink / raw)
  To: Jan Kara
  Cc: Andrew Morton, linux-fsdevel, linux-mm, Johannes Weiner,
	Christoph Hellwig, Jan Engelhardt, LKML

On Wed, Nov 10, 2010 at 07:18:40AM +0800, Jan Kara wrote:
> On Tue 09-11-10 14:43:46, Andrew Morton wrote:

> > I don't really see how this patch changes anything.  For WB_SYNC_ALL
> > requests the code will still try to write out 2^63 pages, only it does
> > it all in a single writeback_inodes_wb() call.  What prevents that call

Sorry, sync() works on one super block after another, so it's actually some
__writeback_inodes_sb() call. I'll update the comment.

> > itself from getting livelocked?

__writeback_inodes_sb() livelock is prevented by

- working on a finite set of files by doing queue_io() once at the beginning
- working on a finite set of pages by PAGECACHE_TAG_TOWRITE page tagging
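
Sketching the tagging part from memory (the real thing lives in
write_cache_pages() in mm/page-writeback.c):

	if (wbc->sync_mode == WB_SYNC_ALL) {
		/* snapshot the pages dirty at this moment */
		tag_pages_for_writeback(mapping, index, end);
		tag = PAGECACHE_TAG_TOWRITE;
	} else
		tag = PAGECACHE_TAG_DIRTY;
	while (!done && (index <= end)) {
		/* pages dirtied after the snapshot are not visited */
		nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag,
			      min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1);
		...
	}

Pages dirtied after tag_pages_for_writeback() only carry the DIRTY tag,
not the TOWRITE one, so a continuously re-dirtied file cannot extend the
sweep forever.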

>   I'm referring to the livelock avoidance using page tagging. Fengguang
> actually added a note about this in a comment in the code but it's not
> in the changelog. And you're right, it should be here.

OK, I'll add the above to changelog.

> > Perhaps the unmentioned problem here is that each call to
> > writeback_inodes_wb(MAX_WRITEBACK_PAGES) will restart its walk across
> > the inode lists.  So instead of giving up on a being-written-to-file,
> > we continuously revisit it again and again and again.
> > 
> > Correct?  If so, please add the description.  If incorrect, please add
> > the description as well ;)
>   Yes, that's the problem.

writeback_inodes_wb(MAX_WRITEBACK_PAGES) will put an inode that is not
fully written to the head of b_more_io, and pick up the next inode from
the tail of b_io the next time it is called. Here the tail of b_io serves
as the cursor.

         b_io             b_more_io
        |----------------|-----------------|
        ^head            ^cursor           ^tail
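
In code the deferral is trivial (sketch only; the real requeue_io() takes
just the inode and looks up the bdi itself):

	/* leave a not-fully-written inode for the next round */
	static void requeue_io(struct bdi_writeback *wb, struct inode *inode)
	{
		list_move(&inode->i_wb_list, &wb->b_more_io);
	}

and when b_io runs empty, queue_io() splices b_more_io back so the
deferred inodes get another turn.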

> > Root cause time: it's those damn per-sb inode lists *again*.  They're
> > just awful.  We need some data structure there which is more amenable
> > to being iterated over.  Something against which we can store cursors,
> > for a start.
>   This would definitely be nice. But in this particular case, since we have
> that page tagging livelock avoidance, we can just do all we need in one
> big sweep, so we are OK.

The main problem with list_head is the awkward superblock walks in
move_expired_inodes(). They may hold inode_lock for too long.

It would help to break up b_dirty into an rb-tree. That would also make
redirty_tail() more straightforward.
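
Something along these lines (purely illustrative -- the i_dirty_node
field does not exist, and locking is ignored):

	/* keep b_dirty ordered by dirtied_when in an rb-tree */
	static void dirty_tree_insert(struct rb_root *root, struct inode *inode)
	{
		struct rb_node **p = &root->rb_node, *parent = NULL;

		while (*p) {
			struct inode *i = rb_entry(*p, struct inode,
						   i_dirty_node);

			parent = *p;
			if (time_before(inode->dirtied_when, i->dirtied_when))
				p = &(*p)->rb_left;
			else
				p = &(*p)->rb_right;
		}
		rb_link_node(&inode->i_dirty_node, parent, p);
		rb_insert_color(&inode->i_dirty_node, root);
	}

redirty_tail() would then be an erase plus re-insert with the new
dirtied_when, with no need to fake timestamps to keep a list sorted.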

> Suggestion for the new changelog:
> When wb_writeback() is called in WB_SYNC_ALL mode, work->nr_to_write is
> usually set to LONG_MAX. The logic in wb_writeback() then calls
> __writeback_inodes_sb() with nr_to_write == MAX_WRITEBACK_PAGES and

> we easily end up with negative nr_to_write after the function returns.
> This is because write_cache_pages() does not stop writing when
> nr_to_write drops to zero in WB_SYNC_ALL mode.

It will return with (nr_to_write <= 0) regardless of the
write_cache_pages() trick of ignoring nr_to_write in WB_SYNC_ALL mode.
So I changed the above to:

        we easily end up with non-positive nr_to_write after the function
        returns, if the inode has more than MAX_WRITEBACK_PAGES dirty pages
        at the moment.
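
The reason is this check in write_cache_pages() (quoting roughly from
memory):

		if (--wbc->nr_to_write <= 0 &&
		    wbc->sync_mode == WB_SYNC_NONE) {
			done = 1;
			break;
		}

i.e. in WB_SYNC_ALL mode nr_to_write keeps being decremented below zero
until the tagged pages are exhausted.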

Others look good. I'll repost the series with an updated changelog.

Thanks,
Fengguang

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH 3/5] writeback: stop background/kupdate works from livelocking other works
  2010-11-09 23:56         ` Jan Kara
@ 2010-11-10 23:37           ` Andrew Morton
  2010-11-11  0:40             ` Wu Fengguang
  2010-11-11 16:44             ` Jan Kara
  0 siblings, 2 replies; 23+ messages in thread
From: Andrew Morton @ 2010-11-10 23:37 UTC (permalink / raw)
  To: Jan Kara
  Cc: Wu Fengguang, linux-fsdevel, linux-mm, Johannes Weiner,
	Christoph Hellwig, Jan Engelhardt, LKML

On Wed, 10 Nov 2010 00:56:32 +0100
Jan Kara <jack@suse.cz> wrote:

> On Tue 09-11-10 15:00:06, Andrew Morton wrote:
> > On Tue, 9 Nov 2010 23:28:27 +0100
> > Jan Kara <jack@suse.cz> wrote:
> > >   New description which should address above questions:
> > > Background writeback is easily livelockable in a loop in wb_writeback() by
> > > a process continuously re-dirtying pages (or continuously appending to a
> > > file). This is in fact intended, as the target of background writeback is to
> > > write dirty pages it can find as long as we are over
> > > dirty_background_threshold.
> > 
> > Well.  The objective of the kupdate function is utterly different.
> > 
> > > But the above behavior gets inconvenient at times because no other work
> > > queued in the flusher thread's queue gets processed. In particular,
> > > since e.g. sync(1) relies on the flusher thread to do all the IO for it,
> > 
> > That's fixable by doing the work synchronously within sync_inodes_sb(),
> > rather than twiddling thumbs wasting a thread resource while waiting
> > for kernel threads to do it.  As an added bonus, this even makes cpu
> > time accounting more accurate ;)
> > 
> > Please remind me why we decided to hand the sync_inodes_sb() work off
> > to other threads?
>   Because when sync(1) does IO on its own, it competes for the device with
> the flusher thread running in parallel, thus resulting in more seeks.

Skeptical.  Has that effect been demonstrated?  Has it been shown to be
a significant problem?  A worse problem than livelocking the machine? ;)

If this _is_ a problem then it's also a problem for fsync/msync.  But
see below.

> > > sync(1) can hang forever waiting for the flusher thread to do the work.
> > > 
> > > Generally, when a flusher thread has some work queued, someone submitted
> > > the work to achieve a goal more specific than what background writeback
> > > does. Moreover, by working on the specific work, we also reduce the amount
> > > of dirty pages, which is exactly the target of background writeout. So it
> > > makes sense to give specific work priority over generic page cleaning.
> > > 
> > > Thus we interrupt background writeback if there is some other work to do. We
> > > return to the background writeback after completing all the queued work.
> > > 
> ...
> > > > So...  what prevents higher priority works (eg, sync(1)) from
> > > > livelocking or seriously retarding background or kupdate writeout?
> > >   If work other than background or kupdate writeout livelocks, it's a bug
> > > which should be fixed (either by setting a sensible nr_to_write or by
> > > tagging like we do for WB_SYNC_ALL writeback). Of course, higher priority
> > > work can be running when background or kupdate writeout would need to run
> > > as well. But the idea here is that the purpose of background/kupdate types
> > > of writeout is to get rid of dirty data, and any type of writeout does
> > > this; so by working on it we also work toward background/kupdate writeout,
> > > only possibly less efficiently.
> > 
> > The kupdate function is a data-integrity/quality-of-service sort of
> > thing.
> > 
> > And what I'm asking is whether this change enables scenarios in which
> > these threads can be kept so busy that the kupdate function gets
> > interrupted so frequently that we can have dirty memory not being
> > written back for arbitrarily long periods of time?
>   So let me compare:
> What kupdate writeback does:
>   queue inodes older than dirty_expire_centisecs
>   while some inode in the queue
>     write MAX_WRITEBACK_PAGES from each inode queued
>     break if nr_to_write <= 0
> 
> What any other WB_SYNC_NONE writeback (let me call it "normal WB_SYNC_NONE
> writeback") does:
>   queue all dirty inodes 
>   while some inode in the queue
>     write MAX_WRITEBACK_PAGES from each inode queued
>     break if nr_to_write <= 0
> 
> 
> There is only one kind of WB_SYNC_ALL writeback - the one which writes
> everything.

fsync() uses WB_SYNC_ALL and it doesn't write "everything".

> So after WB_SYNC_ALL writeback (provided all livelocks are fixed ;)
> obviously no old data should be unwritten in memory. Normal WB_SYNC_NONE
> writeback differs from a kupdate one *only* in the fact that we queue all
> inodes instead of only the old ones.

whoa.

That's only true for sync(), or sync_filesystem().  It isn't true for,
say, fsync() and msync().  Now when someone comes along and changes
fsync/msync to use flusher threads ("resulting in less seeks") then we
could get into a situation where fsync-serving flusher threads never
serve kupdate?

> We start writing old inodes first and

OT, but: your faith in those time-ordered inode lists is touching ;)
Put a debug function in there which checks that the lists _are_
time-ordered, and call that function from every site in the kernel
which modifies the lists.   I bet there are still gremlins.
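
Something as simple as this would do (untested sketch; assumes b_dirty
is kept newest-first and that the caller holds the relevant lock):

	static void check_time_ordering(struct list_head *head)
	{
		struct inode *inode, *prev = NULL;

		list_for_each_entry(inode, head, i_wb_list) {
			if (prev)
				WARN_ON_ONCE(time_after(inode->dirtied_when,
							prev->dirtied_when));
			prev = inode;
		}
	}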

> go inode by inode writing MAX_WRITEBACK_PAGES from each. Now because the
> queue can be longer for normal WB_SYNC_NONE writeback, it can take longer
> before we return to the old inodes. So if normal writeback interrupts a
> kupdate one, it can take longer before all data of old inodes gets to disk.
> But we always get the old data to disk - essentially at the same time at
> which kupdate writeback would get it to disk if dirty_expire_centisecs
> was 0.
> 
> Is this enough? Do you want any of this in the changelog?
> 
> Thanks for the inquiry btw. It made me clean up my thoughts on the subject ;)
> 


^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH 3/5] writeback: stop background/kupdate works from livelocking other works
  2010-11-10 23:37           ` Andrew Morton
@ 2010-11-11  0:40             ` Wu Fengguang
  2010-11-11 13:32               ` Christoph Hellwig
  2010-11-11 16:44             ` Jan Kara
  1 sibling, 1 reply; 23+ messages in thread
From: Wu Fengguang @ 2010-11-11  0:40 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Jan Kara, linux-fsdevel, linux-mm, Johannes Weiner,
	Christoph Hellwig, Jan Engelhardt, LKML

On Thu, Nov 11, 2010 at 07:37:29AM +0800, Andrew Morton wrote:
> On Wed, 10 Nov 2010 00:56:32 +0100
> Jan Kara <jack@suse.cz> wrote:
> 
> > On Tue 09-11-10 15:00:06, Andrew Morton wrote:
> > > On Tue, 9 Nov 2010 23:28:27 +0100
> > > Jan Kara <jack@suse.cz> wrote:
> > > >   New description which should address above questions:
> > > > Background writeback is easily livelockable in a loop in wb_writeback() by
> > > > a process continuously re-dirtying pages (or continuously appending to a
> > > > file). This is in fact intended, as the target of background writeback is to
> > > > write dirty pages it can find as long as we are over
> > > > dirty_background_threshold.
> > > 
> > > Well.  The objective of the kupdate function is utterly different.
> > > 
> > > > But the above behavior gets inconvenient at times because no other work
> > > > queued in the flusher thread's queue gets processed. In particular,
> > > > since e.g. sync(1) relies on the flusher thread to do all the IO for it,
> > > 
> > > That's fixable by doing the work synchronously within sync_inodes_sb(),
> > > rather than twiddling thumbs wasting a thread resource while waiting
> > > for kernel threads to do it.  As an added bonus, this even makes cpu
> > > time accounting more accurate ;)
> > > 
> > > Please remind me why we decided to hand the sync_inodes_sb() work off
> > > to other threads?
> >   Because when sync(1) does IO on its own, it competes for the device with
> > the flusher thread running in parallel, thus resulting in more seeks.
> 
> Skeptical.  Has that effect been demonstrated?  Has it been shown to be
> a significant problem?  A worse problem than livelocking the machine? ;)
> 
> If this _is_ a problem then it's also a problem for fsync/msync.  But
> see below.

Seriously, I also doubt the value of doing sync() in the flusher thread.
sync() is by definition inefficient. In the block layer, it's served
with less emphasis on throughput. In the VFS layer, it may sleep in
inode_wait_for_writeback() and filemap_fdatawait(). In various filesystems,
pages won't be skipped, at the cost of more lock waiting.

So when a flusher thread is serving sync(), it has difficulties
saturating the storage device.

btw, it seems current sync() does not take advantage of the flusher
threads to sync multiple disks in parallel.

And I guess (concurrent) sync/fsync/msync calls will be rare,
especially for really performance demanding workloads (which will
optimize sync away in the first place).

And I'm still worried that the sync work (which may take a long time
to serve even without livelock) could delay other works considerably --
it may not be a problem for now, but it will be a real priority dilemma
when we start writeback works from pageout().

> OT, but: your faith in those time-ordered inode lists is touching ;)
> Put a debug function in there which checks that the lists _are_
> time-ordered, and call that function from every site in the kernel
> which modifies the lists.   I bet there are still gremlins.

I'm more confident about that time ordering ;) But there is a caveat:
redirty_tail() may touch dirtied_when, so it merely keeps b_dirty
time-ordered on the surface.
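
For reference, redirty_tail() does roughly this (from memory):

	static void redirty_tail(struct inode *inode)
	{
		struct bdi_writeback *wb = &inode_to_bdi(inode)->wb;

		if (!list_empty(&wb->b_dirty)) {
			struct inode *tail = wb_inode(wb->b_dirty.next);

			/* fake the timestamp to keep the list "sorted" */
			if (time_before(inode->dirtied_when,
					tail->dirtied_when))
				inode->dirtied_when = jiffies;
		}
		list_move(&inode->i_wb_list, &wb->b_dirty);
	}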

Thanks,
Fengguang

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH 3/5] writeback: stop background/kupdate works from livelocking other works
  2010-11-11  0:40             ` Wu Fengguang
@ 2010-11-11 13:32               ` Christoph Hellwig
  0 siblings, 0 replies; 23+ messages in thread
From: Christoph Hellwig @ 2010-11-11 13:32 UTC (permalink / raw)
  To: Wu Fengguang
  Cc: Andrew Morton, Jan Kara, linux-fsdevel, linux-mm,
	Johannes Weiner, Christoph Hellwig, Jan Engelhardt, LKML

On Thu, Nov 11, 2010 at 08:40:47AM +0800, Wu Fengguang wrote:
> Seriously, I also doubt the value of doing sync() in the flusher thread.
> sync() is by definition inefficient. In the block layer, it's served
> with less emphasis on throughput. In the VFS layer, it may sleep in
> inode_wait_for_writeback() and filemap_fdatawait(). In various filesystems,
> pages won't be skipped, at the cost of more lock waiting.
> 
> So when a flusher thread is serving sync(), it has difficulties
> saturating the storage device.
> 
> btw, it seems current sync() does not take advantage of the flusher
> threads to sync multiple disks in parallel.

sys_sync does a wakeup_flusher_threads(0) which kicks all flusher
threads to write back all data.  We then do another serialized pass
kicking per-sb writeback, and then do synchronous per-sb writeback,
intertwined with non-blocking and blocking quota sync, ->sync_fs
and __sync_blockdev calls.
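
For reference, the callchain behind each of those per-sb passes is
__sync_filesystem(), which looks more or less like this (abridged from
memory):

	static int __sync_filesystem(struct super_block *sb, int wait)
	{
		if (sb->s_qcop && sb->s_qcop->quota_sync)
			sb->s_qcop->quota_sync(sb, -1, wait);

		if (wait)
			sync_inodes_sb(sb);
		else
			writeback_inodes_sb(sb);

		if (sb->s_op->sync_fs)
			sb->s_op->sync_fs(sb, wait);
		return __sync_blockdev(sb->s_bdev, wait);
	}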

The way sync ended up looking the way it does is rather historic:

 First Jan and I fixed sync to actually get data correctly to disk;
 the way it was done traditionally had a lot of issues with the
 ordering of its steps.  For that we changed it to just loop around
 sync_filesystem to have one common place to define the proper order.
 That caused a large performance regression, which Yanmin Zhang found
 and fixed by adding back the wakeup_pdflush(0) (which later became
 wakeup_flusher_threads).

 The introduction of the per-bdi writeback threads by Jens changed
 writeback_inodes_sb and sync_inodes_sb to offload the I/O submission
 to the I/O thread.

I'm not overly happy with the current situation.  Part of that is
the rather complex callchain in __sync_filesystem.  If we moved the
quota sync and the sync_blockdev into ->sync_fs we'd already be down
to a much more manageable level, and we could optimize sync down to:


	wakeup_flusher_threads(0);

	for_each_sb
		sb->sync_fs(sb, 0)

	for_each_sb {
		sync_inodes_sb(sb);
		sb->sync_fs(sb, 1)
	}
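
In terms of today's helpers that would be something like (untested
sketch, error handling omitted; for_each_sb above is pseudo-code):

	static void sync_fs_nowait(struct super_block *sb, void *unused)
	{
		if (sb->s_op->sync_fs)
			sb->s_op->sync_fs(sb, 0);
	}

	static void sync_fs_wait(struct super_block *sb, void *unused)
	{
		sync_inodes_sb(sb);
		if (sb->s_op->sync_fs)
			sb->s_op->sync_fs(sb, 1);
	}

	SYSCALL_DEFINE0(sync)
	{
		wakeup_flusher_threads(0);
		iterate_supers(sync_fs_nowait, NULL);
		iterate_supers(sync_fs_wait, NULL);
		return 0;
	}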

We would still try to do most of the I/O from the flusher thread,
but only if we can do it non-blocking.  If we have to block we'll
get less optimal I/O patterns, but at least we don't block other
writeback while waiting for it.

I suspect a big problem for the starving workloads is that we only
do the livelock-avoidance tagging inside sync_inodes_sb, so
any page written by wakeup_flusher_threads, or by writeback_inodes_sb
in the first two passes, that gets redirtied is synced out again.

But I'd feel very wary doing this without a lot of performance testing.
dpkg package install workloads, the ffsb create_4k test Yanmin used,
or fs_mark in one of its sync-using versions would be good benchmarks.

Btw, where in the block I/O code do we penalize sync?


I don't think moving the I/O submission into the caller is going to
help us any.  What we should look into instead is making as much of
the I/O submission as possible non-blocking, even inside sync.

> And I guess (concurrent) sync/fsync/msync calls will be rare,
> especially for really performance demanding workloads (which will
> optimize sync away in the first place).

There is no way to optimize a sync away if you want your data on disk.

The most common case is fsync, followed by O_SYNC, but due to the
massive fsync suckage on ext3, dpkg for example has switched to
sync instead, which is quite nasty if you have other work going on.

Offloading fsync to the flusher thread is an interesting idea, but I
wonder how well it works.  Both fsync and sync are calls the application
waits on to make progress, so naturally we gave them some preference
to decrease their latency while penalizing background writeback.  By
first trying an asynchronous pass via the flusher thread we could
optimize the I/O pattern, but at a huge cost to the latency for
the caller.



^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH 3/5] writeback: stop background/kupdate works from livelocking other works
  2010-11-10 23:37           ` Andrew Morton
  2010-11-11  0:40             ` Wu Fengguang
@ 2010-11-11 16:44             ` Jan Kara
  1 sibling, 0 replies; 23+ messages in thread
From: Jan Kara @ 2010-11-11 16:44 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Jan Kara, Wu Fengguang, linux-fsdevel, linux-mm, Johannes Weiner,
	Christoph Hellwig, Jan Engelhardt, LKML

On Wed 10-11-10 15:37:29, Andrew Morton wrote:
> On Wed, 10 Nov 2010 00:56:32 +0100
> Jan Kara <jack@suse.cz> wrote:
> > On Tue 09-11-10 15:00:06, Andrew Morton wrote:
> > > On Tue, 9 Nov 2010 23:28:27 +0100
> > > Jan Kara <jack@suse.cz> wrote:
> > > >   New description which should address above questions:
> > > > Background writeback is easily livelockable in a loop in wb_writeback() by
> > > > a process continuously re-dirtying pages (or continuously appending to a
> > > > file). This is in fact intended, as the target of background writeback is to
> > > > write dirty pages it can find as long as we are over
> > > > dirty_background_threshold.
> > > 
> > > Well.  The objective of the kupdate function is utterly different.
> > > 
> > > > But the above behavior gets inconvenient at times because no other work
> > > > queued in the flusher thread's queue gets processed. In particular,
> > > > since e.g. sync(1) relies on the flusher thread to do all the IO for it,
> > > 
> > > That's fixable by doing the work synchronously within sync_inodes_sb(),
> > > rather than twiddling thumbs wasting a thread resource while waiting
> > > for kernel threads to do it.  As an added bonus, this even makes cpu
> > > time accounting more accurate ;)
> > > 
> > > Please remind me why we decided to hand the sync_inodes_sb() work off
> > > to other threads?
> >   Because when sync(1) does IO on its own, it competes for the device with
> > the flusher thread running in parallel, thus resulting in more seeks.
> 
> Skeptical.  Has that effect been demonstrated?  Has it been shown to be
> a significant problem?  A worse problem than livelocking the machine? ;)
  I don't think Jens tested this when merging the per-bdi writeback
patches. It's just that the argument makes sense to me (I've seen some
loads where an application doing IO from balance_dirty_pages() together
with pdflush doing writeback seriously harmed performance, and this looks
like a similar scenario). But I'd also be interested in whether theory and
practice match here, so I'll try to do some tests comparing the two
approaches.

> If this _is_ a problem then it's also a problem for fsync/msync.  But
> see below.
> 
> > > > sync(1) can hang forever waiting for the flusher thread to do the work.
> > > > 
> > > > Generally, when a flusher thread has some work queued, someone submitted
> > > > the work to achieve a goal more specific than what background writeback
> > > > does. Moreover, by working on the specific work, we also reduce the amount
> > > > of dirty pages, which is exactly the target of background writeout. So it
> > > > makes sense to give specific work priority over generic page cleaning.
> > > > 
> > > > Thus we interrupt background writeback if there is some other work to do. We
> > > > return to the background writeback after completing all the queued work.
> > > > 
> > ...
> > > > > So...  what prevents higher priority works (eg, sync(1)) from
> > > > > livelocking or seriously retarding background or kupdate writeout?
> > > >   If work other than background or kupdate writeout livelocks, it's a bug
> > > > which should be fixed (either by setting a sensible nr_to_write or by
> > > > tagging like we do for WB_SYNC_ALL writeback). Of course, higher priority
> > > > work can be running when background or kupdate writeout would need to run
> > > > as well. But the idea here is that the purpose of background/kupdate types
> > > > of writeout is to get rid of dirty data, and any type of writeout does
> > > > this; so by working on it we also work toward background/kupdate writeout,
> > > > only possibly less efficiently.
> > > 
> > > The kupdate function is a data-integrity/quality-of-service sort of
> > > thing.
> > > 
> > > And what I'm asking is whether this change enables scenarios in which
> > > these threads can be kept so busy that the kupdate function gets
> > > interrupted so frequently that we can have dirty memory not being
> > > written back for arbitrarily long periods of time?
> >   So let me compare:
> > What kupdate writeback does:
> >   queue inodes older than dirty_expire_centisecs
> >   while some inode in the queue
> >     write MAX_WRITEBACK_PAGES from each inode queued
> >     break if nr_to_write <= 0
> > 
> > What any other WB_SYNC_NONE writeback (let me call it "normal WB_SYNC_NONE
> > writeback") does:
> >   queue all dirty inodes 
> >   while some inode in the queue
> >     write MAX_WRITEBACK_PAGES from each inode queued
> >     break if nr_to_write <= 0
> > 
> > 
> > There is only one kind of WB_SYNC_ALL writeback - the one which writes
> > everything.
> 
> fsync() uses WB_SYNC_ALL and it doesn't write "everything".
  Yes, but nobody submits fsync() work to a flusher thread (currently,
there's no way to tell the flusher thread to operate only on a single
inode). But as you write below, *if* somebody started doing that and
fsync() were called often enough, we would have a problem.

> > So after WB_SYNC_ALL writeback (provided all livelocks are fixed ;)
> > obviously no old data should be unwritten in memory. Normal WB_SYNC_NONE
> > writeback differs from a kupdate one *only* in the fact that we queue all
> > inodes instead of only the old ones.
> 
> whoa.
> 
> That's only true for sync(), or sync_filesystem().  It isn't true for,
> say, fsync() and msync().  Now when someone comes along and changes
> fsync/msync to use flusher threads ("resulting in less seeks") then we
> could get into a situation where fsync-serving flusher threads never
> serve kupdate?
> 
> > We start writing old inodes first and
> 
> OT, but: your faith in those time-ordered inode lists is touching ;)
> Put a debug function in there which checks that the lists _are_
> time-ordered, and call that function from every site in the kernel
> which modifies the lists.   I bet there are still gremlins.
  ;-)

								Honza
-- 
Jan Kara <jack@suse.cz>
SUSE Labs, CR

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH 3/5] writeback: stop background/kupdate works from livelocking other works
  2010-11-10  3:55   ` Wu Fengguang
@ 2010-11-10 16:26     ` Jan Kara
  0 siblings, 0 replies; 23+ messages in thread
From: Jan Kara @ 2010-11-10 16:26 UTC (permalink / raw)
  To: Wu Fengguang
  Cc: Andrew Morton, Jan Kara, linux-fsdevel, linux-mm,
	Johannes Weiner, Christoph Hellwig, Jan Engelhardt, LKML

On Wed 10-11-10 11:55:16, Wu Fengguang wrote:
> Jan, the below comment is also updated, please double check.
  Thanks! The comment looks OK.

> >  		/*
> > +		 * Background writeout and kupdate-style writeback may
> > +		 * run forever. Stop them if there is other work to do
> > +		 * so that e.g. sync can proceed. They'll be restarted
> > +		 * after the other works are all done.
> > +		 */
> > +		if ((work->for_background || work->for_kupdate) &&
> > +		    !list_empty(&wb->bdi->work_list))
> > +			break;
> > +
> > +		/*
> >  		 * For background writeout, stop when we are below the
> >  		 * background dirty threshold
> >  		 */

								Honza
-- 
Jan Kara <jack@suse.cz>
SUSE Labs, CR

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH 3/5] writeback: stop background/kupdate works from livelocking other works
  2010-11-10  2:35 ` [PATCH 3/5] writeback: stop background/kupdate works from livelocking other works Wu Fengguang
@ 2010-11-10  3:55   ` Wu Fengguang
  2010-11-10 16:26     ` Jan Kara
  0 siblings, 1 reply; 23+ messages in thread
From: Wu Fengguang @ 2010-11-10  3:55 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Jan Kara, linux-fsdevel, linux-mm, Johannes Weiner,
	Christoph Hellwig, Jan Engelhardt, LKML

Jan, the below comment is also updated, please double check.

>  
>  		/*
> +		 * Background writeout and kupdate-style writeback may
> +		 * run forever. Stop them if there is other work to do
> +		 * so that e.g. sync can proceed. They'll be restarted
> +		 * after the other works are all done.
> +		 */
> +		if ((work->for_background || work->for_kupdate) &&
> +		    !list_empty(&wb->bdi->work_list))
> +			break;
> +
> +		/*
>  		 * For background writeout, stop when we are below the
>  		 * background dirty threshold
>  		 */
> 

^ permalink raw reply	[flat|nested] 23+ messages in thread

* [PATCH 3/5] writeback: stop background/kupdate works from livelocking other works
  2010-11-10  2:35 [PATCH 0/5] writeback livelock fixes v2 Wu Fengguang
@ 2010-11-10  2:35 ` Wu Fengguang
  2010-11-10  3:55   ` Wu Fengguang
  0 siblings, 1 reply; 23+ messages in thread
From: Wu Fengguang @ 2010-11-10  2:35 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Jan Kara, linux-fsdevel, linux-mm, Johannes Weiner, Wu Fengguang,
	Christoph Hellwig, Jan Engelhardt, LKML

[-- Attachment #1: 0002-mm-Stop-background-writeback-if-there-is-other-work-.patch --]
[-- Type: text/plain, Size: 2125 bytes --]

From: Jan Kara <jack@suse.cz>

Background writeback is easily livelockable in a loop in wb_writeback()
by a process continuously re-dirtying pages (or continuously appending
to a file). This is in fact intended, as the target of background
writeback is to write dirty pages it can find as long as we are over
dirty_background_threshold.

But the above behavior gets inconvenient at times because no other work
queued in the flusher thread's queue gets processed. In particular,
since e.g. sync(1) relies on the flusher thread to do all the IO for it,
sync(1) can hang forever waiting for the flusher thread to do the work.

Generally, when a flusher thread has some work queued, someone submitted
the work to achieve a goal more specific than what background writeback
does. Moreover, by working on the specific work, we also reduce the
amount of dirty pages, which is exactly the target of background
writeout. So it makes sense to give specific work priority over generic
page cleaning.

Thus we interrupt background writeback if there is some other work to
do. We return to the background writeback after completing all the
queued work.

This may delay the writeback of expired inodes for a while; however, the
expired inodes will eventually be flushed to disk as long as the other
works don't livelock.

Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Wu Fengguang <fengguang.wu@intel.com>
---
 fs/fs-writeback.c |   10 ++++++++++
 1 file changed, 10 insertions(+)

--- linux-next.orig/fs/fs-writeback.c	2010-11-10 07:04:34.000000000 +0800
+++ linux-next/fs/fs-writeback.c	2010-11-10 10:32:09.000000000 +0800
@@ -651,6 +651,16 @@ static long wb_writeback(struct bdi_writ
 			break;
 
 		/*
+		 * Background writeout and kupdate-style writeback may
+		 * run forever. Stop them if there is other work to do
+		 * so that e.g. sync can proceed. They'll be restarted
+		 * after the other works are all done.
+		 */
+		if ((work->for_background || work->for_kupdate) &&
+		    !list_empty(&wb->bdi->work_list))
+			break;
+
+		/*
 		 * For background writeout, stop when we are below the
 		 * background dirty threshold
 		 */



^ permalink raw reply	[flat|nested] 23+ messages in thread

end of thread, other threads:[~2010-11-11 16:44 UTC | newest]

Thread overview: 23+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2010-11-08 23:09 [PATCH 0/5] writeback livelock fixes Wu Fengguang
2010-11-08 23:09 ` [PATCH 1/5] writeback: integrated background writeback work Wu Fengguang
2010-11-08 23:09 ` [PATCH 2/5] writeback: trace wakeup event for background writeback Wu Fengguang
2010-11-08 23:09 ` [PATCH 3/5] writeback: stop background/kupdate works from livelocking other works Wu Fengguang
2010-11-09 21:13   ` Andrew Morton
2010-11-09 22:28     ` Jan Kara
2010-11-09 23:00       ` Andrew Morton
2010-11-09 23:56         ` Jan Kara
2010-11-10 23:37           ` Andrew Morton
2010-11-11  0:40             ` Wu Fengguang
2010-11-11 13:32               ` Christoph Hellwig
2010-11-11 16:44             ` Jan Kara
2010-11-08 23:09 ` [PATCH 4/5] writeback: avoid livelocking WB_SYNC_ALL writeback Wu Fengguang
2010-11-09 22:43   ` Andrew Morton
2010-11-09 23:18     ` Jan Kara
2010-11-10  2:26       ` Wu Fengguang
2010-11-08 23:09 ` [PATCH 5/5] writeback: check skipped pages on WB_SYNC_ALL Wu Fengguang
2010-11-09 22:47   ` Andrew Morton
2010-11-09 23:16     ` Wu Fengguang
2010-11-08 23:23 ` [PATCH 0/5] writeback livelock fixes Wu Fengguang
2010-11-10  2:35 [PATCH 0/5] writeback livelock fixes v2 Wu Fengguang
2010-11-10  2:35 ` [PATCH 3/5] writeback: stop background/kupdate works from livelocking other works Wu Fengguang
2010-11-10  3:55   ` Wu Fengguang
2010-11-10 16:26     ` Jan Kara
