linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH 0/3] sync livelock fixes
@ 2011-04-30 22:36 Wu Fengguang
  2011-04-30 22:36 ` [PATCH 1/3] writeback: introduce wbc.for_sync to cover the two sync stages Wu Fengguang
                   ` (2 more replies)
  0 siblings, 3 replies; 6+ messages in thread
From: Wu Fengguang @ 2011-04-30 22:36 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Jan Kara, Dave Chinner, Christoph Hellwig, Wu Fengguang, LKML,
	linux-fsdevel


Andrew,

I wrote a simple script to test sync livelock and this patchset is working as
expected:

	sync time: 2
	Dirty:             26492 kB
	Writeback:         30260 kB
	NFS_Unstable:          0 kB
	WritebackTmp:          0 kB
	sync NOT livelocked

In particular patch 2 fixes the sync livelock problem introduced by patch
"writeback: try more writeback as long as something was written".

Thanks,
Fengguang
---

#!/bin/sh

umount /dev/sda7
# mkfs.xfs -f /dev/sda7
mkfs.ext4 /dev/sda7
mount /dev/sda7 /fs

echo $((50<<20)) > /proc/sys/vm/dirty_bytes

pid=
for i in `seq 10`
do     
	dd if=/dev/zero of=/fs/zero-$i bs=1M count=1000 &
	pid="$pid $!"
done

sleep 1

tic=$(date +'%s')
sync
tac=$(date +'%s')

echo
echo sync time: $((tac-tic))
egrep '(Dirty|Writeback|NFS_Unstable)' /proc/meminfo

pidof dd > /dev/null && { kill -9 $pid; echo sync NOT livelocked; }



^ permalink raw reply	[flat|nested] 6+ messages in thread

* [PATCH 1/3] writeback: introduce wbc.for_sync to cover the two sync stages
  2011-04-30 22:36 [PATCH 0/3] sync livelock fixes Wu Fengguang
@ 2011-04-30 22:36 ` Wu Fengguang
  2011-05-01  7:46   ` Dave Chinner
  2011-04-30 22:36 ` [PATCH 2/3] writeback: update dirtied_when for synced inode to prevent livelock Wu Fengguang
  2011-04-30 22:36 ` [PATCH 3/3] writeback: avoid extra sync work at enqueue time Wu Fengguang
  2 siblings, 1 reply; 6+ messages in thread
From: Wu Fengguang @ 2011-04-30 22:36 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Jan Kara, Dave Chinner, Wu Fengguang, Christoph Hellwig, LKML,
	linux-fsdevel

[-- Attachment #1: writeback-for-sync.patch --]
[-- Type: text/plain, Size: 6854 bytes --]

sync(2) is performed in two stages: the WB_SYNC_NONE sync and the
WB_SYNC_ALL sync. Tag both stages with wbc.for_sync for livelock
prevention.

Note that writeback_inodes_sb() is called not only by sync(); the
callers are all treated the same because the other callers also need
livelock prevention.

Impacts:

- it changes the order in which pages/inodes are synced to disk. Now in
  the WB_SYNC_NONE stage, it won't proceed to write the next inode until
  finished with the current inode.

- this adds a new field to the writeback trace events and may possibly
  break some scripts.

CC: Jan Kara <jack@suse.cz>
CC: Dave Chinner <david@fromorbit.com>
Signed-off-by: Wu Fengguang <fengguang.wu@intel.com>
---
 fs/ext4/inode.c                  |    4 ++--
 fs/fs-writeback.c                |   10 ++++++----
 include/linux/writeback.h        |    1 +
 include/trace/events/writeback.h |   10 ++++++++--
 mm/page-writeback.c              |    4 ++--
 5 files changed, 19 insertions(+), 10 deletions(-)

--- linux-next.orig/fs/fs-writeback.c	2011-05-01 06:35:15.000000000 +0800
+++ linux-next/fs/fs-writeback.c	2011-05-01 06:35:17.000000000 +0800
@@ -36,6 +36,7 @@ struct wb_writeback_work {
 	long nr_pages;
 	struct super_block *sb;
 	enum writeback_sync_modes sync_mode;
+	unsigned int for_sync:1;
 	unsigned int for_kupdate:1;
 	unsigned int range_cyclic:1;
 	unsigned int for_background:1;
@@ -645,13 +646,14 @@ static long wb_writeback(struct bdi_writ
 	struct writeback_control wbc = {
 		.sync_mode		= work->sync_mode,
 		.older_than_this	= NULL,
+		.for_sync		= work->for_sync,
 		.for_kupdate		= work->for_kupdate,
 		.for_background		= work->for_background,
 		.range_cyclic		= work->range_cyclic,
 	};
 	unsigned long oldest_jif;
 	long wrote = 0;
-	long write_chunk;
+	long write_chunk = MAX_WRITEBACK_PAGES;
 	struct inode *inode;
 
 	if (!wbc.range_cyclic) {
@@ -672,9 +674,7 @@ static long wb_writeback(struct bdi_writ
 	 *                   (quickly) tag currently dirty pages
 	 *                   (maybe slowly) sync all tagged pages
 	 */
-	if (wbc.sync_mode == WB_SYNC_NONE)
-		write_chunk = MAX_WRITEBACK_PAGES;
-	else
+	if (wbc.for_sync)
 		write_chunk = LONG_MAX;
 
 	wbc.wb_start = jiffies; /* livelock avoidance */
@@ -1209,6 +1209,7 @@ void writeback_inodes_sb_nr(struct super
 	struct wb_writeback_work work = {
 		.sb		= sb,
 		.sync_mode	= WB_SYNC_NONE,
+		.for_sync	= 1,
 		.done		= &done,
 		.nr_pages	= nr,
 	};
@@ -1286,6 +1287,7 @@ void sync_inodes_sb(struct super_block *
 	struct wb_writeback_work work = {
 		.sb		= sb,
 		.sync_mode	= WB_SYNC_ALL,
+		.for_sync	= 1,
 		.nr_pages	= LONG_MAX,
 		.range_cyclic	= 0,
 		.done		= &done,
--- linux-next.orig/include/linux/writeback.h	2011-05-01 06:35:16.000000000 +0800
+++ linux-next/include/linux/writeback.h	2011-05-01 06:35:17.000000000 +0800
@@ -46,6 +46,7 @@ struct writeback_control {
 	unsigned encountered_congestion:1; /* An output: a queue is full */
 	unsigned for_kupdate:1;		/* A kupdate writeback */
 	unsigned for_background:1;	/* A background writeback */
+	unsigned for_sync:1;		/* do livelock prevention for sync */
 	unsigned for_reclaim:1;		/* Invoked from the page allocator */
 	unsigned range_cyclic:1;	/* range_start is cyclic */
 	unsigned more_io:1;		/* more io to be dispatched */
--- linux-next.orig/mm/page-writeback.c	2011-05-01 06:35:16.000000000 +0800
+++ linux-next/mm/page-writeback.c	2011-05-01 06:35:17.000000000 +0800
@@ -892,12 +892,12 @@ int write_cache_pages(struct address_spa
 			range_whole = 1;
 		cycled = 1; /* ignore range_cyclic tests */
 	}
-	if (wbc->sync_mode == WB_SYNC_ALL)
+	if (wbc->for_sync)
 		tag = PAGECACHE_TAG_TOWRITE;
 	else
 		tag = PAGECACHE_TAG_DIRTY;
 retry:
-	if (wbc->sync_mode == WB_SYNC_ALL)
+	if (wbc->for_sync)
 		tag_pages_for_writeback(mapping, index, end);
 	done_index = index;
 	while (!done && (index <= end)) {
--- linux-next.orig/include/trace/events/writeback.h	2011-05-01 06:35:16.000000000 +0800
+++ linux-next/include/trace/events/writeback.h	2011-05-01 06:35:17.000000000 +0800
@@ -17,6 +17,7 @@ DECLARE_EVENT_CLASS(writeback_work_class
 		__array(char, name, 32)
 		__field(long, nr_pages)
 		__field(dev_t, sb_dev)
+		__field(int, for_sync)
 		__field(int, sync_mode)
 		__field(int, for_kupdate)
 		__field(int, range_cyclic)
@@ -26,16 +27,18 @@ DECLARE_EVENT_CLASS(writeback_work_class
 		strncpy(__entry->name, dev_name(bdi->dev), 32);
 		__entry->nr_pages = work->nr_pages;
 		__entry->sb_dev = work->sb ? work->sb->s_dev : 0;
+		__entry->for_sync = work->for_sync;
 		__entry->sync_mode = work->sync_mode;
 		__entry->for_kupdate = work->for_kupdate;
 		__entry->range_cyclic = work->range_cyclic;
 		__entry->for_background	= work->for_background;
 	),
-	TP_printk("bdi %s: sb_dev %d:%d nr_pages=%ld sync_mode=%d "
+	TP_printk("bdi %s: sb_dev %d:%d nr_pages=%ld sync=%d sync_mode=%d "
 		  "kupdate=%d range_cyclic=%d background=%d",
 		  __entry->name,
 		  MAJOR(__entry->sb_dev), MINOR(__entry->sb_dev),
 		  __entry->nr_pages,
+		  __entry->for_sync,
 		  __entry->sync_mode,
 		  __entry->for_kupdate,
 		  __entry->range_cyclic,
@@ -96,6 +99,7 @@ DECLARE_EVENT_CLASS(wbc_class,
 		__array(char, name, 32)
 		__field(long, nr_to_write)
 		__field(long, pages_skipped)
+		__field(int, for_sync)
 		__field(int, sync_mode)
 		__field(int, for_kupdate)
 		__field(int, for_background)
@@ -111,6 +115,7 @@ DECLARE_EVENT_CLASS(wbc_class,
 		strncpy(__entry->name, dev_name(bdi->dev), 32);
 		__entry->nr_to_write	= wbc->nr_to_write;
 		__entry->pages_skipped	= wbc->pages_skipped;
+		__entry->for_sync	= wbc->for_sync;
 		__entry->sync_mode	= wbc->sync_mode;
 		__entry->for_kupdate	= wbc->for_kupdate;
 		__entry->for_background	= wbc->for_background;
@@ -123,12 +128,13 @@ DECLARE_EVENT_CLASS(wbc_class,
 		__entry->range_end	= (long)wbc->range_end;
 	),
 
-	TP_printk("bdi %s: towrt=%ld skip=%ld mode=%d kupd=%d "
+	TP_printk("bdi %s: towrt=%ld skip=%ld sync=%d mode=%d kupd=%d "
 		"bgrd=%d reclm=%d cyclic=%d more=%d older=0x%lx "
 		"start=0x%lx end=0x%lx",
 		__entry->name,
 		__entry->nr_to_write,
 		__entry->pages_skipped,
+		__entry->for_sync,
 		__entry->sync_mode,
 		__entry->for_kupdate,
 		__entry->for_background,
--- linux-next.orig/fs/ext4/inode.c	2011-05-01 06:35:15.000000000 +0800
+++ linux-next/fs/ext4/inode.c	2011-05-01 06:35:17.000000000 +0800
@@ -2741,7 +2741,7 @@ static int write_cache_pages_da(struct a
 	index = wbc->range_start >> PAGE_CACHE_SHIFT;
 	end = wbc->range_end >> PAGE_CACHE_SHIFT;
 
-	if (wbc->sync_mode == WB_SYNC_ALL)
+	if (wbc->for_sync)
 		tag = PAGECACHE_TAG_TOWRITE;
 	else
 		tag = PAGECACHE_TAG_DIRTY;
@@ -2975,7 +2975,7 @@ static int ext4_da_writepages(struct add
 	}
 
 retry:
-	if (wbc->sync_mode == WB_SYNC_ALL)
+	if (wbc->for_sync)
 		tag_pages_for_writeback(mapping, index, end);
 
 	while (!ret && wbc->nr_to_write > 0) {



^ permalink raw reply	[flat|nested] 6+ messages in thread

* [PATCH 2/3] writeback: update dirtied_when for synced inode to prevent livelock
  2011-04-30 22:36 [PATCH 0/3] sync livelock fixes Wu Fengguang
  2011-04-30 22:36 ` [PATCH 1/3] writeback: introduce wbc.for_sync to cover the two sync stages Wu Fengguang
@ 2011-04-30 22:36 ` Wu Fengguang
  2011-04-30 22:36 ` [PATCH 3/3] writeback: avoid extra sync work at enqueue time Wu Fengguang
  2 siblings, 0 replies; 6+ messages in thread
From: Wu Fengguang @ 2011-04-30 22:36 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Jan Kara, Dave Chinner, Wu Fengguang, Christoph Hellwig, LKML,
	linux-fsdevel

[-- Attachment #1: writeback-fix-sync-busyloop.patch --]
[-- Type: text/plain, Size: 3573 bytes --]

With the more aggressive "keep writeback as long as we wrote something"
logic in wb_writeback(), the "use LONG_MAX .nr_to_write" trick in commit
b9543dac5bbc ("writeback: avoid livelocking WB_SYNC_ALL writeback") is
no longer enough to stop sync livelock.

The fix is to explicitly update .dirtied_when on synced inodes, so that
they are no longer considered for writeback in the next round.

CC: Jan Kara <jack@suse.cz>
Signed-off-by: Wu Fengguang <fengguang.wu@intel.com>
---
 fs/fs-writeback.c |    8 ++++++++
 1 file changed, 8 insertions(+)

ext3/ext4 are working fine now, however tests show that XFS may still
livelock inside the XFS routines:

[ 3581.181253] sync            D ffff8800b6ca15d8  4560  4403   4392 0x00000000
[ 3581.181734]  ffff88006f775bc8 0000000000000046 ffff8800b6ca12b8 00000001b6ca1938
[ 3581.182411]  ffff88006f774000 00000000001d2e40 00000000001d2e40 ffff8800b6ca1280
[ 3581.183088]  00000000001d2e40 ffff88006f775fd8 00000340af111ef2 00000000001d2e40
[ 3581.183765] Call Trace:
[ 3581.184008]  [<ffffffff8109be73>] ? lock_release_holdtime+0xa3/0xab
[ 3581.184392]  [<ffffffff8108cc0d>] ? prepare_to_wait+0x6c/0x79
[ 3581.184756]  [<ffffffff8108cc0d>] ? prepare_to_wait+0x6c/0x79
[ 3581.185120]  [<ffffffff812ed520>] xfs_ioend_wait+0x87/0x9f
[ 3581.185474]  [<ffffffff8108c97a>] ? wake_up_bit+0x2a/0x2a
[ 3581.185827]  [<ffffffff812f742a>] xfs_sync_inode_data+0x92/0x9d
[ 3581.186198]  [<ffffffff812f76e2>] xfs_inode_ag_walk+0x1a5/0x287
[ 3581.186569]  [<ffffffff812f779b>] ? xfs_inode_ag_walk+0x25e/0x287
[ 3581.186946]  [<ffffffff812f7398>] ? xfs_sync_worker+0x69/0x69
[ 3581.187311]  [<ffffffff812e2354>] ? xfs_perag_get+0x68/0xd0
[ 3581.187669]  [<ffffffff81092175>] ? local_clock+0x41/0x5a
[ 3581.188020]  [<ffffffff8109be73>] ? lock_release_holdtime+0xa3/0xab
[ 3581.188403]  [<ffffffff812e22ec>] ? xfs_check_sizes+0x160/0x160
[ 3581.188773]  [<ffffffff812e2354>] ? xfs_perag_get+0x68/0xd0
[ 3581.189130]  [<ffffffff812e236c>] ? xfs_perag_get+0x80/0xd0
[ 3581.189488]  [<ffffffff812e22ec>] ? xfs_check_sizes+0x160/0x160
[ 3581.189858]  [<ffffffff812f7831>] ? xfs_inode_ag_iterator+0x6d/0x8f
[ 3581.190241]  [<ffffffff812f7398>] ? xfs_sync_worker+0x69/0x69
[ 3581.190606]  [<ffffffff812f780b>] xfs_inode_ag_iterator+0x47/0x8f
[ 3581.190982]  [<ffffffff811611f5>] ? __sync_filesystem+0x7a/0x7a
[ 3581.191352]  [<ffffffff812f7877>] xfs_sync_data+0x24/0x43
[ 3581.191703]  [<ffffffff812f7911>] xfs_quiesce_data+0x2c/0x88
[ 3581.192065]  [<ffffffff812f5556>] xfs_fs_sync_fs+0x21/0x48
[ 3581.192419]  [<ffffffff811611e1>] __sync_filesystem+0x66/0x7a
[ 3581.192783]  [<ffffffff8116120b>] sync_one_sb+0x16/0x18
[ 3581.193128]  [<ffffffff8113e3e3>] iterate_supers+0x72/0xce
[ 3581.193482]  [<ffffffff81161140>] sync_filesystems+0x20/0x22
[ 3581.193842]  [<ffffffff8116127e>] sys_sync+0x21/0x33
[ 3581.194177]  [<ffffffff819016c2>] system_call_fastpath+0x16/0x1b

--- linux-next.orig/fs/fs-writeback.c	2011-05-01 06:35:17.000000000 +0800
+++ linux-next/fs/fs-writeback.c	2011-05-01 06:35:19.000000000 +0800
@@ -431,6 +431,14 @@ writeback_single_inode(struct inode *ino
 				requeue_io(inode, wb);
 			} else {
 				/*
+				 * sync livelock prevention: each inode is
+				 * tagged and synced in one shot. If still
+				 * dirty, move it back to s_dirty with updated
+				 * dirty time to prevent syncing it again.
+				 */
+				if (wbc->for_sync)
+					inode->dirtied_when = jiffies;
+				/*
 				 * Writeback blocked by something other than
 				 * congestion. Delay the inode for some time to
 				 * avoid spinning on the CPU (100% iowait)



^ permalink raw reply	[flat|nested] 6+ messages in thread

* [PATCH 3/3] writeback: avoid extra sync work at enqueue time
  2011-04-30 22:36 [PATCH 0/3] sync livelock fixes Wu Fengguang
  2011-04-30 22:36 ` [PATCH 1/3] writeback: introduce wbc.for_sync to cover the two sync stages Wu Fengguang
  2011-04-30 22:36 ` [PATCH 2/3] writeback: update dirtied_when for synced inode to prevent livelock Wu Fengguang
@ 2011-04-30 22:36 ` Wu Fengguang
  2 siblings, 0 replies; 6+ messages in thread
From: Wu Fengguang @ 2011-04-30 22:36 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Jan Kara, Dave Chinner, Wu Fengguang, Christoph Hellwig, LKML,
	linux-fsdevel

[-- Attachment #1: writeback-kill-wb_start.patch --]
[-- Type: text/plain, Size: 2269 bytes --]

This removes writeback_control.wb_start and does more straightforward
sync livelock prevention by setting .older_than_this to prevent extra
inodes from being enqueued in the first place.

CC: Jan Kara <jack@suse.cz>
Signed-off-by: Wu Fengguang <fengguang.wu@intel.com>
---
 fs/fs-writeback.c         |   17 ++++-------------
 include/linux/writeback.h |    3 ---
 2 files changed, 4 insertions(+), 16 deletions(-)

--- linux-next.orig/fs/fs-writeback.c	2011-05-01 06:35:19.000000000 +0800
+++ linux-next/fs/fs-writeback.c	2011-05-01 06:35:21.000000000 +0800
@@ -543,15 +543,6 @@ static int writeback_sb_inodes(struct su
 			continue;
 		}
 
-		/*
-		 * Was this inode dirtied after sync_sb_inodes was called?
-		 * This keeps sync from extra jobs and livelock.
-		 */
-		if (inode_dirtied_after(inode, wbc->wb_start)) {
-			spin_unlock(&inode->i_lock);
-			return 1;
-		}
-
 		__iget(inode);
 
 		pages_skipped = wbc->pages_skipped;
@@ -584,8 +575,6 @@ static void __writeback_inodes_wb(struct
 {
 	int ret = 0;
 
-	if (!wbc->wb_start)
-		wbc->wb_start = jiffies; /* livelock avoidance */
 
 	while (!list_empty(&wb->b_io)) {
 		struct inode *inode = wb_inode(wb->b_io.prev);
@@ -682,10 +671,12 @@ static long wb_writeback(struct bdi_writ
 	 *                   (quickly) tag currently dirty pages
 	 *                   (maybe slowly) sync all tagged pages
 	 */
-	if (wbc.for_sync)
+	if (wbc.for_sync) {
 		write_chunk = LONG_MAX;
+		oldest_jif = jiffies;
+		wbc.older_than_this = &oldest_jif;
+	}
 
-	wbc.wb_start = jiffies; /* livelock avoidance */
 	for (;;) {
 		/*
 		 * Stop writeback when nr_pages has been consumed
--- linux-next.orig/include/linux/writeback.h	2011-05-01 06:35:17.000000000 +0800
+++ linux-next/include/linux/writeback.h	2011-05-01 06:35:21.000000000 +0800
@@ -26,9 +26,6 @@ struct writeback_control {
 	enum writeback_sync_modes sync_mode;
 	unsigned long *older_than_this;	/* If !NULL, only write back inodes
 					   older than this */
-	unsigned long wb_start;         /* Time writeback_inodes_wb was
-					   called. This is needed to avoid
-					   extra jobs and livelock */
 	long nr_to_write;		/* Write this many pages, and decrement
 					   this for each page written */
 	long pages_skipped;		/* Pages which were not written */



^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH 1/3] writeback: introduce wbc.for_sync to cover the two sync stages
  2011-04-30 22:36 ` [PATCH 1/3] writeback: introduce wbc.for_sync to cover the two sync stages Wu Fengguang
@ 2011-05-01  7:46   ` Dave Chinner
  2011-05-02  3:23     ` Wu Fengguang
  0 siblings, 1 reply; 6+ messages in thread
From: Dave Chinner @ 2011-05-01  7:46 UTC (permalink / raw)
  To: Wu Fengguang
  Cc: Andrew Morton, Jan Kara, Christoph Hellwig, LKML, linux-fsdevel

On Sun, May 01, 2011 at 06:36:06AM +0800, Wu Fengguang wrote:
> sync(2) is performed in two stages: the WB_SYNC_NONE sync and the
> WB_SYNC_ALL sync. Tag both stages with wbc.for_sync for livelock
> prevention.
> 
> Note that writeback_inodes_sb() is called by not only sync(), they
> are treated the same because the other callers need also need livelock
> prevention.
> 
> Impacts:
> 
> - it changes the order in which pages/inodes are synced to disk. Now in
>   the WB_SYNC_NONE stage, it won't proceed to write the next inode until
>   finished with the current inode.
> 
> - this adds a new field to the writeback trace events and may possibly
>   break some scripts.
.....
> --- linux-next.orig/mm/page-writeback.c	2011-05-01 06:35:16.000000000 +0800
> +++ linux-next/mm/page-writeback.c	2011-05-01 06:35:17.000000000 +0800
> @@ -892,12 +892,12 @@ int write_cache_pages(struct address_spa
>  			range_whole = 1;
>  		cycled = 1; /* ignore range_cyclic tests */
>  	}
> -	if (wbc->sync_mode == WB_SYNC_ALL)
> +	if (wbc->for_sync)
>  		tag = PAGECACHE_TAG_TOWRITE;
>  	else
>  		tag = PAGECACHE_TAG_DIRTY;
>  retry:
> -	if (wbc->sync_mode == WB_SYNC_ALL)
> +	if (wbc->for_sync)
>  		tag_pages_for_writeback(mapping, index, end);
>  	done_index = index;
>  	while (!done && (index <= end)) {

Doesn't that break anything that uses
filemap_write_and_wait{_range}() or filemap_fdatawrite{_range}()?
e.g. fsync, sync buffered writes, etc? i.e. everything that
currently relies on WB_SYNC_ALL for data integrity writeback is now
b0rken except for sync(1)?

Cheers,

Dave.
-- 
Dave Chinner
david@fromorbit.com

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH 1/3] writeback: introduce wbc.for_sync to cover the two sync stages
  2011-05-01  7:46   ` Dave Chinner
@ 2011-05-02  3:23     ` Wu Fengguang
  0 siblings, 0 replies; 6+ messages in thread
From: Wu Fengguang @ 2011-05-02  3:23 UTC (permalink / raw)
  To: Dave Chinner
  Cc: Andrew Morton, Jan Kara, Christoph Hellwig, LKML, linux-fsdevel

On Sun, May 01, 2011 at 03:46:04PM +0800, Dave Chinner wrote:
> On Sun, May 01, 2011 at 06:36:06AM +0800, Wu Fengguang wrote:
> > sync(2) is performed in two stages: the WB_SYNC_NONE sync and the
> > WB_SYNC_ALL sync. Tag both stages with wbc.for_sync for livelock
> > prevention.
> > 
> > Note that writeback_inodes_sb() is called by not only sync(), they
> > are treated the same because the other callers need also need livelock
> > prevention.
> > 
> > Impacts:
> > 
> > - it changes the order in which pages/inodes are synced to disk. Now in
> >   the WB_SYNC_NONE stage, it won't proceed to write the next inode until
> >   finished with the current inode.
> > 
> > - this adds a new field to the writeback trace events and may possibly
> >   break some scripts.
> .....
> > --- linux-next.orig/mm/page-writeback.c	2011-05-01 06:35:16.000000000 +0800
> > +++ linux-next/mm/page-writeback.c	2011-05-01 06:35:17.000000000 +0800
> > @@ -892,12 +892,12 @@ int write_cache_pages(struct address_spa
> >  			range_whole = 1;
> >  		cycled = 1; /* ignore range_cyclic tests */
> >  	}
> > -	if (wbc->sync_mode == WB_SYNC_ALL)
> > +	if (wbc->for_sync)
> >  		tag = PAGECACHE_TAG_TOWRITE;
> >  	else
> >  		tag = PAGECACHE_TAG_DIRTY;
> >  retry:
> > -	if (wbc->sync_mode == WB_SYNC_ALL)
> > +	if (wbc->for_sync)
> >  		tag_pages_for_writeback(mapping, index, end);
> >  	done_index = index;
> >  	while (!done && (index <= end)) {
> 
> Doesn't that break anything that uses
> filemap_write_and_wait{_range}() or filemap_fdatawrite{_range}()?
> e.g. fsync, sync buffered writes, etc? i.e. everything that
> currently relies on WB_SYNC_ALL for data integrity writeback is now
> b0rken except for sync(1)?

Right, they'll become livelockable. Good catch, thanks! I'll update
the patches to do

-   if (wbc->sync_mode == WB_SYNC_ALL)
+   if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_sync)

The alternative is to ask the other WB_SYNC_ALL callers to set
wbc.tagged_sync, but that seems more error prone.

Thanks,
Fengguang

^ permalink raw reply	[flat|nested] 6+ messages in thread

end of thread, other threads:[~2011-05-02  3:23 UTC | newest]

Thread overview: 6+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2011-04-30 22:36 [PATCH 0/3] sync livelock fixes Wu Fengguang
2011-04-30 22:36 ` [PATCH 1/3] writeback: introduce wbc.for_sync to cover the two sync stages Wu Fengguang
2011-05-01  7:46   ` Dave Chinner
2011-05-02  3:23     ` Wu Fengguang
2011-04-30 22:36 ` [PATCH 2/3] writeback: update dirtied_when for synced inode to prevent livelock Wu Fengguang
2011-04-30 22:36 ` [PATCH 3/3] writeback: avoid extra sync work at enqueue time Wu Fengguang

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).