* [PATCH 0/3] sync livelock fixes
@ 2011-04-30 22:36 Wu Fengguang
  2011-04-30 22:36 ` [PATCH 1/3] writeback: introduce wbc.for_sync to cover the two sync stages Wu Fengguang
                   ` (2 more replies)
  0 siblings, 3 replies; 6+ messages in thread
From: Wu Fengguang @ 2011-04-30 22:36 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Jan Kara, Dave Chinner, Christoph Hellwig, Wu Fengguang, LKML,
	linux-fsdevel


Andrew,

I wrote a simple script to test for sync livelock, and this patchset
works as expected:

	sync time: 2
	Dirty:             26492 kB
	Writeback:         30260 kB
	NFS_Unstable:          0 kB
	WritebackTmp:          0 kB
	sync NOT livelocked

In particular, patch 2 fixes the sync livelock problem introduced by the
patch "writeback: try more writeback as long as something was written".

Thanks,
Fengguang
---

#!/bin/sh

umount /dev/sda7
# mkfs.xfs -f /dev/sda7
mkfs.ext4 /dev/sda7
mount /dev/sda7 /fs

# Lower the dirty threshold to 50MB so the dd writers keep redirtying
# pages while sync runs.
echo $((50<<20)) > /proc/sys/vm/dirty_bytes

# Start 10 concurrent dirtiers, 1GB each.
pid=
for i in `seq 10`
do
	dd if=/dev/zero of=/fs/zero-$i bs=1M count=1000 &
	pid="$pid $!"
done

sleep 1

tic=$(date +'%s')
sync
tac=$(date +'%s')

echo
echo sync time: $((tac-tic))
egrep '(Dirty|Writeback|NFS_Unstable)' /proc/meminfo

# If the dd tasks are still running, sync returned while writers were
# still active, i.e. it did not livelock.
pidof dd > /dev/null && { kill -9 $pid; echo sync NOT livelocked; }




* [PATCH 1/3] writeback: introduce wbc.for_sync to cover the two sync stages
  2011-04-30 22:36 [PATCH 0/3] sync livelock fixes Wu Fengguang
@ 2011-04-30 22:36 ` Wu Fengguang
  2011-05-01  7:46   ` Dave Chinner
  2011-04-30 22:36 ` [PATCH 2/3] writeback: update dirtied_when for synced inode to prevent livelock Wu Fengguang
  2011-04-30 22:36 ` [PATCH 3/3] writeback: avoid extra sync work at enqueue time Wu Fengguang
  2 siblings, 1 reply; 6+ messages in thread
From: Wu Fengguang @ 2011-04-30 22:36 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Jan Kara, Dave Chinner, Wu Fengguang, Christoph Hellwig, LKML,
	linux-fsdevel

[-- Attachment #1: writeback-for-sync.patch --]
[-- Type: text/plain, Size: 6854 bytes --]

sync(2) is performed in two stages: the WB_SYNC_NONE sync and the
WB_SYNC_ALL sync. Tag both stages with wbc.for_sync for livelock
prevention.
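
For context, here is a condensed sketch of how sys_sync() drives the
two stages (simplified from fs/sync.c of this era; quota and block
device syncing omitted). Both helpers get .for_sync = 1 with this patch:

	/*
	 * Condensed sketch of __sync_filesystem(): sync(2) invokes it
	 * twice per superblock, first with wait=0, then with wait=1.
	 */
	static int __sync_filesystem(struct super_block *sb, int wait)
	{
		if (wait)
			sync_inodes_sb(sb);	 /* WB_SYNC_ALL: write and wait */
		else
			writeback_inodes_sb(sb); /* WB_SYNC_NONE: start I/O only */

		if (sb->s_op->sync_fs)
			sb->s_op->sync_fs(sb, wait);
		return 0;
	}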

Note that writeback_inodes_sb() is not called only by sync(); all of
its callers are treated the same, because the other callers also need
livelock prevention.

Impacts:

- it changes the order in which pages/inodes are synced to disk. Now in
  the WB_SYNC_NONE stage, it won't proceed to write the next inode until
  it has finished with the current inode.

- this adds a new field to the writeback trace events and may break
  scripts that parse them.

CC: Jan Kara <jack@suse.cz>
CC: Dave Chinner <david@fromorbit.com>
Signed-off-by: Wu Fengguang <fengguang.wu@intel.com>
---
 fs/ext4/inode.c                  |    4 ++--
 fs/fs-writeback.c                |   10 ++++++----
 include/linux/writeback.h        |    1 +
 include/trace/events/writeback.h |   10 ++++++++--
 mm/page-writeback.c              |    4 ++--
 5 files changed, 19 insertions(+), 10 deletions(-)

--- linux-next.orig/fs/fs-writeback.c	2011-05-01 06:35:15.000000000 +0800
+++ linux-next/fs/fs-writeback.c	2011-05-01 06:35:17.000000000 +0800
@@ -36,6 +36,7 @@ struct wb_writeback_work {
 	long nr_pages;
 	struct super_block *sb;
 	enum writeback_sync_modes sync_mode;
+	unsigned int for_sync:1;
 	unsigned int for_kupdate:1;
 	unsigned int range_cyclic:1;
 	unsigned int for_background:1;
@@ -645,13 +646,14 @@ static long wb_writeback(struct bdi_writ
 	struct writeback_control wbc = {
 		.sync_mode		= work->sync_mode,
 		.older_than_this	= NULL,
+		.for_sync		= work->for_sync,
 		.for_kupdate		= work->for_kupdate,
 		.for_background		= work->for_background,
 		.range_cyclic		= work->range_cyclic,
 	};
 	unsigned long oldest_jif;
 	long wrote = 0;
-	long write_chunk;
+	long write_chunk = MAX_WRITEBACK_PAGES;
 	struct inode *inode;
 
 	if (!wbc.range_cyclic) {
@@ -672,9 +674,7 @@ static long wb_writeback(struct bdi_writ
 	 *                   (quickly) tag currently dirty pages
 	 *                   (maybe slowly) sync all tagged pages
 	 */
-	if (wbc.sync_mode == WB_SYNC_NONE)
-		write_chunk = MAX_WRITEBACK_PAGES;
-	else
+	if (wbc.for_sync)
 		write_chunk = LONG_MAX;
 
 	wbc.wb_start = jiffies; /* livelock avoidance */
@@ -1209,6 +1209,7 @@ void writeback_inodes_sb_nr(struct super
 	struct wb_writeback_work work = {
 		.sb		= sb,
 		.sync_mode	= WB_SYNC_NONE,
+		.for_sync	= 1,
 		.done		= &done,
 		.nr_pages	= nr,
 	};
@@ -1286,6 +1287,7 @@ void sync_inodes_sb(struct super_block *
 	struct wb_writeback_work work = {
 		.sb		= sb,
 		.sync_mode	= WB_SYNC_ALL,
+		.for_sync	= 1,
 		.nr_pages	= LONG_MAX,
 		.range_cyclic	= 0,
 		.done		= &done,
--- linux-next.orig/include/linux/writeback.h	2011-05-01 06:35:16.000000000 +0800
+++ linux-next/include/linux/writeback.h	2011-05-01 06:35:17.000000000 +0800
@@ -46,6 +46,7 @@ struct writeback_control {
 	unsigned encountered_congestion:1; /* An output: a queue is full */
 	unsigned for_kupdate:1;		/* A kupdate writeback */
 	unsigned for_background:1;	/* A background writeback */
+	unsigned for_sync:1;		/* do livelock prevention for sync */
 	unsigned for_reclaim:1;		/* Invoked from the page allocator */
 	unsigned range_cyclic:1;	/* range_start is cyclic */
 	unsigned more_io:1;		/* more io to be dispatched */
--- linux-next.orig/mm/page-writeback.c	2011-05-01 06:35:16.000000000 +0800
+++ linux-next/mm/page-writeback.c	2011-05-01 06:35:17.000000000 +0800
@@ -892,12 +892,12 @@ int write_cache_pages(struct address_spa
 			range_whole = 1;
 		cycled = 1; /* ignore range_cyclic tests */
 	}
-	if (wbc->sync_mode == WB_SYNC_ALL)
+	if (wbc->for_sync)
 		tag = PAGECACHE_TAG_TOWRITE;
 	else
 		tag = PAGECACHE_TAG_DIRTY;
 retry:
-	if (wbc->sync_mode == WB_SYNC_ALL)
+	if (wbc->for_sync)
 		tag_pages_for_writeback(mapping, index, end);
 	done_index = index;
 	while (!done && (index <= end)) {
--- linux-next.orig/include/trace/events/writeback.h	2011-05-01 06:35:16.000000000 +0800
+++ linux-next/include/trace/events/writeback.h	2011-05-01 06:35:17.000000000 +0800
@@ -17,6 +17,7 @@ DECLARE_EVENT_CLASS(writeback_work_class
 		__array(char, name, 32)
 		__field(long, nr_pages)
 		__field(dev_t, sb_dev)
+		__field(int, for_sync)
 		__field(int, sync_mode)
 		__field(int, for_kupdate)
 		__field(int, range_cyclic)
@@ -26,16 +27,18 @@ DECLARE_EVENT_CLASS(writeback_work_class
 		strncpy(__entry->name, dev_name(bdi->dev), 32);
 		__entry->nr_pages = work->nr_pages;
 		__entry->sb_dev = work->sb ? work->sb->s_dev : 0;
+		__entry->for_sync = work->for_sync;
 		__entry->sync_mode = work->sync_mode;
 		__entry->for_kupdate = work->for_kupdate;
 		__entry->range_cyclic = work->range_cyclic;
 		__entry->for_background	= work->for_background;
 	),
-	TP_printk("bdi %s: sb_dev %d:%d nr_pages=%ld sync_mode=%d "
+	TP_printk("bdi %s: sb_dev %d:%d nr_pages=%ld sync=%d sync_mode=%d "
 		  "kupdate=%d range_cyclic=%d background=%d",
 		  __entry->name,
 		  MAJOR(__entry->sb_dev), MINOR(__entry->sb_dev),
 		  __entry->nr_pages,
+		  __entry->for_sync,
 		  __entry->sync_mode,
 		  __entry->for_kupdate,
 		  __entry->range_cyclic,
@@ -96,6 +99,7 @@ DECLARE_EVENT_CLASS(wbc_class,
 		__array(char, name, 32)
 		__field(long, nr_to_write)
 		__field(long, pages_skipped)
+		__field(int, for_sync)
 		__field(int, sync_mode)
 		__field(int, for_kupdate)
 		__field(int, for_background)
@@ -111,6 +115,7 @@ DECLARE_EVENT_CLASS(wbc_class,
 		strncpy(__entry->name, dev_name(bdi->dev), 32);
 		__entry->nr_to_write	= wbc->nr_to_write;
 		__entry->pages_skipped	= wbc->pages_skipped;
+		__entry->for_sync	= wbc->for_sync;
 		__entry->sync_mode	= wbc->sync_mode;
 		__entry->for_kupdate	= wbc->for_kupdate;
 		__entry->for_background	= wbc->for_background;
@@ -123,12 +128,13 @@ DECLARE_EVENT_CLASS(wbc_class,
 		__entry->range_end	= (long)wbc->range_end;
 	),
 
-	TP_printk("bdi %s: towrt=%ld skip=%ld mode=%d kupd=%d "
+	TP_printk("bdi %s: towrt=%ld skip=%ld sync=%d mode=%d kupd=%d "
 		"bgrd=%d reclm=%d cyclic=%d more=%d older=0x%lx "
 		"start=0x%lx end=0x%lx",
 		__entry->name,
 		__entry->nr_to_write,
 		__entry->pages_skipped,
+		__entry->for_sync,
 		__entry->sync_mode,
 		__entry->for_kupdate,
 		__entry->for_background,
--- linux-next.orig/fs/ext4/inode.c	2011-05-01 06:35:15.000000000 +0800
+++ linux-next/fs/ext4/inode.c	2011-05-01 06:35:17.000000000 +0800
@@ -2741,7 +2741,7 @@ static int write_cache_pages_da(struct a
 	index = wbc->range_start >> PAGE_CACHE_SHIFT;
 	end = wbc->range_end >> PAGE_CACHE_SHIFT;
 
-	if (wbc->sync_mode == WB_SYNC_ALL)
+	if (wbc->for_sync)
 		tag = PAGECACHE_TAG_TOWRITE;
 	else
 		tag = PAGECACHE_TAG_DIRTY;
@@ -2975,7 +2975,7 @@ static int ext4_da_writepages(struct add
 	}
 
 retry:
-	if (wbc->sync_mode == WB_SYNC_ALL)
+	if (wbc->for_sync)
 		tag_pages_for_writeback(mapping, index, end);
 
 	while (!ret && wbc->nr_to_write > 0) {




* [PATCH 2/3] writeback: update dirtied_when for synced inode to prevent livelock
  2011-04-30 22:36 [PATCH 0/3] sync livelock fixes Wu Fengguang
  2011-04-30 22:36 ` [PATCH 1/3] writeback: introduce wbc.for_sync to cover the two sync stages Wu Fengguang
@ 2011-04-30 22:36 ` Wu Fengguang
  2011-04-30 22:36 ` [PATCH 3/3] writeback: avoid extra sync work at enqueue time Wu Fengguang
  2 siblings, 0 replies; 6+ messages in thread
From: Wu Fengguang @ 2011-04-30 22:36 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Jan Kara, Dave Chinner, Wu Fengguang, Christoph Hellwig, LKML,
	linux-fsdevel

[-- Attachment #1: writeback-fix-sync-busyloop.patch --]
[-- Type: text/plain, Size: 3573 bytes --]

With the more aggressive "keep writeback as long as we wrote something"
logic in wb_writeback(), the "use LONG_MAX .nr_to_write" trick in commit
b9543dac5bbc ("writeback: avoid livelocking WB_SYNC_ALL writeback") is
no longer enough to stop sync livelock.

The fix is to explicitly update .dirtied_when on synced inodes, so that
they are no longer considered for writeback in the next round.
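
Why this works: writeback_sb_inodes() currently skips any inode whose
dirtied_when is newer than the time the sync started (this is the
wb_start check that patch 3 later replaces with .older_than_this):

	/* Existing enqueue-time check in writeback_sb_inodes(): */
	if (inode_dirtied_after(inode, wbc->wb_start)) {
		spin_unlock(&inode->i_lock);
		return 1;	/* dirtied after sync started: skip it */
	}

Setting inode->dirtied_when = jiffies makes a synced-but-still-dirty
inode look freshly dirtied to that check (and, after patch 3, to the
.older_than_this cutoff), so the sync stops spinning on it.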

CC: Jan Kara <jack@suse.cz>
Signed-off-by: Wu Fengguang <fengguang.wu@intel.com>
---
 fs/fs-writeback.c |    8 ++++++++
 1 file changed, 8 insertions(+)

ext3/ext4 are working fine now; however, tests show that XFS may still
livelock inside its own sync routines:

[ 3581.181253] sync            D ffff8800b6ca15d8  4560  4403   4392 0x00000000
[ 3581.181734]  ffff88006f775bc8 0000000000000046 ffff8800b6ca12b8 00000001b6ca1938
[ 3581.182411]  ffff88006f774000 00000000001d2e40 00000000001d2e40 ffff8800b6ca1280
[ 3581.183088]  00000000001d2e40 ffff88006f775fd8 00000340af111ef2 00000000001d2e40
[ 3581.183765] Call Trace:
[ 3581.184008]  [<ffffffff8109be73>] ? lock_release_holdtime+0xa3/0xab
[ 3581.184392]  [<ffffffff8108cc0d>] ? prepare_to_wait+0x6c/0x79
[ 3581.184756]  [<ffffffff8108cc0d>] ? prepare_to_wait+0x6c/0x79
[ 3581.185120]  [<ffffffff812ed520>] xfs_ioend_wait+0x87/0x9f
[ 3581.185474]  [<ffffffff8108c97a>] ? wake_up_bit+0x2a/0x2a
[ 3581.185827]  [<ffffffff812f742a>] xfs_sync_inode_data+0x92/0x9d
[ 3581.186198]  [<ffffffff812f76e2>] xfs_inode_ag_walk+0x1a5/0x287
[ 3581.186569]  [<ffffffff812f779b>] ? xfs_inode_ag_walk+0x25e/0x287
[ 3581.186946]  [<ffffffff812f7398>] ? xfs_sync_worker+0x69/0x69
[ 3581.187311]  [<ffffffff812e2354>] ? xfs_perag_get+0x68/0xd0
[ 3581.187669]  [<ffffffff81092175>] ? local_clock+0x41/0x5a
[ 3581.188020]  [<ffffffff8109be73>] ? lock_release_holdtime+0xa3/0xab
[ 3581.188403]  [<ffffffff812e22ec>] ? xfs_check_sizes+0x160/0x160
[ 3581.188773]  [<ffffffff812e2354>] ? xfs_perag_get+0x68/0xd0
[ 3581.189130]  [<ffffffff812e236c>] ? xfs_perag_get+0x80/0xd0
[ 3581.189488]  [<ffffffff812e22ec>] ? xfs_check_sizes+0x160/0x160
[ 3581.189858]  [<ffffffff812f7831>] ? xfs_inode_ag_iterator+0x6d/0x8f
[ 3581.190241]  [<ffffffff812f7398>] ? xfs_sync_worker+0x69/0x69
[ 3581.190606]  [<ffffffff812f780b>] xfs_inode_ag_iterator+0x47/0x8f
[ 3581.190982]  [<ffffffff811611f5>] ? __sync_filesystem+0x7a/0x7a
[ 3581.191352]  [<ffffffff812f7877>] xfs_sync_data+0x24/0x43
[ 3581.191703]  [<ffffffff812f7911>] xfs_quiesce_data+0x2c/0x88
[ 3581.192065]  [<ffffffff812f5556>] xfs_fs_sync_fs+0x21/0x48
[ 3581.192419]  [<ffffffff811611e1>] __sync_filesystem+0x66/0x7a
[ 3581.192783]  [<ffffffff8116120b>] sync_one_sb+0x16/0x18
[ 3581.193128]  [<ffffffff8113e3e3>] iterate_supers+0x72/0xce
[ 3581.193482]  [<ffffffff81161140>] sync_filesystems+0x20/0x22
[ 3581.193842]  [<ffffffff8116127e>] sys_sync+0x21/0x33
[ 3581.194177]  [<ffffffff819016c2>] system_call_fastpath+0x16/0x1b

--- linux-next.orig/fs/fs-writeback.c	2011-05-01 06:35:17.000000000 +0800
+++ linux-next/fs/fs-writeback.c	2011-05-01 06:35:19.000000000 +0800
@@ -431,6 +431,14 @@ writeback_single_inode(struct inode *ino
 				requeue_io(inode, wb);
 			} else {
 				/*
+				 * sync livelock prevention: each inode is
+				 * tagged and synced in one shot. If still
+				 * dirty, move it back to s_dirty with updated
+				 * dirty time to prevent syncing it again.
+				 */
+				if (wbc->for_sync)
+					inode->dirtied_when = jiffies;
+				/*
 				 * Writeback blocked by something other than
 				 * congestion. Delay the inode for some time to
 				 * avoid spinning on the CPU (100% iowait)




* [PATCH 3/3] writeback: avoid extra sync work at enqueue time
  2011-04-30 22:36 [PATCH 0/3] sync livelock fixes Wu Fengguang
  2011-04-30 22:36 ` [PATCH 1/3] writeback: introduce wbc.for_sync to cover the two sync stages Wu Fengguang
  2011-04-30 22:36 ` [PATCH 2/3] writeback: update dirtied_when for synced inode to prevent livelock Wu Fengguang
@ 2011-04-30 22:36 ` Wu Fengguang
  2 siblings, 0 replies; 6+ messages in thread
From: Wu Fengguang @ 2011-04-30 22:36 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Jan Kara, Dave Chinner, Wu Fengguang, Christoph Hellwig, LKML,
	linux-fsdevel

[-- Attachment #1: writeback-kill-wb_start.patch --]
[-- Type: text/plain, Size: 2269 bytes --]

This removes writeback_control.wb_start and implements more
straightforward sync livelock prevention: set .older_than_this so that
extra inodes are not enqueued in the first place.
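
For reference, a rough sketch of how .older_than_this takes effect when
b_io is refilled (simplified from move_expired_inodes(); per-superblock
grouping omitted). b_dirty is kept in dirtied_when order with the
oldest inodes at the tail, so the scan stops at the first inode younger
than the cutoff:

	while (!list_empty(delaying_queue)) {
		struct inode *inode = wb_inode(delaying_queue->prev);

		if (older_than_this &&
		    inode_dirtied_after(inode, *older_than_this))
			break;	/* remaining inodes are younger: stop */
		list_move(&inode->i_wb_list, dispatch_queue);
	}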

CC: Jan Kara <jack@suse.cz>
Signed-off-by: Wu Fengguang <fengguang.wu@intel.com>
---
 fs/fs-writeback.c         |   17 ++++-------------
 include/linux/writeback.h |    3 ---
 2 files changed, 4 insertions(+), 16 deletions(-)

--- linux-next.orig/fs/fs-writeback.c	2011-05-01 06:35:19.000000000 +0800
+++ linux-next/fs/fs-writeback.c	2011-05-01 06:35:21.000000000 +0800
@@ -543,15 +543,6 @@ static int writeback_sb_inodes(struct su
 			continue;
 		}
 
-		/*
-		 * Was this inode dirtied after sync_sb_inodes was called?
-		 * This keeps sync from extra jobs and livelock.
-		 */
-		if (inode_dirtied_after(inode, wbc->wb_start)) {
-			spin_unlock(&inode->i_lock);
-			return 1;
-		}
-
 		__iget(inode);
 
 		pages_skipped = wbc->pages_skipped;
@@ -584,8 +575,6 @@ static void __writeback_inodes_wb(struct
 {
 	int ret = 0;
 
-	if (!wbc->wb_start)
-		wbc->wb_start = jiffies; /* livelock avoidance */
 
 	while (!list_empty(&wb->b_io)) {
 		struct inode *inode = wb_inode(wb->b_io.prev);
@@ -682,10 +671,12 @@ static long wb_writeback(struct bdi_writ
 	 *                   (quickly) tag currently dirty pages
 	 *                   (maybe slowly) sync all tagged pages
 	 */
-	if (wbc.for_sync)
+	if (wbc.for_sync) {
 		write_chunk = LONG_MAX;
+		oldest_jif = jiffies;
+		wbc.older_than_this = &oldest_jif;
+	}
 
-	wbc.wb_start = jiffies; /* livelock avoidance */
 	for (;;) {
 		/*
 		 * Stop writeback when nr_pages has been consumed
--- linux-next.orig/include/linux/writeback.h	2011-05-01 06:35:17.000000000 +0800
+++ linux-next/include/linux/writeback.h	2011-05-01 06:35:21.000000000 +0800
@@ -26,9 +26,6 @@ struct writeback_control {
 	enum writeback_sync_modes sync_mode;
 	unsigned long *older_than_this;	/* If !NULL, only write back inodes
 					   older than this */
-	unsigned long wb_start;         /* Time writeback_inodes_wb was
-					   called. This is needed to avoid
-					   extra jobs and livelock */
 	long nr_to_write;		/* Write this many pages, and decrement
 					   this for each page written */
 	long pages_skipped;		/* Pages which were not written */




* Re: [PATCH 1/3] writeback: introduce wbc.for_sync to cover the two sync stages
  2011-04-30 22:36 ` [PATCH 1/3] writeback: introduce wbc.for_sync to cover the two sync stages Wu Fengguang
@ 2011-05-01  7:46   ` Dave Chinner
  2011-05-02  3:23     ` Wu Fengguang
  0 siblings, 1 reply; 6+ messages in thread
From: Dave Chinner @ 2011-05-01  7:46 UTC (permalink / raw)
  To: Wu Fengguang
  Cc: Andrew Morton, Jan Kara, Christoph Hellwig, LKML, linux-fsdevel

On Sun, May 01, 2011 at 06:36:06AM +0800, Wu Fengguang wrote:
> sync(2) is performed in two stages: the WB_SYNC_NONE sync and the
> WB_SYNC_ALL sync. Tag both stages with wbc.for_sync for livelock
> prevention.
> 
> Note that writeback_inodes_sb() is not called only by sync(); all of
> its callers are treated the same, because the other callers also need
> livelock prevention.
> 
> Impacts:
> 
> - it changes the order in which pages/inodes are synced to disk. Now in
>   the WB_SYNC_NONE stage, it won't proceed to write the next inode until
>   it has finished with the current inode.
> 
> - this adds a new field to the writeback trace events and may break
>   scripts that parse them.
.....
> --- linux-next.orig/mm/page-writeback.c	2011-05-01 06:35:16.000000000 +0800
> +++ linux-next/mm/page-writeback.c	2011-05-01 06:35:17.000000000 +0800
> @@ -892,12 +892,12 @@ int write_cache_pages(struct address_spa
>  			range_whole = 1;
>  		cycled = 1; /* ignore range_cyclic tests */
>  	}
> -	if (wbc->sync_mode == WB_SYNC_ALL)
> +	if (wbc->for_sync)
>  		tag = PAGECACHE_TAG_TOWRITE;
>  	else
>  		tag = PAGECACHE_TAG_DIRTY;
>  retry:
> -	if (wbc->sync_mode == WB_SYNC_ALL)
> +	if (wbc->for_sync)
>  		tag_pages_for_writeback(mapping, index, end);
>  	done_index = index;
>  	while (!done && (index <= end)) {

Doesn't that break anything that uses
filemap_write_and_wait{_range}() or filemap_fdatawrite{_range}()?
e.g. fsync, sync buffered writes, etc? i.e. everything that
currently relies on WB_SYNC_ALL for data integrity writeback is now
b0rken except for sync(1)?

Cheers,

Dave.
-- 
Dave Chinner
david@fromorbit.com


* Re: [PATCH 1/3] writeback: introduce wbc.for_sync to cover the two sync stages
  2011-05-01  7:46   ` Dave Chinner
@ 2011-05-02  3:23     ` Wu Fengguang
  0 siblings, 0 replies; 6+ messages in thread
From: Wu Fengguang @ 2011-05-02  3:23 UTC (permalink / raw)
  To: Dave Chinner
  Cc: Andrew Morton, Jan Kara, Christoph Hellwig, LKML, linux-fsdevel

On Sun, May 01, 2011 at 03:46:04PM +0800, Dave Chinner wrote:
> On Sun, May 01, 2011 at 06:36:06AM +0800, Wu Fengguang wrote:
> > sync(2) is performed in two stages: the WB_SYNC_NONE sync and the
> > WB_SYNC_ALL sync. Tag both stages with wbc.for_sync for livelock
> > prevention.
> > 
> > Note that writeback_inodes_sb() is not called only by sync(); all of
> > its callers are treated the same, because the other callers also need
> > livelock prevention.
> > 
> > Impacts:
> > 
> > - it changes the order in which pages/inodes are synced to disk. Now in
> >   the WB_SYNC_NONE stage, it won't proceed to write the next inode until
> >   it has finished with the current inode.
> > 
> > - this adds a new field to the writeback trace events and may break
> >   scripts that parse them.
> .....
> > --- linux-next.orig/mm/page-writeback.c	2011-05-01 06:35:16.000000000 +0800
> > +++ linux-next/mm/page-writeback.c	2011-05-01 06:35:17.000000000 +0800
> > @@ -892,12 +892,12 @@ int write_cache_pages(struct address_spa
> >  			range_whole = 1;
> >  		cycled = 1; /* ignore range_cyclic tests */
> >  	}
> > -	if (wbc->sync_mode == WB_SYNC_ALL)
> > +	if (wbc->for_sync)
> >  		tag = PAGECACHE_TAG_TOWRITE;
> >  	else
> >  		tag = PAGECACHE_TAG_DIRTY;
> >  retry:
> > -	if (wbc->sync_mode == WB_SYNC_ALL)
> > +	if (wbc->for_sync)
> >  		tag_pages_for_writeback(mapping, index, end);
> >  	done_index = index;
> >  	while (!done && (index <= end)) {
> 
> Doesn't that break anything that uses
> filemap_write_and_wait{_range}() or filemap_fdatawrite{_range}()?
> e.g. fsync, sync buffered writes, etc? i.e. everything that
> currently relies on WB_SYNC_ALL for data integrity writeback is now
> b0rken except for sync(1)?

Right, they'll become livelockable. Good catch, thanks! I'll update
the patches to do

-   if (wbc->sync_mode == WB_SYNC_ALL)
+   if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_sync)

The alternative is to ask the other WB_SYNC_ALL callers to set
wbc.tagged_sync, but that seems more error-prone.

Thanks,
Fengguang

