All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 0/4] writeback: tracing and wbc->nr_to_write fixes
@ 2010-04-20  2:41 ` Dave Chinner
  0 siblings, 0 replies; 54+ messages in thread
From: Dave Chinner @ 2010-04-20  2:41 UTC (permalink / raw)
  To: linux-fsdevel; +Cc: linux-kernel, xfs

This series contains the initial writeback tracing patches from
Jens, as well as the extensions I added to provide visibility into
writeback control structures as they are used by the writeback code.
The visibility given is sufficient to understand what is happening
in the writeback path - what path is writing data, what path is
blocking on congestion, etc, and to determine the differences in
behaviour for different sync modes and calling contexts. This
tracing really needs to be integrated into mainline so that anyone
can improve the tracing as they use it to track down problems
in our convoluted writeback paths.

The remaining patches are fixes to problems that the new tracing
highlighted.


^ permalink raw reply	[flat|nested] 54+ messages in thread

* [PATCH 0/4] writeback: tracing and wbc->nr_to_write fixes
@ 2010-04-20  2:41 ` Dave Chinner
  0 siblings, 0 replies; 54+ messages in thread
From: Dave Chinner @ 2010-04-20  2:41 UTC (permalink / raw)
  To: linux-fsdevel; +Cc: linux-kernel, xfs

This series contains the initial writeback tracing patches from
Jens, as well as the extensions I added to provide visibility into
writeback control structures as they are used by the writeback code.
The visibility given is sufficient to understand what is happening
in the writeback path - what path is writing data, what path is
blocking on congestion, etc, and to determine the differences in
behaviour for different sync modes and calling contexts. This
tracing really needs to be integrated into mainline so that anyone
can improve the tracing as they use it to track down problems
in our convoluted writeback paths.

The remaining patches are fixes to problems that the new tracing
highlighted.

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 54+ messages in thread

* [PATCH 1/4] writeback: initial tracing support
  2010-04-20  2:41 ` Dave Chinner
@ 2010-04-20  2:41   ` Dave Chinner
  -1 siblings, 0 replies; 54+ messages in thread
From: Dave Chinner @ 2010-04-20  2:41 UTC (permalink / raw)
  To: linux-fsdevel; +Cc: linux-kernel, xfs

From: Jens Axboe <jens.axboe@oracle.com>

Trace queue/sched/exec parts of the writeback loop.

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
Signed-off-by: Dave Chinner <dchinner@redhat.com>
---
 fs/fs-writeback.c                |   51 +++++-------
 include/linux/writeback.h        |   27 ++++++
 include/trace/events/writeback.h |  171 ++++++++++++++++++++++++++++++++++++++
 mm/backing-dev.c                 |    5 +
 4 files changed, 224 insertions(+), 30 deletions(-)
 create mode 100644 include/trace/events/writeback.h

diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 76fc4d5..3f5f0a5 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -25,7 +25,9 @@
 #include <linux/blkdev.h>
 #include <linux/backing-dev.h>
 #include <linux/buffer_head.h>
+#include <linux/ftrace.h>
 #include "internal.h"
+#include <trace/events/writeback.h>
 
 #define inode_to_bdi(inode)	((inode)->i_mapping->backing_dev_info)
 
@@ -34,33 +36,6 @@
  */
 int nr_pdflush_threads;
 
-/*
- * Passed into wb_writeback(), essentially a subset of writeback_control
- */
-struct wb_writeback_args {
-	long nr_pages;
-	struct super_block *sb;
-	enum writeback_sync_modes sync_mode;
-	int for_kupdate:1;
-	int range_cyclic:1;
-	int for_background:1;
-};
-
-/*
- * Work items for the bdi_writeback threads
- */
-struct bdi_work {
-	struct list_head list;		/* pending work list */
-	struct rcu_head rcu_head;	/* for RCU free/clear of work */
-
-	unsigned long seen;		/* threads that have seen this work */
-	atomic_t pending;		/* number of threads still to do work */
-
-	struct wb_writeback_args args;	/* writeback arguments */
-
-	unsigned long state;		/* flag bits, see WS_* */
-};
-
 enum {
 	WS_USED_B = 0,
 	WS_ONSTACK_B,
@@ -135,6 +110,8 @@ static void wb_work_complete(struct bdi_work *work)
 
 static void wb_clear_pending(struct bdi_writeback *wb, struct bdi_work *work)
 {
+	trace_writeback_clear(work);
+
 	/*
 	 * The caller has retrieved the work arguments from this work,
 	 * drop our reference. If this is the last ref, delete and free it
@@ -170,12 +147,16 @@ static void bdi_queue_work(struct backing_dev_info *bdi, struct bdi_work *work)
 	 * If the default thread isn't there, make sure we add it. When
 	 * it gets created and wakes up, we'll run this work.
 	 */
-	if (unlikely(list_empty_careful(&bdi->wb_list)))
+	if (unlikely(list_empty_careful(&bdi->wb_list))) {
+		trace_writeback_sched(bdi, work, "default");
 		wake_up_process(default_backing_dev_info.wb.task);
-	else {
+	} else {
 		struct bdi_writeback *wb = &bdi->wb;
+		struct task_struct *task = wb->task;
 
-		if (wb->task)
+		trace_writeback_sched(bdi, work, task ? "task" : "notask");
+
+		if (task)
 			wake_up_process(wb->task);
 	}
 }
@@ -202,6 +183,7 @@ static void bdi_alloc_queue_work(struct backing_dev_info *bdi,
 	work = kmalloc(sizeof(*work), GFP_ATOMIC);
 	if (work) {
 		bdi_work_init(work, args);
+		trace_writeback_queue(bdi, args);
 		bdi_queue_work(bdi, work);
 	} else {
 		struct bdi_writeback *wb = &bdi->wb;
@@ -235,6 +217,7 @@ static void bdi_sync_writeback(struct backing_dev_info *bdi,
 	bdi_work_init(&work, &args);
 	work.state |= WS_ONSTACK;
 
+	trace_writeback_queue(bdi, &args);
 	bdi_queue_work(bdi, &work);
 	bdi_wait_on_work_clear(&work);
 }
@@ -880,6 +863,8 @@ long wb_do_writeback(struct bdi_writeback *wb, int force_wait)
 		if (force_wait)
 			work->args.sync_mode = args.sync_mode = WB_SYNC_ALL;
 
+		trace_writeback_exec(work);
+
 		/*
 		 * If this isn't a data integrity operation, just notify
 		 * that we have seen this work and we are now starting it.
@@ -915,9 +900,13 @@ int bdi_writeback_task(struct bdi_writeback *wb)
 	unsigned long wait_jiffies = -1UL;
 	long pages_written;
 
+	trace_writeback_thread_start(1);
+
 	while (!kthread_should_stop()) {
 		pages_written = wb_do_writeback(wb, 0);
 
+		trace_writeback_pages_written(pages_written);
+
 		if (pages_written)
 			last_active = jiffies;
 		else if (wait_jiffies != -1UL) {
@@ -938,6 +927,8 @@ int bdi_writeback_task(struct bdi_writeback *wb)
 		try_to_freeze();
 	}
 
+	trace_writeback_thread_start(0);
+
 	return 0;
 }
 
diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index 76e8903..b2d615f 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -22,6 +22,33 @@ enum writeback_sync_modes {
 };
 
 /*
+ * Passed into wb_writeback(), essentially a subset of writeback_control
+ */
+struct wb_writeback_args {
+	long nr_pages;
+	struct super_block *sb;
+	enum writeback_sync_modes sync_mode;
+	int for_kupdate:1;
+	int range_cyclic:1;
+	int for_background:1;
+};
+
+/*
+ * Work items for the bdi_writeback threads
+ */
+struct bdi_work {
+	struct list_head list;		/* pending work list */
+	struct rcu_head rcu_head;	/* for RCU free/clear of work */
+
+	unsigned long seen;		/* threads that have seen this work */
+	atomic_t pending;		/* number of threads still to do work */
+
+	struct wb_writeback_args args;	/* writeback arguments */
+
+	unsigned long state;		/* flag bits, see WS_* */
+};
+
+/*
  * A control structure which tells the writeback code what to do.  These are
  * always on the stack, and hence need no locking.  They are always initialised
  * in a manner such that unspecified fields are set to zero.
diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h
new file mode 100644
index 0000000..df76457
--- /dev/null
+++ b/include/trace/events/writeback.h
@@ -0,0 +1,171 @@
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM writeback
+
+#if !defined(_TRACE_WRITEBACK_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_WRITEBACK_H
+
+#include <linux/backing-dev.h>
+#include <linux/writeback.h>
+
+TRACE_EVENT(writeback_queue,
+
+	TP_PROTO(struct backing_dev_info *bdi, struct wb_writeback_args *args),
+
+	TP_ARGS(bdi, args),
+
+	TP_STRUCT__entry(
+		__array(char,		name,		16)
+		__field(long,		nr_pages)
+		__field(int,		sb)
+		__field(int,		sync_mode)
+		__field(int,		for_kupdate)
+		__field(int,		range_cyclic)
+		__field(int,		for_background)
+	),
+
+	TP_fast_assign(
+		strncpy(__entry->name, dev_name(bdi->dev), 16);
+		__entry->nr_pages	= args->nr_pages;
+		__entry->sb		= !!args->sb;
+		__entry->for_kupdate	= args->for_kupdate;
+		__entry->range_cyclic	= args->range_cyclic;
+		__entry->for_background	= args->for_background;
+	),
+
+	TP_printk("%s: pages=%ld, sb=%d, kupdate=%d, range_cyclic=%d "
+		  "for_background=%d", __entry->name, __entry->nr_pages,
+			__entry->sb, __entry->for_kupdate,
+			__entry->range_cyclic, __entry->for_background)
+);
+
+TRACE_EVENT(writeback_sched,
+
+	TP_PROTO(struct backing_dev_info *bdi, struct bdi_work *work,
+		 const char *msg),
+
+	TP_ARGS(bdi, work, msg),
+
+	TP_STRUCT__entry(
+		__array(char,			name,		16)
+		__field(unsigned int,		work)
+		__array(char,			task,		8)
+	),
+
+	TP_fast_assign(
+		strncpy(__entry->name, dev_name(bdi->dev), 16);
+		__entry->work = (unsigned long) work & 0xffff;
+		snprintf(__entry->task, 8, "%s", msg);
+	),
+
+	TP_printk("work=%x, task=%s", __entry->work, __entry->task)
+);
+
+TRACE_EVENT(writeback_exec,
+
+	TP_PROTO(struct bdi_work *work),
+
+	TP_ARGS(work),
+
+	TP_STRUCT__entry(
+		__field(unsigned int,	work)
+		__field(long,		nr_pages)
+		__field(int,		sb)
+		__field(int,		sync_mode)
+		__field(int,		for_kupdate)
+		__field(int,		range_cyclic)
+		__field(int,		for_background)
+	),
+
+	TP_fast_assign(
+		__entry->work		= (unsigned long) work & 0xffff;
+		__entry->nr_pages	= work->args.nr_pages;
+		__entry->sb		= !!work->args.sb;
+		__entry->for_kupdate	= work->args.for_kupdate;
+		__entry->range_cyclic	= work->args.range_cyclic;
+		__entry->for_background	= work->args.for_background;
+
+	),
+
+	TP_printk("work=%x pages=%ld, sb=%d, kupdate=%d, range_cyclic=%d"
+		  " for_background=%d", __entry->work,
+			__entry->nr_pages, __entry->sb, __entry->for_kupdate,
+			__entry->range_cyclic, __entry->for_background)
+);
+
+TRACE_EVENT(writeback_clear,
+
+	TP_PROTO(struct bdi_work *work),
+
+	TP_ARGS(work),
+
+	TP_STRUCT__entry(
+		__field(struct bdi_work *,	work)
+		__field(int,			refs)
+	),
+
+	TP_fast_assign(
+		__entry->work		= work;
+		__entry->refs		= atomic_read(&work->pending);
+	),
+
+	TP_printk("work=%p, refs=%d", __entry->work, __entry->refs)
+);
+
+TRACE_EVENT(writeback_pages_written,
+
+	TP_PROTO(long pages_written),
+
+	TP_ARGS(pages_written),
+
+	TP_STRUCT__entry(
+		__field(long,		pages)
+	),
+
+	TP_fast_assign(
+		__entry->pages		= pages_written;
+	),
+
+	TP_printk("%ld", __entry->pages)
+);
+
+
+TRACE_EVENT(writeback_thread_start,
+
+	TP_PROTO(int start),
+
+	TP_ARGS(start),
+
+	TP_STRUCT__entry(
+		__field(int,	start)
+	),
+
+	TP_fast_assign(
+		__entry->start = start;
+	),
+
+	TP_printk("%s", __entry->start ? "started" : "exited")
+);
+
+TRACE_EVENT(writeback_bdi_register,
+
+	TP_PROTO(const char *name, int start),
+
+	TP_ARGS(name, start),
+
+	TP_STRUCT__entry(
+		__array(char,	name,		16)
+		__field(int,	start)
+	),
+
+	TP_fast_assign(
+		strncpy(__entry->name, name, 16);
+		__entry->start = start;
+	),
+
+	TP_printk("%s: %s", __entry->name,
+			__entry->start ? "registered" : "unregistered")
+);
+#endif /* _TRACE_WRITEBACK_H */
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 0e8ca03..2323e92 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -11,6 +11,9 @@
 #include <linux/writeback.h>
 #include <linux/device.h>
 
+#define CREATE_TRACE_POINTS
+#include <trace/events/writeback.h>
+
 void default_unplug_io_fn(struct backing_dev_info *bdi, struct page *page)
 {
 }
@@ -570,6 +573,7 @@ int bdi_register(struct backing_dev_info *bdi, struct device *parent,
 
 	bdi_debug_register(bdi, dev_name(dev));
 	set_bit(BDI_registered, &bdi->state);
+	trace_writeback_bdi_register(dev_name(dev), 1);
 exit:
 	return ret;
 }
@@ -632,6 +636,7 @@ static void bdi_prune_sb(struct backing_dev_info *bdi)
 void bdi_unregister(struct backing_dev_info *bdi)
 {
 	if (bdi->dev) {
+		trace_writeback_bdi_register(dev_name(bdi->dev), 0);
 		bdi_prune_sb(bdi);
 
 		if (!bdi_cap_flush_forker(bdi))
-- 
1.6.5


^ permalink raw reply related	[flat|nested] 54+ messages in thread

* [PATCH 1/4] writeback: initial tracing support
@ 2010-04-20  2:41   ` Dave Chinner
  0 siblings, 0 replies; 54+ messages in thread
From: Dave Chinner @ 2010-04-20  2:41 UTC (permalink / raw)
  To: linux-fsdevel; +Cc: linux-kernel, xfs

From: Jens Axboe <jens.axboe@oracle.com>

Trace queue/sched/exec parts of the writeback loop.

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
Signed-off-by: Dave Chinner <dchinner@redhat.com>
---
 fs/fs-writeback.c                |   51 +++++-------
 include/linux/writeback.h        |   27 ++++++
 include/trace/events/writeback.h |  171 ++++++++++++++++++++++++++++++++++++++
 mm/backing-dev.c                 |    5 +
 4 files changed, 224 insertions(+), 30 deletions(-)
 create mode 100644 include/trace/events/writeback.h

diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 76fc4d5..3f5f0a5 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -25,7 +25,9 @@
 #include <linux/blkdev.h>
 #include <linux/backing-dev.h>
 #include <linux/buffer_head.h>
+#include <linux/ftrace.h>
 #include "internal.h"
+#include <trace/events/writeback.h>
 
 #define inode_to_bdi(inode)	((inode)->i_mapping->backing_dev_info)
 
@@ -34,33 +36,6 @@
  */
 int nr_pdflush_threads;
 
-/*
- * Passed into wb_writeback(), essentially a subset of writeback_control
- */
-struct wb_writeback_args {
-	long nr_pages;
-	struct super_block *sb;
-	enum writeback_sync_modes sync_mode;
-	int for_kupdate:1;
-	int range_cyclic:1;
-	int for_background:1;
-};
-
-/*
- * Work items for the bdi_writeback threads
- */
-struct bdi_work {
-	struct list_head list;		/* pending work list */
-	struct rcu_head rcu_head;	/* for RCU free/clear of work */
-
-	unsigned long seen;		/* threads that have seen this work */
-	atomic_t pending;		/* number of threads still to do work */
-
-	struct wb_writeback_args args;	/* writeback arguments */
-
-	unsigned long state;		/* flag bits, see WS_* */
-};
-
 enum {
 	WS_USED_B = 0,
 	WS_ONSTACK_B,
@@ -135,6 +110,8 @@ static void wb_work_complete(struct bdi_work *work)
 
 static void wb_clear_pending(struct bdi_writeback *wb, struct bdi_work *work)
 {
+	trace_writeback_clear(work);
+
 	/*
 	 * The caller has retrieved the work arguments from this work,
 	 * drop our reference. If this is the last ref, delete and free it
@@ -170,12 +147,16 @@ static void bdi_queue_work(struct backing_dev_info *bdi, struct bdi_work *work)
 	 * If the default thread isn't there, make sure we add it. When
 	 * it gets created and wakes up, we'll run this work.
 	 */
-	if (unlikely(list_empty_careful(&bdi->wb_list)))
+	if (unlikely(list_empty_careful(&bdi->wb_list))) {
+		trace_writeback_sched(bdi, work, "default");
 		wake_up_process(default_backing_dev_info.wb.task);
-	else {
+	} else {
 		struct bdi_writeback *wb = &bdi->wb;
+		struct task_struct *task = wb->task;
 
-		if (wb->task)
+		trace_writeback_sched(bdi, work, task ? "task" : "notask");
+
+		if (task)
 			wake_up_process(wb->task);
 	}
 }
@@ -202,6 +183,7 @@ static void bdi_alloc_queue_work(struct backing_dev_info *bdi,
 	work = kmalloc(sizeof(*work), GFP_ATOMIC);
 	if (work) {
 		bdi_work_init(work, args);
+		trace_writeback_queue(bdi, args);
 		bdi_queue_work(bdi, work);
 	} else {
 		struct bdi_writeback *wb = &bdi->wb;
@@ -235,6 +217,7 @@ static void bdi_sync_writeback(struct backing_dev_info *bdi,
 	bdi_work_init(&work, &args);
 	work.state |= WS_ONSTACK;
 
+	trace_writeback_queue(bdi, &args);
 	bdi_queue_work(bdi, &work);
 	bdi_wait_on_work_clear(&work);
 }
@@ -880,6 +863,8 @@ long wb_do_writeback(struct bdi_writeback *wb, int force_wait)
 		if (force_wait)
 			work->args.sync_mode = args.sync_mode = WB_SYNC_ALL;
 
+		trace_writeback_exec(work);
+
 		/*
 		 * If this isn't a data integrity operation, just notify
 		 * that we have seen this work and we are now starting it.
@@ -915,9 +900,13 @@ int bdi_writeback_task(struct bdi_writeback *wb)
 	unsigned long wait_jiffies = -1UL;
 	long pages_written;
 
+	trace_writeback_thread_start(1);
+
 	while (!kthread_should_stop()) {
 		pages_written = wb_do_writeback(wb, 0);
 
+		trace_writeback_pages_written(pages_written);
+
 		if (pages_written)
 			last_active = jiffies;
 		else if (wait_jiffies != -1UL) {
@@ -938,6 +927,8 @@ int bdi_writeback_task(struct bdi_writeback *wb)
 		try_to_freeze();
 	}
 
+	trace_writeback_thread_start(0);
+
 	return 0;
 }
 
diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index 76e8903..b2d615f 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -22,6 +22,33 @@ enum writeback_sync_modes {
 };
 
 /*
+ * Passed into wb_writeback(), essentially a subset of writeback_control
+ */
+struct wb_writeback_args {
+	long nr_pages;
+	struct super_block *sb;
+	enum writeback_sync_modes sync_mode;
+	int for_kupdate:1;
+	int range_cyclic:1;
+	int for_background:1;
+};
+
+/*
+ * Work items for the bdi_writeback threads
+ */
+struct bdi_work {
+	struct list_head list;		/* pending work list */
+	struct rcu_head rcu_head;	/* for RCU free/clear of work */
+
+	unsigned long seen;		/* threads that have seen this work */
+	atomic_t pending;		/* number of threads still to do work */
+
+	struct wb_writeback_args args;	/* writeback arguments */
+
+	unsigned long state;		/* flag bits, see WS_* */
+};
+
+/*
  * A control structure which tells the writeback code what to do.  These are
  * always on the stack, and hence need no locking.  They are always initialised
  * in a manner such that unspecified fields are set to zero.
diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h
new file mode 100644
index 0000000..df76457
--- /dev/null
+++ b/include/trace/events/writeback.h
@@ -0,0 +1,171 @@
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM writeback
+
+#if !defined(_TRACE_WRITEBACK_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_WRITEBACK_H
+
+#include <linux/backing-dev.h>
+#include <linux/writeback.h>
+
+TRACE_EVENT(writeback_queue,
+
+	TP_PROTO(struct backing_dev_info *bdi, struct wb_writeback_args *args),
+
+	TP_ARGS(bdi, args),
+
+	TP_STRUCT__entry(
+		__array(char,		name,		16)
+		__field(long,		nr_pages)
+		__field(int,		sb)
+		__field(int,		sync_mode)
+		__field(int,		for_kupdate)
+		__field(int,		range_cyclic)
+		__field(int,		for_background)
+	),
+
+	TP_fast_assign(
+		strncpy(__entry->name, dev_name(bdi->dev), 16);
+		__entry->nr_pages	= args->nr_pages;
+		__entry->sb		= !!args->sb;
+		__entry->for_kupdate	= args->for_kupdate;
+		__entry->range_cyclic	= args->range_cyclic;
+		__entry->for_background	= args->for_background;
+	),
+
+	TP_printk("%s: pages=%ld, sb=%d, kupdate=%d, range_cyclic=%d "
+		  "for_background=%d", __entry->name, __entry->nr_pages,
+			__entry->sb, __entry->for_kupdate,
+			__entry->range_cyclic, __entry->for_background)
+);
+
+TRACE_EVENT(writeback_sched,
+
+	TP_PROTO(struct backing_dev_info *bdi, struct bdi_work *work,
+		 const char *msg),
+
+	TP_ARGS(bdi, work, msg),
+
+	TP_STRUCT__entry(
+		__array(char,			name,		16)
+		__field(unsigned int,		work)
+		__array(char,			task,		8)
+	),
+
+	TP_fast_assign(
+		strncpy(__entry->name, dev_name(bdi->dev), 16);
+		__entry->work = (unsigned long) work & 0xffff;
+		snprintf(__entry->task, 8, "%s", msg);
+	),
+
+	TP_printk("work=%x, task=%s", __entry->work, __entry->task)
+);
+
+TRACE_EVENT(writeback_exec,
+
+	TP_PROTO(struct bdi_work *work),
+
+	TP_ARGS(work),
+
+	TP_STRUCT__entry(
+		__field(unsigned int,	work)
+		__field(long,		nr_pages)
+		__field(int,		sb)
+		__field(int,		sync_mode)
+		__field(int,		for_kupdate)
+		__field(int,		range_cyclic)
+		__field(int,		for_background)
+	),
+
+	TP_fast_assign(
+		__entry->work		= (unsigned long) work & 0xffff;
+		__entry->nr_pages	= work->args.nr_pages;
+		__entry->sb		= !!work->args.sb;
+		__entry->for_kupdate	= work->args.for_kupdate;
+		__entry->range_cyclic	= work->args.range_cyclic;
+		__entry->for_background	= work->args.for_background;
+
+	),
+
+	TP_printk("work=%x pages=%ld, sb=%d, kupdate=%d, range_cyclic=%d"
+		  " for_background=%d", __entry->work,
+			__entry->nr_pages, __entry->sb, __entry->for_kupdate,
+			__entry->range_cyclic, __entry->for_background)
+);
+
+TRACE_EVENT(writeback_clear,
+
+	TP_PROTO(struct bdi_work *work),
+
+	TP_ARGS(work),
+
+	TP_STRUCT__entry(
+		__field(struct bdi_work *,	work)
+		__field(int,			refs)
+	),
+
+	TP_fast_assign(
+		__entry->work		= work;
+		__entry->refs		= atomic_read(&work->pending);
+	),
+
+	TP_printk("work=%p, refs=%d", __entry->work, __entry->refs)
+);
+
+TRACE_EVENT(writeback_pages_written,
+
+	TP_PROTO(long pages_written),
+
+	TP_ARGS(pages_written),
+
+	TP_STRUCT__entry(
+		__field(long,		pages)
+	),
+
+	TP_fast_assign(
+		__entry->pages		= pages_written;
+	),
+
+	TP_printk("%ld", __entry->pages)
+);
+
+
+TRACE_EVENT(writeback_thread_start,
+
+	TP_PROTO(int start),
+
+	TP_ARGS(start),
+
+	TP_STRUCT__entry(
+		__field(int,	start)
+	),
+
+	TP_fast_assign(
+		__entry->start = start;
+	),
+
+	TP_printk("%s", __entry->start ? "started" : "exited")
+);
+
+TRACE_EVENT(writeback_bdi_register,
+
+	TP_PROTO(const char *name, int start),
+
+	TP_ARGS(name, start),
+
+	TP_STRUCT__entry(
+		__array(char,	name,		16)
+		__field(int,	start)
+	),
+
+	TP_fast_assign(
+		strncpy(__entry->name, name, 16);
+		__entry->start = start;
+	),
+
+	TP_printk("%s: %s", __entry->name,
+			__entry->start ? "registered" : "unregistered")
+);
+#endif /* _TRACE_WRITEBACK_H */
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 0e8ca03..2323e92 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -11,6 +11,9 @@
 #include <linux/writeback.h>
 #include <linux/device.h>
 
+#define CREATE_TRACE_POINTS
+#include <trace/events/writeback.h>
+
 void default_unplug_io_fn(struct backing_dev_info *bdi, struct page *page)
 {
 }
@@ -570,6 +573,7 @@ int bdi_register(struct backing_dev_info *bdi, struct device *parent,
 
 	bdi_debug_register(bdi, dev_name(dev));
 	set_bit(BDI_registered, &bdi->state);
+	trace_writeback_bdi_register(dev_name(dev), 1);
 exit:
 	return ret;
 }
@@ -632,6 +636,7 @@ static void bdi_prune_sb(struct backing_dev_info *bdi)
 void bdi_unregister(struct backing_dev_info *bdi)
 {
 	if (bdi->dev) {
+		trace_writeback_bdi_register(dev_name(bdi->dev), 0);
 		bdi_prune_sb(bdi);
 
 		if (!bdi_cap_flush_forker(bdi))
-- 
1.6.5

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply related	[flat|nested] 54+ messages in thread

* [PATCH 2/4] writeback: Add tracing to balance_dirty_pages
  2010-04-20  2:41 ` Dave Chinner
@ 2010-04-20  2:41   ` Dave Chinner
  -1 siblings, 0 replies; 54+ messages in thread
From: Dave Chinner @ 2010-04-20  2:41 UTC (permalink / raw)
  To: linux-fsdevel; +Cc: linux-kernel, xfs

From: Dave Chinner <dchinner@redhat.com>

Tracing high level background writeback events is good, but it doesn't
give the entire picture. Add IO dispatched by foreground throttling to the
writeback events.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
---
 fs/fs-writeback.c                |    5 ++
 include/trace/events/writeback.h |   77 ++++++++++++++++++++++++++++++++++++++
 mm/page-writeback.c              |    4 ++
 3 files changed, 86 insertions(+), 0 deletions(-)

diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 3f5f0a5..5214b61 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -752,7 +752,11 @@ static long wb_writeback(struct bdi_writeback *wb,
 		wbc.more_io = 0;
 		wbc.nr_to_write = MAX_WRITEBACK_PAGES;
 		wbc.pages_skipped = 0;
+
+		trace_wbc_writeback_start(&wbc);
 		writeback_inodes_wb(wb, &wbc);
+		trace_wbc_writeback_written(&wbc);
+
 		args->nr_pages -= MAX_WRITEBACK_PAGES - wbc.nr_to_write;
 		wrote += MAX_WRITEBACK_PAGES - wbc.nr_to_write;
 
@@ -780,6 +784,7 @@ static long wb_writeback(struct bdi_writeback *wb,
 		if (!list_empty(&wb->b_more_io))  {
 			inode = list_entry(wb->b_more_io.prev,
 						struct inode, i_list);
+			trace_wbc_writeback_wait(&wbc);
 			inode_wait_for_writeback(inode);
 		}
 		spin_unlock(&inode_lock);
diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h
index df76457..02f34a5 100644
--- a/include/trace/events/writeback.h
+++ b/include/trace/events/writeback.h
@@ -165,6 +165,83 @@ TRACE_EVENT(writeback_bdi_register,
 	TP_printk("%s: %s", __entry->name,
 			__entry->start ? "registered" : "unregistered")
 );
+
+/* pass flags explicitly */
+DECLARE_EVENT_CLASS(wbc_class,
+	TP_PROTO(struct writeback_control *wbc),
+	TP_ARGS(wbc),
+	TP_STRUCT__entry(
+		__field(unsigned int, wbc)
+		__array(char, name, 16)
+		__field(long, nr_to_write)
+		__field(long, pages_skipped)
+		__field(int, sb)
+		__field(int, sync_mode)
+		__field(int, nonblocking)
+		__field(int, encountered_congestion)
+		__field(int, for_kupdate)
+		__field(int, for_background)
+		__field(int, for_reclaim)
+		__field(int, range_cyclic)
+		__field(int, more_io)
+		__field(unsigned long, older_than_this)
+		__field(long, range_start)
+		__field(long, range_end)
+	),
+
+	TP_fast_assign(
+		char *__name = "(none)";
+
+		__entry->wbc		= (unsigned long)wbc & 0xffff;
+		if (wbc->bdi)
+			strncpy(__entry->name, dev_name(wbc->bdi->dev), 16);
+		else
+			strncpy(__entry->name, __name, 16);
+		__entry->nr_to_write	= wbc->nr_to_write;
+		__entry->pages_skipped	= wbc->pages_skipped;
+		__entry->sb		= !!wbc->sb;
+		__entry->sync_mode	= wbc->sync_mode;
+		__entry->for_kupdate	= wbc->for_kupdate;
+		__entry->for_background	= wbc->for_background;
+		__entry->for_reclaim	= wbc->for_reclaim;
+		__entry->range_cyclic	= wbc->range_cyclic;
+		__entry->more_io	= wbc->more_io;
+		__entry->older_than_this = wbc->older_than_this ?
+						*wbc->older_than_this : 0;
+		__entry->range_start	= (long)wbc->range_start;
+		__entry->range_end	= (long)wbc->range_end;
+	),
+
+	TP_printk("dev %s wbc=%x towrt=%ld skip=%ld sb=%d mode=%d kupd=%d "
+		"bgrd=%d reclm=%d cyclic=%d more=%d older=0x%lx "
+		"start=0x%lx end=0x%lx",
+		__entry->name,
+		__entry->wbc,
+		__entry->nr_to_write,
+		__entry->pages_skipped,
+		__entry->sb,
+		__entry->sync_mode,
+		__entry->for_kupdate,
+		__entry->for_background,
+		__entry->for_reclaim,
+		__entry->range_cyclic,
+		__entry->more_io,
+		__entry->older_than_this,
+		__entry->range_start,
+		__entry->range_end)
+)
+
+#define DEFINE_WBC_EVENT(name) \
+DEFINE_EVENT(wbc_class, name, \
+	TP_PROTO(struct writeback_control *wbc), \
+	TP_ARGS(wbc))
+DEFINE_WBC_EVENT(wbc_writeback_start);
+DEFINE_WBC_EVENT(wbc_writeback_written);
+DEFINE_WBC_EVENT(wbc_writeback_wait);
+DEFINE_WBC_EVENT(wbc_balance_dirty_start);
+DEFINE_WBC_EVENT(wbc_balance_dirty_written);
+DEFINE_WBC_EVENT(wbc_balance_dirty_wait);
+
 #endif /* _TRACE_WRITEBACK_H */
 
 /* This part must be outside protection */
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 0b19943..d45f59e 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -34,6 +34,7 @@
 #include <linux/syscalls.h>
 #include <linux/buffer_head.h>
 #include <linux/pagevec.h>
+#include <trace/events/writeback.h>
 
 /*
  * After a CPU has dirtied this many pages, balance_dirty_pages_ratelimited
@@ -536,11 +537,13 @@ static void balance_dirty_pages(struct address_space *mapping,
 		 * threshold otherwise wait until the disk writes catch
 		 * up.
 		 */
+		trace_wbc_balance_dirty_start(&wbc);
 		if (bdi_nr_reclaimable > bdi_thresh) {
 			writeback_inodes_wbc(&wbc);
 			pages_written += write_chunk - wbc.nr_to_write;
 			get_dirty_limits(&background_thresh, &dirty_thresh,
 				       &bdi_thresh, bdi);
+			trace_wbc_balance_dirty_written(&wbc);
 		}
 
 		/*
@@ -566,6 +569,7 @@ static void balance_dirty_pages(struct address_space *mapping,
 		if (pages_written >= write_chunk)
 			break;		/* We've done our duty */
 
+		trace_wbc_balance_dirty_wait(&wbc);
 		__set_current_state(TASK_INTERRUPTIBLE);
 		io_schedule_timeout(pause);
 
-- 
1.6.5


^ permalink raw reply related	[flat|nested] 54+ messages in thread

* [PATCH 2/4] writeback: Add tracing to balance_dirty_pages
@ 2010-04-20  2:41   ` Dave Chinner
  0 siblings, 0 replies; 54+ messages in thread
From: Dave Chinner @ 2010-04-20  2:41 UTC (permalink / raw)
  To: linux-fsdevel; +Cc: linux-kernel, xfs

From: Dave Chinner <dchinner@redhat.com>

Tracing high level background writeback events is good, but it doesn't
give the entire picture. Add IO dispatched by foreground throttling to the
writeback events.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
---
 fs/fs-writeback.c                |    5 ++
 include/trace/events/writeback.h |   77 ++++++++++++++++++++++++++++++++++++++
 mm/page-writeback.c              |    4 ++
 3 files changed, 86 insertions(+), 0 deletions(-)

diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 3f5f0a5..5214b61 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -752,7 +752,11 @@ static long wb_writeback(struct bdi_writeback *wb,
 		wbc.more_io = 0;
 		wbc.nr_to_write = MAX_WRITEBACK_PAGES;
 		wbc.pages_skipped = 0;
+
+		trace_wbc_writeback_start(&wbc);
 		writeback_inodes_wb(wb, &wbc);
+		trace_wbc_writeback_written(&wbc);
+
 		args->nr_pages -= MAX_WRITEBACK_PAGES - wbc.nr_to_write;
 		wrote += MAX_WRITEBACK_PAGES - wbc.nr_to_write;
 
@@ -780,6 +784,7 @@ static long wb_writeback(struct bdi_writeback *wb,
 		if (!list_empty(&wb->b_more_io))  {
 			inode = list_entry(wb->b_more_io.prev,
 						struct inode, i_list);
+			trace_wbc_writeback_wait(&wbc);
 			inode_wait_for_writeback(inode);
 		}
 		spin_unlock(&inode_lock);
diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h
index df76457..02f34a5 100644
--- a/include/trace/events/writeback.h
+++ b/include/trace/events/writeback.h
@@ -165,6 +165,83 @@ TRACE_EVENT(writeback_bdi_register,
 	TP_printk("%s: %s", __entry->name,
 			__entry->start ? "registered" : "unregistered")
 );
+
+/* pass flags explicitly */
+DECLARE_EVENT_CLASS(wbc_class,
+	TP_PROTO(struct writeback_control *wbc),
+	TP_ARGS(wbc),
+	TP_STRUCT__entry(
+		__field(unsigned int, wbc)
+		__array(char, name, 16)
+		__field(long, nr_to_write)
+		__field(long, pages_skipped)
+		__field(int, sb)
+		__field(int, sync_mode)
+		__field(int, nonblocking)
+		__field(int, encountered_congestion)
+		__field(int, for_kupdate)
+		__field(int, for_background)
+		__field(int, for_reclaim)
+		__field(int, range_cyclic)
+		__field(int, more_io)
+		__field(unsigned long, older_than_this)
+		__field(long, range_start)
+		__field(long, range_end)
+	),
+
+	TP_fast_assign(
+		char *__name = "(none)";
+
+		__entry->wbc		= (unsigned long)wbc & 0xffff;
+		if (wbc->bdi)
+			strncpy(__entry->name, dev_name(wbc->bdi->dev), 16);
+		else
+			strncpy(__entry->name, __name, 16);
+		__entry->nr_to_write	= wbc->nr_to_write;
+		__entry->pages_skipped	= wbc->pages_skipped;
+		__entry->sb		= !!wbc->sb;
+		__entry->sync_mode	= wbc->sync_mode;
+		__entry->for_kupdate	= wbc->for_kupdate;
+		__entry->for_background	= wbc->for_background;
+		__entry->for_reclaim	= wbc->for_reclaim;
+		__entry->range_cyclic	= wbc->range_cyclic;
+		__entry->more_io	= wbc->more_io;
+		__entry->older_than_this = wbc->older_than_this ?
+						*wbc->older_than_this : 0;
+		__entry->range_start	= (long)wbc->range_start;
+		__entry->range_end	= (long)wbc->range_end;
+	),
+
+	TP_printk("dev %s wbc=%x towrt=%ld skip=%ld sb=%d mode=%d kupd=%d "
+		"bgrd=%d reclm=%d cyclic=%d more=%d older=0x%lx "
+		"start=0x%lx end=0x%lx",
+		__entry->name,
+		__entry->wbc,
+		__entry->nr_to_write,
+		__entry->pages_skipped,
+		__entry->sb,
+		__entry->sync_mode,
+		__entry->for_kupdate,
+		__entry->for_background,
+		__entry->for_reclaim,
+		__entry->range_cyclic,
+		__entry->more_io,
+		__entry->older_than_this,
+		__entry->range_start,
+		__entry->range_end)
+)
+
+#define DEFINE_WBC_EVENT(name) \
+DEFINE_EVENT(wbc_class, name, \
+	TP_PROTO(struct writeback_control *wbc), \
+	TP_ARGS(wbc))
+DEFINE_WBC_EVENT(wbc_writeback_start);
+DEFINE_WBC_EVENT(wbc_writeback_written);
+DEFINE_WBC_EVENT(wbc_writeback_wait);
+DEFINE_WBC_EVENT(wbc_balance_dirty_start);
+DEFINE_WBC_EVENT(wbc_balance_dirty_written);
+DEFINE_WBC_EVENT(wbc_balance_dirty_wait);
+
 #endif /* _TRACE_WRITEBACK_H */
 
 /* This part must be outside protection */
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 0b19943..d45f59e 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -34,6 +34,7 @@
 #include <linux/syscalls.h>
 #include <linux/buffer_head.h>
 #include <linux/pagevec.h>
+#include <trace/events/writeback.h>
 
 /*
  * After a CPU has dirtied this many pages, balance_dirty_pages_ratelimited
@@ -536,11 +537,13 @@ static void balance_dirty_pages(struct address_space *mapping,
 		 * threshold otherwise wait until the disk writes catch
 		 * up.
 		 */
+		trace_wbc_balance_dirty_start(&wbc);
 		if (bdi_nr_reclaimable > bdi_thresh) {
 			writeback_inodes_wbc(&wbc);
 			pages_written += write_chunk - wbc.nr_to_write;
 			get_dirty_limits(&background_thresh, &dirty_thresh,
 				       &bdi_thresh, bdi);
+			trace_wbc_balance_dirty_written(&wbc);
 		}
 
 		/*
@@ -566,6 +569,7 @@ static void balance_dirty_pages(struct address_space *mapping,
 		if (pages_written >= write_chunk)
 			break;		/* We've done our duty */
 
+		trace_wbc_balance_dirty_wait(&wbc);
 		__set_current_state(TASK_INTERRUPTIBLE);
 		io_schedule_timeout(pause);
 
-- 
1.6.5

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply related	[flat|nested] 54+ messages in thread

* [PATCH 3/4] writeback: pay attention to wbc->nr_to_write in write_cache_pages
  2010-04-20  2:41 ` Dave Chinner
@ 2010-04-20  2:41   ` Dave Chinner
  -1 siblings, 0 replies; 54+ messages in thread
From: Dave Chinner @ 2010-04-20  2:41 UTC (permalink / raw)
  To: linux-fsdevel; +Cc: linux-kernel, xfs

From: Dave Chinner <dchinner@redhat.com>

If a filesystem writes more than one page in ->writepage, write_cache_pages
fails to notice this and continues to attempt writeback when wbc->nr_to_write
has gone negative - this trace was captured from XFS:


    wbc_writeback_start: towrt=1024
    wbc_writepage: towrt=1024
    wbc_writepage: towrt=0
    wbc_writepage: towrt=-1
    wbc_writepage: towrt=-5
    wbc_writepage: towrt=-21
    wbc_writepage: towrt=-85

This has adverse effects on filesystem writeback behaviour. write_cache_pages()
needs to terminate after a certain number of pages are written, not after a
certain number of calls to ->writepage are made. Make it observe the current
value of wbc->nr_to_write and treat a value of <= 0 as though it is either a
termination condition or a trigger to reset to MAX_WRITEBACK_PAGES for data
integrity syncs.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
---
 fs/fs-writeback.c                |    9 ---------
 include/linux/writeback.h        |    9 +++++++++
 include/trace/events/writeback.h |    1 +
 mm/page-writeback.c              |   20 +++++++++++++++++++-
 4 files changed, 29 insertions(+), 10 deletions(-)

diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 5214b61..d8271d5 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -675,15 +675,6 @@ void writeback_inodes_wbc(struct writeback_control *wbc)
 	writeback_inodes_wb(&bdi->wb, wbc);
 }
 
-/*
- * The maximum number of pages to writeout in a single bdi flush/kupdate
- * operation.  We do this so we don't hold I_SYNC against an inode for
- * enormous amounts of time, which would block a userspace task which has
- * been forced to throttle against that inode.  Also, the code reevaluates
- * the dirty each time it has written this many pages.
- */
-#define MAX_WRITEBACK_PAGES     1024
-
 static inline bool over_bground_thresh(void)
 {
 	unsigned long background_thresh, dirty_thresh;
diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index b2d615f..8533a0f 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -14,6 +14,15 @@ extern struct list_head inode_in_use;
 extern struct list_head inode_unused;
 
 /*
+ * The maximum number of pages to writeout in a single bdi flush/kupdate
+ * operation.  We do this so we don't hold I_SYNC against an inode for
+ * enormous amounts of time, which would block a userspace task which has
+ * been forced to throttle against that inode.  Also, the code reevaluates
+ * the dirty each time it has written this many pages.
+ */
+#define MAX_WRITEBACK_PAGES     1024
+
+/*
  * fs/fs-writeback.c
  */
 enum writeback_sync_modes {
diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h
index 02f34a5..3bcbd83 100644
--- a/include/trace/events/writeback.h
+++ b/include/trace/events/writeback.h
@@ -241,6 +241,7 @@ DEFINE_WBC_EVENT(wbc_writeback_wait);
 DEFINE_WBC_EVENT(wbc_balance_dirty_start);
 DEFINE_WBC_EVENT(wbc_balance_dirty_written);
 DEFINE_WBC_EVENT(wbc_balance_dirty_wait);
+DEFINE_WBC_EVENT(wbc_writepage);
 
 #endif /* _TRACE_WRITEBACK_H */
 
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index d45f59e..e22af84 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -917,6 +917,7 @@ continue_unlock:
 			if (!clear_page_dirty_for_io(page))
 				goto continue_unlock;
 
+			trace_wbc_writepage(wbc);
 			ret = (*writepage)(page, wbc, data);
 			if (unlikely(ret)) {
 				if (ret == AOP_WRITEPAGE_ACTIVATE) {
@@ -935,7 +936,7 @@ continue_unlock:
 					done = 1;
 					break;
 				}
- 			}
+			}
 
 			if (nr_to_write > 0) {
 				nr_to_write--;
@@ -955,6 +956,23 @@ continue_unlock:
 					break;
 				}
 			}
+
+			/*
+			 * Some filesystems will write multiple pages in
+			 * ->writepage, so wbc->nr_to_write can change much,
+			 * much faster than nr_to_write. Check this as an exit
+			 * condition, or if we are doing a data integrity sync,
+			 * reset the wbc to MAX_WRITEBACK_PAGES so that such
+			 * filesystems can do optimal writeout here.
+			 */
+			if (wbc->nr_to_write <= 0) {
+				if (wbc->sync_mode == WB_SYNC_NONE) {
+					done = 1;
+					nr_to_write = 0;
+					break;
+				}
+				wbc->nr_to_write = MAX_WRITEBACK_PAGES;
+			}
 		}
 		pagevec_release(&pvec);
 		cond_resched();
-- 
1.6.5


^ permalink raw reply related	[flat|nested] 54+ messages in thread

* [PATCH 3/4] writeback: pay attention to wbc->nr_to_write in write_cache_pages
@ 2010-04-20  2:41   ` Dave Chinner
  0 siblings, 0 replies; 54+ messages in thread
From: Dave Chinner @ 2010-04-20  2:41 UTC (permalink / raw)
  To: linux-fsdevel; +Cc: linux-kernel, xfs

From: Dave Chinner <dchinner@redhat.com>

If a filesystem writes more than one page in ->writepage, write_cache_pages
fails to notice this and continues to attempt writeback when wbc->nr_to_write
has gone negative - this trace was captured from XFS:


    wbc_writeback_start: towrt=1024
    wbc_writepage: towrt=1024
    wbc_writepage: towrt=0
    wbc_writepage: towrt=-1
    wbc_writepage: towrt=-5
    wbc_writepage: towrt=-21
    wbc_writepage: towrt=-85

This has adverse effects on filesystem writeback behaviour. write_cache_pages()
needs to terminate after a certain number of pages are written, not after a
certain number of calls to ->writepage are made. Make it observe the current
value of wbc->nr_to_write and treat a value of <= 0 as though it is either a
termination condition or a trigger to reset to MAX_WRITEBACK_PAGES for data
integrity syncs.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
---
 fs/fs-writeback.c                |    9 ---------
 include/linux/writeback.h        |    9 +++++++++
 include/trace/events/writeback.h |    1 +
 mm/page-writeback.c              |   20 +++++++++++++++++++-
 4 files changed, 29 insertions(+), 10 deletions(-)

diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 5214b61..d8271d5 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -675,15 +675,6 @@ void writeback_inodes_wbc(struct writeback_control *wbc)
 	writeback_inodes_wb(&bdi->wb, wbc);
 }
 
-/*
- * The maximum number of pages to writeout in a single bdi flush/kupdate
- * operation.  We do this so we don't hold I_SYNC against an inode for
- * enormous amounts of time, which would block a userspace task which has
- * been forced to throttle against that inode.  Also, the code reevaluates
- * the dirty each time it has written this many pages.
- */
-#define MAX_WRITEBACK_PAGES     1024
-
 static inline bool over_bground_thresh(void)
 {
 	unsigned long background_thresh, dirty_thresh;
diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index b2d615f..8533a0f 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -14,6 +14,15 @@ extern struct list_head inode_in_use;
 extern struct list_head inode_unused;
 
 /*
+ * The maximum number of pages to writeout in a single bdi flush/kupdate
+ * operation.  We do this so we don't hold I_SYNC against an inode for
+ * enormous amounts of time, which would block a userspace task which has
+ * been forced to throttle against that inode.  Also, the code reevaluates
+ * the dirty each time it has written this many pages.
+ */
+#define MAX_WRITEBACK_PAGES     1024
+
+/*
  * fs/fs-writeback.c
  */
 enum writeback_sync_modes {
diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h
index 02f34a5..3bcbd83 100644
--- a/include/trace/events/writeback.h
+++ b/include/trace/events/writeback.h
@@ -241,6 +241,7 @@ DEFINE_WBC_EVENT(wbc_writeback_wait);
 DEFINE_WBC_EVENT(wbc_balance_dirty_start);
 DEFINE_WBC_EVENT(wbc_balance_dirty_written);
 DEFINE_WBC_EVENT(wbc_balance_dirty_wait);
+DEFINE_WBC_EVENT(wbc_writepage);
 
 #endif /* _TRACE_WRITEBACK_H */
 
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index d45f59e..e22af84 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -917,6 +917,7 @@ continue_unlock:
 			if (!clear_page_dirty_for_io(page))
 				goto continue_unlock;
 
+			trace_wbc_writepage(wbc);
 			ret = (*writepage)(page, wbc, data);
 			if (unlikely(ret)) {
 				if (ret == AOP_WRITEPAGE_ACTIVATE) {
@@ -935,7 +936,7 @@ continue_unlock:
 					done = 1;
 					break;
 				}
- 			}
+			}
 
 			if (nr_to_write > 0) {
 				nr_to_write--;
@@ -955,6 +956,23 @@ continue_unlock:
 					break;
 				}
 			}
+
+			/*
+			 * Some filesystems will write multiple pages in
+			 * ->writepage, so wbc->nr_to_write can change much,
+			 * much faster than nr_to_write. Check this as an exit
+			 * condition, or if we are doing a data integrity sync,
+			 * reset the wbc to MAX_WRITEBACK_PAGES so that such
+			 * filesystems can do optimal writeout here.
+			 */
+			if (wbc->nr_to_write <= 0) {
+				if (wbc->sync_mode == WB_SYNC_NONE) {
+					done = 1;
+					nr_to_write = 0;
+					break;
+				}
+				wbc->nr_to_write = MAX_WRITEBACK_PAGES;
+			}
 		}
 		pagevec_release(&pvec);
 		cond_resched();
-- 
1.6.5

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply related	[flat|nested] 54+ messages in thread

* [PATCH 4/4] xfs: remove nr_to_write writeback windup.
  2010-04-20  2:41 ` Dave Chinner
@ 2010-04-20  2:41   ` Dave Chinner
  -1 siblings, 0 replies; 54+ messages in thread
From: Dave Chinner @ 2010-04-20  2:41 UTC (permalink / raw)
  To: linux-fsdevel; +Cc: linux-kernel, xfs

From: Dave Chinner <dchinner@redhat.com>

Now that the background flush code has been fixed, we shouldn't need to
silently multiply the wbc->nr_to_write to get good writeback. Remove
that code.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
---
 fs/xfs/linux-2.6/xfs_aops.c |    8 --------
 1 files changed, 0 insertions(+), 8 deletions(-)

diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c
index 9962850..2b2225d 100644
--- a/fs/xfs/linux-2.6/xfs_aops.c
+++ b/fs/xfs/linux-2.6/xfs_aops.c
@@ -1336,14 +1336,6 @@ xfs_vm_writepage(
 	if (!page_has_buffers(page))
 		create_empty_buffers(page, 1 << inode->i_blkbits, 0);
 
-
-	/*
-	 *  VM calculation for nr_to_write seems off.  Bump it way
-	 *  up, this gets simple streaming writes zippy again.
-	 *  To be reviewed again after Jens' writeback changes.
-	 */
-	wbc->nr_to_write *= 4;
-
 	/*
 	 * Convert delayed allocate, unwritten or unmapped space
 	 * to real space and flush out to disk.
-- 
1.6.5


^ permalink raw reply related	[flat|nested] 54+ messages in thread

* [PATCH 4/4] xfs: remove nr_to_write writeback windup.
@ 2010-04-20  2:41   ` Dave Chinner
  0 siblings, 0 replies; 54+ messages in thread
From: Dave Chinner @ 2010-04-20  2:41 UTC (permalink / raw)
  To: linux-fsdevel; +Cc: linux-kernel, xfs

From: Dave Chinner <dchinner@redhat.com>

Now that the background flush code has been fixed, we shouldn't need to
silently multiply the wbc->nr_to_write to get good writeback. Remove
that code.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
---
 fs/xfs/linux-2.6/xfs_aops.c |    8 --------
 1 files changed, 0 insertions(+), 8 deletions(-)

diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c
index 9962850..2b2225d 100644
--- a/fs/xfs/linux-2.6/xfs_aops.c
+++ b/fs/xfs/linux-2.6/xfs_aops.c
@@ -1336,14 +1336,6 @@ xfs_vm_writepage(
 	if (!page_has_buffers(page))
 		create_empty_buffers(page, 1 << inode->i_blkbits, 0);
 
-
-	/*
-	 *  VM calculation for nr_to_write seems off.  Bump it way
-	 *  up, this gets simple streaming writes zippy again.
-	 *  To be reviewed again after Jens' writeback changes.
-	 */
-	wbc->nr_to_write *= 4;
-
 	/*
 	 * Convert delayed allocate, unwritten or unmapped space
 	 * to real space and flush out to disk.
-- 
1.6.5

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply related	[flat|nested] 54+ messages in thread

* [PATCH 5/4] writeback: limit write_cache_pages integrity scanning to current EOF
  2010-04-20  2:41 ` Dave Chinner
@ 2010-04-20  3:40   ` Dave Chinner
  -1 siblings, 0 replies; 54+ messages in thread
From: Dave Chinner @ 2010-04-20  3:40 UTC (permalink / raw)
  To: linux-fsdevel; +Cc: linux-kernel, xfs


sync can currently take a really long time if a concurrent writer is
extending a file. The problem is that the dirty pages on the address
space grow in the same direction as write_cache_pages scans, so if
the writer keeps ahead of writeback, the writeback will not
terminate until the writer stops adding dirty pages.

For a data integrity sync, we only need to write the pages dirty at
the time we start the writeback, so we can stop scanning once we get
to the page that was at the end of the file at the time the scan
started.

This will prevent operations like copying a large file preventing
sync from completing as it will not write back pages that were
dirtied after the sync was started. This does not impact the
existing integrity guarantees, as any dirty page (old or new)
within the EOF range at the start of the scan will still be
captured.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
---
 mm/page-writeback.c |   15 +++++++++++++++
 1 files changed, 15 insertions(+), 0 deletions(-)

diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index e22af84..4ba2728 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -852,7 +852,22 @@ int write_cache_pages(struct address_space *mapping,
 		if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
 			range_whole = 1;
 		cycled = 1; /* ignore range_cyclic tests */
+
+		/*
+		 * If this is a data integrity sync, cap the writeback to the
+		 * current end of file. Any extension to the file that occurs
+		 * after this is a new write and we don't need to write those
+		 * pages out to fulfil our data integrity requirements. If we
+		 * try to write them out, we can get stuck in this scan until
+		 * the concurrent writer stops adding dirty pages and extending
+		 * EOF.
+		 */
+		if (wbc->sync_mode == WB_SYNC_ALL &&
+		    wbc->range_end == LLONG_MAX) {
+			end = i_size_read(mapping->host) >> PAGE_CACHE_SHIFT;
+		}
 	}
+
 retry:
 	done_index = index;
 	while (!done && (index <= end)) {

^ permalink raw reply related	[flat|nested] 54+ messages in thread

* [PATCH 5/4] writeback: limit write_cache_pages integrity scanning to current EOF
@ 2010-04-20  3:40   ` Dave Chinner
  0 siblings, 0 replies; 54+ messages in thread
From: Dave Chinner @ 2010-04-20  3:40 UTC (permalink / raw)
  To: linux-fsdevel; +Cc: linux-kernel, xfs


sync can currently take a really long time if a concurrent writer is
extending a file. The problem is that the dirty pages on the address
space grow in the same direction as write_cache_pages scans, so if
the writer keeps ahead of writeback, the writeback will not
terminate until the writer stops adding dirty pages.

For a data integrity sync, we only need to write the pages dirty at
the time we start the writeback, so we can stop scanning once we get
to the page that was at the end of the file at the time the scan
started.

This will prevent operations like copying a large file preventing
sync from completing as it will not write back pages that were
dirtied after the sync was started. This does not impact the
existing integrity guarantees, as any dirty page (old or new)
within the EOF range at the start of the scan will still be
captured.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
---
 mm/page-writeback.c |   15 +++++++++++++++
 1 files changed, 15 insertions(+), 0 deletions(-)

diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index e22af84..4ba2728 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -852,7 +852,22 @@ int write_cache_pages(struct address_space *mapping,
 		if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
 			range_whole = 1;
 		cycled = 1; /* ignore range_cyclic tests */
+
+		/*
+		 * If this is a data integrity sync, cap the writeback to the
+		 * current end of file. Any extension to the file that occurs
+		 * after this is a new write and we don't need to write those
+		 * pages out to fulfil our data integrity requirements. If we
+		 * try to write them out, we can get stuck in this scan until
+		 * the concurrent writer stops adding dirty pages and extending
+		 * EOF.
+		 */
+		if (wbc->sync_mode == WB_SYNC_ALL &&
+		    wbc->range_end == LLONG_MAX) {
+			end = i_size_read(mapping->host) >> PAGE_CACHE_SHIFT;
+		}
 	}
+
 retry:
 	done_index = index;
 	while (!done && (index <= end)) {

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply related	[flat|nested] 54+ messages in thread

* Re: [PATCH 0/4] writeback: tracing and wbc->nr_to_write fixes
  2010-04-20  2:41 ` Dave Chinner
@ 2010-04-20 12:02   ` Richard Kennedy
  -1 siblings, 0 replies; 54+ messages in thread
From: Richard Kennedy @ 2010-04-20 12:02 UTC (permalink / raw)
  To: Dave Chinner; +Cc: linux-fsdevel, linux-kernel, xfs

On 20/04/10 03:41, Dave Chinner wrote:
> This series contains the initial writeback tracing patches from
> Jens, as well as the extensions I added to provide visibility into
> writeback control structures as the are used by the writeback code.
> The visibility given is sufficient to understand what is happening
> in the writeback path - what path is writing data, what path is
> blocking on congestion, etc, and to determine the differences in
> behaviour for different sync modes and calling contexts. This
> tracing really needs to be integrated into mainline so that anyone
> can improve the tracing as they use it to track down problems
> in our convoluted writeback paths.
> 
> The remaining patches are fixes to problems that the new tracing
> highlighted.
> 

Hi Dave,

Thanks for adding tracing to this, it will be really useful.

The fix to write_cache_pages looks really interesting, I'm going to test
it on my machine. Maybe it should be a separate patch to get more
visibility?

Ext4 also multiplies nr_to_write, so will that need fixing too?

regards
Richard

^ permalink raw reply	[flat|nested] 54+ messages in thread

* Re: [PATCH 0/4] writeback: tracing and wbc->nr_to_write fixes
@ 2010-04-20 12:02   ` Richard Kennedy
  0 siblings, 0 replies; 54+ messages in thread
From: Richard Kennedy @ 2010-04-20 12:02 UTC (permalink / raw)
  To: Dave Chinner; +Cc: linux-fsdevel, linux-kernel, xfs

On 20/04/10 03:41, Dave Chinner wrote:
> This series contains the initial writeback tracing patches from
> Jens, as well as the extensions I added to provide visibility into
> writeback control structures as the are used by the writeback code.
> The visibility given is sufficient to understand what is happening
> in the writeback path - what path is writing data, what path is
> blocking on congestion, etc, and to determine the differences in
> behaviour for different sync modes and calling contexts. This
> tracing really needs to be integrated into mainline so that anyone
> can improve the tracing as they use it to track down problems
> in our convoluted writeback paths.
> 
> The remaining patches are fixes to problems that the new tracing
> highlighted.
> 

Hi Dave,

Thanks for adding tracing to this, it will be really useful.

The fix to write_cache_pages looks really interesting, I'm going to test
it on my machine. Maybe it should be a separate patch to get more
visibility?

Ext4 also multiplies nr_to_write, so will that need fixing too?

regards
Richard

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 54+ messages in thread

* Re: [PATCH 5/4] writeback: limit write_cache_pages integrity scanning to current EOF
  2010-04-20  3:40   ` Dave Chinner
@ 2010-04-20 23:28     ` Jamie Lokier
  -1 siblings, 0 replies; 54+ messages in thread
From: Jamie Lokier @ 2010-04-20 23:28 UTC (permalink / raw)
  To: Dave Chinner; +Cc: linux-fsdevel, linux-kernel, xfs

Dave Chinner wrote:
> sync can currently take a really long time if a concurrent writer is
> extending a file. The problem is that the dirty pages on the address
> space grow in the same direction as write_cache_pages scans, so if
> the writer keeps ahead of writeback, the writeback will not
> terminate until the writer stops adding dirty pages.
> 
> For a data integrity sync, we only need to write the pages dirty at
> the time we start the writeback, so we can stop scanning once we get
> to the page that was at the end of the file at the time the scan
> started.
> 
> This will prevent operations like copying a large file preventing
> sync from completing as it will not write back pages that were
> dirtied after the sync was started. This does not impact the
> existing integrity guarantees, as any dirty page (old or new)
> within the EOF range at the start of the scan will still be
> captured.

I guess it can still get stuck if someone does ftruncate() first, then
writes to the hole?

-- Jamie

^ permalink raw reply	[flat|nested] 54+ messages in thread

* Re: [PATCH 5/4] writeback: limit write_cache_pages integrity scanning to current EOF
@ 2010-04-20 23:28     ` Jamie Lokier
  0 siblings, 0 replies; 54+ messages in thread
From: Jamie Lokier @ 2010-04-20 23:28 UTC (permalink / raw)
  To: Dave Chinner; +Cc: linux-fsdevel, linux-kernel, xfs

Dave Chinner wrote:
> sync can currently take a really long time if a concurrent writer is
> extending a file. The problem is that the dirty pages on the address
> space grow in the same direction as write_cache_pages scans, so if
> the writer keeps ahead of writeback, the writeback will not
> terminate until the writer stops adding dirty pages.
> 
> For a data integrity sync, we only need to write the pages dirty at
> the time we start the writeback, so we can stop scanning once we get
> to the page that was at the end of the file at the time the scan
> started.
> 
> This will prevent operations like copying a large file preventing
> sync from completing as it will not write back pages that were
> dirtied after the sync was started. This does not impact the
> existing integrity guarantees, as any dirty page (old or new)
> within the EOF range at the start of the scan will still be
> captured.

I guess it can still get stuck if someone does ftruncate() first, then
writes to the hole?

-- Jamie

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 54+ messages in thread

* Re: [PATCH 0/4] writeback: tracing and wbc->nr_to_write fixes
  2010-04-20 12:02   ` Richard Kennedy
@ 2010-04-20 23:29     ` Dave Chinner
  -1 siblings, 0 replies; 54+ messages in thread
From: Dave Chinner @ 2010-04-20 23:29 UTC (permalink / raw)
  To: Richard Kennedy; +Cc: linux-fsdevel, linux-kernel, xfs

On Tue, Apr 20, 2010 at 01:02:16PM +0100, Richard Kennedy wrote:
> On 20/04/10 03:41, Dave Chinner wrote:
> > This series contains the initial writeback tracing patches from
> > Jens, as well as the extensions I added to provide visibility into
> > writeback control structures as the are used by the writeback code.
> > The visibility given is sufficient to understand what is happening
> > in the writeback path - what path is writing data, what path is
> > blocking on congestion, etc, and to determine the differences in
> > behaviour for different sync modes and calling contexts. This
> > tracing really needs to be integrated into mainline so that anyone
> > can improve the tracing as they use it to track down problems
> > in our convoluted writeback paths.
> > 
> > The remaining patches are fixes to problems that the new tracing
> > highlighted.
> 
> Hi Dave,
> 
> Thanks for adding tracing to this, it will be really useful.
> 
> The fix to write_cache_pages looks really interesting, I'm going to test
> it on my machine. Maybe it should be a separate patch to get more
> visibility?

I don't see a big need to separate the series at this point. Once
there's been a review and testing we can decide how to push them
into mainline. IMO, the tracing is just as important as the bug
fixes....

> Ext4 also multiplies nr_to_write, so will that need fixing too?

No idea. I don't claim to understand ext4's convoluted delayed
allocation path and all it's constraints, so I guess you'd need to
ask the ext4 developers about that one. After all, with the tracing
they'd be able to see if there is a problem. ;)

Cheers,

Dave.
-- 
Dave Chinner
david@fromorbit.com

^ permalink raw reply	[flat|nested] 54+ messages in thread

* Re: [PATCH 0/4] writeback: tracing and wbc->nr_to_write fixes
@ 2010-04-20 23:29     ` Dave Chinner
  0 siblings, 0 replies; 54+ messages in thread
From: Dave Chinner @ 2010-04-20 23:29 UTC (permalink / raw)
  To: Richard Kennedy; +Cc: linux-fsdevel, linux-kernel, xfs

On Tue, Apr 20, 2010 at 01:02:16PM +0100, Richard Kennedy wrote:
> On 20/04/10 03:41, Dave Chinner wrote:
> > This series contains the initial writeback tracing patches from
> > Jens, as well as the extensions I added to provide visibility into
> > writeback control structures as the are used by the writeback code.
> > The visibility given is sufficient to understand what is happening
> > in the writeback path - what path is writing data, what path is
> > blocking on congestion, etc, and to determine the differences in
> > behaviour for different sync modes and calling contexts. This
> > tracing really needs to be integrated into mainline so that anyone
> > can improve the tracing as they use it to track down problems
> > in our convoluted writeback paths.
> > 
> > The remaining patches are fixes to problems that the new tracing
> > highlighted.
> 
> Hi Dave,
> 
> Thanks for adding tracing to this, it will be really useful.
> 
> The fix to write_cache_pages looks really interesting, I'm going to test
> it on my machine. Maybe it should be a separate patch to get more
> visibility?

I don't see a big need to separate the series at this point. Once
there's been a review and testing we can decide how to push them
into mainline. IMO, the tracing is just as important as the bug
fixes....

> Ext4 also multiplies nr_to_write, so will that need fixing too?

No idea. I don't claim to understand ext4's convoluted delayed
allocation path and all it's constraints, so I guess you'd need to
ask the ext4 developers about that one. After all, with the tracing
they'd be able to see if there is a problem. ;)

Cheers,

Dave.
-- 
Dave Chinner
david@fromorbit.com

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 54+ messages in thread

* Re: [PATCH 5/4] writeback: limit write_cache_pages integrity scanning to current EOF
  2010-04-20 23:28     ` Jamie Lokier
@ 2010-04-20 23:31       ` Dave Chinner
  -1 siblings, 0 replies; 54+ messages in thread
From: Dave Chinner @ 2010-04-20 23:31 UTC (permalink / raw)
  To: Jamie Lokier; +Cc: linux-fsdevel, linux-kernel, xfs

On Wed, Apr 21, 2010 at 12:28:19AM +0100, Jamie Lokier wrote:
> Dave Chinner wrote:
> > sync can currently take a really long time if a concurrent writer is
> > extending a file. The problem is that the dirty pages on the address
> > space grow in the same direction as write_cache_pages scans, so if
> > the writer keeps ahead of writeback, the writeback will not
> > terminate until the writer stops adding dirty pages.
> > 
> > For a data integrity sync, we only need to write the pages dirty at
> > the time we start the writeback, so we can stop scanning once we get
> > to the page that was at the end of the file at the time the scan
> > started.
> > 
> > This will prevent operations like copying a large file preventing
> > sync from completing as it will not write back pages that were
> > dirtied after the sync was started. This does not impact the
> > existing integrity guarantees, as any dirty page (old or new)
> > within the EOF range at the start of the scan will still be
> > captured.
> 
> I guess it can still get stuck if someone does ftruncate() first, then
> writes to the hole?

Yes, it would. It only deals with extending files because fixing the
problem w.r.t.  writes into holes requires something much more
invasive like Jan's radix tree mark-and-sweep algorithm....

Cheers,

Dave.
-- 
Dave Chinner
david@fromorbit.com

^ permalink raw reply	[flat|nested] 54+ messages in thread

* Re: [PATCH 5/4] writeback: limit write_cache_pages integrity scanning to current EOF
@ 2010-04-20 23:31       ` Dave Chinner
  0 siblings, 0 replies; 54+ messages in thread
From: Dave Chinner @ 2010-04-20 23:31 UTC (permalink / raw)
  To: Jamie Lokier; +Cc: linux-fsdevel, linux-kernel, xfs

On Wed, Apr 21, 2010 at 12:28:19AM +0100, Jamie Lokier wrote:
> Dave Chinner wrote:
> > sync can currently take a really long time if a concurrent writer is
> > extending a file. The problem is that the dirty pages on the address
> > space grow in the same direction as write_cache_pages scans, so if
> > the writer keeps ahead of writeback, the writeback will not
> > terminate until the writer stops adding dirty pages.
> > 
> > For a data integrity sync, we only need to write the pages dirty at
> > the time we start the writeback, so we can stop scanning once we get
> > to the page that was at the end of the file at the time the scan
> > started.
> > 
> > This will prevent operations like copying a large file preventing
> > sync from completing as it will not write back pages that were
> > dirtied after the sync was started. This does not impact the
> > existing integrity guarantees, as any dirty page (old or new)
> > within the EOF range at the start of the scan will still be
> > captured.
> 
> I guess it can still get stuck if someone does ftruncate() first, then
> writes to the hole?

Yes, it would. It only deals with extending files because fixing the
problem w.r.t.  writes into holes requires something much more
invasive like Jan's radix tree mark-and-sweep algorithm....

Cheers,

Dave.
-- 
Dave Chinner
david@fromorbit.com

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 54+ messages in thread

* Re: [PATCH 3/4] writeback: pay attention to wbc->nr_to_write in write_cache_pages
  2010-04-20  2:41   ` Dave Chinner
@ 2010-04-22 19:07     ` Jan Kara
  -1 siblings, 0 replies; 54+ messages in thread
From: Jan Kara @ 2010-04-22 19:07 UTC (permalink / raw)
  To: Dave Chinner; +Cc: linux-fsdevel, linux-kernel, xfs

On Tue 20-04-10 12:41:53, Dave Chinner wrote:
> From: Dave Chinner <dchinner@redhat.com>
> 
> If a filesystem writes more than one page in ->writepage, write_cache_pages
> fails to notice this and continues to attempt writeback when wbc->nr_to_write
> has gone negative - this trace was captured from XFS:
> 
> 
>     wbc_writeback_start: towrt=1024
>     wbc_writepage: towrt=1024
>     wbc_writepage: towrt=0
>     wbc_writepage: towrt=-1
>     wbc_writepage: towrt=-5
>     wbc_writepage: towrt=-21
>     wbc_writepage: towrt=-85
> 
> This has adverse effects on filesystem writeback behaviour. write_cache_pages()
> needs to terminate after a certain number of pages are written, not after a
> certain number of calls to ->writepage are made. Make it observe the current
> value of wbc->nr_to_write and treat a value of <= 0 as though it is either a
> termination condition or a trigger to reset to MAX_WRITEBACK_PAGES for data
> integrity syncs.
> 
> Signed-off-by: Dave Chinner <dchinner@redhat.com>
> ---
>  fs/fs-writeback.c                |    9 ---------
>  include/linux/writeback.h        |    9 +++++++++
>  include/trace/events/writeback.h |    1 +
>  mm/page-writeback.c              |   20 +++++++++++++++++++-
>  4 files changed, 29 insertions(+), 10 deletions(-)
> 
  <snip>

> diff --git a/mm/page-writeback.c b/mm/page-writeback.c
> index d45f59e..e22af84 100644
> --- a/mm/page-writeback.c
> +++ b/mm/page-writeback.c
> @@ -917,6 +917,7 @@ continue_unlock:
>  			if (!clear_page_dirty_for_io(page))
>  				goto continue_unlock;
>  
> +			trace_wbc_writepage(wbc);
>  			ret = (*writepage)(page, wbc, data);
>  			if (unlikely(ret)) {
>  				if (ret == AOP_WRITEPAGE_ACTIVATE) {
> @@ -935,7 +936,7 @@ continue_unlock:
>  					done = 1;
>  					break;
>  				}
> - 			}
> +			}
>  
>  			if (nr_to_write > 0) {
>  				nr_to_write--;
> @@ -955,6 +956,23 @@ continue_unlock:
>  					break;
>  				}
>  			}
> +
> +			/*
> +			 * Some filesystems will write multiple pages in
> +			 * ->writepage, so wbc->nr_to_write can change much,
> +			 * much faster than nr_to_write. Check this as an exit
> +			 * condition, or if we are doing a data integrity sync,
> +			 * reset the wbc to MAX_WRITEBACK_PAGES so that such
> +			 * filesystems can do optimal writeout here.
> +			 */
> +			if (wbc->nr_to_write <= 0) {
> +				if (wbc->sync_mode == WB_SYNC_NONE) {
> +					done = 1;
> +					nr_to_write = 0;
> +					break;
> +				}
> +				wbc->nr_to_write = MAX_WRITEBACK_PAGES;
> +			}
  Honestly, this is an ugly hack. I'd rather work towards ignoring
nr_to_write completely in WB_SYNC_ALL mode since it doesn't really make
any sense to say "write me *safely* 5 pages".

								Honza
-- 
Jan Kara <jack@suse.cz>
SUSE Labs, CR

^ permalink raw reply	[flat|nested] 54+ messages in thread

* Re: [PATCH 3/4] writeback: pay attention to wbc->nr_to_write in write_cache_pages
@ 2010-04-22 19:07     ` Jan Kara
  0 siblings, 0 replies; 54+ messages in thread
From: Jan Kara @ 2010-04-22 19:07 UTC (permalink / raw)
  To: Dave Chinner; +Cc: linux-fsdevel, linux-kernel, xfs

On Tue 20-04-10 12:41:53, Dave Chinner wrote:
> From: Dave Chinner <dchinner@redhat.com>
> 
> If a filesystem writes more than one page in ->writepage, write_cache_pages
> fails to notice this and continues to attempt writeback when wbc->nr_to_write
> has gone negative - this trace was captured from XFS:
> 
> 
>     wbc_writeback_start: towrt=1024
>     wbc_writepage: towrt=1024
>     wbc_writepage: towrt=0
>     wbc_writepage: towrt=-1
>     wbc_writepage: towrt=-5
>     wbc_writepage: towrt=-21
>     wbc_writepage: towrt=-85
> 
> This has adverse effects on filesystem writeback behaviour. write_cache_pages()
> needs to terminate after a certain number of pages are written, not after a
> certain number of calls to ->writepage are made. Make it observe the current
> value of wbc->nr_to_write and treat a value of <= 0 as though it is either a
> termination condition or a trigger to reset to MAX_WRITEBACK_PAGES for data
> integrity syncs.
> 
> Signed-off-by: Dave Chinner <dchinner@redhat.com>
> ---
>  fs/fs-writeback.c                |    9 ---------
>  include/linux/writeback.h        |    9 +++++++++
>  include/trace/events/writeback.h |    1 +
>  mm/page-writeback.c              |   20 +++++++++++++++++++-
>  4 files changed, 29 insertions(+), 10 deletions(-)
> 
  <snip>

> diff --git a/mm/page-writeback.c b/mm/page-writeback.c
> index d45f59e..e22af84 100644
> --- a/mm/page-writeback.c
> +++ b/mm/page-writeback.c
> @@ -917,6 +917,7 @@ continue_unlock:
>  			if (!clear_page_dirty_for_io(page))
>  				goto continue_unlock;
>  
> +			trace_wbc_writepage(wbc);
>  			ret = (*writepage)(page, wbc, data);
>  			if (unlikely(ret)) {
>  				if (ret == AOP_WRITEPAGE_ACTIVATE) {
> @@ -935,7 +936,7 @@ continue_unlock:
>  					done = 1;
>  					break;
>  				}
> - 			}
> +			}
>  
>  			if (nr_to_write > 0) {
>  				nr_to_write--;
> @@ -955,6 +956,23 @@ continue_unlock:
>  					break;
>  				}
>  			}
> +
> +			/*
> +			 * Some filesystems will write multiple pages in
> +			 * ->writepage, so wbc->nr_to_write can change much,
> +			 * much faster than nr_to_write. Check this as an exit
> +			 * condition, or if we are doing a data integrity sync,
> +			 * reset the wbc to MAX_WRITEBACK_PAGES so that such
> +			 * filesystems can do optimal writeout here.
> +			 */
> +			if (wbc->nr_to_write <= 0) {
> +				if (wbc->sync_mode == WB_SYNC_NONE) {
> +					done = 1;
> +					nr_to_write = 0;
> +					break;
> +				}
> +				wbc->nr_to_write = MAX_WRITEBACK_PAGES;
> +			}
  Honestly, this is an ugly hack. I'd rather work towards ignoring
nr_to_write completely in WB_SYNC_ALL mode since it doesn't really make
any sense to say "write me *safely* 5 pages".

								Honza
-- 
Jan Kara <jack@suse.cz>
SUSE Labs, CR

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 54+ messages in thread

* Re: [PATCH 4/4] xfs: remove nr_to_write writeback windup.
  2010-04-20  2:41   ` Dave Chinner
@ 2010-04-22 19:09     ` Jan Kara
  -1 siblings, 0 replies; 54+ messages in thread
From: Jan Kara @ 2010-04-22 19:09 UTC (permalink / raw)
  To: Dave Chinner; +Cc: linux-fsdevel, linux-kernel, xfs

On Tue 20-04-10 12:41:54, Dave Chinner wrote:
> From: Dave Chinner <dchinner@redhat.com>
> 
> Now that the background flush code has been fixed, we shouldn't need to
> silently multiply the wbc->nr_to_write to get good writeback. Remove
> that code.
> 
> Signed-off-by: Dave Chinner <dchinner@redhat.com>
> ---
>  fs/xfs/linux-2.6/xfs_aops.c |    8 --------
>  1 files changed, 0 insertions(+), 8 deletions(-)
> 
> diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c
> index 9962850..2b2225d 100644
> --- a/fs/xfs/linux-2.6/xfs_aops.c
> +++ b/fs/xfs/linux-2.6/xfs_aops.c
> @@ -1336,14 +1336,6 @@ xfs_vm_writepage(
>  	if (!page_has_buffers(page))
>  		create_empty_buffers(page, 1 << inode->i_blkbits, 0);
>  
> -
> -	/*
> -	 *  VM calculation for nr_to_write seems off.  Bump it way
> -	 *  up, this gets simple streaming writes zippy again.
> -	 *  To be reviewed again after Jens' writeback changes.
> -	 */
> -	wbc->nr_to_write *= 4;
> -
  Hum, are you sure about this? I thought it's there because VM passes at
most 1024 pages to write from background writeback and you wanted to write
more in one go (at least ext4 wants to do this).

								Honza
-- 
Jan Kara <jack@suse.cz>
SUSE Labs, CR

^ permalink raw reply	[flat|nested] 54+ messages in thread

* Re: [PATCH 4/4] xfs: remove nr_to_write writeback windup.
@ 2010-04-22 19:09     ` Jan Kara
  0 siblings, 0 replies; 54+ messages in thread
From: Jan Kara @ 2010-04-22 19:09 UTC (permalink / raw)
  To: Dave Chinner; +Cc: linux-fsdevel, linux-kernel, xfs

On Tue 20-04-10 12:41:54, Dave Chinner wrote:
> From: Dave Chinner <dchinner@redhat.com>
> 
> Now that the background flush code has been fixed, we shouldn't need to
> silently multiply the wbc->nr_to_write to get good writeback. Remove
> that code.
> 
> Signed-off-by: Dave Chinner <dchinner@redhat.com>
> ---
>  fs/xfs/linux-2.6/xfs_aops.c |    8 --------
>  1 files changed, 0 insertions(+), 8 deletions(-)
> 
> diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c
> index 9962850..2b2225d 100644
> --- a/fs/xfs/linux-2.6/xfs_aops.c
> +++ b/fs/xfs/linux-2.6/xfs_aops.c
> @@ -1336,14 +1336,6 @@ xfs_vm_writepage(
>  	if (!page_has_buffers(page))
>  		create_empty_buffers(page, 1 << inode->i_blkbits, 0);
>  
> -
> -	/*
> -	 *  VM calculation for nr_to_write seems off.  Bump it way
> -	 *  up, this gets simple streaming writes zippy again.
> -	 *  To be reviewed again after Jens' writeback changes.
> -	 */
> -	wbc->nr_to_write *= 4;
> -
  Hum, are you sure about this? I thought it's there because VM passes at
most 1024 pages to write from background writeback and you wanted to write
more in one go (at least ext4 wants to do this).

								Honza
-- 
Jan Kara <jack@suse.cz>
SUSE Labs, CR

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 54+ messages in thread

* Re: [PATCH 5/4] writeback: limit write_cache_pages integrity scanning to current EOF
  2010-04-20  3:40   ` Dave Chinner
@ 2010-04-22 19:13     ` Jan Kara
  -1 siblings, 0 replies; 54+ messages in thread
From: Jan Kara @ 2010-04-22 19:13 UTC (permalink / raw)
  To: Dave Chinner; +Cc: linux-fsdevel, linux-kernel, xfs

On Tue 20-04-10 13:40:05, Dave Chinner wrote:
> 
> sync can currently take a really long time if a concurrent writer is
> extending a file. The problem is that the dirty pages on the address
> space grow in the same direction as write_cache_pages scans, so if
> the writer keeps ahead of writeback, the writeback will not
> terminate until the writer stops adding dirty pages.
> 
> For a data integrity sync, we only need to write the pages dirty at
> the time we start the writeback, so we can stop scanning once we get
> to the page that was at the end of the file at the time the scan
> started.
> 
> This will prevent operations like copying a large file preventing
> sync from completing as it will not write back pages that were
> dirtied after the sync was started. This does not impact the
> existing integrity guarantees, as any dirty page (old or new)
> within the EOF range at the start of the scan will still be
> captured.
  Looks good.

Acked-by: Jan Kara <jack@suse.cz>

> Signed-off-by: Dave Chinner <dchinner@redhat.com>
> ---
>  mm/page-writeback.c |   15 +++++++++++++++
>  1 files changed, 15 insertions(+), 0 deletions(-)
> 
> diff --git a/mm/page-writeback.c b/mm/page-writeback.c
> index e22af84..4ba2728 100644
> --- a/mm/page-writeback.c
> +++ b/mm/page-writeback.c
> @@ -852,7 +852,22 @@ int write_cache_pages(struct address_space *mapping,
>  		if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
>  			range_whole = 1;
>  		cycled = 1; /* ignore range_cyclic tests */
> +
> +		/*
> +		 * If this is a data integrity sync, cap the writeback to the
> +		 * current end of file. Any extension to the file that occurs
> +		 * after this is a new write and we don't need to write those
> +		 * pages out to fulfil our data integrity requirements. If we
> +		 * try to write them out, we can get stuck in this scan until
> +		 * the concurrent writer stops adding dirty pages and extending
> +		 * EOF.
> +		 */
> +		if (wbc->sync_mode == WB_SYNC_ALL &&
> +		    wbc->range_end == LLONG_MAX) {
> +			end = i_size_read(mapping->host) >> PAGE_CACHE_SHIFT;
> +		}
>  	}
> +
>  retry:
>  	done_index = index;
>  	while (!done && (index <= end)) {
> --
> To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
-- 
Jan Kara <jack@suse.cz>
SUSE Labs, CR

^ permalink raw reply	[flat|nested] 54+ messages in thread

* Re: [PATCH 5/4] writeback: limit write_cache_pages integrity scanning to current EOF
@ 2010-04-22 19:13     ` Jan Kara
  0 siblings, 0 replies; 54+ messages in thread
From: Jan Kara @ 2010-04-22 19:13 UTC (permalink / raw)
  To: Dave Chinner; +Cc: linux-fsdevel, linux-kernel, xfs

On Tue 20-04-10 13:40:05, Dave Chinner wrote:
> 
> sync can currently take a really long time if a concurrent writer is
> extending a file. The problem is that the dirty pages on the address
> space grow in the same direction as write_cache_pages scans, so if
> the writer keeps ahead of writeback, the writeback will not
> terminate until the writer stops adding dirty pages.
> 
> For a data integrity sync, we only need to write the pages dirty at
> the time we start the writeback, so we can stop scanning once we get
> to the page that was at the end of the file at the time the scan
> started.
> 
> This will prevent operations like copying a large file preventing
> sync from completing as it will not write back pages that were
> dirtied after the sync was started. This does not impact the
> existing integrity guarantees, as any dirty page (old or new)
> within the EOF range at the start of the scan will still be
> captured.
  Looks good.

Acked-by: Jan Kara <jack@suse.cz>

> Signed-off-by: Dave Chinner <dchinner@redhat.com>
> ---
>  mm/page-writeback.c |   15 +++++++++++++++
>  1 files changed, 15 insertions(+), 0 deletions(-)
> 
> diff --git a/mm/page-writeback.c b/mm/page-writeback.c
> index e22af84..4ba2728 100644
> --- a/mm/page-writeback.c
> +++ b/mm/page-writeback.c
> @@ -852,7 +852,22 @@ int write_cache_pages(struct address_space *mapping,
>  		if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
>  			range_whole = 1;
>  		cycled = 1; /* ignore range_cyclic tests */
> +
> +		/*
> +		 * If this is a data integrity sync, cap the writeback to the
> +		 * current end of file. Any extension to the file that occurs
> +		 * after this is a new write and we don't need to write those
> +		 * pages out to fulfil our data integrity requirements. If we
> +		 * try to write them out, we can get stuck in this scan until
> +		 * the concurrent writer stops adding dirty pages and extending
> +		 * EOF.
> +		 */
> +		if (wbc->sync_mode == WB_SYNC_ALL &&
> +		    wbc->range_end == LLONG_MAX) {
> +			end = i_size_read(mapping->host) >> PAGE_CACHE_SHIFT;
> +		}
>  	}
> +
>  retry:
>  	done_index = index;
>  	while (!done && (index <= end)) {
> --
> To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
-- 
Jan Kara <jack@suse.cz>
SUSE Labs, CR

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 54+ messages in thread

* Re: [PATCH 3/4] writeback: pay attention to wbc->nr_to_write in write_cache_pages
  2010-04-20  2:41   ` Dave Chinner
  (?)
@ 2010-04-25  3:33     ` tytso
  -1 siblings, 0 replies; 54+ messages in thread
From: tytso @ 2010-04-25  3:33 UTC (permalink / raw)
  To: Dave Chinner; +Cc: linux-fsdevel, linux-kernel, xfs

On Tue, Apr 20, 2010 at 12:41:53PM +1000, Dave Chinner wrote:
> From: Dave Chinner <dchinner@redhat.com>
> 
> If a filesystem writes more than one page in ->writepage, write_cache_pages
> fails to notice this and continues to attempt writeback when wbc->nr_to_write
> has gone negative - this trace was captured from XFS:
> 
> 
>     wbc_writeback_start: towrt=1024
>     wbc_writepage: towrt=1024
>     wbc_writepage: towrt=0
>     wbc_writepage: towrt=-1
>     wbc_writepage: towrt=-5
>     wbc_writepage: towrt=-21
>     wbc_writepage: towrt=-85
> 
> This has adverse effects on filesystem writeback behaviour. write_cache_pages()
> needs to terminate after a certain number of pages are written, not after a
> certain number of calls to ->writepage are made. Make it observe the current
> value of wbc->nr_to_write and treat a value of <= 0 as though it is either a
> termination condition or a trigger to reset to MAX_WRITEBACK_PAGES for data
> integrity syncs.

Be careful here.  If you are going to write more pages than what the
writeback code has requested (the stupid no more than 1024 pages
restriction in the writeback code before it jumps to start writing
some other inode), you actually need to let the returned
wbc->nr_to_write go negative, so that wb_writeback() knows how many
pages it has written.

In other words, the writeback code assumes that 

  <original value of nr_to_write> - <returned wbc->nr_to_write>

is

  <number of pages actually written>

If you don't let wbc->nr_to_write go negative, the writeback code will
be confused about how many pages were _actually_ written, and the
writeback code ends up writing too much.  See commit 2faf2e1.

All of this is a crock of course.  The file system shouldn't be
second-guessing the writeback code.  Instead the writeback code should
be adaptively measuring how long it takes to write out N pages
to a particular block device, and then decide what's the appropriate
setting for nr_to_write.  What makes sense for a USB stick, or a 4200
RPM laptop drive, may not make sense for a massive RAID array....

But since we don't have that, both XFS and ext4 have workarounds for
brain-damaged writeback behaviour.  (I did some testing, and even for
standard laptop drives the cap of 1024 pages is just Way Too Small;
that limit was set something like a decade ago, and everyone has been
afraid to change it, even though disks have gotten a wee bit faster
since those days.)

    	   	      	       	     	 - Ted

^ permalink raw reply	[flat|nested] 54+ messages in thread

* Re: [PATCH 3/4] writeback: pay attention to wbc->nr_to_write in write_cache_pages
@ 2010-04-25  3:33     ` tytso
  0 siblings, 0 replies; 54+ messages in thread
From: tytso @ 2010-04-25  3:33 UTC (permalink / raw)
  To: Dave Chinner; +Cc: linux-fsdevel, linux-kernel, xfs

On Tue, Apr 20, 2010 at 12:41:53PM +1000, Dave Chinner wrote:
> From: Dave Chinner <dchinner@redhat.com>
> 
> If a filesystem writes more than one page in ->writepage, write_cache_pages
> fails to notice this and continues to attempt writeback when wbc->nr_to_write
> has gone negative - this trace was captured from XFS:
> 
> 
>     wbc_writeback_start: towrt=1024
>     wbc_writepage: towrt=1024
>     wbc_writepage: towrt=0
>     wbc_writepage: towrt=-1
>     wbc_writepage: towrt=-5
>     wbc_writepage: towrt=-21
>     wbc_writepage: towrt=-85
> 
> This has adverse effects on filesystem writeback behaviour. write_cache_pages()
> needs to terminate after a certain number of pages are written, not after a
> certain number of calls to ->writepage are made. Make it observe the current
> value of wbc->nr_to_write and treat a value of <= 0 as though it is either a
> termination condition or a trigger to reset to MAX_WRITEBACK_PAGES for data
> integrity syncs.

Be careful here.  If you are going to write more pages than what the
writeback code has requested (the stupid no more than 1024 pages
restriction in the writeback code before it jumps to start writing
some other inode), you actually need to let the returned
wbc->nr_to_write go negative, so that wb_writeback() knows how many
pages it has written.

In other words, the writeback code assumes that 

  <original value of nr_to_write> - <returned wbc->nr_to_write>

is

  <number of pages actually written>

If you don't let wbc->nr_to_write go negative, the writeback code will
be confused about how many pages were _actually_ written, and the
writeback code ends up writing too much.  See commit 2faf2e1.

All of this is a crock of course.  The file system shouldn't be
second-guessing the writeback code.  Instead the writeback code should
be adaptively measuring how long it takes to write out N pages
to a particular block device, and then decide what's the appropriate
setting for nr_to_write.  What makes sense for a USB stick, or a 4200
RPM laptop drive, may not make sense for a massive RAID array....

But since we don't have that, both XFS and ext4 have workarounds for
brain-damaged writeback behaviour.  (I did some testing, and even for
standard laptop drives the cap of 1024 pages is just Way Too Small;
that limit was set something like a decade ago, and everyone has been
afraid to change it, even though disks have gotten a wee bit faster
since those days.)

    	   	      	       	     	 - Ted
--
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 54+ messages in thread

* Re: [PATCH 3/4] writeback: pay attention to wbc->nr_to_write in write_cache_pages
@ 2010-04-25  3:33     ` tytso
  0 siblings, 0 replies; 54+ messages in thread
From: tytso @ 2010-04-25  3:33 UTC (permalink / raw)
  To: Dave Chinner; +Cc: linux-fsdevel, linux-kernel, xfs

On Tue, Apr 20, 2010 at 12:41:53PM +1000, Dave Chinner wrote:
> From: Dave Chinner <dchinner@redhat.com>
> 
> If a filesystem writes more than one page in ->writepage, write_cache_pages
> fails to notice this and continues to attempt writeback when wbc->nr_to_write
> has gone negative - this trace was captured from XFS:
> 
> 
>     wbc_writeback_start: towrt=1024
>     wbc_writepage: towrt=1024
>     wbc_writepage: towrt=0
>     wbc_writepage: towrt=-1
>     wbc_writepage: towrt=-5
>     wbc_writepage: towrt=-21
>     wbc_writepage: towrt=-85
> 
> This has adverse effects on filesystem writeback behaviour. write_cache_pages()
> needs to terminate after a certain number of pages are written, not after a
> certain number of calls to ->writepage are made. Make it observe the current
> value of wbc->nr_to_write and treat a value of <= 0 as though it is either a
> termination condition or a trigger to reset to MAX_WRITEBACK_PAGES for data
> integrity syncs.

Be careful here.  If you are going to write more pages than what the
writeback code has requested (the stupid no more than 1024 pages
restriction in the writeback code before it jumps to start writing
some other inode), you actually need to let the returned
wbc->nr_to_write go negative, so that wb_writeback() knows how many
pages it has written.

In other words, the writeback code assumes that 

  <original value of nr_to_write> - <returned wbc->nr_to_write>

is

  <number of pages actually written>

If you don't let wbc->nr_to_write go negative, the writeback code will
be confused about how many pages were _actually_ written, and the
writeback code ends up writing too much.  See commit 2faf2e1.

All of this is a crock of course.  The file system shouldn't be
second-guessing the writeback code.  Instead the writeback code should
be adaptively measuring how long it takes to write out N pages
to a particular block device, and then decide what's the appropriate
setting for nr_to_write.  What makes sense for a USB stick, or a 4200
RPM laptop drive, may not make sense for a massive RAID array....

But since we don't have that, both XFS and ext4 have workarounds for
brain-damaged writeback behaviour.  (I did some testing, and even for
standard laptop drives the cap of 1024 pages is just Way Too Small;
that limit was set something like a decade ago, and everyone has been
afraid to change it, even though disks have gotten a wee bit faster
since those days.)

    	   	      	       	     	 - Ted

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 54+ messages in thread

* Re: [PATCH 4/4] xfs: remove nr_to_write writeback windup.
  2010-04-22 19:09     ` Jan Kara
@ 2010-04-26  0:46       ` Dave Chinner
  -1 siblings, 0 replies; 54+ messages in thread
From: Dave Chinner @ 2010-04-26  0:46 UTC (permalink / raw)
  To: Jan Kara; +Cc: linux-fsdevel, linux-kernel, xfs

On Thu, Apr 22, 2010 at 09:09:37PM +0200, Jan Kara wrote:
> On Tue 20-04-10 12:41:54, Dave Chinner wrote:
> > From: Dave Chinner <dchinner@redhat.com>
> > 
> > Now that the background flush code has been fixed, we shouldn't need to
> > silently multiply the wbc->nr_to_write to get good writeback. Remove
> > that code.
> > 
> > Signed-off-by: Dave Chinner <dchinner@redhat.com>
> > ---
> >  fs/xfs/linux-2.6/xfs_aops.c |    8 --------
> >  1 files changed, 0 insertions(+), 8 deletions(-)
> > 
> > diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c
> > index 9962850..2b2225d 100644
> > --- a/fs/xfs/linux-2.6/xfs_aops.c
> > +++ b/fs/xfs/linux-2.6/xfs_aops.c
> > @@ -1336,14 +1336,6 @@ xfs_vm_writepage(
> >  	if (!page_has_buffers(page))
> >  		create_empty_buffers(page, 1 << inode->i_blkbits, 0);
> >  
> > -
> > -	/*
> > -	 *  VM calculation for nr_to_write seems off.  Bump it way
> > -	 *  up, this gets simple streaming writes zippy again.
> > -	 *  To be reviewed again after Jens' writeback changes.
> > -	 */
> > -	wbc->nr_to_write *= 4;
> > -
>   Hum, are you sure about this? I thought it's there because VM passes at
> most 1024 pages to write from background writeback and you wanted to write
> more in one go (at least ext4 wants to do this).

About 500MB/s sure. ;)

Seriously though, the problem that led to us adding this
multiplication was that writeback was not feeding XFS 1024 pages at
a time - we were getting much less than this (somewhere in the order
of 32-64 pages at a time). With the fixes I posted, in every
circumstance I can see we are the correct number of pages (1024
pages or what is left over from the last inode) being passed into
->writepages, and writeback is back to full speed without needing
this crutch. Indeed, this multiplication now causes  nr_to_write to
go ballistic in some cirumstances, and that causes latency and
fairness problems that will significantly reduce write rates for
applications like NFS servers.

Realistically, XFS doesn't need to write more than 1024 pages in one
go - the reason ext4 needs to do this is it's amazingly convoluted
delayed allocation path and the fact that it's allocator is nowhere
near as good at contiguous allocation across multiple invocations as
the XFS allocator is. IOWs, XFS really just needs enough contiguous
pages to be able to form large IOs, and given that most hardware
limits the IO size to 1MB on x86_64, then 1024 pages is more than
enough to provide this.

Cheers,

Dave.
-- 
Dave Chinner
david@fromorbit.com

^ permalink raw reply	[flat|nested] 54+ messages in thread

* Re: [PATCH 4/4] xfs: remove nr_to_write writeback windup.
@ 2010-04-26  0:46       ` Dave Chinner
  0 siblings, 0 replies; 54+ messages in thread
From: Dave Chinner @ 2010-04-26  0:46 UTC (permalink / raw)
  To: Jan Kara; +Cc: linux-fsdevel, linux-kernel, xfs

On Thu, Apr 22, 2010 at 09:09:37PM +0200, Jan Kara wrote:
> On Tue 20-04-10 12:41:54, Dave Chinner wrote:
> > From: Dave Chinner <dchinner@redhat.com>
> > 
> > Now that the background flush code has been fixed, we shouldn't need to
> > silently multiply the wbc->nr_to_write to get good writeback. Remove
> > that code.
> > 
> > Signed-off-by: Dave Chinner <dchinner@redhat.com>
> > ---
> >  fs/xfs/linux-2.6/xfs_aops.c |    8 --------
> >  1 files changed, 0 insertions(+), 8 deletions(-)
> > 
> > diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c
> > index 9962850..2b2225d 100644
> > --- a/fs/xfs/linux-2.6/xfs_aops.c
> > +++ b/fs/xfs/linux-2.6/xfs_aops.c
> > @@ -1336,14 +1336,6 @@ xfs_vm_writepage(
> >  	if (!page_has_buffers(page))
> >  		create_empty_buffers(page, 1 << inode->i_blkbits, 0);
> >  
> > -
> > -	/*
> > -	 *  VM calculation for nr_to_write seems off.  Bump it way
> > -	 *  up, this gets simple streaming writes zippy again.
> > -	 *  To be reviewed again after Jens' writeback changes.
> > -	 */
> > -	wbc->nr_to_write *= 4;
> > -
>   Hum, are you sure about this? I thought it's there because VM passes at
> most 1024 pages to write from background writeback and you wanted to write
> more in one go (at least ext4 wants to do this).

About 500MB/s sure. ;)

Seriously though, the problem that led to us adding this
multiplication was that writeback was not feeding XFS 1024 pages at
a time - we were getting much less than this (somewhere in the order
of 32-64 pages at a time). With the fixes I posted, in every
circumstance I can see we are the correct number of pages (1024
pages or what is left over from the last inode) being passed into
->writepages, and writeback is back to full speed without needing
this crutch. Indeed, this multiplication now causes  nr_to_write to
go ballistic in some circumstances, and that causes latency and
fairness problems that will significantly reduce write rates for
applications like NFS servers.

Realistically, XFS doesn't need to write more than 1024 pages in one
go - the reason ext4 needs to do this is its amazingly convoluted
delayed allocation path and the fact that its allocator is nowhere
near as good at contiguous allocation across multiple invocations as
the XFS allocator is. IOWs, XFS really just needs enough contiguous
pages to be able to form large IOs, and given that most hardware
limits the IO size to 1MB on x86_64, then 1024 pages is more than
enough to provide this.

Cheers,

Dave.
-- 
Dave Chinner
david@fromorbit.com

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 54+ messages in thread

* Re: [PATCH 3/4] writeback: pay attention to wbc->nr_to_write in write_cache_pages
  2010-04-25  3:33     ` tytso
  (?)
@ 2010-04-26  1:49       ` Dave Chinner
  -1 siblings, 0 replies; 54+ messages in thread
From: Dave Chinner @ 2010-04-26  1:49 UTC (permalink / raw)
  To: tytso, linux-fsdevel, linux-kernel, xfs

On Sat, Apr 24, 2010 at 11:33:15PM -0400, tytso@mit.edu wrote:
> On Tue, Apr 20, 2010 at 12:41:53PM +1000, Dave Chinner wrote:
> > From: Dave Chinner <dchinner@redhat.com>
> > 
> > If a filesystem writes more than one page in ->writepage, write_cache_pages
> > fails to notice this and continues to attempt writeback when wbc->nr_to_write
> > has gone negative - this trace was captured from XFS:
> > 
> > 
> >     wbc_writeback_start: towrt=1024
> >     wbc_writepage: towrt=1024
> >     wbc_writepage: towrt=0
> >     wbc_writepage: towrt=-1
> >     wbc_writepage: towrt=-5
> >     wbc_writepage: towrt=-21
> >     wbc_writepage: towrt=-85
> > 
> > This has adverse effects on filesystem writeback behaviour. write_cache_pages()
> > needs to terminate after a certain number of pages are written, not after a
> > certain number of calls to ->writepage are made. Make it observe the current
> > value of wbc->nr_to_write and treat a value of <= 0 as though it is either a
> > termination condition or a trigger to reset to MAX_WRITEBACK_PAGES for data
> > integrity syncs.
> 
> Be careful here.  If you are going to write more pages than what the
> writeback code has requested (the stupid no more than 1024 pages
> restriction in the writeback code before it jumps to start writing
> some other inode), you actually need to let the returned
> wbc->nr_to_write go negative, so that wb_writeback() knows how many
> pages it has written.
> 
> In other words, the writeback code assumes that 
> 
>   <original value of nr_to_write> - <returned wbc->nr_to_write>
> 
> is
> 
>   <number of pages actually written>

Yes, but that does not require a negative value to get right.  None
of the code relies on negative nr_to_write values to do anything
correctly, and all the termination checks are for wbc->nr_to_write
<= 0. And the tracing shows it behaves correctly when
wbc->nr_to_write = 0 on return. Requiring a negative number is not
documented in any of the comments, write_cache_pages() does not
return a negative number, etc, so I can't see why you think this is
necessary....

> If you don't let wbc->nr_to_write go negative, the writeback code will
> be confused about how many pages were _actually_ written, and the
> writeback code ends up writing too much.  See commit 2faf2e1.

ext4 added a "bump" to wbc->nr_to_write, then in some cases forgot
to remove it so it never returned to <= 0. Well, of course this
causes writeback to write too much! But that's an ext4 bug not
allowing nr_to_write to reach zero (not negative, but zero), not a
general writeback bug....

> All of this is a crock of course.  The file system shouldn't be
> second-guessing the writeback code.  Instead the writeback code should
> be adaptively measuring how long it takes to write out N pages
> to a particular block device, and then decide what's the appropriate
> setting for nr_to_write.  What makes sense for a USB stick, or a 4200
> RPM laptop drive, may not make sense for a massive RAID array....

Why? Writeback should just keep pushing pages down until it congests
the block device. Then it throttles itself in get_request() and so
writeback already adapts to the load on the device.  Multiple passes
of 1024 pages per dirty inode is fine for this - a larger
nr_to_write doesn't get the block device to congestion any faster or
slower, nor does it change the behaviour once at congestion....

> But since we don't have that, both XFS and ext4 have workarounds for
> brain-damaged writeback behaviour.  (I did some testing, and even for
> standard laptop drives the cap of 1024 pages is just Way Too Small;
> that limit was set something like a decade ago, and everyone has been
> afraid to change it, even though disks have gotten a wee bit faster
> since those days.)

XFS put a workaround in for a different reason to ext4. ext4 put it
in to improve delayed allocation by working with larger chunks of
pages. XFS put it in to get large IOs to be issued through
submit_bio(), not to help the allocator...

And to be the nasty person to shoot down your modern hardware
theory: nr_to_write = 1024 pages works just fine on my laptop (XFS
on indilix SSD) as well as my big test server (XFS on 12 disk RAID0)
The server gets 1.5GB/s with pretty much perfect IO patterns with
the fixes I posted, unlike the mess of single page IOs that occurs
without them....

Cheers,

Dave.
-- 
Dave Chinner
david@fromorbit.com

^ permalink raw reply	[flat|nested] 54+ messages in thread

* Re: [PATCH 3/4] writeback: pay attention to wbc->nr_to_write in write_cache_pages
@ 2010-04-26  1:49       ` Dave Chinner
  0 siblings, 0 replies; 54+ messages in thread
From: Dave Chinner @ 2010-04-26  1:49 UTC (permalink / raw)
  To: tytso, linux-fsdevel, linux-kernel, xfs

On Sat, Apr 24, 2010 at 11:33:15PM -0400, tytso@mit.edu wrote:
> On Tue, Apr 20, 2010 at 12:41:53PM +1000, Dave Chinner wrote:
> > From: Dave Chinner <dchinner@redhat.com>
> > 
> > If a filesystem writes more than one page in ->writepage, write_cache_pages
> > fails to notice this and continues to attempt writeback when wbc->nr_to_write
> > has gone negative - this trace was captured from XFS:
> > 
> > 
> >     wbc_writeback_start: towrt=1024
> >     wbc_writepage: towrt=1024
> >     wbc_writepage: towrt=0
> >     wbc_writepage: towrt=-1
> >     wbc_writepage: towrt=-5
> >     wbc_writepage: towrt=-21
> >     wbc_writepage: towrt=-85
> > 
> > This has adverse effects on filesystem writeback behaviour. write_cache_pages()
> > needs to terminate after a certain number of pages are written, not after a
> > certain number of calls to ->writepage are made. Make it observe the current
> > value of wbc->nr_to_write and treat a value of <= 0 as though it is either a
> > termination condition or a trigger to reset to MAX_WRITEBACK_PAGES for data
> > integrity syncs.
> 
> Be careful here.  If you are going to write more pages than what the
> writeback code has requested (the stupid no more than 1024 pages
> restriction in the writeback code before it jumps to start writing
> some other inode), you actually need to let the returned
> wbc->nr_to_write go negative, so that wb_writeback() knows how many
> pages it has written.
> 
> In other words, the writeback code assumes that 
> 
>   <original value of nr_to_write> - <returned wbc->nr_to_write>
> 
> is
> 
>   <number of pages actually written>

Yes, but that does not require a negative value to get right.  None
of the code relies on negative nr_to_write values to do anything
correctly, and all the termination checks are for wbc->nr_to_write
<= 0. And the tracing shows it behaves correctly when
wbc->nr_to_write = 0 on return. Requiring a negative number is not
documented in any of the comments, write_cache_pages() does not
return a negative number, etc, so I can't see why you think this is
necessary....

> If you don't let wbc->nr_to_write go negative, the writeback code will
> be confused about how many pages were _actually_ written, and the
> writeback code ends up writing too much.  See commit 2faf2e1.

ext4 added a "bump" to wbc->nr_to_write, then in some cases forgot
to remove it so it never returned to <= 0. Well, of course this
causes writeback to write too much! But that's an ext4 bug not
allowing nr_to_write to reach zero (not negative, but zero), not a
general writeback bug....

> All of this is a crock of course.  The file system shouldn't be
> second-guessing the writeback code.  Instead the writeback code should
> be adaptively measuring how long it takes to write out N pages
> to a particular block device, and then decide what's the appropriate
> setting for nr_to_write.  What makes sense for a USB stick, or a 4200
> RPM laptop drive, may not make sense for a massive RAID array....

Why? Writeback should just keep pushing pages down until it congests
the block device. Then it throttles itself in get_request() and so
writeback already adapts to the load on the device.  Multiple passes
of 1024 pages per dirty inode is fine for this - a larger
nr_to_write doesn't get the block device to congestion any faster or
slower, nor does it change the behaviour once at congestion....

> But since we don't have that, both XFS and ext4 have workarounds for
> brain-damaged writeback behaviour.  (I did some testing, and even for
> standard laptop drives the cap of 1024 pages is just Way Too Small;
> that limit was set something like a decade ago, and everyone has been
> afraid to change it, even though disks have gotten a wee bit faster
> since those days.)

XFS put a workaround in for a different reason to ext4. ext4 put it
in to improve delayed allocation by working with larger chunks of
pages. XFS put it in to get large IOs to be issued through
submit_bio(), not to help the allocator...

And to be the nasty person to shoot down your modern hardware
theory: nr_to_write = 1024 pages works just fine on my laptop (XFS
on indilix SSD) as well as my big test server (XFS on 12 disk RAID0)
The server gets 1.5GB/s with pretty much perfect IO patterns with
the fixes I posted, unlike the mess of single page IOs that occurs
without them....

Cheers,

Dave.
-- 
Dave Chinner
david@fromorbit.com
--
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 54+ messages in thread

* Re: [PATCH 3/4] writeback: pay attention to wbc->nr_to_write in write_cache_pages
@ 2010-04-26  1:49       ` Dave Chinner
  0 siblings, 0 replies; 54+ messages in thread
From: Dave Chinner @ 2010-04-26  1:49 UTC (permalink / raw)
  To: tytso, linux-fsdevel, linux-kernel, xfs

On Sat, Apr 24, 2010 at 11:33:15PM -0400, tytso@mit.edu wrote:
> On Tue, Apr 20, 2010 at 12:41:53PM +1000, Dave Chinner wrote:
> > From: Dave Chinner <dchinner@redhat.com>
> > 
> > If a filesystem writes more than one page in ->writepage, write_cache_pages
> > fails to notice this and continues to attempt writeback when wbc->nr_to_write
> > has gone negative - this trace was captured from XFS:
> > 
> > 
> >     wbc_writeback_start: towrt=1024
> >     wbc_writepage: towrt=1024
> >     wbc_writepage: towrt=0
> >     wbc_writepage: towrt=-1
> >     wbc_writepage: towrt=-5
> >     wbc_writepage: towrt=-21
> >     wbc_writepage: towrt=-85
> > 
> > This has adverse effects on filesystem writeback behaviour. write_cache_pages()
> > needs to terminate after a certain number of pages are written, not after a
> > certain number of calls to ->writepage are made. Make it observe the current
> > value of wbc->nr_to_write and treat a value of <= 0 as though it is either a
> > termination condition or a trigger to reset to MAX_WRITEBACK_PAGES for data
> > integrity syncs.
> 
> Be careful here.  If you are going to write more pages than what the
> writeback code has requested (the stupid no more than 1024 pages
> restriction in the writeback code before it jumps to start writing
> some other inode), you actually need to let the returned
> wbc->nr_to_write go negative, so that wb_writeback() knows how many
> pages it has written.
> 
> In other words, the writeback code assumes that 
> 
>   <original value of nr_to_write> - <returned wbc->nr_to_write>
> 
> is
> 
>   <number of pages actually written>

Yes, but that does not require a negative value to get right.  None
of the code relies on negative nr_to_write values to do anything
correctly, and all the termination checks are for wbc->nr_to_write
<= 0. And the tracing shows it behaves correctly when
wbc->nr_to_write = 0 on return. Requiring a negative number is not
documented in any of the comments, write_cache_pages() does not
return a negative number, etc, so I can't see why you think this is
necessary....

> If you don't let wbc->nr_to_write go negative, the writeback code will
> be confused about how many pages were _actually_ written, and the
> writeback code ends up writing too much.  See commit 2faf2e1.

ext4 added a "bump" to wbc->nr_to_write, then in some cases forgot
to remove it so it never returned to <= 0. Well, of course this
causes writeback to write too much! But that's an ext4 bug not
allowing nr_to_write to reach zero (not negative, but zero), not a
general writeback bug....

> All of this is a crock of course.  The file system shouldn't be
> second-guessing the writeback code.  Instead the writeback code should
> be adaptively measuring how long it takes to write out N pages
> to a particular block device, and then decide what's the appropriate
> setting for nr_to_write.  What makes sense for a USB stick, or a 4200
> RPM laptop drive, may not make sense for a massive RAID array....

Why? Writeback should just keep pushing pages down until it congests
the block device. Then it throttles itself in get_request() and so
writeback already adapts to the load on the device.  Multiple passes
of 1024 pages per dirty inode is fine for this - a larger
nr_to_write doesn't get the block device to congestion any faster or
slower, nor does it change the behaviour once at congestion....

> But since we don't have that, both XFS and ext4 have workarounds for
> brain-damaged writeback behaviour.  (I did some testing, and even for
> standard laptop drives the cap of 1024 pages is just Way Too Small;
> that limit was set something like a decade ago, and everyone has been
> afraid to change it, even though disks have gotten a wee bit faster
> since those days.)

XFS put a workaround in for a different reason to ext4. ext4 put it
in to improve delayed allocation by working with larger chunks of
pages. XFS put it in to get large IOs to be issued through
submit_bio(), not to help the allocator...

And to be the nasty person to shoot down your modern hardware
theory: nr_to_write = 1024 pages works just fine on my laptop (XFS
on indilix SSD) as well as my big test server (XFS on 12 disk RAID0)
The server gets 1.5GB/s with pretty much perfect IO patterns with
the fixes I posted, unlike the mess of single page IOs that occurs
without them....

Cheers,

Dave.
-- 
Dave Chinner
david@fromorbit.com

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 54+ messages in thread

* Re: [PATCH 3/4] writeback: pay attention to wbc->nr_to_write in write_cache_pages
  2010-04-26  1:49       ` Dave Chinner
@ 2010-04-26  2:43         ` tytso
  -1 siblings, 0 replies; 54+ messages in thread
From: tytso @ 2010-04-26  2:43 UTC (permalink / raw)
  To: Dave Chinner; +Cc: linux-fsdevel, linux-kernel, xfs

On Mon, Apr 26, 2010 at 11:49:08AM +1000, Dave Chinner wrote:
> 
> Yes, but that does not require a negative value to get right.  None
> of the code relies on negative nr_to_write values to do anything
> correctly, and all the termination checks are for wbc->nr_to_write
> <= 0. And the tracing shows it behaves correctly when
> wbc->nr_to_write = 0 on return. Requiring a negative number is not
> documented in any of the comments, write_cache_pages() does not
> return a negative number, etc, so I can't see why you think this is
> necessary....

In fs/fs-writeback.c, wb_writeback(), around line 774:

   		      wrote += MAX_WRITEBACK_PAGES - wbc.nr_to_write;

If we want "wrote" to be reflect accurately the number of pages that
the filesystem actually wrote, then if you write more pages than what
was requested by wbc.nr_to_write, then it needs to be negative.

> XFS put a workaround in for a different reason to ext4. ext4 put it
> in to improve delayed allocation by working with larger chunks of
> pages. XFS put it in to get large IOs to be issued through
> submit_bio(), not to help the allocator...

That's why I put in ext4 at least initially, yes.  I'm working on
rewriting the ext4_writepages() code to make this unnecessary....

However...

> And to be the nasty person to shoot down your modern hardware
> theory: nr_to_write = 1024 pages works just fine on my laptop (XFS
> on indilix SSD) as well as my big test server (XFS on 12 disk RAID0)
> The server gets 1.5GB/s with pretty much perfect IO patterns with
> the fixes I posted, unlike the mess of single page IOs that occurs
> without them....

Have you tested with multiple files that are subject to writeout at
the same time?  After all, if your I/O allocator does a great job of
keeping the files contiguous in chunks larger tham 4MB, then if you
have two or more files that need to be written out, the page allocator
will round robin between the two files in 4MB chunks, and that might
not be considered an ideal I/O pattern.

Regards,

					- Ted

^ permalink raw reply	[flat|nested] 54+ messages in thread

* Re: [PATCH 3/4] writeback: pay attention to wbc->nr_to_write in write_cache_pages
@ 2010-04-26  2:43         ` tytso
  0 siblings, 0 replies; 54+ messages in thread
From: tytso @ 2010-04-26  2:43 UTC (permalink / raw)
  To: Dave Chinner; +Cc: linux-fsdevel, linux-kernel, xfs

On Mon, Apr 26, 2010 at 11:49:08AM +1000, Dave Chinner wrote:
> 
> Yes, but that does not require a negative value to get right.  None
> of the code relies on negative nr_to_write values to do anything
> correctly, and all the termination checks are for wbc->nr_to_write
> <= 0. And the tracing shows it behaves correctly when
> wbc->nr_to_write = 0 on return. Requiring a negative number is not
> documented in any of the comments, write_cache_pages() does not
> return a negative number, etc, so I can't see why you think this is
> necessary....

In fs/fs-writeback.c, wb_writeback(), around line 774:

   		      wrote += MAX_WRITEBACK_PAGES - wbc.nr_to_write;

If we want "wrote" to accurately reflect the number of pages that
the filesystem actually wrote, then if you write more pages than what
was requested by wbc.nr_to_write, then it needs to be negative.

> XFS put a workaround in for a different reason to ext4. ext4 put it
> in to improve delayed allocation by working with larger chunks of
> pages. XFS put it in to get large IOs to be issued through
> submit_bio(), not to help the allocator...

That's why I put in ext4 at least initially, yes.  I'm working on
rewriting the ext4_writepages() code to make this unnecessary....

However...

> And to be the nasty person to shoot down your modern hardware
> theory: nr_to_write = 1024 pages works just fine on my laptop (XFS
> on indilix SSD) as well as my big test server (XFS on 12 disk RAID0)
> The server gets 1.5GB/s with pretty much perfect IO patterns with
> the fixes I posted, unlike the mess of single page IOs that occurs
> without them....

Have you tested with multiple files that are subject to writeout at
the same time?  After all, if your I/O allocator does a great job of
keeping the files contiguous in chunks larger tham 4MB, then if you
have two or more files that need to be written out, the page allocator
will round robin between the two files in 4MB chunks, and that might
not be considered an ideal I/O pattern.

Regards,

					- Ted

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 54+ messages in thread

* Re: [PATCH 3/4] writeback: pay attention to wbc->nr_to_write in write_cache_pages
  2010-04-26  2:43         ` tytso
@ 2010-04-26  2:45           ` tytso
  -1 siblings, 0 replies; 54+ messages in thread
From: tytso @ 2010-04-26  2:45 UTC (permalink / raw)
  To: Dave Chinner, linux-fsdevel, linux-kernel, xfs

On Sun, Apr 25, 2010 at 10:43:02PM -0400, tytso@MIT.EDU wrote:
> Have you tested with multiple files that are subject to writeout at
> the same time?  After all, if your I/O allocator does a great job of
> keeping the files contiguous in chunks larger tham 4MB, then if you
> have two or more files that need to be written out, the page allocator
> will round robin between the two files in 4MB chunks, and that might
> not be considered an ideal I/O pattern.

Argh.  Sorry for not proof reading better before hitting the send
key....

s/tham/than/
s/page allocator/writeback code/

						- Ted

^ permalink raw reply	[flat|nested] 54+ messages in thread

* Re: [PATCH 3/4] writeback: pay attention to wbc->nr_to_write in write_cache_pages
@ 2010-04-26  2:45           ` tytso
  0 siblings, 0 replies; 54+ messages in thread
From: tytso @ 2010-04-26  2:45 UTC (permalink / raw)
  To: Dave Chinner, linux-fsdevel, linux-kernel, xfs

On Sun, Apr 25, 2010 at 10:43:02PM -0400, tytso@MIT.EDU wrote:
> Have you tested with multiple files that are subject to writeout at
> the same time?  After all, if your I/O allocator does a great job of
> keeping the files contiguous in chunks larger tham 4MB, then if you
> have two or more files that need to be written out, the page allocator
> will round robin between the two files in 4MB chunks, and that might
> not be considered an ideal I/O pattern.

Argh.  Sorry for not proof reading better before hitting the send
key....

s/tham/than/
s/page allocator/writeback code/

						- Ted

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 54+ messages in thread

* Re: [PATCH 3/4] writeback: pay attention to wbc->nr_to_write in write_cache_pages
  2010-04-26  2:43         ` tytso
@ 2010-04-27  3:30           ` Dave Chinner
  -1 siblings, 0 replies; 54+ messages in thread
From: Dave Chinner @ 2010-04-27  3:30 UTC (permalink / raw)
  To: tytso, linux-fsdevel, linux-kernel, xfs

On Sun, Apr 25, 2010 at 10:43:02PM -0400, tytso@mit.edu wrote:
> On Mon, Apr 26, 2010 at 11:49:08AM +1000, Dave Chinner wrote:
> > 
> > Yes, but that does not require a negative value to get right.  None
> > of the code relies on negative nr_to_write values to do anything
> > correctly, and all the termination checks are for wbc->nr_to_write
> > <= 0. And the tracing shows it behaves correctly when
> > wbc->nr_to_write = 0 on return. Requiring a negative number is not
> > documented in any of the comments, write_cache_pages() does not
> > return a negative number, etc, so I can't see why you think this is
> > necessary....
> 
> In fs/fs-writeback.c, wb_writeback(), around line 774:
> 
>    		      wrote += MAX_WRITEBACK_PAGES - wbc.nr_to_write;
> 
> If we want "wrote" to accurately reflect the number of pages that
> the filesystem actually wrote, then if you write more pages than what
> was requested by wbc.nr_to_write, then it needs to be negative.

Yes, but the change I made:

	a) prevented it from writing more than requested in the
	   async writeback case, and
	b) prevented it from going massively negative so that the
	   higher levels wouldn't have over-accounted for pages
	   written.

And if we consider that for the sync case we actually return the
number of pages written - it gets capped at zero even when we
write a lot more than that.

Hence exactly accounting for pages written is really not important.
Indeed, exact number of written pages is not actually used for
anything specific - only to determine if there was activity or not:

 919                 pages_written = wb_do_writeback(wb, 0);
 920
 921                 if (pages_written)
 922                         last_active = jiffies;

> > XFS put a workaround in for a different reason to ext4. ext4 put it
> > in to improve delayed allocation by working with larger chunks of
> > pages. XFS put it in to get large IOs to be issued through
> > submit_bio(), not to help the allocator...
> 
> That's why I put in ext4 at least initially, yes.  I'm working on
> rewriting the ext4_writepages() code to make this unnecessary....
> 
> However...
> 
> > And to be the nasty person to shoot down your modern hardware
> > theory: nr_to_write = 1024 pages works just fine on my laptop (XFS
> > on indilix SSD) as well as my big test server (XFS on 12 disk RAID0)
> > The server gets 1.5GB/s with pretty much perfect IO patterns with
> > the fixes I posted, unlike the mess of single page IOs that occurs
> > without them....
> 
> Have you tested with multiple files that are subject to writeout at
> the same time?

Of course.

> After all, if your I/O allocator does a great job of
> keeping the files contiguous in chunks larger tham 4MB, then if you
> have two or more files that need to be written out, the page allocator
> will round robin between the two files in 4MB chunks, and that might
> not be considered an ideal I/O pattern.

4MB chunks translate into 4-8 IOs at the block layer with typical
setups that set the maximum IO size to 512k or 1MB. So that is
_plenty_ to keep a single disk or several disks in a RAID stripe
busy before seeking to another location to do the next set of 4-8
writes. And if the drive has any amount of cache (we're seeing
64-128MB in SATA drives now), then it will be aggregating these writes in
the cache into even larger sequential chunks. Hence seeks in _modern
hardware_ are going to be almost entirely mitigated for most large
sequential write workloads as long as the contiguous chunks are more
than a few MB in size.

Some numbers for you:

One 4GB file (baseline):

$ dd if=/dev/zero of=/mnt/scratch/$i/test bs=1024k count=4000
.....
$ sudo xfs_bmap -vp /mnt/scratch/*/test
/mnt/scratch/0/test:
 EXT: FILE-OFFSET         BLOCK-RANGE      AG AG-OFFSET     TOTAL FLAGS
   0: [0..4710271]:       96..4710367       0 (96..4710367) 4710272 00000
   1: [4710272..8191999]: 5242976..8724703  1 (96..3481823) 3481728 00000

Ideal layout - the AG size is about 2.4GB, so it'll be two extents
as we see (average gives 2GB per extent). This completed at about 440MB/s.

Two 4GB files in parallel into the same directory:

$ for i in `seq 0 1 1`; do dd if=/dev/zero of=/mnt/scratch/test$i bs=1024k count=4000 & done
$ sudo xfs_bmap -vp /mnt/scratch/test* | awk '/ [0-9]*:/ { tot += $6; cnt++ } END { print tot / cnt }'
712348
$

So the average extent size is ~355MB, and throughput was roughly
520MB/s.

Two 4GB files in parallel into different directories (to trigger a
different allocator placement heuristic):

$ for i in `seq 0 1 1`; do dd if=/dev/zero of=/mnt/scratch/$i/test bs=1024k count=4000 & done
$ sudo xfs_bmap -vp /mnt/scratch/*/test | awk '/ [0-9]*:/ { tot += $6; cnt++ } END { printf "%d\n", tot / cnt }'
1170285
$

~600MB average extent size and throughput was roughly 530MB/s.

Let's make it harder - eight 1GB files in parallel into the same directory:

$ for i in `seq 0 1 7`; do dd if=/dev/zero of=/mnt/scratch/test$i bs=1024k count=1000 & done
...
$ sudo xfs_bmap -vp /mnt/scratch/test* | awk '/[0-9]:/ { tot += $6; cnt++ } END { print tot / cnt }'
157538
$

An average of 78MB per extent with throughput at roughly 520MB/s.
IOWs, the extent size is still large enough to provide full
bandwidth to pretty much any application that does sequential IO.
i.e. it is not ideal, but it's not badly fragmented enough to be a
problem for most people.

FWIW, with the current code I am seeing average extent sizes of
roughly 55MB for this same test, so there is significant _reduction_
in fragmentation by making sure we interleave chunks of pages
_consistently_ in writeback. Mind you, throughput didn't change
because extents of 55MB are still large enough to maintain full disk
throughput for this workload....

FYI, if this level of fragmentation were a problem for this
workload (e.g. a mythTV box) I could use something like the
allocsize mount option to specify the EOF preallocation size:

$ sudo umount /mnt/scratch
$ sudo mount -o logbsize=262144,nobarrier,allocsize=512m /dev/vdb /mnt/scratch
$ for i in `seq 0 1 7`; do dd if=/dev/zero of=/mnt/scratch/test$i bs=1024k count=1000 & done
....
$ sudo xfs_bmap -vp /mnt/scratch/test* | awk '/ [0-9]*:/ { tot += $6; cnt++ } END { print tot / cnt }'
1024000
$

512MB extent size average, exactly, with throughput at 510MB/s (so
not real reduction in throughput). IOWs, fragmentation for this
workload can be directly controlled without any performance penalty
if necessary.

I hope this answers your question, Ted. ;)

Cheers,

Dave.
-- 
Dave Chinner
david@fromorbit.com

^ permalink raw reply	[flat|nested] 54+ messages in thread

* Re: [PATCH 3/4] writeback: pay attention to wbc->nr_to_write in write_cache_pages
@ 2010-04-27  3:30           ` Dave Chinner
  0 siblings, 0 replies; 54+ messages in thread
From: Dave Chinner @ 2010-04-27  3:30 UTC (permalink / raw)
  To: tytso, linux-fsdevel, linux-kernel, xfs

On Sun, Apr 25, 2010 at 10:43:02PM -0400, tytso@mit.edu wrote:
> On Mon, Apr 26, 2010 at 11:49:08AM +1000, Dave Chinner wrote:
> > 
> > Yes, but that does not require a negative value to get right.  None
> > of the code relies on negative nr_to_write values to do anything
> > correctly, and all the termination checks are for wbc->nr_to-write
> > <= 0. And the tracing shows it behaves correctly when
> > wbc->nr_to_write = 0 on return. Requiring a negative number is not
> > documented in any of the comments, write_cache_pages() does not
> > return a negative number, etc, so I can't see why you think this is
> > necessary....
> 
> In fs/fs-writeback.c, wb_writeback(), around line 774:
> 
>    		      wrote += MAX_WRITEBACK_PAGES - wbc.nr_to_write;
> 
> If we want "wrote" to be reflect accurately the number of pages that
> the filesystem actually wrote, then if you write more pages than what
> was requested by wbc.nr_to_write, then it needs to be negative.

Yes, but the change I made:

	a) prevented it from writing more than requested in the
	   async writeback case, and
	b) prevented it from going massively negative so that the
	   higher levels wouldn't have over-accounted for pages
	   written.

And if we consider that for the sync case we actually return the
number of pages written - it gets capped at zero even when we
write a lot more than that.

Hence exactly accounting for pages written is really not important.
Indeed, exact number of written pages is not actually used for
anything specific - only to determine if there was activity or not:

 919                 pages_written = wb_do_writeback(wb, 0);
 920
 921                 if (pages_written)
 922                         last_active = jiffies;

> > XFS put a workaround in for a different reason to ext4. ext4 put it
> > in to improve delayed allocation by working with larger chunks of
> > pages. XFS put it in to get large IOs to be issued through
> > submit_bio(), not to help the allocator...
> 
> That's why I put in ext4 at least initially, yes.  I'm working on
> rewriting the ext4_writepages() code to make this unnecessary....
> 
> However...
> 
> > And to be the nasty person to shoot down your modern hardware
> > theory: nr_to_write = 1024 pages works just fine on my laptop (XFS
> > on indilix SSD) as well as my big test server (XFS on 12 disk RAID0)
> > The server gets 1.5GB/s with pretty much perfect IO patterns with
> > the fixes I posted, unlike the mess of single page IOs that occurs
> > without them....
> 
> Have you tested with multiple files that are subject to writeout at
> the same time?

Of course.

> After all, if your I/O allocator does a great job of
> keeping the files contiguous in chunks larger tham 4MB, then if you
> have two or more files that need to be written out, the page allocator
> will round robin between the two files in 4MB chunks, and that might
> not be considered an ideal I/O pattern.

4MB chunks translate into 4-8 IOs at the block layer with typical
setups that set the maximum IO size to 512k or 1MB. So that is
_plenty_ to keep a single disk or several disks in a RAID stripe
busy before seeking to another location to do the next set of 4-8
writes. And if the drive has any amount of cache (we're seeing
64-128MB in SATA drives now), then it will be aggregating these writes in
the cache into even larger sequential chunks. Hence seeks in _modern
hardware_ are going to be almost entirely mitigated for most large
sequential write workloads as long as the contiguous chunks are more
than a few MB in size.

Some numbers for you:

One 4GB file (baseline):

$ dd if=/dev/zero of=/mnt/scratch/$i/test bs=1024k count=4000
.....
$ sudo xfs_bmap -vp /mnt/scratch/*/test
/mnt/scratch/0/test:
 EXT: FILE-OFFSET         BLOCK-RANGE      AG AG-OFFSET     TOTAL FLAGS
   0: [0..4710271]:       96..4710367       0 (96..4710367) 4710272 00000
   1: [4710272..8191999]: 5242976..8724703  1 (96..3481823) 3481728 00000

Ideal layout - the AG size is about 2.4GB, so it'll be two extents
as we see (average gives 2GB per extent). This completed at about 440MB/s.

Two 4GB files in parallel into the same directory:

$ for i in `seq 0 1 1`; do dd if=/dev/zero of=/mnt/scratch/test$i bs=1024k count=4000 & done
$ sudo xfs_bmap -vp /mnt/scratch/test* | awk '/ [0-9]*:/ { tot += $6; cnt++ } END { print tot / cnt }'
712348
$

So the average extent size is ~355MB, and throughput was roughly
520MB/s.

Two 4GB files in parallel into different directories (to trigger a
different allocator placement heuristic):

$ for i in `seq 0 1 1`; do dd if=/dev/zero of=/mnt/scratch/$i/test bs=1024k count=4000 & done
$ sudo xfs_bmap -vp /mnt/scratch/*/test | awk '/ [0-9]*:/ { tot += $6; cnt++ } END { printf "%d\n", tot / cnt }'
1170285
$

~600MB average extent size and throughput was roughly 530MB/s.

Let's make it harder - eight 1GB files in parallel into the same directory:

$ for i in `seq 0 1 7`; do dd if=/dev/zero of=/mnt/scratch/test$i bs=1024k count=1000 & done
...
$ sudo xfs_bmap -vp /mnt/scratch/test* | awk '/[0-9]:/ { tot += $6; cnt++ } END { print tot / cnt }'
157538
$

An average of 78MB per extent with throughput at roughly 520MB/s.
IOWs, the extent size is still large enough to provide full
bandwidth to pretty much any application that does sequential IO.
i.e. it is not ideal, but it's not badly fragmented enough to be a
problem for most people.

FWIW, with the current code I am seeing average extent sizes of
roughly 55MB for this same test, so there is significant _reduction_
in fragmentation by making sure we interleave chunks of pages
_consistently_ in writeback. Mind you, throughput didn't change
because extents of 55MB are still large enough to maintain full disk
throughput for this workload....

FYI, if this level of fragmentation were a problem for this
workload (e.g. a mythTV box) I could use something like the
allocsize mount option to specify the EOF preallocation size:

$ sudo umount /mnt/scratch
$ sudo mount -o logbsize=262144,nobarrier,allocsize=512m /dev/vdb /mnt/scratch
$ for i in `seq 0 1 7`; do dd if=/dev/zero of=/mnt/scratch/test$i bs=1024k count=1000 & done
....
$ sudo xfs_bmap -vp /mnt/scratch/test* | awk '/ [0-9]*:/ { tot += $6; cnt++ } END { print tot / cnt }'
1024000
$

512MB extent size average, exactly, with throughput at 510MB/s (so
no real reduction in throughput). IOWs, fragmentation for this
workload can be directly controlled without any performance penalty
if necessary.

I hope this answers your question, Ted. ;)

Cheers,

Dave.
-- 
Dave Chinner
david@fromorbit.com

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 54+ messages in thread

* Re: [PATCH 3/4] writeback: pay attention to wbc->nr_to_write in write_cache_pages
  2010-04-20  2:41   ` Dave Chinner
@ 2010-04-29 21:39     ` Andrew Morton
  -1 siblings, 0 replies; 54+ messages in thread
From: Andrew Morton @ 2010-04-29 21:39 UTC (permalink / raw)
  To: Dave Chinner
  Cc: linux-fsdevel, linux-kernel, xfs, Aneesh Kumar K.V, Theodore Ts'o

On Tue, 20 Apr 2010 12:41:53 +1000
Dave Chinner <david@fromorbit.com> wrote:

> If a filesystem writes more than one page in ->writepage, write_cache_pages
> fails to notice this and continues to attempt writeback when wbc->nr_to_write
> has gone negative - this trace was captured from XFS:
> 
> 
>     wbc_writeback_start: towrt=1024
>     wbc_writepage: towrt=1024
>     wbc_writepage: towrt=0
>     wbc_writepage: towrt=-1
>     wbc_writepage: towrt=-5
>     wbc_writepage: towrt=-21
>     wbc_writepage: towrt=-85
> 

Bug.

AFAIT it's a regression introduced by

: commit 17bc6c30cf6bfffd816bdc53682dd46fc34a2cf4
: Author:     Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
: AuthorDate: Thu Oct 16 10:09:17 2008 -0400
: Commit:     Theodore Ts'o <tytso@mit.edu>
: CommitDate: Thu Oct 16 10:09:17 2008 -0400
: 
:     vfs: Add no_nrwrite_index_update writeback control flag

I suggest that what you do here is remove the local `nr_to_write' from
write_cache_pages() and go back to directly using wbc->nr_to_write
within the loop.

And thus we restore the convention that if the fs writes back more than
a single page, it subtracts (nr_written - 1) from wbc->nr_to_write.


^ permalink raw reply	[flat|nested] 54+ messages in thread

* Re: [PATCH 3/4] writeback: pay attention to wbc->nr_to_write in write_cache_pages
@ 2010-04-29 21:39     ` Andrew Morton
  0 siblings, 0 replies; 54+ messages in thread
From: Andrew Morton @ 2010-04-29 21:39 UTC (permalink / raw)
  To: Dave Chinner
  Cc: linux-fsdevel, Theodore Ts'o, linux-kernel, Aneesh Kumar K.V, xfs

On Tue, 20 Apr 2010 12:41:53 +1000
Dave Chinner <david@fromorbit.com> wrote:

> If a filesystem writes more than one page in ->writepage, write_cache_pages
> fails to notice this and continues to attempt writeback when wbc->nr_to_write
> has gone negative - this trace was captured from XFS:
> 
> 
>     wbc_writeback_start: towrt=1024
>     wbc_writepage: towrt=1024
>     wbc_writepage: towrt=0
>     wbc_writepage: towrt=-1
>     wbc_writepage: towrt=-5
>     wbc_writepage: towrt=-21
>     wbc_writepage: towrt=-85
> 

Bug.

AFAIT it's a regression introduced by

: commit 17bc6c30cf6bfffd816bdc53682dd46fc34a2cf4
: Author:     Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
: AuthorDate: Thu Oct 16 10:09:17 2008 -0400
: Commit:     Theodore Ts'o <tytso@mit.edu>
: CommitDate: Thu Oct 16 10:09:17 2008 -0400
: 
:     vfs: Add no_nrwrite_index_update writeback control flag

I suggest that what you do here is remove the local `nr_to_write' from
write_cache_pages() and go back to directly using wbc->nr_to_write
within the loop.

And thus we restore the convention that if the fs writes back more than
a single page, it subtracts (nr_written - 1) from wbc->nr_to_write.

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 54+ messages in thread

* Re: [PATCH 3/4] writeback: pay attention to wbc->nr_to_write in write_cache_pages
  2010-04-29 21:39     ` Andrew Morton
@ 2010-04-30  6:01       ` Aneesh Kumar K. V
  -1 siblings, 0 replies; 54+ messages in thread
From: Aneesh Kumar K. V @ 2010-04-30  6:01 UTC (permalink / raw)
  To: Andrew Morton, Dave Chinner
  Cc: linux-fsdevel, linux-kernel, xfs, Theodore Ts'o

On Thu, 29 Apr 2010 14:39:31 -0700, Andrew Morton <akpm@linux-foundation.org> wrote:
> On Tue, 20 Apr 2010 12:41:53 +1000
> Dave Chinner <david@fromorbit.com> wrote:
> 
> > If a filesystem writes more than one page in ->writepage, write_cache_pages
> > fails to notice this and continues to attempt writeback when wbc->nr_to_write
> > has gone negative - this trace was captured from XFS:
> > 
> > 
> >     wbc_writeback_start: towrt=1024
> >     wbc_writepage: towrt=1024
> >     wbc_writepage: towrt=0
> >     wbc_writepage: towrt=-1
> >     wbc_writepage: towrt=-5
> >     wbc_writepage: towrt=-21
> >     wbc_writepage: towrt=-85
> > 
> 
> Bug.
> 
> AFAIT it's a regression introduced by
> 
> : commit 17bc6c30cf6bfffd816bdc53682dd46fc34a2cf4
> : Author:     Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
> : AuthorDate: Thu Oct 16 10:09:17 2008 -0400
> : Commit:     Theodore Ts'o <tytso@mit.edu>
> : CommitDate: Thu Oct 16 10:09:17 2008 -0400
> : 
> :     vfs: Add no_nrwrite_index_update writeback control flag
> 
> I suggest that what you do here is remove the local `nr_to_write' from
> write_cache_pages() and go back to directly using wbc->nr_to_write
> within the loop.
> 
> And thus we restore the convention that if the fs writes back more than
> a single page, it subtracts (nr_written - 1) from wbc->nr_to_write.
> 

My mistake i never expected writepage to write more than one page. The
interface said 'writepage' so it was natural to expect that it writes only
one page. BTW the reason for the change is to give file system which
accumulate dirty pages using write_cache_pages and attempt to write
them out later a chance to properly manage nr_to_write. Something like

ext4_da_writepages
-- write_cache_pages
---- collect dirty page
---- return
--return
--now try to writeout all the collected dirty pages ( say 100)
----Only able to allocate blocks for 50 pages
    so update nr_to_write -= 50 and mark rest of 50 pages as dirty
    again

So we want wbc->nr_to_write updated only by ext4_da_writepages. 


-aneesh

^ permalink raw reply	[flat|nested] 54+ messages in thread

* Re: [PATCH 3/4] writeback: pay attention to wbc->nr_to_write in write_cache_pages
@ 2010-04-30  6:01       ` Aneesh Kumar K. V
  0 siblings, 0 replies; 54+ messages in thread
From: Aneesh Kumar K. V @ 2010-04-30  6:01 UTC (permalink / raw)
  To: Andrew Morton, Dave Chinner
  Cc: linux-fsdevel, Theodore Ts'o, linux-kernel, xfs

On Thu, 29 Apr 2010 14:39:31 -0700, Andrew Morton <akpm@linux-foundation.org> wrote:
> On Tue, 20 Apr 2010 12:41:53 +1000
> Dave Chinner <david@fromorbit.com> wrote:
> 
> > If a filesystem writes more than one page in ->writepage, write_cache_pages
> > fails to notice this and continues to attempt writeback when wbc->nr_to_write
> > has gone negative - this trace was captured from XFS:
> > 
> > 
> >     wbc_writeback_start: towrt=1024
> >     wbc_writepage: towrt=1024
> >     wbc_writepage: towrt=0
> >     wbc_writepage: towrt=-1
> >     wbc_writepage: towrt=-5
> >     wbc_writepage: towrt=-21
> >     wbc_writepage: towrt=-85
> > 
> 
> Bug.
> 
> AFAIT it's a regression introduced by
> 
> : commit 17bc6c30cf6bfffd816bdc53682dd46fc34a2cf4
> : Author:     Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
> : AuthorDate: Thu Oct 16 10:09:17 2008 -0400
> : Commit:     Theodore Ts'o <tytso@mit.edu>
> : CommitDate: Thu Oct 16 10:09:17 2008 -0400
> : 
> :     vfs: Add no_nrwrite_index_update writeback control flag
> 
> I suggest that what you do here is remove the local `nr_to_write' from
> write_cache_pages() and go back to directly using wbc->nr_to_write
> within the loop.
> 
> And thus we restore the convention that if the fs writes back more than
> a single page, it subtracts (nr_written - 1) from wbc->nr_to_write.
> 

My mistake i never expected writepage to write more than one page. The
interface said 'writepage' so it was natural to expect that it writes only
one page. BTW the reason for the change is to give file system which
accumulate dirty pages using write_cache_pages and attempt to write
them out later a chance to properly manage nr_to_write. Something like

ext4_da_writepages
-- write_cache_pages
---- collect dirty page
---- return
--return
--now try to writeout all the collected dirty pages ( say 100)
----Only able to allocate blocks for 50 pages
    so update nr_to_write -= 50 and mark rest of 50 pages as dirty
    again

So we want wbc->nr_to_write updated only by ext4_da_writepages. 


-aneesh

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 54+ messages in thread

* Re: [PATCH 3/4] writeback: pay attention to wbc->nr_to_write in write_cache_pages
  2010-04-30  6:01       ` Aneesh Kumar K. V
@ 2010-04-30 19:43         ` Andrew Morton
  -1 siblings, 0 replies; 54+ messages in thread
From: Andrew Morton @ 2010-04-30 19:43 UTC (permalink / raw)
  To: Aneesh Kumar K. V
  Cc: Dave Chinner, linux-fsdevel, linux-kernel, xfs, Theodore Ts'o

On Fri, 30 Apr 2010 11:31:53 +0530
"Aneesh Kumar K. V" <aneesh.kumar@linux.vnet.ibm.com> wrote:

> On Thu, 29 Apr 2010 14:39:31 -0700, Andrew Morton <akpm@linux-foundation.org> wrote:
> > On Tue, 20 Apr 2010 12:41:53 +1000
> > Dave Chinner <david@fromorbit.com> wrote:
> > 
> > > If a filesystem writes more than one page in ->writepage, write_cache_pages
> > > fails to notice this and continues to attempt writeback when wbc->nr_to_write
> > > has gone negative - this trace was captured from XFS:
> > > 
> > > 
> > >     wbc_writeback_start: towrt=1024
> > >     wbc_writepage: towrt=1024
> > >     wbc_writepage: towrt=0
> > >     wbc_writepage: towrt=-1
> > >     wbc_writepage: towrt=-5
> > >     wbc_writepage: towrt=-21
> > >     wbc_writepage: towrt=-85
> > > 
> > 
> > Bug.
> > 
> > AFAIT it's a regression introduced by
> > 
> > : commit 17bc6c30cf6bfffd816bdc53682dd46fc34a2cf4
> > : Author:     Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
> > : AuthorDate: Thu Oct 16 10:09:17 2008 -0400
> > : Commit:     Theodore Ts'o <tytso@mit.edu>
> > : CommitDate: Thu Oct 16 10:09:17 2008 -0400
> > : 
> > :     vfs: Add no_nrwrite_index_update writeback control flag
> > 
> > I suggest that what you do here is remove the local `nr_to_write' from
> > write_cache_pages() and go back to directly using wbc->nr_to_write
> > within the loop.
> > 
> > And thus we restore the convention that if the fs writes back more than
> > a single page, it subtracts (nr_written - 1) from wbc->nr_to_write.
> > 
> 
> My mistake i never expected writepage to write more than one page.

The writeback code is tricky and easy to break in subtle ways.

> The
> interface said 'writepage' so it was natural to expect that it writes only
> one page. BTW the reason for the change is to give file system which
> accumulate dirty pages using write_cache_pages and attempt to write
> them out later a chance to properly manage nr_to_write. Something like
> 
> ext4_da_writepages
> -- write_cache_pages
> ---- collect dirty page
> ---- return
> --return
> --now try to writeout all the collected dirty pages ( say 100)
> ----Only able to allocate blocks for 50 pages
>     so update nr_to_write -= 50 and mark rest of 50 pages as dirty
>     again
> 
> So we want wbc->nr_to_write updated only by ext4_da_writepages. 

So you want a ->writepage() implementation which doesn't actually write
a page at all - it just remembers that page for later.

Maybe that fs shouldn't be calling write_cache_pages() at all.  After
all, write_cache_pages() is a wrapper which emits a sequence of calls
to ->writepage(), and ->writepage() writes a page.

Rather than hacking around, subverting things and breaking core kernel
code, let's step back and more clearly think about what to do?

One option would be to implement a new address_space_operation which
provides the new semantics in a well-understood fashion.  Let's call it
writepage_prepare(?).  Then reimplement write_cache_pages() so that if
->writepage_prepare() is available, it handles it in a sensible fashion
and doesn't break traditional filesystems.

Or simply implement a new, different version of write_cache_pages() for
filesystems which wish to buffer in this fashion.  The new
write_cache_pages_prepare()(?) would call ->writepage_prepare(). 
Internally it might share implementation with write_cache_pages().

There are lots of options.  But the way in which write_cache_pages()
was extended to handle this ext4 requirement was rather unclean,
non-obvious and, umm, broken!



^ permalink raw reply	[flat|nested] 54+ messages in thread

* Re: [PATCH 3/4] writeback: pay attention to wbc->nr_to_write in write_cache_pages
@ 2010-04-30 19:43         ` Andrew Morton
  0 siblings, 0 replies; 54+ messages in thread
From: Andrew Morton @ 2010-04-30 19:43 UTC (permalink / raw)
  To: Aneesh Kumar K. V; +Cc: linux-fsdevel, Theodore Ts'o, linux-kernel, xfs

On Fri, 30 Apr 2010 11:31:53 +0530
"Aneesh Kumar K. V" <aneesh.kumar@linux.vnet.ibm.com> wrote:

> On Thu, 29 Apr 2010 14:39:31 -0700, Andrew Morton <akpm@linux-foundation.org> wrote:
> > On Tue, 20 Apr 2010 12:41:53 +1000
> > Dave Chinner <david@fromorbit.com> wrote:
> > 
> > > If a filesystem writes more than one page in ->writepage, write_cache_pages
> > > fails to notice this and continues to attempt writeback when wbc->nr_to_write
> > > has gone negative - this trace was captured from XFS:
> > > 
> > > 
> > >     wbc_writeback_start: towrt=1024
> > >     wbc_writepage: towrt=1024
> > >     wbc_writepage: towrt=0
> > >     wbc_writepage: towrt=-1
> > >     wbc_writepage: towrt=-5
> > >     wbc_writepage: towrt=-21
> > >     wbc_writepage: towrt=-85
> > > 
> > 
> > Bug.
> > 
> > AFAIT it's a regression introduced by
> > 
> > : commit 17bc6c30cf6bfffd816bdc53682dd46fc34a2cf4
> > : Author:     Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
> > : AuthorDate: Thu Oct 16 10:09:17 2008 -0400
> > : Commit:     Theodore Ts'o <tytso@mit.edu>
> > : CommitDate: Thu Oct 16 10:09:17 2008 -0400
> > : 
> > :     vfs: Add no_nrwrite_index_update writeback control flag
> > 
> > I suggest that what you do here is remove the local `nr_to_write' from
> > write_cache_pages() and go back to directly using wbc->nr_to_write
> > within the loop.
> > 
> > And thus we restore the convention that if the fs writes back more than
> > a single page, it subtracts (nr_written - 1) from wbc->nr_to_write.
> > 
> 
> My mistake i never expected writepage to write more than one page.

The writeback code is tricky and easy to break in subtle ways.

> The
> interface said 'writepage' so it was natural to expect that it writes only
> one page. BTW the reason for the change is to give file system which
> accumulate dirty pages using write_cache_pages and attempt to write
> them out later a chance to properly manage nr_to_write. Something like
> 
> ext4_da_writepages
> -- write_cache_pages
> ---- collect dirty page
> ---- return
> --return
> --now try to writeout all the collected dirty pages ( say 100)
> ----Only able to allocate blocks for 50 pages
>     so update nr_to_write -= 50 and mark rest of 50 pages as dirty
>     again
> 
> So we want wbc->nr_to_write updated only by ext4_da_writepages. 

So you want a ->writepage() implementation which doesn't actually write
a page at all - it just remembers that page for later.

Maybe that fs shouldn't be calling write_cache_pages() at all.  After
all, write_cache_pages() is a wrapper which emits a sequence of calls
to ->writepage(), and ->writepage() writes a page.

Rather than hacking around, subverting things and breaking core kernel
code, let's step back and more clearly think about what to do?

One option would be to implement a new address_space_operation which
provides the new semantics in a well-understood fashion.  Let's call it
writepage_prepare(?).  Then reimplement write_cache_pages() so that if
->writepage_prepare() is available, it handles it in a sensible fashion
and doesn't break traditional filesystems.

Or simply implement a new, different version of write_cache_pages() for
filesystems which wish to buffer in this fashion.  The new
write_cache_pages_prepare()(?) would call ->writepage_prepare(). 
Internally it might share implementation with write_cache_pages().

There are lots of options.  But the way in which write_cache_pages()
was extended to handle this ext4 requirement was rather unclean,
non-obvious and, umm, broken!


_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 54+ messages in thread

* Re: [PATCH 3/4] writeback: pay attention to wbc->nr_to_write in write_cache_pages
  2010-04-30 19:43         ` Andrew Morton
@ 2010-05-01 19:47           ` tytso
  -1 siblings, 0 replies; 54+ messages in thread
From: tytso @ 2010-05-01 19:47 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Aneesh Kumar K. V, Dave Chinner, linux-fsdevel, linux-kernel, xfs

On Fri, Apr 30, 2010 at 12:43:29PM -0700, Andrew Morton wrote:
> 
> Maybe that fs shouldn't be calling write_cache_pages() at all.  After
> all, write_cache_pages() is a wrapper which emits a sequence of calls
> to ->writepage(), and ->writepage() writes a page.

On my todo list is to fix ext4 to not call write_cache_pages() at all.
We are seriously abusing that function ATM, since we're not actually
writing the pages when we call write_cache_pages().  I won't go into
what we're doing, because it's too embarrassing, but suffice it to say
that we end up calling pagevec_lookup() or pagevec_lookup_tag()
*four*, count them *four* times while trying to do writeback.

I have a simple patch that gives ext4 our own copy of
write_cache_pages(), and then simplifies it a lot, and fixes a bunch
of problems, but then I discarded it in favor of fundamentally redoing
how we do writeback at all, but it's going to take a while to get
things completely right.  But I am working to try to fix this.

If it would help, I can resurrect the "fork write_cache_pages() and
simplify" patch, so ext4 isn't dependent on the mm/page-writeback.c's
write_cache_pages(), if there is an immediate, short-term need to fix
that function.

						- Ted

^ permalink raw reply	[flat|nested] 54+ messages in thread

* Re: [PATCH 3/4] writeback: pay attention to wbc->nr_to_write in write_cache_pages
@ 2010-05-01 19:47           ` tytso
  0 siblings, 0 replies; 54+ messages in thread
From: tytso @ 2010-05-01 19:47 UTC (permalink / raw)
  To: Andrew Morton; +Cc: linux-fsdevel, xfs, Aneesh Kumar K. V, linux-kernel

On Fri, Apr 30, 2010 at 12:43:29PM -0700, Andrew Morton wrote:
> 
> Maybe that fs shouldn't be calling write_cache_pages() at all.  After
> all, write_cache_pages() is a wrapper which emits a sequence of calls
> to ->writepage(), and ->writepage() writes a page.

On my todo list is to fix ext4 to not call write_cache_pages() at all.
We are seriously abusing that function ATM, since we're not actually
writing the pages when we call write_cache_pages().  I won't go into
what we're doing, because it's too embarrassing, but suffice it to say
that we end up calling pagevec_lookup() or pagevec_lookup_tag()
*four*, count them *four* times while trying to do writeback.

I have a simple patch that gives ext4 our own copy of
write_cache_pages(), and then simplifies it a lot, and fixes a bunch
of problems, but then I discarded it in favor of fundamentally redoing
how we do writeback at all, but it's going to take a while to get
things completely right.  But I am working to try to fix this.

If it would help, I can resurrect the "fork write_cache_pages() and
simplify" patch, so ext4 isn't dependent on the mm/page-writeback.c's
write_cache_pages(), if there is an immediate, short-term need to fix
that function.

						- Ted

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 54+ messages in thread

* Re: [PATCH 0/4] writeback: tracing and wbc->nr_to_write fixes
  2010-04-20  2:41 ` Dave Chinner
@ 2010-05-21 15:05   ` Christoph Hellwig
  -1 siblings, 0 replies; 54+ messages in thread
From: Christoph Hellwig @ 2010-05-21 15:05 UTC (permalink / raw)
  To: Dave Chinner; +Cc: linux-fsdevel, linux-kernel, xfs

What happened to this series?  Getting the trace events in will
definitely help with tuning the writeback code, and we'll also need
the nr_to_write issue fixed some way.


^ permalink raw reply	[flat|nested] 54+ messages in thread

* Re: [PATCH 0/4] writeback: tracing and wbc->nr_to_write fixes
@ 2010-05-21 15:05   ` Christoph Hellwig
  0 siblings, 0 replies; 54+ messages in thread
From: Christoph Hellwig @ 2010-05-21 15:05 UTC (permalink / raw)
  To: Dave Chinner; +Cc: linux-fsdevel, linux-kernel, xfs

What happened to this series?  Getting the trace events in will
definitely help with tuning the writeback code, and we'll also need
the nr_to_write issue fixed some way.

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 54+ messages in thread

* Re: [PATCH 1/4] writeback: initial tracing support
  2010-04-20  2:41   ` Dave Chinner
@ 2010-05-21 15:06     ` Christoph Hellwig
  -1 siblings, 0 replies; 54+ messages in thread
From: Christoph Hellwig @ 2010-05-21 15:06 UTC (permalink / raw)
  To: Dave Chinner; +Cc: linux-fsdevel, linux-kernel, xfs

On Tue, Apr 20, 2010 at 12:41:51PM +1000, Dave Chinner wrote:
> From: From: Jens Axboe <jens.axboe@oracle.com>
> 
> Trace queue/sched/exec parts of the writeback loop.

If you move the CREATE_TRACE_POINTS into fs-writeback.c and include
<trace/events/writeback.h> after the structure definitions there's no
need to move them to a header.


^ permalink raw reply	[flat|nested] 54+ messages in thread

* Re: [PATCH 1/4] writeback: initial tracing support
@ 2010-05-21 15:06     ` Christoph Hellwig
  0 siblings, 0 replies; 54+ messages in thread
From: Christoph Hellwig @ 2010-05-21 15:06 UTC (permalink / raw)
  To: Dave Chinner; +Cc: linux-fsdevel, linux-kernel, xfs

On Tue, Apr 20, 2010 at 12:41:51PM +1000, Dave Chinner wrote:
> From: From: Jens Axboe <jens.axboe@oracle.com>
> 
> Trace queue/sched/exec parts of the writeback loop.

If you move the CREATE_TRACE_POINTS into fs-writeback.c and include
<trace/events/writeback.h> after the structure definitions there's no
need to move them to a header.

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 54+ messages in thread

* Re: [PATCH 0/4] writeback: tracing and wbc->nr_to_write fixes
  2010-05-21 15:05   ` Christoph Hellwig
@ 2010-05-22  0:09     ` Dave Chinner
  -1 siblings, 0 replies; 54+ messages in thread
From: Dave Chinner @ 2010-05-22  0:09 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: linux-fsdevel, linux-kernel, xfs

On Fri, May 21, 2010 at 11:05:04AM -0400, Christoph Hellwig wrote:
> What happened to this series?  Getting the trace events in will
> defintively help with tuning the writeback code, and we'll also need
> the nr_to_write issue fixed some way.

I've been snowed under trying to get several different things done
all at the same time, so this has slipped. I'm trying to get back to
it early next week. The nr_to_pages fix is badly needed, as that
causes severe fragmentation in XFS for the exact workloads it fixes
the severe fragmentation in ext4....

Cheers,

Dave.
-- 
Dave Chinner
david@fromorbit.com

^ permalink raw reply	[flat|nested] 54+ messages in thread

* Re: [PATCH 0/4] writeback: tracing and wbc->nr_to_write fixes
@ 2010-05-22  0:09     ` Dave Chinner
  0 siblings, 0 replies; 54+ messages in thread
From: Dave Chinner @ 2010-05-22  0:09 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: linux-fsdevel, linux-kernel, xfs

On Fri, May 21, 2010 at 11:05:04AM -0400, Christoph Hellwig wrote:
> What happened to this series?  Getting the trace events in will
> definitively help with tuning the writeback code, and we'll also need
> the nr_to_write issue fixed some way.

I've been snowed under trying to get several different things done
all at the same time, so this has slipped. I'm trying to get back to
it early next week. The nr_to_pages fix is badly needed, as that
causes severe fragmentation in XFS for the exact workloads it fixes
the severe fragmentation in ext4....

Cheers,

Dave.
-- 
Dave Chinner
david@fromorbit.com

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 54+ messages in thread

end of thread, other threads:[~2010-05-22  0:09 UTC | newest]

Thread overview: 54+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2010-04-20  2:41 [PATCH 0/4] writeback: tracing and wbc->nr_to_write fixes Dave Chinner
2010-04-20  2:41 ` Dave Chinner
2010-04-20  2:41 ` [PATCH 1/4] writeback: initial tracing support Dave Chinner
2010-04-20  2:41   ` Dave Chinner
2010-05-21 15:06   ` Christoph Hellwig
2010-05-21 15:06     ` Christoph Hellwig
2010-04-20  2:41 ` [PATCH 2/4] writeback: Add tracing to balance_dirty_pages Dave Chinner
2010-04-20  2:41   ` Dave Chinner
2010-04-20  2:41 ` [PATCH 3/4] writeback: pay attention to wbc->nr_to_write in write_cache_pages Dave Chinner
2010-04-20  2:41   ` Dave Chinner
2010-04-22 19:07   ` Jan Kara
2010-04-22 19:07     ` Jan Kara
2010-04-25  3:33   ` tytso
2010-04-25  3:33     ` tytso
2010-04-25  3:33     ` tytso
2010-04-26  1:49     ` Dave Chinner
2010-04-26  1:49       ` Dave Chinner
2010-04-26  1:49       ` Dave Chinner
2010-04-26  2:43       ` tytso
2010-04-26  2:43         ` tytso
2010-04-26  2:45         ` tytso
2010-04-26  2:45           ` tytso
2010-04-27  3:30         ` Dave Chinner
2010-04-27  3:30           ` Dave Chinner
2010-04-29 21:39   ` Andrew Morton
2010-04-29 21:39     ` Andrew Morton
2010-04-30  6:01     ` Aneesh Kumar K. V
2010-04-30  6:01       ` Aneesh Kumar K. V
2010-04-30 19:43       ` Andrew Morton
2010-04-30 19:43         ` Andrew Morton
2010-05-01 19:47         ` tytso
2010-05-01 19:47           ` tytso
2010-04-20  2:41 ` [PATCH 4/4] xfs: remove nr_to_write writeback windup Dave Chinner
2010-04-20  2:41   ` Dave Chinner
2010-04-22 19:09   ` Jan Kara
2010-04-22 19:09     ` Jan Kara
2010-04-26  0:46     ` Dave Chinner
2010-04-26  0:46       ` Dave Chinner
2010-04-20  3:40 ` [PATCH 5/4] writeback: limit write_cache_pages integrity scanning to current EOF Dave Chinner
2010-04-20  3:40   ` Dave Chinner
2010-04-20 23:28   ` Jamie Lokier
2010-04-20 23:28     ` Jamie Lokier
2010-04-20 23:31     ` Dave Chinner
2010-04-20 23:31       ` Dave Chinner
2010-04-22 19:13   ` Jan Kara
2010-04-22 19:13     ` Jan Kara
2010-04-20 12:02 ` [PATCH 0/4] writeback: tracing and wbc->nr_to_write fixes Richard Kennedy
2010-04-20 12:02   ` Richard Kennedy
2010-04-20 23:29   ` Dave Chinner
2010-04-20 23:29     ` Dave Chinner
2010-05-21 15:05 ` Christoph Hellwig
2010-05-21 15:05   ` Christoph Hellwig
2010-05-22  0:09   ` Dave Chinner
2010-05-22  0:09     ` Dave Chinner

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.