linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH 0/8][RFC] IO latency/throughput fixes
@ 2009-04-06 12:48 Jens Axboe
  2009-04-06 12:48 ` [PATCH 1/8] block: change the request allocation/congestion logic to be sync/async based Jens Axboe
                   ` (9 more replies)
  0 siblings, 10 replies; 53+ messages in thread
From: Jens Axboe @ 2009-04-06 12:48 UTC (permalink / raw)
  To: linux-kernel; +Cc: tytso, torvalds

Hi,

This is a set of patches that I worked on today in the hopes
of furthering the latency goals and at least fixing some of
the write regression with fwrite + fsync that current -git
is suffering from.

I haven't done any latency tests yet, I'm just tossing this
out there so we can collaborate on improving things. What I
did test was the silly fwrite() + fsync() loop test, which
is a LOT slower in current -git that it used to be. The test
is basically:

	while (nr--) {
		f = fopen();
		fprintf(f, "Some data here\n");
		fsync(fileno(f));
		fclose(f);
	}

which (for nr == 2000) takes 16 seconds in -git, completes
in 0.9s with the patches.

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 53+ messages in thread

* [PATCH 1/8] block: change the request allocation/congestion logic to be sync/async based
  2009-04-06 12:48 [PATCH 0/8][RFC] IO latency/throughput fixes Jens Axboe
@ 2009-04-06 12:48 ` Jens Axboe
  2009-04-06 12:48 ` [PATCH 2/8] Add WRITE_SYNC_PLUG and SWRITE_SYNC_PLUG Jens Axboe
                   ` (8 subsequent siblings)
  9 siblings, 0 replies; 53+ messages in thread
From: Jens Axboe @ 2009-04-06 12:48 UTC (permalink / raw)
  To: linux-kernel; +Cc: tytso, torvalds, Jens Axboe

This makes sure that we never wait on async IO for sync requests, instead
of doing the split on writes vs reads.

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/blk-core.c            |   70 +++++++++++++++++++++---------------------
 block/blk-sysfs.c           |   40 ++++++++++++------------
 block/elevator.c            |    2 +-
 include/linux/backing-dev.h |   12 ++++----
 include/linux/blkdev.h      |   52 +++++++++++++++++++++----------
 mm/backing-dev.c            |   10 +++---
 6 files changed, 102 insertions(+), 84 deletions(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index 996ed90..a32b571 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -484,11 +484,11 @@ static int blk_init_free_list(struct request_queue *q)
 {
 	struct request_list *rl = &q->rq;
 
-	rl->count[READ] = rl->count[WRITE] = 0;
-	rl->starved[READ] = rl->starved[WRITE] = 0;
+	rl->count[BLK_RW_SYNC] = rl->count[BLK_RW_ASYNC] = 0;
+	rl->starved[BLK_RW_SYNC] = rl->starved[BLK_RW_ASYNC] = 0;
 	rl->elvpriv = 0;
-	init_waitqueue_head(&rl->wait[READ]);
-	init_waitqueue_head(&rl->wait[WRITE]);
+	init_waitqueue_head(&rl->wait[BLK_RW_SYNC]);
+	init_waitqueue_head(&rl->wait[BLK_RW_ASYNC]);
 
 	rl->rq_pool = mempool_create_node(BLKDEV_MIN_RQ, mempool_alloc_slab,
 				mempool_free_slab, request_cachep, q->node);
@@ -699,18 +699,18 @@ static void ioc_set_batching(struct request_queue *q, struct io_context *ioc)
 	ioc->last_waited = jiffies;
 }
 
-static void __freed_request(struct request_queue *q, int rw)
+static void __freed_request(struct request_queue *q, int sync)
 {
 	struct request_list *rl = &q->rq;
 
-	if (rl->count[rw] < queue_congestion_off_threshold(q))
-		blk_clear_queue_congested(q, rw);
+	if (rl->count[sync] < queue_congestion_off_threshold(q))
+		blk_clear_queue_congested(q, sync);
 
-	if (rl->count[rw] + 1 <= q->nr_requests) {
-		if (waitqueue_active(&rl->wait[rw]))
-			wake_up(&rl->wait[rw]);
+	if (rl->count[sync] + 1 <= q->nr_requests) {
+		if (waitqueue_active(&rl->wait[sync]))
+			wake_up(&rl->wait[sync]);
 
-		blk_clear_queue_full(q, rw);
+		blk_clear_queue_full(q, sync);
 	}
 }
 
@@ -718,18 +718,18 @@ static void __freed_request(struct request_queue *q, int rw)
  * A request has just been released.  Account for it, update the full and
  * congestion status, wake up any waiters.   Called under q->queue_lock.
  */
-static void freed_request(struct request_queue *q, int rw, int priv)
+static void freed_request(struct request_queue *q, int sync, int priv)
 {
 	struct request_list *rl = &q->rq;
 
-	rl->count[rw]--;
+	rl->count[sync]--;
 	if (priv)
 		rl->elvpriv--;
 
-	__freed_request(q, rw);
+	__freed_request(q, sync);
 
-	if (unlikely(rl->starved[rw ^ 1]))
-		__freed_request(q, rw ^ 1);
+	if (unlikely(rl->starved[sync ^ 1]))
+		__freed_request(q, sync ^ 1);
 }
 
 /*
@@ -743,15 +743,15 @@ static struct request *get_request(struct request_queue *q, int rw_flags,
 	struct request *rq = NULL;
 	struct request_list *rl = &q->rq;
 	struct io_context *ioc = NULL;
-	const int rw = rw_flags & 0x01;
+	const bool is_sync = rw_is_sync(rw_flags) != 0;
 	int may_queue, priv;
 
 	may_queue = elv_may_queue(q, rw_flags);
 	if (may_queue == ELV_MQUEUE_NO)
 		goto rq_starved;
 
-	if (rl->count[rw]+1 >= queue_congestion_on_threshold(q)) {
-		if (rl->count[rw]+1 >= q->nr_requests) {
+	if (rl->count[is_sync]+1 >= queue_congestion_on_threshold(q)) {
+		if (rl->count[is_sync]+1 >= q->nr_requests) {
 			ioc = current_io_context(GFP_ATOMIC, q->node);
 			/*
 			 * The queue will fill after this allocation, so set
@@ -759,9 +759,9 @@ static struct request *get_request(struct request_queue *q, int rw_flags,
 			 * This process will be allowed to complete a batch of
 			 * requests, others will be blocked.
 			 */
-			if (!blk_queue_full(q, rw)) {
+			if (!blk_queue_full(q, is_sync)) {
 				ioc_set_batching(q, ioc);
-				blk_set_queue_full(q, rw);
+				blk_set_queue_full(q, is_sync);
 			} else {
 				if (may_queue != ELV_MQUEUE_MUST
 						&& !ioc_batching(q, ioc)) {
@@ -774,7 +774,7 @@ static struct request *get_request(struct request_queue *q, int rw_flags,
 				}
 			}
 		}
-		blk_set_queue_congested(q, rw);
+		blk_set_queue_congested(q, is_sync);
 	}
 
 	/*
@@ -782,11 +782,11 @@ static struct request *get_request(struct request_queue *q, int rw_flags,
 	 * limit of requests, otherwise we could have thousands of requests
 	 * allocated with any setting of ->nr_requests
 	 */
-	if (rl->count[rw] >= (3 * q->nr_requests / 2))
+	if (rl->count[is_sync] >= (3 * q->nr_requests / 2))
 		goto out;
 
-	rl->count[rw]++;
-	rl->starved[rw] = 0;
+	rl->count[is_sync]++;
+	rl->starved[is_sync] = 0;
 
 	priv = !test_bit(QUEUE_FLAG_ELVSWITCH, &q->queue_flags);
 	if (priv)
@@ -804,7 +804,7 @@ static struct request *get_request(struct request_queue *q, int rw_flags,
 		 * wait queue, but this is pretty rare.
 		 */
 		spin_lock_irq(q->queue_lock);
-		freed_request(q, rw, priv);
+		freed_request(q, is_sync, priv);
 
 		/*
 		 * in the very unlikely event that allocation failed and no
@@ -814,8 +814,8 @@ static struct request *get_request(struct request_queue *q, int rw_flags,
 		 * rq mempool into READ and WRITE
 		 */
 rq_starved:
-		if (unlikely(rl->count[rw] == 0))
-			rl->starved[rw] = 1;
+		if (unlikely(rl->count[is_sync] == 0))
+			rl->starved[is_sync] = 1;
 
 		goto out;
 	}
@@ -829,7 +829,7 @@ rq_starved:
 	if (ioc_batching(q, ioc))
 		ioc->nr_batch_requests--;
 
-	trace_block_getrq(q, bio, rw);
+	trace_block_getrq(q, bio, rw_flags & 1);
 out:
 	return rq;
 }
@@ -843,7 +843,7 @@ out:
 static struct request *get_request_wait(struct request_queue *q, int rw_flags,
 					struct bio *bio)
 {
-	const int rw = rw_flags & 0x01;
+	const bool is_sync = rw_is_sync(rw_flags) != 0;
 	struct request *rq;
 
 	rq = get_request(q, rw_flags, bio, GFP_NOIO);
@@ -852,10 +852,10 @@ static struct request *get_request_wait(struct request_queue *q, int rw_flags,
 		struct io_context *ioc;
 		struct request_list *rl = &q->rq;
 
-		prepare_to_wait_exclusive(&rl->wait[rw], &wait,
+		prepare_to_wait_exclusive(&rl->wait[is_sync], &wait,
 				TASK_UNINTERRUPTIBLE);
 
-		trace_block_sleeprq(q, bio, rw);
+		trace_block_sleeprq(q, bio, rw_flags & 1);
 
 		__generic_unplug_device(q);
 		spin_unlock_irq(q->queue_lock);
@@ -871,7 +871,7 @@ static struct request *get_request_wait(struct request_queue *q, int rw_flags,
 		ioc_set_batching(q, ioc);
 
 		spin_lock_irq(q->queue_lock);
-		finish_wait(&rl->wait[rw], &wait);
+		finish_wait(&rl->wait[is_sync], &wait);
 
 		rq = get_request(q, rw_flags, bio, GFP_NOIO);
 	};
@@ -1070,14 +1070,14 @@ void __blk_put_request(struct request_queue *q, struct request *req)
 	 * it didn't come out of our reserved rq pools
 	 */
 	if (req->cmd_flags & REQ_ALLOCED) {
-		int rw = rq_data_dir(req);
+		int is_sync = rq_is_sync(req) != 0;
 		int priv = req->cmd_flags & REQ_ELVPRIV;
 
 		BUG_ON(!list_empty(&req->queuelist));
 		BUG_ON(!hlist_unhashed(&req->hash));
 
 		blk_free_request(q, req);
-		freed_request(q, rw, priv);
+		freed_request(q, is_sync, priv);
 	}
 }
 EXPORT_SYMBOL_GPL(__blk_put_request);
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index e29ddfc..3ff9bba 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -48,28 +48,28 @@ queue_requests_store(struct request_queue *q, const char *page, size_t count)
 	q->nr_requests = nr;
 	blk_queue_congestion_threshold(q);
 
-	if (rl->count[READ] >= queue_congestion_on_threshold(q))
-		blk_set_queue_congested(q, READ);
-	else if (rl->count[READ] < queue_congestion_off_threshold(q))
-		blk_clear_queue_congested(q, READ);
-
-	if (rl->count[WRITE] >= queue_congestion_on_threshold(q))
-		blk_set_queue_congested(q, WRITE);
-	else if (rl->count[WRITE] < queue_congestion_off_threshold(q))
-		blk_clear_queue_congested(q, WRITE);
-
-	if (rl->count[READ] >= q->nr_requests) {
-		blk_set_queue_full(q, READ);
-	} else if (rl->count[READ]+1 <= q->nr_requests) {
-		blk_clear_queue_full(q, READ);
-		wake_up(&rl->wait[READ]);
+	if (rl->count[BLK_RW_SYNC] >= queue_congestion_on_threshold(q))
+		blk_set_queue_congested(q, BLK_RW_SYNC);
+	else if (rl->count[BLK_RW_SYNC] < queue_congestion_off_threshold(q))
+		blk_clear_queue_congested(q, BLK_RW_SYNC);
+
+	if (rl->count[BLK_RW_ASYNC] >= queue_congestion_on_threshold(q))
+		blk_set_queue_congested(q, BLK_RW_ASYNC);
+	else if (rl->count[BLK_RW_ASYNC] < queue_congestion_off_threshold(q))
+		blk_clear_queue_congested(q, BLK_RW_ASYNC);
+
+	if (rl->count[BLK_RW_SYNC] >= q->nr_requests) {
+		blk_set_queue_full(q, BLK_RW_SYNC);
+	} else if (rl->count[BLK_RW_SYNC]+1 <= q->nr_requests) {
+		blk_clear_queue_full(q, BLK_RW_SYNC);
+		wake_up(&rl->wait[BLK_RW_SYNC]);
 	}
 
-	if (rl->count[WRITE] >= q->nr_requests) {
-		blk_set_queue_full(q, WRITE);
-	} else if (rl->count[WRITE]+1 <= q->nr_requests) {
-		blk_clear_queue_full(q, WRITE);
-		wake_up(&rl->wait[WRITE]);
+	if (rl->count[BLK_RW_ASYNC] >= q->nr_requests) {
+		blk_set_queue_full(q, BLK_RW_ASYNC);
+	} else if (rl->count[BLK_RW_ASYNC]+1 <= q->nr_requests) {
+		blk_clear_queue_full(q, BLK_RW_ASYNC);
+		wake_up(&rl->wait[BLK_RW_ASYNC]);
 	}
 	spin_unlock_irq(q->queue_lock);
 	return ret;
diff --git a/block/elevator.c b/block/elevator.c
index 98259ed..ca6788a 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -677,7 +677,7 @@ void elv_insert(struct request_queue *q, struct request *rq, int where)
 	}
 
 	if (unplug_it && blk_queue_plugged(q)) {
-		int nrq = q->rq.count[READ] + q->rq.count[WRITE]
+		int nrq = q->rq.count[BLK_RW_SYNC] + q->rq.count[BLK_RW_ASYNC]
 			- q->in_flight;
 
 		if (nrq >= q->unplug_thresh)
diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
index bee52ab..0ec2c59 100644
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -24,8 +24,8 @@ struct dentry;
  */
 enum bdi_state {
 	BDI_pdflush,		/* A pdflush thread is working this device */
-	BDI_write_congested,	/* The write queue is getting full */
-	BDI_read_congested,	/* The read queue is getting full */
+	BDI_async_congested,	/* The async (write) queue is getting full */
+	BDI_sync_congested,	/* The sync queue is getting full */
 	BDI_unused,		/* Available bits start here */
 };
 
@@ -215,18 +215,18 @@ static inline int bdi_congested(struct backing_dev_info *bdi, int bdi_bits)
 
 static inline int bdi_read_congested(struct backing_dev_info *bdi)
 {
-	return bdi_congested(bdi, 1 << BDI_read_congested);
+	return bdi_congested(bdi, 1 << BDI_sync_congested);
 }
 
 static inline int bdi_write_congested(struct backing_dev_info *bdi)
 {
-	return bdi_congested(bdi, 1 << BDI_write_congested);
+	return bdi_congested(bdi, 1 << BDI_async_congested);
 }
 
 static inline int bdi_rw_congested(struct backing_dev_info *bdi)
 {
-	return bdi_congested(bdi, (1 << BDI_read_congested)|
-				  (1 << BDI_write_congested));
+	return bdi_congested(bdi, (1 << BDI_sync_congested) |
+				  (1 << BDI_async_congested));
 }
 
 void clear_bdi_congested(struct backing_dev_info *bdi, int rw);
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 465d6ba..67dae3b 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -38,6 +38,10 @@ struct request;
 typedef void (rq_end_io_fn)(struct request *, int);
 
 struct request_list {
+	/*
+	 * count[], starved[], and wait[] are indexed by
+	 * BLK_RW_SYNC/BLK_RW_ASYNC
+	 */
 	int count[2];
 	int starved[2];
 	int elvpriv;
@@ -66,6 +70,11 @@ enum rq_cmd_type_bits {
 	REQ_TYPE_ATA_PC,
 };
 
+enum {
+	BLK_RW_ASYNC	= 0,
+	BLK_RW_SYNC	= 1,
+};
+
 /*
  * For request of type REQ_TYPE_LINUX_BLOCK, rq->cmd[0] is the opcode being
  * sent down (similar to how REQ_TYPE_BLOCK_PC means that ->cmd[] holds a
@@ -103,7 +112,7 @@ enum rq_flag_bits {
 	__REQ_QUIET,		/* don't worry about errors */
 	__REQ_PREEMPT,		/* set for "ide_preempt" requests */
 	__REQ_ORDERED_COLOR,	/* is before or after barrier */
-	__REQ_RW_SYNC,		/* request is sync (O_DIRECT) */
+	__REQ_RW_SYNC,		/* request is sync (sync write or read) */
 	__REQ_ALLOCED,		/* request came from our alloc pool */
 	__REQ_RW_META,		/* metadata io request */
 	__REQ_COPY_USER,	/* contains copies of user pages */
@@ -438,8 +447,8 @@ struct request_queue
 #define QUEUE_FLAG_CLUSTER	0	/* cluster several segments into 1 */
 #define QUEUE_FLAG_QUEUED	1	/* uses generic tag queueing */
 #define QUEUE_FLAG_STOPPED	2	/* queue is stopped */
-#define	QUEUE_FLAG_READFULL	3	/* read queue has been filled */
-#define QUEUE_FLAG_WRITEFULL	4	/* write queue has been filled */
+#define	QUEUE_FLAG_SYNCFULL	3	/* read queue has been filled */
+#define QUEUE_FLAG_ASYNCFULL	4	/* write queue has been filled */
 #define QUEUE_FLAG_DEAD		5	/* queue being torn down */
 #define QUEUE_FLAG_REENTER	6	/* Re-entrancy avoidance */
 #define QUEUE_FLAG_PLUGGED	7	/* queue is plugged */
@@ -611,32 +620,41 @@ enum {
 #define rq_data_dir(rq)		((rq)->cmd_flags & 1)
 
 /*
- * We regard a request as sync, if it's a READ or a SYNC write.
+ * We regard a request as sync, if either a read or a sync write
  */
-#define rq_is_sync(rq)		(rq_data_dir((rq)) == READ || (rq)->cmd_flags & REQ_RW_SYNC)
+static inline bool rw_is_sync(unsigned int rw_flags)
+{
+	return !(rw_flags & REQ_RW) || (rw_flags & REQ_RW_SYNC);
+}
+
+static inline bool rq_is_sync(struct request *rq)
+{
+	return rw_is_sync(rq->cmd_flags);
+}
+
 #define rq_is_meta(rq)		((rq)->cmd_flags & REQ_RW_META)
 
-static inline int blk_queue_full(struct request_queue *q, int rw)
+static inline int blk_queue_full(struct request_queue *q, int sync)
 {
-	if (rw == READ)
-		return test_bit(QUEUE_FLAG_READFULL, &q->queue_flags);
-	return test_bit(QUEUE_FLAG_WRITEFULL, &q->queue_flags);
+	if (sync)
+		return test_bit(QUEUE_FLAG_SYNCFULL, &q->queue_flags);
+	return test_bit(QUEUE_FLAG_ASYNCFULL, &q->queue_flags);
 }
 
-static inline void blk_set_queue_full(struct request_queue *q, int rw)
+static inline void blk_set_queue_full(struct request_queue *q, int sync)
 {
-	if (rw == READ)
-		queue_flag_set(QUEUE_FLAG_READFULL, q);
+	if (sync)
+		queue_flag_set(QUEUE_FLAG_SYNCFULL, q);
 	else
-		queue_flag_set(QUEUE_FLAG_WRITEFULL, q);
+		queue_flag_set(QUEUE_FLAG_ASYNCFULL, q);
 }
 
-static inline void blk_clear_queue_full(struct request_queue *q, int rw)
+static inline void blk_clear_queue_full(struct request_queue *q, int sync)
 {
-	if (rw == READ)
-		queue_flag_clear(QUEUE_FLAG_READFULL, q);
+	if (sync)
+		queue_flag_clear(QUEUE_FLAG_SYNCFULL, q);
 	else
-		queue_flag_clear(QUEUE_FLAG_WRITEFULL, q);
+		queue_flag_clear(QUEUE_FLAG_ASYNCFULL, q);
 }
 
 
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index be68c95..493b468 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -284,12 +284,12 @@ static wait_queue_head_t congestion_wqh[2] = {
 	};
 
 
-void clear_bdi_congested(struct backing_dev_info *bdi, int rw)
+void clear_bdi_congested(struct backing_dev_info *bdi, int sync)
 {
 	enum bdi_state bit;
-	wait_queue_head_t *wqh = &congestion_wqh[rw];
+	wait_queue_head_t *wqh = &congestion_wqh[sync];
 
-	bit = (rw == WRITE) ? BDI_write_congested : BDI_read_congested;
+	bit = sync ? BDI_sync_congested : BDI_async_congested;
 	clear_bit(bit, &bdi->state);
 	smp_mb__after_clear_bit();
 	if (waitqueue_active(wqh))
@@ -297,11 +297,11 @@ void clear_bdi_congested(struct backing_dev_info *bdi, int rw)
 }
 EXPORT_SYMBOL(clear_bdi_congested);
 
-void set_bdi_congested(struct backing_dev_info *bdi, int rw)
+void set_bdi_congested(struct backing_dev_info *bdi, int sync)
 {
 	enum bdi_state bit;
 
-	bit = (rw == WRITE) ? BDI_write_congested : BDI_read_congested;
+	bit = sync ? BDI_sync_congested : BDI_async_congested;
 	set_bit(bit, &bdi->state);
 }
 EXPORT_SYMBOL(set_bdi_congested);
-- 
1.6.2.1.423.g442d


^ permalink raw reply related	[flat|nested] 53+ messages in thread

* [PATCH 2/8] Add WRITE_SYNC_PLUG and SWRITE_SYNC_PLUG
  2009-04-06 12:48 [PATCH 0/8][RFC] IO latency/throughput fixes Jens Axboe
  2009-04-06 12:48 ` [PATCH 1/8] block: change the request allocation/congestion logic to be sync/async based Jens Axboe
@ 2009-04-06 12:48 ` Jens Axboe
  2009-04-06 12:48 ` [PATCH 3/8] block: fsync_buffers_list() should use SWRITE_SYNC_PLUG Jens Axboe
                   ` (7 subsequent siblings)
  9 siblings, 0 replies; 53+ messages in thread
From: Jens Axboe @ 2009-04-06 12:48 UTC (permalink / raw)
  To: linux-kernel; +Cc: tytso, torvalds, Jens Axboe

(S)WRITE_SYNC always unplugs the device right after IO submission.
Sometimes we want to build up a queue before doing so, so add
variants that explicitly DON'T unplug the queue. The caller must
then do that after submitting all the IO.

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 include/linux/fs.h |    3 +++
 1 files changed, 3 insertions(+), 0 deletions(-)

diff --git a/include/linux/fs.h b/include/linux/fs.h
index a09e17c..ea05109 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -96,7 +96,10 @@ struct inodes_stat_t {
 #define READ_SYNC	(READ | (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_UNPLUG))
 #define READ_META	(READ | (1 << BIO_RW_META))
 #define WRITE_SYNC	(WRITE | (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_UNPLUG))
+#define WRITE_SYNC_PLUG	(WRITE | (1 << BIO_RW_SYNCIO))
 #define SWRITE_SYNC	(SWRITE | (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_UNPLUG))
+#define SWRITE_SYNC_PLUG	\
+	(SWRITE | (1 << BIO_RW_SYNCIO))
 #define WRITE_BARRIER	(WRITE | (1 << BIO_RW_BARRIER))
 #define DISCARD_NOBARRIER (1 << BIO_RW_DISCARD)
 #define DISCARD_BARRIER ((1 << BIO_RW_DISCARD) | (1 << BIO_RW_BARRIER))
-- 
1.6.2.1.423.g442d


^ permalink raw reply related	[flat|nested] 53+ messages in thread

* [PATCH 3/8] block: fsync_buffers_list() should use SWRITE_SYNC_PLUG
  2009-04-06 12:48 [PATCH 0/8][RFC] IO latency/throughput fixes Jens Axboe
  2009-04-06 12:48 ` [PATCH 1/8] block: change the request allocation/congestion logic to be sync/async based Jens Axboe
  2009-04-06 12:48 ` [PATCH 2/8] Add WRITE_SYNC_PLUG and SWRITE_SYNC_PLUG Jens Axboe
@ 2009-04-06 12:48 ` Jens Axboe
  2009-04-06 12:48 ` [PATCH 4/8] jbd: use WRITE_SYNC_PLUG instead of WRITE_SYNC Jens Axboe
                   ` (6 subsequent siblings)
  9 siblings, 0 replies; 53+ messages in thread
From: Jens Axboe @ 2009-04-06 12:48 UTC (permalink / raw)
  To: linux-kernel; +Cc: tytso, torvalds, Jens Axboe

Then it can submit all the buffers without unplugging for each one.
We will kick off the pending IO if we come across a new address space.

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 fs/buffer.c |   20 ++++++++++++++++----
 1 files changed, 16 insertions(+), 4 deletions(-)

diff --git a/fs/buffer.c b/fs/buffer.c
index 5d55a89..43afaa5 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -737,7 +737,7 @@ static int fsync_buffers_list(spinlock_t *lock, struct list_head *list)
 {
 	struct buffer_head *bh;
 	struct list_head tmp;
-	struct address_space *mapping;
+	struct address_space *mapping, *prev_mapping = NULL;
 	int err = 0, err2;
 
 	INIT_LIST_HEAD(&tmp);
@@ -762,7 +762,18 @@ static int fsync_buffers_list(spinlock_t *lock, struct list_head *list)
 				 * contents - it is a noop if I/O is still in
 				 * flight on potentially older contents.
 				 */
-				ll_rw_block(SWRITE_SYNC, 1, &bh);
+				ll_rw_block(SWRITE_SYNC_PLUG, 1, &bh);
+
+				/*
+				 * Kick off IO for the previous mapping. Note
+				 * that we will not run the very last mapping,
+				 * wait_on_buffer() will do that for us
+				 * through sync_buffer().
+				 */
+				if (prev_mapping && prev_mapping != mapping)
+					blk_run_address_space(prev_mapping);
+				prev_mapping = mapping;
+
 				brelse(bh);
 				spin_lock(lock);
 			}
@@ -2957,12 +2968,13 @@ void ll_rw_block(int rw, int nr, struct buffer_head *bhs[])
 	for (i = 0; i < nr; i++) {
 		struct buffer_head *bh = bhs[i];
 
-		if (rw == SWRITE || rw == SWRITE_SYNC)
+		if (rw == SWRITE || rw == SWRITE_SYNC || rw == SWRITE_SYNC_PLUG)
 			lock_buffer(bh);
 		else if (!trylock_buffer(bh))
 			continue;
 
-		if (rw == WRITE || rw == SWRITE || rw == SWRITE_SYNC) {
+		if (rw == WRITE || rw == SWRITE || rw == SWRITE_SYNC ||
+		    rw == SWRITE_SYNC_PLUG) {
 			if (test_clear_buffer_dirty(bh)) {
 				bh->b_end_io = end_buffer_write_sync;
 				get_bh(bh);
-- 
1.6.2.1.423.g442d


^ permalink raw reply related	[flat|nested] 53+ messages in thread

* [PATCH 4/8] jbd: use WRITE_SYNC_PLUG instead of WRITE_SYNC
  2009-04-06 12:48 [PATCH 0/8][RFC] IO latency/throughput fixes Jens Axboe
                   ` (2 preceding siblings ...)
  2009-04-06 12:48 ` [PATCH 3/8] block: fsync_buffers_list() should use SWRITE_SYNC_PLUG Jens Axboe
@ 2009-04-06 12:48 ` Jens Axboe
  2009-04-06 12:48 ` [PATCH 5/8] jbd2: " Jens Axboe
                   ` (5 subsequent siblings)
  9 siblings, 0 replies; 53+ messages in thread
From: Jens Axboe @ 2009-04-06 12:48 UTC (permalink / raw)
  To: linux-kernel; +Cc: tytso, torvalds, Jens Axboe

When you are going to be submitting several sync writes, we want to
give the IO scheduler a chance to merge some of them. Instead of
using the implicitly unplugging WRITE_SYNC variant, use WRITE_SYNC_PLUG
and rely on sync_buffer() doing the unplug when someone does a
wait_on_buffer()/lock_buffer().

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 fs/jbd/commit.c |    7 ++++++-
 1 files changed, 6 insertions(+), 1 deletions(-)

diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c
index f8077b9..a8e8513 100644
--- a/fs/jbd/commit.c
+++ b/fs/jbd/commit.c
@@ -351,8 +351,13 @@ void journal_commit_transaction(journal_t *journal)
 	spin_lock(&journal->j_state_lock);
 	commit_transaction->t_state = T_LOCKED;
 
+	/*
+	 * Use plugged writes here, since we want to submit several before
+	 * we unplug the device. We don't do explicit unplugging in here,
+	 * instead we rely on sync_buffer() doing the unplug for us.
+	 */
 	if (commit_transaction->t_synchronous_commit)
-		write_op = WRITE_SYNC;
+		write_op = WRITE_SYNC_PLUG;
 	spin_lock(&commit_transaction->t_handle_lock);
 	while (commit_transaction->t_updates) {
 		DEFINE_WAIT(wait);
-- 
1.6.2.1.423.g442d


^ permalink raw reply related	[flat|nested] 53+ messages in thread

* [PATCH 5/8] jbd2: use WRITE_SYNC_PLUG instead of WRITE_SYNC
  2009-04-06 12:48 [PATCH 0/8][RFC] IO latency/throughput fixes Jens Axboe
                   ` (3 preceding siblings ...)
  2009-04-06 12:48 ` [PATCH 4/8] jbd: use WRITE_SYNC_PLUG instead of WRITE_SYNC Jens Axboe
@ 2009-04-06 12:48 ` Jens Axboe
  2009-04-06 12:48 ` [PATCH 6/8] block: enabling plugging on SSD devices that don't do queuing Jens Axboe
                   ` (4 subsequent siblings)
  9 siblings, 0 replies; 53+ messages in thread
From: Jens Axboe @ 2009-04-06 12:48 UTC (permalink / raw)
  To: linux-kernel; +Cc: tytso, torvalds, Jens Axboe

When you are going to be submitting several sync writes, we want to
give the IO scheduler a chance to merge some of them. Instead of
using the implicitly unplugging WRITE_SYNC variant, use WRITE_SYNC_PLUG
and rely on sync_buffer() doing the unplug when someone does a
wait_on_buffer()/lock_buffer().

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 fs/jbd2/commit.c |   13 +++++++++----
 1 files changed, 9 insertions(+), 4 deletions(-)

diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 4ea7237..073c8c3 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -138,7 +138,7 @@ static int journal_submit_commit_record(journal_t *journal,
 		set_buffer_ordered(bh);
 		barrier_done = 1;
 	}
-	ret = submit_bh(WRITE_SYNC, bh);
+	ret = submit_bh(WRITE_SYNC_PLUG, bh);
 	if (barrier_done)
 		clear_buffer_ordered(bh);
 
@@ -159,7 +159,7 @@ static int journal_submit_commit_record(journal_t *journal,
 		lock_buffer(bh);
 		set_buffer_uptodate(bh);
 		clear_buffer_dirty(bh);
-		ret = submit_bh(WRITE_SYNC, bh);
+		ret = submit_bh(WRITE_SYNC_PLUG, bh);
 	}
 	*cbh = bh;
 	return ret;
@@ -190,7 +190,7 @@ retry:
 		set_buffer_uptodate(bh);
 		bh->b_end_io = journal_end_buffer_io_sync;
 
-		ret = submit_bh(WRITE_SYNC, bh);
+		ret = submit_bh(WRITE_SYNC_PLUG, bh);
 		if (ret) {
 			unlock_buffer(bh);
 			return ret;
@@ -402,8 +402,13 @@ void jbd2_journal_commit_transaction(journal_t *journal)
 	spin_lock(&journal->j_state_lock);
 	commit_transaction->t_state = T_LOCKED;
 
+	/*
+	 * Use plugged writes here, since we want to submit several before
+	 * we unplug the device. We don't do explicit unplugging in here,
+	 * instead we rely on sync_buffer() doing the unplug for us.
+	 */
 	if (commit_transaction->t_synchronous_commit)
-		write_op = WRITE_SYNC;
+		write_op = WRITE_SYNC_PLUG;
 	stats.u.run.rs_wait = commit_transaction->t_max_wait;
 	stats.u.run.rs_locked = jiffies;
 	stats.u.run.rs_running = jbd2_time_diff(commit_transaction->t_start,
-- 
1.6.2.1.423.g442d


^ permalink raw reply related	[flat|nested] 53+ messages in thread

* [PATCH 6/8] block: enabling plugging on SSD devices that don't do queuing
  2009-04-06 12:48 [PATCH 0/8][RFC] IO latency/throughput fixes Jens Axboe
                   ` (4 preceding siblings ...)
  2009-04-06 12:48 ` [PATCH 5/8] jbd2: " Jens Axboe
@ 2009-04-06 12:48 ` Jens Axboe
  2009-04-06 12:48 ` [PATCH 7/8] block: Add flag for telling the IO schedulers NOT to anticipate more IO Jens Axboe
                   ` (3 subsequent siblings)
  9 siblings, 0 replies; 53+ messages in thread
From: Jens Axboe @ 2009-04-06 12:48 UTC (permalink / raw)
  To: linux-kernel; +Cc: tytso, torvalds, Jens Axboe

For the older SSD devices that don't do command queuing, we do want to
enable plugging to get better merging.

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/blk-core.c |   13 +++++++++++--
 1 files changed, 11 insertions(+), 2 deletions(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index a32b571..c4198f0 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1136,6 +1136,15 @@ void init_request_from_bio(struct request *req, struct bio *bio)
 	blk_rq_bio_prep(req->q, req, bio);
 }
 
+/*
+ * Only disabling plugging for non-rotational devices if it does tagging
+ * as well, otherwise we do need the proper merging
+ */
+static inline bool queue_should_plug(struct request_queue *q)
+{
+	return !(blk_queue_nonrot(q) && blk_queue_tagged(q));
+}
+
 static int __make_request(struct request_queue *q, struct bio *bio)
 {
 	struct request *req;
@@ -1242,11 +1251,11 @@ get_rq:
 	if (test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags) ||
 	    bio_flagged(bio, BIO_CPU_AFFINE))
 		req->cpu = blk_cpu_to_group(smp_processor_id());
-	if (!blk_queue_nonrot(q) && elv_queue_empty(q))
+	if (queue_should_plug(q) && elv_queue_empty(q))
 		blk_plug_device(q);
 	add_request(q, req);
 out:
-	if (unplug || blk_queue_nonrot(q))
+	if (unplug || !queue_should_plug(q))
 		__generic_unplug_device(q);
 	spin_unlock_irq(q->queue_lock);
 	return 0;
-- 
1.6.2.1.423.g442d


^ permalink raw reply related	[flat|nested] 53+ messages in thread

* [PATCH 7/8] block: Add flag for telling the IO schedulers NOT to anticipate more IO
  2009-04-06 12:48 [PATCH 0/8][RFC] IO latency/throughput fixes Jens Axboe
                   ` (5 preceding siblings ...)
  2009-04-06 12:48 ` [PATCH 6/8] block: enabling plugging on SSD devices that don't do queuing Jens Axboe
@ 2009-04-06 12:48 ` Jens Axboe
  2009-04-06 12:48 ` [PATCH 8/8] block: switch sync_dirty_buffer() over to WRITE_SYNC Jens Axboe
                   ` (2 subsequent siblings)
  9 siblings, 0 replies; 53+ messages in thread
From: Jens Axboe @ 2009-04-06 12:48 UTC (permalink / raw)
  To: linux-kernel; +Cc: tytso, torvalds, Jens Axboe

By default, CFQ will anticipate more IO from a given io context if the
previously completed IO was sync. This used to be fine, since the only
sync IO was reads and O_DIRECT writes. But with more "normal" sync writes
being used now, we don't want to anticipate for those.

Add a bio/request flag that informs the IO scheduler that this is a sync
request that we should not idle for. Introduce WRITE_ODIRECT specifically
for O_DIRECT writes, and make sure that the other sync writes set this
flag.

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/blk-core.c       |    2 ++
 block/cfq-iosched.c    |    4 +++-
 fs/direct-io.c         |    2 +-
 include/linux/bio.h    |   19 +++++++++++--------
 include/linux/blkdev.h |    3 +++
 include/linux/fs.h     |    9 +++++----
 6 files changed, 25 insertions(+), 14 deletions(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index c4198f0..2557280 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1128,6 +1128,8 @@ void init_request_from_bio(struct request *req, struct bio *bio)
 		req->cmd_flags |= REQ_UNPLUG;
 	if (bio_rw_meta(bio))
 		req->cmd_flags |= REQ_RW_META;
+	if (bio_noidle(bio))
+		req->cmd_flags |= REQ_NOIDLE;
 
 	req->errors = 0;
 	req->hard_sector = req->sector = bio->bi_sector;
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index 664ebfd..9e80934 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -1992,8 +1992,10 @@ static void cfq_completed_request(struct request_queue *q, struct request *rq)
 		}
 		if (cfq_slice_used(cfqq) || cfq_class_idle(cfqq))
 			cfq_slice_expired(cfqd, 1);
-		else if (sync && RB_EMPTY_ROOT(&cfqq->sort_list))
+		else if (sync && !rq_noidle(rq) &&
+			 RB_EMPTY_ROOT(&cfqq->sort_list)) {
 			cfq_arm_slice_timer(cfqd);
+		}
 	}
 
 	if (!cfqd->rq_in_driver)
diff --git a/fs/direct-io.c b/fs/direct-io.c
index b6d4390..da258e7 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -1126,7 +1126,7 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
 	int acquire_i_mutex = 0;
 
 	if (rw & WRITE)
-		rw = WRITE_SYNC;
+		rw = WRITE_ODIRECT;
 
 	if (bdev)
 		bdev_blkbits = blksize_bits(bdev_hardsect_size(bdev));
diff --git a/include/linux/bio.h b/include/linux/bio.h
index b05b1d4..b900d2c 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -145,20 +145,21 @@ struct bio {
  * bit 2 -- barrier
  *	Insert a serialization point in the IO queue, forcing previously
  *	submitted IO to be completed before this one is issued.
- * bit 3 -- synchronous I/O hint: the block layer will unplug immediately
- *	Note that this does NOT indicate that the IO itself is sync, just
- *	that the block layer will not postpone issue of this IO by plugging.
- * bit 4 -- metadata request
+ * bit 3 -- synchronous I/O hint.
+ * bit 4 -- Unplug the device immediately after submitting this bio.
+ * bit 5 -- metadata request
  *	Used for tracing to differentiate metadata and data IO. May also
  *	get some preferential treatment in the IO scheduler
- * bit 5 -- discard sectors
+ * bit 6 -- discard sectors
  *	Informs the lower level device that this range of sectors is no longer
  *	used by the file system and may thus be freed by the device. Used
  *	for flash based storage.
- * bit 6 -- fail fast device errors
- * bit 7 -- fail fast transport errors
- * bit 8 -- fail fast driver errors
+ * bit 7 -- fail fast device errors
+ * bit 8 -- fail fast transport errors
+ * bit 9 -- fail fast driver errors
  *	Don't want driver retries for any fast fail whatever the reason.
+ * bit 10 -- Tell the IO scheduler not to wait for more requests after this
+	one has been submitted, even if it is a SYNC request.
  */
 #define BIO_RW		0	/* Must match RW in req flags (blkdev.h) */
 #define BIO_RW_AHEAD	1	/* Must match FAILFAST in req flags */
@@ -170,6 +171,7 @@ struct bio {
 #define BIO_RW_FAILFAST_DEV		7
 #define BIO_RW_FAILFAST_TRANSPORT	8
 #define BIO_RW_FAILFAST_DRIVER		9
+#define BIO_RW_NOIDLE	10
 
 #define bio_rw_flagged(bio, flag)	((bio)->bi_rw & (1 << (flag)))
 
@@ -188,6 +190,7 @@ struct bio {
 #define bio_rw_ahead(bio)	bio_rw_flagged(bio, BIO_RW_AHEAD)
 #define bio_rw_meta(bio)	bio_rw_flagged(bio, BIO_RW_META)
 #define bio_discard(bio)	bio_rw_flagged(bio, BIO_RW_DISCARD)
+#define bio_noidle(bio)		bio_rw_flagged(bio, BIO_RW_NOIDLE)
 
 /*
  * upper 16 bits of bi_rw define the io priority of this bio
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 67dae3b..e036609 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -118,6 +118,7 @@ enum rq_flag_bits {
 	__REQ_COPY_USER,	/* contains copies of user pages */
 	__REQ_INTEGRITY,	/* integrity metadata has been remapped */
 	__REQ_UNPLUG,		/* unplug queue on submission */
+	__REQ_NOIDLE,		/* Don't anticipate more IO after this one */
 	__REQ_NR_BITS,		/* stops here */
 };
 
@@ -145,6 +146,7 @@ enum rq_flag_bits {
 #define REQ_COPY_USER	(1 << __REQ_COPY_USER)
 #define REQ_INTEGRITY	(1 << __REQ_INTEGRITY)
 #define REQ_UNPLUG	(1 << __REQ_UNPLUG)
+#define REQ_NOIDLE	(1 << __REQ_NOIDLE)
 
 #define BLK_MAX_CDB	16
 
@@ -633,6 +635,7 @@ static inline bool rq_is_sync(struct request *rq)
 }
 
 #define rq_is_meta(rq)		((rq)->cmd_flags & REQ_RW_META)
+#define rq_noidle(rq)		((rq)->cmd_flags & REQ_NOIDLE)
 
 static inline int blk_queue_full(struct request_queue *q, int sync)
 {
diff --git a/include/linux/fs.h b/include/linux/fs.h
index ea05109..cae5720 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -95,11 +95,12 @@ struct inodes_stat_t {
 #define SWRITE 3	/* for ll_rw_block() - wait for buffer lock */
 #define READ_SYNC	(READ | (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_UNPLUG))
 #define READ_META	(READ | (1 << BIO_RW_META))
-#define WRITE_SYNC	(WRITE | (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_UNPLUG))
-#define WRITE_SYNC_PLUG	(WRITE | (1 << BIO_RW_SYNCIO))
-#define SWRITE_SYNC	(SWRITE | (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_UNPLUG))
+#define WRITE_SYNC_PLUG	(WRITE | (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_NOIDLE))
+#define WRITE_SYNC	(WRITE_SYNC_PLUG | (1 << BIO_RW_UNPLUG))
+#define WRITE_ODIRECT	(WRITE | (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_UNPLUG))
 #define SWRITE_SYNC_PLUG	\
-	(SWRITE | (1 << BIO_RW_SYNCIO))
+			(SWRITE | (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_NOIDLE))
+#define SWRITE_SYNC	(SWRITE_SYNC_PLUG | (1 << BIO_RW_UNPLUG))
 #define WRITE_BARRIER	(WRITE | (1 << BIO_RW_BARRIER))
 #define DISCARD_NOBARRIER (1 << BIO_RW_DISCARD)
 #define DISCARD_BARRIER ((1 << BIO_RW_DISCARD) | (1 << BIO_RW_BARRIER))
-- 
1.6.2.1.423.g442d


^ permalink raw reply related	[flat|nested] 53+ messages in thread

* [PATCH 8/8] block: switch sync_dirty_buffer() over to WRITE_SYNC
  2009-04-06 12:48 [PATCH 0/8][RFC] IO latency/throughput fixes Jens Axboe
                   ` (6 preceding siblings ...)
  2009-04-06 12:48 ` [PATCH 7/8] block: Add flag for telling the IO schedulers NOT to anticipate more IO Jens Axboe
@ 2009-04-06 12:48 ` Jens Axboe
  2009-04-06 13:04 ` [PATCH 0/8][RFC] IO latency/throughput fixes Jens Axboe
  2009-04-06 15:04 ` Linus Torvalds
  9 siblings, 0 replies; 53+ messages in thread
From: Jens Axboe @ 2009-04-06 12:48 UTC (permalink / raw)
  To: linux-kernel; +Cc: tytso, torvalds, Jens Axboe

We should now have the logic in place to handle this properly
without regressing on the write performance, so re-enable
the sync writes.

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 fs/buffer.c |    2 +-
 1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/fs/buffer.c b/fs/buffer.c
index 43afaa5..6e35762 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -3010,7 +3010,7 @@ int sync_dirty_buffer(struct buffer_head *bh)
 	if (test_clear_buffer_dirty(bh)) {
 		get_bh(bh);
 		bh->b_end_io = end_buffer_write_sync;
-		ret = submit_bh(WRITE, bh);
+		ret = submit_bh(WRITE_SYNC, bh);
 		wait_on_buffer(bh);
 		if (buffer_eopnotsupp(bh)) {
 			clear_buffer_eopnotsupp(bh);
-- 
1.6.2.1.423.g442d


^ permalink raw reply related	[flat|nested] 53+ messages in thread

* Re: [PATCH 0/8][RFC] IO latency/throughput fixes
  2009-04-06 12:48 [PATCH 0/8][RFC] IO latency/throughput fixes Jens Axboe
                   ` (7 preceding siblings ...)
  2009-04-06 12:48 ` [PATCH 8/8] block: switch sync_dirty_buffer() over to WRITE_SYNC Jens Axboe
@ 2009-04-06 13:04 ` Jens Axboe
  2009-04-06 13:13   ` Jens Axboe
  2009-04-06 15:37   ` Linus Torvalds
  2009-04-06 15:04 ` Linus Torvalds
  9 siblings, 2 replies; 53+ messages in thread
From: Jens Axboe @ 2009-04-06 13:04 UTC (permalink / raw)
  To: linux-kernel; +Cc: tytso, torvalds

On Mon, Apr 06 2009, Jens Axboe wrote:
> Hi,
> 
> This is a set of patches that I worked on today in the hopes
> of furthering the latency goals and at least fixing some of
> the write regression with fwrite + fsync that current -git
> is suffering from.
> 
> I haven't done any latency tests yet, I'm just tossing this
> out there so we can collaborate on improving things. What I
> did test was the silly fwrite() + fsync() loop test, which
> is a LOT slower in current -git that it used to be. The test
> is basically:
> 
> 	while (nr--) {
> 		f = fopen();
> 		fprintf(f, "Some data here\n");
> 		fsync(fileno(f));
> 		fclose(f);
> 	}
> 
> which (for nr == 2000) takes 16 seconds in -git, completes
> in 0.9s with the patches.

Ran the fsync-tester [1]. Drive is a 3-4 years old SATA drive, fs is
ext3/writeback. IO scheduler is CFQ.

fsync time: 0.0402s
fsync time: 0.6572s
fsync time: 0.3187s
fsync time: 0.2901s
fsync time: 0.1478s
fsync time: 0.4158s
fsync time: 0.2815s
fsync time: 0.3216s
fsync time: 0.1604s
fsync time: 0.1929s
fsync time: 0.2413s
fsync time: 0.2138s
fsync time: 0.2441s
fsync time: 0.2785s
fsync time: 0.2640s

And with Linus torture dd running in the background:

fsync time: 0.0109s
fsync time: 0.5236s
fsync time: 1.2108s
fsync time: 0.2999s
fsync time: 1.5286s
fsync time: 0.2549s
fsync time: 0.4164s
fsync time: 1.1586s
fsync time: 1.6630s
fsync time: 0.6949s
fsync time: 1.0102s
fsync time: 0.3715s
fsync time: 0.6553s

[1]
http://www.gossamer-threads.com/lists/engine?do=post_attachment;postatt_id=33847;list=linux

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: [PATCH 0/8][RFC] IO latency/throughput fixes
  2009-04-06 13:04 ` [PATCH 0/8][RFC] IO latency/throughput fixes Jens Axboe
@ 2009-04-06 13:13   ` Jens Axboe
  2009-04-06 15:37   ` Linus Torvalds
  1 sibling, 0 replies; 53+ messages in thread
From: Jens Axboe @ 2009-04-06 13:13 UTC (permalink / raw)
  To: linux-kernel; +Cc: tytso, torvalds

On Mon, Apr 06 2009, Jens Axboe wrote:
> On Mon, Apr 06 2009, Jens Axboe wrote:
> > Hi,
> > 
> > This is a set of patches that I worked on today in the hopes
> > of furthering the latency goals and at least fixing some of
> > the write regression with fwrite + fsync that current -git
> > is suffering from.
> > 
> > I haven't done any latency tests yet, I'm just tossing this
> > out there so we can collaborate on improving things. What I
> > did test was the silly fwrite() + fsync() loop test, which
> > is a LOT slower in current -git that it used to be. The test
> > is basically:
> > 
> > 	while (nr--) {
> > 		f = fopen();
> > 		fprintf(f, "Some data here\n");
> > 		fsync(fileno(f));
> > 		fclose(f);
> > 	}
> > 
> > which (for nr == 2000) takes 16 seconds in -git, completes
> > in 0.9s with the patches.
> 
> Ran the fsync-tester [1]. Drive is a 3-4 years old SATA drive, fs is
> ext3/writeback. IO scheduler is CFQ.
> 
> fsync time: 0.0402s
> fsync time: 0.6572s
> fsync time: 0.3187s
> fsync time: 0.2901s
> fsync time: 0.1478s
> fsync time: 0.4158s
> fsync time: 0.2815s
> fsync time: 0.3216s
> fsync time: 0.1604s
> fsync time: 0.1929s
> fsync time: 0.2413s
> fsync time: 0.2138s
> fsync time: 0.2441s
> fsync time: 0.2785s
> fsync time: 0.2640s
> 
> And with Linus torture dd running in the background:
> 
> fsync time: 0.0109s
> fsync time: 0.5236s
> fsync time: 1.2108s
> fsync time: 0.2999s
> fsync time: 1.5286s
> fsync time: 0.2549s
> fsync time: 0.4164s
> fsync time: 1.1586s
> fsync time: 1.6630s
> fsync time: 0.6949s
> fsync time: 1.0102s
> fsync time: 0.3715s
> fsync time: 0.6553s

And here is that latter test again with NCQ disabled on the drive:

fsync time: 0.0092s
fsync time: 0.7815s
fsync time: 0.6490s
fsync time: 0.7894s
fsync time: 0.5385s
fsync time: 0.6598s
fsync time: 0.7701s
fsync time: 0.2155s
fsync time: 0.0901s
fsync time: 0.2765s
fsync time: 0.2879s
fsync time: 0.2882s
fsync time: 0.2457s
fsync time: 0.4319s
fsync time: 0.3752s

It stays nicely below 1s.

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: [PATCH 0/8][RFC] IO latency/throughput fixes
  2009-04-06 12:48 [PATCH 0/8][RFC] IO latency/throughput fixes Jens Axboe
                   ` (8 preceding siblings ...)
  2009-04-06 13:04 ` [PATCH 0/8][RFC] IO latency/throughput fixes Jens Axboe
@ 2009-04-06 15:04 ` Linus Torvalds
  2009-04-06 15:10   ` Jens Axboe
  9 siblings, 1 reply; 53+ messages in thread
From: Linus Torvalds @ 2009-04-06 15:04 UTC (permalink / raw)
  To: Jens Axboe; +Cc: linux-kernel, tytso



On Mon, 6 Apr 2009, Jens Axboe wrote:
> 
> This is a set of patches that I worked on today in the hopes
> of furthering the latency goals and at least fixing some of
> the write regression with fwrite + fsync that current -git
> is suffering from.

Goodie. Let's just do this. After all, right now we would otherwise have 
to revert the other changes as being a regression, and I absolutely _love_ 
the fact that we're actually finally getting somewhere on this fsync 
latency issue that has been with us for so long.

Jens - I can just apply this queue (I want to test it out anyway), or if 
you prefer I can pull from you. Just tell me.

		Linus

^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: [PATCH 0/8][RFC] IO latency/throughput fixes
  2009-04-06 15:04 ` Linus Torvalds
@ 2009-04-06 15:10   ` Jens Axboe
  2009-04-06 15:45     ` Linus Torvalds
  0 siblings, 1 reply; 53+ messages in thread
From: Jens Axboe @ 2009-04-06 15:10 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: linux-kernel, tytso

On Mon, Apr 06 2009, Linus Torvalds wrote:
> 
> 
> On Mon, 6 Apr 2009, Jens Axboe wrote:
> > 
> > This is a set of patches that I worked on today in the hopes
> > of furthering the latency goals and at least fixing some of
> > the write regression with fwrite + fsync that current -git
> > is suffering from.
> 
> Goodie. Let's just do this. After all, right now we would otherwise have 
> to revert the other changes as being a regression, and I absolutely _love_ 
> the fact that we're actually finally getting somewhere on this fsync 
> latency issue that has been with us for so long.

Agree, it's been nagging for some time...

> Jens - I can just apply this queue (I want to test it out anyway), or if 
> you prefer I can pull from you. Just tell me.

Whatever you want, if you want to pull instead of manually applying,
it's:

git://git.kernel.dk/linux-2.6-block.git blk-latency

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: [PATCH 0/8][RFC] IO latency/throughput fixes
  2009-04-06 13:04 ` [PATCH 0/8][RFC] IO latency/throughput fixes Jens Axboe
  2009-04-06 13:13   ` Jens Axboe
@ 2009-04-06 15:37   ` Linus Torvalds
  2009-04-06 16:57     ` Jens Axboe
  2009-04-07  3:28     ` Chris Mason
  1 sibling, 2 replies; 53+ messages in thread
From: Linus Torvalds @ 2009-04-06 15:37 UTC (permalink / raw)
  To: Jens Axboe; +Cc: Linux Kernel Mailing List, tytso



On Mon, 6 Apr 2009, Jens Axboe wrote:
> 
> Ran the fsync-tester [1]. Drive is a 3-4 years old SATA drive, fs is
> ext3/writeback. IO scheduler is CFQ.
> 
> fsync time: 0.2785s
> fsync time: 0.2640s
> 
> And with Linus torture dd running in the background:
> 
> fsync time: 0.0109s
> fsync time: 0.5236s
> fsync time: 1.2108s

Ok, it's definitely better for me too. CFQ used to be the problem case 
(with the previous patches), now I've been trying with CFQ for a while, 
and it seems ok.

Not wonderful, by any means, but I haven't seen a 5+ second delay yet. 
I've come close (I have a few 2+s hickups in my trace), but it's 
clearly more responsive, even if I'd wish it to be better still.

One thing that I find intriguing is how the fsync time seems so 
_consistent_ across a wild variety of drives. It's interesting how you see 
delays that are roughly the same order of magnitude, even though you are 
using an old SATA drive, and I'm using the Intel SSD. And when you turn 
off TCQ, your numbers go down even more.

That just makes me suspect that there is something else than pure IO going 
on. There shouldn't be any idling by the IO scheduler in my setup 
("rotational" is zero for me), and quite frankly, I should not see 
latencies in the seconds even _with_ TCQ, since it should be limited to 
just 32 tags. Of course, maybe some of those requests just grow humongous. 

So maybe one reason the "sync()" workload is so horrible is that we get 
insanely big single requests. I see

	[root@nehalem queue]# cat max_sectors_kb 
	512

so we should be limited to half a meg per request, but I guess 32 of those 
will take some time even on the Intel SSD. In fact, I guess the SSD is not 
really any faster than your 2-3 year old SATA disk when it comes to pure 
linear throughput

Hmm. Doing a "echo 64 > max_sectors_kb" does seem to make my experience 
nicer. At no really noticeable downside in throughput that I can see: the 
"dd+sync" still tends to fluctuate 30-40s. But maybe I'm fooling myself. 
But my 'strace' seems to agree: I'm having a hard time triggering anything 
even close to a second latency now.

I wonder if we could limit the tag usage by request _size_, ie not let big 
requests fill up all the tags (by all means allow writes to fill them up 
if they are small - it's with many small requests that you get the biggest 
advantage, after all, and with many _big_ requests that the downside is 
the biggest too).

		Linus

^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: [PATCH 0/8][RFC] IO latency/throughput fixes
  2009-04-06 15:10   ` Jens Axboe
@ 2009-04-06 15:45     ` Linus Torvalds
  2009-04-06 17:01       ` Jens Axboe
  2009-04-06 18:31       ` Theodore Tso
  0 siblings, 2 replies; 53+ messages in thread
From: Linus Torvalds @ 2009-04-06 15:45 UTC (permalink / raw)
  To: Jens Axboe; +Cc: Linux Kernel Mailing List, tytso



On Mon, 6 Apr 2009, Jens Axboe wrote:
>
> > Jens - I can just apply this queue (I want to test it out anyway), or if 
> > you prefer I can pull from you. Just tell me.
> 
> Whatever you want, if you want to pull instead of manually applying,
> it's:
> 
> git://git.kernel.dk/linux-2.6-block.git blk-latency

Ok, I applied them for testing anyway, so I think I'll just keep the 
series I have in my tree. I'm keeping my nasty "dd+sync" going in the 
background, just to verify that things really feel better, but I'm already 
convinced this is a winner. Especially with the request size limiter, I 
can really read email _almost_ as if nothing else was going on on the 
machine. 

With that request size limiting, my fsync pauses tend to be in the 250ms 
range, with a one 0.75s outlier in the last few minutes. I can definitely 
feel that quarter-second thing, and the .75s pause was a real stutter, but 
boy what a difference. I don't get the uncontrollable urge to kill that 
nasty background writer any more.

			Linus

^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: [PATCH 0/8][RFC] IO latency/throughput fixes
  2009-04-06 15:37   ` Linus Torvalds
@ 2009-04-06 16:57     ` Jens Axboe
  2009-04-07  3:28     ` Chris Mason
  1 sibling, 0 replies; 53+ messages in thread
From: Jens Axboe @ 2009-04-06 16:57 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: Linux Kernel Mailing List, tytso

On Mon, Apr 06 2009, Linus Torvalds wrote:
> 
> 
> On Mon, 6 Apr 2009, Jens Axboe wrote:
> > 
> > Ran the fsync-tester [1]. Drive is a 3-4 years old SATA drive, fs is
> > ext3/writeback. IO scheduler is CFQ.
> > 
> > fsync time: 0.2785s
> > fsync time: 0.2640s
> > 
> > And with Linus torture dd running in the background:
> > 
> > fsync time: 0.0109s
> > fsync time: 0.5236s
> > fsync time: 1.2108s
> 
> Ok, it's definitely better for me too. CFQ used to be the problem case 
> (with the previous patches), now I've been trying with CFQ for a while, 
> and it seems ok.
> 
> Not wonderful, by any means, but I haven't seen a 5+ second delay yet. 
> I've come close (I have a few 2+s hickups in my trace), but it's 
> clearly more responsive, even if I'd wish it to be better still.

OK that's good. I'll run some testing with this as well and perhaps we
can even do better still.

> One thing that I find intriguing is how the fsync time seems so 
> _consistent_ across a wild variety of drives. It's interesting how you see 
> delays that are roughly the same order of magnitude, even though you are 
> using an old SATA drive, and I'm using the Intel SSD. And when you turn 
> off TCQ, your numbers go down even more.
> 
> That just makes me suspect that there is something else than pure IO going 
> on. There shouldn't be any idling by the IO scheduler in my setup 
> ("rotational" is zero for me), and quite frankly, I should not see 
> latencies in the seconds even _with_ TCQ, since it should be limited to 
> just 32 tags. Of course, maybe some of those requests just grow humongous. 
> 
> So maybe one reason the "sync()" workload is so horrible is that we get 
> insanely big single requests. I see
> 
> 	[root@nehalem queue]# cat max_sectors_kb 
> 	512
> 
> so we should be limited to half a meg per request, but I guess 32 of those 
> will take some time even on the Intel SSD. In fact, I guess the SSD is not 
> really any faster than your 2-3 year old SATA disk when it comes to pure 
> linear throughput

It's probably close, for your MLC version. This drive does around
60Mb/sec sequential writes, which is in the ball park with the ~70 yours
should be doing.

> Hmm. Doing a "echo 64 > max_sectors_kb" does seem to make my experience 
> nicer. At no really noticeable downside in throughput that I can see: the 
> "dd+sync" still tends to fluctuate 30-40s. But maybe I'm fooling myself. 
> But my 'strace' seems to agree: I'm having a hard time triggering anything 
> even close to a second latency now.
> 
> I wonder if we could limit the tag usage by request _size_, ie not let big 
> requests fill up all the tags (by all means allow writes to fill them up 
> if they are small - it's with many small requests that you get the biggest 
> advantage, after all, and with many _big_ requests that the downside is 
> the biggest too).

I think we are doing OK with the tag async vs sync allocation, there
should be plenty of room for sync IO still. The problem is likely
earlier, before we even assign a tag. The IO schedulers will move IO to
the dispatch list and the driver will grab it from there, assign a tag,
start, etc. Perhaps we end up moving too much to the dispatch list. That
would increase latencies, even if a sync queue preempts the current
async queue and dispatches a request (which goes to the dispatch list,
ordered, and thus may have to wait for others to be serviced first).

Just speculating, I'll test and probe and see what comes up.

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: [PATCH 0/8][RFC] IO latency/throughput fixes
  2009-04-06 15:45     ` Linus Torvalds
@ 2009-04-06 17:01       ` Jens Axboe
  2009-04-06 18:31       ` Theodore Tso
  1 sibling, 0 replies; 53+ messages in thread
From: Jens Axboe @ 2009-04-06 17:01 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: Linux Kernel Mailing List, tytso

On Mon, Apr 06 2009, Linus Torvalds wrote:
> 
> 
> On Mon, 6 Apr 2009, Jens Axboe wrote:
> >
> > > Jens - I can just apply this queue (I want to test it out anyway), or if 
> > > you prefer I can pull from you. Just tell me.
> > 
> > Whatever you want, if you want to pull instead of manually applying,
> > it's:
> > 
> > git://git.kernel.dk/linux-2.6-block.git blk-latency
> 
> Ok, I applied them for testing anyway, so I think I'll just keep the 
> series I have in my tree. I'm keeping my nasty "dd+sync" going in the 

No problem, the end result should be identical :-)

> background, just to verify that things really feel better, but I'm already 
> convinced this is a winner. Especially with the request size limiter, I 
> can really read email _almost_ as if nothing else was going on on the 
> machine. 
> 
> With that request size limiting, my fsync pauses tend to be in the 250ms 
> range, with a one 0.75s outlier in the last few minutes. I can definitely 
> feel that quarter-second thing, and the .75s pause was a real stutter, but 
> boy what a difference. I don't get the uncontrollable urge to kill that 
> nasty background writer any more.

If we can make the dd writer almost unnoticable, then I think we've come
a long way already. That really is a nasty workload.

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: [PATCH 0/8][RFC] IO latency/throughput fixes
  2009-04-06 15:45     ` Linus Torvalds
  2009-04-06 17:01       ` Jens Axboe
@ 2009-04-06 18:31       ` Theodore Tso
  2009-04-06 19:57         ` Linus Torvalds
  1 sibling, 1 reply; 53+ messages in thread
From: Theodore Tso @ 2009-04-06 18:31 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: Jens Axboe, Linux Kernel Mailing List

On Mon, Apr 06, 2009 at 08:45:09AM -0700, Linus Torvalds wrote:
> 
> With that request size limiting, my fsync pauses tend to be in the 250ms 
> range, with a one 0.75s outlier in the last few minutes. I can definitely 
> feel that quarter-second thing, and the .75s pause was a real stutter, but 
> boy what a difference. I don't get the uncontrollable urge to kill that 
> nasty background writer any more.

Cool!  Maybe we can make things even better, but given where we are
at, this brings up an question: given Jens block latency patches
(thanks, Jens!), and the patches which add the flush on
replace-via-rename/replace-via-truncate for ext3 data=writeback,
should we make ext3 data=writeback the default for 2.6.30?

					- Ted

^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: [PATCH 0/8][RFC] IO latency/throughput fixes
  2009-04-06 18:31       ` Theodore Tso
@ 2009-04-06 19:57         ` Linus Torvalds
  2009-04-06 20:10           ` Linus Torvalds
  2009-04-06 20:12           ` Hua Zhong
  0 siblings, 2 replies; 53+ messages in thread
From: Linus Torvalds @ 2009-04-06 19:57 UTC (permalink / raw)
  To: Theodore Tso; +Cc: Jens Axboe, Linux Kernel Mailing List



On Mon, 6 Apr 2009, Theodore Tso wrote:
> 
> Cool!  Maybe we can make things even better, but given where we are
> at, this brings up an question: given Jens block latency patches
> (thanks, Jens!), and the patches which add the flush on
> replace-via-rename/replace-via-truncate for ext3 data=writeback,
> should we make ext3 data=writeback the default for 2.6.30?

Yes. I want to go back and see how much this all helps for the 
data=ordered case (if at all), but I think we should at least consider 
trying 'data=writeback' as a default.

Does the 'synchronize on rename / close-after-ftrucate' patches leave any 
other intersting cases open?

			Linus

^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: [PATCH 0/8][RFC] IO latency/throughput fixes
  2009-04-06 19:57         ` Linus Torvalds
@ 2009-04-06 20:10           ` Linus Torvalds
  2009-04-06 21:26             ` Theodore Tso
  2009-04-06 20:12           ` Hua Zhong
  1 sibling, 1 reply; 53+ messages in thread
From: Linus Torvalds @ 2009-04-06 20:10 UTC (permalink / raw)
  To: Theodore Tso; +Cc: Jens Axboe, Linux Kernel Mailing List



On Mon, 6 Apr 2009, Linus Torvalds wrote:
> 
> Yes. I want to go back and see how much this all helps for the 
> data=ordered case (if at all), but I think we should at least consider 
> trying 'data=writeback' as a default.

Hmm.

So I'm back to "data=ordered", and quite frankly, so far it's a horrible 
experience. 

The difference is rather startlingly huge, which it was _not_ before. 

Sure, data=writeback was better before too, but in the end, the difference 
between the occasional 5+ second pause and the occasional 10+ second pause 
wasn't really all that interesting. They were both unusuable, and both 
made me kill the background writer almost immediately.

This time I really forced myself to keep it around, just to get the whole 
horridness of the experience.

Here's wahat I just got from data=writeback and all the latest patches 
(which includes one extra experimental one from Jens to further improve on 
things):

        fsync(6)                                = 0 <0.839397>
        fsync(6)                                = 0 <0.879706>
        fsync(6)                                = 0 <0.240856>
        fsync(6)                                = 0 <0.006497>
        fsync(6)                                = 0 <0.207197>
        fsync(4)                                = 0 <0.318396>
        fsync(4)                                = 0 <0.044499>
        fsync(4)                                = 0 <0.088381>
        fsync(6)                                = 0 <0.001057>
        fsync(9)                                = 0 <0.245389>
        fsync(7)                                = 0 <0.042728>
        fsync(7)                                = 0 <1.148262>
        fsync(6)                                = 0 <0.200707>
        fsync(6)                                = 0 <0.045259>
        fsync(6)                                = 0 <0.111846>

that's all quite usable. Stuttering, yes, but never anything feeling rally 
bad, or like something died.

This is with data=ordered on the same kernel:

	fsync(5)                                = 0 <0.000949>
	fsync(8)                                = 0 <19.674681>
	fsync(8)                                = 0 <1.169357>
	fsync(8)                                = 0 <17.994631>
	fsync(8)                                = 0 <5.711143>
	fsync(8)                                = 0 <4.759515>
	fsync(4)                                = 0 <17.283171>
	fsync(4)                                = 0 <4.371930>
	fsync(4)                                = 0 <0.008327>
	fsync(8)                                = 0 <0.000636>
	fsync(11)                               = 0 <9.802027>
	fsync(9)                                = 0 <14.109262>
	fsync(9)                                = 0 <4.081919>
	fsync(8)                                = 0 <0.000490>
	fsync(8)                                = 0 <0.001588>
	fsync(11)                               = 0 <18.822826>
	fsync(9)                                = 0 <3.917717>
	fsync(9)                                = 0 <0.001035>
	fsync(8)                                = 0 <0.000138>
	fsync(9)                                = 0 <17.466156>

where basically each 'sync' on the big file triggers that 15-20s total 
failure. I do wonder if it's actually gotten _worse_ for the data=ordered 
case, but at the same time it does make a pretty compelling argument for 
switching the defaults around.

			Linus

^ permalink raw reply	[flat|nested] 53+ messages in thread

* RE: [PATCH 0/8][RFC] IO latency/throughput fixes
  2009-04-06 19:57         ` Linus Torvalds
  2009-04-06 20:10           ` Linus Torvalds
@ 2009-04-06 20:12           ` Hua Zhong
  2009-04-06 20:20             ` Linus Torvalds
  2009-04-06 21:19             ` Theodore Tso
  1 sibling, 2 replies; 53+ messages in thread
From: Hua Zhong @ 2009-04-06 20:12 UTC (permalink / raw)
  To: 'Linus Torvalds', 'Theodore Tso'
  Cc: 'Jens Axboe', 'Linux Kernel Mailing List'

Grr..making writeback as default would break people's setup that relies on
the ordered semantics, especially in the embedded world. It's a big no-no.

> -----Original Message-----
> From: linux-kernel-owner@vger.kernel.org [mailto:linux-kernel-
> owner@vger.kernel.org] On Behalf Of Linus Torvalds
> Sent: Monday, April 06, 2009 12:57 PM
> To: Theodore Tso
> Cc: Jens Axboe; Linux Kernel Mailing List
> Subject: Re: [PATCH 0/8][RFC] IO latency/throughput fixes
> 
> 
> 
> On Mon, 6 Apr 2009, Theodore Tso wrote:
> >
> > Cool!  Maybe we can make things even better, but given where we are
> > at, this brings up an question: given Jens block latency patches
> > (thanks, Jens!), and the patches which add the flush on
> > replace-via-rename/replace-via-truncate for ext3 data=writeback,
> > should we make ext3 data=writeback the default for 2.6.30?
> 
> Yes. I want to go back and see how much this all helps for the
> data=ordered case (if at all), but I think we should at least consider
> trying 'data=writeback' as a default.
> 
> Does the 'synchronize on rename / close-after-ftrucate' patches leave
> any
> other intersting cases open?
> 
> 			Linus
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel"
> in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at  http://www.tux.org/lkml/


^ permalink raw reply	[flat|nested] 53+ messages in thread

* RE: [PATCH 0/8][RFC] IO latency/throughput fixes
  2009-04-06 20:12           ` Hua Zhong
@ 2009-04-06 20:20             ` Linus Torvalds
  2009-04-06 21:19             ` Theodore Tso
  1 sibling, 0 replies; 53+ messages in thread
From: Linus Torvalds @ 2009-04-06 20:20 UTC (permalink / raw)
  To: Hua Zhong
  Cc: 'Theodore Tso', 'Jens Axboe',
	'Linux Kernel Mailing List'



On Mon, 6 Apr 2009, Hua Zhong wrote:
>
> Grr..making writeback as default would break people's setup that relies on
> the ordered semantics, especially in the embedded world. It's a big no-no.

We could make it a config option, perhaps.

		Linus

^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: [PATCH 0/8][RFC] IO latency/throughput fixes
  2009-04-06 20:12           ` Hua Zhong
  2009-04-06 20:20             ` Linus Torvalds
@ 2009-04-06 21:19             ` Theodore Tso
  2009-04-06 21:35               ` Hua Zhong
  2009-04-07  3:52               ` Chris Mason
  1 sibling, 2 replies; 53+ messages in thread
From: Theodore Tso @ 2009-04-06 21:19 UTC (permalink / raw)
  To: Hua Zhong
  Cc: 'Linus Torvalds', 'Jens Axboe',
	'Linux Kernel Mailing List'

On Mon, Apr 06, 2009 at 01:12:08PM -0700, Hua Zhong wrote:
> Grr..making writeback as default would break people's setup that relies on
> the ordered semantics, especially in the embedded world. It's a big no-no.

I've added workarounds for 2.6.30 that provide the replace-via-rename
and replace-via-truncate workarounds for ext3 data=writeback cases.
See commits e7c8f507 and f7ab34ea.  

There won't be an implied fsync for newly created files, yes, but you
could have crashed 5 seconds earlier, at which point you would have
lost the newly created file anyway.  Replace-via-rename and
replace-via-truncate solves the problem for applications which are
editing pre-existing files, which was most of people's complaints
about depending on data=ordered semantics.

						- Ted

^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: [PATCH 0/8][RFC] IO latency/throughput fixes
  2009-04-06 20:10           ` Linus Torvalds
@ 2009-04-06 21:26             ` Theodore Tso
  0 siblings, 0 replies; 53+ messages in thread
From: Theodore Tso @ 2009-04-06 21:26 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: Jens Axboe, Linux Kernel Mailing List

On Mon, Apr 06, 2009 at 01:10:50PM -0700, Linus Torvalds wrote:
> where basically each 'sync' on the big file triggers that 15-20s total 
> failure. I do wonder if it's actually gotten _worse_ for the data=ordered 
> case, but at the same time it does make a pretty compelling argument for 
> switching the defaults around.

My first set of patches that I pushed to you were designed to fix
things for data=ordered, and did result in improvements for 2.6.29
versus 2.6.29+ext3-latency-fixes in the data=ordered case.  (It was
only in the last day or two that I switched to focusing on improving
things for data=writeback mode.)

I haven't benchmarked 2.6.29 versus 2.6.29 plus *all* of the recent
latency fixes with data=ordered, but I would expect that timings are
the same or better.

					- Ted

^ permalink raw reply	[flat|nested] 53+ messages in thread

* RE: [PATCH 0/8][RFC] IO latency/throughput fixes
  2009-04-06 21:19             ` Theodore Tso
@ 2009-04-06 21:35               ` Hua Zhong
  2009-04-06 22:04                 ` Ray Lee
  2009-04-07  3:52               ` Chris Mason
  1 sibling, 1 reply; 53+ messages in thread
From: Hua Zhong @ 2009-04-06 21:35 UTC (permalink / raw)
  To: 'Theodore Tso'
  Cc: 'Linus Torvalds', 'Jens Axboe',
	'Linux Kernel Mailing List'

> I've added workarounds for 2.6.30 that provide the replace-via-rename
> and replace-via-truncate workarounds for ext3 data=writeback cases.
> See commits e7c8f507 and f7ab34ea.
> 
> There won't be an implied fsync for newly created files, yes, but you
> could have crashed 5 seconds earlier, at which point you would have
> lost the newly created file anyway.  Replace-via-rename and
> replace-via-truncate solves the problem for applications which are
> editing pre-existing files, which was most of people's complaints
> about depending on data=ordered semantics.

I am not talking about "most" people's complaints. There are use cases for
ext3 far beyond the desktop.

I worked on a user-space library on top of ext3 before on embedded systems.
It may not have been the case for me but I could well imagine where it could
get too clever and depend upon "ordered". You really don't want to silently
corrupt people's data by changing the default. How about security too?
Wasn't that the original reason for "ordered" mode?

Ext3 is supposed to be a stable filesystem, while ext4 is the shiny new
filesystem that gets to do all the exciting experiments. I thought it was
the idea. People do not expect such a change at this stage, for better or
worse. It's great that you can improve its performance, but the performance
problem wasn't introduced yesterday, so if people care they could always
mount it as writeback, but there is no urgency in doing it for them. A
default semantic change is just crazy talk.

Hua



^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: [PATCH 0/8][RFC] IO latency/throughput fixes
  2009-04-06 21:35               ` Hua Zhong
@ 2009-04-06 22:04                 ` Ray Lee
  2009-04-06 22:17                   ` Linus Torvalds
  2009-04-06 22:25                   ` Hua Zhong
  0 siblings, 2 replies; 53+ messages in thread
From: Ray Lee @ 2009-04-06 22:04 UTC (permalink / raw)
  To: Hua Zhong
  Cc: Theodore Tso, Linus Torvalds, Jens Axboe, Linux Kernel Mailing List

On Mon, Apr 6, 2009 at 2:35 PM, Hua Zhong <hzhong@gmail.com> wrote:
>> I've added workarounds for 2.6.30 that provide the replace-via-rename
>> and replace-via-truncate workarounds for ext3 data=writeback cases.
>> See commits e7c8f507 and f7ab34ea.
>>
>> There won't be an implied fsync for newly created files, yes, but you
>> could have crashed 5 seconds earlier, at which point you would have
>> lost the newly created file anyway.  Replace-via-rename and
>> replace-via-truncate solves the problem for applications which are
>> editing pre-existing files, which was most of people's complaints
>> about depending on data=ordered semantics.
>
> I am not talking about "most" people's complaints. There are use cases for
> ext3 far beyond the desktop.
>
> I worked on a user-space library on top of ext3 before on embedded systems.
> It may not have been the case for me but I could well imagine where it could
> get too clever and depend upon "ordered".

Speaking as another embedded Linux guy, I don't update kernels on my
embedded platforms willy-nilly, nor do I design a library that relies
upon some default behavior without specifying it explicitly. That's
just one of the prices of doing embedded development.

Your argument seems to be that someone may be relying upon default
kernel behavior and, at the same time, is willing to continually
upgrade their kernel. I'd argue that person is, y'know, nuts. If
they're willing to upgrade their kernel on something that has that
stringent of requirements, then they should be willing to force a
mount option at the same time.

If they're willing to upgrade their kernel blindly, then they
shouldn't be doing embedded development.

^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: [PATCH 0/8][RFC] IO latency/throughput fixes
  2009-04-06 22:04                 ` Ray Lee
@ 2009-04-06 22:17                   ` Linus Torvalds
  2009-04-06 23:10                     ` Linus Torvalds
  2009-04-06 22:25                   ` Hua Zhong
  1 sibling, 1 reply; 53+ messages in thread
From: Linus Torvalds @ 2009-04-06 22:17 UTC (permalink / raw)
  To: Ray Lee; +Cc: Hua Zhong, Theodore Tso, Jens Axboe, Linux Kernel Mailing List



On Mon, 6 Apr 2009, Ray Lee wrote:
> 
> Your argument seems to be that someone may be relying upon default
> kernel behavior and, at the same time, is willing to continually
> upgrade their kernel.

No, I do think that the argument is valid per se: the crash behavior of 
"data=ordered" wrt "data=writeback" _is_ very different. And it's true 
that "data=ordered" has nicer behavior in the case of a crash.

But it is also true that with all the recent patches, "data=writeback" has 
become a _lot_ more attractive. It is slightly more ordered than it used 
to be, in a very particular way that makes one of the biggest downsides go 
away. And the latency improvements are really quite stunning (*). 

So I think Hua's argument is real, but at the same time we should balance 
it against the fact that a lot of people will just use whatever is the 
default - and as a result we should strive to make the default be the 
thing that we think people would be happiest with.

I think "ordered" was a reasonable default, but that was at least partly 
because _both_ ordered and writeback sucked (partly in different ways).

I do think we could make it a config option.

			Linus

(*) Admittedly with a very specific workload, but I suspect it's not that 
uncommon, and there will be people who never realized what their problem 
was and blamed jerky behavior on scheduler or something.

^ permalink raw reply	[flat|nested] 53+ messages in thread

* RE: [PATCH 0/8][RFC] IO latency/throughput fixes
  2009-04-06 22:04                 ` Ray Lee
  2009-04-06 22:17                   ` Linus Torvalds
@ 2009-04-06 22:25                   ` Hua Zhong
  2009-04-06 22:48                     ` Ray Lee
  1 sibling, 1 reply; 53+ messages in thread
From: Hua Zhong @ 2009-04-06 22:25 UTC (permalink / raw)
  To: 'Ray Lee'
  Cc: 'Theodore Tso', 'Linus Torvalds',
	'Jens Axboe', 'Linux Kernel Mailing List'

> Speaking as another embedded Linux guy, I don't update kernels on my
> embedded platforms willy-nilly, nor do I design a library that relies
> upon some default behavior without specifying it explicitly. That's
> just one of the prices of doing embedded development.
> 
> Your argument seems to be that someone may be relying upon default
> kernel behavior and, at the same time, is willing to continually
> upgrade their kernel. I'd argue that person is, y'know, nuts. If
> they're willing to upgrade their kernel on something that has that
> stringent of requirements, then they should be willing to force a
> mount option at the same time.

You are implying that if someone upgrades their kernel, then he must 
1) upgrade it continuously and 2) without any scrutiny. Both are untrue.

There are certain things people expect from the kernel, such as 
no ABI changes. To me ext3 default option falls into this category.

And even if they are nuts, you can't prove they don't exist, especially
given how many software already depends on the ordered mode.

You also conveniently forgot to quote my question about security. Both
are valid questions, not something I just totally made up.

> If they're willing to upgrade their kernel blindly, then they
> shouldn't be doing embedded development.

Linus once said that if you don't understand "not breaking user space" then
you should not be doing kernel development. Or something to that extent.



^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: [PATCH 0/8][RFC] IO latency/throughput fixes
  2009-04-06 22:25                   ` Hua Zhong
@ 2009-04-06 22:48                     ` Ray Lee
  2009-04-06 22:52                       ` Hua Zhong
  2009-04-06 23:19                       ` Alan Cox
  0 siblings, 2 replies; 53+ messages in thread
From: Ray Lee @ 2009-04-06 22:48 UTC (permalink / raw)
  To: Hua Zhong
  Cc: Theodore Tso, Linus Torvalds, Jens Axboe, Linux Kernel Mailing List

On Mon, Apr 6, 2009 at 3:25 PM, Hua Zhong <hzhong@gmail.com> wrote:
> You are implying that if someone upgrades their kernel, then he must
> 1) upgrade it continuously and 2) without any scrutiny. Both are untrue.

Great, then they'll notice the default ext3 data mode changed, and
adjust their scripts accordingly. Problem solved.

> There are certain things people expect from the kernel, such as
> no ABI changes. To me ext3 default option falls into this category.

As the past several hundred messages on this topic has just shown,
while people may expected metadata vs data ordering to be a guarantee,
apparently it never was unless you specifically mounted your ext3
filesystem with data=ordered. The default was only that, a default,
and one that can still be specified explicitly.

> And even if they are nuts, you can't prove they don't exist, especially
> given how many software already depends on the ordered mode.

There are an unbounded number of possible incorrect setups in the
world. Should we live in fear of their existence without any proof?

> You also conveniently forgot to quote my question about security. Both
> are valid questions, not something I just totally made up.

Security on an embedded device starts with controlling physical
access. If they have access to the storage media all bets are off,
whether it's data=ordered or not. (Access to the storage media is
really what we're talking about here -- medical data, for example,
hitting the platter before the metadata updates that then make that
data unaccessible to other userspace processes.)

Because *if* they have access to the media, they can replace any
running code on that box, and your security is worthless.

So no, I don't see how that's a valid argument.

>> If they're willing to upgrade their kernel blindly, then they
>> shouldn't be doing embedded development.
>
> Linus once said that if you don't understand "not breaking user space" then
> you should not be doing kernel development. Or something to that extent.

Luckily, I do understand, and have argued on user-space's behalf for
years now. But your argument was hypothetical embedded developers who
upgrade their kernels without seeing what defaults changed, and then
getting burned by the different semantics. I can't muster up enough
give-a-damn to care about a minute percentage of the population (us
embedded developers) who aren't doing their job of reading
kernelnewbies to see what changed.

I accept Linus' argument that the change in crash behavior for random
folk is going to be very different, and that perhaps that alone should
drive keeping the default as it is today, but now we're talking about
the desktop space.

^ permalink raw reply	[flat|nested] 53+ messages in thread

* RE: [PATCH 0/8][RFC] IO latency/throughput fixes
  2009-04-06 22:48                     ` Ray Lee
@ 2009-04-06 22:52                       ` Hua Zhong
  2009-04-06 23:19                       ` Alan Cox
  1 sibling, 0 replies; 53+ messages in thread
From: Hua Zhong @ 2009-04-06 22:52 UTC (permalink / raw)
  To: 'Ray Lee'
  Cc: 'Theodore Tso', 'Linus Torvalds',
	'Jens Axboe', 'Linux Kernel Mailing List'

> Security on an embedded device starts with controlling physical
> access. If they have access to the storage media all bets are off,
> whether it's data=ordered or not. (Access to the storage media is
> really what we're talking about here -- medical data, for example,
> hitting the platter before the metadata updates that then make that
> data unaccessible to other userspace processes.)
> 
> Because *if* they have access to the media, they can replace any
> running code on that box, and your security is worthless.
> 
> So no, I don't see how that's a valid argument.

The problem with security has nothing to do with embedded. It's 
that when you commit metadata first and crash before you write 
the data, then you get to see random blocks which might contain 
sensitive information from other users.

Hua



^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: [PATCH 0/8][RFC] IO latency/throughput fixes
  2009-04-06 22:17                   ` Linus Torvalds
@ 2009-04-06 23:10                     ` Linus Torvalds
  2009-04-07  7:51                       ` Geert Uytterhoeven
                                         ` (2 more replies)
  0 siblings, 3 replies; 53+ messages in thread
From: Linus Torvalds @ 2009-04-06 23:10 UTC (permalink / raw)
  To: Ray Lee; +Cc: Hua Zhong, Theodore Tso, Jens Axboe, Linux Kernel Mailing List



On Mon, 6 Apr 2009, Linus Torvalds wrote:
> thing that we think people would be happiest with.
> 
> I think "ordered" was a reasonable default, but that was at least partly 
> because _both_ ordered and writeback sucked (partly in different ways).
> 
> I do think we could make it a config option.

A patch _something_ like this.

A few notes:

 - This is UNTESTED (of course)

 - If I did this right, this _only_ overrides the data mode if it's not 
   explicitly specified on disk in the superblock mount options.

IOW, if you have done a 

	tune2fs -o journal_data_ordered

then this will _not_ override that. Only in the absense of any explicit 
flags should this trigger and then make the choice be 'writeback'.

And just to be _extra_ backwards compatible, if you really want the old 
behavior, and don't want to set the ordering flag explicitly, just answer 
'y' to the EXT3_DEFAULTS_TO_ORDERED Kconfig question. 

What do people think? Anybody want to test?

		Linus

---
 fs/ext3/Kconfig |   19 +++++++++++++++++++
 fs/ext3/super.c |    8 +++++++-
 2 files changed, 26 insertions(+), 1 deletions(-)

diff --git a/fs/ext3/Kconfig b/fs/ext3/Kconfig
index 8e0cfe4..fb3c1a2 100644
--- a/fs/ext3/Kconfig
+++ b/fs/ext3/Kconfig
@@ -28,6 +28,25 @@ config EXT3_FS
 	  To compile this file system support as a module, choose M here: the
 	  module will be called ext3.
 
+config EXT3_DEFAULTS_TO_ORDERED
+	bool "Default to 'data=ordered' in ext3 (legacy option)"
+	depends on EXT3_FS
+	help
+	  If a filesystem does not explicitly specify a data ordering
+	  mode, and the journal capability allowed it, ext3 used to
+	  historically default to 'data=ordered'.
+
+	  That was a rather unfortunate choice, because it leads to all
+	  kinds of latency problems, and the 'data=writeback' mode is more
+	  appropriate these days.
+
+	  You should probably always answer 'n' here, and if you really
+	  want to use 'data=ordered' mode, set it in the filesystem itself
+	  with 'tune2fs -o journal_data_ordered'.
+
+	  But if you really want to enable the legacy default, you can do
+	  so by answering 'y' to this question.
+
 config EXT3_FS_XATTR
 	bool "Ext3 extended attributes"
 	depends on EXT3_FS
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index 9e5b8e3..599dbfe 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -44,6 +44,12 @@
 #include "acl.h"
 #include "namei.h"
 
+#ifdef CONFIG_EXT3_DEFAULTS_TO_ORDERED
+  #define EXT3_MOUNT_DEFAULT_DATA_MODE EXT3_MOUNT_ORDERED_DATA
+#else
+  #define EXT3_MOUNT_DEFAULT_DATA_MODE EXT3_MOUNT_WRITEBACK_DATA
+#endif
+
 static int ext3_load_journal(struct super_block *, struct ext3_super_block *,
 			     unsigned long journal_devnum);
 static int ext3_create_journal(struct super_block *, struct ext3_super_block *,
@@ -1919,7 +1925,7 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
                    cope, else JOURNAL_DATA */
 		if (journal_check_available_features
 		    (sbi->s_journal, 0, 0, JFS_FEATURE_INCOMPAT_REVOKE))
-			set_opt(sbi->s_mount_opt, ORDERED_DATA);
+			set_opt(sbi->s_mount_opt, DEFAULT_DATA_MODE);
 		else
 			set_opt(sbi->s_mount_opt, JOURNAL_DATA);
 		break;

^ permalink raw reply related	[flat|nested] 53+ messages in thread

* Re: [PATCH 0/8][RFC] IO latency/throughput fixes
  2009-04-06 22:48                     ` Ray Lee
  2009-04-06 22:52                       ` Hua Zhong
@ 2009-04-06 23:19                       ` Alan Cox
  1 sibling, 0 replies; 53+ messages in thread
From: Alan Cox @ 2009-04-06 23:19 UTC (permalink / raw)
  To: Ray Lee
  Cc: Hua Zhong, Theodore Tso, Linus Torvalds, Jens Axboe,
	Linux Kernel Mailing List

> apparently it never was unless you specifically mounted your ext3
> filesystem with data=ordered. The default was only that, a default,
> and one that can still be specified explicitly.

Look up "principle of least suprise"

> There are an unbounded number of possible incorrect setups in the
> world. Should we live in fear of their existence without any proof?

No but creating new ones for users is not good.

> I accept Linus' argument that the change in crash behavior for random
> folk is going to be very different, and that perhaps that alone should
> drive keeping the default as it is today, but now we're talking about
> the desktop space.

Which regularly handles data of great value -- like PhD theses.

Ext4 provides a good natural migration point, ext3 mid series does not.

^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: [PATCH 0/8][RFC] IO latency/throughput fixes
  2009-04-06 15:37   ` Linus Torvalds
  2009-04-06 16:57     ` Jens Axboe
@ 2009-04-07  3:28     ` Chris Mason
  1 sibling, 0 replies; 53+ messages in thread
From: Chris Mason @ 2009-04-07  3:28 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: Jens Axboe, Linux Kernel Mailing List, tytso

On Mon, 2009-04-06 at 08:37 -0700, Linus Torvalds wrote:
> 
>
> One thing that I find intriguing is how the fsync time seems so 
> _consistent_ across a wild variety of drives. It's interesting how you see 
> delays that are roughly the same order of magnitude, even though you are 
> using an old SATA drive, and I'm using the Intel SSD. And when you turn 
> off TCQ, your numbers go down even more.
> 

It may help if we rule out the ext3/jbd transaction growing code.  I'd
be pretty surprised if it made for long pauses, but its worth checking:

-chris

diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c
index e6a1174..a7c4f79 100644
--- a/fs/jbd/transaction.c
+++ b/fs/jbd/transaction.c
@@ -1417,7 +1417,7 @@ int journal_stop(handle_t *handle)
 	 * for joiners in that case.
 	 */
 	pid = current->pid;
-	if (handle->h_sync && journal->j_last_sync_writer != pid) {
+	if (0 && handle->h_sync && journal->j_last_sync_writer != pid) {
 		u64 commit_time, trans_time;
 
 		journal->j_last_sync_writer = pid;



^ permalink raw reply related	[flat|nested] 53+ messages in thread

* Re: [PATCH 0/8][RFC] IO latency/throughput fixes
  2009-04-06 21:19             ` Theodore Tso
  2009-04-06 21:35               ` Hua Zhong
@ 2009-04-07  3:52               ` Chris Mason
  2009-04-07  4:13                 ` Trenton D. Adams
  1 sibling, 1 reply; 53+ messages in thread
From: Chris Mason @ 2009-04-07  3:52 UTC (permalink / raw)
  To: Theodore Tso
  Cc: Hua Zhong, 'Linus Torvalds', 'Jens Axboe',
	'Linux Kernel Mailing List'

On Mon, 2009-04-06 at 17:19 -0400, Theodore Tso wrote:
> On Mon, Apr 06, 2009 at 01:12:08PM -0700, Hua Zhong wrote:
> > Grr..making writeback as default would break people's setup that relies on
> > the ordered semantics, especially in the embedded world. It's a big no-no.
> 
> I've added workarounds for 2.6.30 that provide the replace-via-rename
> and replace-via-truncate workarounds for ext3 data=writeback cases.
> See commits e7c8f507 and f7ab34ea.  
> 
> There won't be an implied fsync for newly created files, yes, but you
> could have crashed 5 seconds earlier, at which point you would have
> lost the newly created file anyway.  Replace-via-rename and
> replace-via-truncate solves the problem for applications which are
> editing pre-existing files, which was most of people's complaints
> about depending on data=ordered semantics.

XFS got a reputation for losing data largely based on null bytes in
files after a crash.  In the XFS case the files always had zeros, and
not garbage that used to be on disk.

Years later xfs still gets beaten up for null bytes in files even though
they added code to prevent a long time ago.

Please leave data=ordered as the default for ext3.  If we really need to
fix ext3, it makes sense to switch to the xfs style i_size update at IO
end instead of data=writeback.

-chris



^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: [PATCH 0/8][RFC] IO latency/throughput fixes
  2009-04-07  3:52               ` Chris Mason
@ 2009-04-07  4:13                 ` Trenton D. Adams
  2009-04-07  4:27                   ` Linus Torvalds
  0 siblings, 1 reply; 53+ messages in thread
From: Trenton D. Adams @ 2009-04-07  4:13 UTC (permalink / raw)
  To: Chris Mason
  Cc: Theodore Tso, Hua Zhong, Linus Torvalds, Jens Axboe,
	Linux Kernel Mailing List

Okay, so a config option is a benefit in what way, for these
particular circumstances?  If someone wants to change the behaviour of
the kernel, for this particular case, they can just use tune2fs, no?

I'm glad I'm not making the decision.  To break one person's computer
or another person's computer, hmm, not very good options. hehe.

^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: [PATCH 0/8][RFC] IO latency/throughput fixes
  2009-04-07  4:13                 ` Trenton D. Adams
@ 2009-04-07  4:27                   ` Linus Torvalds
  2009-04-07  4:48                     ` Trenton D. Adams
  0 siblings, 1 reply; 53+ messages in thread
From: Linus Torvalds @ 2009-04-07  4:27 UTC (permalink / raw)
  To: Trenton D. Adams
  Cc: Chris Mason, Theodore Tso, Hua Zhong, Jens Axboe,
	Linux Kernel Mailing List



On Mon, 6 Apr 2009, Trenton D. Adams wrote:
>
> Okay, so a config option is a benefit in what way, for these
> particular circumstances?  If someone wants to change the behaviour of
> the kernel, for this particular case, they can just use tune2fs, no?

Basically, I have a very simple policy in my kernel life:

 - I don't touch distribution settings. I update the kernel, and NOTHING 
   else.

(Ok, I lie. I do my own git version too, but I'm really unhappy when I 
feel like I need to maintain anything else)

And I have that policy because quite frankly, if I start tuning distro 
settings, I'll

 (a) forget about them and not do it on my next machine and 
 (b) do some magic that _I_ may know how to do but nobody else does, so 
     now what I'm doing is no longer relevant for anybody else
 (c) I'm no longer helping anybody else.

So I have a few bits and pieces that I develop (mainly the kernel but also 
git etc), and I don't make site-specific changes to them even if I could. 

In other words: if I make a change that helps me, I want to make sure
that I make that option available to everybody else, because quite
frankly that's a big portion of the whole point of open source.

The whole "scratch your itch" isn't just about scratching _your_ itch, 
it's about getting it (almost by mistake) fixed for a lot of other people 
too.

			Linus

^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: [PATCH 0/8][RFC] IO latency/throughput fixes
  2009-04-07  4:27                   ` Linus Torvalds
@ 2009-04-07  4:48                     ` Trenton D. Adams
  2009-04-07  5:02                       ` Linus Torvalds
  0 siblings, 1 reply; 53+ messages in thread
From: Trenton D. Adams @ 2009-04-07  4:48 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Chris Mason, Theodore Tso, Hua Zhong, Jens Axboe,
	Linux Kernel Mailing List

On Mon, Apr 6, 2009 at 10:27 PM, Linus Torvalds
<torvalds@linux-foundation.org> wrote:
>
>
> On Mon, 6 Apr 2009, Trenton D. Adams wrote:
>>
>> Okay, so a config option is a benefit in what way, for these
>> particular circumstances?  If someone wants to change the behaviour of
>> the kernel, for this particular case, they can just use tune2fs, no?
>
> Basically, I have a very simple policy in my kernel life:
>
>  - I don't touch distribution settings. I update the kernel, and NOTHING
>   else.
>
> (Ok, I lie. I do my own git version too, but I'm really unhappy when I
> feel like I need to maintain anything else)
>
> And I have that policy because quite frankly, if I start tuning distro
> settings, I'll
>
>  (a) forget about them and not do it on my next machine and
>  (b) do some magic that _I_ may know how to do but nobody else does, so
>     now what I'm doing is no longer relevant for anybody else
>  (c) I'm no longer helping anybody else.
>
> So I have a few bits and pieces that I develop (mainly the kernel but also
> git etc), and I don't make site-specific changes to them even if I could.
>
> In other words: if I make a change that helps me, I want to make sure
> that I make that option available to everybody else, because quite
> frankly that's a big portion of the whole point of open source.
>
> The whole "scratch your itch" isn't just about scratching _your_ itch,
> it's about getting it (almost by mistake) fixed for a lot of other people
> too.
>
>                        Linus
>

Interesting.  I suppose there is also the fact that every drive you
hook to the system would then have the kernel configured default.  So
that makes sense.  Otherwise it's 2, 5, 10, 20 tune2fs commands,
instead of one kernel config.

What about a procfs setting instead?  Is there a policy about why
something should be in procfs or /sys, or as a kernel config option?
That's basically as small as the patch you just made, right?

I'm just thinking that something like this, where people want one
thing or the other, but may not know it when they install Linux, might
like to change it realtime.  Especially if they are a Linux newbie,
and don't know how to compile their own kernel.  Or don't have time to
maintain their own kernel installs.

^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: [PATCH 0/8][RFC] IO latency/throughput fixes
  2009-04-07  4:48                     ` Trenton D. Adams
@ 2009-04-07  5:02                       ` Linus Torvalds
  2009-04-07  5:23                         ` Hua Zhong
  2009-04-07  6:27                         ` Trenton D. Adams
  0 siblings, 2 replies; 53+ messages in thread
From: Linus Torvalds @ 2009-04-07  5:02 UTC (permalink / raw)
  To: Trenton D. Adams
  Cc: Chris Mason, Theodore Tso, Hua Zhong, Jens Axboe,
	Linux Kernel Mailing List



On Mon, 6 Apr 2009, Trenton D. Adams wrote:
> 
> What about a procfs setting instead?  Is there a policy about why
> something should be in procfs or /sys, or as a kernel config option?
> That's basically as small as the patch you just made, right?

I'm never really against making things dynamically tunable, but this 
already was, and that wasn't really the issue.

Sure, you can just re-mount your filesystem with different options. That's 
what I did while testing - my /home is on a drive of its own, so I would 
just log out and as root unmount and re-mount with data=ordered/writeback, 
and log in and test again.

So dynamic tuning is good. But at the same time, having a tuning option is 
_never_ an excuse for not getting the default right in the first place. 
It's just a cop-out to say "hey, the default may be wrong for you, but you 
can always tune it locally with XYZ".

The thing is, almost nobody does that. Partly because it needs effort and 
knowledge, partly because after a few years the number of tuning knobs are 
in the hundreds for every little thing. 

So instead, leave the tuning for the _really_ odd cases (if you use your 
machine as an IP router, you hopefully know enough to tune it if you 
really care). Not for random general-purpose "use for whatever" kind of 
thing. 

> I'm just thinking that something like this, where people want one
> thing or the other, but may not know it when they install Linux, might
> like to change it realtime.  Especially if they are a Linux newbie,
> and don't know how to compile their own kernel.  Or don't have time to
> maintain their own kernel installs.

Oh absolutely. I'm not expecting people to compile their own kernels. I'm 
expecting that within a few months, most modern distributions will have 
(almost by mistake) gotten a new set of saner defaults, and anybody who 
keeps their machine up-to-date will see a smoother experience without ever 
even realizing _why_.

			Linus

^ permalink raw reply	[flat|nested] 53+ messages in thread

* RE: [PATCH 0/8][RFC] IO latency/throughput fixes
  2009-04-07  5:02                       ` Linus Torvalds
@ 2009-04-07  5:23                         ` Hua Zhong
  2009-04-07  6:27                         ` Trenton D. Adams
  1 sibling, 0 replies; 53+ messages in thread
From: Hua Zhong @ 2009-04-07  5:23 UTC (permalink / raw)
  To: 'Linus Torvalds', 'Trenton D. Adams'
  Cc: 'Chris Mason', 'Theodore Tso',
	'Jens Axboe', 'Linux Kernel Mailing List'

The (small) set of people that rely on "ordered" understand 
the problem, as long as they are aware of the change (no, I 
don't think reading through all changelogs from their old 
kernel to the new one is a realistic option).

So a config option should be good enough to get them to notice
the change (I assume a missing default will force them to 
choose an option), and therefore explicitly add the -o ordered 
option to their scripts.

On the other hand a run-time tunable has no real point.

> -----Original Message-----
> From: Linus Torvalds [mailto:torvalds@linux-foundation.org]
> Sent: Monday, April 06, 2009 10:02 PM
> To: Trenton D. Adams
> Cc: Chris Mason; Theodore Tso; Hua Zhong; Jens Axboe; Linux Kernel
> Mailing List
> Subject: Re: [PATCH 0/8][RFC] IO latency/throughput fixes
> 
> 
> 
> On Mon, 6 Apr 2009, Trenton D. Adams wrote:
> >
> > What about a procfs setting instead?  Is there a policy about why
> > something should be in procfs or /sys, or as a kernel config option?
> > That's basically as small as the patch you just made, right?
> 
> I'm never really against making things dynamically tunable, but this
> already was, and that wasn't really the issue.
> 
> Sure, you can just re-mount your filesystem with different options.
> That's
> what I did while testing - my /home is on a drive of its own, so I
> would
> just log out and as root unmount and re-mount with
> data=ordered/writeback,
> and log in and test again.
> 
> So dynamic tuning is good. But at the same time, having a tuning option
> is
> _never_ an excuse for not getting the default right in the first place.
> It's just a cop-out to say "hey, the default may be wrong for you, but
> you
> can always tune it locally with XYZ".
> 
> The thing is, almost nobody does that. Partly because it needs effort
> and
> knowledge, partly because after a few years the number of tuning knobs
> are
> in the hundreds for every little thing.
> 
> So instead, leave the tuning for the _really_ odd cases (if you use
> your
> machine as an IP router, you hopefully know enough to tune it if you
> really care). Not for random general-purpose "use for whatever" kind of
> thing.
> 
> > I'm just thinking that something like this, where people want one
> > thing or the other, but may not know it when they install Linux,
> might
> > like to change it realtime.  Especially if they are a Linux newbie,
> > and don't know how to compile their own kernel.  Or don't have time
> to
> > maintain their own kernel installs.
> 
> Oh absolutely. I'm not expecting people to compile their own kernels.
> I'm
> expecting that within a few months, most modern distributions will have
> (almost by mistake) gotten a new set of saner defaults, and anybody who
> keeps their machine up-to-date will see a smoother experience without
> ever
> even realizing _why_.
> 
> 			Linus


^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: [PATCH 0/8][RFC] IO latency/throughput fixes
  2009-04-07  5:02                       ` Linus Torvalds
  2009-04-07  5:23                         ` Hua Zhong
@ 2009-04-07  6:27                         ` Trenton D. Adams
  1 sibling, 0 replies; 53+ messages in thread
From: Trenton D. Adams @ 2009-04-07  6:27 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Chris Mason, Theodore Tso, Hua Zhong, Jens Axboe,
	Linux Kernel Mailing List

On Mon, Apr 6, 2009 at 11:02 PM, Linus Torvalds
<torvalds@linux-foundation.org> wrote:
>
> So dynamic tuning is good. But at the same time, having a tuning option is
> _never_ an excuse for not getting the default right in the first place.
> It's just a cop-out to say "hey, the default may be wrong for you, but you
> can always tune it locally with XYZ".

Oh, right, you had mentioned that a few days ago about not liking to
have to tune stuff.  I agree, the kernel should just work, except in
rare cases.  But, with procfs, you could still have the default to
data=writeback, and realtime tuneable.  That is what I meant.

Anyhow, I don't want to take up too much of your time.  I wouldn't
want to know how busy you are. :P  No need to reply. :D

^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: [PATCH 0/8][RFC] IO latency/throughput fixes
  2009-04-06 23:10                     ` Linus Torvalds
@ 2009-04-07  7:51                       ` Geert Uytterhoeven
  2009-04-07 10:36                         ` Ingo Molnar
  2009-04-07 13:35                       ` Mark Lord
  2009-04-09  2:40                       ` Eric Sandeen
  2 siblings, 1 reply; 53+ messages in thread
From: Geert Uytterhoeven @ 2009-04-07  7:51 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Ray Lee, Hua Zhong, Theodore Tso, Jens Axboe, Linux Kernel Mailing List

On Tue, Apr 7, 2009 at 01:10, Linus Torvalds
<torvalds@linux-foundation.org> wrote:
> On Mon, 6 Apr 2009, Linus Torvalds wrote:
>> thing that we think people would be happiest with.
>>
>> I think "ordered" was a reasonable default, but that was at least partly
>> because _both_ ordered and writeback sucked (partly in different ways).
>>
>> I do think we could make it a config option.
>
> A patch _something_ like this.
>
> A few notes:
>
>  - This is UNTESTED (of course)
>
>  - If I did this right, this _only_ overrides the data mode if it's not
>   explicitly specified on disk in the superblock mount options.
>
> IOW, if you have done a
>
>        tune2fs -o journal_data_ordered
>
> then this will _not_ override that. Only in the absense of any explicit
> flags should this trigger and then make the choice be 'writeback'.
>
> And just to be _extra_ backwards compatible, if you really want the old
> behavior, and don't want to set the ordering flag explicitly, just answer
> 'y' to the EXT3_DEFAULTS_TO_ORDERED Kconfig question.
>
> What do people think? Anybody want to test?
>
>                Linus
>
> ---
>  fs/ext3/Kconfig |   19 +++++++++++++++++++
>  fs/ext3/super.c |    8 +++++++-
>  2 files changed, 26 insertions(+), 1 deletions(-)
>
> diff --git a/fs/ext3/Kconfig b/fs/ext3/Kconfig
> index 8e0cfe4..fb3c1a2 100644
> --- a/fs/ext3/Kconfig
> +++ b/fs/ext3/Kconfig
> @@ -28,6 +28,25 @@ config EXT3_FS
>          To compile this file system support as a module, choose M here: the
>          module will be called ext3.
>
> +config EXT3_DEFAULTS_TO_ORDERED
> +       bool "Default to 'data=ordered' in ext3 (legacy option)"
> +       depends on EXT3_FS
> +       help
> +         If a filesystem does not explicitly specify a data ordering
> +         mode, and the journal capability allowed it, ext3 used to
> +         historically default to 'data=ordered'.
> +
> +         That was a rather unfortunate choice, because it leads to all
> +         kinds of latency problems, and the 'data=writeback' mode is more
> +         appropriate these days.
> +
> +         You should probably always answer 'n' here, and if you really
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
> +         want to use 'data=ordered' mode, set it in the filesystem itself
> +         with 'tune2fs -o journal_data_ordered'.
> +
> +         But if you really want to enable the legacy default, you can do
> +         so by answering 'y' to this question.
> +

So `allmodconfig' will enable it? Is that the right thing to do, or
should it be inverted?

Gr{oetje,eeting}s,

						Geert

--
Geert Uytterhoeven -- There's lots of Linux beyond ia32 -- geert@linux-m68k.org

In personal conversations with technical people, I call myself a hacker. But
when I'm talking to journalists I just say "programmer" or something like that.
							    -- Linus Torvalds

^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: [PATCH 0/8][RFC] IO latency/throughput fixes
  2009-04-07  7:51                       ` Geert Uytterhoeven
@ 2009-04-07 10:36                         ` Ingo Molnar
  2009-04-07 14:10                           ` Diego Calleja
  2009-04-08 12:56                           ` Denys Vlasenko
  0 siblings, 2 replies; 53+ messages in thread
From: Ingo Molnar @ 2009-04-07 10:36 UTC (permalink / raw)
  To: Geert Uytterhoeven
  Cc: Linus Torvalds, Ray Lee, Hua Zhong, Theodore Tso, Jens Axboe,
	Linux Kernel Mailing List


* Geert Uytterhoeven <geert@linux-m68k.org> wrote:

> On Tue, Apr 7, 2009 at 01:10, Linus Torvalds
> <torvalds@linux-foundation.org> wrote:
> > On Mon, 6 Apr 2009, Linus Torvalds wrote:
> >> thing that we think people would be happiest with.
> >>
> >> I think "ordered" was a reasonable default, but that was at least partly
> >> because _both_ ordered and writeback sucked (partly in different ways).
> >>
> >> I do think we could make it a config option.
> >
> > A patch _something_ like this.
> >
> > A few notes:
> >
> >  - This is UNTESTED (of course)
> >
> >  - If I did this right, this _only_ overrides the data mode if it's not
> >   explicitly specified on disk in the superblock mount options.
> >
> > IOW, if you have done a
> >
> >        tune2fs -o journal_data_ordered
> >
> > then this will _not_ override that. Only in the absense of any explicit
> > flags should this trigger and then make the choice be 'writeback'.
> >
> > And just to be _extra_ backwards compatible, if you really want the old
> > behavior, and don't want to set the ordering flag explicitly, just answer
> > 'y' to the EXT3_DEFAULTS_TO_ORDERED Kconfig question.
> >
> > What do people think? Anybody want to test?
> >
> >                Linus
> >
> > ---
> >  fs/ext3/Kconfig |   19 +++++++++++++++++++
> >  fs/ext3/super.c |    8 +++++++-
> >  2 files changed, 26 insertions(+), 1 deletions(-)
> >
> > diff --git a/fs/ext3/Kconfig b/fs/ext3/Kconfig
> > index 8e0cfe4..fb3c1a2 100644
> > --- a/fs/ext3/Kconfig
> > +++ b/fs/ext3/Kconfig
> > @@ -28,6 +28,25 @@ config EXT3_FS
> >          To compile this file system support as a module, choose M here: the
> >          module will be called ext3.
> >
> > +config EXT3_DEFAULTS_TO_ORDERED
> > +       bool "Default to 'data=ordered' in ext3 (legacy option)"
> > +       depends on EXT3_FS
> > +       help
> > +         If a filesystem does not explicitly specify a data ordering
> > +         mode, and the journal capability allowed it, ext3 used to
> > +         historically default to 'data=ordered'.
> > +
> > +         That was a rather unfortunate choice, because it leads to all
> > +         kinds of latency problems, and the 'data=writeback' mode is more
> > +         appropriate these days.
> > +
> > +         You should probably always answer 'n' here, and if you really
>             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
> > +         want to use 'data=ordered' mode, set it in the filesystem itself
> > +         with 'tune2fs -o journal_data_ordered'.
> > +
> > +         But if you really want to enable the legacy default, you can do
> > +         so by answering 'y' to this question.
> > +
> 
> So `allmodconfig' will enable it? Is that the right thing to do, 
> or should it be inverted?
> 
> Gr{oetje,eeting}s,

allmod/allyes will enable all sorts of legacy options.

Since besides myself i'm not aware of any other person on this 
planet actually _booting_ allyes/allmod Linux kernels, i guess this 
is not a big issue anyway :-)

One small detail only: i'd suggest to name it 
CONFIG_COMPAT_EXT3_DEFAULTS_TO_ORDERED, to move it more in line with 
all the CONFIG_COMPAT_* legacy options.

	Ingo

^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: [PATCH 0/8][RFC] IO latency/throughput fixes
  2009-04-06 23:10                     ` Linus Torvalds
  2009-04-07  7:51                       ` Geert Uytterhoeven
@ 2009-04-07 13:35                       ` Mark Lord
  2009-04-07 14:33                         ` Linus Torvalds
  2009-04-09  2:40                       ` Eric Sandeen
  2 siblings, 1 reply; 53+ messages in thread
From: Mark Lord @ 2009-04-07 13:35 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Ray Lee, Hua Zhong, Theodore Tso, Jens Axboe, Linux Kernel Mailing List

Linus Torvalds wrote:
..
>  - If I did this right, this _only_ overrides the data mode if it's not 
>    explicitly specified on disk in the superblock mount options.
> 
> IOW, if you have done a 
> 
> 	tune2fs -o journal_data_ordered
> 
> then this will _not_ override that.
..

Thanks for saving me time to look up that command!  :)

With the stuff I work on here, kernel oops and sudden reboots
are not at all uncommon during development, and I really prefer
to keep "ordered" as the default on these machines for a tiny
extra margin of safety.

What happens with ext3 "writeback", and ext4 "whatever",
when one does the quickie reboot method:

   ALT-SYSRQ-S ALT-SYSRQ-U ALT-SYSRQ-S ALT-SYSRQ-B

???

Cheers

^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: [PATCH 0/8][RFC] IO latency/throughput fixes
  2009-04-07 10:36                         ` Ingo Molnar
@ 2009-04-07 14:10                           ` Diego Calleja
  2009-04-08 12:04                             ` Ingo Molnar
  2009-04-08 12:56                           ` Denys Vlasenko
  1 sibling, 1 reply; 53+ messages in thread
From: Diego Calleja @ 2009-04-07 14:10 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Geert Uytterhoeven, Linus Torvalds, Ray Lee, Hua Zhong,
	Theodore Tso, Jens Axboe, Linux Kernel Mailing List

On Martes 07 Abril 2009 12:36:11 Ingo Molnar escribió:
> Since besides myself i'm not aware of any other person on this 
> planet actually _booting_ allyes/allmod Linux kernels, i guess this 
> is not a big issue anyway :-)

IIRC once I tried to boot an allyes kernel and it oopsed, because (i think)
the probing routines of some drivers have never been tested in machines
that do not have the corresponding hardware. So I doubt someoneone is
using it ;) 

^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: [PATCH 0/8][RFC] IO latency/throughput fixes
  2009-04-07 13:35                       ` Mark Lord
@ 2009-04-07 14:33                         ` Linus Torvalds
  2009-04-07 19:24                           ` Mark Lord
  2009-04-07 20:53                           ` Mike Galbraith
  0 siblings, 2 replies; 53+ messages in thread
From: Linus Torvalds @ 2009-04-07 14:33 UTC (permalink / raw)
  To: Mark Lord
  Cc: Ray Lee, Hua Zhong, Theodore Tso, Jens Axboe, Linux Kernel Mailing List



On Tue, 7 Apr 2009, Mark Lord wrote:
> 
> What happens with ext3 "writeback", and ext4 "whatever",
> when one does the quickie reboot method:
> 
>   ALT-SYSRQ-S ALT-SYSRQ-U ALT-SYSRQ-S ALT-SYSRQ-B
> 
> ???

Since 's' syncs (I think 'u' does too, as part of making things 
read-only), the data blocks will be on disk after the boot regardless of 
any other ordering.

Of course, it will leave all your lock-files files alone, and I can almost 
guarantee that some daemons (read: "NetworkManager") will then fail on the 
next boot because they think they are already running.

			Linus

^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: [PATCH 0/8][RFC] IO latency/throughput fixes
  2009-04-07 14:33                         ` Linus Torvalds
@ 2009-04-07 19:24                           ` Mark Lord
  2009-04-07 19:45                             ` Jeff Garzik
  2009-04-07 20:53                           ` Mike Galbraith
  1 sibling, 1 reply; 53+ messages in thread
From: Mark Lord @ 2009-04-07 19:24 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Ray Lee, Hua Zhong, Theodore Tso, Jens Axboe, Linux Kernel Mailing List

Linus Torvalds wrote:
> 
> On Tue, 7 Apr 2009, Mark Lord wrote:
>> What happens with ext3 "writeback", and ext4 "whatever",
>> when one does the quickie reboot method:
>>
>>   ALT-SYSRQ-S ALT-SYSRQ-U ALT-SYSRQ-S ALT-SYSRQ-B
>>
>> ???
> 
> Since 's' syncs (I think 'u' does too, as part of making things 
> read-only), the data blocks will be on disk after the boot regardless of 
> any other ordering.
..

I was thinking more about delayed allocation in ext4, though.
If it hasn't allocated the blocks, then sync() has nothing to write out.
Or do they have hooks into the block layer to force alloc/commit when
somebody does a sync() ??

> Of course, it will leave all your lock-files files alone, and I can almost 
> guarantee that some daemons (read: "NetworkManager") will then fail on the 
> next boot because they think they are already running.
..

No, it behaves fine on reboot here.  But actually, NM is one big reason
why I end up having to use the ALT-SYSRQ-S/U/S/B.

The Ubunutu reboot scripts seem broken at times w.r.t. NM --
it hangs the reboot sequence on some of my machines here
for a very long time (during shutdown), because (I think)
the scripts disable the interface before disabling NM..  Doh!

Seems happy enough after rebooting though.

Cheers

^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: [PATCH 0/8][RFC] IO latency/throughput fixes
  2009-04-07 19:24                           ` Mark Lord
@ 2009-04-07 19:45                             ` Jeff Garzik
  0 siblings, 0 replies; 53+ messages in thread
From: Jeff Garzik @ 2009-04-07 19:45 UTC (permalink / raw)
  To: Mark Lord
  Cc: Linus Torvalds, Ray Lee, Hua Zhong, Theodore Tso, Jens Axboe,
	Linux Kernel Mailing List

Mark Lord wrote:
> Linus Torvalds wrote:
>>
>> On Tue, 7 Apr 2009, Mark Lord wrote:
>>> What happens with ext3 "writeback", and ext4 "whatever",
>>> when one does the quickie reboot method:
>>>
>>>   ALT-SYSRQ-S ALT-SYSRQ-U ALT-SYSRQ-S ALT-SYSRQ-B
>>>
>>> ???
>>
>> Since 's' syncs (I think 'u' does too, as part of making things 
>> read-only), the data blocks will be on disk after the boot regardless 
>> of any other ordering.
> ..
> 
> I was thinking more about delayed allocation in ext4, though.
> If it hasn't allocated the blocks, then sync() has nothing to write out.
> Or do they have hooks into the block layer to force alloc/commit when
> somebody does a sync() ??

sync(2) doesn't just sync dirty buffers...  it sync's inodes, which 
pokes the filesystem to do something intelligent, perhaps triggering (a) 
write-out of data, (b) write-out of zeroed blocks, or (c) annotation in 
filesystem metadata that certain blocks are allocated, but not initialized.

	Jeff




^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: [PATCH 0/8][RFC] IO latency/throughput fixes
  2009-04-07 14:33                         ` Linus Torvalds
  2009-04-07 19:24                           ` Mark Lord
@ 2009-04-07 20:53                           ` Mike Galbraith
  1 sibling, 0 replies; 53+ messages in thread
From: Mike Galbraith @ 2009-04-07 20:53 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Mark Lord, Ray Lee, Hua Zhong, Theodore Tso, Jens Axboe,
	Linux Kernel Mailing List

On Tue, 2009-04-07 at 07:33 -0700, Linus Torvalds wrote:
> 
> On Tue, 7 Apr 2009, Mark Lord wrote:
> > 
> > What happens with ext3 "writeback", and ext4 "whatever",
> > when one does the quickie reboot method:
> > 
> >   ALT-SYSRQ-S ALT-SYSRQ-U ALT-SYSRQ-S ALT-SYSRQ-B
> > 
> > ???
> 
> Since 's' syncs (I think 'u' does too, as part of making things 
> read-only), the data blocks will be on disk after the boot regardless of 
> any other ordering.
> 
> Of course, it will leave all your lock-files files alone, and I can almost 
> guarantee that some daemons (read: "NetworkManager") will then fail on the 
> next boot because they think they are already running.

I don't do that _regularly_, but have had cause to do so many times over
the last couple years since switching to data=writeback.  Box boots up
fine here, at least with opensuse 10.3 and 11.0 it does.

I've had to reconfigure evolution, and some desktop settings a few times
after pulling the _eject_ handle (plain sysrq-b or BRB if kernel is
having a bad hair day), but that's about it.

	-Mike


^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: [PATCH 0/8][RFC] IO latency/throughput fixes
  2009-04-07 14:10                           ` Diego Calleja
@ 2009-04-08 12:04                             ` Ingo Molnar
  0 siblings, 0 replies; 53+ messages in thread
From: Ingo Molnar @ 2009-04-08 12:04 UTC (permalink / raw)
  To: Diego Calleja
  Cc: Geert Uytterhoeven, Linus Torvalds, Ray Lee, Hua Zhong,
	Theodore Tso, Jens Axboe, Linux Kernel Mailing List


* Diego Calleja <diegocg@gmail.com> wrote:

> On Martes 07 Abril 2009 12:36:11 Ingo Molnar escribió:
> > Since besides myself i'm not aware of any other person on this 
> > planet actually _booting_ allyes/allmod Linux kernels, i guess this 
> > is not a big issue anyway :-)
> 
> IIRC once I tried to boot an allyes kernel and it oopsed, because 
> (i think) the probing routines of some drivers have never been 
> tested in machines that do not have the corresponding hardware. So 
> I doubt someoneone is using it ;)

Yes, i mapped those out gradually and disabled them. I've been 
test-booting such allyesconfig 32-bit and 64-bit kernels ever since 
then - for the past 2 years or so.

	Ingo

^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: [PATCH 0/8][RFC] IO latency/throughput fixes
  2009-04-07 10:36                         ` Ingo Molnar
  2009-04-07 14:10                           ` Diego Calleja
@ 2009-04-08 12:56                           ` Denys Vlasenko
  2009-04-08 13:27                             ` Ingo Molnar
  1 sibling, 1 reply; 53+ messages in thread
From: Denys Vlasenko @ 2009-04-08 12:56 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Geert Uytterhoeven, Linus Torvalds, Ray Lee, Hua Zhong,
	Theodore Tso, Jens Axboe, Linux Kernel Mailing List

Hi Ingo,

On Tue, Apr 7, 2009 at 12:36 PM, Ingo Molnar <mingo@elte.hu> wrote:
> allmod/allyes will enable all sorts of legacy options.
>
> Since besides myself i'm not aware of any other person on this
> planet actually _booting_ allyes/allmod Linux kernels,

Does allyesconfig boot? Any funny effects?
--
vda

^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: [PATCH 0/8][RFC] IO latency/throughput fixes
  2009-04-08 12:56                           ` Denys Vlasenko
@ 2009-04-08 13:27                             ` Ingo Molnar
  0 siblings, 0 replies; 53+ messages in thread
From: Ingo Molnar @ 2009-04-08 13:27 UTC (permalink / raw)
  To: Denys Vlasenko
  Cc: Geert Uytterhoeven, Linus Torvalds, Ray Lee, Hua Zhong,
	Theodore Tso, Jens Axboe, Linux Kernel Mailing List


* Denys Vlasenko <vda.linux@googlemail.com> wrote:

> Hi Ingo,
> 
> On Tue, Apr 7, 2009 at 12:36 PM, Ingo Molnar <mingo@elte.hu> wrote:
> > allmod/allyes will enable all sorts of legacy options.
> >
> > Since besides myself i'm not aware of any other person on this
> > planet actually _booting_ allyes/allmod Linux kernels,
> 
> Does allyesconfig boot? Any funny effects?

Besides a long bootup time and a somewhat slow kernel (everything 
built in, all debug facilities, everything ...) and the need to 
filter out bad drivers first, none.

Any funny effects get fixed or reported by me ;-)

	Ingo

^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: [PATCH 0/8][RFC] IO latency/throughput fixes
  2009-04-06 23:10                     ` Linus Torvalds
  2009-04-07  7:51                       ` Geert Uytterhoeven
  2009-04-07 13:35                       ` Mark Lord
@ 2009-04-09  2:40                       ` Eric Sandeen
  2009-04-09 14:01                         ` Ric Wheeler
  2 siblings, 1 reply; 53+ messages in thread
From: Eric Sandeen @ 2009-04-09  2:40 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Ray Lee, Hua Zhong, Theodore Tso, Jens Axboe, Linux Kernel Mailing List

Linus Torvalds wrote:
> 
> On Mon, 6 Apr 2009, Linus Torvalds wrote:
>> thing that we think people would be happiest with.
>>
>> I think "ordered" was a reasonable default, but that was at least partly 
>> because _both_ ordered and writeback sucked (partly in different ways).
>>
>> I do think we could make it a config option.
> 
> A patch _something_ like this.
> 
> A few notes:
> 
>  - This is UNTESTED (of course)
> 
>  - If I did this right, this _only_ overrides the data mode if it's not 
>    explicitly specified on disk in the superblock mount options.
> 
> IOW, if you have done a 
> 
> 	tune2fs -o journal_data_ordered
> 
> then this will _not_ override that. Only in the absense of any explicit 
> flags should this trigger and then make the choice be 'writeback'.
> 
> And just to be _extra_ backwards compatible, if you really want the old 
> behavior, and don't want to set the ordering flag explicitly, just answer 
> 'y' to the EXT3_DEFAULTS_TO_ORDERED Kconfig question. 
> 
> What do people think? Anybody want to test?

I think this is a terrible idea.  I ran the following test with
data=writeback on 2.6.29.1 (which doesn't have the rename & truncate
hacks, but they would not help in this case, either):

tar xvjf linux-2.6.29.1.tar.bz2; echo b > /proc/sysrq-trigger

This simulates a crash on a busy system.  I got back 8000+ files
containing other people's data.

data=ordered isn't just "nicer" behavior than writeback on a crash, it's
necessary today for security.  Making data=writeback default is a
security flaw.

Are we really considering (wait, not considering; it's checked in
already!) - blowing a huge security hole in the filesystem used on the
vast majority of installations in the name of speed?

Chris suggested earlier in this thread that we should use the XFS trick
of not extending the i_size until io completion, and I agree that it
makes sense.  Chris even offered to take a stab at it and I hope I can
work with him on this.  It's a -much- better answer than this
reactionary change.

-Eric

^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: [PATCH 0/8][RFC] IO latency/throughput fixes
  2009-04-09  2:40                       ` Eric Sandeen
@ 2009-04-09 14:01                         ` Ric Wheeler
  0 siblings, 0 replies; 53+ messages in thread
From: Ric Wheeler @ 2009-04-09 14:01 UTC (permalink / raw)
  To: Eric Sandeen
  Cc: Linus Torvalds, Ray Lee, Hua Zhong, Theodore Tso, Jens Axboe,
	Linux Kernel Mailing List, Dave Chinner, Stephen C. Tweedie,
	Andrew Morton

On 04/08/2009 10:40 PM, Eric Sandeen wrote:
> Linus Torvalds wrote:
>    
>> On Mon, 6 Apr 2009, Linus Torvalds wrote:
>>      
>>> thing that we think people would be happiest with.
>>>
>>> I think "ordered" was a reasonable default, but that was at least partly
>>> because _both_ ordered and writeback sucked (partly in different ways).
>>>
>>> I do think we could make it a config option.
>>>        
>> A patch _something_ like this.
>>
>> A few notes:
>>
>>   - This is UNTESTED (of course)
>>
>>   - If I did this right, this _only_ overrides the data mode if it's not
>>     explicitly specified on disk in the superblock mount options.
>>
>> IOW, if you have done a
>>
>> 	tune2fs -o journal_data_ordered
>>
>> then this will _not_ override that. Only in the absense of any explicit
>> flags should this trigger and then make the choice be 'writeback'.
>>
>> And just to be _extra_ backwards compatible, if you really want the old
>> behavior, and don't want to set the ordering flag explicitly, just answer
>> 'y' to the EXT3_DEFAULTS_TO_ORDERED Kconfig question.
>>
>> What do people think? Anybody want to test?
>>      
>
> I think this is a terrible idea.  I ran the following test with
> data=writeback on 2.6.29.1 (which doesn't have the rename&  truncate
> hacks, but they would not help in this case, either):
>
> tar xvjf linux-2.6.29.1.tar.bz2; echo b>  /proc/sysrq-trigger
>
> This simulates a crash on a busy system.  I got back 8000+ files
> containing other people's data.
>
> data=ordered isn't just "nicer" behavior than writeback on a crash, it's
> necessary today for security.  Making data=writeback default is a
> security flaw.
>
> Are we really considering (wait, not considering; it's checked in
> already!) - blowing a huge security hole in the filesystem used on the
> vast majority of installations in the name of speed?
>
> Chris suggested earlier in this thread that we should use the XFS trick
> of not extending the i_size until io completion, and I agree that it
> makes sense.  Chris even offered to take a stab at it and I hope I can
> work with him on this.  It's a -much- better answer than this
> reactionary change.
>
> -Eric
>    

I agree - we definitely should work to fix the fsync latency problems, 
but this seems to jump back in time to the early 80's for UNIX file systems.

Writeback mode is just not a safe default for naive users or even more 
sophisticated users who don't understand the risks here. Definitely not 
a journal mode that any distribution would be able to ship as a default.

Wouldn't it make much more sense to leave the default at the safe data 
ordered mode and let the few people who understand the tradeoff remount 
file systems in writeback mode?

Regards,

Ric


^ permalink raw reply	[flat|nested] 53+ messages in thread

end of thread, other threads:[~2009-04-09 14:04 UTC | newest]

Thread overview: 53+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2009-04-06 12:48 [PATCH 0/8][RFC] IO latency/throughput fixes Jens Axboe
2009-04-06 12:48 ` [PATCH 1/8] block: change the request allocation/congestion logic to be sync/async based Jens Axboe
2009-04-06 12:48 ` [PATCH 2/8] Add WRITE_SYNC_PLUG and SWRITE_SYNC_PLUG Jens Axboe
2009-04-06 12:48 ` [PATCH 3/8] block: fsync_buffers_list() should use SWRITE_SYNC_PLUG Jens Axboe
2009-04-06 12:48 ` [PATCH 4/8] jbd: use WRITE_SYNC_PLUG instead of WRITE_SYNC Jens Axboe
2009-04-06 12:48 ` [PATCH 5/8] jbd2: " Jens Axboe
2009-04-06 12:48 ` [PATCH 6/8] block: enabling plugging on SSD devices that don't do queuing Jens Axboe
2009-04-06 12:48 ` [PATCH 7/8] block: Add flag for telling the IO schedulers NOT to anticipate more IO Jens Axboe
2009-04-06 12:48 ` [PATCH 8/8] block: switch sync_dirty_buffer() over to WRITE_SYNC Jens Axboe
2009-04-06 13:04 ` [PATCH 0/8][RFC] IO latency/throughput fixes Jens Axboe
2009-04-06 13:13   ` Jens Axboe
2009-04-06 15:37   ` Linus Torvalds
2009-04-06 16:57     ` Jens Axboe
2009-04-07  3:28     ` Chris Mason
2009-04-06 15:04 ` Linus Torvalds
2009-04-06 15:10   ` Jens Axboe
2009-04-06 15:45     ` Linus Torvalds
2009-04-06 17:01       ` Jens Axboe
2009-04-06 18:31       ` Theodore Tso
2009-04-06 19:57         ` Linus Torvalds
2009-04-06 20:10           ` Linus Torvalds
2009-04-06 21:26             ` Theodore Tso
2009-04-06 20:12           ` Hua Zhong
2009-04-06 20:20             ` Linus Torvalds
2009-04-06 21:19             ` Theodore Tso
2009-04-06 21:35               ` Hua Zhong
2009-04-06 22:04                 ` Ray Lee
2009-04-06 22:17                   ` Linus Torvalds
2009-04-06 23:10                     ` Linus Torvalds
2009-04-07  7:51                       ` Geert Uytterhoeven
2009-04-07 10:36                         ` Ingo Molnar
2009-04-07 14:10                           ` Diego Calleja
2009-04-08 12:04                             ` Ingo Molnar
2009-04-08 12:56                           ` Denys Vlasenko
2009-04-08 13:27                             ` Ingo Molnar
2009-04-07 13:35                       ` Mark Lord
2009-04-07 14:33                         ` Linus Torvalds
2009-04-07 19:24                           ` Mark Lord
2009-04-07 19:45                             ` Jeff Garzik
2009-04-07 20:53                           ` Mike Galbraith
2009-04-09  2:40                       ` Eric Sandeen
2009-04-09 14:01                         ` Ric Wheeler
2009-04-06 22:25                   ` Hua Zhong
2009-04-06 22:48                     ` Ray Lee
2009-04-06 22:52                       ` Hua Zhong
2009-04-06 23:19                       ` Alan Cox
2009-04-07  3:52               ` Chris Mason
2009-04-07  4:13                 ` Trenton D. Adams
2009-04-07  4:27                   ` Linus Torvalds
2009-04-07  4:48                     ` Trenton D. Adams
2009-04-07  5:02                       ` Linus Torvalds
2009-04-07  5:23                         ` Hua Zhong
2009-04-07  6:27                         ` Trenton D. Adams

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).