All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH] ide write barrier support
@ 2003-10-13 14:08 Jens Axboe
  2003-10-13 15:23 ` Jeff Garzik
                   ` (4 more replies)
  0 siblings, 5 replies; 39+ messages in thread
From: Jens Axboe @ 2003-10-13 14:08 UTC (permalink / raw)
  To: Linux Kernel

Hi,

Forward ported and tested today (with the dummy ext3 patch included),
works for me. Some todo's left, but I thought I'd send it out to gauge
interest. TODO:

- Detect write cache setting and only issue SYNC_CACHE if write cache is
  enabled (not a biggy, all drives ship with it enabled)

- Toggle flush support on hdparm -W0/1

- Various small bits I can't remember right now

===== drivers/block/ll_rw_blk.c 1.219 vs edited =====
--- 1.219/drivers/block/ll_rw_blk.c	Wed Oct  8 04:53:42 2003
+++ edited/drivers/block/ll_rw_blk.c	Mon Oct 13 14:28:51 2003
@@ -240,11 +240,40 @@
 	INIT_LIST_HEAD(&q->plug_list);
 
 	blk_queue_activity_fn(q, NULL, NULL);
+
+	q->ordered = QUEUE_ORDERED_NONE;
 }
 
 EXPORT_SYMBOL(blk_queue_make_request);
 
 /**
+ * blk_queue_ordered - does this queue support ordered writes
+ * @q:     the request queue
+ * @flag:  see below
+ *
+ * Description:
+ *   For journalled file systems, doing ordered writes on a commit
+ *   block instead of explicitly doing wait_on_buffer (which is bad
+ *   for performance) can be a big win. Block drivers supporting this
+ *   feature should call this function and indicate so.
+ *
+ *   SCSI drivers usually need to support ordered tags, while others
+ *   may have to do a complete drive cache flush if they are using write
+ *   back caching (or not and lying about it)
+ *
+ *   With this in mind, the values are
+ *             QUEUE_ORDERED_NONE:	the default, doesn't support barrier
+ *             QUEUE_ORDERED_TAG:	supports ordered tags
+ *             QUEUE_ORDERED_FLUSH:	supports barrier through cache flush
+ **/
+void blk_queue_ordered(request_queue_t *q, int flag)
+{
+	q->ordered = flag;
+}
+
+EXPORT_SYMBOL(blk_queue_ordered);
+
+/**
  * blk_queue_bounce_limit - set bounce buffer limit for queue
  * @q:  the request queue for the device
  * @dma_addr:   bus address limit
@@ -1820,6 +1849,8 @@
 
 	if (unlikely(!q))
 		return;
+
+	WARN_ON(!req->ref_count);
 	if (unlikely(--req->ref_count))
 		return;
 
@@ -1986,7 +2017,7 @@
 static int __make_request(request_queue_t *q, struct bio *bio)
 {
 	struct request *req, *freereq = NULL;
-	int el_ret, rw, nr_sectors, cur_nr_sectors, barrier, ra;
+	int el_ret, rw, nr_sectors, cur_nr_sectors, barrier, ra, err;
 	sector_t sector;
 
 	sector = bio->bi_sector;
@@ -2005,6 +2036,10 @@
 	spin_lock_prefetch(q->queue_lock);
 
 	barrier = test_bit(BIO_RW_BARRIER, &bio->bi_rw);
+	if (barrier && (q->ordered == QUEUE_ORDERED_NONE)) {
+		err = -EOPNOTSUPP;
+		goto end_io;
+	}
 
 	ra = bio->bi_rw & (1 << BIO_RW_AHEAD);
 
@@ -2086,6 +2121,7 @@
 			/*
 			 * READA bit set
 			 */
+			err = -EWOULDBLOCK;
 			if (ra)
 				goto end_io;
 	
@@ -2141,7 +2177,7 @@
 	return 0;
 
 end_io:
-	bio_endio(bio, nr_sectors << 9, -EWOULDBLOCK);
+	bio_endio(bio, nr_sectors << 9, err);
 	return 0;
 }
 
===== drivers/ide/ide-io.c 1.20 vs edited =====
--- 1.20/drivers/ide/ide-io.c	Tue Sep  9 20:31:23 2003
+++ edited/drivers/ide/ide-io.c	Mon Oct 13 15:37:24 2003
@@ -85,6 +85,39 @@
 #endif /* DISK_RECOVERY_TIME */
 }
 
+/*
+ * preempt pending requests, and store this cache flush for immediate
+ * execution
+ */
+static struct request *ide_queue_flush_cmd(ide_drive_t *drive,
+					   struct request *rq, int post)
+{
+	struct request *flush_rq = &HWGROUP(drive)->wrq;
+
+	blkdev_dequeue_request(rq);
+
+	memset(drive->special_buf, 0, sizeof(drive->special_buf));
+
+	ide_init_drive_cmd(flush_rq);
+
+	flush_rq->buffer = drive->special_buf;
+	flush_rq->special = rq;
+	flush_rq->buffer[0] = WIN_FLUSH_CACHE;
+
+	if (drive->id->cfs_enable_2 & 0x2400)
+		flush_rq->buffer[0] = WIN_FLUSH_CACHE_EXT;
+
+	if (!post) {
+		drive->doing_barrier = 1;
+		flush_rq->flags |= REQ_BAR_PREFLUSH;
+	} else
+		flush_rq->flags |= REQ_BAR_POSTFLUSH;
+
+	flush_rq->flags |= REQ_STARTED;
+	list_add(&flush_rq->queuelist, &drive->queue->queue_head);
+	return flush_rq;
+}
+
 /**
  *	ide_end_request		-	complete an IDE I/O
  *	@drive: IDE device for the I/O
@@ -128,12 +161,23 @@
 
 	if (!end_that_request_first(rq, uptodate, nr_sectors)) {
 		add_disk_randomness(rq->rq_disk);
-		if (!blk_rq_tagged(rq))
-			blkdev_dequeue_request(rq);
-		else
-			blk_queue_end_tag(drive->queue, rq);
+
+		/*
+		 * if this is a barrier write, flush the write cache
+		 * before signalling completion of this request
+		 */
+		if (blk_barrier_rq(rq))
+			ide_queue_flush_cmd(drive, rq, 1);
+		else {
+			if (!blk_rq_tagged(rq))
+				blkdev_dequeue_request(rq);
+			else
+				blk_queue_end_tag(drive->queue, rq);
+
+			end_that_request_last(rq);
+		}
+
 		HWGROUP(drive)->rq = NULL;
-		end_that_request_last(rq);
 		ret = 0;
 	}
 	spin_unlock_irqrestore(&ide_lock, flags);
@@ -260,6 +304,36 @@
 
 	spin_lock_irqsave(&ide_lock, flags);
 	blkdev_dequeue_request(rq);
+
+	/*
+	 * if a cache flush fails, disable ordered write support
+	 */
+	if (blk_barrier_preflush(rq) || blk_barrier_postflush(rq)) {
+		struct request *real_rq = rq->special;
+
+		/*
+		 * should we forcibly disable the write back caching?
+		 */
+		if (err) {
+			printk("%s: cache flushing failed. disable write back cacheing for journalled file systems\n", drive->name);
+			blk_queue_ordered(drive->queue, QUEUE_ORDERED_NONE);
+		}
+
+		if (blk_barrier_postflush(rq)) {
+			/*
+			 * this completes the barrier write
+			 */
+			drive->doing_barrier = 0;
+			end_that_request_last(real_rq);
+		} else {
+			/*
+			 * just indicate that we did the pre flush
+			 */
+			real_rq->flags |= REQ_BAR_PREFLUSH;
+			__elv_add_request(drive->queue, real_rq, ELEVATOR_INSERT_FRONT, 0);
+		}
+	}
+
 	HWGROUP(drive)->rq = NULL;
 	end_that_request_last(rq);
 	spin_unlock_irqrestore(&ide_lock, flags);
@@ -752,6 +826,15 @@
 repeat:	
 	best = NULL;
 	drive = hwgroup->drive;
+
+	/*
+	 * drive is doing pre-flush, ordered write, post-flush sequence. even
+	 * though that is 3 requests, it must be seen as a single transaction.
+	 * we must not preempt this drive until that is complete
+	 */
+	if (drive->doing_barrier)
+		return drive;
+
 	do {
 		if ((!drive->sleep || time_after_eq(jiffies, drive->sleep))
 		    && !elv_queue_empty(drive->queue)) {
@@ -919,6 +1002,13 @@
 		}
 
 		/*
+		 * if rq is a barrier write, issue pre cache flush if not
+		 * already done
+		 */
+		if (blk_barrier_rq(rq) && !blk_barrier_preflush(rq))
+			rq = ide_queue_flush_cmd(drive, rq, 0);
+
+		/*
 		 * Sanity: don't accept a request that isn't a PM request
 		 * if we are currently power managed. This is very important as
 		 * blk_stop_queue() doesn't prevent the elv_next_request()
@@ -1344,6 +1434,7 @@
 {
 	memset(rq, 0, sizeof(*rq));
 	rq->flags = REQ_DRIVE_CMD;
+	rq->ref_count = 1;
 }
 
 EXPORT_SYMBOL(ide_init_drive_cmd);
===== drivers/ide/ide-probe.c 1.65 vs edited =====
--- 1.65/drivers/ide/ide-probe.c	Wed Sep  3 18:52:16 2003
+++ edited/drivers/ide/ide-probe.c	Mon Oct 13 09:55:02 2003
@@ -958,9 +958,14 @@
 	/* needs drive->queue to be set */
 	ide_toggle_bounce(drive, 1);
 
-	/* enable led activity for disk drives only */
-	if (drive->media == ide_disk && hwif->led_act)
-		blk_queue_activity_fn(q, hwif->led_act, drive);
+	if (drive->media == ide_disk) {
+		/* enable led activity for disk drives only */
+		if (hwif->led_act)
+			blk_queue_activity_fn(q, hwif->led_act, drive);
+
+		/* flush cache for ordered writes */
+		blk_queue_ordered(q, QUEUE_ORDERED_FLUSH);
+	}
 
 	return 0;
 }
===== fs/buffer.c 1.215 vs edited =====
--- 1.215/fs/buffer.c	Tue Sep 30 03:12:02 2003
+++ edited/fs/buffer.c	Mon Oct 13 10:06:59 2003
@@ -2658,12 +2658,20 @@
 	BUG_ON(!buffer_mapped(bh));
 	BUG_ON(!bh->b_end_io);
 
+	if (rw == WRITEBARRIER) {
+		set_bit(BH_Ordered, &bh->b_state);
+		rw = WRITE;
+	}
+
 	if ((rw == READ || rw == READA) && buffer_uptodate(bh))
 		buffer_error();
 	if (rw == WRITE && !buffer_uptodate(bh))
 		buffer_error();
 	if (rw == READ && buffer_dirty(bh))
 		buffer_error();
+
+	if (test_bit(BH_Ordered, &bh->b_state) && (rw == WRITE))
+		rw = WRITEBARRIER;
 
 	/* Only clear out a write error when rewriting */
 	if (test_set_buffer_req(bh) && rw == WRITE)
===== fs/jbd/commit.c 1.40 vs edited =====
--- 1.40/fs/jbd/commit.c	Fri Aug  1 12:02:20 2003
+++ edited/fs/jbd/commit.c	Mon Oct 13 10:17:28 2003
@@ -474,7 +474,9 @@
 				clear_buffer_dirty(bh);
 				set_buffer_uptodate(bh);
 				bh->b_end_io = journal_end_buffer_io_sync;
+				set_bit(BH_Ordered, &bh->b_state);
 				submit_bh(WRITE, bh);
+				clear_bit(BH_Ordered, &bh->b_state);
 			}
 			cond_resched();
 
===== include/linux/blkdev.h 1.127 vs edited =====
--- 1.127/include/linux/blkdev.h	Tue Sep 16 13:57:26 2003
+++ edited/include/linux/blkdev.h	Mon Oct 13 09:52:33 2003
@@ -193,6 +193,8 @@
 	__REQ_PM_SUSPEND,	/* suspend request */
 	__REQ_PM_RESUME,	/* resume request */
 	__REQ_PM_SHUTDOWN,	/* shutdown request */
+	__REQ_BAR_PREFLUSH,	/* barrier pre-flush done */
+	__REQ_BAR_POSTFLUSH,	/* barrier post-flush */
 	__REQ_NR_BITS,		/* stops here */
 };
 
@@ -218,6 +220,8 @@
 #define REQ_PM_SUSPEND	(1 << __REQ_PM_SUSPEND)
 #define REQ_PM_RESUME	(1 << __REQ_PM_RESUME)
 #define REQ_PM_SHUTDOWN	(1 << __REQ_PM_SHUTDOWN)
+#define REQ_BAR_PREFLUSH	(1 << __REQ_BAR_PREFLUSH)
+#define REQ_BAR_POSTFLUSH	(1 << __REQ_BAR_POSTFLUSH)
 
 /*
  * State information carried for REQ_PM_SUSPEND and REQ_PM_RESUME
@@ -344,6 +348,8 @@
 	unsigned long		seg_boundary_mask;
 	unsigned int		dma_alignment;
 
+	unsigned short		ordered;
+
 	struct blk_queue_tag	*queue_tags;
 
 	atomic_t		refcnt;
@@ -368,6 +374,13 @@
 #define QUEUE_FLAG_WRITEFULL	4	/* read queue has been filled */
 #define QUEUE_FLAG_DEAD		5	/* queue being torn down */
 
+/*
+ * write barrier support
+ */
+#define QUEUE_ORDERED_NONE	0	/* no support */
+#define QUEUE_ORDERED_TAG	1	/* supported by tags */
+#define QUEUE_ORDERED_FLUSH	2	/* supported by cache flush */
+
 #define blk_queue_plugged(q)	!list_empty(&(q)->plug_list)
 #define blk_queue_tagged(q)	test_bit(QUEUE_FLAG_QUEUED, &(q)->queue_flags)
 #define blk_fs_request(rq)	((rq)->flags & REQ_CMD)
@@ -379,6 +392,10 @@
 #define blk_pm_request(rq)	\
 	((rq)->flags & (REQ_PM_SUSPEND | REQ_PM_RESUME))
 
+#define blk_barrier_rq(rq)	((rq)->flags & REQ_HARDBARRIER)
+#define blk_barrier_preflush(rq)	((rq)->flags & REQ_BAR_PREFLUSH)
+#define blk_barrier_postflush(rq)	((rq)->flags & REQ_BAR_POSTFLUSH)
+
 #define list_entry_rq(ptr)	list_entry((ptr), struct request, queuelist)
 
 #define rq_data_dir(rq)		((rq)->flags & 1)
@@ -561,6 +578,7 @@
 extern void blk_queue_merge_bvec(request_queue_t *, merge_bvec_fn *);
 extern void blk_queue_dma_alignment(request_queue_t *, int);
 extern struct backing_dev_info *blk_get_backing_dev_info(struct block_device *bdev);
+extern void blk_queue_ordered(request_queue_t *, int);
 
 extern int blk_rq_map_sg(request_queue_t *, struct request *, struct scatterlist *);
 extern void blk_dump_rq_flags(struct request *, char *);
===== include/linux/buffer_head.h 1.44 vs edited =====
--- 1.44/include/linux/buffer_head.h	Tue Aug 19 07:30:30 2003
+++ edited/include/linux/buffer_head.h	Mon Oct 13 09:56:22 2003
@@ -26,6 +26,7 @@
 	BH_Delay,	/* Buffer is not yet allocated on disk */
 	BH_Boundary,	/* Block is followed by a discontiguity */
 	BH_Write_EIO,	/* I/O error on write */
+	BH_Ordered,	/* ordered write */
 
 	BH_PrivateStart,/* not a state bit, but the first bit available
 			 * for private allocation by other entities
===== include/linux/fs.h 1.274 vs edited =====
--- 1.274/include/linux/fs.h	Tue Sep 23 06:16:30 2003
+++ edited/include/linux/fs.h	Mon Oct 13 10:04:04 2003
@@ -81,7 +81,7 @@
 #define READ 0
 #define WRITE 1
 #define READA 2		/* read-ahead  - don't block if no resources */
-#define SPECIAL 4	/* For non-blockdevice requests in request queue */
+#define WRITEBARRIER	5	/* 1st bit, write, 3rd barrier */
 
 #define SEL_IN		1
 #define SEL_OUT		2
===== include/linux/ide.h 1.75 vs edited =====
--- 1.75/include/linux/ide.h	Sat Sep  6 17:21:14 2003
+++ edited/include/linux/ide.h	Mon Oct 13 09:40:46 2003
@@ -728,6 +728,7 @@
 	unsigned ata_flash	: 1;	/* 1=present, 0=default */
 	unsigned blocked        : 1;	/* 1=powermanagment told us not to do anything, so sleep nicely */
 	unsigned vdma		: 1;	/* 1=doing PIO over DMA 0=doing normal DMA */
+	unsigned doing_barrier	: 1;	/* state, 1=currently doing flush */
 	unsigned addressing;		/*      : 3;
 					 *  0=28-bit
 					 *  1=48-bit
@@ -773,6 +774,7 @@
 	int		forced_lun;	/* if hdxlun was given at boot */
 	int		lun;		/* logical unit */
 	int		crc_count;	/* crc counter to reduce drive speed */
+	char		special_buf[4];	/* private command buffer */
 	struct list_head list;
 	struct device	gendev;
 	struct semaphore gendev_rel_sem;	/* to deal with device release() */

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 39+ messages in thread

* Re: [PATCH] ide write barrier support
  2003-10-13 14:08 [PATCH] ide write barrier support Jens Axboe
@ 2003-10-13 15:23 ` Jeff Garzik
  2003-10-13 15:35   ` Jens Axboe
  2003-10-13 22:39 ` Matthias Andree
                   ` (3 subsequent siblings)
  4 siblings, 1 reply; 39+ messages in thread
From: Jeff Garzik @ 2003-10-13 15:23 UTC (permalink / raw)
  To: Jens Axboe; +Cc: Linux Kernel

On Mon, Oct 13, 2003 at 04:08:58PM +0200, Jens Axboe wrote:
> +/*
> + * preempt pending requests, and store this cache flush for immediate
> + * execution
> + */
> +static struct request *ide_queue_flush_cmd(ide_drive_t *drive,
> +					   struct request *rq, int post)
> +{
> +	struct request *flush_rq = &HWGROUP(drive)->wrq;
> +
> +	blkdev_dequeue_request(rq);
> +
> +	memset(drive->special_buf, 0, sizeof(drive->special_buf));
> +
> +	ide_init_drive_cmd(flush_rq);
> +
> +	flush_rq->buffer = drive->special_buf;
> +	flush_rq->special = rq;
> +	flush_rq->buffer[0] = WIN_FLUSH_CACHE;
> +
> +	if (drive->id->cfs_enable_2 & 0x2400)
> +		flush_rq->buffer[0] = WIN_FLUSH_CACHE_EXT;
> +
> +	if (!post) {
> +		drive->doing_barrier = 1;
> +		flush_rq->flags |= REQ_BAR_PREFLUSH;
> +	} else
> +		flush_rq->flags |= REQ_BAR_POSTFLUSH;
> +
> +	flush_rq->flags |= REQ_STARTED;
> +	list_add(&flush_rq->queuelist, &drive->queue->queue_head);
> +	return flush_rq;
> +}

AFAICS you're missing some code that could be a major data corrupter:

FLUSH CACHE [EXT] may return before it's complete.  You need to create
an issue loop, that does FLUSH CACHE [EXT] and reads the result.  If the
result indicates the flush cache was partial, then you need to re-issue
the flush.  Lather, rinse, repeat until flush cache indicates all data
is really flushed.

	Jeff




^ permalink raw reply	[flat|nested] 39+ messages in thread

* Re: [PATCH] ide write barrier support
  2003-10-13 15:23 ` Jeff Garzik
@ 2003-10-13 15:35   ` Jens Axboe
  2003-10-13 15:37     ` Jens Axboe
  0 siblings, 1 reply; 39+ messages in thread
From: Jens Axboe @ 2003-10-13 15:35 UTC (permalink / raw)
  To: Jeff Garzik; +Cc: Linux Kernel

On Mon, Oct 13 2003, Jeff Garzik wrote:
> On Mon, Oct 13, 2003 at 04:08:58PM +0200, Jens Axboe wrote:
> > +/*
> > + * preempt pending requests, and store this cache flush for immediate
> > + * execution
> > + */
> > +static struct request *ide_queue_flush_cmd(ide_drive_t *drive,
> > +					   struct request *rq, int post)
> > +{
> > +	struct request *flush_rq = &HWGROUP(drive)->wrq;
> > +
> > +	blkdev_dequeue_request(rq);
> > +
> > +	memset(drive->special_buf, 0, sizeof(drive->special_buf));
> > +
> > +	ide_init_drive_cmd(flush_rq);
> > +
> > +	flush_rq->buffer = drive->special_buf;
> > +	flush_rq->special = rq;
> > +	flush_rq->buffer[0] = WIN_FLUSH_CACHE;
> > +
> > +	if (drive->id->cfs_enable_2 & 0x2400)
> > +		flush_rq->buffer[0] = WIN_FLUSH_CACHE_EXT;
> > +
> > +	if (!post) {
> > +		drive->doing_barrier = 1;
> > +		flush_rq->flags |= REQ_BAR_PREFLUSH;
> > +	} else
> > +		flush_rq->flags |= REQ_BAR_POSTFLUSH;
> > +
> > +	flush_rq->flags |= REQ_STARTED;
> > +	list_add(&flush_rq->queuelist, &drive->queue->queue_head);
> > +	return flush_rq;
> > +}
> 
> AFAICS you're missing some code that could be a major data corrupter:
> 
> FLUSH CACHE [EXT] may return before it's complete.  You need to create
> an issue loop, that does FLUSH CACHE [EXT] and reads the result.  If the
> result indicates the flush cache was partial, then you need to re-issue
> the flush.  Lather, rinse, repeat until flush cache indicates all data
> is really flushed.

It looks like you are right, at least the wording has changed since ata5
that states that BSY must remain set until all data has been flushed out
(or error occurs). Which seems sane.

Only in the error case can I see this making sense, with partial
flushes. It needs fixing (the error case too), but I'd hardly call that
a major data corrupter. Not for the general case, we'd have to do really
badly to risk corrupting data when compared to how 2.4 and 2.6 with
journalling works now.

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 39+ messages in thread

* Re: [PATCH] ide write barrier support
  2003-10-13 15:35   ` Jens Axboe
@ 2003-10-13 15:37     ` Jens Axboe
  0 siblings, 0 replies; 39+ messages in thread
From: Jens Axboe @ 2003-10-13 15:37 UTC (permalink / raw)
  To: Jeff Garzik; +Cc: Linux Kernel

On Mon, Oct 13 2003, Jens Axboe wrote:
> On Mon, Oct 13 2003, Jeff Garzik wrote:
> > On Mon, Oct 13, 2003 at 04:08:58PM +0200, Jens Axboe wrote:
> > > +/*
> > > + * preempt pending requests, and store this cache flush for immediate
> > > + * execution
> > > + */
> > > +static struct request *ide_queue_flush_cmd(ide_drive_t *drive,
> > > +					   struct request *rq, int post)
> > > +{
> > > +	struct request *flush_rq = &HWGROUP(drive)->wrq;
> > > +
> > > +	blkdev_dequeue_request(rq);
> > > +
> > > +	memset(drive->special_buf, 0, sizeof(drive->special_buf));
> > > +
> > > +	ide_init_drive_cmd(flush_rq);
> > > +
> > > +	flush_rq->buffer = drive->special_buf;
> > > +	flush_rq->special = rq;
> > > +	flush_rq->buffer[0] = WIN_FLUSH_CACHE;
> > > +
> > > +	if (drive->id->cfs_enable_2 & 0x2400)
> > > +		flush_rq->buffer[0] = WIN_FLUSH_CACHE_EXT;
> > > +
> > > +	if (!post) {
> > > +		drive->doing_barrier = 1;
> > > +		flush_rq->flags |= REQ_BAR_PREFLUSH;
> > > +	} else
> > > +		flush_rq->flags |= REQ_BAR_POSTFLUSH;
> > > +
> > > +	flush_rq->flags |= REQ_STARTED;
> > > +	list_add(&flush_rq->queuelist, &drive->queue->queue_head);
> > > +	return flush_rq;
> > > +}
> > 
> > AFAICS you're missing some code that could be a major data corrupter:
> > 
> > FLUSH CACHE [EXT] may return before it's complete.  You need to create
> > an issue loop, that does FLUSH CACHE [EXT] and reads the result.  If the
> > result indicates the flush cache was partial, then you need to re-issue
> > the flush.  Lather, rinse, repeat until flush cache indicates all data
> > is really flushed.
> 
> It looks like you are right, at least the wording has changed since ata5
> that states that BSY must remain set until all data has been flushed out
> (or error occurs). Which seems sane.
> 
> Only in the error case can I see this making sense, with partial
> flushes. It needs fixing (the error case too), but I'd hardly call that
> a major data corrupter. Not for the general case, we'd have to do really
> badly to risk corrupting data when compared to how 2.4 and 2.6 with
> journalling works now.

Newer ata (6, to be precise, the newest I have handy) have the same
wording. Only in error recovery should I take care.

So it's not a major issue, it's a corner case. The error handling isn't
quite complete yet, that's a known issue.

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 39+ messages in thread

* Re: [PATCH] ide write barrier support
  2003-10-13 14:08 [PATCH] ide write barrier support Jens Axboe
  2003-10-13 15:23 ` Jeff Garzik
@ 2003-10-13 22:39 ` Matthias Andree
  2003-10-14  0:16   ` Jeff Garzik
  2003-10-13 23:07 ` Andrew Morton
                   ` (2 subsequent siblings)
  4 siblings, 1 reply; 39+ messages in thread
From: Matthias Andree @ 2003-10-13 22:39 UTC (permalink / raw)
  To: Linux Kernel

On Mon, 13 Oct 2003, Jens Axboe wrote:

> Forward ported and tested today (with the dummy ext3 patch included),
> works for me. Some todo's left, but I thought I'd send it out to gauge
> interest. TODO:
> 
> - Detect write cache setting and only issue SYNC_CACHE if write cache is
>   enabled (not a biggy, all drives ship with it enabled)

Yup, and I disable it on all drives at boot time at the latest.

Is there a status document that lists

- what SCSI drivers support write barriers
  (I'm interested in sym53c8xx_2 if that matters)

- what IDE drivers support write barriers
  (VIA for AMD and Intel for PII/PIII/P4 chip sets here)

- what file systems know how to utilize write barriers (other than
  reiserfs ;-) - what does "dummy ext3 patch" mean?

^ permalink raw reply	[flat|nested] 39+ messages in thread

* Re: [PATCH] ide write barrier support
  2003-10-13 14:08 [PATCH] ide write barrier support Jens Axboe
  2003-10-13 15:23 ` Jeff Garzik
  2003-10-13 22:39 ` Matthias Andree
@ 2003-10-13 23:07 ` Andrew Morton
  2003-10-14  6:48   ` Jens Axboe
  2003-10-15  3:40 ` Greg Stark
  2003-10-20 17:10 ` Daniel Phillips
  4 siblings, 1 reply; 39+ messages in thread
From: Andrew Morton @ 2003-10-13 23:07 UTC (permalink / raw)
  To: Jens Axboe; +Cc: linux-kernel

Jens Axboe <axboe@suse.de> wrote:
>
> Hi,
> 
> Forward ported and tested today (with the dummy ext3 patch included),
> works for me. Some todo's left, but I thought I'd send it out to gauge
> interest. TODO:
> 
> - Detect write cache setting and only issue SYNC_CACHE if write cache is
>   enabled (not a biggy, all drives ship with it enabled)
> 
> - Toggle flush support on hdparm -W0/1
> 
> - Various small bits I can't remember right now
> 

> ...
> +		set_bit(BH_Ordered, &bh->b_state);

We have standard macros for generating standard buffer_head operations, so
this can become

	set_buffer_ordered(bh);

See appended patch.


> --- 1.40/fs/jbd/commit.c	Fri Aug  1 12:02:20 2003
> +++ edited/fs/jbd/commit.c	Mon Oct 13 10:17:28 2003
> @@ -474,7 +474,9 @@
>  				clear_buffer_dirty(bh);
>  				set_buffer_uptodate(bh);
>  				bh->b_end_io = journal_end_buffer_io_sync;
> +				set_bit(BH_Ordered, &bh->b_state);
>  				submit_bh(WRITE, bh);
> +				clear_bit(BH_Ordered, &bh->b_state);
>  			}
>  			cond_resched();

Why does the ordering go here?  I'd have thought that we only need to
enforce ordering around the commit block.

Touching the bh here after submitting it may be racy: may need to take an
extra ref against the bh to prevent it from disappearing.  I need to look
at it more closely.

> @@ -344,6 +348,8 @@
>  	unsigned long		seg_boundary_mask;
>  	unsigned int		dma_alignment;
>  
> +	unsigned short		ordered;
> +
>  	struct blk_queue_tag	*queue_tags;
>  
>  	atomic_t		refcnt;

shorts-in-structs worry me.  If the CPU implements a write-to-short as a
word-sized RMW and the compiler decides to align or pack the short into a
less-than-word-sized storage space then a write-to-short could stomp on a
neighbouring member.

I doubt if it can happen, but if so, I'd be interested in knowing what guarantees it.

> ...
>  	unsigned vdma		: 1;	/* 1=doing PIO over DMA 0=doing normal DMA */
> +	unsigned doing_barrier	: 1;	/* state, 1=currently doing flush */

Similarly, I suspect that bitfields like this need locking.  If the CPU
implements a write-to-bitfield as a non-buslocked RMW it can stomp on
neighbouring bitfields in the same word.


 25-akpm/fs/buffer.c                 |    4 ++--
 25-akpm/fs/jbd/commit.c             |    4 ++--
 25-akpm/include/linux/buffer_head.h |    3 ++-
 3 files changed, 6 insertions(+), 5 deletions(-)

diff -puN fs/buffer.c~ide-write-barrier-support-tidies fs/buffer.c
--- 25/fs/buffer.c~ide-write-barrier-support-tidies	Mon Oct 13 15:53:56 2003
+++ 25-akpm/fs/buffer.c	Mon Oct 13 15:53:56 2003
@@ -2655,7 +2655,7 @@ int submit_bh(int rw, struct buffer_head
 	BUG_ON(!bh->b_end_io);
 
 	if (rw == WRITEBARRIER) {
-		set_bit(BH_Ordered, &bh->b_state);
+		set_buffer_ordered(bh);
 		rw = WRITE;
 	}
 
@@ -2666,7 +2666,7 @@ int submit_bh(int rw, struct buffer_head
 	if (rw == READ && buffer_dirty(bh))
 		buffer_error();
 
-	if (test_bit(BH_Ordered, &bh->b_state) && (rw == WRITE))
+	if (buffer_ordered(bh) && (rw == WRITE))
 		rw = WRITEBARRIER;
 
 	/* Only clear out a write error when rewriting */
diff -puN fs/jbd/commit.c~ide-write-barrier-support-tidies fs/jbd/commit.c
--- 25/fs/jbd/commit.c~ide-write-barrier-support-tidies	Mon Oct 13 15:53:56 2003
+++ 25-akpm/fs/jbd/commit.c	Mon Oct 13 15:53:56 2003
@@ -474,9 +474,9 @@ start_journal_io:
 				clear_buffer_dirty(bh);
 				set_buffer_uptodate(bh);
 				bh->b_end_io = journal_end_buffer_io_sync;
-				set_bit(BH_Ordered, &bh->b_state);
+				set_buffer_ordered(bh);
 				submit_bh(WRITE, bh);
-				clear_bit(BH_Ordered, &bh->b_state);
+				clear_buffer_ordered(bh);
 			}
 			cond_resched();
 
diff -puN include/linux/buffer_head.h~ide-write-barrier-support-tidies include/linux/buffer_head.h
--- 25/include/linux/buffer_head.h~ide-write-barrier-support-tidies	Mon Oct 13 15:53:56 2003
+++ 25-akpm/include/linux/buffer_head.h	Mon Oct 13 15:53:56 2003
@@ -118,7 +118,8 @@ BUFFER_FNS(Async_Read, async_read)
 BUFFER_FNS(Async_Write, async_write)
 BUFFER_FNS(Delay, delay)
 BUFFER_FNS(Boundary, boundary)
-BUFFER_FNS(Write_EIO,write_io_error)
+BUFFER_FNS(Write_EIO, write_io_error)
+BUFFER_FNS(Ordered, ordered)
 
 #define bh_offset(bh)		((unsigned long)(bh)->b_data & ~PAGE_MASK)
 #define touch_buffer(bh)	mark_page_accessed(bh->b_page)

_


^ permalink raw reply	[flat|nested] 39+ messages in thread

* Re: [PATCH] ide write barrier support
  2003-10-13 22:39 ` Matthias Andree
@ 2003-10-14  0:16   ` Jeff Garzik
  2003-10-16 10:36     ` Jens Axboe
  0 siblings, 1 reply; 39+ messages in thread
From: Jeff Garzik @ 2003-10-14  0:16 UTC (permalink / raw)
  To: Matthias Andree; +Cc: Linux Kernel

Matthias Andree wrote:
> On Mon, 13 Oct 2003, Jens Axboe wrote:
> 
> 
>>Forward ported and tested today (with the dummy ext3 patch included),
>>works for me. Some todo's left, but I thought I'd send it out to gauge
>>interest. TODO:
>>
>>- Detect write cache setting and only issue SYNC_CACHE if write cache is
>>  enabled (not a biggy, all drives ship with it enabled)
> 
> 
> Yup, and I disable it on all drives at boot time at the latest.
> 
> Is there a status document that lists
> 
> - what SCSI drivers support write barriers
>   (I'm interested in sym53c8xx_2 if that matters)
> 
> - what IDE drivers support write barriers
>   (VIA for AMD and Intel for PII/PIII/P4 chip sets here)

The device is the entity that does, or does not, support flush-cache... 
  All IDE chipsets support flush-cache... it's just another IDE command.

	Jeff




^ permalink raw reply	[flat|nested] 39+ messages in thread

* Re: [PATCH] ide write barrier support
  2003-10-13 23:07 ` Andrew Morton
@ 2003-10-14  6:48   ` Jens Axboe
  0 siblings, 0 replies; 39+ messages in thread
From: Jens Axboe @ 2003-10-14  6:48 UTC (permalink / raw)
  To: Andrew Morton; +Cc: linux-kernel

On Mon, Oct 13 2003, Andrew Morton wrote:
> Jens Axboe <axboe@suse.de> wrote:
> >
> > Hi,
> > 
> > Forward ported and tested today (with the dummy ext3 patch included),
> > works for me. Some todo's left, but I thought I'd send it out to gauge
> > interest. TODO:
> > 
> > - Detect write cache setting and only issue SYNC_CACHE if write cache is
> >   enabled (not a biggy, all drives ship with it enabled)
> > 
> > - Toggle flush support on hdparm -W0/1
> > 
> > - Various small bits I can't remember right now
> > 
> 
> > ...
> > +		set_bit(BH_Ordered, &bh->b_state);
> 
> We have standard macros for generating standard buffer_head operations, so
> this can become
> 
> 	set_buffer_ordered(bh);
> 
> See appended patch.

Yes thanks.

> > --- 1.40/fs/jbd/commit.c	Fri Aug  1 12:02:20 2003
> > +++ edited/fs/jbd/commit.c	Mon Oct 13 10:17:28 2003
> > @@ -474,7 +474,9 @@
> >  				clear_buffer_dirty(bh);
> >  				set_buffer_uptodate(bh);
> >  				bh->b_end_io = journal_end_buffer_io_sync;
> > +				set_bit(BH_Ordered, &bh->b_state);
> >  				submit_bh(WRITE, bh);
> > +				clear_bit(BH_Ordered, &bh->b_state);
> >  			}
> >  			cond_resched();
> 
> Why does the ordering go here?  I'd have thought that we only need to
> enforce ordering around the commit block.

Yes only for the commit block, this is just left-over from stress
testing.

> Touching the bh here after submitting it may be racy: may need to take an
> extra ref against the bh to prevent it from disappearing.  I need to look
> at it more closely.

Indeed, it needs a get/put_bh. Thanks!

> > @@ -344,6 +348,8 @@
> >  	unsigned long		seg_boundary_mask;
> >  	unsigned int		dma_alignment;
> >  
> > +	unsigned short		ordered;
> > +
> >  	struct blk_queue_tag	*queue_tags;
> >  
> >  	atomic_t		refcnt;
> 
> shorts-in-structs worry me.  If the CPU implements a write-to-short as
> a word-sized RMW and the compiler decides to align or pack the short
> into a less-than-wored-sized storage space then a write-to-short could
> stomp on a neighbouring member.
> 
> I doubt if it can happen, but if so, I'd be interested in knowing what
> guarantees it.

None of the surrounding members are frequently accessed, surely we
should be ok? But I agree, I only ever used shorts in structs when it
helps the alignment. I've made the change locally.

> > ...
> >  	unsigned vdma		: 1;	/* 1=doing PIO over DMA 0=doing normal DMA */
> > +	unsigned doing_barrier	: 1;	/* state, 1=currently doing flush */
> 
> Similarly, I suspect that bitfields like this need locking.  If the CPU
> implements a write-to-bitfield as a non-buslocked RMW it can stomp on
> neighbouring bitfields in the same word.

It is locked down with ide_lock. Other members may be more problematic,
it might not be a silly idea to bit-ify these fields.

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 39+ messages in thread

* Re: [PATCH] ide write barrier support
  2003-10-13 14:08 [PATCH] ide write barrier support Jens Axboe
                   ` (2 preceding siblings ...)
  2003-10-13 23:07 ` Andrew Morton
@ 2003-10-15  3:40 ` Greg Stark
  2003-10-16  7:10   ` Jens Axboe
  2003-10-20 17:10 ` Daniel Phillips
  4 siblings, 1 reply; 39+ messages in thread
From: Greg Stark @ 2003-10-15  3:40 UTC (permalink / raw)
  To: Jens Axboe; +Cc: Linux Kernel


Jens Axboe <axboe@suse.de> writes:

> Hi,
> 
> Forward ported and tested today (with the dummy ext3 patch included),
> works for me. Some todo's left, but I thought I'd send it out to gauge
> interest. TODO:


Is there a user-space interface planned for this? 


One possibility may be just to hang it off fsync(2) so fsync doesn't return
until all the buffers it flushed are actually synced to disk. That's its
documented semantics anyways.


There's also the case of files opened with O_SYNC. Would inserting a write
barrier after every write to such a file destroy performance?


-- 
greg


^ permalink raw reply	[flat|nested] 39+ messages in thread

* Re: [PATCH] ide write barrier support
  2003-10-15  3:40 ` Greg Stark
@ 2003-10-16  7:10   ` Jens Axboe
  0 siblings, 0 replies; 39+ messages in thread
From: Jens Axboe @ 2003-10-16  7:10 UTC (permalink / raw)
  To: Greg Stark; +Cc: Linux Kernel

On Tue, Oct 14 2003, Greg Stark wrote:
> 
> Jens Axboe <axboe@suse.de> writes:
> 
> > Hi,
> > 
> > Forward ported and tested today (with the dummy ext3 patch included),
> > works for me. Some todo's left, but I thought I'd send it out to gauge
> > interest. TODO:
> 
> 
> Is there a user-space interface planned for this? 

I don't have one planned.

> One possibility may be just to hang it off fsync(2) so fsync doesn't
> return until until all the buffers it flushed are actually synced to
> disk. That's its documented semantics anyways.

Makes sense, indeed.

> There's also the case of files opened with O_SYNC. Would inserting a
> write barrier after every write to such a file destroy performance?

If it's mainly sequential io, then no it won't destroy performance. It
will be lower than without the cache flush of course.

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 39+ messages in thread

* Re: [PATCH] ide write barrier support
  2003-10-14  0:16   ` Jeff Garzik
@ 2003-10-16 10:36     ` Jens Axboe
  2003-10-16 10:46       ` Jeff Garzik
  0 siblings, 1 reply; 39+ messages in thread
From: Jens Axboe @ 2003-10-16 10:36 UTC (permalink / raw)
  To: Jeff Garzik; +Cc: Matthias Andree, Linux Kernel

On Mon, Oct 13 2003, Jeff Garzik wrote:
> Matthias Andree wrote:
> >On Mon, 13 Oct 2003, Jens Axboe wrote:
> >
> >
> >>Forward ported and tested today (with the dummy ext3 patch included),
> >>works for me. Some todo's left, but I thought I'd send it out to gauge
> >>interest. TODO:
> >>
> >>- Detect write cache setting and only issue SYNC_CACHE if write cache is
> >> enabled (not a biggy, all drives ship with it enabled)
> >
> >
> >Yup, and I disable it on all drives at boot time at the latest.
> >
> >Is there a status document that lists
> >
> >- what SCSI drivers support write barriers
> >  (I'm interested in sym53c8xx_2 if that matters)
> >
> >- what IDE drivers support write barriers
> >  (VIA for AMD and Intel for PII/PIII/P4 chip sets here)
> 
> The device is the entity that does, or does not, support flush-cache... 
>  All IDE chipsets support flush-cache... it's just another IDE command.

Well drivers need to support it, too. IDE is supported, and that covers
all devices that support it. It's not implemented on SCSI at all right
now, that's coming up.

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 39+ messages in thread

* Re: [PATCH] ide write barrier support
  2003-10-16 10:36     ` Jens Axboe
@ 2003-10-16 10:46       ` Jeff Garzik
  2003-10-16 10:48         ` Jens Axboe
  0 siblings, 1 reply; 39+ messages in thread
From: Jeff Garzik @ 2003-10-16 10:46 UTC (permalink / raw)
  To: Jens Axboe; +Cc: Matthias Andree, Linux Kernel

Jens Axboe wrote:
> On Mon, Oct 13 2003, Jeff Garzik wrote:
> 
>>Matthias Andree wrote:
>>
>>>On Mon, 13 Oct 2003, Jens Axboe wrote:
>>>
>>>
>>>
>>>>Forward ported and tested today (with the dummy ext3 patch included),
>>>>works for me. Some todo's left, but I thought I'd send it out to gauge
>>>>interest. TODO:
>>>>
>>>>- Detect write cache setting and only issue SYNC_CACHE if write cache is
>>>>enabled (not a biggy, all drives ship with it enabled)
>>>
>>>
>>>Yup, and I disable it on all drives at boot time at the latest.
>>>
>>>Is there a status document that lists
>>>
>>>- what SCSI drivers support write barriers
>>> (I'm interested in sym53c8xx_2 if that matters)
>>>
>>>- what IDE drivers support write barriers
>>> (VIA for AMD and Intel for PII/PIII/P4 chip sets here)
>>
>>The device is the entity that does, or does not, support flush-cache... 
>> All IDE chipsets support flush-cache... it's just another IDE command.
> 
> 
> Well drivers need to support it, too. IDE is supported, and that covers
> all devices that support it. It's not implemented on SCSI at all right
> now, that's coming up.

I do not see the need for write barrier support in 
drivers/ide/pci/{via82cxxx,piix,}.c, which is the question the original 
poster was asking.

Low-level chipset drivers should _not_ need to support it, the subsystem 
can do that.

	Jeff




^ permalink raw reply	[flat|nested] 39+ messages in thread

* Re: [PATCH] ide write barrier support
  2003-10-16 10:46       ` Jeff Garzik
@ 2003-10-16 10:48         ` Jens Axboe
  0 siblings, 0 replies; 39+ messages in thread
From: Jens Axboe @ 2003-10-16 10:48 UTC (permalink / raw)
  To: Jeff Garzik; +Cc: Matthias Andree, Linux Kernel

On Thu, Oct 16 2003, Jeff Garzik wrote:
> Jens Axboe wrote:
> >On Mon, Oct 13 2003, Jeff Garzik wrote:
> >
> >>Matthias Andree wrote:
> >>
> >>>On Mon, 13 Oct 2003, Jens Axboe wrote:
> >>>
> >>>
> >>>
> >>>>Forward ported and tested today (with the dummy ext3 patch included),
> >>>>works for me. Some todo's left, but I thought I'd send it out to gauge
> >>>>interest. TODO:
> >>>>
> >>>>- Detect write cache setting and only issue SYNC_CACHE if write cache is
> >>>>enabled (not a biggy, all drives ship with it enabled)
> >>>
> >>>
> >>>Yup, and I disable it on all drives at boot time at the latest.
> >>>
> >>>Is there a status document that lists
> >>>
> >>>- what SCSI drivers support write barriers
> >>>(I'm interested in sym53c8xx_2 if that matters)
> >>>
> >>>- what IDE drivers support write barriers
> >>>(VIA for AMD and Intel for PII/PIII/P4 chip sets here)
> >>
> >>The device is the entity that does, or does not, support flush-cache... 
> >>All IDE chipsets support flush-cache... it's just another IDE command.
> >
> >
> >Well drivers need to support it, too. IDE is supported, and that covers
> >all devices that support it. It's not implemented on SCSI at all right
> >now, that's coming up.
> 
> I do not see the need for write barrier support in 
> drivers/ide/pci/{via82cxxx,piix,}.c, which is the question the original 
> poster was asking.
> 
> Low-level chipset drivers should _not_ need to support it, the subsystem 
> can do that.

Right, nothing for IDE - as I wrote, IDE is supported and that covers
all devices. For SCSI, low level device drivers _do_ need to support it,
it's usually a two-liner to add support setting the ordered tag bit.

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 39+ messages in thread

* Re: [PATCH] ide write barrier support
  2003-10-13 14:08 [PATCH] ide write barrier support Jens Axboe
                   ` (3 preceding siblings ...)
  2003-10-15  3:40 ` Greg Stark
@ 2003-10-20 17:10 ` Daniel Phillips
  2003-10-20 19:56   ` Jens Axboe
  4 siblings, 1 reply; 39+ messages in thread
From: Daniel Phillips @ 2003-10-20 17:10 UTC (permalink / raw)
  To: Jens Axboe, Linux Kernel

Hi Jens,

On Monday 13 October 2003 16:08, Jens Axboe wrote:
> Forward ported and tested today (with the dummy ext3 patch included),
> works for me. Some todo's left, but I thought I'd send it out to gauge
> interest.

This is highly interesting of course, but is it suitable for submission during 
the stability freeze?  There is no correctness issue so long as no filesystem 
in mainline sets the BIO_RW_BARRIER bit, which appears to be the case.  
Therefore this is really a performance patch that introduces a new internal 
API.

It seems to me there are a few unresolved issues with the barrier API.  It 
needs to be clearly stated that only write barriers are supported, not read 
or read/write barriers, if that is in fact the intention.  Assuming it is, 
then BIOs with read barriers need to be failed.

The current BIO API provides no way to express a rw barrier, only read 
barriers and write barriers (the combination of direction bit and barrier bit 
indicates the barrier type).  This is minor, but how nice it would be 
if the API was either orthogonal or there was a clear explanation of why RW 
barriers never make sense.  And if they don't, why read barriers do make 
sense.  Another possible wart is that the API doesn't allow for a read 
barrier carried by a write BIO or a write barrier carried by a read BIO.
From a practical point of view the only immediate use we have for barriers is 
to accelerate journal writes and everything else comes under the heading of 
R&D.  It would help if the code clearly reflected that modest goal.

The BIO barrier scheme doesn't mesh properly with your proposed 
QUEUE_ORDERED_* scheme.  It seems to me that what you want is just 
QUEUE_ORDERED_NONE and QUEUE_ORDERED_WRITE.  Is there any case where the 
distinction between a tag-based implementation versus a flush matters to high 
level code?

Also, the blk_queue_ordered function isn't a sufficient interface to enable 
the functionality at a high level, a filesystem also needs a way to know 
whether barriers are supported or not, short of just submitting a barrier 
request and seeing if it fails.

The high level interface needs to be able to handle stacked devices, i.e., 
device mapper, but not just device mapper.  Barriers have to be supported by 
all the devices in the stack, not just the top or bottom one.  I don't have a 
concrete suggestion on what the interface should be just now.

The point of this is, there still remain a number of open issues with this 
patch, no doubt more than just the ones I touched on.  Though it is clearly 
headed in the right direction, I'd suggest holding off during the stability 
freeze and taking the needed time to get it right.

Regards,

Daniel


^ permalink raw reply	[flat|nested] 39+ messages in thread

* Re: [PATCH] ide write barrier support
  2003-10-20 17:10 ` Daniel Phillips
@ 2003-10-20 19:56   ` Jens Axboe
  2003-10-20 23:46     ` Daniel Phillips
  0 siblings, 1 reply; 39+ messages in thread
From: Jens Axboe @ 2003-10-20 19:56 UTC (permalink / raw)
  To: Daniel Phillips; +Cc: Linux Kernel

On Mon, Oct 20 2003, Daniel Phillips wrote:
> Hi Jens,
> 
> On Monday 13 October 2003 16:08, Jens Axboe wrote:
> > Forward ported and tested today (with the dummy ext3 patch included),
> > works for me. Some todo's left, but I thought I'd send it out to gauge
> > interest.
> 
> This is highly interesting of course, but is it suitable for
> submission during the stability freeze?  There is no correctness issue
> so long as no filesystem in mainline sets the BIO_RW_BARRIER bit,
> which appears to be the case.  Therefore this is really a performance
> patch that introduces a new internal API.

I've said this to you 3 times now I think, I don't think you understand
what I mean with 'correctness issue'. There IS a correctness issue, in
that drives ship with write back caching enabled. The fs assumes that
once wait_on_buffer() returns, the data is on disk. Which is false, can
remain false for quite a long number of seconds.

So let me state again that this is NOT a performance patch. I've never
considered it a performance patch, there are no performance gains with
the patch I posted. It is purely a data integrity issue. I don't know
how I can state this any clearer than I already have...

There are possibilities for performance gains in the _future_, that's
just an added _future_ bonus.

> It seems to me there are a few unresolved issues with the barrier API.  It 

I agree.

> needs to be clearly stated that only write barriers are supported, not read 
> or read/write barriers, if that is in fact the intention.  Assuming it is, 
> then BIOs with read barriers need to be failed.

read barriers can be just as easily supported, I still think that the
notion of read/write barriers is something you are inventing that I
don't see any practical use for. So I wont expand on that at all. From
my point of view, a read barrier is simply an io scheduler barrier. The
drive/driver never sees that bit. But it is 100% expressable with the
current logic.

> The current BIO API provides no way to express a rw barrier, only read
> barriers and write barriers (the combination of direction bit and
> barrier bit indicates the barrier type).  This is minor, but how
> nice it would be if the API was either orthogonal or there was a clear
> explanation of why RW barriers never make sense.  And if they don't,
> why read barriers do make sense.  Another possible wart is that the
> API doesn't allow for a read barrier carried by a write BIO or a write
> barrier carried by a read BIO.  From a practical point of view the
> only immediate use we have for barriers is to accelerate journal
> writes and everything else comes under the heading of R&D.  It would
> help if the code clearly reflected that modest goal.

Please come up with at least pseudo-rational exampes for why this would
ever be needed, I refuse to design API's based on loose whims or ideas.
The API is "designed" for the practical use of today and what I assumed
would be useful within reason, that's as far as I think it makes sense
to go. To bend the API for a doctored example such as 'rw barrier' is
stupid imho.

> The BIO barrier scheme doesn't mesh properly with your proposed
> QUEUE_ORDERED_* scheme.  It seems to me that what you want is just
> QUEUE_ORDERED_NONE and QUEUE_ORDERED_WRITE.  Is there any case where
> the distinction between a tag-based implementation versus a flush
> matters to high level code?

The difference comes from the early reiser implementation in 2.4, I'm
sure Chris can expand on that. I think it's long gone though, and it's
just an oversight on my part that the ORDERED_TAG is still there. It
will go.

> Also, the blk_queue_ordered function isn't a sufficient interface to
> enable the functionality at a high level, a filesystem also needs a
> way to know whether barriers are supported or not, short of just
> submitting a barrier request and seeing if it fails.

Why? Sometimes the only reliable way to detect whether you can support
barrier writes or not is to issue one. So I can't really help you there.

> The high level interface needs to be able to handle stacked devices,
> i.e., device mapper, but not just device mapper.  Barriers have to be
> supported by all the devices in the stack, not just the top or bottom
> one.  I don't have a concrete suggestion on what the interface should
> be just now.

I completely agree. And I'm very open to patches correcting that issue,
thanks.

> The point of this is, there still remain a number of open issues with
> this patch, no doubt more than just the ones I touched on.  Though it
> is clearly headed in the right direction, I'd suggest holding off
> during the stability freeze and taking the needed time to get it
> right.

You touched on 1 valid point, the md/dm issue. That goes doubly for the
2.4 version (that we don't need to care more about). And I agree with
you there, it needs to be done. And feel free to knock yourself out.
It's not a trivial issue.

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 39+ messages in thread

* Re: [PATCH] ide write barrier support
  2003-10-20 19:56   ` Jens Axboe
@ 2003-10-20 23:46     ` Daniel Phillips
  2003-10-21  5:40       ` Jens Axboe
  0 siblings, 1 reply; 39+ messages in thread
From: Daniel Phillips @ 2003-10-20 23:46 UTC (permalink / raw)
  To: Jens Axboe; +Cc: Linux Kernel

On Monday 20 October 2003 21:56, Jens Axboe wrote:
> There IS a correctness issue, in
> that drives ship with write back caching enabled. The fs assumes that
> once wait_on_buffer() returns, the data is on disk. Which is false, can
> remain false for quite a long number of seconds.

Maybe what you're saying is, there are only two ways to deal with IDE drives 
with non-disablable writeback cache:

  1) flush the cache on every write
  2) Implement write barriers, add them to all the journalling filesystems,
     and flush only on the write barrier

and (1) is just too slow.  Correct?

Regards,

Daniel


^ permalink raw reply	[flat|nested] 39+ messages in thread

* Re: [PATCH] ide write barrier support
  2003-10-20 23:46     ` Daniel Phillips
@ 2003-10-21  5:40       ` Jens Axboe
  2003-10-23 16:22         ` Daniel Phillips
  0 siblings, 1 reply; 39+ messages in thread
From: Jens Axboe @ 2003-10-21  5:40 UTC (permalink / raw)
  To: Daniel Phillips; +Cc: Linux Kernel

On Tue, Oct 21 2003, Daniel Phillips wrote:
> On Monday 20 October 2003 21:56, Jens Axboe wrote:
> > There IS a correctness issue, in
> > that drives ship with write back caching enabled. The fs assumes that
> > once wait_on_buffer() returns, the data is on disk. Which is false, can
> > remain false for quite a long number of seconds.
> 
> Maybe what you're saying is, there are only two ways to deal with IDE drives 
> with non-disablable writeback cache:
> 
>   1) flush the cache on every write
>   2) Implement write barriers, add them to all the journalling filesystems,
>      and flush only on the write barrier
> 
> and (1) is just too slow.  Correct?

Yes, that is exactly what I'm saying. It's not just that, though.
Completely disabling write back caching on IDE drives totally kills
performance typically, they are just not meant to be driven this way.

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 39+ messages in thread

* Re: [PATCH] ide write barrier support
  2003-10-21  5:40       ` Jens Axboe
@ 2003-10-23 16:22         ` Daniel Phillips
  2003-10-23 16:23           ` Jens Axboe
  0 siblings, 1 reply; 39+ messages in thread
From: Daniel Phillips @ 2003-10-23 16:22 UTC (permalink / raw)
  To: Jens Axboe; +Cc: Linux Kernel

On Tuesday 21 October 2003 07:40, Jens Axboe wrote:
> On Tue, Oct 21 2003, Daniel Phillips wrote:
> > Maybe what you're saying is, there are only two ways to deal with IDE
> > drives with non-disablable writeback cache:
> >
> >   1) flush the cache on every write
> >   2) Implement write barriers, add them to all the journalling
> > filesystems, and flush only on the write barrier
> >
> > and (1) is just too slow.  Correct?
>
> Yes, that is exactly what I'm saying. It's not just that, though.
> Completely disabling write back caching on IDE drives totally kills
> performance typically, they are just not meant to be driven this way.

OK, this is still experimental development in the middle of the stability 
freeze in order to fix a performance bug, but that's a tradition anyway so 
let me join in.

I'm specifically interested in working out the issues related to stacked 
virtual devices, and there are many.  Let me start with an easy one.

Consider a multipath virtual device that is doing load balancing and wants to 
handle write barriers efficiently, not just allow the downstream queues to 
drain before allowing new writes.  This device wants to send a write barrier 
to each of the downstream devices, however, we have only one write request to 
carry the barrier bit.  How do you recommend handling this situation?

Regards,

Daniel


^ permalink raw reply	[flat|nested] 39+ messages in thread

* Re: [PATCH] ide write barrier support
  2003-10-23 16:22         ` Daniel Phillips
@ 2003-10-23 16:23           ` Jens Axboe
  2003-10-23 17:20             ` Daniel Phillips
  0 siblings, 1 reply; 39+ messages in thread
From: Jens Axboe @ 2003-10-23 16:23 UTC (permalink / raw)
  To: Daniel Phillips; +Cc: Linux Kernel

On Thu, Oct 23 2003, Daniel Phillips wrote:
> On Tuesday 21 October 2003 07:40, Jens Axboe wrote:
> > On Tue, Oct 21 2003, Daniel Phillips wrote:
> > > Maybe what you're saying is, there are only two ways to deal with IDE
> > > drives with non-disablable writeback cache:
> > >
> > >   1) flush the cache on every write
> > >   2) Implement write barriers, add them to all the journalling
> > > filesystems, and flush only on the write barrier
> > >
> > > and (1) is just too slow.  Correct?
> >
> > Yes, that is exactly what I'm saying. It's not just that, though.
> > Completely disabling write back caching on IDE drives totally kills
> > performance typically, they are just not meant to be driven this way.
> 
> OK, this is still experimental development in the middle of the stability 
> freeze in order to fix a performance bug, but that's a tradition anyway so 
> let me join in.

It's not a performance bug, fer crissakes!! Do you suggest disabling
write back cache on all ide drives? People will scream bloody murder if
we do. So it's a question of data integrity.

> I'm specifically interested in working out the issues related to stacked 
> virtual devices, and there are many.  Let me start with an easy one.
> 
> Consider a multipath virtual device that is doing load balancing and
> wants to handle write barriers efficiently, not just allow the
> downstream queues to drain before allowing new writes.  This device
> wants to send a write barrier to each of the downstream devices,
> however, we have only one write request to carry the barrier bit.  How
> do you recommend handling this situation?

That needs something to hold the state in, and a bio per device. As
they complete, mark them as such. When they all have completed, barrier
is done.

That's just an idea, I'm sure there are other ways. Depending on how
complex it gets, it might not be a bad idea to just let the queues drain
though. I think I'd prefer that approach.

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 39+ messages in thread

* Re: [PATCH] ide write barrier support
  2003-10-23 16:23           ` Jens Axboe
@ 2003-10-23 17:20             ` Daniel Phillips
  2003-10-23 23:21               ` Nick Piggin
  2003-10-24  9:36               ` Helge Hafting
  0 siblings, 2 replies; 39+ messages in thread
From: Daniel Phillips @ 2003-10-23 17:20 UTC (permalink / raw)
  To: Jens Axboe; +Cc: Linux Kernel

On Thursday 23 October 2003 18:23, Jens Axboe wrote:
> On Thu, Oct 23 2003, Daniel Phillips wrote:
> > I'm specifically interested in working out the issues related to stacked
> > virtual devices, and there are many.  Let me start with an easy one.
> >
> > Consider a multipath virtual device that is doing load balancing and
> > wants to handle write barriers efficiently, not just allow the
> > downstream queues to drain before allowing new writes.  This device
> > wants to send a write barrier to each of the downstream devices,
> > however, we have only one write request to carry the barrier bit.  How
> > do you recommend handling this situation?
>
> That needs something to hold the state in, and a bio per device. As
> they complete, mark them as such. When they all have completed, barrier
> is done.
>
> That's just an idea, I'm sure there are other ways. Depending on how
> complex it gets, it might not be a bad idea to just let the queues drain
> though. I think I'd prefer that approach.

These are essentially the same, they both rely on draining the downstream 
queues.  But if we could keep the downstream queues full, bus transfers for 
post-barrier writes will overlap the media transfers for pre-barrier writes, 
which would seem to be worth some extra effort.

To keep the downstream queues full, we must submit write barriers to all the 
downstream devices and not wait for completion.  That is, as soon as a 
barrier is issued to a given downstream device we can start passing through 
post-barrier writes to it.

Assuming this is worth doing, how do we issue N barriers to the downstream 
devices when we have only one incoming barrier write?

Regards,

Daniel


^ permalink raw reply	[flat|nested] 39+ messages in thread

* Re: [PATCH] ide write barrier support
  2003-10-23 17:20             ` Daniel Phillips
@ 2003-10-23 23:21               ` Nick Piggin
  2003-10-26 21:06                 ` Daniel Phillips
  2003-10-24  9:36               ` Helge Hafting
  1 sibling, 1 reply; 39+ messages in thread
From: Nick Piggin @ 2003-10-23 23:21 UTC (permalink / raw)
  To: Daniel Phillips; +Cc: Jens Axboe, Linux Kernel



Daniel Phillips wrote:

>On Thursday 23 October 2003 18:23, Jens Axboe wrote:
>
>>On Thu, Oct 23 2003, Daniel Phillips wrote:
>>
>>>I'm specifically interested in working out the issues related to stacked
>>>virtual devices, and there are many.  Let me start with an easy one.
>>>
>>>Consider a multipath virtual device that is doing load balancing and
>>>wants to handle write barriers efficiently, not just allow the
>>>downstream queues to drain before allowing new writes.  This device
>>>wants to send a write barrier to each of the downstream devices,
>>>however, we have only one write request to carry the barrier bit.  How
>>>do you recommend handling this situation?
>>>
>>That needs something to hold the state in, and a bio per device. As
>>they complete, mark them as such. When they all have completed, barrier
>>is done.
>>
>>That's just an idea, I'm sure there are other ways. Depending on how
>>complex it gets, it might not be a bad idea to just let the queues drain
>>though. I think I'd prefer that approach.
>>
>
>These are essentially the same, they both rely on draining the downstream 
>queues.  But if we could keep the downstream queues full, bus transfers for 
>post-barrier writes will overlap the media transfers for pre-barrier writes, 
>which would seem to be worth some extra effort.
>
>To keep the downstream queues full, we must submit write barriers to all the 
>downstream devices and not wait for completion.  That is, as soon as a 
>barrier is issued to a given downstream device we can start passing through 
>post-barrier writes to it.
>
>Assuming this is worth doing, how do we issue N barriers to the downstream 
>devices when we have only one incoming barrier write?
>

You would do this in the multipath code, wouldn't you?

Anyway, I might be missing something, but I don't think draining the
queue will guarantee that writeback caches will go to permanent storage.



^ permalink raw reply	[flat|nested] 39+ messages in thread

* Re: [PATCH] ide write barrier support
  2003-10-23 17:20             ` Daniel Phillips
  2003-10-23 23:21               ` Nick Piggin
@ 2003-10-24  9:36               ` Helge Hafting
  2003-10-26 15:38                 ` Daniel Phillips
  1 sibling, 1 reply; 39+ messages in thread
From: Helge Hafting @ 2003-10-24  9:36 UTC (permalink / raw)
  To: Daniel Phillips; +Cc: Jens Axboe, Linux Kernel

On Thu, Oct 23, 2003 at 07:20:39PM +0200, Daniel Phillips wrote:
> These are essentially the same, they both rely on draining the downstream 
> queues.  But if we could keep the downstream queues full, bus transfers for 
> post-barrier writes will overlap the media transfers for pre-barrier writes, 
> which would seem to be worth some extra effort.
> 
> To keep the downstream queues full, we must submit write barriers to all the 
> downstream devices and not wait for completion.  That is, as soon as a 
> barrier is issued to a given downstream device we can start passing through 
> post-barrier writes to it.
> 
This approach may fail:

a. Some pre-barrier writes go to all devices
b. barrier is sent to all devices
c. Post-barrier writes go to all devices
d. drive 1 commits all its pre-barrier writes, then
   commits its post-barrier writes.
e. drive 2 is slow and hasn't done the pre-barrier writes yet
f. power is lost - leaving inconsistent devices.

The problem is that drive 1 doesn't know whether drive 2
did the barrier yet.

Helge Hafting

^ permalink raw reply	[flat|nested] 39+ messages in thread

* Re: [PATCH] ide write barrier support
  2003-10-24  9:36               ` Helge Hafting
@ 2003-10-26 15:38                 ` Daniel Phillips
  0 siblings, 0 replies; 39+ messages in thread
From: Daniel Phillips @ 2003-10-26 15:38 UTC (permalink / raw)
  To: Helge Hafting; +Cc: Jens Axboe, Linux Kernel

On Friday 24 October 2003 11:36, Helge Hafting wrote:
> On Thu, Oct 23, 2003 at 07:20:39PM +0200, Daniel Phillips wrote:
> > These are essentially the same, they both rely on draining the downstream
> > queues.  But if we could keep the downstream queues full, bus transfers
> > for post-barrier writes will overlap the media transfers for pre-barrier
> > writes, which would seem to be worth some extra effort.
> >
> > To keep the downstream queues full, we must submit write barriers to all
> > the downstream devices and not wait for completion.  That is, as soon as
> > a barrier is issued to a given downstream device we can start passing
> > through post-barrier writes to it.
>
> This approach may fail:
>
> a. Some pre-barrier writes go to all devices
> b. barrier is sent to all devices
> c. Post-barrier writes go to all devices
> d. drive 1 commits all its pre-barrier writes, then
>    commits its post-barrier writes.
> e. drive 2 is slow and hasn't done the pre-barrier writes yet
> f. power is lost - leaving inconsistent devices.
>
> The problem is that drive 1 doesn't know whether drive 2
> did the barrier yet.

I was originally talking about SCSI multipath where more than one host adapter 
issues commands to the same drive, however this idea works for M adapters 
connected to N disks as well.  Several barriers sent down different paths 
share a count of the number of paths; on receiving a barrier, a driver 
decrements the count and if it is nonzero it blocks; if zero it unblocks the 
other drivers and each driver submits a barrier to its respective device 
before resuming normal processing.

Under balanced load this will keep all the device queues full, and it should 
be clear that there is no hole in this multi-device barrier for a write to 
tunnel through.  This strategy does however require some mechanism that isn't 
currently present in the barrier API.  I agree with Jens that for now the 
easiest thing to do is to block at the point of the multipath virtual device 
and allow the queues to drain.

The real purpose of this line of thinking was to see whether barriers want to 
be a third type of request, distinct from read or write.  This gets away from 
arbitrary tying of the barrier to a specific IO request, which might work out 
ok in some usages but leads to awkward posturing in others.  As a bonus, it 
gets rid of the question of how many bits to reserve for barrier type and 
makes it easy to set aside fields for such a purpose as a multi-queue 
barrier, as described above.  The cost is that the barrier needs to be 
submitted as a separate request, which is not a big deal.

Overall, it's conceptually correct to treat a barrier as a separator rather 
than as a property of some other request.

Regards,

Daniel


^ permalink raw reply	[flat|nested] 39+ messages in thread

* Re: [PATCH] ide write barrier support
  2003-10-23 23:21               ` Nick Piggin
@ 2003-10-26 21:06                 ` Daniel Phillips
  2003-10-27 10:29                   ` Lars Marowsky-Bree
  0 siblings, 1 reply; 39+ messages in thread
From: Daniel Phillips @ 2003-10-26 21:06 UTC (permalink / raw)
  To: Nick Piggin; +Cc: Jens Axboe, Linux Kernel

On Friday 24 October 2003 01:21, Nick Piggin wrote:
> Daniel Phillips wrote:
> > To keep the downstream queues full, we must submit write barriers to all
> > the downstream devices and not wait for completion.  That is, as soon as
> > a barrier is issued to a given downstream device we can start passing
> > through post-barrier writes to it.
> >
> > Assuming this is worth doing, how do we issue N barriers to the downstream
> > devices when we have only one incoming barrier write?
>
> You would do this in the multipath code, wouldn't you?

Not entirely within the multipath virtual device, that's the problem.  If it 
could stay somehow all within one device driver then ok, but since we want to 
build this modularly, as a device mapper target, there are API issues.

> Anyway, I might be missing something, but I don't think draining the
> queue will guarantee that writeback caches will go to permanent storage.

We moved on from the IDE writeback problem a while back, this is about SCSI 
multipath, and the idea is to keep the SCSI device queues full so that 
barrier requests can flow through instead of stalling.

To be honest, after poring through the SCSI docs I'm not sure whether SCSI 
supports the behavior I want, which is for dma transfers on post-barrier 
requests to run in parallel with media transfers of pre-barrier requests.

With SCSI there are two mechanisms that could be used to implement barriers, 
ordered commands and task lists; patches posted to date use the first method.  
The ordered method sets an attribute on a SCSI write command that says "must 
be executed in order submitted" implying that all previously submitted 
commands have to finish before the ordered command begins to execute.  (Note, 
this is not necessarily optimal if the barrier is just supposed to separate 
two groups of writes and the order within the second group doesn't matter.  
Also, it implements a stronger barrier than just read/write, so reads will be 
blocked as well.)  What is not clear to me is whether or not the drive is 
allowed to read the buffer into its cache before the ordered command becomes 
active.  I'd appreciate comments from any SCSI gurus that happen to be 
reading.

Regards,

Daniel


^ permalink raw reply	[flat|nested] 39+ messages in thread

* Re: [PATCH] ide write barrier support
  2003-10-26 21:06                 ` Daniel Phillips
@ 2003-10-27 10:29                   ` Lars Marowsky-Bree
  2003-10-27 21:35                     ` Daniel Phillips
  0 siblings, 1 reply; 39+ messages in thread
From: Lars Marowsky-Bree @ 2003-10-27 10:29 UTC (permalink / raw)
  To: Daniel Phillips, Nick Piggin; +Cc: Jens Axboe, Linux Kernel

On 2003-10-26T23:06:53,
   Daniel Phillips <phillips@arcor.de> said:

> Not entirely within the multipath virtual device, that's the problem.
> If it could stay somehow all within one device driver then ok, but
> since we want to build this modularly, as a device mapper target,
> there are API issues.

Are you seeing problems with the write-ordering properties of
multipathing? If so, what is the issue with handling them in the DM
target once?


Sincerely,
    Lars Marowsky-Brée <lmb@suse.de>

-- 
High Availability & Clustering	      \ ever tried. ever failed. no matter.
SUSE Labs			      | try again. fail again. fail better.
Research & Development, SUSE LINUX AG \ 	-- Samuel Beckett


^ permalink raw reply	[flat|nested] 39+ messages in thread

* Re: [PATCH] ide write barrier support
  2003-10-27 10:29                   ` Lars Marowsky-Bree
@ 2003-10-27 21:35                     ` Daniel Phillips
  0 siblings, 0 replies; 39+ messages in thread
From: Daniel Phillips @ 2003-10-27 21:35 UTC (permalink / raw)
  To: Lars Marowsky-Bree, Nick Piggin; +Cc: Jens Axboe, Linux Kernel

On Monday 27 October 2003 11:29, Lars Marowsky-Bree wrote:
> On 2003-10-26T23:06:53,
>
>    Daniel Phillips <phillips@arcor.de> said:
> > Not entirely within the multipath virtual device, that's the problem.
> > If it could stay somehow all within one device driver then ok, but
> > since we want to build this modularly, as a device mapper target,
> > there are API issues.
>
> Are you seeing problems with the write-ordering properties of
> multipathing? If so, what is the issue with handling them in the DM
> target once?

No, no problem.  A multipath write barrier can be handled by blocking incoming 
writes in the target and waiting for all outstanding writes to complete.  I'd 
prefer to let the write barrier flow down the pipe to the SCSI disk if I 
could, but there are technical challenges.  Mainly I need to know if an 
ordered write is able to read its buffer into the drive's cache while it is 
waiting for preceding commands to complete; if so it would be worth putting 
the effort into developing the flow-through scheme.

Regards,

Daniel


^ permalink raw reply	[flat|nested] 39+ messages in thread

* Re: [PATCH] ide write barrier support
       [not found] <IXzh.61g.5@gated-at.bofh.it>
@ 2003-10-21 19:24 ` Anton Ertl
  0 siblings, 0 replies; 39+ messages in thread
From: Anton Ertl @ 2003-10-21 19:24 UTC (permalink / raw)
  To: linux-kernel

Jens Axboe <axboe@suse.de> writes:
>Completely disabling write back caching on IDE drives totally kills
>performance typically, they are just not meant to be driven this way.

It has a significant performance impact, but I would not call it
"totally kills performance".  In Section 7 of

http://www.complang.tuwien.ac.at/papers/czezatke%26ertl00/

you can find some numbers for ext2 with and without write-back caching
(and for other file systems without write-back caching).  The slowdown
was about a factor 1.5 for an un-tar benchmark and 1 for an rm
benchmark (and of course 1 for the read-dominated tar and find
benchmarks).

IMO the OS should turn off write-back caching on all devices that have
journaling, log-structured or soft-updated file systems mounted,
unless the OS ensures correctness through write barriers, explicit
flushes or somesuch.  I certainly turn off write-back caching on all
drives with ext3 file systems.

So I hope to see write barrier support in the Linux kernel at some
point in the future.

Concerning disk behaviour, I have certainly seen disks (with
write-back caching turned on) that don't write a block for many
seconds, while they write other, later (from the view of the CPU)
blocks.  See

http://www.complang.tuwien.ac.at/anton/hdtest/

- anton
-- 
M. Anton Ertl                    Some things have to be seen to be believed
anton@mips.complang.tuwien.ac.at Most things have to be believed to be seen
http://www.complang.tuwien.ac.at/anton/home.html

^ permalink raw reply	[flat|nested] 39+ messages in thread

* Re: [PATCH] ide write barrier support
  2003-10-17 18:06 ` Jens Axboe
@ 2003-10-21  0:47   ` Matthias Andree
  0 siblings, 0 replies; 39+ messages in thread
From: Matthias Andree @ 2003-10-21  0:47 UTC (permalink / raw)
  To: linux-kernel

On Fri, 17 Oct 2003, Jens Axboe wrote:

> On Fri, Oct 17 2003, Manfred Spraul wrote:
> > Jens wrote:
> > 
> > >The problem is that as far as I can see the best way to make fsync
> > >really work is to make the last write a barrier write. That
> > >automagically gets everything right for you - when the last block goes
> > >to disk, you know the previous ones have already. And when the last
> > >block completes, you know the whole lot is on platter.
> > >
> > Are you sure?
> > What prevents the io scheduler from writing the last block before other 
> > blocks?
> 
> Very sure, the io scheduler will never put the barrier write before
> previously comitted writes. So yes, it will work as described.

If the "last block" is special however (containing a "complete" marker
for instance), the order needs to be:

write1 write2 write3 write4 barrier write-last barrier

This will guarantee write-last will happen after write4.

The drive is free to permute the order of write1 ... write4, no?

Manfred, was that your idea?

-- 
Matthias Andree

Encrypt your mail: my GnuPG key ID is 0x052E7D95

^ permalink raw reply	[flat|nested] 39+ messages in thread

* RE: [PATCH] ide write barrier support
@ 2003-10-17 18:42 Mudama, Eric
  0 siblings, 0 replies; 39+ messages in thread
From: Mudama, Eric @ 2003-10-17 18:42 UTC (permalink / raw)
  To: 'Jens Axboe'; +Cc: 'Greg Stark', Linux Kernel



> -----Original Message-----
> From: Jens Axboe [mailto:axboe@suse.de]
>
> > Luckilly, us drive guys are a bit smarter (if only a bit)...
> 
> Some of you? :)

We're a little bit smarter than the totally braindead possible data order
issues that are allowed in the spec... but only a little bit smarter.

> That's why for IDE I prefer handling it in software. Let the 
> queue drain,
> issue a barrier write, and continue. That works, regardless of drive
> firmware implementations. As long as the spec doesn't make it explicit
> what happens, there's no way I can rely on it.

agreed

The drive will be doing ops as fast as possible, letting the queue go to
zero depth then flushing cache would be the absolute fastest way to do it
for best overall performance.

eric

^ permalink raw reply	[flat|nested] 39+ messages in thread

* Re: [PATCH] ide write barrier support
  2003-10-17 16:07 Mudama, Eric
@ 2003-10-17 18:08 ` Jens Axboe
  0 siblings, 0 replies; 39+ messages in thread
From: Jens Axboe @ 2003-10-17 18:08 UTC (permalink / raw)
  To: Mudama, Eric; +Cc: 'Greg Stark', Linux Kernel

On Fri, Oct 17 2003, Mudama, Eric wrote:
> 
> 
> > -----Original Message-----
> > From: Jens Axboe [mailto:axboe@suse.de]
> >
> > Yes that would be very nice, but unfortunately I think FUA in ATA got
> > defined as not implying ordering (the FUA write would typically go
> > straight to disk, ahead of any in-cache dirty data). Which 
> > makes it less
> > useful, imo.
> 
> None of the TCQ/FUA components of the spec mention ordering.  According to
> the "letter" of the specification, if you issue two queued writes for the
> same LBA, the drive has the choice of which one to do first and which one to
> put on the media first, which is totally broken in common sense land.
> 
> Luckilly, us drive guys are a bit smarter (if only a bit)...

Some of you? :)

> If you issue a FUA write for data already in cache, and you put the FUA
> write onto the media, there's no problem if you discard the cached data that
> you were going to write.
> 
> In drives with a unified cache, they'll always be consistent provided the
> overlapping interface transfers happen in the same order they were
> issued.... this is common sense.
> 
> However, you're right in that non-overlapping cached write data may stay in
> cache a long time, which potentially gives you a very large time hole in
> which your FUA'd data is on the media and your user data is still hangin' in
> the breeze prior to a flush on a very busy drive.

That's why for IDE I prefer handling it in software. Let the queue drain,
issue a barrier write, and continue. That works, regardless of drive
firmware implementations. As long as the spec doesn't make it explicit
what happens, there's no way I can rely on it.

Jens


^ permalink raw reply	[flat|nested] 39+ messages in thread

* Re: [PATCH] ide write barrier support
  2003-10-17 17:59 Manfred Spraul
@ 2003-10-17 18:06 ` Jens Axboe
  2003-10-21  0:47   ` Matthias Andree
  0 siblings, 1 reply; 39+ messages in thread
From: Jens Axboe @ 2003-10-17 18:06 UTC (permalink / raw)
  To: Manfred Spraul; +Cc: linux-kernel

On Fri, Oct 17 2003, Manfred Spraul wrote:
> Jens wrote:
> 
> >The problem is that as far as I can see the best way to make fsync
> >really work is to make the last write a barrier write. That
> >automagically gets everything right for you - when the last block goes
> >to disk, you know the previous ones have already. And when the last
> >block completes, you know the whole lot is on platter.
> >
> Are you sure?
> What prevents the io scheduler from writing the last block before other 
> blocks?

Very sure, the io scheduler will never put the barrier write before
previously committed writes. So yes, it will work as described.

Jens


^ permalink raw reply	[flat|nested] 39+ messages in thread

* Re: [PATCH] ide write barrier support
@ 2003-10-17 17:59 Manfred Spraul
  2003-10-17 18:06 ` Jens Axboe
  0 siblings, 1 reply; 39+ messages in thread
From: Manfred Spraul @ 2003-10-17 17:59 UTC (permalink / raw)
  To: Jens Axboe; +Cc: linux-kernel

Jens wrote:

>The problem is that as far as I can see the best way to make fsync
>really work is to make the last write a barrier write. That
>automagically gets everything right for you - when the last block goes
>to disk, you know the previous ones have already. And when the last
>block completes, you know the whole lot is on platter.
>
Are you sure?
What prevents the io scheduler from writing the last block before other 
blocks?

--
    Manfred


^ permalink raw reply	[flat|nested] 39+ messages in thread

* RE: [PATCH] ide write barrier support
@ 2003-10-17 16:07 Mudama, Eric
  2003-10-17 18:08 ` Jens Axboe
  0 siblings, 1 reply; 39+ messages in thread
From: Mudama, Eric @ 2003-10-17 16:07 UTC (permalink / raw)
  To: 'Jens Axboe'; +Cc: 'Greg Stark', Linux Kernel



> -----Original Message-----
> From: Jens Axboe [mailto:axboe@suse.de]
>
> Yes that would be very nice, but unfortunately I think FUA in ATA got
> defined as not implying ordering (the FUA write would typically go
> straight to disk, ahead of any in-cache dirty data). Which 
> makes it less
> useful, imo.

None of the TCQ/FUA components of the spec mention ordering.  According to
the "letter" of the specification, if you issue two queued writes for the
same LBA, the drive has the choice of which one to do first and which one to
put on the media first, which is totally broken in common sense land.

Luckily, us drive guys are a bit smarter (if only a bit)...

If you issue a FUA write for data already in cache, and you put the FUA
write onto the media, there's no problem if you discard the cached data that
you were going to write.

In drives with a unified cache, they'll always be consistent provided the
overlapping interface transfers happen in the same order they were
issued.... this is common sense.

However, you're right in that non-overlapping cached write data may stay in
cache a long time, which potentially gives you a very large time hole in
which your FUA'd data is on the media and your user data is still hangin' in
the breeze prior to a flush on a very busy drive.

^ permalink raw reply	[flat|nested] 39+ messages in thread

* Re: [PATCH] ide write barrier support
  2003-10-16 20:51 Mudama, Eric
@ 2003-10-17  6:48 ` Jens Axboe
  0 siblings, 0 replies; 39+ messages in thread
From: Jens Axboe @ 2003-10-17  6:48 UTC (permalink / raw)
  To: Mudama, Eric; +Cc: 'Greg Stark', Linux Kernel

On Thu, Oct 16 2003, Mudama, Eric wrote:
> 
> 
> > -----Original Message-----
> > From: Greg Stark
> >
> > Ideally postgres just needs to call some kind of fsync 
> > syscall that guarantees
> > it won't return until all buffers from the file that were 
> > dirty prior to the
> > sync were flushed and the disk was really synced. It's fine 
> > for buffers that
> > were dirtied later to get synced as well, as long as all the 
> > old buffers are
> > all synced.
> 
> This checkpointing doesn't exist in ATA, only in SCSI I think.  You can get
> similar behavior in ATA-7 capable drives (which I don't think are on the
> market yet) by issuing FUA commands.  These will not return good status
> until the data is on the media, and they can be intermingled with other
> cached writes without destroying overall performance.
> 
> If there was some way to define a file as FUA instead of normal, then you'd
> know every write to it would be on the media if the status was good.
> However, you may have committed your journal or whatever and have possibly
> significantly stale data on the drive's cache in the user data area.
> 
> As far as the actual file-system call mechanism to achive this, I have no
> idea... I know very little about linux internals, I just try to answer
> disk-related questions.

Yes that would be very nice, but unfortunately I think FUA in ATA got
defined as not implying ordering (the FUA write would typically go
straight to disk, ahead of any in-cache dirty data). Which makes it less
useful, imo.

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 39+ messages in thread

* Re: [PATCH] ide write barrier support
  2003-10-16 16:51 Mudama, Eric
  2003-10-16 20:43 ` Greg Stark
@ 2003-10-17  6:46 ` Jens Axboe
  1 sibling, 0 replies; 39+ messages in thread
From: Jens Axboe @ 2003-10-17  6:46 UTC (permalink / raw)
  To: Mudama, Eric; +Cc: Greg Stark, Linux Kernel

On Thu, Oct 16 2003, Mudama, Eric wrote:
> 
> On Tue, Oct 14 2003, Greg Stark wrote:
> > Jens Axboe <axboe@suse.de> writes:
> > > There's also the case of files opened with O_SYNC. Would inserting a
> > > write barrier after every write to such a file destroy performance?
> > 
> > If it's mainly sequential io, then no it won't destroy performance. It
> > will be lower than without the cache flush of course.
> 
> If you flush a cache after every command in a sequential write, yes, you'll
> destroy performance.  How much you destroy it is a function of command size
> relative to track size, and the RPM of the drive.

Yes you are right, my logic was a bit backwards. But you don't have to
issue a flush after every write, thankfully. But lets still not forget
that performance means nothing if integrity isn't guaranteed.

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 39+ messages in thread

* Re: [PATCH] ide write barrier support
  2003-10-16 20:43 ` Greg Stark
@ 2003-10-17  6:44   ` Jens Axboe
  0 siblings, 0 replies; 39+ messages in thread
From: Jens Axboe @ 2003-10-17  6:44 UTC (permalink / raw)
  To: Greg Stark; +Cc: Mudama, Eric, Linux Kernel

On Thu, Oct 16 2003, Greg Stark wrote:
> 
> "Mudama, Eric" <eric_mudama@Maxtor.com> writes:
> 
> > It takes us multiple servo wedges to know that we think our write to the
> > media went in the right place, therefore by definition if we didn't already
> > have the next command's data, we've already missed our target location and
> > have to wait a full revolution to put the new data on the media.  Since we
> > can't report good status for the flush until after we're sure the data is
> > down properly, we'll always blow a rev.
> 
> Ok, on further thought. I think a write barrier isn't really what the database
> needs. It seems to be stronger and more resource intensive than what it really
> needs.
> 
> Postgres writes a transaction log. When the client issues a commit postgres
> cannot return until it knows all the writes for the transaction log for that
> transaction have completed.
> 
> Currently it issues an fsync which is already a bit stronger than necessary.
> But a write barrier sounds even stronger. It would block all other disk i/o
> until the fsync completes. This is completely unnecessary, it would prevent
> other transactions from proceeding at all until the commit finished.
> 
> Ideally postgres just needs to call some kind of fsync syscall that guarantees
> it won't return until all buffers from the file that were dirty prior to the
> sync were flushed and the disk was really synced. It's fine for buffers that
> were dirtied later to get synced as well, as long as all the old buffers are
> all synced.

I've been thinking about adding WRITESYNC to do exactly that, and keep
WRITEBARRIER with its current functionality for journalled file
systems. WRITESYNC would be exactly what you describe, it just wont
imply any io scheduler ordering. So a post-flush would be enough to
handle that case.

The problem is that as far as I can see the best way to make fsync
really work is to make the last write a barrier write. That
automagically gets everything right for you - when the last block goes
to disk, you know the previous ones have already. And when the last
block completes, you know the whole lot is on platter. If you were just
using WRITESYNC, you would have to WRITESYNC all blocks in that range
instead of just WRITE WRITE WRITE ... WRITEBARRIER. So the barrier would
still end up being cheaper, unless the fsync just flushes a single page
in which case the WRITESYNC is enough.

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 39+ messages in thread

* RE: [PATCH] ide write barrier support
@ 2003-10-16 20:51 Mudama, Eric
  2003-10-17  6:48 ` Jens Axboe
  0 siblings, 1 reply; 39+ messages in thread
From: Mudama, Eric @ 2003-10-16 20:51 UTC (permalink / raw)
  To: 'Greg Stark'; +Cc: 'Jens Axboe', Linux Kernel



> -----Original Message-----
> From: Greg Stark
>
> Ideally postgres just needs to call some kind of fsync 
> syscall that guarantees
> it won't return until all buffers from the file that were 
> dirty prior to the
> sync were flushed and the disk was really synced. It's fine 
> for buffers that
> were dirtied later to get synced as well, as long as all the 
> old buffers are
> all synced.

This checkpointing doesn't exist in ATA, only in SCSI I think.  You can get
similar behavior in ATA-7 capable drives (which I don't think are on the
market yet) by issuing FUA commands.  These will not return good status
until the data is on the media, and they can be intermingled with other
cached writes without destroying overall performance.

If there was some way to define a file as FUA instead of normal, then you'd
know every write to it would be on the media if the status was good.
However, you may have committed your journal or whatever and have possibly
significantly stale data on the drive's cache in the user data area.

As far as the actual file-system call mechanism to achieve this, I have no
idea... I know very little about linux internals, I just try to answer
disk-related questions.

--eric

^ permalink raw reply	[flat|nested] 39+ messages in thread

* Re: [PATCH] ide write barrier support
  2003-10-16 16:51 Mudama, Eric
@ 2003-10-16 20:43 ` Greg Stark
  2003-10-17  6:44   ` Jens Axboe
  2003-10-17  6:46 ` Jens Axboe
  1 sibling, 1 reply; 39+ messages in thread
From: Greg Stark @ 2003-10-16 20:43 UTC (permalink / raw)
  To: Mudama, Eric; +Cc: 'Jens Axboe', Greg Stark, Linux Kernel


"Mudama, Eric" <eric_mudama@Maxtor.com> writes:

> It takes us multiple servo wedges to know that we think our write to the
> media went in the right place, therefore by definition if we didn't already
> have the next command's data, we've already missed our target location and
> have to wait a full revolution to put the new data on the media.  Since we
> can't report good status for the flush until after we're sure the data is
> down properly, we'll always blow a rev.

Ok, on further thought. I think a write barrier isn't really what the database
needs. It seems to be stronger and more resource intensive than what it really
needs.

Postgres writes a transaction log. When the client issues a commit postgres
cannot return until it knows all the writes for the transaction log for that
transaction have completed.

Currently it issues an fsync which is already a bit stronger than necessary.
But a write barrier sounds even stronger. It would block all other disk i/o
until the fsync completes. This is completely unnecessary, it would prevent
other transactions from proceeding at all until the commit finished.

Ideally postgres just needs to call some kind of fsync syscall that guarantees
it won't return until all buffers from the file that were dirty prior to the
sync were flushed and the disk was really synced. It's fine for buffers that
were dirtied later to get synced as well, as long as all the old buffers are
all synced.

In theory it doesn't even need to force the dirty buffers to be flushed, just
as long as it can keep track of when they are all flushed and then wait until
they're confirmed written by the disk controller. Depending on the application
it may be better for it to force the flush to reduce latency or not to
maximize bandwidth.

There is a related problem when the logs are checkpointed. There postgres has
to be sure that ALL dirty buffers in any file in the database are synced to
disk before it proceeds. This includes files that the process doing the
checkpoint never even had open. Currently the only strategy available is to
call sync(2) and then sleep an arbitrary amount of time.

Ideally it would be able to issue a kind of sync syscall that didn't
necessarily prevent other i/o from happening, but just scheduled all dirty
buffers to be synced and didn't return until they were all guaranteed on disk.
Similarly it's not really necessary to force the buffers to be flushed, and in
the case of a checkpoint probably better that it not. They'll all be flushed
within a reasonable amount of time anyways and forcing them to be flushed
could cause a huge spike in disk i/o as it affects every file in the whole
database.

> If you're issuing 32 MiB writes to a very fast drive , you'll only have
> wasted time every 1/3 of a second (with a ~100MB interface)... which is near
> trivial.  However, if you're doing 1MB or smaller writes, I think you'll see
> a huge performance penalty.

I think the writes to the transaction log are much much smaller than that.
I'm guessing measured in bytes, not megabytes. (I guess small numbers of
blocks in practice of course.)
 
> Then again, if you're only working with datasets of a few megabytes, you'd
> probably never notice.  It is the person flushing a huge stream of data to
> disk who gets penalized the most. 

Note that while the transaction logs are usually pretty small and involve lots
of slow writes, the data itself is often being stored on the same disks and
could be terabytes of data.

-- 
greg


^ permalink raw reply	[flat|nested] 39+ messages in thread

* RE: [PATCH] ide write barrier support
@ 2003-10-16 16:51 Mudama, Eric
  2003-10-16 20:43 ` Greg Stark
  2003-10-17  6:46 ` Jens Axboe
  0 siblings, 2 replies; 39+ messages in thread
From: Mudama, Eric @ 2003-10-16 16:51 UTC (permalink / raw)
  To: 'Jens Axboe', Greg Stark; +Cc: Linux Kernel


On Tue, Oct 14 2003, Greg Stark wrote:
> Jens Axboe <axboe@suse.de> writes:
> > There's also the case of files opened with O_SYNC. Would inserting a
> > write barrier after every write to such a file destroy performance?
> 
> If it's mainly sequential io, then no it won't destroy performance. It
> will be lower than without the cache flush of course.

If you flush a cache after every command in a sequential write, yes, you'll
destroy performance.  How much you destroy it is a function of command size
relative to track size, and the RPM of the drive.

It takes us multiple servo wedges to know that we think our write to the
media went in the right place, therefore by definition if we didn't already
have the next command's data, we've already missed our target location and
have to wait a full revolution to put the new data on the media.  Since we
can't report good status for the flush until after we're sure the data is
down properly, we'll always blow a rev.

If you're issuing 32 MiB writes to a very fast drive , you'll only have
wasted time every 1/3 of a second (with a ~100MB interface)... which is near
trivial.  However, if you're doing 1MB or smaller writes, I think you'll see
a huge performance penalty.

Then again, if you're only working with datasets of a few megabytes, you'd
probably never notice.  It is the person flushing a huge stream of data to
disk who gets penalized the most. 

--eric


^ permalink raw reply	[flat|nested] 39+ messages in thread

end of thread, other threads:[~2003-10-27 21:29 UTC | newest]

Thread overview: 39+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2003-10-13 14:08 [PATCH] ide write barrier support Jens Axboe
2003-10-13 15:23 ` Jeff Garzik
2003-10-13 15:35   ` Jens Axboe
2003-10-13 15:37     ` Jens Axboe
2003-10-13 22:39 ` Matthias Andree
2003-10-14  0:16   ` Jeff Garzik
2003-10-16 10:36     ` Jens Axboe
2003-10-16 10:46       ` Jeff Garzik
2003-10-16 10:48         ` Jens Axboe
2003-10-13 23:07 ` Andrew Morton
2003-10-14  6:48   ` Jens Axboe
2003-10-15  3:40 ` Greg Stark
2003-10-16  7:10   ` Jens Axboe
2003-10-20 17:10 ` Daniel Phillips
2003-10-20 19:56   ` Jens Axboe
2003-10-20 23:46     ` Daniel Phillips
2003-10-21  5:40       ` Jens Axboe
2003-10-23 16:22         ` Daniel Phillips
2003-10-23 16:23           ` Jens Axboe
2003-10-23 17:20             ` Daniel Phillips
2003-10-23 23:21               ` Nick Piggin
2003-10-26 21:06                 ` Daniel Phillips
2003-10-27 10:29                   ` Lars Marowsky-Bree
2003-10-27 21:35                     ` Daniel Phillips
2003-10-24  9:36               ` Helge Hafting
2003-10-26 15:38                 ` Daniel Phillips
2003-10-16 16:51 Mudama, Eric
2003-10-16 20:43 ` Greg Stark
2003-10-17  6:44   ` Jens Axboe
2003-10-17  6:46 ` Jens Axboe
2003-10-16 20:51 Mudama, Eric
2003-10-17  6:48 ` Jens Axboe
2003-10-17 16:07 Mudama, Eric
2003-10-17 18:08 ` Jens Axboe
2003-10-17 17:59 Manfred Spraul
2003-10-17 18:06 ` Jens Axboe
2003-10-21  0:47   ` Matthias Andree
2003-10-17 18:42 Mudama, Eric
     [not found] <IXzh.61g.5@gated-at.bofh.it>
2003-10-21 19:24 ` Anton Ertl

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.