* [Qemu-devel] [PATCH 0/3] write barrier support
@ 2009-05-05 12:08 Christoph Hellwig
  2009-05-05 12:08 ` [Qemu-devel] barriers: block layer preparations Christoph Hellwig
                   ` (3 more replies)
  0 siblings, 4 replies; 15+ messages in thread
From: Christoph Hellwig @ 2009-05-05 12:08 UTC (permalink / raw)
  To: qemu-devel

Add support for write barriers that allow guests to control write
ordering when write-back caching is enabled.

For now only the simple stack of virtio on posix raw/host devices is
supported.


* [Qemu-devel] barriers: block layer preparations
  2009-05-05 12:08 [Qemu-devel] [PATCH 0/3] write barrier support Christoph Hellwig
@ 2009-05-05 12:08 ` Christoph Hellwig
  2009-05-05 13:51   ` Avi Kivity
  2009-05-05 12:08 ` [Qemu-devel] [PATCH 2/3] barriers: block-raw-posix barrier support Christoph Hellwig
                   ` (2 subsequent siblings)
  3 siblings, 1 reply; 15+ messages in thread
From: Christoph Hellwig @ 2009-05-05 12:08 UTC (permalink / raw)
  To: qemu-devel

Add a flag to BlockDriverState to advertise barrier support, and add a flags
argument to bdrv_aio_readv/writev to allow passing down the barrier flag.

Note that the flags argument to bdrv_aio_readv is for now actually superfluous
because write barriers obviously only apply to writes.  I kept the read-side
argument so the API is symmetric and we can easily add more flags to both
of them.
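
As an illustration, a minimal sketch of a caller using the extended API
(the callback and opaque names here are hypothetical; BDRV_IO_BARRIER is
the flag introduced in block.h below):

    /* plain write, no ordering constraint */
    acb = bdrv_aio_writev(bs, sector_num, &qiov, nb_sectors,
                          my_write_cb, my_opaque, 0);

    /* barrier write: ordered against all earlier and later writes */
    acb = bdrv_aio_writev(bs, sector_num, &qiov, nb_sectors,
                          my_write_cb, my_opaque, BDRV_IO_BARRIER);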


Signed-off-by: Christoph Hellwig <hch@lst.de>

Index: qemu/block-qcow.c
===================================================================
--- qemu.orig/block-qcow.c	2009-04-26 16:38:30.509074899 +0200
+++ qemu/block-qcow.c	2009-05-05 11:29:11.545784130 +0200
@@ -587,7 +587,7 @@ static void qcow_aio_read_cb(void *opaqu
             acb->hd_iov.iov_len = acb->n * 512;
             qemu_iovec_init_external(&acb->hd_qiov, &acb->hd_iov, 1);
             acb->hd_aiocb = bdrv_aio_readv(bs->backing_hd, acb->sector_num,
-                &acb->hd_qiov, acb->n, qcow_aio_read_cb, acb);
+                &acb->hd_qiov, acb->n, qcow_aio_read_cb, acb, 0);
             if (acb->hd_aiocb == NULL)
                 goto done;
         } else {
@@ -612,7 +612,7 @@ static void qcow_aio_read_cb(void *opaqu
         qemu_iovec_init_external(&acb->hd_qiov, &acb->hd_iov, 1);
         acb->hd_aiocb = bdrv_aio_readv(s->hd,
                             (acb->cluster_offset >> 9) + index_in_cluster,
-                            &acb->hd_qiov, acb->n, qcow_aio_read_cb, acb);
+                            &acb->hd_qiov, acb->n, qcow_aio_read_cb, acb, 0);
         if (acb->hd_aiocb == NULL)
             goto done;
     }
@@ -630,7 +630,7 @@ done:
 
 static BlockDriverAIOCB *qcow_aio_readv(BlockDriverState *bs,
         int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
-        BlockDriverCompletionFunc *cb, void *opaque)
+        BlockDriverCompletionFunc *cb, void *opaque, unsigned flags)
 {
     QCowAIOCB *acb;
 
@@ -708,7 +708,7 @@ static void qcow_aio_write_cb(void *opaq
     acb->hd_aiocb = bdrv_aio_writev(s->hd,
                                     (cluster_offset >> 9) + index_in_cluster,
                                     &acb->hd_qiov, acb->n,
-                                    qcow_aio_write_cb, acb);
+                                    qcow_aio_write_cb, acb, 0);
     if (acb->hd_aiocb == NULL)
         goto done;
     return;
@@ -722,7 +722,7 @@ done:
 
 static BlockDriverAIOCB *qcow_aio_writev(BlockDriverState *bs,
         int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
-        BlockDriverCompletionFunc *cb, void *opaque)
+        BlockDriverCompletionFunc *cb, void *opaque, unsigned flags)
 {
     BDRVQcowState *s = bs->opaque;
     QCowAIOCB *acb;
Index: qemu/block-qcow2.c
===================================================================
--- qemu.orig/block-qcow2.c	2009-04-26 16:47:40.682949219 +0200
+++ qemu/block-qcow2.c	2009-05-05 11:29:11.546784118 +0200
@@ -1349,7 +1349,7 @@ static void qcow_aio_read_cb(void *opaqu
                 qemu_iovec_init_external(&acb->hd_qiov, &acb->hd_iov, 1);
                 acb->hd_aiocb = bdrv_aio_readv(bs->backing_hd, acb->sector_num,
                                     &acb->hd_qiov, acb->n,
-				    qcow_aio_read_cb, acb);
+				    qcow_aio_read_cb, acb, 0);
                 if (acb->hd_aiocb == NULL)
                     goto done;
             } else {
@@ -1384,7 +1384,7 @@ static void qcow_aio_read_cb(void *opaqu
         qemu_iovec_init_external(&acb->hd_qiov, &acb->hd_iov, 1);
         acb->hd_aiocb = bdrv_aio_readv(s->hd,
                             (acb->cluster_offset >> 9) + index_in_cluster,
-                            &acb->hd_qiov, acb->n, qcow_aio_read_cb, acb);
+                            &acb->hd_qiov, acb->n, qcow_aio_read_cb, acb, 0);
         if (acb->hd_aiocb == NULL)
             goto done;
     }
@@ -1427,7 +1427,7 @@ static QCowAIOCB *qcow_aio_setup(BlockDr
 
 static BlockDriverAIOCB *qcow_aio_readv(BlockDriverState *bs,
         int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
-        BlockDriverCompletionFunc *cb, void *opaque)
+        BlockDriverCompletionFunc *cb, void *opaque, unsigned flags)
 {
     QCowAIOCB *acb;
 
@@ -1498,7 +1498,7 @@ static void qcow_aio_write_cb(void *opaq
     acb->hd_aiocb = bdrv_aio_writev(s->hd,
                                     (acb->cluster_offset >> 9) + index_in_cluster,
                                     &acb->hd_qiov, acb->n,
-                                    qcow_aio_write_cb, acb);
+                                    qcow_aio_write_cb, acb, 2);
     if (acb->hd_aiocb == NULL)
         goto done;
 
@@ -1513,7 +1513,7 @@ done:
 
 static BlockDriverAIOCB *qcow_aio_writev(BlockDriverState *bs,
         int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
-        BlockDriverCompletionFunc *cb, void *opaque)
+        BlockDriverCompletionFunc *cb, void *opaque, unsigned flags)
 {
     BDRVQcowState *s = bs->opaque;
     QCowAIOCB *acb;
Index: qemu/block.c
===================================================================
--- qemu.orig/block.c	2009-04-26 16:38:30.513074989 +0200
+++ qemu/block.c	2009-05-05 11:29:11.547817490 +0200
@@ -59,10 +59,10 @@ typedef struct BlockDriverAIOCBSync {
 
 static BlockDriverAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
         int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
-        BlockDriverCompletionFunc *cb, void *opaque);
+        BlockDriverCompletionFunc *cb, void *opaque, unsigned flags);
 static BlockDriverAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
         int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
-        BlockDriverCompletionFunc *cb, void *opaque);
+        BlockDriverCompletionFunc *cb, void *opaque, unsigned flags);
 static void bdrv_aio_cancel_em(BlockDriverAIOCB *acb);
 static int bdrv_read_em(BlockDriverState *bs, int64_t sector_num,
                         uint8_t *buf, int nb_sectors);
@@ -1309,7 +1309,8 @@ char *bdrv_snapshot_dump(char *buf, int 
 
 BlockDriverAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num,
                                  QEMUIOVector *qiov, int nb_sectors,
-                                 BlockDriverCompletionFunc *cb, void *opaque)
+                                 BlockDriverCompletionFunc *cb, void *opaque,
+                                 unsigned flags)
 {
     BlockDriver *drv = bs->drv;
     BlockDriverAIOCB *ret;
@@ -1320,7 +1321,7 @@ BlockDriverAIOCB *bdrv_aio_readv(BlockDr
         return NULL;
 
     ret = drv->bdrv_aio_readv(bs, sector_num, qiov, nb_sectors,
-                              cb, opaque);
+                              cb, opaque, flags);
 
     if (ret) {
 	/* Update stats even though technically transfer has not happened. */
@@ -1333,7 +1334,8 @@ BlockDriverAIOCB *bdrv_aio_readv(BlockDr
 
 BlockDriverAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num,
                                   QEMUIOVector *qiov, int nb_sectors,
-                                  BlockDriverCompletionFunc *cb, void *opaque)
+                                  BlockDriverCompletionFunc *cb, void *opaque,
+                                  unsigned flags)
 {
     BlockDriver *drv = bs->drv;
     BlockDriverAIOCB *ret;
@@ -1346,7 +1348,7 @@ BlockDriverAIOCB *bdrv_aio_writev(BlockD
         return NULL;
 
     ret = drv->bdrv_aio_writev(bs, sector_num, qiov, nb_sectors,
-                               cb, opaque);
+                               cb, opaque, flags);
 
     if (ret) {
 	/* Update stats even though technically transfer has not happened. */
@@ -1411,14 +1413,14 @@ static BlockDriverAIOCB *bdrv_aio_rw_vec
 
 static BlockDriverAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
         int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
-        BlockDriverCompletionFunc *cb, void *opaque)
+        BlockDriverCompletionFunc *cb, void *opaque, unsigned flags)
 {
     return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 0);
 }
 
 static BlockDriverAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
         int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
-        BlockDriverCompletionFunc *cb, void *opaque)
+        BlockDriverCompletionFunc *cb, void *opaque, unsigned flags)
 {
     return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 1);
 }
@@ -1454,7 +1456,7 @@ static int bdrv_read_em(BlockDriverState
     iov.iov_len = nb_sectors * 512;
     qemu_iovec_init_external(&qiov, &iov, 1);
     acb = bdrv_aio_readv(bs, sector_num, &qiov, nb_sectors,
-        bdrv_rw_em_cb, &async_ret);
+        bdrv_rw_em_cb, &async_ret, 0);
     if (acb == NULL)
         return -1;
 
@@ -1478,7 +1480,7 @@ static int bdrv_write_em(BlockDriverStat
     iov.iov_len = nb_sectors * 512;
     qemu_iovec_init_external(&qiov, &iov, 1);
     acb = bdrv_aio_writev(bs, sector_num, &qiov, nb_sectors,
-        bdrv_rw_em_cb, &async_ret);
+        bdrv_rw_em_cb, &async_ret, 0);
     if (acb == NULL)
         return -1;
     while (async_ret == NOT_DONE) {
Index: qemu/block.h
===================================================================
--- qemu.orig/block.h	2009-04-26 16:38:30.513074989 +0200
+++ qemu/block.h	2009-05-05 11:29:11.549784011 +0200
@@ -54,6 +54,12 @@ typedef struct QEMUSnapshotInfo {
 
 #define BDRV_O_CACHE_MASK  (BDRV_O_NOCACHE | BDRV_O_CACHE_WB | BDRV_O_CACHE_DEF)
 
+
+/*
+ * Flags for bdrv_aio_readv/bdrv_aio_writev.
+ */
+#define BDRV_IO_BARRIER    0x0001
+
 void bdrv_info(Monitor *mon);
 void bdrv_info_stats(Monitor *mon);
 
@@ -93,10 +99,12 @@ typedef void BlockDriverCompletionFunc(v
 
 BlockDriverAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num,
                                  QEMUIOVector *iov, int nb_sectors,
-                                 BlockDriverCompletionFunc *cb, void *opaque);
+                                 BlockDriverCompletionFunc *cb, void *opaque,
+                                 unsigned flags);
 BlockDriverAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num,
                                   QEMUIOVector *iov, int nb_sectors,
-                                  BlockDriverCompletionFunc *cb, void *opaque);
+                                  BlockDriverCompletionFunc *cb, void *opaque,
+                                  unsigned flags);
 void bdrv_aio_cancel(BlockDriverAIOCB *acb);
 
 /* sg packet commands */
Index: qemu/dma-helpers.c
===================================================================
--- qemu.orig/dma-helpers.c	2009-04-26 16:38:30.531074417 +0200
+++ qemu/dma-helpers.c	2009-05-05 11:29:11.550836309 +0200
@@ -120,10 +120,10 @@ static void dma_bdrv_cb(void *opaque, in
 
     if (dbs->is_write) {
         dbs->acb = bdrv_aio_writev(dbs->bs, dbs->sector_num, &dbs->iov,
-                                   dbs->iov.size / 512, dma_bdrv_cb, dbs);
+                                   dbs->iov.size / 512, dma_bdrv_cb, dbs, 0);
     } else {
         dbs->acb = bdrv_aio_readv(dbs->bs, dbs->sector_num, &dbs->iov,
-                                  dbs->iov.size / 512, dma_bdrv_cb, dbs);
+                                  dbs->iov.size / 512, dma_bdrv_cb, dbs, 0);
     }
     if (!dbs->acb) {
         dma_bdrv_unmap(dbs);
Index: qemu/hw/ide.c
===================================================================
--- qemu.orig/hw/ide.c	2009-05-04 17:02:14.367783478 +0200
+++ qemu/hw/ide.c	2009-05-05 11:29:11.552783415 +0200
@@ -1473,7 +1473,7 @@ static void ide_atapi_cmd_read_dma_cb(vo
     bm->iov.iov_len = n * 4 * 512;
     qemu_iovec_init_external(&bm->qiov, &bm->iov, 1);
     bm->aiocb = bdrv_aio_readv(s->bs, (int64_t)s->lba << 2, &bm->qiov,
-                               n * 4, ide_atapi_cmd_read_dma_cb, bm);
+                               n * 4, ide_atapi_cmd_read_dma_cb, bm, 0);
     if (!bm->aiocb) {
         /* Note: media not present is the most likely case */
         ide_atapi_cmd_error(s, SENSE_NOT_READY,
Index: qemu/hw/scsi-disk.c
===================================================================
--- qemu.orig/hw/scsi-disk.c	2009-04-26 16:38:30.592081973 +0200
+++ qemu/hw/scsi-disk.c	2009-05-05 11:29:11.554836260 +0200
@@ -210,7 +210,7 @@ static void scsi_read_data(SCSIDevice *d
     r->iov.iov_len = n * 512;
     qemu_iovec_init_external(&r->qiov, &r->iov, 1);
     r->aiocb = bdrv_aio_readv(s->bdrv, r->sector, &r->qiov, n,
-                              scsi_read_complete, r);
+                              scsi_read_complete, r, 0);
     if (r->aiocb == NULL)
         scsi_command_complete(r, STATUS_CHECK_CONDITION, SENSE_HARDWARE_ERROR);
     r->sector += n;
@@ -275,7 +275,7 @@ static void scsi_write_request(SCSIReque
     if (n) {
         qemu_iovec_init_external(&r->qiov, &r->iov, 1);
         r->aiocb = bdrv_aio_writev(s->bdrv, r->sector, &r->qiov, n,
-                                   scsi_write_complete, r);
+                                   scsi_write_complete, r, 0);
         if (r->aiocb == NULL)
             scsi_command_complete(r, STATUS_CHECK_CONDITION,
                                   SENSE_HARDWARE_ERROR);
Index: qemu/hw/virtio-blk.c
===================================================================
--- qemu.orig/hw/virtio-blk.c	2009-05-04 17:02:14.374784160 +0200
+++ qemu/hw/virtio-blk.c	2009-05-05 13:21:24.196784347 +0200
@@ -212,13 +212,13 @@ static void virtio_blk_handle_scsi(VirtI
 static void virtio_blk_handle_write(VirtIOBlockReq *req)
 {
     bdrv_aio_writev(req->dev->bs, req->out->sector, &req->qiov,
-                    req->qiov.size / 512, virtio_blk_rw_complete, req);
+                    req->qiov.size / 512, virtio_blk_rw_complete, req, 0);
 }
 
 static void virtio_blk_handle_read(VirtIOBlockReq *req)
 {
     bdrv_aio_readv(req->dev->bs, req->out->sector, &req->qiov,
-                   req->qiov.size / 512, virtio_blk_rw_complete, req);
+                   req->qiov.size / 512, virtio_blk_rw_complete, req, 0);
 }
 
 static void virtio_blk_handle_output(VirtIODevice *vdev, VirtQueue *vq)
Index: qemu/hw/xen_disk.c
===================================================================
--- qemu.orig/hw/xen_disk.c	2009-04-26 16:38:30.612075090 +0200
+++ qemu/hw/xen_disk.c	2009-05-05 11:29:11.556785880 +0200
@@ -397,14 +397,14 @@ static int ioreq_runio_qemu_aio(struct i
         ioreq->aio_inflight++;
         bdrv_aio_readv(blkdev->bs, ioreq->start / BLOCK_SIZE,
                        &ioreq->v, ioreq->v.size / BLOCK_SIZE,
-                       qemu_aio_complete, ioreq);
+                       qemu_aio_complete, ioreq, 0);
 	break;
     case BLKIF_OP_WRITE:
     case BLKIF_OP_WRITE_BARRIER:
         ioreq->aio_inflight++;
         bdrv_aio_writev(blkdev->bs, ioreq->start / BLOCK_SIZE,
                         &ioreq->v, ioreq->v.size / BLOCK_SIZE,
-                        qemu_aio_complete, ioreq);
+                        qemu_aio_complete, ioreq, 0);
 	break;
     default:
 	/* unknown operation (shouldn't happen -- parse catches this) */
Index: qemu/qemu-io.c
===================================================================
--- qemu.orig/qemu-io.c	2009-05-04 17:02:14.381783935 +0200
+++ qemu/qemu-io.c	2009-05-05 11:31:57.159659916 +0200
@@ -148,7 +148,7 @@ static int do_aio_readv(QEMUIOVector *qi
 	int async_ret = NOT_DONE;
 
 	acb = bdrv_aio_readv(bs, offset >> 9, qiov, qiov->size >> 9,
-			     aio_rw_done, &async_ret);
+			     aio_rw_done, &async_ret, 0);
 	if (!acb)
 		return -EIO;
 
@@ -165,7 +165,7 @@ static int do_aio_writev(QEMUIOVector *q
 	int async_ret = NOT_DONE;
 
 	acb = bdrv_aio_writev(bs, offset >> 9, qiov, qiov->size >> 9,
-			      aio_rw_done, &async_ret);
+			      aio_rw_done, &async_ret, 0);
 	if (!acb)
 		return -EIO;
 
Index: qemu/block-raw-posix.c
===================================================================
--- qemu.orig/block-raw-posix.c	2009-04-28 11:42:09.257949541 +0200
+++ qemu/block-raw-posix.c	2009-05-05 13:21:23.136784385 +0200
@@ -654,7 +654,7 @@ static void raw_aio_remove(RawAIOCB *acb
 
 static BlockDriverAIOCB *raw_aio_readv(BlockDriverState *bs,
         int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
-        BlockDriverCompletionFunc *cb, void *opaque)
+        BlockDriverCompletionFunc *cb, void *opaque, unsigned flags)
 {
     RawAIOCB *acb;
 
@@ -670,7 +670,7 @@ static BlockDriverAIOCB *raw_aio_readv(B
 
 static BlockDriverAIOCB *raw_aio_writev(BlockDriverState *bs,
         int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
-        BlockDriverCompletionFunc *cb, void *opaque)
+        BlockDriverCompletionFunc *cb, void *opaque, unsigned flags)
 {
     RawAIOCB *acb;
 
Index: qemu/block_int.h
===================================================================
--- qemu.orig/block_int.h	2009-04-26 16:38:30.513074989 +0200
+++ qemu/block_int.h	2009-05-05 13:23:37.599695377 +0200
@@ -56,10 +56,10 @@ struct BlockDriver {
     /* aio */
     BlockDriverAIOCB *(*bdrv_aio_readv)(BlockDriverState *bs,
         int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
-        BlockDriverCompletionFunc *cb, void *opaque);
+        BlockDriverCompletionFunc *cb, void *opaque, unsigned flags);
     BlockDriverAIOCB *(*bdrv_aio_writev)(BlockDriverState *bs,
         int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
-        BlockDriverCompletionFunc *cb, void *opaque);
+        BlockDriverCompletionFunc *cb, void *opaque, unsigned flags);
     void (*bdrv_aio_cancel)(BlockDriverAIOCB *acb);
     int aiocb_size;
 
@@ -117,6 +117,7 @@ struct BlockDriverState {
     int encrypted; /* if true, the media is encrypted */
     int valid_key; /* if true, a valid encryption key has been set */
     int sg;        /* if true, the device is a /dev/sg* */
+    int barrier_support;  /* if true, the device supports barrier requests */
     /* event callback when inserting/removing */
     void (*change_cb)(void *opaque);
     void *change_opaque;


* [Qemu-devel] [PATCH 2/3] barriers: block-raw-posix barrier support
  2009-05-05 12:08 [Qemu-devel] [PATCH 0/3] write barrier support Christoph Hellwig
  2009-05-05 12:08 ` [Qemu-devel] barriers: block layer preparations Christoph Hellwig
@ 2009-05-05 12:08 ` Christoph Hellwig
  2009-05-05 12:33   ` Jamie Lokier
  2009-05-05 12:09 ` [Qemu-devel] [PATCH 3/3] barriers: virtio Christoph Hellwig
  2009-05-05 13:53 ` [Qemu-devel] [PATCH 0/3] write barrier support Avi Kivity
  3 siblings, 1 reply; 15+ messages in thread
From: Christoph Hellwig @ 2009-05-05 12:08 UTC (permalink / raw)
  To: qemu-devel

Add support for write barriers to the posix raw file / block device code.
The guts of this are in the aio emulation, as that's where we handle our queue
of outstanding requests.

The high-level design is as follows:

 - As soon as a barrier request is submitted via qemu_paio_submit, we increment
   the barrier_inprogress count to signal we now have to deal with barriers.
 - From that point on every new request that is queued up by qemu_paio_submit
   does not get onto the normal request list but onto a secondary post-barrier
   queue.
 - Once the barrier request is dequeued by an aio_thread, that thread waits for
   all other outstanding requests to finish, then issues an fdatasync, the
   actual barrier request, and another fdatasync to prevent reordering in the
   pagecache.  After the request is finished the barrier_inprogress counter is
   decremented, the post-barrier list is spliced back onto the main request
   list up to and including the next barrier request (if there is one), and
   normal operation is resumed.

That means barriers cause quite massive serialization of the I/O submission
path, which unfortunately is not avoidable given their semantics.  I will
mitigate it for setups with multiple virtual storage devices with a patch
that makes the aio state per-device in the near future.
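
To make the patch below easier to follow, here is a condensed sketch of
the aio_thread flow for a barrier request (simplified pseudocode of the
code below, not a literal excerpt):

    aiocb = dequeue(&request_list);
    if (aiocb->aio_flags & QEMU_AIO_BARRIER) {
        /* drain: wait until all other I/O threads are idle */
        while (idle_threads != cur_threads)
            cond_wait(&idle_wait, &lock);
        fdatasync(aiocb->aio_fildes);   /* flush all earlier writes */
        ret = handle_aiocb_rw(aiocb);   /* the barrier write itself */
        fdatasync(aiocb->aio_fildes);   /* keep later writes behind it */
        barrier_inprogress--;
        requeue_request_list();         /* splice post-barrier queue back */
    } else {
        ret = handle_aiocb_rw(aiocb);
    }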

Signed-off-by: Christoph Hellwig <hch@lst.de>

Index: qemu/posix-aio-compat.c
===================================================================
--- qemu.orig/posix-aio-compat.c	2009-05-05 13:35:09.115784239 +0200
+++ qemu/posix-aio-compat.c	2009-05-05 13:47:38.625659276 +0200
@@ -17,6 +17,7 @@
 #include <errno.h>
 #include <time.h>
 #include <string.h>
+#include <stdbool.h>
 #include <stdlib.h>
 #include <stdio.h>
 #include "osdep.h"
@@ -31,8 +32,19 @@ static pthread_attr_t attr;
 static int max_threads = 64;
 static int cur_threads = 0;
 static int idle_threads = 0;
+
+/* number of barriers currently handled */
+static int barrier_inprogress = 0;
+
+/* normal list of all requests waiting for execution */
 static TAILQ_HEAD(, qemu_paiocb) request_list;
 
+/* list of all requests issued after a barrier request */
+static TAILQ_HEAD(, qemu_paiocb) post_barrier_list;
+
+/* wait for all I/O threads to be idle before issuing a barrier request */
+static pthread_cond_t idle_wait = PTHREAD_COND_INITIALIZER;
+
 #ifdef HAVE_PREADV
 static int preadv_present = 1;
 #else
@@ -62,6 +74,13 @@ static void mutex_unlock(pthread_mutex_t
     if (ret) die2(ret, "pthread_mutex_unlock");
 }
 
+static int cond_wait(pthread_cond_t *cond, pthread_mutex_t *mutex)
+{
+    int ret = pthread_cond_wait(cond, mutex);
+    if (ret) die2(ret, "pthread_cond_wait");
+    return ret;
+}
+
 static int cond_timedwait(pthread_cond_t *cond, pthread_mutex_t *mutex,
                            struct timespec *ts)
 {
@@ -264,6 +283,22 @@ static size_t handle_aiocb_rw(struct qem
     return nbytes;
 }
 
+static void requeue_request_list(void)
+{
+    struct qemu_paiocb *cb, *next;
+
+    TAILQ_FOREACH_SAFE(cb, &post_barrier_list, node, next) {
+        TAILQ_REMOVE(&post_barrier_list, cb, node);
+        TAILQ_INSERT_TAIL(&request_list, cb, node);
+
+        /*
+         * Stop after the first barrier request.
+         */
+        if (cb->aio_flags & QEMU_AIO_BARRIER)
+            break;
+    }
+}
+
 static void *aio_thread(void *unused)
 {
     pid_t pid;
@@ -280,6 +315,8 @@ static void *aio_thread(void *unused)
         size_t ret = 0;
         qemu_timeval tv;
         struct timespec ts;
+        bool wakeup_threads = false;
+        bool wakeup_idle = false;
 
         qemu_gettimeofday(&tv);
         ts.tv_sec = tv.tv_sec + 10;
@@ -297,6 +334,16 @@ static void *aio_thread(void *unused)
 
         aiocb = TAILQ_FIRST(&request_list);
         TAILQ_REMOVE(&request_list, aiocb, node);
+
+        /*
+         * We've got a barrier request.  Make sure all previous requests
+         * are completed before we issue it.
+         */
+        if (aiocb->aio_flags & QEMU_AIO_BARRIER) {
+            while (idle_threads != cur_threads)
+                cond_wait(&idle_wait, &lock);
+        }
+
         aiocb->active = 1;
         idle_threads--;
         mutex_unlock(&lock);
@@ -304,7 +351,13 @@ static void *aio_thread(void *unused)
         switch (aiocb->aio_type) {
         case QEMU_PAIO_READ:
         case QEMU_PAIO_WRITE:
-		ret = handle_aiocb_rw(aiocb);
+                if (aiocb->aio_flags & QEMU_AIO_BARRIER) {
+                    fdatasync(aiocb->aio_fildes);
+                    ret = handle_aiocb_rw(aiocb);
+                    fdatasync(aiocb->aio_fildes);
+                } else {
+                    ret = handle_aiocb_rw(aiocb);
+                }
 		break;
         case QEMU_PAIO_IOCTL:
 		ret = handle_aiocb_ioctl(aiocb);
@@ -317,9 +370,32 @@ static void *aio_thread(void *unused)
 
         mutex_lock(&lock);
         aiocb->ret = ret;
-        idle_threads++;
+
+        if (aiocb->aio_flags & QEMU_AIO_BARRIER) {
+            barrier_inprogress--;
+            if (!TAILQ_EMPTY(&request_list))
+                 die2(ret, "request list not empty");
+
+            if (!TAILQ_EMPTY(&post_barrier_list)) {
+                requeue_request_list();
+                wakeup_threads = true;
+            }
+        }
+
+        /* wake up barrier thread when all threads are idle */
+        if (++idle_threads == cur_threads && barrier_inprogress)
+            wakeup_idle = true;
         mutex_unlock(&lock);
 
+        /*
+         * If any new requests were queued up on the post_barrier_list wake up
+         * I/O threads now.
+         */
+        if (wakeup_threads)
+            cond_signal(&cond);
+	if (wakeup_idle)
+            cond_signal(&idle_wait);
+
         if (kill(pid, aiocb->ev_signo)) die("kill failed");
     }
 
@@ -348,6 +424,7 @@ int qemu_paio_init(struct qemu_paioinit 
     if (ret) die2(ret, "pthread_attr_setdetachstate");
 
     TAILQ_INIT(&request_list);
+    TAILQ_INIT(&post_barrier_list);
 
     return 0;
 }
@@ -357,10 +434,21 @@ static int qemu_paio_submit(struct qemu_
     aiocb->aio_type = type;
     aiocb->ret = -EINPROGRESS;
     aiocb->active = 0;
+
     mutex_lock(&lock);
     if (idle_threads == 0 && cur_threads < max_threads)
         spawn_thread();
-    TAILQ_INSERT_TAIL(&request_list, aiocb, node);
+
+    if (barrier_inprogress) {
+        aiocb->aio_flags |= QEMU_AIO_POST_BARRIER;
+        TAILQ_INSERT_TAIL(&post_barrier_list, aiocb, node);
+    } else {
+        TAILQ_INSERT_TAIL(&request_list, aiocb, node);
+    }
+
+    if (aiocb->aio_flags & QEMU_AIO_BARRIER)
+        barrier_inprogress++;
+
     mutex_unlock(&lock);
     cond_signal(&cond);
 
@@ -411,13 +499,17 @@ int qemu_paio_cancel(int fd, struct qemu
 
     mutex_lock(&lock);
     if (!aiocb->active) {
-        TAILQ_REMOVE(&request_list, aiocb, node);
+        if (aiocb->aio_flags & QEMU_AIO_POST_BARRIER)
+            TAILQ_REMOVE(&post_barrier_list, aiocb, node);
+	else
+            TAILQ_REMOVE(&request_list, aiocb, node);
         aiocb->ret = -ECANCELED;
         ret = QEMU_PAIO_CANCELED;
-    } else if (aiocb->ret == -EINPROGRESS)
+    } else if (aiocb->ret == -EINPROGRESS) {
         ret = QEMU_PAIO_NOTCANCELED;
-    else
+    } else {
         ret = QEMU_PAIO_ALLDONE;
+    }
     mutex_unlock(&lock);
 
     return ret;
Index: qemu/posix-aio-compat.h
===================================================================
--- qemu.orig/posix-aio-compat.h	2009-05-05 13:35:09.160784863 +0200
+++ qemu/posix-aio-compat.h	2009-05-05 13:45:54.312668406 +0200
@@ -39,6 +39,11 @@ struct qemu_paiocb
     unsigned aio_flags;
 /* 512 byte alignment required for buffer, offset and length */
 #define QEMU_AIO_SECTOR_ALIGNED	0x01
+/* Barrier request, must not re-order */
+#define QEMU_AIO_BARRIER        0x02
+
+/* Internal flag, is in the post-barrier queue */
+#define QEMU_AIO_POST_BARRIER   0x80
 
     /* private */
     TAILQ_ENTRY(qemu_paiocb) node;
Index: qemu/block-raw-posix.c
===================================================================
--- qemu.orig/block-raw-posix.c	2009-05-05 13:43:21.431811845 +0200
+++ qemu/block-raw-posix.c	2009-05-05 13:43:21.897783237 +0200
@@ -172,6 +172,14 @@ static int raw_open(BlockDriverState *bs
             return ret;
         }
     }
+
+    /*
+     * If the open mode allows caching writes in the file cache, advertise
+     * barrier support so that the guest can control the cache behaviour.
+     */
+    if (!(open_flags & (O_DIRECT|O_DSYNC)))
+        bs->barrier_support = 1;
+
     return 0;
 }
 
@@ -600,8 +608,8 @@ static int posix_aio_init(void)
 }
 
 static RawAIOCB *raw_aio_setup(BlockDriverState *bs, int64_t sector_num,
-        QEMUIOVector *qiov, int nb_sectors,
-        BlockDriverCompletionFunc *cb, void *opaque)
+        QEMUIOVector *qiov, int nb_sectors, BlockDriverCompletionFunc *cb,
+        void *opaque, unsigned flags)
 {
     BDRVRawState *s = bs->opaque;
     RawAIOCB *acb;
@@ -627,6 +635,8 @@ static RawAIOCB *raw_aio_setup(BlockDriv
      */
     if (s->aligned_buf)
         acb->aiocb.aio_flags |= QEMU_AIO_SECTOR_ALIGNED;
+    if (flags & BDRV_IO_BARRIER)
+        acb->aiocb.aio_flags |= QEMU_AIO_BARRIER;
 
     acb->next = posix_aio_state->first_aio;
     posix_aio_state->first_aio = acb;
@@ -658,7 +668,7 @@ static BlockDriverAIOCB *raw_aio_readv(B
 {
     RawAIOCB *acb;
 
-    acb = raw_aio_setup(bs, sector_num, qiov, nb_sectors, cb, opaque);
+    acb = raw_aio_setup(bs, sector_num, qiov, nb_sectors, cb, opaque, flags);
     if (!acb)
         return NULL;
     if (qemu_paio_read(&acb->aiocb) < 0) {
@@ -674,7 +684,7 @@ static BlockDriverAIOCB *raw_aio_writev(
 {
     RawAIOCB *acb;
 
-    acb = raw_aio_setup(bs, sector_num, qiov, nb_sectors, cb, opaque);
+    acb = raw_aio_setup(bs, sector_num, qiov, nb_sectors, cb, opaque, flags);
     if (!acb)
         return NULL;
     if (qemu_paio_write(&acb->aiocb) < 0) {
@@ -1022,6 +1032,14 @@ static int hdev_open(BlockDriverState *b
         s->fd_media_changed = 1;
     }
 #endif
+
+    /*
+     * If the open mode allows caching writes in the file cache, advertise
+     * barrier support so that the guest can control the cache behaviour.
+     */
+    if (!(open_flags & (O_DIRECT|O_DSYNC)))
+        bs->barrier_support = 1;
+
     return 0;
 }
 


* [Qemu-devel] [PATCH 3/3] barriers: virtio
  2009-05-05 12:08 [Qemu-devel] [PATCH 0/3] write barrier support Christoph Hellwig
  2009-05-05 12:08 ` [Qemu-devel] barriers: block layer preparations Christoph Hellwig
  2009-05-05 12:08 ` [Qemu-devel] [PATCH 2/3] barriers: block-raw-posix barrier support Christoph Hellwig
@ 2009-05-05 12:09 ` Christoph Hellwig
  2009-05-05 13:53 ` [Qemu-devel] [PATCH 0/3] write barrier support Avi Kivity
  3 siblings, 0 replies; 15+ messages in thread
From: Christoph Hellwig @ 2009-05-05 12:09 UTC (permalink / raw)
  To: qemu-devel

Advertise the barriers feature to virtio guests if the underlying device
supports it, and make sure to pass down the barrier flag on write requests
to the block layer.


Signed-off-by: Christoph Hellwig <hch@lst.de>

Index: qemu/hw/virtio-blk.c
===================================================================
--- qemu.orig/hw/virtio-blk.c	2009-05-05 13:21:24.196784347 +0200
+++ qemu/hw/virtio-blk.c	2009-05-05 13:31:33.654786861 +0200
@@ -211,8 +211,14 @@ static void virtio_blk_handle_scsi(VirtI
 
 static void virtio_blk_handle_write(VirtIOBlockReq *req)
 {
+
+    unsigned flags = 0;
+
+    if (req->out->type & VIRTIO_BLK_T_BARRIER)
+        flags |= BDRV_IO_BARRIER;
+
     bdrv_aio_writev(req->dev->bs, req->out->sector, &req->qiov,
-                    req->qiov.size / 512, virtio_blk_rw_complete, req, 0);
+                    req->qiov.size / 512, virtio_blk_rw_complete, req, flags);
 }
 
 static void virtio_blk_handle_read(VirtIOBlockReq *req)
@@ -304,6 +310,7 @@ static void virtio_blk_update_config(Vir
 
 static uint32_t virtio_blk_get_features(VirtIODevice *vdev)
 {
+    VirtIOBlock *s = to_virtio_blk(vdev);
     uint32_t features = 0;
 
     features |= (1 << VIRTIO_BLK_F_SEG_MAX);
@@ -312,6 +319,9 @@ static uint32_t virtio_blk_get_features(
     features |= (1 << VIRTIO_BLK_F_SCSI);
 #endif
 
+    if (s->bs->barrier_support)
+        features |= (1 << VIRTIO_BLK_F_BARRIER);
+
     return features;
 }
 


* Re: [Qemu-devel] [PATCH 2/3] barriers: block-raw-posix barrier support
  2009-05-05 12:08 ` [Qemu-devel] [PATCH 2/3] barriers: block-raw-posix barrier support Christoph Hellwig
@ 2009-05-05 12:33   ` Jamie Lokier
  2009-05-05 13:29     ` Christoph Hellwig
  0 siblings, 1 reply; 15+ messages in thread
From: Jamie Lokier @ 2009-05-05 12:33 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: qemu-devel

Christoph Hellwig wrote:
> Add support for write barriers to the posix raw file / block device code.
> The guts of this are in the aio emulation, as that's where we handle our queue
> of outstanding requests.

It's nice to see this :-)

IDE and SCSI's cache flush commands should map to it nicely too.

> The high-level design is as follows:
> 
>  - As soon as a barrier request is submitted via qemu_paio_submit, we
>    increment the barrier_inprogress count to signal we now have to
>    deal with barriers.
>  - From that point on every new request that is queued up by
>    qemu_paio_submit does not get onto the normal request list but onto
>    a secondary post-barrier queue.
>
>  - Once the barrier request is dequeued by an aio_thread, that thread waits
>    for all other outstanding requests to finish, then issues an fdatasync,
>    the actual barrier request, and another fdatasync to prevent reordering
>    in the pagecache.

You don't need two fdatasyncs if the barrier request is just a
barrier, no data write, used only to flush previously written data by
a guest's fsync/fdatasync implementation.

>    After the request is finished the barrier_inprogress counter is
>    decremented, the post-barrier list is spliced back onto the main request
>    list up to and including the next barrier request (if there is one), and
>    normal operation is resumed.
> 
> That means barriers cause quite massive serialization of the I/O submission
> path, which unfortunately is not avoidable given their semantics.

This is the best argument yet for having distinct "barrier" and "sync"
operations.  "Barrier" is for ordering I/O, such as journalling
filesystems.

"Sync" is to be sent after guest fsync/fdatasync, to commit data
written so far to storage.  It waits for the data to be committed, and
also asks the data to be written sooner.

"Sync" operations don't need to serialise I/O as much: it's ok to
initiate later writes in parallel, and this is enough to keep the
storage busy when there's a steady stream of guest fsyncs.

Both together, "Barrier|Sync" would do what you've implemented: force
ordering, write data quickly, and wait until it's committed to hard
storage.

Although Linux doesn't separate these two concepts (yet), because of
I/O serialisation it might make a measurable difference to fsync-heavy
workloads for virtio to have two separate bits, one for each concept,
and then add the necessary tweaks to guest kernels to use only one or
both bits as needed.
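
As a sketch, the two concepts could be carried as independent request
flags (BDRV_IO_BARRIER exists in this series; BDRV_IO_SYNC is a
hypothetical name for the second bit):

    /* guest fsync: durability only, later writes may run in parallel */
    flags = BDRV_IO_SYNC;

    /* journal commit: full ordering plus durability */
    flags = BDRV_IO_BARRIER | BDRV_IO_SYNC;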

-- Jamie


* Re: [Qemu-devel] [PATCH 2/3] barriers: block-raw-posix barrier support
  2009-05-05 12:33   ` Jamie Lokier
@ 2009-05-05 13:29     ` Christoph Hellwig
  2009-05-05 16:00       ` Jamie Lokier
  0 siblings, 1 reply; 15+ messages in thread
From: Christoph Hellwig @ 2009-05-05 13:29 UTC (permalink / raw)
  To: Jamie Lokier; +Cc: Christoph Hellwig, qemu-devel

On Tue, May 05, 2009 at 01:33:11PM +0100, Jamie Lokier wrote:
> You don't need two fdatasyncs if the barrier request is just a
> barrier, no data write, used only to flush previously written data by
> a guest's fsync/fdatasync implementation.

Yeah.  I'll put that optimization in after some testing.

> This is the best argument yet for having distinct "barrier" and "sync"
> operations.  "Barrier" is for ordering I/O, such as journalling
> filesystems.

Doesn't really help as long as we're using the normal Posix filesystem
APIs on the host.  The only way to guarantee ordering of multiple
*write* system calls is to call f(data)sync between them.
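
In other words, with plain Posix calls the only way to make sure write A
reaches storage before write B is something like this (minimal sketch,
error handling omitted):

    pwrite(fd, bufA, lenA, offA);   /* write A */
    fdatasync(fd);                  /* A is stable before B is issued */
    pwrite(fd, bufB, lenB, offB);   /* write B, now ordered after A */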


* Re: [Qemu-devel] barriers: block layer preparations
  2009-05-05 12:08 ` [Qemu-devel] barriers: block layer preparations Christoph Hellwig
@ 2009-05-05 13:51   ` Avi Kivity
  2009-05-05 15:38     ` Jamie Lokier
  2009-05-05 20:57     ` Christoph Hellwig
  0 siblings, 2 replies; 15+ messages in thread
From: Avi Kivity @ 2009-05-05 13:51 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: qemu-devel

Christoph Hellwig wrote:
> Add a flag to BlockDriverState to advertise barrier support, and add a flags
> argument to bdrv_aio_readv/writev to allow passing down the barrier flag.
>
> Note that the flags argument to bdrv_aio_readv is for now actually superfluous
> because write barriers obviously only apply to writes.  I kept the read-side
> argument so the API is symmetric and we can easily add more flags to both
> of them.
>
>
> @@ -1498,7 +1498,7 @@ static void qcow_aio_write_cb(void *opaq
>      acb->hd_aiocb = bdrv_aio_writev(s->hd,
>                                      (acb->cluster_offset >> 9) + index_in_cluster,
>                                      &acb->hd_qiov, acb->n,
> -                                    qcow_aio_write_cb, acb);
> +                                    qcow_aio_write_cb, acb, 2);
>   

2?

An alternative approach is to add a new op, bdrv_aio_barrier(), 
submitted immediately after the write.  It's probably more complicated 
overall.
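
For concreteness, such an op might have a signature along these lines
(hypothetical, mirroring bdrv_aio_writev minus the data arguments):

    BlockDriverAIOCB *bdrv_aio_barrier(BlockDriverState *bs,
                                       BlockDriverCompletionFunc *cb,
                                       void *opaque);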

A barrier for read can be meaningful if the guest wishes to read a 
known-to-be-stable write.  I don't think any guest can make use of this 
though.

-- 
error compiling committee.c: too many arguments to function


* Re: [Qemu-devel] [PATCH 0/3] write barrier support
  2009-05-05 12:08 [Qemu-devel] [PATCH 0/3] write barrier support Christoph Hellwig
                   ` (2 preceding siblings ...)
  2009-05-05 12:09 ` [Qemu-devel] [PATCH 3/3] barriers: virtio Christoph Hellwig
@ 2009-05-05 13:53 ` Avi Kivity
  2009-05-05 21:00   ` Christoph Hellwig
  3 siblings, 1 reply; 15+ messages in thread
From: Avi Kivity @ 2009-05-05 13:53 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: qemu-devel

Christoph Hellwig wrote:
> Add support for write barriers that allow guests to control write
> ordering when write-back caching is enabled.
>
> For now only the simple stack of virtio on posix raw/host devices is
> supported.
>   

I suggest emulating barriers for format drivers that don't support them 
by queuing the writes in the block layer and issuing a flush_all command.
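
In rough pseudocode the emulation could look like this (hypothetical
helper names; bdrv_flush is the existing flush entry point):

    /* on a barrier write to a format driver without native support: */
    hold_back_new_writes(bs);        /* queue writes arriving after it */
    wait_for_inflight_writes(bs);    /* drain already-submitted writes */
    bdrv_flush(bs);                  /* commit everything written so far */
    do_barrier_write(bs, req);       /* the barrier write itself */
    bdrv_flush(bs);
    release_held_writes(bs);         /* resume normal operation */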

-- 
error compiling committee.c: too many arguments to function


* Re: [Qemu-devel] barriers: block layer preparations
  2009-05-05 13:51   ` Avi Kivity
@ 2009-05-05 15:38     ` Jamie Lokier
  2009-05-05 15:49       ` Avi Kivity
  2009-05-05 20:57     ` Christoph Hellwig
  1 sibling, 1 reply; 15+ messages in thread
From: Jamie Lokier @ 2009-05-05 15:38 UTC (permalink / raw)
  To: Avi Kivity; +Cc: Christoph Hellwig, qemu-devel

Avi Kivity wrote:
> An alternative approach is to add a new op, bdrv_aio_barrier(), 
> submitted immediately after the write.  It's probably more complicated 
> overall.

Is it ok to submit a zero-length bdrv_aio_writev(), when you need a
barrier without data?

-- Jamie


* Re: [Qemu-devel] barriers: block layer preparations
  2009-05-05 15:38     ` Jamie Lokier
@ 2009-05-05 15:49       ` Avi Kivity
  2009-05-05 16:00         ` Jamie Lokier
  0 siblings, 1 reply; 15+ messages in thread
From: Avi Kivity @ 2009-05-05 15:49 UTC (permalink / raw)
  To: Jamie Lokier; +Cc: Christoph Hellwig, qemu-devel

Jamie Lokier wrote:
> Avi Kivity wrote:
>   
>> An alternative approach is to add a new op, bdrv_aio_barrier(), 
>> submitted immediately after the write.  It's probably more complicated 
>> overall.
>>     
>
> Is it ok to submit a zero-length bdrv_aio_writev(), when you need a
> barrier without data?
>   

We could make it so.  But my point was to make barriers and writes 
orthogonal.

-- 
error compiling committee.c: too many arguments to function


* Re: [Qemu-devel] [PATCH 2/3] barriers: block-raw-posix barrier support
  2009-05-05 13:29     ` Christoph Hellwig
@ 2009-05-05 16:00       ` Jamie Lokier
  0 siblings, 0 replies; 15+ messages in thread
From: Jamie Lokier @ 2009-05-05 16:00 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: qemu-devel

Christoph Hellwig wrote:
> On Tue, May 05, 2009 at 01:33:11PM +0100, Jamie Lokier wrote:
> > You don't need two fdatasyncs if the barrier request is just a
> > barrier, no data write, used only to flush previously written data by
> > a guest's fsync/fdatasync implementation.
> 
> Yeah.  I'll put that optimization in after some testing.

I suggest keeping a flag "flush_needed".  Set it whenever a write is
submitted, don't submit fsync/fdatasync when the flag is clear, clear
it whenever an fsync/fdatasync is submitted.  Provides a few more
optimisation opportunities.
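
A minimal sketch of that bookkeeping (names hypothetical, assuming one
flag per backing file):

    static bool flush_needed;

    /* on every write submission: */
    flush_needed = true;

    /* on a guest fsync/fdatasync request: */
    if (flush_needed) {
        fdatasync(fd);
        flush_needed = false;
    }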

> > This is the best argument yet for having distinct "barrier" and "sync"
> > operations.  "Barrier" is for ordering I/O, such as journalling
> > filesystems.
> 
> Doesn't really help as long as we're using the normal Posix filesystem
> APIs on the host.  The only way to guarantee ordering of multiple
> *write* systen calls is to call f(data)sync between them.

It doesn't help with journalling barriers, which I agree are dominant
in a lot of workloads, but it does help guest fsync-heavy workloads.

When "Sync && !Barrier" the guest doesn't require the full ordering
guarantee.

Therefore you can call f(data)sync _and_ call some writes on other I/O
threads in parallel.  The f(data)sync mustn't be started until
previously-queued writes are complete, but later-queued writes can be
called in parallel with f(data)sync.

(Or if using Linux AIO, the same with aio_fsync and later-queued
aio_writes in parallel).

In other words, with a guest fdatasync-heavy workload, like a
database, it could keep the I/O pipeline busy instead of draining it
as the full barrier does.

It won't help with a journalling-barrier-heavy workload, without
changes to the host to expose the distinct barrier types - i.e. a more
flexible alternative to f(data)sync, such as is occasionally discussed
elsewhere.

-- Jamie


* Re: [Qemu-devel] barriers: block layer preparations
  2009-05-05 15:49       ` Avi Kivity
@ 2009-05-05 16:00         ` Jamie Lokier
  0 siblings, 0 replies; 15+ messages in thread
From: Jamie Lokier @ 2009-05-05 16:00 UTC (permalink / raw)
  To: Avi Kivity; +Cc: Christoph Hellwig, qemu-devel

Avi Kivity wrote:
> Jamie Lokier wrote:
> >Avi Kivity wrote:
> >  
> >>An alternative approach is to add a new op, bdrv_aio_barrier(), 
> >>submitted immediately after the write.  It's probably more complicated 
> >>overall.
> >>    
> >
> >Is it ok to submit a zero-length bdrv_aio_writev(), when you need a
> >barrier without data?
> >  
> 
> We could make it so.  But my point was to make barriers and writes 
> orthogonal.

Yes, and my point is that you _must_ make them orthogonal if
zero-length write isn't permitted :-)

-- Jamie


* Re: [Qemu-devel] barriers: block layer preparations
  2009-05-05 13:51   ` Avi Kivity
  2009-05-05 15:38     ` Jamie Lokier
@ 2009-05-05 20:57     ` Christoph Hellwig
  2009-05-05 22:49       ` Jamie Lokier
  1 sibling, 1 reply; 15+ messages in thread
From: Christoph Hellwig @ 2009-05-05 20:57 UTC (permalink / raw)
  To: Avi Kivity; +Cc: Christoph Hellwig, qemu-devel

On Tue, May 05, 2009 at 04:51:39PM +0300, Avi Kivity wrote:
> >+                                    qcow_aio_write_cb, acb, 2);
> >  
> 
> 2?

Typo.

> An alternative approach is to add a new op, bdrv_aio_barrier(), 
> submitted immediately after the write.  It's probably more complicated 
> overall.

Yeah, it sounds more complicated to implement to me.

> A barrier for read can be meaningful if the guest wishes to read a 
> known-to-be-stable write.  I don't think any guest can make use of this 
> though.

I don't know of any storage standard having anything like this or any
OS taking advantage of it.  I'd be happy to review anything like that,
though.


* Re: [Qemu-devel] [PATCH 0/3] write barrier support
  2009-05-05 13:53 ` [Qemu-devel] [PATCH 0/3] write barrier support Avi Kivity
@ 2009-05-05 21:00   ` Christoph Hellwig
  0 siblings, 0 replies; 15+ messages in thread
From: Christoph Hellwig @ 2009-05-05 21:00 UTC (permalink / raw)
  To: Avi Kivity; +Cc: Christoph Hellwig, qemu-devel

On Tue, May 05, 2009 at 04:53:25PM +0300, Avi Kivity wrote:
> I suggest emulating barriers for format drivers that don't support them 
> by queuing the writes in the block layer and issuing a flush_all command.

We could potentially move the thread pool in posix-aio-compat.c to
work on top of the QEMU block API instead of the Posix API, which would
take care of this.  It might also help replace the state machines
in qcow/qcow2 with a simpler threaded model, but that would need
separate investigation.


* Re: [Qemu-devel] barriers: block layer preparations
  2009-05-05 20:57     ` Christoph Hellwig
@ 2009-05-05 22:49       ` Jamie Lokier
  0 siblings, 0 replies; 15+ messages in thread
From: Jamie Lokier @ 2009-05-05 22:49 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: Avi Kivity, qemu-devel

Christoph Hellwig wrote:
> > A barrier for read can be meaningful if the guest wishes to read a 
> > known-to-be-stable write.  I don't think any guest can make use of this 
> > though.
> 
> I don't know of any storage standard having anything like this or any
> OS taking advantage of it.  I'd be happy to review anything like that,
> though.

Several OSes have O_RSYNC, meaning reads hit the storage instead of
the OS cache.  I don't know if they bypass storage caches, but the
purpose seems to be to verify writes, and they will certainly bypass
storage caches if the OS or user turns off the storage caches.
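
For reference, O_RSYNC is requested at open time and is usually combined
with O_SYNC or O_DSYNC (a sketch; support varies between OSes):

    int fd = open(path, O_RDWR | O_RSYNC | O_SYNC);
    /* reads on fd now complete with O_SYNC-level integrity */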

-- Jamie

