All of lore.kernel.org
 help / color / mirror / Atom feed
* [Qemu-devel] [PATCH 0/3] better I/O accounting V2
@ 2011-08-21 22:25 Christoph Hellwig
  2011-08-21 22:25 ` [Qemu-devel] [PATCH 1/3] block: include flush requests in info blockstats Christoph Hellwig
                   ` (6 more replies)
  0 siblings, 7 replies; 23+ messages in thread
From: Christoph Hellwig @ 2011-08-21 22:25 UTC (permalink / raw)
  To: qemu-devel

changes since V1:
 - rebase to apply against the current qemu.git tree

^ permalink raw reply	[flat|nested] 23+ messages in thread

* [Qemu-devel] [PATCH 1/3] block: include flush requests in info blockstats
  2011-08-21 22:25 [Qemu-devel] [PATCH 0/3] better I/O accounting V2 Christoph Hellwig
@ 2011-08-21 22:25 ` Christoph Hellwig
  2011-08-21 22:26 ` [Qemu-devel] [PATCH 2/3] block: explicit I/O accounting Christoph Hellwig
                   ` (5 subsequent siblings)
  6 siblings, 0 replies; 23+ messages in thread
From: Christoph Hellwig @ 2011-08-21 22:25 UTC (permalink / raw)
  To: qemu-devel

Signed-off-by: Christoph Hellwig <hch@lst.de>

Index: qemu/block.c
===================================================================
--- qemu.orig/block.c	2011-08-11 12:32:35.842658196 +0200
+++ qemu/block.c	2011-08-11 16:04:48.302061893 +0200
@@ -1888,11 +1888,13 @@ static void bdrv_stats_iter(QObject *dat
                         " wr_bytes=%" PRId64
                         " rd_operations=%" PRId64
                         " wr_operations=%" PRId64
+                        " flush_operations=%" PRId64
                         "\n",
                         qdict_get_int(qdict, "rd_bytes"),
                         qdict_get_int(qdict, "wr_bytes"),
                         qdict_get_int(qdict, "rd_operations"),
-                        qdict_get_int(qdict, "wr_operations"));
+                        qdict_get_int(qdict, "wr_operations"),
+                        qdict_get_int(qdict, "flush_operations"));
 }
 
 void bdrv_stats_print(Monitor *mon, const QObject *data)
@@ -1910,12 +1912,16 @@ static QObject* bdrv_info_stats_bs(Block
                              "'wr_bytes': %" PRId64 ","
                              "'rd_operations': %" PRId64 ","
                              "'wr_operations': %" PRId64 ","
-                             "'wr_highest_offset': %" PRId64
+                             "'wr_highest_offset': %" PRId64 ","
+                             "'flush_operations': %" PRId64
                              "} }",
-                             bs->rd_bytes, bs->wr_bytes,
-                             bs->rd_ops, bs->wr_ops,
+                             bs->rd_bytes,
+                             bs->wr_bytes,
+                             bs->rd_ops,
+                             bs->wr_ops,
                              bs->wr_highest_sector *
-                             (uint64_t)BDRV_SECTOR_SIZE);
+                             (uint64_t)BDRV_SECTOR_SIZE,
+                             bs->flush_ops);
     dict  = qobject_to_qdict(res);
 
     if (*bs->device_name) {
@@ -2579,6 +2585,8 @@ BlockDriverAIOCB *bdrv_aio_flush(BlockDr
 
     trace_bdrv_aio_flush(bs, opaque);
 
+    bs->flush_ops++;
+
     if (bs->open_flags & BDRV_O_NO_FLUSH) {
         return bdrv_aio_noop_em(bs, cb, opaque);
     }
Index: qemu/block_int.h
===================================================================
--- qemu.orig/block_int.h	2011-08-11 12:32:35.852658142 +0200
+++ qemu/block_int.h	2011-08-11 16:04:48.302061893 +0200
@@ -188,6 +188,7 @@ struct BlockDriverState {
     uint64_t wr_bytes;
     uint64_t rd_ops;
     uint64_t wr_ops;
+    uint64_t flush_ops;
     uint64_t wr_highest_sector;
 
     /* Whether the disk can expand beyond total_sectors */
Index: qemu/qmp-commands.hx
===================================================================
--- qemu.orig/qmp-commands.hx	2011-08-11 12:32:36.062657004 +0200
+++ qemu/qmp-commands.hx	2011-08-11 16:04:48.305395208 +0200
@@ -1201,6 +1201,7 @@ Each json-object contain the following:
     - "wr_bytes": bytes written (json-int)
     - "rd_operations": read operations (json-int)
     - "wr_operations": write operations (json-int)
+    - "flush_operations": cache flush operations (json-int)
     - "wr_highest_offset": Highest offset of a sector written since the
                            BlockDriverState has been opened (json-int)
 - "parent": Contains recursively the statistics of the underlying
@@ -1222,6 +1223,7 @@ Example:
                   "wr_operations":751,
                   "rd_bytes":122567168,
                   "rd_operations":36772
+                  "flush_operations":61,
                }
             },
             "stats":{
@@ -1230,6 +1232,7 @@ Example:
                "wr_operations":692,
                "rd_bytes":122739200,
                "rd_operations":36604
+               "flush_operations":51,
             }
          },
          {
@@ -1240,6 +1243,7 @@ Example:
                "wr_operations":0,
                "rd_bytes":0,
                "rd_operations":0
+               "flush_operations":0,
             }
          },
          {
@@ -1250,6 +1254,7 @@ Example:
                "wr_operations":0,
                "rd_bytes":0,
                "rd_operations":0
+               "flush_operations":0,
             }
          },
          {
@@ -1260,6 +1265,7 @@ Example:
                "wr_operations":0,
                "rd_bytes":0,
                "rd_operations":0
+               "flush_operations":0,
             }
          }
       ]

^ permalink raw reply	[flat|nested] 23+ messages in thread

* [Qemu-devel] [PATCH 2/3] block: explicit I/O accounting
  2011-08-21 22:25 [Qemu-devel] [PATCH 0/3] better I/O accounting V2 Christoph Hellwig
  2011-08-21 22:25 ` [Qemu-devel] [PATCH 1/3] block: include flush requests in info blockstats Christoph Hellwig
@ 2011-08-21 22:26 ` Christoph Hellwig
  2011-08-22 15:48   ` Kevin Wolf
  2011-08-21 22:26 ` [Qemu-devel] [PATCH 3/3] block: latency accounting Christoph Hellwig
                   ` (4 subsequent siblings)
  6 siblings, 1 reply; 23+ messages in thread
From: Christoph Hellwig @ 2011-08-21 22:26 UTC (permalink / raw)
  To: qemu-devel

Decouple the I/O accounting from bdrv_aio_readv/writev/flush and
make the hardware models call directly into the accounting helpers.

This means:
 - we do not count internal requests from image formats in addition
   to guest originating I/O
 - we do not double count I/O ops if the device model handles it
   chunk wise
 - we only account I/O once it actually is done
 - can extend I/O accounting to synchronous or coroutine I/O easily
 - implement I/O latency tracking easily (see the next patch)

I've converted the existing device model callers to the new model,
device models that are using synchronous I/O and weren't accounted
before haven't been updated yet.  Also scsi hasn't been converted
to the end-to-end accounting as I want to defer that after the pending
scsi layer overhaul.

Signed-off-by: Christoph Hellwig <hch@lst.de>

Index: qemu/block.c
===================================================================
--- qemu.orig/block.c	2011-08-21 14:53:09.684226561 -0700
+++ qemu/block.c	2011-08-21 14:57:09.507558463 -0700
@@ -1915,13 +1915,13 @@ static QObject* bdrv_info_stats_bs(Block
                              "'wr_highest_offset': %" PRId64 ","
                              "'flush_operations': %" PRId64
                              "} }",
-                             bs->rd_bytes,
-                             bs->wr_bytes,
-                             bs->rd_ops,
-                             bs->wr_ops,
+                             bs->nr_bytes[BDRV_ACCT_READ],
+                             bs->nr_bytes[BDRV_ACCT_WRITE],
+                             bs->nr_ops[BDRV_ACCT_READ],
+                             bs->nr_ops[BDRV_ACCT_WRITE],
                              bs->wr_highest_sector *
                              (uint64_t)BDRV_SECTOR_SIZE,
-                             bs->flush_ops);
+                             bs->nr_ops[BDRV_ACCT_FLUSH]);
     dict  = qobject_to_qdict(res);
 
     if (*bs->device_name) {
@@ -2235,7 +2235,6 @@ char *bdrv_snapshot_dump(char *buf, int
     return buf;
 }
 
-
 /**************************************************************/
 /* async I/Os */
 
@@ -2244,7 +2243,6 @@ BlockDriverAIOCB *bdrv_aio_readv(BlockDr
                                  BlockDriverCompletionFunc *cb, void *opaque)
 {
     BlockDriver *drv = bs->drv;
-    BlockDriverAIOCB *ret;
 
     trace_bdrv_aio_readv(bs, sector_num, nb_sectors, opaque);
 
@@ -2253,16 +2251,7 @@ BlockDriverAIOCB *bdrv_aio_readv(BlockDr
     if (bdrv_check_request(bs, sector_num, nb_sectors))
         return NULL;
 
-    ret = drv->bdrv_aio_readv(bs, sector_num, qiov, nb_sectors,
-                              cb, opaque);
-
-    if (ret) {
-	/* Update stats even though technically transfer has not happened. */
-	bs->rd_bytes += (unsigned) nb_sectors * BDRV_SECTOR_SIZE;
-	bs->rd_ops ++;
-    }
-
-    return ret;
+    return drv->bdrv_aio_readv(bs, sector_num, qiov, nb_sectors, cb, opaque);
 }
 
 typedef struct BlockCompleteData {
@@ -2329,9 +2318,6 @@ BlockDriverAIOCB *bdrv_aio_writev(BlockD
                                cb, opaque);
 
     if (ret) {
-        /* Update stats even though technically transfer has not happened. */
-        bs->wr_bytes += (unsigned) nb_sectors * BDRV_SECTOR_SIZE;
-        bs->wr_ops ++;
         if (bs->wr_highest_sector < sector_num + nb_sectors - 1) {
             bs->wr_highest_sector = sector_num + nb_sectors - 1;
         }
@@ -2585,8 +2571,6 @@ BlockDriverAIOCB *bdrv_aio_flush(BlockDr
 
     trace_bdrv_aio_flush(bs, opaque);
 
-    bs->flush_ops++;
-
     if (bs->open_flags & BDRV_O_NO_FLUSH) {
         return bdrv_aio_noop_em(bs, cb, opaque);
     }
Index: qemu/block_int.h
===================================================================
--- qemu.orig/block_int.h	2011-08-21 14:53:09.000000000 -0700
+++ qemu/block_int.h	2011-08-21 14:57:09.510891797 -0700
@@ -148,6 +148,13 @@ struct BlockDriver {
     QLIST_ENTRY(BlockDriver) list;
 };
 
+enum BlockIOType {
+    BDRV_ACCT_READ,
+    BDRV_ACCT_WRITE,
+    BDRV_ACCT_FLUSH,
+    BDRV_MAX_IOTYPE,
+};
+
 struct BlockDriverState {
     int64_t total_sectors; /* if we are reading a disk image, give its
                               size in sectors */
@@ -184,11 +191,8 @@ struct BlockDriverState {
     void *sync_aiocb;
 
     /* I/O stats (display with "info blockstats"). */
-    uint64_t rd_bytes;
-    uint64_t wr_bytes;
-    uint64_t rd_ops;
-    uint64_t wr_ops;
-    uint64_t flush_ops;
+    uint64_t nr_bytes[BDRV_MAX_IOTYPE];
+    uint64_t nr_ops[BDRV_MAX_IOTYPE];
     uint64_t wr_highest_sector;
 
     /* Whether the disk can expand beyond total_sectors */
@@ -212,6 +216,27 @@ struct BlockDriverState {
     void *private;
 };
 
+typedef struct BlockAcctCookie {
+    int64_t bytes;
+    enum BlockIOType type;
+} BlockAcctCookie;
+
+static inline void
+bdrv_acct_start(BlockDriverState *bs, BlockAcctCookie *cookie, int64_t bytes,
+        enum BlockIOType type)
+{
+    cookie->bytes = bytes;
+    cookie->type = type;
+}
+
+static inline void
+bdrv_acct_done(BlockDriverState *bs, BlockAcctCookie *cookie)
+{
+    bs->nr_bytes[cookie->type] += cookie->bytes;
+    bs->nr_ops[cookie->type]++;
+}
+
+
 #define CHANGE_MEDIA    0x01
 #define CHANGE_SIZE     0x02
 
Index: qemu/hw/ide/ahci.c
===================================================================
--- qemu.orig/hw/ide/ahci.c	2011-08-21 11:49:29.000000000 -0700
+++ qemu/hw/ide/ahci.c	2011-08-21 14:57:09.510891797 -0700
@@ -710,6 +710,7 @@ static void ncq_cb(void *opaque, int ret
     DPRINTF(ncq_tfs->drive->port_no, "NCQ transfer tag %d finished\n",
             ncq_tfs->tag);
 
+    bdrv_acct_done(ncq_tfs->drive->port.ifs[0].bs, &ncq_tfs->acct);
     qemu_sglist_destroy(&ncq_tfs->sglist);
     ncq_tfs->used = 0;
 }
@@ -755,7 +756,11 @@ static void process_ncq_command(AHCIStat
                     ncq_tfs->sector_count-1, ncq_tfs->lba, ncq_tfs->tag);
             ncq_tfs->is_read = 1;
 
-            DPRINTF(port, "tag %d aio read %ld\n", ncq_tfs->tag, ncq_tfs->lba);
+	    DPRINTF(port, "tag %d aio read %ld\n", ncq_tfs->tag, ncq_tfs->lba);
+
+            bdrv_acct_start(ncq_tfs->drive->port.ifs[0].bs, &ncq_tfs->acct,
+                            (ncq_tfs->sector_count-1) * BDRV_SECTOR_SIZE,
+                            BDRV_ACCT_READ);
             ncq_tfs->aiocb = dma_bdrv_read(ncq_tfs->drive->port.ifs[0].bs,
                                            &ncq_tfs->sglist, ncq_tfs->lba,
                                            ncq_cb, ncq_tfs);
@@ -766,6 +771,10 @@ static void process_ncq_command(AHCIStat
             ncq_tfs->is_read = 0;
 
             DPRINTF(port, "tag %d aio write %ld\n", ncq_tfs->tag, ncq_tfs->lba);
+
+            bdrv_acct_start(ncq_tfs->drive->port.ifs[0].bs, &ncq_tfs->acct,
+                            (ncq_tfs->sector_count-1) * BDRV_SECTOR_SIZE,
+                            BDRV_ACCT_WRITE);
             ncq_tfs->aiocb = dma_bdrv_write(ncq_tfs->drive->port.ifs[0].bs,
                                             &ncq_tfs->sglist, ncq_tfs->lba,
                                             ncq_cb, ncq_tfs);
Index: qemu/hw/ide/atapi.c
===================================================================
--- qemu.orig/hw/ide/atapi.c	2011-08-21 11:49:17.000000000 -0700
+++ qemu/hw/ide/atapi.c	2011-08-21 14:57:09.510891797 -0700
@@ -250,6 +250,7 @@ static void ide_atapi_cmd_reply(IDEState
     s->io_buffer_index = 0;
 
     if (s->atapi_dma) {
+        bdrv_acct_start(s->bs, &s->acct, size, BDRV_ACCT_READ);
         s->status = READY_STAT | SEEK_STAT | DRQ_STAT;
         s->bus->dma->ops->start_dma(s->bus->dma, s,
                                    ide_atapi_cmd_read_dma_cb);
@@ -322,10 +323,7 @@ static void ide_atapi_cmd_read_dma_cb(vo
         s->status = READY_STAT | SEEK_STAT;
         s->nsector = (s->nsector & ~7) | ATAPI_INT_REASON_IO | ATAPI_INT_REASON_CD;
         ide_set_irq(s->bus);
-    eot:
-        s->bus->dma->ops->add_status(s->bus->dma, BM_STATUS_INT);
-        ide_set_inactive(s);
-        return;
+        goto eot;
     }
 
     s->io_buffer_index = 0;
@@ -343,9 +341,11 @@ static void ide_atapi_cmd_read_dma_cb(vo
 #ifdef DEBUG_AIO
     printf("aio_read_cd: lba=%u n=%d\n", s->lba, n);
 #endif
+
     s->bus->dma->iov.iov_base = (void *)(s->io_buffer + data_offset);
     s->bus->dma->iov.iov_len = n * 4 * 512;
     qemu_iovec_init_external(&s->bus->dma->qiov, &s->bus->dma->iov, 1);
+
     s->bus->dma->aiocb = bdrv_aio_readv(s->bs, (int64_t)s->lba << 2,
                                        &s->bus->dma->qiov, n * 4,
                                        ide_atapi_cmd_read_dma_cb, s);
@@ -355,6 +355,12 @@ static void ide_atapi_cmd_read_dma_cb(vo
                             ASC_MEDIUM_NOT_PRESENT);
         goto eot;
     }
+
+    return;
+eot:
+    bdrv_acct_done(s->bs, &s->acct);
+    s->bus->dma->ops->add_status(s->bus->dma, BM_STATUS_INT);
+    ide_set_inactive(s);
 }
 
 /* start a CD-CDROM read command with DMA */
@@ -368,6 +374,8 @@ static void ide_atapi_cmd_read_dma(IDESt
     s->io_buffer_size = 0;
     s->cd_sector_size = sector_size;
 
+    bdrv_acct_start(s->bs, &s->acct, s->packet_transfer_size, BDRV_ACCT_READ);
+
     /* XXX: check if BUSY_STAT should be set */
     s->status = READY_STAT | SEEK_STAT | DRQ_STAT | BUSY_STAT;
     s->bus->dma->ops->start_dma(s->bus->dma, s,
Index: qemu/hw/ide/core.c
===================================================================
--- qemu.orig/hw/ide/core.c	2011-08-21 11:49:17.000000000 -0700
+++ qemu/hw/ide/core.c	2011-08-21 14:57:09.510891797 -0700
@@ -473,7 +473,10 @@ void ide_sector_read(IDEState *s)
 #endif
         if (n > s->req_nb_sectors)
             n = s->req_nb_sectors;
+
+        bdrv_acct_start(s->bs, &s->acct, n * BDRV_SECTOR_SIZE, BDRV_ACCT_READ);
         ret = bdrv_read(s->bs, sector_num, s->io_buffer, n);
+        bdrv_acct_done(s->bs, &s->acct);
         if (ret != 0) {
             if (ide_handle_rw_error(s, -ret,
                 BM_STATUS_PIO_RETRY | BM_STATUS_RETRY_READ))
@@ -610,7 +613,8 @@ handle_rw_error:
     return;
 
 eot:
-   ide_set_inactive(s);
+    bdrv_acct_done(s->bs, &s->acct);
+    ide_set_inactive(s);
 }
 
 static void ide_sector_start_dma(IDEState *s, enum ide_dma_cmd dma_cmd)
@@ -619,6 +623,20 @@ static void ide_sector_start_dma(IDEStat
     s->io_buffer_index = 0;
     s->io_buffer_size = 0;
     s->dma_cmd = dma_cmd;
+
+    switch (dma_cmd) {
+    case IDE_DMA_READ:
+        bdrv_acct_start(s->bs, &s->acct, s->nsector * BDRV_SECTOR_SIZE,
+                        BDRV_ACCT_READ);
+        break;
+    case IDE_DMA_WRITE:
+        bdrv_acct_start(s->bs, &s->acct, s->nsector * BDRV_SECTOR_SIZE,
+                        BDRV_ACCT_WRITE);
+	break;
+    default:
+        break;
+    }
+
     s->bus->dma->ops->start_dma(s->bus->dma, s, ide_dma_cb);
 }
 
@@ -641,7 +659,10 @@ void ide_sector_write(IDEState *s)
     n = s->nsector;
     if (n > s->req_nb_sectors)
         n = s->req_nb_sectors;
+
+    bdrv_acct_start(s->bs, &s->acct, n * BDRV_SECTOR_SIZE, BDRV_ACCT_READ);
     ret = bdrv_write(s->bs, sector_num, s->io_buffer, n);
+    bdrv_acct_done(s->bs, &s->acct);
 
     if (ret != 0) {
         if (ide_handle_rw_error(s, -ret, BM_STATUS_PIO_RETRY))
@@ -685,6 +706,7 @@ static void ide_flush_cb(void *opaque, i
         }
     }
 
+    bdrv_acct_done(s->bs, &s->acct);
     s->status = READY_STAT | SEEK_STAT;
     ide_set_irq(s->bus);
 }
@@ -698,6 +720,7 @@ void ide_flush_cache(IDEState *s)
         return;
     }
 
+    bdrv_acct_start(s->bs, &s->acct, 0, BDRV_ACCT_FLUSH);
     acb = bdrv_aio_flush(s->bs, ide_flush_cb, s);
     if (acb == NULL) {
         ide_flush_cb(s, -EIO);
Index: qemu/hw/ide/macio.c
===================================================================
--- qemu.orig/hw/ide/macio.c	2011-08-21 11:49:29.000000000 -0700
+++ qemu/hw/ide/macio.c	2011-08-21 14:57:09.514225130 -0700
@@ -52,8 +52,7 @@ static void pmac_ide_atapi_transfer_cb(v
         m->aiocb = NULL;
         qemu_sglist_destroy(&s->sg);
         ide_atapi_io_error(s, ret);
-        io->dma_end(opaque);
-        return;
+        goto done;
     }
 
     if (s->io_buffer_size > 0) {
@@ -71,8 +70,7 @@ static void pmac_ide_atapi_transfer_cb(v
         ide_atapi_cmd_ok(s);
 
     if (io->len == 0) {
-        io->dma_end(opaque);
-        return;
+        goto done;
     }
 
     /* launch next transfer */
@@ -92,9 +90,14 @@ static void pmac_ide_atapi_transfer_cb(v
         /* Note: media not present is the most likely case */
         ide_atapi_cmd_error(s, SENSE_NOT_READY,
                             ASC_MEDIUM_NOT_PRESENT);
-        io->dma_end(opaque);
-        return;
+        goto done;
     }
+    return;
+
+done:
+    bdrv_acct_done(s->bs, &s->acct);
+    io->dma_end(opaque);
+    return;
 }
 
 static void pmac_ide_transfer_cb(void *opaque, int ret)
@@ -109,8 +112,7 @@ static void pmac_ide_transfer_cb(void *o
         m->aiocb = NULL;
         qemu_sglist_destroy(&s->sg);
 	ide_dma_error(s);
-        io->dma_end(io);
-        return;
+        goto done;
     }
 
     sector_num = ide_get_sector(s);
@@ -130,10 +132,8 @@ static void pmac_ide_transfer_cb(void *o
     }
 
     /* end of DMA ? */
-
     if (io->len == 0) {
-        io->dma_end(io);
-	return;
+        goto done;
     }
 
     /* launch next transfer */
@@ -163,6 +163,10 @@ static void pmac_ide_transfer_cb(void *o
 
     if (!m->aiocb)
         pmac_ide_transfer_cb(io, -1);
+    return;
+done:
+    bdrv_acct_done(s->bs, &s->acct);
+    io->dma_end(io);
 }
 
 static void pmac_ide_transfer(DBDMA_io *io)
@@ -172,10 +176,22 @@ static void pmac_ide_transfer(DBDMA_io *
 
     s->io_buffer_size = 0;
     if (s->drive_kind == IDE_CD) {
+        bdrv_acct_start(s->bs, &s->acct, io->len, BDRV_ACCT_READ);
         pmac_ide_atapi_transfer_cb(io, 0);
         return;
     }
 
+    switch (s->dma_cmd) {
+    case IDE_DMA_READ:
+        bdrv_acct_start(s->bs, &s->acct, io->len, BDRV_ACCT_READ);
+        break;
+    case IDE_DMA_WRITE:
+        bdrv_acct_start(s->bs, &s->acct, io->len, BDRV_ACCT_WRITE);
+        break;
+    default:
+        break;
+    }
+
     pmac_ide_transfer_cb(io, 0);
 }
 
Index: qemu/hw/scsi-disk.c
===================================================================
--- qemu.orig/hw/scsi-disk.c	2011-08-21 11:49:29.000000000 -0700
+++ qemu/hw/scsi-disk.c	2011-08-21 14:57:09.514225130 -0700
@@ -57,6 +57,7 @@ typedef struct SCSIDiskReq {
     struct iovec iov;
     QEMUIOVector qiov;
     uint32_t status;
+    BlockAcctCookie acct;
 } SCSIDiskReq;
 
 struct SCSIDiskState
@@ -107,6 +108,7 @@ static void scsi_cancel_io(SCSIRequest *
 static void scsi_read_complete(void * opaque, int ret)
 {
     SCSIDiskReq *r = (SCSIDiskReq *)opaque;
+    SCSIDiskState *s = DO_UPCAST(SCSIDiskState, qdev, r->req.dev);
     int n;
 
     r->req.aiocb = NULL;
@@ -117,6 +119,8 @@ static void scsi_read_complete(void * op
         }
     }
 
+    bdrv_acct_done(s->bs, &r->acct);
+
     DPRINTF("Data ready tag=0x%x len=%zd\n", r->req.tag, r->iov.iov_len);
 
     n = r->iov.iov_len / 512;
@@ -161,6 +165,8 @@ static void scsi_read_data(SCSIRequest *
 
     r->iov.iov_len = n * 512;
     qemu_iovec_init_external(&r->qiov, &r->iov, 1);
+
+    bdrv_acct_start(s->bs, &r->acct, n * BDRV_SECTOR_SIZE, BDRV_ACCT_READ);
     r->req.aiocb = bdrv_aio_readv(s->bs, r->sector, &r->qiov, n,
                               scsi_read_complete, r);
     if (r->req.aiocb == NULL) {
@@ -207,6 +213,7 @@ static int scsi_handle_rw_error(SCSIDisk
 static void scsi_write_complete(void * opaque, int ret)
 {
     SCSIDiskReq *r = (SCSIDiskReq *)opaque;
+    SCSIDiskState *s = DO_UPCAST(SCSIDiskState, qdev, r->req.dev);
     uint32_t len;
     uint32_t n;
 
@@ -218,6 +225,8 @@ static void scsi_write_complete(void * o
         }
     }
 
+    bdrv_acct_done(s->bs, &r->acct);
+
     n = r->iov.iov_len / 512;
     r->sector += n;
     r->sector_count -= n;
@@ -252,6 +261,8 @@ static void scsi_write_data(SCSIRequest
     n = r->iov.iov_len / 512;
     if (n) {
         qemu_iovec_init_external(&r->qiov, &r->iov, 1);
+
+        bdrv_acct_start(s->bs, &r->acct, n * BDRV_SECTOR_SIZE, BDRV_ACCT_WRITE);
         r->req.aiocb = bdrv_aio_writev(s->bs, r->sector, &r->qiov, n,
                                    scsi_write_complete, r);
         if (r->req.aiocb == NULL) {
@@ -854,13 +865,19 @@ static int scsi_disk_emulate_command(SCS
         buflen = 8;
         break;
     case SYNCHRONIZE_CACHE:
+{
+        BlockAcctCookie acct;
+
+        bdrv_acct_start(s->bs, &acct, 0, BDRV_ACCT_FLUSH);
         ret = bdrv_flush(s->bs);
         if (ret < 0) {
             if (scsi_handle_rw_error(r, -ret, SCSI_REQ_STATUS_RETRY_FLUSH)) {
                 return -1;
             }
         }
+        bdrv_acct_done(s->bs, &acct);
         break;
+}
     case GET_CONFIGURATION:
         memset(outbuf, 0, 8);
         /* ??? This should probably return much more information.  For now
Index: qemu/hw/virtio-blk.c
===================================================================
--- qemu.orig/hw/virtio-blk.c	2011-08-21 11:49:29.000000000 -0700
+++ qemu/hw/virtio-blk.c	2011-08-21 14:57:44.887558247 -0700
@@ -47,6 +47,7 @@ typedef struct VirtIOBlockReq
     struct virtio_scsi_inhdr *scsi;
     QEMUIOVector qiov;
     struct VirtIOBlockReq *next;
+    BlockAcctCookie acct;
 } VirtIOBlockReq;
 
 static void virtio_blk_req_complete(VirtIOBlockReq *req, int status)
@@ -59,6 +60,7 @@ static void virtio_blk_req_complete(Virt
     virtqueue_push(s->vq, &req->elem, req->qiov.size + sizeof(*req->in));
     virtio_notify(&s->vdev, s->vq);
 
+    bdrv_acct_done(s->bs, &req->acct);
     g_free(req);
 }
 
@@ -266,6 +268,8 @@ static void virtio_blk_handle_flush(Virt
 {
     BlockDriverAIOCB *acb;
 
+    bdrv_acct_start(req->dev->bs, &req->acct, 0, BDRV_ACCT_FLUSH);
+
     /*
      * Make sure all outstanding writes are posted to the backing device.
      */
@@ -284,6 +288,8 @@ static void virtio_blk_handle_write(Virt
 
     sector = ldq_p(&req->out->sector);
 
+    bdrv_acct_start(req->dev->bs, &req->acct, req->qiov.size, BDRV_ACCT_WRITE);
+
     trace_virtio_blk_handle_write(req, sector, req->qiov.size / 512);
 
     if (sector & req->dev->sector_mask) {
@@ -317,6 +323,8 @@ static void virtio_blk_handle_read(VirtI
 
     sector = ldq_p(&req->out->sector);
 
+    bdrv_acct_start(req->dev->bs, &req->acct, req->qiov.size, BDRV_ACCT_READ);
+
     if (sector & req->dev->sector_mask) {
         virtio_blk_rw_complete(req, -EIO);
         return;
Index: qemu/hw/xen_disk.c
===================================================================
--- qemu.orig/hw/xen_disk.c	2011-08-21 11:49:29.000000000 -0700
+++ qemu/hw/xen_disk.c	2011-08-21 14:57:09.517558463 -0700
@@ -79,6 +79,7 @@ struct ioreq {
 
     struct XenBlkDev    *blkdev;
     QLIST_ENTRY(ioreq)   list;
+    BlockAcctCookie     acct;
 };
 
 struct XenBlkDev {
@@ -401,6 +402,7 @@ static void qemu_aio_complete(void *opaq
     ioreq->status = ioreq->aio_errors ? BLKIF_RSP_ERROR : BLKIF_RSP_OKAY;
     ioreq_unmap(ioreq);
     ioreq_finish(ioreq);
+    bdrv_acct_done(ioreq->blkdev->bs, &ioreq->acct);
     qemu_bh_schedule(ioreq->blkdev->bh);
 }
 
@@ -419,6 +421,7 @@ static int ioreq_runio_qemu_aio(struct i
 
     switch (ioreq->req.operation) {
     case BLKIF_OP_READ:
+        bdrv_acct_start(blkdev->bs, &ioreq->acct, ioreq->v.size, BDRV_ACCT_READ);
         ioreq->aio_inflight++;
         bdrv_aio_readv(blkdev->bs, ioreq->start / BLOCK_SIZE,
                        &ioreq->v, ioreq->v.size / BLOCK_SIZE,
@@ -429,6 +432,8 @@ static int ioreq_runio_qemu_aio(struct i
         if (!ioreq->req.nr_segments) {
             break;
         }
+
+        bdrv_acct_start(blkdev->bs, &ioreq->acct, ioreq->v.size, BDRV_ACCT_WRITE);
         ioreq->aio_inflight++;
         bdrv_aio_writev(blkdev->bs, ioreq->start / BLOCK_SIZE,
                         &ioreq->v, ioreq->v.size / BLOCK_SIZE,
Index: qemu/hw/ide/internal.h
===================================================================
--- qemu.orig/hw/ide/internal.h	2011-08-21 11:49:17.000000000 -0700
+++ qemu/hw/ide/internal.h	2011-08-21 14:57:09.517558463 -0700
@@ -440,6 +440,7 @@ struct IDEState {
     int lba;
     int cd_sector_size;
     int atapi_dma; /* true if dma is requested for the packet cmd */
+    BlockAcctCookie acct;
     /* ATA DMA state */
     int io_buffer_size;
     QEMUSGList sg;
Index: qemu/hw/ide/ahci.h
===================================================================
--- qemu.orig/hw/ide/ahci.h	2011-08-21 11:49:17.000000000 -0700
+++ qemu/hw/ide/ahci.h	2011-08-21 14:57:09.517558463 -0700
@@ -258,6 +258,7 @@ typedef struct NCQTransferState {
     AHCIDevice *drive;
     BlockDriverAIOCB *aiocb;
     QEMUSGList sglist;
+    BlockAcctCookie acct;
     int is_read;
     uint16_t sector_count;
     uint64_t lba;

^ permalink raw reply	[flat|nested] 23+ messages in thread

* [Qemu-devel] [PATCH 3/3] block: latency accounting
  2011-08-21 22:25 [Qemu-devel] [PATCH 0/3] better I/O accounting V2 Christoph Hellwig
  2011-08-21 22:25 ` [Qemu-devel] [PATCH 1/3] block: include flush requests in info blockstats Christoph Hellwig
  2011-08-21 22:26 ` [Qemu-devel] [PATCH 2/3] block: explicit I/O accounting Christoph Hellwig
@ 2011-08-21 22:26 ` Christoph Hellwig
  2011-08-22  8:45 ` [Qemu-devel] [PATCH 0/3] better I/O accounting V2 Stefan Hajnoczi
                   ` (3 subsequent siblings)
  6 siblings, 0 replies; 23+ messages in thread
From: Christoph Hellwig @ 2011-08-21 22:26 UTC (permalink / raw)
  To: qemu-devel

Account the total latency for read/write/flush requests.  This allows
management tools to average it based on a snapshot of the nr ops
counters, allowing them to check for SLAs or provide statistics.

Signed-off-by: Christoph Hellwig <hch@lst.de>

Index: qemu/block.c
===================================================================
--- qemu.orig/block.c	2011-08-11 07:05:24.000000000 -0700
+++ qemu/block.c	2011-08-11 07:37:01.114924279 -0700
@@ -1889,12 +1889,18 @@ static void bdrv_stats_iter(QObject *dat
                         " rd_operations=%" PRId64
                         " wr_operations=%" PRId64
                         " flush_operations=%" PRId64
+                        " wr_total_time_ns=%" PRId64
+                        " rd_total_time_ns=%" PRId64
+                        " flush_total_time_ns=%" PRId64
                         "\n",
                         qdict_get_int(qdict, "rd_bytes"),
                         qdict_get_int(qdict, "wr_bytes"),
                         qdict_get_int(qdict, "rd_operations"),
                         qdict_get_int(qdict, "wr_operations"),
-                        qdict_get_int(qdict, "flush_operations"));
+                        qdict_get_int(qdict, "flush_operations"),
+                        qdict_get_int(qdict, "wr_total_time_ns"),
+                        qdict_get_int(qdict, "rd_total_time_ns"),
+                        qdict_get_int(qdict, "flush_total_time_ns"));
 }
 
 void bdrv_stats_print(Monitor *mon, const QObject *data)
@@ -1913,7 +1919,10 @@ static QObject* bdrv_info_stats_bs(Block
                              "'rd_operations': %" PRId64 ","
                              "'wr_operations': %" PRId64 ","
                              "'wr_highest_offset': %" PRId64 ","
-                             "'flush_operations': %" PRId64
+                             "'flush_operations': %" PRId64 ","
+                             "'wr_total_time_ns': %" PRId64 ","
+                             "'rd_total_time_ns': %" PRId64 ","
+                             "'flush_total_time_ns': %" PRId64
                              "} }",
                              bs->nr_bytes[BDRV_ACCT_READ],
                              bs->nr_bytes[BDRV_ACCT_WRITE],
@@ -1921,7 +1930,10 @@ static QObject* bdrv_info_stats_bs(Block
                              bs->nr_ops[BDRV_ACCT_WRITE],
                              bs->wr_highest_sector *
                              (uint64_t)BDRV_SECTOR_SIZE,
-                             bs->nr_ops[BDRV_ACCT_FLUSH]);
+                             bs->nr_ops[BDRV_ACCT_FLUSH],
+                             bs->total_time_ns[BDRV_ACCT_READ],
+                             bs->total_time_ns[BDRV_ACCT_WRITE],
+                             bs->total_time_ns[BDRV_ACCT_FLUSH]);
     dict  = qobject_to_qdict(res);
 
     if (*bs->device_name) {
Index: qemu/block_int.h
===================================================================
--- qemu.orig/block_int.h	2011-08-11 07:05:24.000000000 -0700
+++ qemu/block_int.h	2011-08-11 07:37:17.964832995 -0700
@@ -28,6 +28,7 @@
 #include "qemu-option.h"
 #include "qemu-queue.h"
 #include "qemu-coroutine.h"
+#include "qemu-timer.h"
 
 #define BLOCK_FLAG_ENCRYPT	1
 #define BLOCK_FLAG_COMPAT6	4
@@ -193,6 +194,7 @@ struct BlockDriverState {
     /* I/O stats (display with "info blockstats"). */
     uint64_t nr_bytes[BDRV_MAX_IOTYPE];
     uint64_t nr_ops[BDRV_MAX_IOTYPE];
+    uint64_t total_time_ns[BDRV_MAX_IOTYPE];
     uint64_t wr_highest_sector;
 
     /* Whether the disk can expand beyond total_sectors */
@@ -218,6 +220,7 @@ struct BlockDriverState {
 
 typedef struct BlockAcctCookie {
     int64_t bytes;
+    int64_t start_time_ns;
     enum BlockIOType type;
 } BlockAcctCookie;
 
@@ -227,6 +230,7 @@ bdrv_acct_start(BlockDriverState *bs, Bl
 {
     cookie->bytes = bytes;
     cookie->type = type;
+    cookie->start_time_ns = get_clock();
 }
 
 static inline void
@@ -234,6 +238,7 @@ bdrv_acct_done(BlockDriverState *bs, Blo
 {
     bs->nr_bytes[cookie->type] += cookie->bytes;
     bs->nr_ops[cookie->type]++;
+    bs->total_time_ns[cookie->type] += get_clock() - cookie->start_time_ns;
 }
 
 
Index: qemu/qmp-commands.hx
===================================================================
--- qemu.orig/qmp-commands.hx	2011-08-11 07:04:48.000000000 -0700
+++ qemu/qmp-commands.hx	2011-08-11 07:37:01.121590911 -0700
@@ -1202,6 +1202,9 @@ Each json-object contain the following:
     - "rd_operations": read operations (json-int)
     - "wr_operations": write operations (json-int)
     - "flush_operations": cache flush operations (json-int)
+    - "wr_total_time_ns": total time spent on writes in nano-seconds (json-int)
+    - "rd_total_time_ns": total time spent on reads in nano-seconds (json-int)
+    - "flush_total_time_ns": total time spent on cache flushes in nano-seconds (json-int)
     - "wr_highest_offset": Highest offset of a sector written since the
                            BlockDriverState has been opened (json-int)
 - "parent": Contains recursively the statistics of the underlying
@@ -1223,6 +1226,9 @@ Example:
                   "wr_operations":751,
                   "rd_bytes":122567168,
                   "rd_operations":36772
+                  "wr_total_time_ns":313253456
+                  "rd_total_time_ns":3465673657
+                  "flush_total_time_ns":49653
                   "flush_operations":61,
                }
             },
@@ -1233,6 +1239,9 @@ Example:
                "rd_bytes":122739200,
                "rd_operations":36604
                "flush_operations":51,
+               "wr_total_time_ns":313253456
+               "rd_total_time_ns":3465673657
+               "flush_total_time_ns":49653
             }
          },
          {
@@ -1244,6 +1253,9 @@ Example:
                "rd_bytes":0,
                "rd_operations":0
                "flush_operations":0,
+               "wr_total_time_ns":0
+               "rd_total_time_ns":0
+               "flush_total_time_ns":0
             }
          },
          {
@@ -1255,6 +1267,9 @@ Example:
                "rd_bytes":0,
                "rd_operations":0
                "flush_operations":0,
+               "wr_total_time_ns":0
+               "rd_total_time_ns":0
+               "flush_total_time_ns":0
             }
          },
          {
@@ -1266,6 +1281,9 @@ Example:
                "rd_bytes":0,
                "rd_operations":0
                "flush_operations":0,
+               "wr_total_time_ns":0
+               "rd_total_time_ns":0
+               "flush_total_time_ns":0
             }
          }
       ]

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [Qemu-devel] [PATCH 0/3] better I/O accounting V2
  2011-08-21 22:25 [Qemu-devel] [PATCH 0/3] better I/O accounting V2 Christoph Hellwig
                   ` (2 preceding siblings ...)
  2011-08-21 22:26 ` [Qemu-devel] [PATCH 3/3] block: latency accounting Christoph Hellwig
@ 2011-08-22  8:45 ` Stefan Hajnoczi
  2011-08-22 14:59 ` Ryan Harper
                   ` (2 subsequent siblings)
  6 siblings, 0 replies; 23+ messages in thread
From: Stefan Hajnoczi @ 2011-08-22  8:45 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: qemu-devel

On Mon, Aug 22, 2011 at 12:25:47AM +0200, Christoph Hellwig wrote:
> changes since V1:
>  - rebase to apply against the current qemu.git tree

Reviewed-by: Stefan Hajnoczi <stefanha@linux.vnet.ibm.com>

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [Qemu-devel] [PATCH 0/3] better I/O accounting V2
  2011-08-21 22:25 [Qemu-devel] [PATCH 0/3] better I/O accounting V2 Christoph Hellwig
                   ` (3 preceding siblings ...)
  2011-08-22  8:45 ` [Qemu-devel] [PATCH 0/3] better I/O accounting V2 Stefan Hajnoczi
@ 2011-08-22 14:59 ` Ryan Harper
  2011-08-22 15:12   ` Christoph Hellwig
  2011-08-22 15:55 ` Kevin Wolf
  2011-08-25  6:25 ` [Qemu-devel] [PATCH 0/3] better I/O accounting V3 Christoph Hellwig
  6 siblings, 1 reply; 23+ messages in thread
From: Ryan Harper @ 2011-08-22 14:59 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: qemu-devel

* Christoph Hellwig <hch@lst.de> [2011-08-21 17:27]:
> changes since V1:
>  - rebase to apply against the current qemu.git tree


Testing on tip + this series with:

./configure --enable-debug --enable-kvm --enable-io-thread --target-list=x86_64-softmmu

x86_64-softmmu/qemu-system-x86_64 --enable-kvm -L pc-bios -m 2048 -drive file=/var/lib/libvirt/images/test01_20G.raw,if=virtio,cache=none -cdrom /var/lib/libvirt/images/Fedora-15-x86_64-Live-Desktop.iso -monitor stdio -vnc :5

during boot, I run one info blockstats, then during boot of F15 live env with 1 virtio-blk device we segfault.


(qemu) info blockstats
virtio0: rd_bytes=512 wr_bytes=0 rd_operations=1 wr_operations=0 flush_operations=0 wr_total_time_ns=927403 rd_total_time_ns=0 flush_total_time_ns=0
ide1-cd0: rd_bytes=0 wr_bytes=0 rd_operations=0 wr_operations=0 flush_operations=0 wr_total_time_ns=0 rd_total_time_ns=0 flush_total_time_ns=0
floppy0: rd_bytes=0 wr_bytes=0 rd_operations=0 wr_operations=0 flush_operations=0 wr_total_time_ns=0 rd_total_time_ns=0 flush_total_time_ns=0
sd0: rd_bytes=0 wr_bytes=0 rd_operations=0 wr_operations=0 flush_operations=0 wr_total_time_ns=0 rd_total_time_ns=0 flush_total_time_ns=0
(qemu) [New Thread 0x7ffff5928700 (LWP 18032)]
[New Thread 0x7ffff4f27700 (LWP 18033)]

Program received signal SIGSEGV, Segmentation fault.
0x00000000004200c1 in bdrv_acct_done (bs=0x12310b0, cookie=0x1c69f50) at /root/git/qemu/block_int.h:239
239         bs->nr_bytes[cookie->type] += cookie->bytes;
Missing separate debuginfos, use: debuginfo-install cyrus-sasl-gssapi-2.1.23-8.el6.x86_64 cyrus-sasl-lib-2.1.23-8.el6.x86_64 cyrus-sasl-md5-2.1.23-8.el6.x86_64 cyrus-sasl-plain-2.1.23-8.el6.x86_64 db4-4.7.25-16.el6.x86_64 glib2-2.22.5-6.el6.x86_64 glibc-2.12-1.25.el6.x86_64 gnutls-2.8.5-4.el6.x86_64 keyutils-libs-1.4-1.el6.x86_64 krb5-libs-1.9-9.el6.x86_64 libattr-2.4.44-4.el6.x86_64 libcom_err-1.41.12-7.el6.x86_64 libcurl-7.19.7-26.el6.x86_64 libgcrypt-1.4.5-5.el6.x86_64 libgpg-error-1.7-3.el6.x86_64 libidn-1.18-2.el6.x86_64 libpng-1.2.44-1.el6.x86_64 libselinux-2.0.94-5.el6.x86_64 libssh2-1.2.2-7.el6.x86_64 libtasn1-2.3-3.el6.x86_64 ncurses-libs-5.7-3.20090208.el6.x86_64 nspr-4.8.7-1.el6.x86_64 nss-3.12.9-9.el6.x86_64 nss-softokn-freebl-3.12.9-3.el6.x86_64 nss-util-3.12.9-1.el6.x86_64 openldap-2.4.23-15.el6.x86_64 openssl-1.0.0-10.el6.x86_64 zlib-1.2.3-25.el6.x86_64
(gdb) bt
#0  0x00000000004200c1 in bdrv_acct_done (bs=0x12310b0, cookie=0x1c69f50) at /root/git/qemu/block_int.h:239
#1  0x00000000004202c1 in virtio_blk_req_complete (req=0x1c5df00, status=0) at /root/git/qemu/hw/virtio-blk.c:63
#2  0x0000000000420fdb in virtio_blk_handle_request (req=0x1c5df00, mrb=0x7fffffffd1c0)
    at /root/git/qemu/hw/virtio-blk.c:380
#3  0x00000000004210f4 in virtio_blk_handle_output (vdev=0x1aa2040, vq=0x1aa2110)
    at /root/git/qemu/hw/virtio-blk.c:401
#4  0x000000000041e932 in virtio_queue_notify_vq (vq=0x1aa2110) at /root/git/qemu/hw/virtio.c:630
#5  0x00000000005b2aa4 in virtio_pci_host_notifier_read (opaque=0x1aa2110) at /root/git/qemu/hw/virtio-pci.c:196
#6  0x00000000004d5b4a in qemu_iohandler_poll (readfds=0x7fffffffda10, writefds=0x7fffffffd990, xfds=
    0x7fffffffd910, ret=1) at iohandler.c:120
#7  0x00000000005a9795 in main_loop_wait (nonblocking=0) at /root/git/qemu/vl.c:1356
#8  0x00000000005a9816 in main_loop () at /root/git/qemu/vl.c:1400
#9  0x00000000005ae058 in main (argc=14, argv=0x7fffffffdec8, envp=0x7fffffffdf40) at /root/git/qemu/vl.c:3383
(gdb) frame 0
#0  0x00000000004200c1 in bdrv_acct_done (bs=0x12310b0, cookie=0x1c69f50) at /root/git/qemu/block_int.h:239
239         bs->nr_bytes[cookie->type] += cookie->bytes;
(gdb) p *cookie
$1 = {bytes = 72057589759737855, start_time_ns = 72057589759737855, type = 16777215}
(gdb) p *bs
$2 = {total_sectors = 41943040, read_only = 0, keep_read_only = 0, open_flags = 98, removable = 0, locked = 0, 
  tray_open = 0, encrypted = 0, valid_key = 0, sg = 0, change_cb = 0x42172c <virtio_blk_change_cb>, change_opaque = 
    0x1aa2040, drv = 0x9d9e40, opaque = 0x1231b60, peer = 0x1aa13c0, filename = 
    "/var/lib/libvirt/images/test01_20G.raw", '\000' <repeats 985 times>, backing_file = 
    '\000' <repeats 1023 times>, backing_format = '\000' <repeats 15 times>, is_temporary = 0, media_changed = 1, 
  backing_hd = 0x0, file = 0x1231b80, sync_aiocb = 0x0, nr_bytes = {349184, 0, 0}, nr_ops = {87, 0, 0}, 
  total_time_ns = {15991329, 0, 0}, wr_highest_sector = 0, growable = 0, buffer_alignment = 512, 
  enable_write_cache = 1, cyls = 16383, heads = 16, secs = 63, translation = 0, on_read_error = BLOCK_ERR_REPORT, 
  on_write_error = BLOCK_ERR_STOP_ENOSPC, device_name = "virtio0", '\000' <repeats 24 times>, dirty_bitmap = 0x0, 
  dirty_count = 0, in_use = 0, list = {tqe_next = 0x12326a0, tqe_prev = 0x9d8e10}, private = 0x0}
(gdb) list
234     }
235
236     static inline void
237     bdrv_acct_done(BlockDriverState *bs, BlockAcctCookie *cookie)
238     {
239         bs->nr_bytes[cookie->type] += cookie->bytes;
240         bs->nr_ops[cookie->type]++;
241         bs->total_time_ns[cookie->type] += get_clock() - cookie->start_time_ns;
242     }
243


-- 
Ryan Harper
Software Engineer; Linux Technology Center
IBM Corp., Austin, Tx
ryanh@us.ibm.com

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [Qemu-devel] [PATCH 0/3] better I/O accounting V2
  2011-08-22 14:59 ` Ryan Harper
@ 2011-08-22 15:12   ` Christoph Hellwig
  2011-08-22 15:29     ` Ryan Harper
  0 siblings, 1 reply; 23+ messages in thread
From: Christoph Hellwig @ 2011-08-22 15:12 UTC (permalink / raw)
  To: Ryan Harper; +Cc: Christoph Hellwig, qemu-devel

On Mon, Aug 22, 2011 at 09:59:16AM -0500, Ryan Harper wrote:
> * Christoph Hellwig <hch@lst.de> [2011-08-21 17:27]:
> > changes since V1:
> >  - rebase to apply against the current qemu.git tree
> 
> 
> Testing on tip + this series with:
> 
> ./configure --enable-debug --enable-kvm --enable-io-thread --target-list=x86_64-softmmu

That is about the same config that I run.

> [New Thread 0x7ffff4f27700 (LWP 18033)]
> 
> Program received signal SIGSEGV, Segmentation fault.
> 0x00000000004200c1 in bdrv_acct_done (bs=0x12310b0, cookie=0x1c69f50) at /root/git/qemu/block_int.h:239
> 239         bs->nr_bytes[cookie->type] += cookie->bytes;

I can't see how this can result in a segfault, unless we get a corrupted
cookie.  Can you print what cookie->type is from gdb?

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [Qemu-devel] [PATCH 0/3] better I/O accounting V2
  2011-08-22 15:12   ` Christoph Hellwig
@ 2011-08-22 15:29     ` Ryan Harper
  2011-08-22 15:35       ` Christoph Hellwig
  0 siblings, 1 reply; 23+ messages in thread
From: Ryan Harper @ 2011-08-22 15:29 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: Ryan Harper, qemu-devel

* Christoph Hellwig <hch@lst.de> [2011-08-22 10:16]:
> On Mon, Aug 22, 2011 at 09:59:16AM -0500, Ryan Harper wrote:
> > * Christoph Hellwig <hch@lst.de> [2011-08-21 17:27]:
> > > changes since V1:
> > >  - rebase to apply against the current qemu.git tree
> > 
> > 
> > Testing on tip + this series with:
> > 
> > ./configure --enable-debug --enable-kvm --enable-io-thread --target-list=x86_64-softmmu
> 
> That is about the same config that I run.
> 
> > [New Thread 0x7ffff4f27700 (LWP 18033)]
> > 
> > Program received signal SIGSEGV, Segmentation fault.
> > 0x00000000004200c1 in bdrv_acct_done (bs=0x12310b0, cookie=0x1c69f50) at /root/git/qemu/block_int.h:239
> > 239         bs->nr_bytes[cookie->type] += cookie->bytes;
> 
> I can't see how this can result in a segfault, unless we get a corrupted
> cookie.  Can you print what cookie->type is from gdb?


(gdb) frame 0
#0  0x00000000004200c1 in bdrv_acct_done (bs=0x12310b0, cookie=0x1c68810) at /root/git/qemu/block_int.h:239 239         bs->nr_bytes[cookie->type] += cookie->bytes;
(gdb) p *cookie
$3 = {bytes = 72057589759737855, start_time_ns = 72057589759737855, type = 16777215}


-- 
Ryan Harper
Software Engineer; Linux Technology Center
IBM Corp., Austin, Tx
ryanh@us.ibm.com

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [Qemu-devel] [PATCH 0/3] better I/O accounting V2
  2011-08-22 15:29     ` Ryan Harper
@ 2011-08-22 15:35       ` Christoph Hellwig
  2011-08-22 16:46         ` Ryan Harper
  0 siblings, 1 reply; 23+ messages in thread
From: Christoph Hellwig @ 2011-08-22 15:35 UTC (permalink / raw)
  To: Ryan Harper; +Cc: Christoph Hellwig, qemu-devel

On Mon, Aug 22, 2011 at 10:29:11AM -0500, Ryan Harper wrote:
> (gdb) frame 0
> #0  0x00000000004200c1 in bdrv_acct_done (bs=0x12310b0, cookie=0x1c68810) at /root/git/qemu/block_int.h:239 239         bs->nr_bytes[cookie->type] += cookie->bytes;
> (gdb) p *cookie
> $3 = {bytes = 72057589759737855, start_time_ns = 72057589759737855, type = 16777215}

So it is indeed corrupted.  I'll try to figure out how that could have
happened.

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [Qemu-devel] [PATCH 2/3] block: explicit I/O accounting
  2011-08-21 22:26 ` [Qemu-devel] [PATCH 2/3] block: explicit I/O accounting Christoph Hellwig
@ 2011-08-22 15:48   ` Kevin Wolf
  2011-08-24 18:22     ` Christoph Hellwig
  0 siblings, 1 reply; 23+ messages in thread
From: Kevin Wolf @ 2011-08-22 15:48 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: qemu-devel, Stefan Hajnoczi

Am 22.08.2011 00:26, schrieb Christoph Hellwig:
> Decouple the I/O accounting from bdrv_aio_readv/writev/flush and
> make the hardware models call directly into the accounting helpers.
> 
> This means:
>  - we do not count internal requests from image formats in addition
>    to guest originating I/O
>  - we do not double count I/O ops if the device model handles it
>    chunk wise
>  - we only account I/O once it actually is done
>  - can extend I/O accounting to synchronous or coroutine I/O easily
>  - implement I/O latency tracking easily (see the next patch)
> 
> I've converted the existing device model callers to the new model,
> device models that are using synchronous I/O and weren't accounted
> before haven't been updated yet.  Also scsi hasn't been converted
> to the end-to-end accounting as I want to defer that after the pending
> scsi layer overhaul.
> 
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> 
> Index: qemu/block.c
> ===================================================================
> --- qemu.orig/block.c	2011-08-21 14:53:09.684226561 -0700
> +++ qemu/block.c	2011-08-21 14:57:09.507558463 -0700
> @@ -1915,13 +1915,13 @@ static QObject* bdrv_info_stats_bs(Block
>                               "'wr_highest_offset': %" PRId64 ","
>                               "'flush_operations': %" PRId64
>                               "} }",
> -                             bs->rd_bytes,
> -                             bs->wr_bytes,
> -                             bs->rd_ops,
> -                             bs->wr_ops,
> +                             bs->nr_bytes[BDRV_ACCT_READ],
> +                             bs->nr_bytes[BDRV_ACCT_WRITE],
> +                             bs->nr_ops[BDRV_ACCT_READ],
> +                             bs->nr_ops[BDRV_ACCT_WRITE],
>                               bs->wr_highest_sector *
>                               (uint64_t)BDRV_SECTOR_SIZE,
> -                             bs->flush_ops);
> +                             bs->nr_ops[BDRV_ACCT_FLUSH]);
>      dict  = qobject_to_qdict(res);
>  
>      if (*bs->device_name) {
> @@ -2235,7 +2235,6 @@ char *bdrv_snapshot_dump(char *buf, int
>      return buf;
>  }
>  
> -
>  /**************************************************************/
>  /* async I/Os */
>  
> @@ -2244,7 +2243,6 @@ BlockDriverAIOCB *bdrv_aio_readv(BlockDr
>                                   BlockDriverCompletionFunc *cb, void *opaque)
>  {
>      BlockDriver *drv = bs->drv;
> -    BlockDriverAIOCB *ret;
>  
>      trace_bdrv_aio_readv(bs, sector_num, nb_sectors, opaque);
>  
> @@ -2253,16 +2251,7 @@ BlockDriverAIOCB *bdrv_aio_readv(BlockDr
>      if (bdrv_check_request(bs, sector_num, nb_sectors))
>          return NULL;
>  
> -    ret = drv->bdrv_aio_readv(bs, sector_num, qiov, nb_sectors,
> -                              cb, opaque);
> -
> -    if (ret) {
> -	/* Update stats even though technically transfer has not happened. */
> -	bs->rd_bytes += (unsigned) nb_sectors * BDRV_SECTOR_SIZE;
> -	bs->rd_ops ++;
> -    }
> -
> -    return ret;
> +    return drv->bdrv_aio_readv(bs, sector_num, qiov, nb_sectors, cb, opaque);
>  }
>  
>  typedef struct BlockCompleteData {
> @@ -2329,9 +2318,6 @@ BlockDriverAIOCB *bdrv_aio_writev(BlockD
>                                 cb, opaque);
>  
>      if (ret) {
> -        /* Update stats even though technically transfer has not happened. */
> -        bs->wr_bytes += (unsigned) nb_sectors * BDRV_SECTOR_SIZE;
> -        bs->wr_ops ++;
>          if (bs->wr_highest_sector < sector_num + nb_sectors - 1) {
>              bs->wr_highest_sector = sector_num + nb_sectors - 1;
>          }
> @@ -2585,8 +2571,6 @@ BlockDriverAIOCB *bdrv_aio_flush(BlockDr
>  
>      trace_bdrv_aio_flush(bs, opaque);
>  
> -    bs->flush_ops++;
> -
>      if (bs->open_flags & BDRV_O_NO_FLUSH) {
>          return bdrv_aio_noop_em(bs, cb, opaque);
>      }
> Index: qemu/block_int.h
> ===================================================================
> --- qemu.orig/block_int.h	2011-08-21 14:53:09.000000000 -0700
> +++ qemu/block_int.h	2011-08-21 14:57:09.510891797 -0700
> @@ -148,6 +148,13 @@ struct BlockDriver {
>      QLIST_ENTRY(BlockDriver) list;
>  };
>  
> +enum BlockIOType {
> +    BDRV_ACCT_READ,
> +    BDRV_ACCT_WRITE,
> +    BDRV_ACCT_FLUSH,
> +    BDRV_MAX_IOTYPE,
> +};
> +
>  struct BlockDriverState {
>      int64_t total_sectors; /* if we are reading a disk image, give its
>                                size in sectors */
> @@ -184,11 +191,8 @@ struct BlockDriverState {
>      void *sync_aiocb;
>  
>      /* I/O stats (display with "info blockstats"). */
> -    uint64_t rd_bytes;
> -    uint64_t wr_bytes;
> -    uint64_t rd_ops;
> -    uint64_t wr_ops;
> -    uint64_t flush_ops;
> +    uint64_t nr_bytes[BDRV_MAX_IOTYPE];
> +    uint64_t nr_ops[BDRV_MAX_IOTYPE];
>      uint64_t wr_highest_sector;
>  
>      /* Whether the disk can expand beyond total_sectors */
> @@ -212,6 +216,27 @@ struct BlockDriverState {
>      void *private;
>  };
>  
> +typedef struct BlockAcctCookie {
> +    int64_t bytes;
> +    enum BlockIOType type;
> +} BlockAcctCookie;
> +
> +static inline void
> +bdrv_acct_start(BlockDriverState *bs, BlockAcctCookie *cookie, int64_t bytes,
> +        enum BlockIOType type)
> +{
> +    cookie->bytes = bytes;
> +    cookie->type = type;
> +}
> +
> +static inline void
> +bdrv_acct_done(BlockDriverState *bs, BlockAcctCookie *cookie)
> +{
> +    bs->nr_bytes[cookie->type] += cookie->bytes;
> +    bs->nr_ops[cookie->type]++;
> +}

Device models actually shouldn't include block_int.h, so this isn't very
nice. Moving it to block.h would lose the inline, does it matter?

>  #define CHANGE_MEDIA    0x01
>  #define CHANGE_SIZE     0x02
>  
> Index: qemu/hw/ide/ahci.c
> ===================================================================
> --- qemu.orig/hw/ide/ahci.c	2011-08-21 11:49:29.000000000 -0700
> +++ qemu/hw/ide/ahci.c	2011-08-21 14:57:09.510891797 -0700
> @@ -710,6 +710,7 @@ static void ncq_cb(void *opaque, int ret
>      DPRINTF(ncq_tfs->drive->port_no, "NCQ transfer tag %d finished\n",
>              ncq_tfs->tag);
>  
> +    bdrv_acct_done(ncq_tfs->drive->port.ifs[0].bs, &ncq_tfs->acct);
>      qemu_sglist_destroy(&ncq_tfs->sglist);
>      ncq_tfs->used = 0;
>  }
> @@ -755,7 +756,11 @@ static void process_ncq_command(AHCIStat
>                      ncq_tfs->sector_count-1, ncq_tfs->lba, ncq_tfs->tag);
>              ncq_tfs->is_read = 1;
>  
> -            DPRINTF(port, "tag %d aio read %ld\n", ncq_tfs->tag, ncq_tfs->lba);
> +	    DPRINTF(port, "tag %d aio read %ld\n", ncq_tfs->tag, ncq_tfs->lba);

The patch is adding tabs here and in other places. Please fix.

> +
> +            bdrv_acct_start(ncq_tfs->drive->port.ifs[0].bs, &ncq_tfs->acct,
> +                            (ncq_tfs->sector_count-1) * BDRV_SECTOR_SIZE,
> +                            BDRV_ACCT_READ);
>              ncq_tfs->aiocb = dma_bdrv_read(ncq_tfs->drive->port.ifs[0].bs,
>                                             &ncq_tfs->sglist, ncq_tfs->lba,
>                                             ncq_cb, ncq_tfs);
> @@ -766,6 +771,10 @@ static void process_ncq_command(AHCIStat
>              ncq_tfs->is_read = 0;
>  
>              DPRINTF(port, "tag %d aio write %ld\n", ncq_tfs->tag, ncq_tfs->lba);
> +
> +            bdrv_acct_start(ncq_tfs->drive->port.ifs[0].bs, &ncq_tfs->acct,
> +                            (ncq_tfs->sector_count-1) * BDRV_SECTOR_SIZE,
> +                            BDRV_ACCT_WRITE);
>              ncq_tfs->aiocb = dma_bdrv_write(ncq_tfs->drive->port.ifs[0].bs,
>                                              &ncq_tfs->sglist, ncq_tfs->lba,
>                                              ncq_cb, ncq_tfs);
> Index: qemu/hw/ide/atapi.c
> ===================================================================
> --- qemu.orig/hw/ide/atapi.c	2011-08-21 11:49:17.000000000 -0700
> +++ qemu/hw/ide/atapi.c	2011-08-21 14:57:09.510891797 -0700
> @@ -250,6 +250,7 @@ static void ide_atapi_cmd_reply(IDEState
>      s->io_buffer_index = 0;
>  
>      if (s->atapi_dma) {
> +        bdrv_acct_start(s->bs, &s->acct, size, BDRV_ACCT_READ);
>          s->status = READY_STAT | SEEK_STAT | DRQ_STAT;
>          s->bus->dma->ops->start_dma(s->bus->dma, s,
>                                     ide_atapi_cmd_read_dma_cb);
> @@ -322,10 +323,7 @@ static void ide_atapi_cmd_read_dma_cb(vo
>          s->status = READY_STAT | SEEK_STAT;
>          s->nsector = (s->nsector & ~7) | ATAPI_INT_REASON_IO | ATAPI_INT_REASON_CD;
>          ide_set_irq(s->bus);
> -    eot:
> -        s->bus->dma->ops->add_status(s->bus->dma, BM_STATUS_INT);
> -        ide_set_inactive(s);
> -        return;
> +        goto eot;
>      }
>  
>      s->io_buffer_index = 0;
> @@ -343,9 +341,11 @@ static void ide_atapi_cmd_read_dma_cb(vo
>  #ifdef DEBUG_AIO
>      printf("aio_read_cd: lba=%u n=%d\n", s->lba, n);
>  #endif
> +
>      s->bus->dma->iov.iov_base = (void *)(s->io_buffer + data_offset);
>      s->bus->dma->iov.iov_len = n * 4 * 512;
>      qemu_iovec_init_external(&s->bus->dma->qiov, &s->bus->dma->iov, 1);
> +
>      s->bus->dma->aiocb = bdrv_aio_readv(s->bs, (int64_t)s->lba << 2,
>                                         &s->bus->dma->qiov, n * 4,
>                                         ide_atapi_cmd_read_dma_cb, s);
> @@ -355,6 +355,12 @@ static void ide_atapi_cmd_read_dma_cb(vo
>                              ASC_MEDIUM_NOT_PRESENT);
>          goto eot;
>      }
> +
> +    return;
> +eot:
> +    bdrv_acct_done(s->bs, &s->acct);
> +    s->bus->dma->ops->add_status(s->bus->dma, BM_STATUS_INT);
> +    ide_set_inactive(s);
>  }
>  
>  /* start a CD-CDROM read command with DMA */
> @@ -368,6 +374,8 @@ static void ide_atapi_cmd_read_dma(IDESt
>      s->io_buffer_size = 0;
>      s->cd_sector_size = sector_size;
>  
> +    bdrv_acct_start(s->bs, &s->acct, s->packet_transfer_size, BDRV_ACCT_READ);
> +
>      /* XXX: check if BUSY_STAT should be set */
>      s->status = READY_STAT | SEEK_STAT | DRQ_STAT | BUSY_STAT;
>      s->bus->dma->ops->start_dma(s->bus->dma, s,
> Index: qemu/hw/ide/core.c
> ===================================================================
> --- qemu.orig/hw/ide/core.c	2011-08-21 11:49:17.000000000 -0700
> +++ qemu/hw/ide/core.c	2011-08-21 14:57:09.510891797 -0700
> @@ -473,7 +473,10 @@ void ide_sector_read(IDEState *s)
>  #endif
>          if (n > s->req_nb_sectors)
>              n = s->req_nb_sectors;
> +
> +        bdrv_acct_start(s->bs, &s->acct, n * BDRV_SECTOR_SIZE, BDRV_ACCT_READ);
>          ret = bdrv_read(s->bs, sector_num, s->io_buffer, n);
> +        bdrv_acct_done(s->bs, &s->acct);

The commit message says that you're only converting bdrv_aio_readv. I
think it makes sense to add the accounting to the synchronous calls, too,
so that devices have complete accounting or none at all.

If your plan is to convert all bdrv_read calls, I think you've missed
some in atapi.c.

>          if (ret != 0) {
>              if (ide_handle_rw_error(s, -ret,
>                  BM_STATUS_PIO_RETRY | BM_STATUS_RETRY_READ))
> @@ -610,7 +613,8 @@ handle_rw_error:
>      return;
>  
>  eot:
> -   ide_set_inactive(s);
> +    bdrv_acct_done(s->bs, &s->acct);
> +    ide_set_inactive(s);
>  }
>  
>  static void ide_sector_start_dma(IDEState *s, enum ide_dma_cmd dma_cmd)
> @@ -619,6 +623,20 @@ static void ide_sector_start_dma(IDEStat
>      s->io_buffer_index = 0;
>      s->io_buffer_size = 0;
>      s->dma_cmd = dma_cmd;
> +
> +    switch (dma_cmd) {
> +    case IDE_DMA_READ:
> +        bdrv_acct_start(s->bs, &s->acct, s->nsector * BDRV_SECTOR_SIZE,
> +                        BDRV_ACCT_READ);
> +        break;
> +    case IDE_DMA_WRITE:
> +        bdrv_acct_start(s->bs, &s->acct, s->nsector * BDRV_SECTOR_SIZE,
> +                        BDRV_ACCT_WRITE);
> +	break;
> +    default:
> +        break;
> +    }

So should the semantics of bdrv_acct_done be that it does nothing if no
bdrv_acct_start has been called before? If so, its implementation is
broken. Otherwise, the default case of this switch statement looks
broken to me.

> +
>      s->bus->dma->ops->start_dma(s->bus->dma, s, ide_dma_cb);
>  }
>  
> @@ -641,7 +659,10 @@ void ide_sector_write(IDEState *s)
>      n = s->nsector;
>      if (n > s->req_nb_sectors)
>          n = s->req_nb_sectors;
> +
> +    bdrv_acct_start(s->bs, &s->acct, n * BDRV_SECTOR_SIZE, BDRV_ACCT_READ);
>      ret = bdrv_write(s->bs, sector_num, s->io_buffer, n);
> +    bdrv_acct_done(s->bs, &s->acct);
>  
>      if (ret != 0) {
>          if (ide_handle_rw_error(s, -ret, BM_STATUS_PIO_RETRY))
> @@ -685,6 +706,7 @@ static void ide_flush_cb(void *opaque, i
>          }
>      }
>  
> +    bdrv_acct_done(s->bs, &s->acct);
>      s->status = READY_STAT | SEEK_STAT;
>      ide_set_irq(s->bus);
>  }
> @@ -698,6 +720,7 @@ void ide_flush_cache(IDEState *s)
>          return;
>      }
>  
> +    bdrv_acct_start(s->bs, &s->acct, 0, BDRV_ACCT_FLUSH);
>      acb = bdrv_aio_flush(s->bs, ide_flush_cb, s);
>      if (acb == NULL) {
>          ide_flush_cb(s, -EIO);
> Index: qemu/hw/ide/macio.c
> ===================================================================
> --- qemu.orig/hw/ide/macio.c	2011-08-21 11:49:29.000000000 -0700
> +++ qemu/hw/ide/macio.c	2011-08-21 14:57:09.514225130 -0700
> @@ -52,8 +52,7 @@ static void pmac_ide_atapi_transfer_cb(v
>          m->aiocb = NULL;
>          qemu_sglist_destroy(&s->sg);
>          ide_atapi_io_error(s, ret);
> -        io->dma_end(opaque);
> -        return;
> +        goto done;
>      }
>  
>      if (s->io_buffer_size > 0) {
> @@ -71,8 +70,7 @@ static void pmac_ide_atapi_transfer_cb(v
>          ide_atapi_cmd_ok(s);
>  
>      if (io->len == 0) {
> -        io->dma_end(opaque);
> -        return;
> +        goto done;
>      }
>  
>      /* launch next transfer */
> @@ -92,9 +90,14 @@ static void pmac_ide_atapi_transfer_cb(v
>          /* Note: media not present is the most likely case */
>          ide_atapi_cmd_error(s, SENSE_NOT_READY,
>                              ASC_MEDIUM_NOT_PRESENT);
> -        io->dma_end(opaque);
> -        return;
> +        goto done;
>      }
> +    return;
> +
> +done:
> +    bdrv_acct_done(s->bs, &s->acct);
> +    io->dma_end(opaque);
> +    return;
>  }
>  
>  static void pmac_ide_transfer_cb(void *opaque, int ret)
> @@ -109,8 +112,7 @@ static void pmac_ide_transfer_cb(void *o
>          m->aiocb = NULL;
>          qemu_sglist_destroy(&s->sg);
>  	ide_dma_error(s);
> -        io->dma_end(io);
> -        return;
> +        goto done;
>      }
>  
>      sector_num = ide_get_sector(s);
> @@ -130,10 +132,8 @@ static void pmac_ide_transfer_cb(void *o
>      }
>  
>      /* end of DMA ? */
> -
>      if (io->len == 0) {
> -        io->dma_end(io);
> -	return;
> +        goto done;
>      }
>  
>      /* launch next transfer */
> @@ -163,6 +163,10 @@ static void pmac_ide_transfer_cb(void *o
>  
>      if (!m->aiocb)
>          pmac_ide_transfer_cb(io, -1);
> +    return;
> +done:
> +    bdrv_acct_done(s->bs, &s->acct);
> +    io->dma_end(io);
>  }
>  
>  static void pmac_ide_transfer(DBDMA_io *io)
> @@ -172,10 +176,22 @@ static void pmac_ide_transfer(DBDMA_io *
>  
>      s->io_buffer_size = 0;
>      if (s->drive_kind == IDE_CD) {
> +        bdrv_acct_start(s->bs, &s->acct, io->len, BDRV_ACCT_READ);
>          pmac_ide_atapi_transfer_cb(io, 0);
>          return;
>      }
>  
> +    switch (s->dma_cmd) {
> +    case IDE_DMA_READ:
> +        bdrv_acct_start(s->bs, &s->acct, io->len, BDRV_ACCT_READ);
> +        break;
> +    case IDE_DMA_WRITE:
> +        bdrv_acct_start(s->bs, &s->acct, io->len, BDRV_ACCT_WRITE);
> +        break;
> +    default:
> +        break;

Can it happen? If yes, see above. If no, abort() is better.

> +    }
> +
>      pmac_ide_transfer_cb(io, 0);
>  }
>  
> Index: qemu/hw/scsi-disk.c
> ===================================================================
> --- qemu.orig/hw/scsi-disk.c	2011-08-21 11:49:29.000000000 -0700
> +++ qemu/hw/scsi-disk.c	2011-08-21 14:57:09.514225130 -0700
> @@ -57,6 +57,7 @@ typedef struct SCSIDiskReq {
>      struct iovec iov;
>      QEMUIOVector qiov;
>      uint32_t status;
> +    BlockAcctCookie acct;
>  } SCSIDiskReq;
>  
>  struct SCSIDiskState
> @@ -107,6 +108,7 @@ static void scsi_cancel_io(SCSIRequest *
>  static void scsi_read_complete(void * opaque, int ret)
>  {
>      SCSIDiskReq *r = (SCSIDiskReq *)opaque;
> +    SCSIDiskState *s = DO_UPCAST(SCSIDiskState, qdev, r->req.dev);
>      int n;
>  
>      r->req.aiocb = NULL;
> @@ -117,6 +119,8 @@ static void scsi_read_complete(void * op
>          }
>      }
>  
> +    bdrv_acct_done(s->bs, &r->acct);
> +
>      DPRINTF("Data ready tag=0x%x len=%zd\n", r->req.tag, r->iov.iov_len);

scsi-disk doesn't account for failed requests. IDE does. I don't have a
strong preference on which way we handle it, but I think it should be
consistent.

>  
>      n = r->iov.iov_len / 512;
> @@ -161,6 +165,8 @@ static void scsi_read_data(SCSIRequest *
>  
>      r->iov.iov_len = n * 512;
>      qemu_iovec_init_external(&r->qiov, &r->iov, 1);
> +
> +    bdrv_acct_start(s->bs, &r->acct, n * BDRV_SECTOR_SIZE, BDRV_ACCT_READ);
>      r->req.aiocb = bdrv_aio_readv(s->bs, r->sector, &r->qiov, n,
>                                scsi_read_complete, r);
>      if (r->req.aiocb == NULL) {
> @@ -207,6 +213,7 @@ static int scsi_handle_rw_error(SCSIDisk
>  static void scsi_write_complete(void * opaque, int ret)
>  {
>      SCSIDiskReq *r = (SCSIDiskReq *)opaque;
> +    SCSIDiskState *s = DO_UPCAST(SCSIDiskState, qdev, r->req.dev);
>      uint32_t len;
>      uint32_t n;
>  
> @@ -218,6 +225,8 @@ static void scsi_write_complete(void * o
>          }
>      }
>  
> +    bdrv_acct_done(s->bs, &r->acct);
> +
>      n = r->iov.iov_len / 512;
>      r->sector += n;
>      r->sector_count -= n;
> @@ -252,6 +261,8 @@ static void scsi_write_data(SCSIRequest
>      n = r->iov.iov_len / 512;
>      if (n) {
>          qemu_iovec_init_external(&r->qiov, &r->iov, 1);
> +
> +        bdrv_acct_start(s->bs, &r->acct, n * BDRV_SECTOR_SIZE, BDRV_ACCT_WRITE);
>          r->req.aiocb = bdrv_aio_writev(s->bs, r->sector, &r->qiov, n,
>                                     scsi_write_complete, r);
>          if (r->req.aiocb == NULL) {
> @@ -854,13 +865,19 @@ static int scsi_disk_emulate_command(SCS
>          buflen = 8;
>          break;
>      case SYNCHRONIZE_CACHE:
> +{
> +        BlockAcctCookie acct;
> +
> +        bdrv_acct_start(s->bs, &acct, 0, BDRV_ACCT_FLUSH);
>          ret = bdrv_flush(s->bs);
>          if (ret < 0) {
>              if (scsi_handle_rw_error(r, -ret, SCSI_REQ_STATUS_RETRY_FLUSH)) {
>                  return -1;
>              }
>          }
> +        bdrv_acct_done(s->bs, &acct);
>          break;
> +}
>      case GET_CONFIGURATION:
>          memset(outbuf, 0, 8);
>          /* ??? This should probably return much more information.  For now
> Index: qemu/hw/virtio-blk.c
> ===================================================================
> --- qemu.orig/hw/virtio-blk.c	2011-08-21 11:49:29.000000000 -0700
> +++ qemu/hw/virtio-blk.c	2011-08-21 14:57:44.887558247 -0700
> @@ -47,6 +47,7 @@ typedef struct VirtIOBlockReq
>      struct virtio_scsi_inhdr *scsi;
>      QEMUIOVector qiov;
>      struct VirtIOBlockReq *next;
> +    BlockAcctCookie acct;
>  } VirtIOBlockReq;
>  
>  static void virtio_blk_req_complete(VirtIOBlockReq *req, int status)
> @@ -59,6 +60,7 @@ static void virtio_blk_req_complete(Virt
>      virtqueue_push(s->vq, &req->elem, req->qiov.size + sizeof(*req->in));
>      virtio_notify(&s->vdev, s->vq);
>  
> +    bdrv_acct_done(s->bs, &req->acct);
>      g_free(req);
>  }
>  
> @@ -266,6 +268,8 @@ static void virtio_blk_handle_flush(Virt
>  {
>      BlockDriverAIOCB *acb;
>  
> +    bdrv_acct_start(req->dev->bs, &req->acct, 0, BDRV_ACCT_FLUSH);
> +
>      /*
>       * Make sure all outstanding writes are posted to the backing device.
>       */
> @@ -284,6 +288,8 @@ static void virtio_blk_handle_write(Virt
>  
>      sector = ldq_p(&req->out->sector);
>  
> +    bdrv_acct_start(req->dev->bs, &req->acct, req->qiov.size, BDRV_ACCT_WRITE);
> +
>      trace_virtio_blk_handle_write(req, sector, req->qiov.size / 512);
>  
>      if (sector & req->dev->sector_mask) {
> @@ -317,6 +323,8 @@ static void virtio_blk_handle_read(VirtI
>  
>      sector = ldq_p(&req->out->sector);
>  
> +    bdrv_acct_start(req->dev->bs, &req->acct, req->qiov.size, BDRV_ACCT_READ);
> +
>      if (sector & req->dev->sector_mask) {
>          virtio_blk_rw_complete(req, -EIO);
>          return;
> Index: qemu/hw/xen_disk.c
> ===================================================================
> --- qemu.orig/hw/xen_disk.c	2011-08-21 11:49:29.000000000 -0700
> +++ qemu/hw/xen_disk.c	2011-08-21 14:57:09.517558463 -0700
> @@ -79,6 +79,7 @@ struct ioreq {
>  
>      struct XenBlkDev    *blkdev;
>      QLIST_ENTRY(ioreq)   list;
> +    BlockAcctCookie     acct;

Indentation is off.

Kevin

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [Qemu-devel] [PATCH 0/3] better I/O accounting V2
  2011-08-21 22:25 [Qemu-devel] [PATCH 0/3] better I/O accounting V2 Christoph Hellwig
                   ` (4 preceding siblings ...)
  2011-08-22 14:59 ` Ryan Harper
@ 2011-08-22 15:55 ` Kevin Wolf
  2011-08-25  6:25 ` [Qemu-devel] [PATCH 0/3] better I/O accounting V3 Christoph Hellwig
  6 siblings, 0 replies; 23+ messages in thread
From: Kevin Wolf @ 2011-08-22 15:55 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: qemu-devel

Am 22.08.2011 00:25, schrieb Christoph Hellwig:
> changes since V1:
>  - rebase to apply against the current qemu.git tree

Applied patch 1, commented on patch 2. The third one looks good, but
doesn't apply without the other one.

Kevin

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [Qemu-devel] [PATCH 0/3] better I/O accounting V2
  2011-08-22 15:35       ` Christoph Hellwig
@ 2011-08-22 16:46         ` Ryan Harper
  2011-08-24 18:45           ` Christoph Hellwig
  0 siblings, 1 reply; 23+ messages in thread
From: Ryan Harper @ 2011-08-22 16:46 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: Ryan Harper, qemu-devel

* Christoph Hellwig <hch@lst.de> [2011-08-22 10:37]:
> On Mon, Aug 22, 2011 at 10:29:11AM -0500, Ryan Harper wrote:
> > (gdb) frame 0
> > #0  0x00000000004200c1 in bdrv_acct_done (bs=0x12310b0, cookie=0x1c68810) at /root/git/qemu/block_int.h:239 239         bs->nr_bytes[cookie->type] += cookie->bytes;
> > (gdb) p *cookie
> > $3 = {bytes = 72057589759737855, start_time_ns = 72057589759737855, type = 16777215}
> 
> So it is indeed corrupted.  I'll try to figure out how that could have
> happened.

So, I believe this is how it's happening.

we start accounting on a write which is turned into a multiwrite via
virtio_blk_handle_write() which calls virtio_submit_multiwrite().

Then when the multiwrite completes, we invoke virtio_blk_rw_complete()
on each part of the multiwrite.  None of these requests have had their
acct structure initialized since there was just *one* initial write.
We could do a bdrv_acct_start() on each req, but that would break the
concept of hiding the additional writes under the initial request.

So ensuring that the acct field is initialized when the request is
allocated will fix the issue.

With this patch, I don't see the crash anymore.

Signed-off-by: Ryan Harper <ryanh@us.ibm.com>


diff --git a/hw/virtio-blk.c b/hw/virtio-blk.c
index 2660d1d..e746917 100644
--- a/hw/virtio-blk.c
+++ b/hw/virtio-blk.c
@@ -123,6 +123,7 @@ static VirtIOBlockReq *virtio_blk_alloc_request(VirtIOBlock *s)
     req->dev = s;
     req->qiov.size = 0;
     req->next = NULL;
+    memset(&req->acct, 0, sizeof(BlockAcctCookie));
     return req;
 }



-- 
Ryan Harper
Software Engineer; Linux Technology Center
IBM Corp., Austin, Tx
ryanh@us.ibm.com

^ permalink raw reply related	[flat|nested] 23+ messages in thread

* Re: [Qemu-devel] [PATCH 2/3] block: explicit I/O accounting
  2011-08-22 15:48   ` Kevin Wolf
@ 2011-08-24 18:22     ` Christoph Hellwig
  0 siblings, 0 replies; 23+ messages in thread
From: Christoph Hellwig @ 2011-08-24 18:22 UTC (permalink / raw)
  To: Kevin Wolf; +Cc: Christoph Hellwig, Stefan Hajnoczi, qemu-devel

On Mon, Aug 22, 2011 at 05:48:37PM +0200, Kevin Wolf wrote:
> > +static inline void
> > +bdrv_acct_start(BlockDriverState *bs, BlockAcctCookie *cookie, int64_t bytes,
> > +        enum BlockIOType type)
> > +{
> > +    cookie->bytes = bytes;
> > +    cookie->type = type;
> > +}
> > +
> > +static inline void
> > +bdrv_acct_done(BlockDriverState *bs, BlockAcctCookie *cookie)
> > +{
> > +    bs->nr_bytes[cookie->type] += cookie->bytes;
> > +    bs->nr_ops[cookie->type]++;
> > +}
> 
> Device models actually shouldn't include block_int.h, so this isn't very
> nice. Moving it to block.h would lose the inline, does it matter?

I have moved it out of line, we can change it if anyone cares.

> > +
> > +        bdrv_acct_start(s->bs, &s->acct, n * BDRV_SECTOR_SIZE, BDRV_ACCT_READ);
> >          ret = bdrv_read(s->bs, sector_num, s->io_buffer, n);
> > +        bdrv_acct_done(s->bs, &s->acct);
> 
> The commit message says that you're only converting bdrv_aio_readv. I
> think it makes sense to add the accounting to the synchronous calls, too,
> so that devices have complete accounting or none at all.

That was the plan, I just worded it incorrectly.  The idea is to fully
convert device models that were using bdrv_aio_* in some places, but
leave those that never had any accounting out for now.

> If your plan is to convert all bdrv_read calls, I think you've missed
> some in atapi.c.

Added.


> > +    switch (dma_cmd) {
> > +    case IDE_DMA_READ:
> > +        bdrv_acct_start(s->bs, &s->acct, s->nsector * BDRV_SECTOR_SIZE,
> > +                        BDRV_ACCT_READ);
> > +        break;
> > +    case IDE_DMA_WRITE:
> > +        bdrv_acct_start(s->bs, &s->acct, s->nsector * BDRV_SECTOR_SIZE,
> > +                        BDRV_ACCT_WRITE);
> > +	break;
> > +    default:
> > +        break;
> > +    }
> 
> So should the semantics of bdrv_acct_done be that it does nothing if no
> bdrv_acct_start has been called before? If so, its implementation is
> broken. Otherwise, the default case of this switch statement looks
> broken to me.

I have fixed ide_dma_cb to only do the accounting for read and write requests.

> > +    switch (s->dma_cmd) {
> > +    case IDE_DMA_READ:
> > +        bdrv_acct_start(s->bs, &s->acct, io->len, BDRV_ACCT_READ);
> > +        break;
> > +    case IDE_DMA_WRITE:
> > +        bdrv_acct_start(s->bs, &s->acct, io->len, BDRV_ACCT_WRITE);
> > +        break;
> > +    default:
> > +        break;
> 
> Can it happen? If yes, see above. If no, abort() is better.

We can get a trim request, so it can happen.

> >  static void scsi_read_complete(void * opaque, int ret)
> >  {
> >      SCSIDiskReq *r = (SCSIDiskReq *)opaque;
> > +    SCSIDiskState *s = DO_UPCAST(SCSIDiskState, qdev, r->req.dev);
> >      int n;
> >  
> >      r->req.aiocb = NULL;
> > @@ -117,6 +119,8 @@ static void scsi_read_complete(void * op
> >          }
> >      }
> >  
> > +    bdrv_acct_done(s->bs, &r->acct);
> > +
> >      DPRINTF("Data ready tag=0x%x len=%zd\n", r->req.tag, r->iov.iov_len);
> 
> scsi-disk doesn't account for failed requests. IDE does. I don't have a
> strong preference on which way we handle it, but I think it should be
> consistent.

The idea is to call bdrv_acct_done even for failed request to make sure
the calls are balanced.  In the future we might want to pass the error
code to it to e.g. count failed requests separately.

> > Index: qemu/hw/xen_disk.c
> > ===================================================================
> > --- qemu.orig/hw/xen_disk.c	2011-08-21 11:49:29.000000000 -0700
> > +++ qemu/hw/xen_disk.c	2011-08-21 14:57:09.517558463 -0700
> > @@ -79,6 +79,7 @@ struct ioreq {
> >  
> >      struct XenBlkDev    *blkdev;
> >      QLIST_ENTRY(ioreq)   list;
> > +    BlockAcctCookie     acct;
> 
> Indentation is off.

Actually only the indentation of list is off compared to the rest of
the structure and everything else in the file.

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [Qemu-devel] [PATCH 0/3] better I/O accounting V2
  2011-08-22 16:46         ` Ryan Harper
@ 2011-08-24 18:45           ` Christoph Hellwig
  2011-08-25 16:41             ` Ryan Harper
  0 siblings, 1 reply; 23+ messages in thread
From: Christoph Hellwig @ 2011-08-24 18:45 UTC (permalink / raw)
  To: Ryan Harper; +Cc: Christoph Hellwig, qemu-devel

On Mon, Aug 22, 2011 at 11:46:00AM -0500, Ryan Harper wrote:
> So, I believe this is how it's happening.
> 
> we start accounting on a write which is turned into a multiwrite via
> virtio_blk_handle_write() which calls virtio_submit_multiwrite().
> 
> Then when the multiwrite completes, we invoke virtio_blk_rw_complete()
> on each part of the multiwrite.  None of these requests have had their
> acct structure initialized since there was just *one* initial write.
> We could do a bdrv_acct_start() on each req, but that would break the
> concept of hiding the additional writes under the initial request.

We do initialize all write fields correctly.  Where we fail right now
is for non-read/write requests.  I can reproduce your issue by reading
the serial attribute in sysfs - any chance Fedora does that during boot?

The patch below should take care of these cases:


Index: qemu/hw/virtio-blk.c
===================================================================
--- qemu.orig/hw/virtio-blk.c	2011-08-24 20:32:03.764503179 +0200
+++ qemu/hw/virtio-blk.c	2011-08-24 20:35:55.716579922 +0200
@@ -59,9 +59,6 @@ static void virtio_blk_req_complete(Virt
     stb_p(&req->in->status, status);
     virtqueue_push(s->vq, &req->elem, req->qiov.size + sizeof(*req->in));
     virtio_notify(&s->vdev, s->vq);
-
-    bdrv_acct_done(s->bs, &req->acct);
-    g_free(req);
 }
 
 static int virtio_blk_handle_rw_error(VirtIOBlockReq *req, int error,
@@ -83,6 +80,8 @@ static int virtio_blk_handle_rw_error(Vi
         vm_stop(VMSTOP_DISKFULL);
     } else {
         virtio_blk_req_complete(req, VIRTIO_BLK_S_IOERR);
+        bdrv_acct_done(s->bs, &req->acct);
+        g_free(req);
         bdrv_mon_event(s->bs, BDRV_ACTION_REPORT, is_read);
     }
 
@@ -102,6 +101,8 @@ static void virtio_blk_rw_complete(void
     }
 
     virtio_blk_req_complete(req, VIRTIO_BLK_S_OK);
+    bdrv_acct_done(req->dev->bs, &req->acct);
+    g_free(req);
 }
 
 static void virtio_blk_flush_complete(void *opaque, int ret)
@@ -115,6 +116,8 @@ static void virtio_blk_flush_complete(vo
     }
 
     virtio_blk_req_complete(req, VIRTIO_BLK_S_OK);
+    bdrv_acct_done(req->dev->bs, &req->acct);
+    g_free(req);
 }
 
 static VirtIOBlockReq *virtio_blk_alloc_request(VirtIOBlock *s)
@@ -157,6 +160,7 @@ static void virtio_blk_handle_scsi(VirtI
      */
     if (req->elem.out_num < 2 || req->elem.in_num < 3) {
         virtio_blk_req_complete(req, VIRTIO_BLK_S_IOERR);
+        g_free(req);
         return;
     }
 
@@ -165,6 +169,7 @@ static void virtio_blk_handle_scsi(VirtI
      */
     if (req->elem.out_num > 2 && req->elem.in_num > 3) {
         virtio_blk_req_complete(req, VIRTIO_BLK_S_UNSUPP);
+        g_free(req);
         return;
     }
 
@@ -231,11 +236,13 @@ static void virtio_blk_handle_scsi(VirtI
     stl_p(&req->scsi->data_len, hdr.dxfer_len);
 
     virtio_blk_req_complete(req, status);
+    g_free(req);
 }
 #else
 static void virtio_blk_handle_scsi(VirtIOBlockReq *req)
 {
     virtio_blk_req_complete(req, VIRTIO_BLK_S_UNSUPP);
+    g_free(req);
 }
 #endif /* __linux__ */
 
@@ -378,6 +385,7 @@ static void virtio_blk_handle_request(Vi
                 s->serial ? s->serial : "",
                 MIN(req->elem.in_sg[0].iov_len, VIRTIO_BLK_ID_BYTES));
         virtio_blk_req_complete(req, VIRTIO_BLK_S_OK);
+        g_free(req);
     } else if (type & VIRTIO_BLK_T_OUT) {
         qemu_iovec_init_external(&req->qiov, &req->elem.out_sg[1],
                                  req->elem.out_num - 1);

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [Qemu-devel] [PATCH 0/3] better I/O accounting V3
  2011-08-21 22:25 [Qemu-devel] [PATCH 0/3] better I/O accounting V2 Christoph Hellwig
                   ` (5 preceding siblings ...)
  2011-08-22 15:55 ` Kevin Wolf
@ 2011-08-25  6:25 ` Christoph Hellwig
  2011-08-25  6:25   ` [Qemu-devel] [PATCH 1/3] block: include flush requests in info blockstats Christoph Hellwig
                     ` (3 more replies)
  6 siblings, 4 replies; 23+ messages in thread
From: Christoph Hellwig @ 2011-08-25  6:25 UTC (permalink / raw)
  To: qemu-devel

changes since V2:
 - fix the crash with non read/write/flush commands in virtio_blk
   pointed out by Ryan
 - address all review comments by Kevin
changes since V1:
 - rebase to apply against the current qemu.git tree


Note that this still includes the (unchanged) patch 1 that Kevin already
applied, but that didn't make it to qemu.git yet.

^ permalink raw reply	[flat|nested] 23+ messages in thread

* [Qemu-devel] [PATCH 1/3] block: include flush requests in info blockstats
  2011-08-25  6:25 ` [Qemu-devel] [PATCH 0/3] better I/O accounting V3 Christoph Hellwig
@ 2011-08-25  6:25   ` Christoph Hellwig
  2011-08-25  6:26   ` [Qemu-devel] [PATCH 2/3] block: explicit I/O accounting Christoph Hellwig
                     ` (2 subsequent siblings)
  3 siblings, 0 replies; 23+ messages in thread
From: Christoph Hellwig @ 2011-08-25  6:25 UTC (permalink / raw)
  To: qemu-devel

Signed-off-by: Christoph Hellwig <hch@lst.de>

Index: qemu/block.c
===================================================================
--- qemu.orig/block.c	2011-08-11 12:32:35.842658196 +0200
+++ qemu/block.c	2011-08-11 16:04:48.302061893 +0200
@@ -1888,11 +1888,13 @@ static void bdrv_stats_iter(QObject *dat
                         " wr_bytes=%" PRId64
                         " rd_operations=%" PRId64
                         " wr_operations=%" PRId64
+                        " flush_operations=%" PRId64
                         "\n",
                         qdict_get_int(qdict, "rd_bytes"),
                         qdict_get_int(qdict, "wr_bytes"),
                         qdict_get_int(qdict, "rd_operations"),
-                        qdict_get_int(qdict, "wr_operations"));
+                        qdict_get_int(qdict, "wr_operations"),
+                        qdict_get_int(qdict, "flush_operations"));
 }
 
 void bdrv_stats_print(Monitor *mon, const QObject *data)
@@ -1910,12 +1912,16 @@ static QObject* bdrv_info_stats_bs(Block
                              "'wr_bytes': %" PRId64 ","
                              "'rd_operations': %" PRId64 ","
                              "'wr_operations': %" PRId64 ","
-                             "'wr_highest_offset': %" PRId64
+                             "'wr_highest_offset': %" PRId64 ","
+                             "'flush_operations': %" PRId64
                              "} }",
-                             bs->rd_bytes, bs->wr_bytes,
-                             bs->rd_ops, bs->wr_ops,
+                             bs->rd_bytes,
+                             bs->wr_bytes,
+                             bs->rd_ops,
+                             bs->wr_ops,
                              bs->wr_highest_sector *
-                             (uint64_t)BDRV_SECTOR_SIZE);
+                             (uint64_t)BDRV_SECTOR_SIZE,
+                             bs->flush_ops);
     dict  = qobject_to_qdict(res);
 
     if (*bs->device_name) {
@@ -2579,6 +2585,8 @@ BlockDriverAIOCB *bdrv_aio_flush(BlockDr
 
     trace_bdrv_aio_flush(bs, opaque);
 
+    bs->flush_ops++;
+
     if (bs->open_flags & BDRV_O_NO_FLUSH) {
         return bdrv_aio_noop_em(bs, cb, opaque);
     }
Index: qemu/block_int.h
===================================================================
--- qemu.orig/block_int.h	2011-08-11 12:32:35.852658142 +0200
+++ qemu/block_int.h	2011-08-11 16:04:48.302061893 +0200
@@ -188,6 +188,7 @@ struct BlockDriverState {
     uint64_t wr_bytes;
     uint64_t rd_ops;
     uint64_t wr_ops;
+    uint64_t flush_ops;
     uint64_t wr_highest_sector;
 
     /* Whether the disk can expand beyond total_sectors */
Index: qemu/qmp-commands.hx
===================================================================
--- qemu.orig/qmp-commands.hx	2011-08-11 12:32:36.062657004 +0200
+++ qemu/qmp-commands.hx	2011-08-11 16:04:48.305395208 +0200
@@ -1201,6 +1201,7 @@ Each json-object contain the following:
     - "wr_bytes": bytes written (json-int)
     - "rd_operations": read operations (json-int)
     - "wr_operations": write operations (json-int)
+    - "flush_operations": cache flush operations (json-int)
     - "wr_highest_offset": Highest offset of a sector written since the
                            BlockDriverState has been opened (json-int)
 - "parent": Contains recursively the statistics of the underlying
@@ -1222,6 +1223,7 @@ Example:
                   "wr_operations":751,
                   "rd_bytes":122567168,
                   "rd_operations":36772
+                  "flush_operations":61,
                }
             },
             "stats":{
@@ -1230,6 +1232,7 @@ Example:
                "wr_operations":692,
                "rd_bytes":122739200,
                "rd_operations":36604
+               "flush_operations":51,
             }
          },
          {
@@ -1240,6 +1243,7 @@ Example:
                "wr_operations":0,
                "rd_bytes":0,
                "rd_operations":0
+               "flush_operations":0,
             }
          },
          {
@@ -1250,6 +1254,7 @@ Example:
                "wr_operations":0,
                "rd_bytes":0,
                "rd_operations":0
+               "flush_operations":0,
             }
          },
          {
@@ -1260,6 +1265,7 @@ Example:
                "wr_operations":0,
                "rd_bytes":0,
                "rd_operations":0
+               "flush_operations":0,
             }
          }
       ]

^ permalink raw reply	[flat|nested] 23+ messages in thread

* [Qemu-devel] [PATCH 2/3] block: explicit I/O accounting
  2011-08-25  6:25 ` [Qemu-devel] [PATCH 0/3] better I/O accounting V3 Christoph Hellwig
  2011-08-25  6:25   ` [Qemu-devel] [PATCH 1/3] block: include flush requests in info blockstats Christoph Hellwig
@ 2011-08-25  6:26   ` Christoph Hellwig
  2011-08-25  6:26   ` [Qemu-devel] [PATCH 3/3] block: latency accounting Christoph Hellwig
  2011-08-25 16:10   ` [Qemu-devel] [PATCH 0/3] better I/O accounting V3 Kevin Wolf
  3 siblings, 0 replies; 23+ messages in thread
From: Christoph Hellwig @ 2011-08-25  6:26 UTC (permalink / raw)
  To: qemu-devel

Decouple the I/O accounting from bdrv_aio_readv/writev/flush and
make the hardware models call directly into the accounting helpers.

This means:
 - we do not count internal requests from image formats in addition
   to guest originating I/O
 - we do not double count I/O ops if the device model handles it
   chunk wise
 - we only account I/O once it actually is done
 - can extend I/O accounting to synchronous or coroutine I/O easily
 - implement I/O latency tracking easily (see the next patch)

I've converted the existing device model callers to the new model,
device models that are using synchronous I/O and weren't accounted
before haven't been updated yet.  Also scsi hasn't been converted
to the end-to-end accounting as I want to defer that after the pending
scsi layer overhaul.

Signed-off-by: Christoph Hellwig <hch@lst.de>

Index: qemu/block.c
===================================================================
--- qemu.orig/block.c	2011-08-25 08:01:04.661979573 +0200
+++ qemu/block.c	2011-08-25 08:01:12.221938617 +0200
@@ -1915,13 +1915,13 @@ static QObject* bdrv_info_stats_bs(Block
                              "'wr_highest_offset': %" PRId64 ","
                              "'flush_operations': %" PRId64
                              "} }",
-                             bs->rd_bytes,
-                             bs->wr_bytes,
-                             bs->rd_ops,
-                             bs->wr_ops,
+                             bs->nr_bytes[BDRV_ACCT_READ],
+                             bs->nr_bytes[BDRV_ACCT_WRITE],
+                             bs->nr_ops[BDRV_ACCT_READ],
+                             bs->nr_ops[BDRV_ACCT_WRITE],
                              bs->wr_highest_sector *
                              (uint64_t)BDRV_SECTOR_SIZE,
-                             bs->flush_ops);
+                             bs->nr_ops[BDRV_ACCT_FLUSH]);
     dict  = qobject_to_qdict(res);
 
     if (*bs->device_name) {
@@ -2235,7 +2235,6 @@ char *bdrv_snapshot_dump(char *buf, int
     return buf;
 }
 
-
 /**************************************************************/
 /* async I/Os */
 
@@ -2244,7 +2243,6 @@ BlockDriverAIOCB *bdrv_aio_readv(BlockDr
                                  BlockDriverCompletionFunc *cb, void *opaque)
 {
     BlockDriver *drv = bs->drv;
-    BlockDriverAIOCB *ret;
 
     trace_bdrv_aio_readv(bs, sector_num, nb_sectors, opaque);
 
@@ -2253,16 +2251,8 @@ BlockDriverAIOCB *bdrv_aio_readv(BlockDr
     if (bdrv_check_request(bs, sector_num, nb_sectors))
         return NULL;
 
-    ret = drv->bdrv_aio_readv(bs, sector_num, qiov, nb_sectors,
-                              cb, opaque);
-
-    if (ret) {
-        /* Update stats even though technically transfer has not happened. */
-        bs->rd_bytes += (unsigned) nb_sectors * BDRV_SECTOR_SIZE;
-        bs->rd_ops++;
-    }
-
-    return ret;
+    return drv->bdrv_aio_readv(bs, sector_num, qiov, nb_sectors,
+                               cb, opaque);
 }
 
 typedef struct BlockCompleteData {
@@ -2329,9 +2319,6 @@ BlockDriverAIOCB *bdrv_aio_writev(BlockD
                                cb, opaque);
 
     if (ret) {
-        /* Update stats even though technically transfer has not happened. */
-        bs->wr_bytes += (unsigned) nb_sectors * BDRV_SECTOR_SIZE;
-        bs->wr_ops ++;
         if (bs->wr_highest_sector < sector_num + nb_sectors - 1) {
             bs->wr_highest_sector = sector_num + nb_sectors - 1;
         }
@@ -2585,8 +2572,6 @@ BlockDriverAIOCB *bdrv_aio_flush(BlockDr
 
     trace_bdrv_aio_flush(bs, opaque);
 
-    bs->flush_ops++;
-
     if (bs->open_flags & BDRV_O_NO_FLUSH) {
         return bdrv_aio_noop_em(bs, cb, opaque);
     }
@@ -3141,6 +3126,25 @@ int bdrv_in_use(BlockDriverState *bs)
     return bs->in_use;
 }
 
+void
+bdrv_acct_start(BlockDriverState *bs, BlockAcctCookie *cookie, int64_t bytes,
+        enum BlockAcctType type)
+{
+    assert(type < BDRV_MAX_IOTYPE);
+
+    cookie->bytes = bytes;
+    cookie->type = type;
+}
+
+void
+bdrv_acct_done(BlockDriverState *bs, BlockAcctCookie *cookie)
+{
+    assert(cookie->type < BDRV_MAX_IOTYPE);
+
+    bs->nr_bytes[cookie->type] += cookie->bytes;
+    bs->nr_ops[cookie->type]++;
+}
+
 int bdrv_img_create(const char *filename, const char *fmt,
                     const char *base_filename, const char *base_fmt,
                     char *options, uint64_t img_size, int flags)
Index: qemu/block_int.h
===================================================================
--- qemu.orig/block_int.h	2011-08-25 08:01:04.661979573 +0200
+++ qemu/block_int.h	2011-08-25 08:01:12.000000000 +0200
@@ -184,11 +184,8 @@ struct BlockDriverState {
     void *sync_aiocb;
 
     /* I/O stats (display with "info blockstats"). */
-    uint64_t rd_bytes;
-    uint64_t wr_bytes;
-    uint64_t rd_ops;
-    uint64_t wr_ops;
-    uint64_t flush_ops;
+    uint64_t nr_bytes[BDRV_MAX_IOTYPE];
+    uint64_t nr_ops[BDRV_MAX_IOTYPE];
     uint64_t wr_highest_sector;
 
     /* Whether the disk can expand beyond total_sectors */
Index: qemu/hw/ide/ahci.c
===================================================================
--- qemu.orig/hw/ide/ahci.c	2011-08-25 08:00:55.028698426 +0200
+++ qemu/hw/ide/ahci.c	2011-08-25 08:01:12.000000000 +0200
@@ -710,6 +710,7 @@ static void ncq_cb(void *opaque, int ret
     DPRINTF(ncq_tfs->drive->port_no, "NCQ transfer tag %d finished\n",
             ncq_tfs->tag);
 
+    bdrv_acct_done(ncq_tfs->drive->port.ifs[0].bs, &ncq_tfs->acct);
     qemu_sglist_destroy(&ncq_tfs->sglist);
     ncq_tfs->used = 0;
 }
@@ -755,7 +756,11 @@ static void process_ncq_command(AHCIStat
                     ncq_tfs->sector_count-1, ncq_tfs->lba, ncq_tfs->tag);
             ncq_tfs->is_read = 1;
 
-            DPRINTF(port, "tag %d aio read %ld\n", ncq_tfs->tag, ncq_tfs->lba);
+           DPRINTF(port, "tag %d aio read %ld\n", ncq_tfs->tag, ncq_tfs->lba);
+
+            bdrv_acct_start(ncq_tfs->drive->port.ifs[0].bs, &ncq_tfs->acct,
+                            (ncq_tfs->sector_count-1) * BDRV_SECTOR_SIZE,
+                            BDRV_ACCT_READ);
             ncq_tfs->aiocb = dma_bdrv_read(ncq_tfs->drive->port.ifs[0].bs,
                                            &ncq_tfs->sglist, ncq_tfs->lba,
                                            ncq_cb, ncq_tfs);
@@ -766,6 +771,10 @@ static void process_ncq_command(AHCIStat
             ncq_tfs->is_read = 0;
 
             DPRINTF(port, "tag %d aio write %ld\n", ncq_tfs->tag, ncq_tfs->lba);
+
+            bdrv_acct_start(ncq_tfs->drive->port.ifs[0].bs, &ncq_tfs->acct,
+                            (ncq_tfs->sector_count-1) * BDRV_SECTOR_SIZE,
+                            BDRV_ACCT_WRITE);
             ncq_tfs->aiocb = dma_bdrv_write(ncq_tfs->drive->port.ifs[0].bs,
                                             &ncq_tfs->sglist, ncq_tfs->lba,
                                             ncq_cb, ncq_tfs);
Index: qemu/hw/ide/atapi.c
===================================================================
--- qemu.orig/hw/ide/atapi.c	2011-08-25 08:00:55.035365058 +0200
+++ qemu/hw/ide/atapi.c	2011-08-25 08:01:12.000000000 +0200
@@ -104,17 +104,20 @@ static void cd_data_to_raw(uint8_t *buf,
     memset(buf, 0, 288);
 }
 
-static int cd_read_sector(BlockDriverState *bs, int lba, uint8_t *buf,
-                           int sector_size)
+static int cd_read_sector(IDEState *s, int lba, uint8_t *buf, int sector_size)
 {
     int ret;
 
     switch(sector_size) {
     case 2048:
-        ret = bdrv_read(bs, (int64_t)lba << 2, buf, 4);
+        bdrv_acct_start(s->bs, &s->acct, 4 * BDRV_SECTOR_SIZE, BDRV_ACCT_READ);
+        ret = bdrv_read(s->bs, (int64_t)lba << 2, buf, 4);
+        bdrv_acct_done(s->bs, &s->acct);
         break;
     case 2352:
-        ret = bdrv_read(bs, (int64_t)lba << 2, buf + 16, 4);
+        bdrv_acct_start(s->bs, &s->acct, 4 * BDRV_SECTOR_SIZE, BDRV_ACCT_READ);
+        ret = bdrv_read(s->bs, (int64_t)lba << 2, buf + 16, 4);
+        bdrv_acct_done(s->bs, &s->acct);
         if (ret < 0)
             return ret;
         cd_data_to_raw(buf, lba);
@@ -181,7 +184,7 @@ void ide_atapi_cmd_reply_end(IDEState *s
     } else {
         /* see if a new sector must be read */
         if (s->lba != -1 && s->io_buffer_index >= s->cd_sector_size) {
-            ret = cd_read_sector(s->bs, s->lba, s->io_buffer, s->cd_sector_size);
+            ret = cd_read_sector(s, s->lba, s->io_buffer, s->cd_sector_size);
             if (ret < 0) {
                 ide_transfer_stop(s);
                 ide_atapi_io_error(s, ret);
@@ -250,6 +253,7 @@ static void ide_atapi_cmd_reply(IDEState
     s->io_buffer_index = 0;
 
     if (s->atapi_dma) {
+        bdrv_acct_start(s->bs, &s->acct, size, BDRV_ACCT_READ);
         s->status = READY_STAT | SEEK_STAT | DRQ_STAT;
         s->bus->dma->ops->start_dma(s->bus->dma, s,
                                    ide_atapi_cmd_read_dma_cb);
@@ -322,10 +326,7 @@ static void ide_atapi_cmd_read_dma_cb(vo
         s->status = READY_STAT | SEEK_STAT;
         s->nsector = (s->nsector & ~7) | ATAPI_INT_REASON_IO | ATAPI_INT_REASON_CD;
         ide_set_irq(s->bus);
-    eot:
-        s->bus->dma->ops->add_status(s->bus->dma, BM_STATUS_INT);
-        ide_set_inactive(s);
-        return;
+        goto eot;
     }
 
     s->io_buffer_index = 0;
@@ -343,9 +344,11 @@ static void ide_atapi_cmd_read_dma_cb(vo
 #ifdef DEBUG_AIO
     printf("aio_read_cd: lba=%u n=%d\n", s->lba, n);
 #endif
+
     s->bus->dma->iov.iov_base = (void *)(s->io_buffer + data_offset);
     s->bus->dma->iov.iov_len = n * 4 * 512;
     qemu_iovec_init_external(&s->bus->dma->qiov, &s->bus->dma->iov, 1);
+
     s->bus->dma->aiocb = bdrv_aio_readv(s->bs, (int64_t)s->lba << 2,
                                        &s->bus->dma->qiov, n * 4,
                                        ide_atapi_cmd_read_dma_cb, s);
@@ -355,6 +358,12 @@ static void ide_atapi_cmd_read_dma_cb(vo
                             ASC_MEDIUM_NOT_PRESENT);
         goto eot;
     }
+
+    return;
+eot:
+    bdrv_acct_done(s->bs, &s->acct);
+    s->bus->dma->ops->add_status(s->bus->dma, BM_STATUS_INT);
+    ide_set_inactive(s);
 }
 
 /* start a CD-CDROM read command with DMA */
@@ -368,6 +377,8 @@ static void ide_atapi_cmd_read_dma(IDESt
     s->io_buffer_size = 0;
     s->cd_sector_size = sector_size;
 
+    bdrv_acct_start(s->bs, &s->acct, s->packet_transfer_size, BDRV_ACCT_READ);
+
     /* XXX: check if BUSY_STAT should be set */
     s->status = READY_STAT | SEEK_STAT | DRQ_STAT | BUSY_STAT;
     s->bus->dma->ops->start_dma(s->bus->dma, s,
Index: qemu/hw/ide/core.c
===================================================================
--- qemu.orig/hw/ide/core.c	2011-08-25 08:00:55.042031689 +0200
+++ qemu/hw/ide/core.c	2011-08-25 08:01:12.000000000 +0200
@@ -473,7 +473,10 @@ void ide_sector_read(IDEState *s)
 #endif
         if (n > s->req_nb_sectors)
             n = s->req_nb_sectors;
+
+        bdrv_acct_start(s->bs, &s->acct, n * BDRV_SECTOR_SIZE, BDRV_ACCT_READ);
         ret = bdrv_read(s->bs, sector_num, s->io_buffer, n);
+        bdrv_acct_done(s->bs, &s->acct);
         if (ret != 0) {
             if (ide_handle_rw_error(s, -ret,
                 BM_STATUS_PIO_RETRY | BM_STATUS_RETRY_READ))
@@ -610,7 +613,9 @@ handle_rw_error:
     return;
 
 eot:
-   ide_set_inactive(s);
+    if (s->dma_cmd == IDE_DMA_READ || s->dma_cmd == IDE_DMA_WRITE)
+	    bdrv_acct_done(s->bs, &s->acct);
+    ide_set_inactive(s);
 }
 
 static void ide_sector_start_dma(IDEState *s, enum ide_dma_cmd dma_cmd)
@@ -619,6 +624,20 @@ static void ide_sector_start_dma(IDEStat
     s->io_buffer_index = 0;
     s->io_buffer_size = 0;
     s->dma_cmd = dma_cmd;
+
+    switch (dma_cmd) {
+    case IDE_DMA_READ:
+        bdrv_acct_start(s->bs, &s->acct, s->nsector * BDRV_SECTOR_SIZE,
+                        BDRV_ACCT_READ);
+        break;
+    case IDE_DMA_WRITE:
+        bdrv_acct_start(s->bs, &s->acct, s->nsector * BDRV_SECTOR_SIZE,
+                        BDRV_ACCT_WRITE);
+        break;
+    default:
+        break;
+    }
+
     s->bus->dma->ops->start_dma(s->bus->dma, s, ide_dma_cb);
 }
 
@@ -641,7 +660,10 @@ void ide_sector_write(IDEState *s)
     n = s->nsector;
     if (n > s->req_nb_sectors)
         n = s->req_nb_sectors;
+
+    bdrv_acct_start(s->bs, &s->acct, n * BDRV_SECTOR_SIZE, BDRV_ACCT_READ);
     ret = bdrv_write(s->bs, sector_num, s->io_buffer, n);
+    bdrv_acct_done(s->bs, &s->acct);
 
     if (ret != 0) {
         if (ide_handle_rw_error(s, -ret, BM_STATUS_PIO_RETRY))
@@ -685,6 +707,7 @@ static void ide_flush_cb(void *opaque, i
         }
     }
 
+    bdrv_acct_done(s->bs, &s->acct);
     s->status = READY_STAT | SEEK_STAT;
     ide_set_irq(s->bus);
 }
@@ -698,6 +721,7 @@ void ide_flush_cache(IDEState *s)
         return;
     }
 
+    bdrv_acct_start(s->bs, &s->acct, 0, BDRV_ACCT_FLUSH);
     acb = bdrv_aio_flush(s->bs, ide_flush_cb, s);
     if (acb == NULL) {
         ide_flush_cb(s, -EIO);
Index: qemu/hw/ide/macio.c
===================================================================
--- qemu.orig/hw/ide/macio.c	2011-08-25 08:00:55.048698319 +0200
+++ qemu/hw/ide/macio.c	2011-08-25 08:01:12.000000000 +0200
@@ -52,8 +52,7 @@ static void pmac_ide_atapi_transfer_cb(v
         m->aiocb = NULL;
         qemu_sglist_destroy(&s->sg);
         ide_atapi_io_error(s, ret);
-        io->dma_end(opaque);
-        return;
+        goto done;
     }
 
     if (s->io_buffer_size > 0) {
@@ -71,8 +70,7 @@ static void pmac_ide_atapi_transfer_cb(v
         ide_atapi_cmd_ok(s);
 
     if (io->len == 0) {
-        io->dma_end(opaque);
-        return;
+        goto done;
     }
 
     /* launch next transfer */
@@ -92,9 +90,14 @@ static void pmac_ide_atapi_transfer_cb(v
         /* Note: media not present is the most likely case */
         ide_atapi_cmd_error(s, SENSE_NOT_READY,
                             ASC_MEDIUM_NOT_PRESENT);
-        io->dma_end(opaque);
-        return;
+        goto done;
     }
+    return;
+
+done:
+    bdrv_acct_done(s->bs, &s->acct);
+    io->dma_end(opaque);
+    return;
 }
 
 static void pmac_ide_transfer_cb(void *opaque, int ret)
@@ -109,8 +112,7 @@ static void pmac_ide_transfer_cb(void *o
         m->aiocb = NULL;
         qemu_sglist_destroy(&s->sg);
 	ide_dma_error(s);
-        io->dma_end(io);
-        return;
+        goto done;
     }
 
     sector_num = ide_get_sector(s);
@@ -130,10 +132,8 @@ static void pmac_ide_transfer_cb(void *o
     }
 
     /* end of DMA ? */
-
     if (io->len == 0) {
-        io->dma_end(io);
-	return;
+        goto done;
     }
 
     /* launch next transfer */
@@ -163,6 +163,11 @@ static void pmac_ide_transfer_cb(void *o
 
     if (!m->aiocb)
         pmac_ide_transfer_cb(io, -1);
+    return;
+done:
+    if (s->dma_cmd == IDE_DMA_READ || s->dma_cmd == IDE_DMA_WRITE)
+        bdrv_acct_done(s->bs, &s->acct);
+    io->dma_end(io);
 }
 
 static void pmac_ide_transfer(DBDMA_io *io)
@@ -172,10 +177,22 @@ static void pmac_ide_transfer(DBDMA_io *
 
     s->io_buffer_size = 0;
     if (s->drive_kind == IDE_CD) {
+        bdrv_acct_start(s->bs, &s->acct, io->len, BDRV_ACCT_READ);
         pmac_ide_atapi_transfer_cb(io, 0);
         return;
     }
 
+    switch (s->dma_cmd) {
+    case IDE_DMA_READ:
+        bdrv_acct_start(s->bs, &s->acct, io->len, BDRV_ACCT_READ);
+        break;
+    case IDE_DMA_WRITE:
+        bdrv_acct_start(s->bs, &s->acct, io->len, BDRV_ACCT_WRITE);
+        break;
+    default:
+        break;
+    }
+
     pmac_ide_transfer_cb(io, 0);
 }
 
Index: qemu/hw/scsi-disk.c
===================================================================
--- qemu.orig/hw/scsi-disk.c	2011-08-25 08:00:55.075364842 +0200
+++ qemu/hw/scsi-disk.c	2011-08-25 08:01:12.000000000 +0200
@@ -57,6 +57,7 @@ typedef struct SCSIDiskReq {
     struct iovec iov;
     QEMUIOVector qiov;
     uint32_t status;
+    BlockAcctCookie acct;
 } SCSIDiskReq;
 
 struct SCSIDiskState
@@ -107,10 +108,13 @@ static void scsi_cancel_io(SCSIRequest *
 static void scsi_read_complete(void * opaque, int ret)
 {
     SCSIDiskReq *r = (SCSIDiskReq *)opaque;
+    SCSIDiskState *s = DO_UPCAST(SCSIDiskState, qdev, r->req.dev);
     int n;
 
     r->req.aiocb = NULL;
 
+    bdrv_acct_done(s->bs, &r->acct);
+
     if (ret) {
         if (scsi_handle_rw_error(r, -ret, SCSI_REQ_STATUS_RETRY_READ)) {
             return;
@@ -161,6 +165,8 @@ static void scsi_read_data(SCSIRequest *
 
     r->iov.iov_len = n * 512;
     qemu_iovec_init_external(&r->qiov, &r->iov, 1);
+
+    bdrv_acct_start(s->bs, &r->acct, n * BDRV_SECTOR_SIZE, BDRV_ACCT_READ);
     r->req.aiocb = bdrv_aio_readv(s->bs, r->sector, &r->qiov, n,
                               scsi_read_complete, r);
     if (r->req.aiocb == NULL) {
@@ -207,11 +213,14 @@ static int scsi_handle_rw_error(SCSIDisk
 static void scsi_write_complete(void * opaque, int ret)
 {
     SCSIDiskReq *r = (SCSIDiskReq *)opaque;
+    SCSIDiskState *s = DO_UPCAST(SCSIDiskState, qdev, r->req.dev);
     uint32_t len;
     uint32_t n;
 
     r->req.aiocb = NULL;
 
+    bdrv_acct_done(s->bs, &r->acct);
+
     if (ret) {
         if (scsi_handle_rw_error(r, -ret, SCSI_REQ_STATUS_RETRY_WRITE)) {
             return;
@@ -252,6 +261,8 @@ static void scsi_write_data(SCSIRequest
     n = r->iov.iov_len / 512;
     if (n) {
         qemu_iovec_init_external(&r->qiov, &r->iov, 1);
+
+        bdrv_acct_start(s->bs, &r->acct, n * BDRV_SECTOR_SIZE, BDRV_ACCT_WRITE);
         r->req.aiocb = bdrv_aio_writev(s->bs, r->sector, &r->qiov, n,
                                    scsi_write_complete, r);
         if (r->req.aiocb == NULL) {
@@ -854,13 +865,19 @@ static int scsi_disk_emulate_command(SCS
         buflen = 8;
         break;
     case SYNCHRONIZE_CACHE:
+{
+        BlockAcctCookie acct;
+
+        bdrv_acct_start(s->bs, &acct, 0, BDRV_ACCT_FLUSH);
         ret = bdrv_flush(s->bs);
+        bdrv_acct_done(s->bs, &acct);
         if (ret < 0) {
             if (scsi_handle_rw_error(r, -ret, SCSI_REQ_STATUS_RETRY_FLUSH)) {
                 return -1;
             }
         }
         break;
+}
     case GET_CONFIGURATION:
         memset(outbuf, 0, 8);
         /* ??? This should probably return much more information.  For now
Index: qemu/hw/virtio-blk.c
===================================================================
--- qemu.orig/hw/virtio-blk.c	2011-08-25 08:00:55.082031472 +0200
+++ qemu/hw/virtio-blk.c	2011-08-25 08:02:05.131651979 +0200
@@ -47,6 +47,7 @@ typedef struct VirtIOBlockReq
     struct virtio_scsi_inhdr *scsi;
     QEMUIOVector qiov;
     struct VirtIOBlockReq *next;
+    BlockAcctCookie acct;
 } VirtIOBlockReq;
 
 static void virtio_blk_req_complete(VirtIOBlockReq *req, int status)
@@ -58,8 +59,6 @@ static void virtio_blk_req_complete(Virt
     stb_p(&req->in->status, status);
     virtqueue_push(s->vq, &req->elem, req->qiov.size + sizeof(*req->in));
     virtio_notify(&s->vdev, s->vq);
-
-    g_free(req);
 }
 
 static int virtio_blk_handle_rw_error(VirtIOBlockReq *req, int error,
@@ -81,6 +80,8 @@ static int virtio_blk_handle_rw_error(Vi
         vm_stop(VMSTOP_DISKFULL);
     } else {
         virtio_blk_req_complete(req, VIRTIO_BLK_S_IOERR);
+        bdrv_acct_done(s->bs, &req->acct);
+        g_free(req);
         bdrv_mon_event(s->bs, BDRV_ACTION_REPORT, is_read);
     }
 
@@ -100,6 +101,8 @@ static void virtio_blk_rw_complete(void
     }
 
     virtio_blk_req_complete(req, VIRTIO_BLK_S_OK);
+    bdrv_acct_done(req->dev->bs, &req->acct);
+    g_free(req);
 }
 
 static void virtio_blk_flush_complete(void *opaque, int ret)
@@ -113,6 +116,8 @@ static void virtio_blk_flush_complete(vo
     }
 
     virtio_blk_req_complete(req, VIRTIO_BLK_S_OK);
+    bdrv_acct_done(req->dev->bs, &req->acct);
+    g_free(req);
 }
 
 static VirtIOBlockReq *virtio_blk_alloc_request(VirtIOBlock *s)
@@ -155,6 +160,7 @@ static void virtio_blk_handle_scsi(VirtI
      */
     if (req->elem.out_num < 2 || req->elem.in_num < 3) {
         virtio_blk_req_complete(req, VIRTIO_BLK_S_IOERR);
+        g_free(req);
         return;
     }
 
@@ -163,6 +169,7 @@ static void virtio_blk_handle_scsi(VirtI
      */
     if (req->elem.out_num > 2 && req->elem.in_num > 3) {
         virtio_blk_req_complete(req, VIRTIO_BLK_S_UNSUPP);
+        g_free(req);
         return;
     }
 
@@ -229,11 +236,13 @@ static void virtio_blk_handle_scsi(VirtI
     stl_p(&req->scsi->data_len, hdr.dxfer_len);
 
     virtio_blk_req_complete(req, status);
+    g_free(req);
 }
 #else
 static void virtio_blk_handle_scsi(VirtIOBlockReq *req)
 {
     virtio_blk_req_complete(req, VIRTIO_BLK_S_UNSUPP);
+    g_free(req);
 }
 #endif /* __linux__ */
 
@@ -266,6 +275,8 @@ static void virtio_blk_handle_flush(Virt
 {
     BlockDriverAIOCB *acb;
 
+    bdrv_acct_start(req->dev->bs, &req->acct, 0, BDRV_ACCT_FLUSH);
+
     /*
      * Make sure all outstanding writes are posted to the backing device.
      */
@@ -284,6 +295,8 @@ static void virtio_blk_handle_write(Virt
 
     sector = ldq_p(&req->out->sector);
 
+    bdrv_acct_start(req->dev->bs, &req->acct, req->qiov.size, BDRV_ACCT_WRITE);
+
     trace_virtio_blk_handle_write(req, sector, req->qiov.size / 512);
 
     if (sector & req->dev->sector_mask) {
@@ -317,6 +330,8 @@ static void virtio_blk_handle_read(VirtI
 
     sector = ldq_p(&req->out->sector);
 
+    bdrv_acct_start(req->dev->bs, &req->acct, req->qiov.size, BDRV_ACCT_READ);
+
     if (sector & req->dev->sector_mask) {
         virtio_blk_rw_complete(req, -EIO);
         return;
@@ -370,6 +385,7 @@ static void virtio_blk_handle_request(Vi
                 s->serial ? s->serial : "",
                 MIN(req->elem.in_sg[0].iov_len, VIRTIO_BLK_ID_BYTES));
         virtio_blk_req_complete(req, VIRTIO_BLK_S_OK);
+        g_free(req);
     } else if (type & VIRTIO_BLK_T_OUT) {
         qemu_iovec_init_external(&req->qiov, &req->elem.out_sg[1],
                                  req->elem.out_num - 1);
Index: qemu/hw/xen_disk.c
===================================================================
--- qemu.orig/hw/xen_disk.c	2011-08-25 08:00:55.092031417 +0200
+++ qemu/hw/xen_disk.c	2011-08-25 08:01:05.271976267 +0200
@@ -79,6 +79,7 @@ struct ioreq {
 
     struct XenBlkDev    *blkdev;
     QLIST_ENTRY(ioreq)   list;
+    BlockAcctCookie     acct;
 };
 
 struct XenBlkDev {
@@ -401,6 +402,7 @@ static void qemu_aio_complete(void *opaq
     ioreq->status = ioreq->aio_errors ? BLKIF_RSP_ERROR : BLKIF_RSP_OKAY;
     ioreq_unmap(ioreq);
     ioreq_finish(ioreq);
+    bdrv_acct_done(ioreq->blkdev->bs, &ioreq->acct);
     qemu_bh_schedule(ioreq->blkdev->bh);
 }
 
@@ -419,6 +421,7 @@ static int ioreq_runio_qemu_aio(struct i
 
     switch (ioreq->req.operation) {
     case BLKIF_OP_READ:
+        bdrv_acct_start(blkdev->bs, &ioreq->acct, ioreq->v.size, BDRV_ACCT_READ);
         ioreq->aio_inflight++;
         bdrv_aio_readv(blkdev->bs, ioreq->start / BLOCK_SIZE,
                        &ioreq->v, ioreq->v.size / BLOCK_SIZE,
@@ -429,6 +432,8 @@ static int ioreq_runio_qemu_aio(struct i
         if (!ioreq->req.nr_segments) {
             break;
         }
+
+        bdrv_acct_start(blkdev->bs, &ioreq->acct, ioreq->v.size, BDRV_ACCT_WRITE);
         ioreq->aio_inflight++;
         bdrv_aio_writev(blkdev->bs, ioreq->start / BLOCK_SIZE,
                         &ioreq->v, ioreq->v.size / BLOCK_SIZE,
Index: qemu/hw/ide/internal.h
===================================================================
--- qemu.orig/hw/ide/internal.h	2011-08-25 08:00:55.058698264 +0200
+++ qemu/hw/ide/internal.h	2011-08-25 08:01:05.275309582 +0200
@@ -440,6 +440,7 @@ struct IDEState {
     int lba;
     int cd_sector_size;
     int atapi_dma; /* true if dma is requested for the packet cmd */
+    BlockAcctCookie acct;
     /* ATA DMA state */
     int io_buffer_size;
     QEMUSGList sg;
Index: qemu/hw/ide/ahci.h
===================================================================
--- qemu.orig/hw/ide/ahci.h	2011-08-25 08:00:55.068698210 +0200
+++ qemu/hw/ide/ahci.h	2011-08-25 08:01:05.275309582 +0200
@@ -258,6 +258,7 @@ typedef struct NCQTransferState {
     AHCIDevice *drive;
     BlockDriverAIOCB *aiocb;
     QEMUSGList sglist;
+    BlockAcctCookie acct;
     int is_read;
     uint16_t sector_count;
     uint64_t lba;
Index: qemu/block.h
===================================================================
--- qemu.orig/block.h	2011-08-25 08:01:07.808629192 +0200
+++ qemu/block.h	2011-08-25 08:01:12.225271933 +0200
@@ -254,6 +254,22 @@ int64_t bdrv_get_dirty_count(BlockDriver
 void bdrv_set_in_use(BlockDriverState *bs, int in_use);
 int bdrv_in_use(BlockDriverState *bs);
 
+enum BlockAcctType {
+    BDRV_ACCT_READ,
+    BDRV_ACCT_WRITE,
+    BDRV_ACCT_FLUSH,
+    BDRV_MAX_IOTYPE,
+};
+
+typedef struct BlockAcctCookie {
+    int64_t bytes;
+    enum BlockAcctType type;
+} BlockAcctCookie;
+
+void bdrv_acct_start(BlockDriverState *bs, BlockAcctCookie *cookie,
+        int64_t bytes, enum BlockAcctType type);
+void bdrv_acct_done(BlockDriverState *bs, BlockAcctCookie *cookie);
+
 typedef enum {
     BLKDBG_L1_UPDATE,
 
@@ -306,3 +322,4 @@ typedef enum {
 void bdrv_debug_event(BlockDriverState *bs, BlkDebugEvent event);
 
 #endif
+

^ permalink raw reply	[flat|nested] 23+ messages in thread

* [Qemu-devel] [PATCH 3/3] block: latency accounting
  2011-08-25  6:25 ` [Qemu-devel] [PATCH 0/3] better I/O accounting V3 Christoph Hellwig
  2011-08-25  6:25   ` [Qemu-devel] [PATCH 1/3] block: include flush requests in info blockstats Christoph Hellwig
  2011-08-25  6:26   ` [Qemu-devel] [PATCH 2/3] block: explicit I/O accounting Christoph Hellwig
@ 2011-08-25  6:26   ` Christoph Hellwig
  2011-08-26 13:05     ` Kevin Wolf
  2011-08-25 16:10   ` [Qemu-devel] [PATCH 0/3] better I/O accounting V3 Kevin Wolf
  3 siblings, 1 reply; 23+ messages in thread
From: Christoph Hellwig @ 2011-08-25  6:26 UTC (permalink / raw)
  To: qemu-devel

Account the total latency for read/write/flush requests.  This allows
management tools to average it based on a snapshot of the nr ops
counters and allow checking for SLAs or provide statistics.

Signed-off-by: Christoph Hellwig <hch@lst.de>

Index: qemu/block.c
===================================================================
--- qemu.orig/block.c	2011-08-25 08:01:12.000000000 +0200
+++ qemu/block.c	2011-08-25 08:03:02.908005646 +0200
@@ -1889,12 +1889,18 @@ static void bdrv_stats_iter(QObject *dat
                         " rd_operations=%" PRId64
                         " wr_operations=%" PRId64
                         " flush_operations=%" PRId64
+                        " wr_total_time_ns=%" PRId64
+                        " rd_total_time_ns=%" PRId64
+                        " flush_total_time_ns=%" PRId64
                         "\n",
                         qdict_get_int(qdict, "rd_bytes"),
                         qdict_get_int(qdict, "wr_bytes"),
                         qdict_get_int(qdict, "rd_operations"),
                         qdict_get_int(qdict, "wr_operations"),
-                        qdict_get_int(qdict, "flush_operations"));
+                        qdict_get_int(qdict, "flush_operations"),
+                        qdict_get_int(qdict, "wr_total_time_ns"),
+                        qdict_get_int(qdict, "rd_total_time_ns"),
+                        qdict_get_int(qdict, "flush_total_time_ns"));
 }
 
 void bdrv_stats_print(Monitor *mon, const QObject *data)
@@ -1913,7 +1919,10 @@ static QObject* bdrv_info_stats_bs(Block
                              "'rd_operations': %" PRId64 ","
                              "'wr_operations': %" PRId64 ","
                              "'wr_highest_offset': %" PRId64 ","
-                             "'flush_operations': %" PRId64
+                             "'flush_operations': %" PRId64 ","
+                             "'wr_total_time_ns': %" PRId64 ","
+                             "'rd_total_time_ns': %" PRId64 ","
+                             "'flush_total_time_ns': %" PRId64
                              "} }",
                              bs->nr_bytes[BDRV_ACCT_READ],
                              bs->nr_bytes[BDRV_ACCT_WRITE],
@@ -1921,7 +1930,10 @@ static QObject* bdrv_info_stats_bs(Block
                              bs->nr_ops[BDRV_ACCT_WRITE],
                              bs->wr_highest_sector *
                              (uint64_t)BDRV_SECTOR_SIZE,
-                             bs->nr_ops[BDRV_ACCT_FLUSH]);
+                             bs->nr_ops[BDRV_ACCT_FLUSH],
+                             bs->total_time_ns[BDRV_ACCT_READ],
+                             bs->total_time_ns[BDRV_ACCT_WRITE],
+                             bs->total_time_ns[BDRV_ACCT_FLUSH]);
     dict  = qobject_to_qdict(res);
 
     if (*bs->device_name) {
@@ -3133,6 +3145,7 @@ bdrv_acct_start(BlockDriverState *bs, Bl
     assert(type < BDRV_MAX_IOTYPE);
 
     cookie->bytes = bytes;
+    cookie->start_time_ns = get_clock();
     cookie->type = type;
 }
 
@@ -3143,6 +3156,7 @@ bdrv_acct_done(BlockDriverState *bs, Blo
 
     bs->nr_bytes[cookie->type] += cookie->bytes;
     bs->nr_ops[cookie->type]++;
+    bs->total_time_ns[cookie->type] += get_clock() - cookie->start_time_ns;
 }
 
 int bdrv_img_create(const char *filename, const char *fmt,
Index: qemu/block_int.h
===================================================================
--- qemu.orig/block_int.h	2011-08-25 08:01:12.000000000 +0200
+++ qemu/block_int.h	2011-08-25 08:02:20.981566114 +0200
@@ -28,6 +28,7 @@
 #include "qemu-option.h"
 #include "qemu-queue.h"
 #include "qemu-coroutine.h"
+#include "qemu-timer.h"
 
 #define BLOCK_FLAG_ENCRYPT	1
 #define BLOCK_FLAG_COMPAT6	4
@@ -186,6 +187,7 @@ struct BlockDriverState {
     /* I/O stats (display with "info blockstats"). */
     uint64_t nr_bytes[BDRV_MAX_IOTYPE];
     uint64_t nr_ops[BDRV_MAX_IOTYPE];
+    uint64_t total_time_ns[BDRV_MAX_IOTYPE];
     uint64_t wr_highest_sector;
 
     /* Whether the disk can expand beyond total_sectors */
Index: qemu/qmp-commands.hx
===================================================================
--- qemu.orig/qmp-commands.hx	2011-08-25 08:01:04.000000000 +0200
+++ qemu/qmp-commands.hx	2011-08-25 08:02:20.984899429 +0200
@@ -1202,6 +1202,9 @@ Each json-object contain the following:
     - "rd_operations": read operations (json-int)
     - "wr_operations": write operations (json-int)
     - "flush_operations": cache flush operations (json-int)
+    - "wr_total_time_ns": total time spent on writes in nano-seconds (json-int)
+    - "rd_total_time_ns": total time spent on reads in nano-seconds (json-int)
+    - "flush_total_time_ns": total time spent on cache flushes in nano-seconds (json-int)
     - "wr_highest_offset": Highest offset of a sector written since the
                            BlockDriverState has been opened (json-int)
 - "parent": Contains recursively the statistics of the underlying
@@ -1223,6 +1226,9 @@ Example:
                   "wr_operations":751,
                   "rd_bytes":122567168,
                   "rd_operations":36772
+                  "wr_total_time_ns":313253456
+                  "rd_total_time_ns":3465673657
+                  "flush_total_time_ns":49653
                   "flush_operations":61,
                }
             },
@@ -1233,6 +1239,9 @@ Example:
                "rd_bytes":122739200,
                "rd_operations":36604
                "flush_operations":51,
+               "wr_total_time_ns":313253456
+               "rd_total_time_ns":3465673657
+               "flush_total_time_ns":49653
             }
          },
          {
@@ -1244,6 +1253,9 @@ Example:
                "rd_bytes":0,
                "rd_operations":0
                "flush_operations":0,
+               "wr_total_time_ns":0
+               "rd_total_time_ns":0
+               "flush_total_time_ns":0
             }
          },
          {
@@ -1255,6 +1267,9 @@ Example:
                "rd_bytes":0,
                "rd_operations":0
                "flush_operations":0,
+               "wr_total_time_ns":0
+               "rd_total_time_ns":0
+               "flush_total_time_ns":0
             }
          },
          {
@@ -1266,6 +1281,9 @@ Example:
                "rd_bytes":0,
                "rd_operations":0
                "flush_operations":0,
+               "wr_total_time_ns":0
+               "rd_total_time_ns":0
+               "flush_total_time_ns":0
             }
          }
       ]
Index: qemu/block.h
===================================================================
--- qemu.orig/block.h	2011-08-25 08:02:32.094839241 +0200
+++ qemu/block.h	2011-08-25 08:02:40.718125859 +0200
@@ -263,6 +263,7 @@ enum BlockAcctType {
 
 typedef struct BlockAcctCookie {
     int64_t bytes;
+    int64_t start_time_ns;
     enum BlockAcctType type;
 } BlockAcctCookie;
 

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [Qemu-devel] [PATCH 0/3] better I/O accounting V3
  2011-08-25  6:25 ` [Qemu-devel] [PATCH 0/3] better I/O accounting V3 Christoph Hellwig
                     ` (2 preceding siblings ...)
  2011-08-25  6:26   ` [Qemu-devel] [PATCH 3/3] block: latency accounting Christoph Hellwig
@ 2011-08-25 16:10   ` Kevin Wolf
  3 siblings, 0 replies; 23+ messages in thread
From: Kevin Wolf @ 2011-08-25 16:10 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: qemu-devel

Am 25.08.2011 08:25, schrieb Christoph Hellwig:
> changes since V2:
>  - fix the crash with non read/write/flush commands in virtio_blk
>    pointed out by Ryan
>  - address all review comments by Kevin
> changes since V1:
>  - rebase to apply against the current qemu.git tree
> 
> 
> Note that this still includes the (unchanged) patch 1 that Kevin already
> applied, but that didn't make it to qemu.git yet.

Thanks, applied to the block branch.

Kevin

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [Qemu-devel] [PATCH 0/3] better I/O accounting V2
  2011-08-24 18:45           ` Christoph Hellwig
@ 2011-08-25 16:41             ` Ryan Harper
  0 siblings, 0 replies; 23+ messages in thread
From: Ryan Harper @ 2011-08-25 16:41 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: Ryan Harper, qemu-devel

* Christoph Hellwig <hch@lst.de> [2011-08-24 13:46]:
> On Mon, Aug 22, 2011 at 11:46:00AM -0500, Ryan Harper wrote:
> > So, I believe this is how it's happening.
> > 
> > we start accounting on a write which is turned into a multiwrite via
> > virtio_blk_handle_write() which calls virtio_submit_multiwrite().
> > 
> > Then when the multiwrite completes, we invoke virtio_blk_rw_complete()
> > on each part of the multiwrite.  None of these requests have had their
> > acct structure initialized since there was just *one* initial write.
> > We could do a bdrv_acct_start() on each req, but that would break the
> > concept of hiding the additional writes under the initial request.
> 
> We do initialize all write fields correctly.  Where we fail right now
> is for non-read/write requests.  I can reproduce your issue by reading
> the serial attribute in sysfs - any chance Fedora does that during boot?

Indeed.  F15 has udev hooks to support /dev/disk/by-id for virtio-blk
devices which will issue non-read/write request (get drive serial).

> 
> The patch below should take care of these cases:

Tested with this patch, F15 boots fine, mkfs.ext4 and dd all work as
expected.

Tested-by: Ryan Harper <ryanh@us.ibm.com>

> 
> 
> Index: qemu/hw/virtio-blk.c
> ===================================================================
> --- qemu.orig/hw/virtio-blk.c	2011-08-24 20:32:03.764503179 +0200
> +++ qemu/hw/virtio-blk.c	2011-08-24 20:35:55.716579922 +0200
> @@ -59,9 +59,6 @@ static void virtio_blk_req_complete(Virt
>      stb_p(&req->in->status, status);
>      virtqueue_push(s->vq, &req->elem, req->qiov.size + sizeof(*req->in));
>      virtio_notify(&s->vdev, s->vq);
> -
> -    bdrv_acct_done(s->bs, &req->acct);
> -    g_free(req);
>  }
> 
>  static int virtio_blk_handle_rw_error(VirtIOBlockReq *req, int error,
> @@ -83,6 +80,8 @@ static int virtio_blk_handle_rw_error(Vi
>          vm_stop(VMSTOP_DISKFULL);
>      } else {
>          virtio_blk_req_complete(req, VIRTIO_BLK_S_IOERR);
> +        bdrv_acct_done(s->bs, &req->acct);
> +        g_free(req);
>          bdrv_mon_event(s->bs, BDRV_ACTION_REPORT, is_read);
>      }
> 
> @@ -102,6 +101,8 @@ static void virtio_blk_rw_complete(void
>      }
> 
>      virtio_blk_req_complete(req, VIRTIO_BLK_S_OK);
> +    bdrv_acct_done(req->dev->bs, &req->acct);
> +    g_free(req);
>  }
> 
>  static void virtio_blk_flush_complete(void *opaque, int ret)
> @@ -115,6 +116,8 @@ static void virtio_blk_flush_complete(vo
>      }
> 
>      virtio_blk_req_complete(req, VIRTIO_BLK_S_OK);
> +    bdrv_acct_done(req->dev->bs, &req->acct);
> +    g_free(req);
>  }
> 
>  static VirtIOBlockReq *virtio_blk_alloc_request(VirtIOBlock *s)
> @@ -157,6 +160,7 @@ static void virtio_blk_handle_scsi(VirtI
>       */
>      if (req->elem.out_num < 2 || req->elem.in_num < 3) {
>          virtio_blk_req_complete(req, VIRTIO_BLK_S_IOERR);
> +        g_free(req);
>          return;
>      }
> 
> @@ -165,6 +169,7 @@ static void virtio_blk_handle_scsi(VirtI
>       */
>      if (req->elem.out_num > 2 && req->elem.in_num > 3) {
>          virtio_blk_req_complete(req, VIRTIO_BLK_S_UNSUPP);
> +        g_free(req);
>          return;
>      }
> 
> @@ -231,11 +236,13 @@ static void virtio_blk_handle_scsi(VirtI
>      stl_p(&req->scsi->data_len, hdr.dxfer_len);
> 
>      virtio_blk_req_complete(req, status);
> +    g_free(req);
>  }
>  #else
>  static void virtio_blk_handle_scsi(VirtIOBlockReq *req)
>  {
>      virtio_blk_req_complete(req, VIRTIO_BLK_S_UNSUPP);
> +    g_free(req);
>  }
>  #endif /* __linux__ */
> 
> @@ -378,6 +385,7 @@ static void virtio_blk_handle_request(Vi
>                  s->serial ? s->serial : "",
>                  MIN(req->elem.in_sg[0].iov_len, VIRTIO_BLK_ID_BYTES));
>          virtio_blk_req_complete(req, VIRTIO_BLK_S_OK);
> +        g_free(req);
>      } else if (type & VIRTIO_BLK_T_OUT) {
>          qemu_iovec_init_external(&req->qiov, &req->elem.out_sg[1],
>                                   req->elem.out_num - 1);

-- 
Ryan Harper
Software Engineer; Linux Technology Center
IBM Corp., Austin, Tx
ryanh@us.ibm.com

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [Qemu-devel] [PATCH 3/3] block: latency accounting
  2011-08-25  6:26   ` [Qemu-devel] [PATCH 3/3] block: latency accounting Christoph Hellwig
@ 2011-08-26 13:05     ` Kevin Wolf
  2011-08-26 16:06       ` Christoph Hellwig
  0 siblings, 1 reply; 23+ messages in thread
From: Kevin Wolf @ 2011-08-26 13:05 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: qemu-devel

Am 25.08.2011 08:26, schrieb Christoph Hellwig:
> Account the total latency for read/write/flush requests.  This allows
> management tools to average it based on a snapshot of the nr ops
> counters and allow checking for SLAs or provide statistics.
> 
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> 
> Index: qemu/block.c
> ===================================================================
> --- qemu.orig/block.c	2011-08-25 08:01:12.000000000 +0200
> +++ qemu/block.c	2011-08-25 08:03:02.908005646 +0200
> @@ -1889,12 +1889,18 @@ static void bdrv_stats_iter(QObject *dat
>                          " rd_operations=%" PRId64
>                          " wr_operations=%" PRId64
>                          " flush_operations=%" PRId64
> +                        " wr_total_time_ns=%" PRId64
> +                        " rd_total_time_ns=%" PRId64
> +                        " flush_total_time_ns=%" PRId64
>                          "\n",
>                          qdict_get_int(qdict, "rd_bytes"),
>                          qdict_get_int(qdict, "wr_bytes"),
>                          qdict_get_int(qdict, "rd_operations"),
>                          qdict_get_int(qdict, "wr_operations"),
> -                        qdict_get_int(qdict, "flush_operations"));
> +                        qdict_get_int(qdict, "flush_operations"),
> +                        qdict_get_int(qdict, "wr_total_time_ns"),
> +                        qdict_get_int(qdict, "rd_total_time_ns"),
> +                        qdict_get_int(qdict, "flush_total_time_ns"));
>  }
>  
>  void bdrv_stats_print(Monitor *mon, const QObject *data)
> @@ -1913,7 +1919,10 @@ static QObject* bdrv_info_stats_bs(Block
>                               "'rd_operations': %" PRId64 ","
>                               "'wr_operations': %" PRId64 ","
>                               "'wr_highest_offset': %" PRId64 ","
> -                             "'flush_operations': %" PRId64
> +                             "'flush_operations': %" PRId64 ","
> +                             "'wr_total_time_ns': %" PRId64 ","
> +                             "'rd_total_time_ns': %" PRId64 ","
> +                             "'flush_total_time_ns': %" PRId64
>                               "} }",
>                               bs->nr_bytes[BDRV_ACCT_READ],
>                               bs->nr_bytes[BDRV_ACCT_WRITE],
> @@ -1921,7 +1930,10 @@ static QObject* bdrv_info_stats_bs(Block
>                               bs->nr_ops[BDRV_ACCT_WRITE],
>                               bs->wr_highest_sector *
>                               (uint64_t)BDRV_SECTOR_SIZE,
> -                             bs->nr_ops[BDRV_ACCT_FLUSH]);
> +                             bs->nr_ops[BDRV_ACCT_FLUSH],
> +                             bs->total_time_ns[BDRV_ACCT_READ],
> +                             bs->total_time_ns[BDRV_ACCT_WRITE],
> +                             bs->total_time_ns[BDRV_ACCT_FLUSH]);

The order of READ vs. WRITE is wrong here, so read latencies show up as
wr_total_time. I can fix it locally if you agree that swapping the two
lines is the right fix.

Kevin

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [Qemu-devel] [PATCH 3/3] block: latency accounting
  2011-08-26 13:05     ` Kevin Wolf
@ 2011-08-26 16:06       ` Christoph Hellwig
  0 siblings, 0 replies; 23+ messages in thread
From: Christoph Hellwig @ 2011-08-26 16:06 UTC (permalink / raw)
  To: Kevin Wolf; +Cc: Christoph Hellwig, qemu-devel

On Fri, Aug 26, 2011 at 03:05:06PM +0200, Kevin Wolf wrote:
> The order of READ vs. WRITE is wrong here, so read latencies show up as
> wr_total_time. I can fix it locally if you agree that swapping the two
> lines is the right fix.

Yes, it got mixed up in one of the revamps.

^ permalink raw reply	[flat|nested] 23+ messages in thread

* [Qemu-devel] [PATCH 1/3] block: include flush requests in info blockstats
  2011-08-11 23:44 [Qemu-devel] [PATCH 0/3] better I/O accounting Christoph Hellwig
@ 2011-08-11 23:44 ` Christoph Hellwig
  0 siblings, 0 replies; 23+ messages in thread
From: Christoph Hellwig @ 2011-08-11 23:44 UTC (permalink / raw)
  To: qemu-devel

Signed-off-by: Christoph Hellwig <hch@lst.de>

Index: qemu/block.c
===================================================================
--- qemu.orig/block.c	2011-08-11 12:32:35.842658196 +0200
+++ qemu/block.c	2011-08-11 16:04:48.302061893 +0200
@@ -1888,11 +1888,13 @@ static void bdrv_stats_iter(QObject *dat
                         " wr_bytes=%" PRId64
                         " rd_operations=%" PRId64
                         " wr_operations=%" PRId64
+                        " flush_operations=%" PRId64
                         "\n",
                         qdict_get_int(qdict, "rd_bytes"),
                         qdict_get_int(qdict, "wr_bytes"),
                         qdict_get_int(qdict, "rd_operations"),
-                        qdict_get_int(qdict, "wr_operations"));
+                        qdict_get_int(qdict, "wr_operations"),
+                        qdict_get_int(qdict, "flush_operations"));
 }
 
 void bdrv_stats_print(Monitor *mon, const QObject *data)
@@ -1910,12 +1912,16 @@ static QObject* bdrv_info_stats_bs(Block
                              "'wr_bytes': %" PRId64 ","
                              "'rd_operations': %" PRId64 ","
                              "'wr_operations': %" PRId64 ","
-                             "'wr_highest_offset': %" PRId64
+                             "'wr_highest_offset': %" PRId64 ","
+                             "'flush_operations': %" PRId64
                              "} }",
-                             bs->rd_bytes, bs->wr_bytes,
-                             bs->rd_ops, bs->wr_ops,
+                             bs->rd_bytes,
+                             bs->wr_bytes,
+                             bs->rd_ops,
+                             bs->wr_ops,
                              bs->wr_highest_sector *
-                             (uint64_t)BDRV_SECTOR_SIZE);
+                             (uint64_t)BDRV_SECTOR_SIZE,
+                             bs->flush_ops);
     dict  = qobject_to_qdict(res);
 
     if (*bs->device_name) {
@@ -2579,6 +2585,8 @@ BlockDriverAIOCB *bdrv_aio_flush(BlockDr
 
     trace_bdrv_aio_flush(bs, opaque);
 
+    bs->flush_ops++;
+
     if (bs->open_flags & BDRV_O_NO_FLUSH) {
         return bdrv_aio_noop_em(bs, cb, opaque);
     }
Index: qemu/block_int.h
===================================================================
--- qemu.orig/block_int.h	2011-08-11 12:32:35.852658142 +0200
+++ qemu/block_int.h	2011-08-11 16:04:48.302061893 +0200
@@ -188,6 +188,7 @@ struct BlockDriverState {
     uint64_t wr_bytes;
     uint64_t rd_ops;
     uint64_t wr_ops;
+    uint64_t flush_ops;
     uint64_t wr_highest_sector;
 
     /* Whether the disk can expand beyond total_sectors */
Index: qemu/qmp-commands.hx
===================================================================
--- qemu.orig/qmp-commands.hx	2011-08-11 12:32:36.062657004 +0200
+++ qemu/qmp-commands.hx	2011-08-11 16:04:48.305395208 +0200
@@ -1201,6 +1201,7 @@ Each json-object contain the following:
     - "wr_bytes": bytes written (json-int)
     - "rd_operations": read operations (json-int)
     - "wr_operations": write operations (json-int)
+    - "flush_operations": cache flush operations (json-int)
     - "wr_highest_offset": Highest offset of a sector written since the
                            BlockDriverState has been opened (json-int)
 - "parent": Contains recursively the statistics of the underlying
@@ -1222,6 +1223,7 @@ Example:
                   "wr_operations":751,
                   "rd_bytes":122567168,
                   "rd_operations":36772
+                  "flush_operations":61,
                }
             },
             "stats":{
@@ -1230,6 +1232,7 @@ Example:
                "wr_operations":692,
                "rd_bytes":122739200,
-               "rd_operations":36604
+               "rd_operations":36604,
+               "flush_operations":51
             }
          },
          {
@@ -1240,6 +1243,7 @@ Example:
                "wr_operations":0,
                "rd_bytes":0,
-               "rd_operations":0
+               "rd_operations":0,
+               "flush_operations":0
             }
          },
          {
@@ -1250,6 +1254,7 @@ Example:
                "wr_operations":0,
                "rd_bytes":0,
-               "rd_operations":0
+               "rd_operations":0,
+               "flush_operations":0
             }
          },
          {
@@ -1260,6 +1265,7 @@ Example:
                "wr_operations":0,
                "rd_bytes":0,
-               "rd_operations":0
+               "rd_operations":0,
+               "flush_operations":0
             }
          }
       ]

^ permalink raw reply	[flat|nested] 23+ messages in thread

end of thread, other threads:[~2011-08-26 16:06 UTC | newest]

Thread overview: 23+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2011-08-21 22:25 [Qemu-devel] [PATCH 0/3] better I/O accounting V2 Christoph Hellwig
2011-08-21 22:25 ` [Qemu-devel] [PATCH 1/3] block: include flush requests in info blockstats Christoph Hellwig
2011-08-21 22:26 ` [Qemu-devel] [PATCH 2/3] block: explicit I/O accounting Christoph Hellwig
2011-08-22 15:48   ` Kevin Wolf
2011-08-24 18:22     ` Christoph Hellwig
2011-08-21 22:26 ` [Qemu-devel] [PATCH 3/3] block: latency accounting Christoph Hellwig
2011-08-22  8:45 ` [Qemu-devel] [PATCH 0/3] better I/O accounting V2 Stefan Hajnoczi
2011-08-22 14:59 ` Ryan Harper
2011-08-22 15:12   ` Christoph Hellwig
2011-08-22 15:29     ` Ryan Harper
2011-08-22 15:35       ` Christoph Hellwig
2011-08-22 16:46         ` Ryan Harper
2011-08-24 18:45           ` Christoph Hellwig
2011-08-25 16:41             ` Ryan Harper
2011-08-22 15:55 ` Kevin Wolf
2011-08-25  6:25 ` [Qemu-devel] [PATCH 0/3] better I/O accounting V3 Christoph Hellwig
2011-08-25  6:25   ` [Qemu-devel] [PATCH 1/3] block: include flush requests in info blockstats Christoph Hellwig
2011-08-25  6:26   ` [Qemu-devel] [PATCH 2/3] block: explicit I/O accounting Christoph Hellwig
2011-08-25  6:26   ` [Qemu-devel] [PATCH 3/3] block: latency accounting Christoph Hellwig
2011-08-26 13:05     ` Kevin Wolf
2011-08-26 16:06       ` Christoph Hellwig
2011-08-25 16:10   ` [Qemu-devel] [PATCH 0/3] better I/O accounting V3 Kevin Wolf
  -- strict thread matches above, loose matches on Subject: below --
2011-08-11 23:44 [Qemu-devel] [PATCH 0/3] better I/O accounting Christoph Hellwig
2011-08-11 23:44 ` [Qemu-devel] [PATCH 1/3] block: include flush requests in info blockstats Christoph Hellwig

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.