All of lore.kernel.org
 help / color / mirror / Atom feed
* [Qemu-devel] [PATCH 1/4] virtio-blk: revert serial number support
@ 2010-01-29 19:04 Christoph Hellwig
  2010-01-29 19:04 ` [Qemu-devel] [PATCH 2/4] block: add block topology options Christoph Hellwig
                   ` (3 more replies)
  0 siblings, 4 replies; 13+ messages in thread
From: Christoph Hellwig @ 2010-01-29 19:04 UTC (permalink / raw)
  To: qemu-devel; +Cc: Martin K. Petersen

The addition of the whole ATA IDENTIY page caused the config space to
go above the allowed size in the PCI spec, and thus the feature was
already reverted in the Linux guest driver and disabled by default in
qemu.

Signed-off-by: Christoph Hellwig <hch@lst.de>

Index: qemu/hw/virtio-blk.c
===================================================================
--- qemu.orig/hw/virtio-blk.c	2010-01-29 19:38:34.977004394 +0100
+++ qemu/hw/virtio-blk.c	2010-01-29 19:38:37.137253963 +0100
@@ -25,9 +25,7 @@ typedef struct VirtIOBlock
     BlockDriverState *bs;
     VirtQueue *vq;
     void *rq;
-    char serial_str[BLOCK_SERIAL_STRLEN + 1];
     QEMUBH *bh;
-    size_t config_size;
 } VirtIOBlock;
 
 static VirtIOBlock *to_virtio_blk(VirtIODevice *vdev)
@@ -35,47 +33,6 @@ static VirtIOBlock *to_virtio_blk(VirtIO
     return (VirtIOBlock *)vdev;
 }
 
-/* store identify data in little endian format
- */
-static inline void put_le16(uint16_t *p, unsigned int v)
-{
-    *p = cpu_to_le16(v);
-}
-
-/* copy to *dst from *src, nul pad dst tail as needed to len bytes
- */
-static inline void padstr(char *dst, const char *src, int len)
-{
-    while (len--)
-        *dst++ = *src ? *src++ : '\0';
-}
-
-/* setup simulated identify data as appropriate for virtio block device
- *
- * ref: AT Attachment 8 - ATA/ATAPI Command Set (ATA8-ACS)
- */
-static inline void virtio_identify_template(struct virtio_blk_config *bc)
-{
-    uint16_t *p = &bc->identify[0];
-    uint64_t lba_sectors = bc->capacity;
-
-    memset(p, 0, sizeof(bc->identify));
-    put_le16(p + 0, 0x0);                            /* ATA device */
-    padstr((char *)(p + 23), QEMU_VERSION, 8);       /* firmware revision */
-    padstr((char *)(p + 27), "QEMU VIRT_BLK", 40);   /* model# */
-    put_le16(p + 47, 0x80ff);                        /* max xfer 255 sectors */
-    put_le16(p + 49, 0x0b00);                        /* support IORDY/LBA/DMA */
-    put_le16(p + 59, 0x1ff);                         /* cur xfer 255 sectors */
-    put_le16(p + 80, 0x1f0);                         /* support ATA8/7/6/5/4 */
-    put_le16(p + 81, 0x16);
-    put_le16(p + 82, 0x400);
-    put_le16(p + 83, 0x400);
-    put_le16(p + 100, lba_sectors);
-    put_le16(p + 101, lba_sectors >> 16);
-    put_le16(p + 102, lba_sectors >> 32);
-    put_le16(p + 103, lba_sectors >> 48);
-}
-
 typedef struct VirtIOBlockReq
 {
     VirtIOBlock *dev;
@@ -423,10 +380,7 @@ static void virtio_blk_update_config(Vir
     blkcfg.heads = heads;
     blkcfg.sectors = secs;
     blkcfg.size_max = 0;
-    virtio_identify_template(&blkcfg);
-    memcpy(&blkcfg.identify[VIRTIO_BLK_ID_SN], s->serial_str,
-        VIRTIO_BLK_ID_SN_BYTES);
-    memcpy(config, &blkcfg, s->config_size);
+    memcpy(config, &blkcfg, sizeof(struct virtio_blk_config));
 }
 
 static uint32_t virtio_blk_get_features(VirtIODevice *vdev, uint32_t features)
@@ -438,8 +392,6 @@ static uint32_t virtio_blk_get_features(
 
     if (bdrv_enable_write_cache(s->bs))
         features |= (1 << VIRTIO_BLK_F_WCACHE);
-    if (strcmp(s->serial_str, "0"))
-        features |= 1 << VIRTIO_BLK_F_IDENTIFY;
     
     if (bdrv_is_read_only(s->bs))
         features |= 1 << VIRTIO_BLK_F_RO;
@@ -485,24 +437,16 @@ VirtIODevice *virtio_blk_init(DeviceStat
     VirtIOBlock *s;
     int cylinders, heads, secs;
     static int virtio_blk_id;
-    char *ps = (char *)drive_get_serial(dinfo->bdrv);
-    size_t size = strlen(ps) ? sizeof(struct virtio_blk_config) :
-	    offsetof(struct virtio_blk_config, _blk_size);
 
     s = (VirtIOBlock *)virtio_common_init("virtio-blk", VIRTIO_ID_BLOCK,
-                                          size,
+                                          sizeof(struct virtio_blk_config),
                                           sizeof(VirtIOBlock));
 
-    s->config_size = size;
     s->vdev.get_config = virtio_blk_update_config;
     s->vdev.get_features = virtio_blk_get_features;
     s->vdev.reset = virtio_blk_reset;
     s->bs = dinfo->bdrv;
     s->rq = NULL;
-    if (strlen(ps))
-        strncpy(s->serial_str, ps, sizeof(s->serial_str));
-    else
-        snprintf(s->serial_str, sizeof(s->serial_str), "0");
     bdrv_guess_geometry(s->bs, &cylinders, &heads, &secs);
     bdrv_set_geometry_hint(s->bs, cylinders, heads, secs);
 
Index: qemu/hw/virtio-blk.h
===================================================================
--- qemu.orig/hw/virtio-blk.h	2010-01-29 19:38:34.966004110 +0100
+++ qemu/hw/virtio-blk.h	2010-01-29 19:40:35.335255732 +0100
@@ -30,13 +30,9 @@
 #define VIRTIO_BLK_F_RO         5       /* Disk is read-only */
 #define VIRTIO_BLK_F_BLK_SIZE   6       /* Block size of disk is available*/
 #define VIRTIO_BLK_F_SCSI       7       /* Supports scsi command passthru */
-#define VIRTIO_BLK_F_IDENTIFY   8       /* ATA IDENTIFY supported */
+/* #define VIRTIO_BLK_F_IDENTIFY   8       ATA IDENTIFY supported, DEPRECATED */
 #define VIRTIO_BLK_F_WCACHE     9       /* write cache enabled */
 
-#define VIRTIO_BLK_ID_LEN       256     /* length of identify u16 array */
-#define VIRTIO_BLK_ID_SN        10      /* start of char * serial# */
-#define VIRTIO_BLK_ID_SN_BYTES  20      /* length in bytes of serial# */
-
 struct virtio_blk_config
 {
     uint64_t capacity;
@@ -46,7 +42,6 @@ struct virtio_blk_config
     uint8_t heads;
     uint8_t sectors;
     uint32_t _blk_size;    /* structure pad, currently unused */
-    uint16_t identify[VIRTIO_BLK_ID_LEN];
 } __attribute__((packed));
 
 /* These two define direction. */

^ permalink raw reply	[flat|nested] 13+ messages in thread

* [Qemu-devel] [PATCH 2/4] block: add block topology options
  2010-01-29 19:04 [Qemu-devel] [PATCH 1/4] virtio-blk: revert serial number support Christoph Hellwig
@ 2010-01-29 19:04 ` Christoph Hellwig
  2010-02-03 19:00   ` Anthony Liguori
  2010-01-29 19:05 ` [Qemu-devel] [PATCH 3/5] scsi: add topology support Christoph Hellwig
                   ` (2 subsequent siblings)
  3 siblings, 1 reply; 13+ messages in thread
From: Christoph Hellwig @ 2010-01-29 19:04 UTC (permalink / raw)
  To: qemu-devel; +Cc: Martin K. Petersen

Add three new suboptions for the drive option to export block topology
information to the guest.  This is needed to get optimal I/O alignment
for RAID arrays or SSDs.

The options are:

 - physical_block_size to specify the physical block size of the device,
   this is going to increase from 512 bytes to 4096 kilobytes for many
   modern storage devices
 - min_io_size to specify the minimal I/O size without performance impact,
   this is typically set to the RAID chunk size for arrays.
 - opt_io_size to specify the optimal sustained I/O size, this is
   typically the RAID stripe width for arrays.

I decided to not auto-probe these values from blkid which might easily
be possible as I don't know how to deal with these issues on migration.

Note that we specificly only set the physical_block_size, and not the
logial one which is the unit all I/O is described in.  The reason for
that is that IDE does not support increasing the logical block size and
at last for now I want to stick to one meachnisms in queue and allow
for easy switching of transports for a given backing image which would
not be possible if scsi and virtio use real 4k sectors, while ide only
uses the physical block exponent.

Signed-off-by: Christoph Hellwig <hch@lst.de>

Index: qemu/block.c
===================================================================
--- qemu.orig/block.c	2010-01-29 11:07:50.083004364 +0100
+++ qemu/block.c	2010-01-29 11:08:32.940004255 +0100
@@ -1028,6 +1028,51 @@ int bdrv_enable_write_cache(BlockDriverS
     return bs->enable_write_cache;
 }
 
+unsigned int bdrv_get_physical_block_size(BlockDriverState *bs)
+{
+    return bs->physical_block_size;
+}
+
+unsigned int bdrv_get_physical_block_exp(BlockDriverState *bs)
+{
+    unsigned int exp = 0, size;
+
+    for (size = bs->physical_block_size; size > 512; size >>= 1) {
+        exp++;
+    }
+
+    return exp;
+}
+
+void bdrv_set_physical_block_size(BlockDriverState *bs,
+        unsigned int physical_block_size)
+{
+    bs->physical_block_size = physical_block_size;
+}
+
+
+unsigned int bdrv_get_min_io_size(BlockDriverState *bs)
+{
+    return bs->min_io_size;
+}
+
+void bdrv_set_min_io_size(BlockDriverState *bs,
+        unsigned int min_io_size)
+{
+    bs->min_io_size = min_io_size;
+}
+
+unsigned int bdrv_get_opt_io_size(BlockDriverState *bs)
+{
+    return bs->opt_io_size;
+}
+
+void bdrv_set_opt_io_size(BlockDriverState *bs,
+        unsigned int opt_io_size)
+{
+    bs->opt_io_size = opt_io_size;
+}
+
 /* XXX: no longer used */
 void bdrv_set_change_cb(BlockDriverState *bs,
                         void (*change_cb)(void *opaque), void *opaque)
Index: qemu/block.h
===================================================================
--- qemu.orig/block.h	2010-01-29 11:07:50.089004011 +0100
+++ qemu/block.h	2010-01-29 11:08:32.940004255 +0100
@@ -152,6 +152,16 @@ int bdrv_is_inserted(BlockDriverState *b
 int bdrv_media_changed(BlockDriverState *bs);
 int bdrv_is_locked(BlockDriverState *bs);
 void bdrv_set_locked(BlockDriverState *bs, int locked);
+unsigned int bdrv_get_physical_block_size(BlockDriverState *bs);
+unsigned int bdrv_get_physical_block_exp(BlockDriverState *bs);
+void bdrv_set_physical_block_size(BlockDriverState *bs,
+        unsigned int physical_block_size);
+unsigned int bdrv_get_min_io_size(BlockDriverState *bs);
+void bdrv_set_min_io_size(BlockDriverState *bs,
+        unsigned int min_io_size);
+unsigned int bdrv_get_opt_io_size(BlockDriverState *bs);
+void bdrv_set_opt_io_size(BlockDriverState *bs,
+        unsigned int opt_io_size);
 int bdrv_eject(BlockDriverState *bs, int eject_flag);
 void bdrv_set_change_cb(BlockDriverState *bs,
                         void (*change_cb)(void *opaque), void *opaque);
Index: qemu/block_int.h
===================================================================
--- qemu.orig/block_int.h	2010-01-29 11:07:50.096004065 +0100
+++ qemu/block_int.h	2010-01-29 11:08:32.941003474 +0100
@@ -173,6 +173,14 @@ struct BlockDriverState {
        drivers. They are not used by the block driver */
     int cyls, heads, secs, translation;
     int type;
+
+    /*
+     * Topology information, all optional.
+     */
+    unsigned int physical_block_size;
+    unsigned int min_io_size;
+    unsigned int opt_io_size;
+
     char device_name[32];
     unsigned long *dirty_bitmap;
     BlockDriverState *next;
Index: qemu/qemu-config.c
===================================================================
--- qemu.orig/qemu-config.c	2010-01-29 11:07:50.133004032 +0100
+++ qemu/qemu-config.c	2010-01-29 11:08:32.944025367 +0100
@@ -78,6 +78,15 @@ QemuOptsList qemu_drive_opts = {
         },{
             .name = "readonly",
             .type = QEMU_OPT_BOOL,
+        },{
+            .name = "physical_block_size",
+            .type = QEMU_OPT_NUMBER,
+        },{
+            .name = "min_io_size",
+            .type = QEMU_OPT_NUMBER,
+        },{
+            .name = "opt_io_size",
+            .type = QEMU_OPT_NUMBER,
         },
         { /* end if list */ }
     },
Index: qemu/vl.c
===================================================================
--- qemu.orig/vl.c	2010-01-29 11:07:50.141004284 +0100
+++ qemu/vl.c	2010-01-29 11:08:32.947003820 +0100
@@ -1904,6 +1904,9 @@ DriveInfo *drive_init(QemuOpts *opts, vo
     int index;
     int cache;
     int aio = 0;
+    unsigned long physical_block_size = 512;
+    unsigned long min_io_size = 0;
+    unsigned long opt_io_size = 0;
     int ro = 0;
     int bdrv_flags;
     int on_read_error, on_write_error;
@@ -2053,6 +2056,32 @@ DriveInfo *drive_init(QemuOpts *opts, vo
     }
 #endif
 
+    if ((buf = qemu_opt_get(opts, "physical_block_size")) != NULL) {
+        physical_block_size = strtoul(buf, NULL, 10);
+        if (physical_block_size < 512) {
+            fprintf(stderr, "sector size must be larger than 512 bytes\n");
+            return NULL;
+        }
+    }
+
+    if ((buf = qemu_opt_get(opts, "min_io_size")) != NULL) {
+        min_io_size = strtoul(buf, NULL, 10);
+        if (!min_io_size || (min_io_size % physical_block_size)) {
+            fprintf(stderr,
+                    "min_io_size must be a multiple of the sector size\n");
+            return NULL;
+        }
+    }
+
+    if ((buf = qemu_opt_get(opts, "opt_io_size")) != NULL) {
+        opt_io_size = strtoul(buf, NULL, 10);
+        if (!opt_io_size || (opt_io_size % min_io_size)) {
+            fprintf(stderr,
+                    "opt_io_size must be a multiple of min_io_size\n");
+            return NULL;
+        }
+    }
+
     if ((buf = qemu_opt_get(opts, "format")) != NULL) {
        if (strcmp(buf, "?") == 0) {
             fprintf(stderr, "qemu: Supported formats:");
@@ -2257,6 +2286,12 @@ DriveInfo *drive_init(QemuOpts *opts, vo
         return NULL;
     }
 
+    bdrv_set_physical_block_size(dinfo->bdrv, physical_block_size);
+    if (min_io_size)
+        bdrv_set_min_io_size(dinfo->bdrv, min_io_size);
+    if (opt_io_size)
+        bdrv_set_opt_io_size(dinfo->bdrv, opt_io_size);
+
     if (bdrv_key_required(dinfo->bdrv))
         autostart = 0;
     *fatal_error = 0;
Index: qemu/qemu-options.hx
===================================================================
--- qemu.orig/qemu-options.hx	2010-01-29 11:07:50.149004395 +0100
+++ qemu/qemu-options.hx	2010-01-29 19:36:06.118256061 +0100
@@ -104,6 +104,7 @@ DEF("drive", HAS_ARG, QEMU_OPTION_drive,
     "       [,cyls=c,heads=h,secs=s[,trans=t]][,snapshot=on|off]\n"
     "       [,cache=writethrough|writeback|none][,format=f][,serial=s]\n"
     "       [,addr=A][,id=name][,aio=threads|native][,readonly=on|off]\n"
+    "       [,physical_block=size=size][,min_io_size=size][,opt_io_size=size]\n"
     "                use 'file' as a drive image\n")
 DEF("set", HAS_ARG, QEMU_OPTION_set,
     "-set group.id.arg=value\n"
@@ -149,6 +150,15 @@ an untrusted format header.
 This option specifies the serial number to assign to the device.
 @item addr=@var{addr}
 Specify the controller's PCI address (if=virtio only).
+@item physical_sector_size=@var{size}
+Report a physical block size larger than the logical block size of 512 bytes.
+@item min_io_size=@var{size}
+Reported a minimum I/O size or optimum I/O granularity.  This is the smallest
+I/O size the device can perform without a performance penalty.  For RAID
+devices this should be set to the stripe chunk size.
+@item opt_io_size=@var{size}
+Report an optimal I/O size, which is the device's preferred unit for
+sustained I/O.  This should be set to the stripe width for RAID devices.
 @end table
 
 By default, writethrough caching is used for all block device.  This means that

^ permalink raw reply	[flat|nested] 13+ messages in thread

* [Qemu-devel] [PATCH 3/5] scsi: add topology support
  2010-01-29 19:04 [Qemu-devel] [PATCH 1/4] virtio-blk: revert serial number support Christoph Hellwig
  2010-01-29 19:04 ` [Qemu-devel] [PATCH 2/4] block: add block topology options Christoph Hellwig
@ 2010-01-29 19:05 ` Christoph Hellwig
  2010-01-29 19:05 ` [Qemu-devel] [PATCH 4/5] ide: " Christoph Hellwig
  2010-01-29 19:05 ` [Qemu-devel] [PATCH 5/5] virtio-blk: " Christoph Hellwig
  3 siblings, 0 replies; 13+ messages in thread
From: Christoph Hellwig @ 2010-01-29 19:05 UTC (permalink / raw)
  To: qemu-devel; +Cc: Martin K. Petersen


Export the physical block size in the READ CAPACITY (16) command,
and add the new block limits VPD page to export the minimum and
optiomal I/O sizes.

Note that we also need to bump the scsi revision level to SPC-2
as that is the minimum requirement by at least the Linux kernel
to try READ CAPACITY (16) first and look at the block limits VPD
page.

Signed-off-by: Christoph Hellwig <hch@lst.de>

Index: qemu/hw/scsi-disk.c
===================================================================
--- qemu.orig/hw/scsi-disk.c	2010-01-29 11:07:50.121003760 +0100
+++ qemu/hw/scsi-disk.c	2010-01-29 11:08:32.944025367 +0100
@@ -346,10 +346,11 @@ static int scsi_disk_emulate_inquiry(SCS
         case 0x00: /* Supported page codes, mandatory */
             DPRINTF("Inquiry EVPD[Supported pages] "
                     "buffer size %zd\n", req->cmd.xfer);
-            outbuf[buflen++] = 3;    // number of pages
+            outbuf[buflen++] = 4;    // number of pages
             outbuf[buflen++] = 0x00; // list of supported pages (this page)
             outbuf[buflen++] = 0x80; // unit serial number
             outbuf[buflen++] = 0x83; // device identification
+            outbuf[buflen++] = 0xb0; // block device characteristics
             break;
 
         case 0x80: /* Device serial number, optional */
@@ -391,6 +392,27 @@ static int scsi_disk_emulate_inquiry(SCS
             buflen += id_len;
             break;
         }
+        case 0xb0: /* block device characteristics */
+        {
+            unsigned int min_io_size = bdrv_get_min_io_size(bdrv) >> 9;
+            unsigned int opt_io_size = bdrv_get_opt_io_size(bdrv) >> 9;
+
+            /* required VPD size with unmap support */
+            outbuf[3] = buflen = 0x3c;
+
+            memset(outbuf + 4, 0, buflen - 4);
+
+            /* optimal transfer length granularity */
+            outbuf[6] = (min_io_size >> 8) & 0xff;
+            outbuf[7] = min_io_size & 0xff;
+
+            /* optimal transfer length */
+            outbuf[12] = (opt_io_size >> 24) & 0xff;
+            outbuf[13] = (opt_io_size >> 16) & 0xff;
+            outbuf[14] = (opt_io_size >> 8) & 0xff;
+            outbuf[15] = opt_io_size & 0xff;
+            break;
+        }
         default:
             BADF("Error: unsupported Inquiry (EVPD[%02X]) "
                  "buffer size %zd\n", page_code, req->cmd.xfer);
@@ -437,7 +459,7 @@ static int scsi_disk_emulate_inquiry(SCS
     memcpy(&outbuf[32], s->version ? s->version : QEMU_VERSION, 4);
     /* Identify device as SCSI-3 rev 1.
        Some later commands are also implemented. */
-    outbuf[2] = 3;
+    outbuf[2] = 5;
     outbuf[3] = 2; /* Format 2 */
 
     if (buflen > 36) {
@@ -794,6 +816,8 @@ static int scsi_disk_emulate_command(SCS
             outbuf[9] = 0;
             outbuf[10] = s->cluster_size * 2;
             outbuf[11] = 0;
+            outbuf[12] = 0;
+            outbuf[13] = bdrv_get_physical_block_exp(bdrv);
             /* Protection, exponent and lowest lba field left blank. */
             buflen = req->cmd.xfer;
             break;

^ permalink raw reply	[flat|nested] 13+ messages in thread

* [Qemu-devel] [PATCH 4/5] ide: add topology support
  2010-01-29 19:04 [Qemu-devel] [PATCH 1/4] virtio-blk: revert serial number support Christoph Hellwig
  2010-01-29 19:04 ` [Qemu-devel] [PATCH 2/4] block: add block topology options Christoph Hellwig
  2010-01-29 19:05 ` [Qemu-devel] [PATCH 3/5] scsi: add topology support Christoph Hellwig
@ 2010-01-29 19:05 ` Christoph Hellwig
  2010-01-29 19:05 ` [Qemu-devel] [PATCH 5/5] virtio-blk: " Christoph Hellwig
  3 siblings, 0 replies; 13+ messages in thread
From: Christoph Hellwig @ 2010-01-29 19:05 UTC (permalink / raw)
  To: qemu-devel; +Cc: Martin K. Petersen

Export the physical block size in the ATA IDENTIFY command.  The
other topology values are not supported in ATA so skip them.

Add a new field to the savevm information which is initialized to
zero if migrating from an older qemu version.

Signed-off-by: Christoph Hellwig <hch@lst.de>

Index: qemu/hw/ide/core.c
===================================================================
--- qemu.orig/hw/ide/core.c	2010-01-29 11:07:50.103003910 +0100
+++ qemu/hw/ide/core.c	2010-01-29 11:08:32.943004637 +0100
@@ -164,6 +164,8 @@ static void ide_identify(IDEState *s)
     put_le16(p + 101, s->nb_sectors >> 16);
     put_le16(p + 102, s->nb_sectors >> 32);
     put_le16(p + 103, s->nb_sectors >> 48);
+    if (s->physical_block_exp)
+        put_le16(p + 106, 0x6000 | s->physical_block_exp);
 
     memcpy(s->identify_data, p, sizeof(s->identify_data));
     s->identify_set = 1;
@@ -2615,6 +2617,7 @@ void ide_init_drive(IDEState *s, DriveIn
         }
         strncpy(s->drive_serial_str, drive_get_serial(s->bs),
                 sizeof(s->drive_serial_str));
+        s->physical_block_exp = bdrv_get_physical_block_exp(s->bs);
     }
     if (strlen(s->drive_serial_str) == 0)
         snprintf(s->drive_serial_str, sizeof(s->drive_serial_str),
@@ -2684,12 +2687,17 @@ static int ide_drive_post_load(void *opa
             s->cdrom_changed = 1;
         }
     }
+
+    if (version_id < 4) {
+        s->physical_block_exp = 0;
+    }
+
     return 0;
 }
 
 const VMStateDescription vmstate_ide_drive = {
     .name = "ide_drive",
-    .version_id = 3,
+    .version_id = 4,
     .minimum_version_id = 0,
     .minimum_version_id_old = 0,
     .post_load = ide_drive_post_load,
@@ -2714,6 +2722,7 @@ const VMStateDescription vmstate_ide_dri
         VMSTATE_UINT8(sense_key, IDEState),
         VMSTATE_UINT8(asc, IDEState),
         VMSTATE_UINT8_V(cdrom_changed, IDEState, 3),
+        VMSTATE_UINT8_V(physical_block_exp, IDEState, 4),
         /* XXX: if a transfer is pending, we do not save it yet */
         VMSTATE_END_OF_LIST()
     }
Index: qemu/hw/ide/internal.h
===================================================================
--- qemu.orig/hw/ide/internal.h	2010-01-29 11:07:50.116004031 +0100
+++ qemu/hw/ide/internal.h	2010-01-29 11:08:32.948011699 +0100
@@ -433,6 +433,8 @@ struct IDEState {
     int smart_errors;
     uint8_t smart_selftest_count;
     uint8_t *smart_selftest_data;
+    /* topology information */
+    uint8_t physical_block_exp;
 };
 
 struct IDEBus {

^ permalink raw reply	[flat|nested] 13+ messages in thread

* [Qemu-devel] [PATCH 5/5] virtio-blk: add topology support
  2010-01-29 19:04 [Qemu-devel] [PATCH 1/4] virtio-blk: revert serial number support Christoph Hellwig
                   ` (2 preceding siblings ...)
  2010-01-29 19:05 ` [Qemu-devel] [PATCH 4/5] ide: " Christoph Hellwig
@ 2010-01-29 19:05 ` Christoph Hellwig
  2010-02-01  9:09   ` [Qemu-devel] [PATCH v2 " Christoph Hellwig
  3 siblings, 1 reply; 13+ messages in thread
From: Christoph Hellwig @ 2010-01-29 19:05 UTC (permalink / raw)
  To: qemu-devel; +Cc: Martin K. Petersen

Export all topology information in the block config structure,
guarded by a new VIRTIO_BLK_F_TOPOLOGY config flag.

Note that there is no savevm support for the new information yet,
as it would be a pain without the table driven savevm format.
Juan has promised to send out a new version of the table driven
vmstate patches for virto next week.

Signed-off-by: Christoph Hellwig <hch@lst.de>

Index: qemu/hw/virtio-blk.c
===================================================================
--- qemu.orig/hw/virtio-blk.c	2010-01-29 19:42:11.853022001 +0100
+++ qemu/hw/virtio-blk.c	2010-01-29 19:45:14.735253918 +0100
@@ -380,6 +380,10 @@ static void virtio_blk_update_config(Vir
     blkcfg.heads = heads;
     blkcfg.sectors = secs;
     blkcfg.size_max = 0;
+    blkcfg.physical_block_exp = bdrv_get_physical_block_exp(s->bs);
+    blkcfg.alignment_offset = 0;
+    blkcfg.min_io_size = bdrv_get_min_io_size(s->bs);
+    blkcfg.opt_io_size = bdrv_get_opt_io_size(s->bs);
     memcpy(config, &blkcfg, sizeof(struct virtio_blk_config));
 }
 
@@ -389,6 +393,7 @@ static uint32_t virtio_blk_get_features(
 
     features |= (1 << VIRTIO_BLK_F_SEG_MAX);
     features |= (1 << VIRTIO_BLK_F_GEOMETRY);
+    features |= (1 << VIRTIO_BLK_F_TOPOLOGY);
 
     if (bdrv_enable_write_cache(s->bs))
         features |= (1 << VIRTIO_BLK_F_WCACHE);
Index: qemu/hw/virtio-blk.h
===================================================================
--- qemu.orig/hw/virtio-blk.h	2010-01-29 19:45:17.996022961 +0100
+++ qemu/hw/virtio-blk.h	2010-01-29 19:45:39.253318328 +0100
@@ -32,6 +32,7 @@
 #define VIRTIO_BLK_F_SCSI       7       /* Supports scsi command passthru */
 /* #define VIRTIO_BLK_F_IDENTIFY   8       ATA IDENTIFY supported, DEPRECATED */
 #define VIRTIO_BLK_F_WCACHE     9       /* write cache enabled */
+#define VIRTIO_BLK_F_TOPOLOGY   10      /* Topology information is available */
 
 struct virtio_blk_config
 {
@@ -42,6 +43,10 @@ struct virtio_blk_config
     uint8_t heads;
     uint8_t sectors;
     uint32_t _blk_size;    /* structure pad, currently unused */
+    uint8_t physical_block_exp;
+    uint8_t alignment_offset;
+    uint16_t min_io_size;
+    uint32_t opt_io_size;
 } __attribute__((packed));
 
 /* These two define direction. */

^ permalink raw reply	[flat|nested] 13+ messages in thread

* [Qemu-devel] [PATCH v2 5/5] virtio-blk: add topology support
  2010-01-29 19:05 ` [Qemu-devel] [PATCH 5/5] virtio-blk: " Christoph Hellwig
@ 2010-02-01  9:09   ` Christoph Hellwig
  0 siblings, 0 replies; 13+ messages in thread
From: Christoph Hellwig @ 2010-02-01  9:09 UTC (permalink / raw)
  To: qemu-devel; +Cc: Martin K. Petersen

Export all topology information in the block config structure,
guarded by a new VIRTIO_BLK_F_TOPOLOGY config flag.

Note that there is no savevm support for the new information yet,
as it would be a pain without the table driven savevm format.
Juan has promised to send out a new version of the table driven
vmstate patches for virto next week.

Version 2 updates: store min_io_size and opt_io_size in terms of logical
blocks.

Signed-off-by: Christoph Hellwig <hch@lst.de>

Index: qemu/hw/virtio-blk.c
===================================================================
--- qemu.orig/hw/virtio-blk.c	2010-01-30 19:06:20.485254411 +0100
+++ qemu/hw/virtio-blk.c	2010-01-30 19:08:00.300254199 +0100
@@ -380,6 +380,10 @@ static void virtio_blk_update_config(Vir
     blkcfg.heads = heads;
     blkcfg.sectors = secs;
     blkcfg.size_max = 0;
+    blkcfg.physical_block_exp = bdrv_get_physical_block_exp(s->bs);
+    blkcfg.alignment_offset = 0;
+    blkcfg.min_io_size = bdrv_get_min_io_size(s->bs) / 512;
+    blkcfg.opt_io_size = bdrv_get_opt_io_size(s->bs) / 512;
     memcpy(config, &blkcfg, sizeof(struct virtio_blk_config));
 }
 
@@ -389,6 +393,7 @@ static uint32_t virtio_blk_get_features(
 
     features |= (1 << VIRTIO_BLK_F_SEG_MAX);
     features |= (1 << VIRTIO_BLK_F_GEOMETRY);
+    features |= (1 << VIRTIO_BLK_F_TOPOLOGY);
 
     if (bdrv_enable_write_cache(s->bs))
         features |= (1 << VIRTIO_BLK_F_WCACHE);
Index: qemu/hw/virtio-blk.h
===================================================================
--- qemu.orig/hw/virtio-blk.h	2010-01-30 19:06:20.496011220 +0100
+++ qemu/hw/virtio-blk.h	2010-01-30 19:07:40.142003536 +0100
@@ -32,6 +32,7 @@
 #define VIRTIO_BLK_F_SCSI       7       /* Supports scsi command passthru */
 /* #define VIRTIO_BLK_F_IDENTIFY   8       ATA IDENTIFY supported, DEPRECATED */
 #define VIRTIO_BLK_F_WCACHE     9       /* write cache enabled */
+#define VIRTIO_BLK_F_TOPOLOGY   10      /* Topology information is available */
 
 struct virtio_blk_config
 {
@@ -42,6 +43,10 @@ struct virtio_blk_config
     uint8_t heads;
     uint8_t sectors;
     uint32_t _blk_size;    /* structure pad, currently unused */
+    uint8_t physical_block_exp;
+    uint8_t alignment_offset;
+    uint16_t min_io_size;
+    uint32_t opt_io_size;
 } __attribute__((packed));
 
 /* These two define direction. */

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [Qemu-devel] [PATCH 2/4] block: add block topology options
  2010-01-29 19:04 ` [Qemu-devel] [PATCH 2/4] block: add block topology options Christoph Hellwig
@ 2010-02-03 19:00   ` Anthony Liguori
  2010-02-05 13:09     ` Christoph Hellwig
  0 siblings, 1 reply; 13+ messages in thread
From: Anthony Liguori @ 2010-02-03 19:00 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: qemu-devel, Martin K. Petersen

On 01/29/2010 01:04 PM, Christoph Hellwig wrote:
> Add three new suboptions for the drive option to export block topology
> information to the guest.  This is needed to get optimal I/O alignment
> for RAID arrays or SSDs.
>
> The options are:
>
>   - physical_block_size to specify the physical block size of the device,
>     this is going to increase from 512 bytes to 4096 kilobytes for many
>     modern storage devices
>   - min_io_size to specify the minimal I/O size without performance impact,
>     this is typically set to the RAID chunk size for arrays.
>   - opt_io_size to specify the optimal sustained I/O size, this is
>     typically the RAID stripe width for arrays.
>    

This makes sense.

> I decided to not auto-probe these values from blkid which might easily
> be possible as I don't know how to deal with these issues on migration.
>
> Note that we specificly only set the physical_block_size, and not the
> logial one which is the unit all I/O is described in.  The reason for
> that is that IDE does not support increasing the logical block size and
> at last for now I want to stick to one meachnisms in queue and allow
> for easy switching of transports for a given backing image which would
> not be possible if scsi and virtio use real 4k sectors, while ide only
> uses the physical block exponent.
>
> Signed-off-by: Christoph Hellwig<hch@lst.de>
>
> Index: qemu/block.c
> ===================================================================
> --- qemu.orig/block.c	2010-01-29 11:07:50.083004364 +0100
> +++ qemu/block.c	2010-01-29 11:08:32.940004255 +0100
> @@ -1028,6 +1028,51 @@ int bdrv_enable_write_cache(BlockDriverS
>       return bs->enable_write_cache;
>   }
>
> +unsigned int bdrv_get_physical_block_size(BlockDriverState *bs)
> +{
> +    return bs->physical_block_size;
> +}
> +
> +unsigned int bdrv_get_physical_block_exp(BlockDriverState *bs)
> +{
> +    unsigned int exp = 0, size;
> +
> +    for (size = bs->physical_block_size; size>  512; size>>= 1) {
> +        exp++;
> +    }
> +
> +    return exp;
> +}
> +
> +void bdrv_set_physical_block_size(BlockDriverState *bs,
> +        unsigned int physical_block_size)
> +{
> +    bs->physical_block_size = physical_block_size;
> +}
> +
>    

But I don't think this is the wrong place to do it.  The 
BlockDriverState reflects that backing device, not the emulated device 
itself.  In this case, you're trying to set a property of the emulated 
device.

I think these need to be qdev properties of the respective devices.  
 From a UI perspective, you can still expose -drive options for the end 
user to consume, but this data should be associated with the devices 
themselves.

Regards,

Anthony Liguori

> +unsigned int bdrv_get_min_io_size(BlockDriverState *bs)
> +{
> +    return bs->min_io_size;
> +}
> +
> +void bdrv_set_min_io_size(BlockDriverState *bs,
> +        unsigned int min_io_size)
> +{
> +    bs->min_io_size = min_io_size;
> +}
> +
> +unsigned int bdrv_get_opt_io_size(BlockDriverState *bs)
> +{
> +    return bs->opt_io_size;
> +}
> +
> +void bdrv_set_opt_io_size(BlockDriverState *bs,
> +        unsigned int opt_io_size)
> +{
> +    bs->opt_io_size = opt_io_size;
> +}
> +
>   /* XXX: no longer used */
>   void bdrv_set_change_cb(BlockDriverState *bs,
>                           void (*change_cb)(void *opaque), void *opaque)
> Index: qemu/block.h
> ===================================================================
> --- qemu.orig/block.h	2010-01-29 11:07:50.089004011 +0100
> +++ qemu/block.h	2010-01-29 11:08:32.940004255 +0100
> @@ -152,6 +152,16 @@ int bdrv_is_inserted(BlockDriverState *b
>   int bdrv_media_changed(BlockDriverState *bs);
>   int bdrv_is_locked(BlockDriverState *bs);
>   void bdrv_set_locked(BlockDriverState *bs, int locked);
> +unsigned int bdrv_get_physical_block_size(BlockDriverState *bs);
> +unsigned int bdrv_get_physical_block_exp(BlockDriverState *bs);
> +void bdrv_set_physical_block_size(BlockDriverState *bs,
> +        unsigned int physical_block_size);
> +unsigned int bdrv_get_min_io_size(BlockDriverState *bs);
> +void bdrv_set_min_io_size(BlockDriverState *bs,
> +        unsigned int min_io_size);
> +unsigned int bdrv_get_opt_io_size(BlockDriverState *bs);
> +void bdrv_set_opt_io_size(BlockDriverState *bs,
> +        unsigned int opt_io_size);
>   int bdrv_eject(BlockDriverState *bs, int eject_flag);
>   void bdrv_set_change_cb(BlockDriverState *bs,
>                           void (*change_cb)(void *opaque), void *opaque);
> Index: qemu/block_int.h
> ===================================================================
> --- qemu.orig/block_int.h	2010-01-29 11:07:50.096004065 +0100
> +++ qemu/block_int.h	2010-01-29 11:08:32.941003474 +0100
> @@ -173,6 +173,14 @@ struct BlockDriverState {
>          drivers. They are not used by the block driver */
>       int cyls, heads, secs, translation;
>       int type;
> +
> +    /*
> +     * Topology information, all optional.
> +     */
> +    unsigned int physical_block_size;
> +    unsigned int min_io_size;
> +    unsigned int opt_io_size;
> +
>       char device_name[32];
>       unsigned long *dirty_bitmap;
>       BlockDriverState *next;
> Index: qemu/qemu-config.c
> ===================================================================
> --- qemu.orig/qemu-config.c	2010-01-29 11:07:50.133004032 +0100
> +++ qemu/qemu-config.c	2010-01-29 11:08:32.944025367 +0100
> @@ -78,6 +78,15 @@ QemuOptsList qemu_drive_opts = {
>           },{
>               .name = "readonly",
>               .type = QEMU_OPT_BOOL,
> +        },{
> +            .name = "physical_block_size",
> +            .type = QEMU_OPT_NUMBER,
> +        },{
> +            .name = "min_io_size",
> +            .type = QEMU_OPT_NUMBER,
> +        },{
> +            .name = "opt_io_size",
> +            .type = QEMU_OPT_NUMBER,
>           },
>           { /* end if list */ }
>       },
> Index: qemu/vl.c
> ===================================================================
> --- qemu.orig/vl.c	2010-01-29 11:07:50.141004284 +0100
> +++ qemu/vl.c	2010-01-29 11:08:32.947003820 +0100
> @@ -1904,6 +1904,9 @@ DriveInfo *drive_init(QemuOpts *opts, vo
>       int index;
>       int cache;
>       int aio = 0;
> +    unsigned long physical_block_size = 512;
> +    unsigned long min_io_size = 0;
> +    unsigned long opt_io_size = 0;
>       int ro = 0;
>       int bdrv_flags;
>       int on_read_error, on_write_error;
> @@ -2053,6 +2056,32 @@ DriveInfo *drive_init(QemuOpts *opts, vo
>       }
>   #endif
>
> +    if ((buf = qemu_opt_get(opts, "physical_block_size")) != NULL) {
> +        physical_block_size = strtoul(buf, NULL, 10);
> +        if (physical_block_size<  512) {
> +            fprintf(stderr, "sector size must be larger than 512 bytes\n");
> +            return NULL;
> +        }
> +    }
> +
> +    if ((buf = qemu_opt_get(opts, "min_io_size")) != NULL) {
> +        min_io_size = strtoul(buf, NULL, 10);
> +        if (!min_io_size || (min_io_size % physical_block_size)) {
> +            fprintf(stderr,
> +                    "min_io_size must be a multiple of the sector size\n");
> +            return NULL;
> +        }
> +    }
> +
> +    if ((buf = qemu_opt_get(opts, "opt_io_size")) != NULL) {
> +        opt_io_size = strtoul(buf, NULL, 10);
> +        if (!opt_io_size || (opt_io_size % min_io_size)) {
> +            fprintf(stderr,
> +                    "opt_io_size must be a multiple of min_io_size\n");
> +            return NULL;
> +        }
> +    }
> +
>       if ((buf = qemu_opt_get(opts, "format")) != NULL) {
>          if (strcmp(buf, "?") == 0) {
>               fprintf(stderr, "qemu: Supported formats:");
> @@ -2257,6 +2286,12 @@ DriveInfo *drive_init(QemuOpts *opts, vo
>           return NULL;
>       }
>
> +    bdrv_set_physical_block_size(dinfo->bdrv, physical_block_size);
> +    if (min_io_size)
> +        bdrv_set_min_io_size(dinfo->bdrv, min_io_size);
> +    if (opt_io_size)
> +        bdrv_set_opt_io_size(dinfo->bdrv, opt_io_size);
> +
>       if (bdrv_key_required(dinfo->bdrv))
>           autostart = 0;
>       *fatal_error = 0;
> Index: qemu/qemu-options.hx
> ===================================================================
> --- qemu.orig/qemu-options.hx	2010-01-29 11:07:50.149004395 +0100
> +++ qemu/qemu-options.hx	2010-01-29 19:36:06.118256061 +0100
> @@ -104,6 +104,7 @@ DEF("drive", HAS_ARG, QEMU_OPTION_drive,
>       "       [,cyls=c,heads=h,secs=s[,trans=t]][,snapshot=on|off]\n"
>       "       [,cache=writethrough|writeback|none][,format=f][,serial=s]\n"
>       "       [,addr=A][,id=name][,aio=threads|native][,readonly=on|off]\n"
> +    "       [,physical_block=size=size][,min_io_size=size][,opt_io_size=size]\n"
>       "                use 'file' as a drive image\n")
>   DEF("set", HAS_ARG, QEMU_OPTION_set,
>       "-set group.id.arg=value\n"
> @@ -149,6 +150,15 @@ an untrusted format header.
>   This option specifies the serial number to assign to the device.
>   @item addr=@var{addr}
>   Specify the controller's PCI address (if=virtio only).
> +@item physical_sector_size=@var{size}
> +Report a physical block size larger than the logical block size of 512 bytes.
> +@item min_io_size=@var{size}
> +Reported a minimum I/O size or optimum I/O granularity.  This is the smallest
> +I/O size the device can perform without a performance penalty.  For RAID
> +devices this should be set to the stripe chunk size.
> +@item opt_io_size=@var{size}
> +Report an optimal I/O size, which is the device's preferred unit for
> +sustained I/O.  This should be set to the stripe width for RAID devices.
>   @end table
>
>   By default, writethrough caching is used for all block device.  This means that
>
>
>
>    

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [Qemu-devel] [PATCH 2/4] block: add block topology options
  2010-02-03 19:00   ` Anthony Liguori
@ 2010-02-05 13:09     ` Christoph Hellwig
  2010-02-05 16:16       ` Jamie Lokier
  2010-02-05 17:33       ` Anthony Liguori
  0 siblings, 2 replies; 13+ messages in thread
From: Christoph Hellwig @ 2010-02-05 13:09 UTC (permalink / raw)
  To: Anthony Liguori; +Cc: Christoph Hellwig, Martin K. Petersen, qemu-devel

On Wed, Feb 03, 2010 at 01:00:47PM -0600, Anthony Liguori wrote:
> But I don't think this is the wrong place to do it.  The 
> BlockDriverState reflects that backing device, not the emulated device 
> itself.  In this case, you're trying to set a property of the emulated 
> device.

I think that's very borderline.  While the emulated device exposes these
properties, they are in fact a property of the backing storage, the
right sector and min/max I/O sizes are determined by the backing storage
device.

> I think these need to be qdev properties of the respective devices.  
> From a UI perspective, you can still expose -drive options for the end 
> user to consume, but this data should be associated with the devices 
> themselves.

In addition to not really beeing more logical this would be a lot more
effort.  We'd need to add properties to all the device, which means
including dealing with the n+1 ide variants, the virtio-pci proxy, etc.

If you believe it really needs to be in the qdev properties I'll
implement it, but I suspect the current version is a better idea.

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [Qemu-devel] [PATCH 2/4] block: add block topology options
  2010-02-05 13:09     ` Christoph Hellwig
@ 2010-02-05 16:16       ` Jamie Lokier
  2010-02-05 16:22         ` Christoph Hellwig
  2010-02-05 17:33       ` Anthony Liguori
  1 sibling, 1 reply; 13+ messages in thread
From: Jamie Lokier @ 2010-02-05 16:16 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: Martin K. Petersen, qemu-devel

Christoph Hellwig wrote:
> On Wed, Feb 03, 2010 at 01:00:47PM -0600, Anthony Liguori wrote:
> > But I don't think this is the wrong place to do it.  The 
> > BlockDriverState reflects that backing device, not the emulated device 
> > itself.  In this case, you're trying to set a property of the emulated 
> > device.
> 
> I think that's very borderline.  While the emulated device exposes these
> properties, they are in fact a property of the backing storage, the
> right sector and min/max I/O sizes are determined by the backing storage
> device.
> 
> > I think these need to be qdev properties of the respective devices.  
> > From a UI perspective, you can still expose -drive options for the end 
> > user to consume, but this data should be associated with the devices 
> > themselves.
> 
> In addition to not really beeing more logical this would be a lot more
> effort.  We'd need to add properties to all the device, which means
> including dealing with the n+1 ide variants, the virtio-pci proxy, etc.
> 
> If you believe it really needs to be in the qdev properties I'll
> implement it, but I suspect the current version is a better idea.

If you move your VM to a new system with different backing devices,
sometimes you want to be sure there is no guest-visible change.  Or
even if you just replace a drive - you might prefer confidence that
the guest sees no change.

Even if you just convert between qcow2 and a raw block device, or the
other way, you'll sometimes want to be sure it's not guest-visible.

-- Jamie

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [Qemu-devel] [PATCH 2/4] block: add block topology options
  2010-02-05 16:16       ` Jamie Lokier
@ 2010-02-05 16:22         ` Christoph Hellwig
  2010-02-05 17:16           ` Jamie Lokier
  0 siblings, 1 reply; 13+ messages in thread
From: Christoph Hellwig @ 2010-02-05 16:22 UTC (permalink / raw)
  To: Jamie Lokier; +Cc: qemu-devel, Christoph Hellwig, Martin K. Petersen

On Fri, Feb 05, 2010 at 04:16:20PM +0000, Jamie Lokier wrote:
> If you move your VM to a new system with different backing devices,
> sometimes you want to be sure there is no guest-visible change.  Or
> even if you just replace a drive - you might prefer confidence that
> the guest sees no change.

Yes, that's why we do not auto probe it but require it to be set
manually.  Note that not the physical block size attribute can
we a data integrity issue, though.  A storage device guarnatees
that it can write a sector atomically, so moving from a 4k to a 512 byte
physcical sector device could lead to not beeing able to atomically
write a 4k piece of data that the guest expects to write atomcially.

I'm not sure how failure safe the read-modify-write algorithms on
4k sector disks with logical 512 bye blocks are, but I'd expect issues
there, too.

> Even if you just convert between qcow2 and a raw block device, or the
> other way, you'll sometimes want to be sure it's not guest-visible.

The image format has no hooks into these options currently.

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [Qemu-devel] [PATCH 2/4] block: add block topology options
  2010-02-05 16:22         ` Christoph Hellwig
@ 2010-02-05 17:16           ` Jamie Lokier
  0 siblings, 0 replies; 13+ messages in thread
From: Jamie Lokier @ 2010-02-05 17:16 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: Martin K. Petersen, qemu-devel

Christoph Hellwig wrote:
> Note that not the physical block size attribute can
> we a data integrity issue, though.

I agree.

> A storage device guarnatees that it can write a sector atomically,

I've looked everywhere for confirmation of *atomicity*, and so far all
I've seen are rumours.  Some people believe sector writes are atomic
during power failure, some people believe they are not.  Those who
believe it is, don't have reliable references, and I haven't seen it
in any standard.

SQLite has a flag which can be set for a backing store to say block
writes are atomic, to enable it to optimise some things; the flag is
not set when writing to a disk block device, because they didn't find
confirmation of it.

> so moving from a 4k to a 512 byte physcical sector device could lead
> to not beeing able to atomically write a 4k piece of data that the
> guest expects to write atomcially.

If there is no confirmation that sector writes are atomic, then no
database or filesystem should be relying on that property anyway.

> I'm not sure how failure safe the read-modify-write algorithms on
> 4k sector disks with logical 512 bye blocks are, but I'd expect issues
> there, too.

I think you might be referring to what I'm calling "radius of
destruction", because I don't know if there's a well known term for it.

By that I mean if you write 512 bytes, and it's implemented as RMW to
a 4k sector, then on power failure any part of the 4k sector could be
corrupted.

On some RAIDs the size is much larger; also on many flash devices.
(RAIDs make it clearer that the alignment is relevant too.)

Note that if 4k sector writes were _atomic_, then read-modify-write of
512 bytes would be completely reliable.

> > Even if you just convert between qcow2 and a raw block device, or the
> > other way, you'll sometimes want to be sure it's not guest-visible.
> 
> The image format has no hooks into these options currently.

No, but whatever is reported to the guest, you may device you want it
to continue being reported to the guest after doing the convert
operation.  Even if it's a data integrity concern.  In fact
*precisely* when the guest has algorithms which write differently
depending on the sector size, for integrity, that means changing the
guest-visible sector size may trigger bugs and other changes that you
sometimes don't want.

I agree that it should report the size by default, though, because the
integrity concern.

-- Jamie

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [Qemu-devel] [PATCH 2/4] block: add block topology options
  2010-02-05 13:09     ` Christoph Hellwig
  2010-02-05 16:16       ` Jamie Lokier
@ 2010-02-05 17:33       ` Anthony Liguori
  2010-02-09 15:26         ` Markus Armbruster
  1 sibling, 1 reply; 13+ messages in thread
From: Anthony Liguori @ 2010-02-05 17:33 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: qemu-devel, Martin K. Petersen

On 02/05/2010 07:09 AM, Christoph Hellwig wrote:
> On Wed, Feb 03, 2010 at 01:00:47PM -0600, Anthony Liguori wrote:
>    
>> But I don't think this is the wrong place to do it.  The
>> BlockDriverState reflects that backing device, not the emulated device
>> itself.  In this case, you're trying to set a property of the emulated
>> device.
>>      
> I think that's very borderline.  While the emulated device exposes these
> properties, they are in fact a property of the backing storage, the
> right sector and min/max I/O sizes are determined by the backing storage
> device.
>    

It's guest visible state that should be configured based on the 
properties of the backing store.

However, as you said, live migration complicates things.

drive is not an optimal interface because it mixes host and guest 
state.  But the subset of -drive that you normally use should be 
independent of the guest.  For instance, with:

-drive file=/home/anthony/foo.img,cache=off,aio=native,id=foo -device 
virtio-blk-pci,drive=foo

file, cache, and aio are all host properties.  They have no visible 
impact to the guest and if I change those properties during live 
migration, they take affect without the guest noticing

These topology options still can be specified via -drive, but the proper 
way of splitting things would have them as part of the -device option.  
During live migration, things on the -device side inherited as part of 
the live migration process.

>> I think these need to be qdev properties of the respective devices.
>>  From a UI perspective, you can still expose -drive options for the end
>> user to consume, but this data should be associated with the devices
>> themselves.
>>      
> In addition to not really beeing more logical this would be a lot more
> effort.  We'd need to add properties to all the device, which means
> including dealing with the n+1 ide variants, the virtio-pci proxy, etc.
>    

The way we deal with this with network devices is we have a common 
DEFINE_NIC_PROPERTIES that lets us consistently add properties for all 
network devices.  We need exactly this sort of thing for disk drives for 
the reason you describe.

> If you believe it really needs to be in the qdev properties I'll
> implement it, but I suspect the current version is a better idea.
>    

It definitely a correctness issue.  Once we can describe the full PC via 
qdev, we want to be able to migrate that description as part of the 
migration traffic.  For that to work, we need to have all of the device 
characteristics expressed as qdev properties.

Regards,

Anthony Liguori

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [Qemu-devel] [PATCH 2/4] block: add block topology options
  2010-02-05 17:33       ` Anthony Liguori
@ 2010-02-09 15:26         ` Markus Armbruster
  0 siblings, 0 replies; 13+ messages in thread
From: Markus Armbruster @ 2010-02-09 15:26 UTC (permalink / raw)
  To: Anthony Liguori; +Cc: Christoph Hellwig, Martin K. Petersen, qemu-devel

Anthony Liguori <anthony@codemonkey.ws> writes:

> On 02/05/2010 07:09 AM, Christoph Hellwig wrote:
>> On Wed, Feb 03, 2010 at 01:00:47PM -0600, Anthony Liguori wrote:
>>    
>>> But I don't think this is the wrong place to do it.  The
>>> BlockDriverState reflects that backing device, not the emulated device
>>> itself.  In this case, you're trying to set a property of the emulated
>>> device.
>>>      
>> I think that's very borderline.  While the emulated device exposes these
>> properties, they are in fact a property of the backing storage, the
>> right sector and min/max I/O sizes are determined by the backing storage
>> device.
>>    
>
> It's guest visible state that should be configured based on the
> properties of the backing store.
>
> However, as you said, live migration complicates things.
>
> drive is not an optimal interface because it mixes host and guest
> state.

Qdev separates guest and host stuff.  -device does guest, -netdev,
-chardev, -drive if=none do host.  Unfortunately, -drive can also do
host, with other values for parameter if.  A clean, host-only -blockdev
for use with -device would be nice to have.

>         But the subset of -drive that you normally use should be
> independent of the guest.

To be precise: those properties that make sense with if=none.

>                            For instance, with:
>
> -drive file=/home/anthony/foo.img,cache=off,aio=native,id=foo -device
> virtio-blk-pci,drive=foo

There's an implied if=ide, which means it's mixed host and guest stuff.

> file, cache, and aio are all host properties.  They have no visible
> impact to the guest and if I change those properties during live
> migration, they take affect without the guest noticing
>
> These topology options still can be specified via -drive, but the
> proper way of splitting things would have them as part of the -device
> option.  During live migration, things on the -device side inherited
> as part of the live migration process.

If it's guest-visible, it should live in guest state (qdev device
state), and be configured with -device.  Else, it should live in host
state (block driver state), and be configured via -drive if=none (or
-blockdev once we have it).

[...]

^ permalink raw reply	[flat|nested] 13+ messages in thread

end of thread, other threads:[~2010-02-09 15:26 UTC | newest]

Thread overview: 13+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2010-01-29 19:04 [Qemu-devel] [PATCH 1/4] virtio-blk: revert serial number support Christoph Hellwig
2010-01-29 19:04 ` [Qemu-devel] [PATCH 2/4] block: add block topology options Christoph Hellwig
2010-02-03 19:00   ` Anthony Liguori
2010-02-05 13:09     ` Christoph Hellwig
2010-02-05 16:16       ` Jamie Lokier
2010-02-05 16:22         ` Christoph Hellwig
2010-02-05 17:16           ` Jamie Lokier
2010-02-05 17:33       ` Anthony Liguori
2010-02-09 15:26         ` Markus Armbruster
2010-01-29 19:05 ` [Qemu-devel] [PATCH 3/5] scsi: add topology support Christoph Hellwig
2010-01-29 19:05 ` [Qemu-devel] [PATCH 4/5] ide: " Christoph Hellwig
2010-01-29 19:05 ` [Qemu-devel] [PATCH 5/5] virtio-blk: " Christoph Hellwig
2010-02-01  9:09   ` [Qemu-devel] [PATCH v2 " Christoph Hellwig

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.