All of lore.kernel.org
 help / color / mirror / Atom feed
* [Qemu-devel] [PATCH v4] sheepdog: selectable object size support
@ 2015-01-27  8:35 Teruaki Ishizaki
  2015-02-02  6:52 ` Liu Yuan
                   ` (2 more replies)
  0 siblings, 3 replies; 28+ messages in thread
From: Teruaki Ishizaki @ 2015-01-27  8:35 UTC (permalink / raw)
  To: qemu-devel, sheepdog, kwolf, stefanha, mitake.hitoshi, namei.unix
  Cc: Teruaki Ishizaki

Previously, qemu block driver of sheepdog used hard-coded VDI object size.
This patch enables users to handle "block_size_shift" value for
calculating VDI object size.

When you start qemu, you don't need to specify additional command option.

But when you create the VDI which doesn't have default object size
with qemu-img command, you specify block_size_shift option.

If you want to create a VDI of 8MB(1 << 23) object size,
you need to specify following command option.

 # qemu-img create -o block_size_shift=23 sheepdog:test1 100M

In addition, when you don't specify qemu-img command option,
a default value of sheepdog cluster is used for creating VDI.

 # qemu-img create sheepdog:test2 100M

Signed-off-by: Teruaki Ishizaki <ishizaki.teruaki@lab.ntt.co.jp>
---
V4:
 - Limit a read/write buffer size for creating a preallocated VDI.
 - Replace a parse function for the block_size_shift option.
 - Fix an error message.

V3:
 - Delete the needless operation of buffer.
 - Delete the needless operations of request header.
   for SD_OP_GET_CLUSTER_DEFAULT.
 - Fix coding style problems.

V2:
 - Fix coding style problem (white space).
 - Add members, store_policy and block_size_shift to struct SheepdogVdiReq.
 - Initialize request header to use block_size_shift specified by user.
---
 block/sheepdog.c          |  138 ++++++++++++++++++++++++++++++++++++++-------
 include/block/block_int.h |    1 +
 2 files changed, 119 insertions(+), 20 deletions(-)

diff --git a/block/sheepdog.c b/block/sheepdog.c
index be3176f..a43b947 100644
--- a/block/sheepdog.c
+++ b/block/sheepdog.c
@@ -37,6 +37,7 @@
 #define SD_OP_READ_VDIS      0x15
 #define SD_OP_FLUSH_VDI      0x16
 #define SD_OP_DEL_VDI        0x17
+#define SD_OP_GET_CLUSTER_DEFAULT   0x18
 
 #define SD_FLAG_CMD_WRITE    0x01
 #define SD_FLAG_CMD_COW      0x02
@@ -167,7 +168,8 @@ typedef struct SheepdogVdiReq {
     uint32_t base_vdi_id;
     uint8_t copies;
     uint8_t copy_policy;
-    uint8_t reserved[2];
+    uint8_t store_policy;
+    uint8_t block_size_shift;
     uint32_t snapid;
     uint32_t type;
     uint32_t pad[2];
@@ -186,6 +188,21 @@ typedef struct SheepdogVdiRsp {
     uint32_t pad[5];
 } SheepdogVdiRsp;
 
+typedef struct SheepdogClusterRsp {
+    uint8_t proto_ver;
+    uint8_t opcode;
+    uint16_t flags;
+    uint32_t epoch;
+    uint32_t id;
+    uint32_t data_length;
+    uint32_t result;
+    uint8_t nr_copies;
+    uint8_t copy_policy;
+    uint8_t block_size_shift;
+    uint8_t __pad1;
+    uint32_t __pad2[6];
+} SheepdogClusterRsp;
+
 typedef struct SheepdogInode {
     char name[SD_MAX_VDI_LEN];
     char tag[SD_MAX_VDI_TAG_LEN];
@@ -1544,6 +1561,7 @@ static int do_sd_create(BDRVSheepdogState *s, uint32_t *vdi_id, int snapshot,
     hdr.vdi_size = s->inode.vdi_size;
     hdr.copy_policy = s->inode.copy_policy;
     hdr.copies = s->inode.nr_copies;
+    hdr.block_size_shift = s->inode.block_size_shift;
 
     ret = do_req(fd, s->aio_context, (SheepdogReq *)&hdr, buf, &wlen, &rlen);
 
@@ -1569,9 +1587,12 @@ static int do_sd_create(BDRVSheepdogState *s, uint32_t *vdi_id, int snapshot,
 static int sd_prealloc(const char *filename, Error **errp)
 {
     BlockDriverState *bs = NULL;
+    BDRVSheepdogState *base = NULL;
+    unsigned long buf_size;
     uint32_t idx, max_idx;
+    uint32_t object_size;
     int64_t vdi_size;
-    void *buf = g_malloc0(SD_DATA_OBJ_SIZE);
+    void *buf = NULL;
     int ret;
 
     ret = bdrv_open(&bs, filename, NULL, NULL, BDRV_O_RDWR | BDRV_O_PROTOCOL,
@@ -1585,18 +1606,24 @@ static int sd_prealloc(const char *filename, Error **errp)
         ret = vdi_size;
         goto out;
     }
-    max_idx = DIV_ROUND_UP(vdi_size, SD_DATA_OBJ_SIZE);
+
+    base = bs->opaque;
+    object_size = (UINT32_C(1) << base->inode.block_size_shift);
+    buf_size = MIN(object_size, SD_DATA_OBJ_SIZE);
+    buf = g_malloc0(buf_size);
+
+    max_idx = DIV_ROUND_UP(vdi_size, buf_size);
 
     for (idx = 0; idx < max_idx; idx++) {
         /*
          * The created image can be a cloned image, so we need to read
          * a data from the source image.
          */
-        ret = bdrv_pread(bs, idx * SD_DATA_OBJ_SIZE, buf, SD_DATA_OBJ_SIZE);
+        ret = bdrv_pread(bs, idx * buf_size, buf, buf_size);
         if (ret < 0) {
             goto out;
         }
-        ret = bdrv_pwrite(bs, idx * SD_DATA_OBJ_SIZE, buf, SD_DATA_OBJ_SIZE);
+        ret = bdrv_pwrite(bs, idx * buf_size, buf, buf_size);
         if (ret < 0) {
             goto out;
         }
@@ -1669,6 +1696,21 @@ static int parse_redundancy(BDRVSheepdogState *s, const char *opt)
     return 0;
 }
 
+static int parse_block_size_shift(BDRVSheepdogState *s, QemuOpts *opt)
+{
+    struct SheepdogInode *inode = &s->inode;
+    inode->block_size_shift =
+        (uint8_t)qemu_opt_get_number_del(opt, "block_size_shift", 0);
+    if (inode->block_size_shift == 0) {
+        /* block_size_shift is set for cluster default value by sheepdog */
+        return 0;
+    } else if (inode->block_size_shift < 20 || inode->block_size_shift > 31) {
+        return -EINVAL;
+    }
+
+    return 0;
+}
+
 static int sd_create(const char *filename, QemuOpts *opts,
                      Error **errp)
 {
@@ -1679,6 +1721,7 @@ static int sd_create(const char *filename, QemuOpts *opts,
     BDRVSheepdogState *s;
     char tag[SD_MAX_VDI_TAG_LEN];
     uint32_t snapid;
+    uint64_t max_vdi_size;
     bool prealloc = false;
 
     s = g_new0(BDRVSheepdogState, 1);
@@ -1718,9 +1761,10 @@ static int sd_create(const char *filename, QemuOpts *opts,
         }
     }
 
-    if (s->inode.vdi_size > SD_MAX_VDI_SIZE) {
-        error_setg(errp, "too big image size");
-        ret = -EINVAL;
+    ret = parse_block_size_shift(s, opts);
+    if (ret < 0) {
+        error_setg(errp, "Invalid block_size_shift:"
+                         " '%s'. please use 20-31", buf);
         goto out;
     }
 
@@ -1757,6 +1801,47 @@ static int sd_create(const char *filename, QemuOpts *opts,
     }
 
     s->aio_context = qemu_get_aio_context();
+
+    /* if block_size_shift is not specified, get cluster default value */
+    if (s->inode.block_size_shift == 0) {
+        SheepdogVdiReq hdr;
+        SheepdogClusterRsp *rsp = (SheepdogClusterRsp *)&hdr;
+        Error *local_err = NULL;
+        int fd;
+        unsigned int wlen = 0, rlen = 0;
+
+        fd = connect_to_sdog(s, &local_err);
+        if (fd < 0) {
+            error_report("%s", error_get_pretty(local_err));
+            error_free(local_err);
+            ret = -EIO;
+            goto out;
+        }
+
+        memset(&hdr, 0, sizeof(hdr));
+        hdr.opcode = SD_OP_GET_CLUSTER_DEFAULT;
+        hdr.proto_ver = SD_PROTO_VER;
+
+        ret = do_req(fd, s->aio_context, (SheepdogReq *)&hdr,
+                     NULL, &wlen, &rlen);
+        closesocket(fd);
+        if (ret) {
+            error_setg_errno(errp, -ret, "failed to get cluster default");
+            goto out;
+        }
+        s->inode.block_size_shift = rsp->block_size_shift;
+    }
+
+    max_vdi_size = (UINT64_C(1) << s->inode.block_size_shift) * MAX_DATA_OBJS;
+
+    if (s->inode.vdi_size > max_vdi_size) {
+        error_setg(errp, "An image is too big."
+                         " The maximum image size is %"PRIu64 "GB",
+                         max_vdi_size / 1024 / 1024 / 1024);
+        ret = -EINVAL;
+        goto out;
+    }
+
     ret = do_sd_create(s, &vid, 0, errp);
     if (ret) {
         goto out;
@@ -2013,9 +2098,10 @@ static int coroutine_fn sd_co_rw_vector(void *p)
     SheepdogAIOCB *acb = p;
     int ret = 0;
     unsigned long len, done = 0, total = acb->nb_sectors * BDRV_SECTOR_SIZE;
-    unsigned long idx = acb->sector_num * BDRV_SECTOR_SIZE / SD_DATA_OBJ_SIZE;
+    unsigned long idx;
+    uint32_t object_size;
     uint64_t oid;
-    uint64_t offset = (acb->sector_num * BDRV_SECTOR_SIZE) % SD_DATA_OBJ_SIZE;
+    uint64_t offset;
     BDRVSheepdogState *s = acb->common.bs->opaque;
     SheepdogInode *inode = &s->inode;
     AIOReq *aio_req;
@@ -2032,6 +2118,10 @@ static int coroutine_fn sd_co_rw_vector(void *p)
         }
     }
 
+    object_size = (UINT32_C(1) << inode->block_size_shift);
+    idx = acb->sector_num * BDRV_SECTOR_SIZE / object_size;
+    offset = (acb->sector_num * BDRV_SECTOR_SIZE) % object_size;
+
     /*
      * Make sure we don't free the aiocb before we are done with all requests.
      * This additional reference is dropped at the end of this function.
@@ -2045,7 +2135,7 @@ static int coroutine_fn sd_co_rw_vector(void *p)
 
         oid = vid_to_data_oid(inode->data_vdi_id[idx], idx);
 
-        len = MIN(total - done, SD_DATA_OBJ_SIZE - offset);
+        len = MIN(total - done, object_size - offset);
 
         switch (acb->aiocb_type) {
         case AIOCB_READ_UDATA:
@@ -2069,7 +2159,7 @@ static int coroutine_fn sd_co_rw_vector(void *p)
              * We discard the object only when the whole object is
              * 1) allocated 2) trimmed. Otherwise, simply skip it.
              */
-            if (len != SD_DATA_OBJ_SIZE || inode->data_vdi_id[idx] == 0) {
+            if (len != object_size || inode->data_vdi_id[idx] == 0) {
                 goto done;
             }
             break;
@@ -2426,6 +2516,7 @@ static int do_load_save_vmstate(BDRVSheepdogState *s, uint8_t *data,
     uint64_t offset;
     uint32_t vdi_index;
     uint32_t vdi_id = load ? s->inode.parent_vdi_id : s->inode.vdi_id;
+    uint32_t object_size = (UINT32_C(1) << s->inode.block_size_shift);
 
     fd = connect_to_sdog(s, &local_err);
     if (fd < 0) {
@@ -2435,10 +2526,10 @@ static int do_load_save_vmstate(BDRVSheepdogState *s, uint8_t *data,
     }
 
     while (remaining) {
-        vdi_index = pos / SD_DATA_OBJ_SIZE;
-        offset = pos % SD_DATA_OBJ_SIZE;
+        vdi_index = pos / object_size;
+        offset = pos % object_size;
 
-        data_len = MIN(remaining, SD_DATA_OBJ_SIZE - offset);
+        data_len = MIN(remaining, object_size - offset);
 
         vmstate_oid = vid_to_vmstate_oid(vdi_id, vdi_index);
 
@@ -2525,10 +2616,11 @@ sd_co_get_block_status(BlockDriverState *bs, int64_t sector_num, int nb_sectors,
 {
     BDRVSheepdogState *s = bs->opaque;
     SheepdogInode *inode = &s->inode;
+    uint32_t object_size = (UINT32_C(1) << inode->block_size_shift);
     uint64_t offset = sector_num * BDRV_SECTOR_SIZE;
-    unsigned long start = offset / SD_DATA_OBJ_SIZE,
+    unsigned long start = offset / object_size,
                   end = DIV_ROUND_UP((sector_num + nb_sectors) *
-                                     BDRV_SECTOR_SIZE, SD_DATA_OBJ_SIZE);
+                                     BDRV_SECTOR_SIZE, object_size);
     unsigned long idx;
     int64_t ret = BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID | offset;
 
@@ -2547,7 +2639,7 @@ sd_co_get_block_status(BlockDriverState *bs, int64_t sector_num, int nb_sectors,
         }
     }
 
-    *pnum = (idx - start) * SD_DATA_OBJ_SIZE / BDRV_SECTOR_SIZE;
+    *pnum = (idx - start) * object_size / BDRV_SECTOR_SIZE;
     if (*pnum > nb_sectors) {
         *pnum = nb_sectors;
     }
@@ -2558,14 +2650,15 @@ static int64_t sd_get_allocated_file_size(BlockDriverState *bs)
 {
     BDRVSheepdogState *s = bs->opaque;
     SheepdogInode *inode = &s->inode;
-    unsigned long i, last = DIV_ROUND_UP(inode->vdi_size, SD_DATA_OBJ_SIZE);
+    uint32_t object_size = (UINT32_C(1) << inode->block_size_shift);
+    unsigned long i, last = DIV_ROUND_UP(inode->vdi_size, object_size);
     uint64_t size = 0;
 
     for (i = 0; i < last; i++) {
         if (inode->data_vdi_id[i] == 0) {
             continue;
         }
-        size += SD_DATA_OBJ_SIZE;
+        size += object_size;
     }
     return size;
 }
@@ -2594,6 +2687,11 @@ static QemuOptsList sd_create_opts = {
             .type = QEMU_OPT_STRING,
             .help = "Redundancy of the image"
         },
+        {
+            .name = BLOCK_OPT_BLOCK_SIZE_SHIFT,
+            .type = QEMU_OPT_NUMBER,
+            .help = "Block Size Shift of the image"
+        },
         { /* end of list */ }
     }
 };
diff --git a/include/block/block_int.h b/include/block/block_int.h
index e264be9..61951a8 100644
--- a/include/block/block_int.h
+++ b/include/block/block_int.h
@@ -56,6 +56,7 @@
 #define BLOCK_OPT_ADAPTER_TYPE      "adapter_type"
 #define BLOCK_OPT_REDUNDANCY        "redundancy"
 #define BLOCK_OPT_NOCOW             "nocow"
+#define BLOCK_OPT_BLOCK_SIZE_SHIFT  "block_size_shift"
 
 #define BLOCK_PROBE_BUF_SIZE        512
 
-- 
1.7.1

^ permalink raw reply related	[flat|nested] 28+ messages in thread

* Re: [Qemu-devel] [PATCH v4] sheepdog: selectable object size support
  2015-01-27  8:35 [Qemu-devel] [PATCH v4] sheepdog: selectable object size support Teruaki Ishizaki
@ 2015-02-02  6:52 ` Liu Yuan
  2015-02-04  4:54   ` Teruaki Ishizaki
  2015-02-10  3:10 ` Liu Yuan
  2015-02-10 11:12 ` [Qemu-devel] " Liu Yuan
  2 siblings, 1 reply; 28+ messages in thread
From: Liu Yuan @ 2015-02-02  6:52 UTC (permalink / raw)
  To: Teruaki Ishizaki; +Cc: kwolf, mitake.hitoshi, sheepdog, qemu-devel, stefanha

On Tue, Jan 27, 2015 at 05:35:27PM +0900, Teruaki Ishizaki wrote:
> Previously, qemu block driver of sheepdog used hard-coded VDI object size.
> This patch enables users to handle "block_size_shift" value for
> calculating VDI object size.
> 
> When you start qemu, you don't need to specify additional command option.
> 
> But when you create the VDI which doesn't have default object size
> with qemu-img command, you specify block_size_shift option.
> 
> If you want to create a VDI of 8MB(1 << 23) object size,
> you need to specify following command option.
> 
>  # qemu-img create -o block_size_shift=23 sheepdog:test1 100M

Is it possible to make this option more user friendly? such as

 $ qemu-img create -o object_size=8M sheepdog:test 1G

Thanks
Yuan

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [Qemu-devel] [PATCH v4] sheepdog: selectable object size support
  2015-02-02  6:52 ` Liu Yuan
@ 2015-02-04  4:54   ` Teruaki Ishizaki
  2015-02-06  2:18     ` Liu Yuan
  0 siblings, 1 reply; 28+ messages in thread
From: Teruaki Ishizaki @ 2015-02-04  4:54 UTC (permalink / raw)
  To: Liu Yuan; +Cc: kwolf, mitake.hitoshi, sheepdog, qemu-devel, stefanha

(2015/02/02 15:52), Liu Yuan wrote:
> On Tue, Jan 27, 2015 at 05:35:27PM +0900, Teruaki Ishizaki wrote:
>> Previously, qemu block driver of sheepdog used hard-coded VDI object size.
>> This patch enables users to handle "block_size_shift" value for
>> calculating VDI object size.
>>
>> When you start qemu, you don't need to specify additional command option.
>>
>> But when you create the VDI which doesn't have default object size
>> with qemu-img command, you specify block_size_shift option.
>>
>> If you want to create a VDI of 8MB(1 << 23) object size,
>> you need to specify following command option.
>>
>>   # qemu-img create -o block_size_shift=23 sheepdog:test1 100M
>
> Is it possible to make this option more user friendly? such as
>
>   $ qemu-img create -o object_size=8M sheepdog:test 1G

At first, I thought that the object_size was user friendly.
But, Sheepdog has already the value of block_size_shift
in the inode layout that means like object_size.

'object_size' doesn't always fit right in 'block_size_shift'.
On the other hands, 'block_size_shift' always fit right in
'object_size'.

I think that existing layout shouldn't be changed easily and
it seems that it is difficult for users to specify
the object_size value that fit right in 'block_size_shift'.

Thanks,
Teruaki Ishizaki

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [Qemu-devel] [PATCH v4] sheepdog: selectable object size support
  2015-02-04  4:54   ` Teruaki Ishizaki
@ 2015-02-06  2:18     ` Liu Yuan
  2015-02-06  7:57       ` Teruaki Ishizaki
  0 siblings, 1 reply; 28+ messages in thread
From: Liu Yuan @ 2015-02-06  2:18 UTC (permalink / raw)
  To: Teruaki Ishizaki; +Cc: kwolf, mitake.hitoshi, sheepdog, qemu-devel, stefanha

On Wed, Feb 04, 2015 at 01:54:19PM +0900, Teruaki Ishizaki wrote:
> (2015/02/02 15:52), Liu Yuan wrote:
> >On Tue, Jan 27, 2015 at 05:35:27PM +0900, Teruaki Ishizaki wrote:
> >>Previously, qemu block driver of sheepdog used hard-coded VDI object size.
> >>This patch enables users to handle "block_size_shift" value for
> >>calculating VDI object size.
> >>
> >>When you start qemu, you don't need to specify additional command option.
> >>
> >>But when you create the VDI which doesn't have default object size
> >>with qemu-img command, you specify block_size_shift option.
> >>
> >>If you want to create a VDI of 8MB(1 << 23) object size,
> >>you need to specify following command option.
> >>
> >>  # qemu-img create -o block_size_shift=23 sheepdog:test1 100M
> >
> >Is it possible to make this option more user friendly? such as
> >
> >  $ qemu-img create -o object_size=8M sheepdog:test 1G
> 
> At first, I thought that the object_size was user friendly.
> But, Sheepdog has already the value of block_size_shift
> in the inode layout that means like object_size.
> 
> 'object_size' doesn't always fit right in 'block_size_shift'.
> On the other hands, 'block_size_shift' always fit right in
> 'object_size'.
> 
> I think that existing layout shouldn't be changed easily and
> it seems that it is difficult for users to specify
> the object_size value that fit right in 'block_size_shift'.
 
I don't think we need to change the layout. block_size_shift is what QEMU talks
to sheep, and QEMU options is what users talks to QEMU. We can convert the user
friendly object size into block_size_shift internally in the driver before
sending it tosheep daemon, no?

Thanks
Yuan

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [Qemu-devel] [PATCH v4] sheepdog: selectable object size support
  2015-02-06  2:18     ` Liu Yuan
@ 2015-02-06  7:57       ` Teruaki Ishizaki
  2015-02-09  3:08         ` Liu Yuan
  0 siblings, 1 reply; 28+ messages in thread
From: Teruaki Ishizaki @ 2015-02-06  7:57 UTC (permalink / raw)
  To: Liu Yuan; +Cc: kwolf, mitake.hitoshi, sheepdog, qemu-devel, stefanha

(2015/02/06 11:18), Liu Yuan wrote:
> On Wed, Feb 04, 2015 at 01:54:19PM +0900, Teruaki Ishizaki wrote:
>> (2015/02/02 15:52), Liu Yuan wrote:
>>> On Tue, Jan 27, 2015 at 05:35:27PM +0900, Teruaki Ishizaki wrote:
>>>> Previously, qemu block driver of sheepdog used hard-coded VDI object size.
>>>> This patch enables users to handle "block_size_shift" value for
>>>> calculating VDI object size.
>>>>
>>>> When you start qemu, you don't need to specify additional command option.
>>>>
>>>> But when you create the VDI which doesn't have default object size
>>>> with qemu-img command, you specify block_size_shift option.
>>>>
>>>> If you want to create a VDI of 8MB(1 << 23) object size,
>>>> you need to specify following command option.
>>>>
>>>>   # qemu-img create -o block_size_shift=23 sheepdog:test1 100M
>>>
>>> Is it possible to make this option more user friendly? such as
>>>
>>>   $ qemu-img create -o object_size=8M sheepdog:test 1G
>>
>> At first, I thought that the object_size was user friendly.
>> But, Sheepdog has already the value of block_size_shift
>> in the inode layout that means like object_size.
>>
>> 'object_size' doesn't always fit right in 'block_size_shift'.
>> On the other hands, 'block_size_shift' always fit right in
>> 'object_size'.
>>
>> I think that existing layout shouldn't be changed easily and
>> it seems that it is difficult for users to specify
>> the object_size value that fit right in 'block_size_shift'.
>
> I don't think we need to change the layout. block_size_shift is what QEMU talks
> to sheep, and QEMU options is what users talks to QEMU. We can convert the user
> friendly object size into block_size_shift internally in the driver before
> sending it tosheep daemon, no?
>
For example, users specify 12MB for object size, block_size_shift
doesn't fit exactly to an integer.

I suppose that normally an administrator do format sheepdog cluster
with specifying block_size_shift and users usually do qemu-img command
without a block_size_shift option.

I think that the block_size_shift option of qemu-img command is used
for a experimental purpose.

Thanks,
Teruaki Ishizaki

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [Qemu-devel] [PATCH v4] sheepdog: selectable object size support
  2015-02-06  7:57       ` Teruaki Ishizaki
@ 2015-02-09  3:08         ` Liu Yuan
  0 siblings, 0 replies; 28+ messages in thread
From: Liu Yuan @ 2015-02-09  3:08 UTC (permalink / raw)
  To: Teruaki Ishizaki; +Cc: kwolf, mitake.hitoshi, sheepdog, qemu-devel, stefanha

On Fri, Feb 06, 2015 at 04:57:55PM +0900, Teruaki Ishizaki wrote:
> (2015/02/06 11:18), Liu Yuan wrote:
> >On Wed, Feb 04, 2015 at 01:54:19PM +0900, Teruaki Ishizaki wrote:
> >>(2015/02/02 15:52), Liu Yuan wrote:
> >>>On Tue, Jan 27, 2015 at 05:35:27PM +0900, Teruaki Ishizaki wrote:
> >>>>Previously, qemu block driver of sheepdog used hard-coded VDI object size.
> >>>>This patch enables users to handle "block_size_shift" value for
> >>>>calculating VDI object size.
> >>>>
> >>>>When you start qemu, you don't need to specify additional command option.
> >>>>
> >>>>But when you create the VDI which doesn't have default object size
> >>>>with qemu-img command, you specify block_size_shift option.
> >>>>
> >>>>If you want to create a VDI of 8MB(1 << 23) object size,
> >>>>you need to specify following command option.
> >>>>
> >>>>  # qemu-img create -o block_size_shift=23 sheepdog:test1 100M
> >>>
> >>>Is it possible to make this option more user friendly? such as
> >>>
> >>>  $ qemu-img create -o object_size=8M sheepdog:test 1G
> >>
> >>At first, I thought that the object_size was user friendly.
> >>But, Sheepdog has already the value of block_size_shift
> >>in the inode layout that means like object_size.
> >>
> >>'object_size' doesn't always fit right in 'block_size_shift'.
> >>On the other hands, 'block_size_shift' always fit right in
> >>'object_size'.
> >>
> >>I think that existing layout shouldn't be changed easily and
> >>it seems that it is difficult for users to specify
> >>the object_size value that fit right in 'block_size_shift'.
> >
> >I don't think we need to change the layout. block_size_shift is what QEMU talks
> >to sheep, and QEMU options is what users talks to QEMU. We can convert the user
> >friendly object size into block_size_shift internally in the driver before
> >sending it tosheep daemon, no?
> >
> For example, users specify 12MB for object size, block_size_shift
> doesn't fit exactly to an integer.

In this case, we should abort and print the error message and notify the users
the acceptable range like erasure coding option.

> 
> I suppose that normally an administrator do format sheepdog cluster
> with specifying block_size_shift and users usually do qemu-img command
> without a block_size_shift option.

block_size_shift is too developer centered and has a direct relation to the
underlying algorithm. If in the future, e.g, we change the underlying algorithm
about how we represent block size, block_size_shift might not even exsit. So
use object_size would be more generic and won't have this kind of problem.

Secondly, it is not hard to parse the object_size into block_size_shift, so I'd
suggest we'd better try our best to make it user friendly.

Thanks
Yuan

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [Qemu-devel] [PATCH v4] sheepdog: selectable object size support
  2015-01-27  8:35 [Qemu-devel] [PATCH v4] sheepdog: selectable object size support Teruaki Ishizaki
  2015-02-02  6:52 ` Liu Yuan
@ 2015-02-10  3:10 ` Liu Yuan
  2015-02-10  3:18   ` Liu Yuan
  2015-02-10  8:22   ` Teruaki Ishizaki
  2015-02-10 11:12 ` [Qemu-devel] " Liu Yuan
  2 siblings, 2 replies; 28+ messages in thread
From: Liu Yuan @ 2015-02-10  3:10 UTC (permalink / raw)
  To: Teruaki Ishizaki; +Cc: kwolf, mitake.hitoshi, sheepdog, qemu-devel, stefanha

On Tue, Jan 27, 2015 at 05:35:27PM +0900, Teruaki Ishizaki wrote:
> Previously, qemu block driver of sheepdog used hard-coded VDI object size.
> This patch enables users to handle "block_size_shift" value for
> calculating VDI object size.
> 
> When you start qemu, you don't need to specify additional command option.
> 
> But when you create the VDI which doesn't have default object size
> with qemu-img command, you specify block_size_shift option.
> 
> If you want to create a VDI of 8MB(1 << 23) object size,
> you need to specify following command option.
> 
>  # qemu-img create -o block_size_shift=23 sheepdog:test1 100M
> 
> In addition, when you don't specify qemu-img command option,
> a default value of sheepdog cluster is used for creating VDI.
> 
>  # qemu-img create sheepdog:test2 100M
> 
> Signed-off-by: Teruaki Ishizaki <ishizaki.teruaki@lab.ntt.co.jp>
> ---
> V4:
>  - Limit a read/write buffer size for creating a preallocated VDI.
>  - Replace a parse function for the block_size_shift option.
>  - Fix an error message.
> 
> V3:
>  - Delete the needless operation of buffer.
>  - Delete the needless operations of request header.
>    for SD_OP_GET_CLUSTER_DEFAULT.
>  - Fix coding style problems.
> 
> V2:
>  - Fix coding style problem (white space).
>  - Add members, store_policy and block_size_shift to struct SheepdogVdiReq.
>  - Initialize request header to use block_size_shift specified by user.
> ---
>  block/sheepdog.c          |  138 ++++++++++++++++++++++++++++++++++++++-------
>  include/block/block_int.h |    1 +
>  2 files changed, 119 insertions(+), 20 deletions(-)
> 
> diff --git a/block/sheepdog.c b/block/sheepdog.c
> index be3176f..a43b947 100644
> --- a/block/sheepdog.c
> +++ b/block/sheepdog.c
> @@ -37,6 +37,7 @@
>  #define SD_OP_READ_VDIS      0x15
>  #define SD_OP_FLUSH_VDI      0x16
>  #define SD_OP_DEL_VDI        0x17
> +#define SD_OP_GET_CLUSTER_DEFAULT   0x18
>  
>  #define SD_FLAG_CMD_WRITE    0x01
>  #define SD_FLAG_CMD_COW      0x02
> @@ -167,7 +168,8 @@ typedef struct SheepdogVdiReq {
>      uint32_t base_vdi_id;
>      uint8_t copies;
>      uint8_t copy_policy;
> -    uint8_t reserved[2];
> +    uint8_t store_policy;
> +    uint8_t block_size_shift;
>      uint32_t snapid;
>      uint32_t type;
>      uint32_t pad[2];
> @@ -186,6 +188,21 @@ typedef struct SheepdogVdiRsp {
>      uint32_t pad[5];
>  } SheepdogVdiRsp;
>  
> +typedef struct SheepdogClusterRsp {
> +    uint8_t proto_ver;
> +    uint8_t opcode;
> +    uint16_t flags;
> +    uint32_t epoch;
> +    uint32_t id;
> +    uint32_t data_length;
> +    uint32_t result;
> +    uint8_t nr_copies;
> +    uint8_t copy_policy;
> +    uint8_t block_size_shift;
> +    uint8_t __pad1;
> +    uint32_t __pad2[6];
> +} SheepdogClusterRsp;
> +
>  typedef struct SheepdogInode {
>      char name[SD_MAX_VDI_LEN];
>      char tag[SD_MAX_VDI_TAG_LEN];
> @@ -1544,6 +1561,7 @@ static int do_sd_create(BDRVSheepdogState *s, uint32_t *vdi_id, int snapshot,
>      hdr.vdi_size = s->inode.vdi_size;
>      hdr.copy_policy = s->inode.copy_policy;
>      hdr.copies = s->inode.nr_copies;
> +    hdr.block_size_shift = s->inode.block_size_shift;
>  
>      ret = do_req(fd, s->aio_context, (SheepdogReq *)&hdr, buf, &wlen, &rlen);
>  
> @@ -1569,9 +1587,12 @@ static int do_sd_create(BDRVSheepdogState *s, uint32_t *vdi_id, int snapshot,
>  static int sd_prealloc(const char *filename, Error **errp)
>  {
>      BlockDriverState *bs = NULL;
> +    BDRVSheepdogState *base = NULL;
> +    unsigned long buf_size;
>      uint32_t idx, max_idx;
> +    uint32_t object_size;
>      int64_t vdi_size;
> -    void *buf = g_malloc0(SD_DATA_OBJ_SIZE);
> +    void *buf = NULL;
>      int ret;
>  
>      ret = bdrv_open(&bs, filename, NULL, NULL, BDRV_O_RDWR | BDRV_O_PROTOCOL,
> @@ -1585,18 +1606,24 @@ static int sd_prealloc(const char *filename, Error **errp)
>          ret = vdi_size;
>          goto out;
>      }
> -    max_idx = DIV_ROUND_UP(vdi_size, SD_DATA_OBJ_SIZE);
> +
> +    base = bs->opaque;
> +    object_size = (UINT32_C(1) << base->inode.block_size_shift);
> +    buf_size = MIN(object_size, SD_DATA_OBJ_SIZE);
> +    buf = g_malloc0(buf_size);
> +
> +    max_idx = DIV_ROUND_UP(vdi_size, buf_size);
>  
>      for (idx = 0; idx < max_idx; idx++) {
>          /*
>           * The created image can be a cloned image, so we need to read
>           * a data from the source image.
>           */
> -        ret = bdrv_pread(bs, idx * SD_DATA_OBJ_SIZE, buf, SD_DATA_OBJ_SIZE);
> +        ret = bdrv_pread(bs, idx * buf_size, buf, buf_size);
>          if (ret < 0) {
>              goto out;
>          }
> -        ret = bdrv_pwrite(bs, idx * SD_DATA_OBJ_SIZE, buf, SD_DATA_OBJ_SIZE);
> +        ret = bdrv_pwrite(bs, idx * buf_size, buf, buf_size);
>          if (ret < 0) {
>              goto out;
>          }
> @@ -1669,6 +1696,21 @@ static int parse_redundancy(BDRVSheepdogState *s, const char *opt)
>      return 0;
>  }
>  
> +static int parse_block_size_shift(BDRVSheepdogState *s, QemuOpts *opt)
> +{
> +    struct SheepdogInode *inode = &s->inode;
> +    inode->block_size_shift =
> +        (uint8_t)qemu_opt_get_number_del(opt, "block_size_shift", 0);
> +    if (inode->block_size_shift == 0) {
> +        /* block_size_shift is set for cluster default value by sheepdog */
> +        return 0;
> +    } else if (inode->block_size_shift < 20 || inode->block_size_shift > 31) {
> +        return -EINVAL;
> +    }
> +
> +    return 0;
> +}
> +
>  static int sd_create(const char *filename, QemuOpts *opts,
>                       Error **errp)
>  {
> @@ -1679,6 +1721,7 @@ static int sd_create(const char *filename, QemuOpts *opts,
>      BDRVSheepdogState *s;
>      char tag[SD_MAX_VDI_TAG_LEN];
>      uint32_t snapid;
> +    uint64_t max_vdi_size;
>      bool prealloc = false;
>  
>      s = g_new0(BDRVSheepdogState, 1);
> @@ -1718,9 +1761,10 @@ static int sd_create(const char *filename, QemuOpts *opts,
>          }
>      }
>  
> -    if (s->inode.vdi_size > SD_MAX_VDI_SIZE) {
> -        error_setg(errp, "too big image size");
> -        ret = -EINVAL;
> +    ret = parse_block_size_shift(s, opts);
> +    if (ret < 0) {
> +        error_setg(errp, "Invalid block_size_shift:"
> +                         " '%s'. please use 20-31", buf);
>          goto out;
>      }
>  
> @@ -1757,6 +1801,47 @@ static int sd_create(const char *filename, QemuOpts *opts,
>      }
>  
>      s->aio_context = qemu_get_aio_context();
> +
> +    /* if block_size_shift is not specified, get cluster default value */
> +    if (s->inode.block_size_shift == 0) {

Seems that we don't keep backward compatibility for new QEMU and old sheep that
doesn't support SD_OP_GET_CLUSTER_DEFAULT.

What will happen if old QEMU with new sheep that support block_size_shift? Most
distributions will ship the old stable qemu that wouldn't be aware of
block_size_shift.

I'd suggest we have 0 in block_size_shift represent 4MB to keep backward
compatiblity.

Thanks
Yuan

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [Qemu-devel] [PATCH v4] sheepdog: selectable object size support
  2015-02-10  3:10 ` Liu Yuan
@ 2015-02-10  3:18   ` Liu Yuan
  2015-02-10  8:22   ` Teruaki Ishizaki
  1 sibling, 0 replies; 28+ messages in thread
From: Liu Yuan @ 2015-02-10  3:18 UTC (permalink / raw)
  To: Teruaki Ishizaki; +Cc: kwolf, mitake.hitoshi, sheepdog, qemu-devel, stefanha

On Tue, Feb 10, 2015 at 11:10:51AM +0800, Liu Yuan wrote:
> On Tue, Jan 27, 2015 at 05:35:27PM +0900, Teruaki Ishizaki wrote:
> > Previously, qemu block driver of sheepdog used hard-coded VDI object size.
> > This patch enables users to handle "block_size_shift" value for
> > calculating VDI object size.
> > 
> > When you start qemu, you don't need to specify additional command option.
> > 
> > But when you create the VDI which doesn't have default object size
> > with qemu-img command, you specify block_size_shift option.
> > 
> > If you want to create a VDI of 8MB(1 << 23) object size,
> > you need to specify following command option.
> > 
> >  # qemu-img create -o block_size_shift=23 sheepdog:test1 100M
> > 
> > In addition, when you don't specify qemu-img command option,
> > a default value of sheepdog cluster is used for creating VDI.
> > 
> >  # qemu-img create sheepdog:test2 100M
> > 
> > Signed-off-by: Teruaki Ishizaki <ishizaki.teruaki@lab.ntt.co.jp>
> > ---
> > V4:
> >  - Limit a read/write buffer size for creating a preallocated VDI.
> >  - Replace a parse function for the block_size_shift option.
> >  - Fix an error message.
> > 
> > V3:
> >  - Delete the needless operation of buffer.
> >  - Delete the needless operations of request header.
> >    for SD_OP_GET_CLUSTER_DEFAULT.
> >  - Fix coding style problems.
> > 
> > V2:
> >  - Fix coding style problem (white space).
> >  - Add members, store_policy and block_size_shift to struct SheepdogVdiReq.
> >  - Initialize request header to use block_size_shift specified by user.
> > ---
> >  block/sheepdog.c          |  138 ++++++++++++++++++++++++++++++++++++++-------
> >  include/block/block_int.h |    1 +
> >  2 files changed, 119 insertions(+), 20 deletions(-)
> > 
> > diff --git a/block/sheepdog.c b/block/sheepdog.c
> > index be3176f..a43b947 100644
> > --- a/block/sheepdog.c
> > +++ b/block/sheepdog.c
> > @@ -37,6 +37,7 @@
> >  #define SD_OP_READ_VDIS      0x15
> >  #define SD_OP_FLUSH_VDI      0x16
> >  #define SD_OP_DEL_VDI        0x17
> > +#define SD_OP_GET_CLUSTER_DEFAULT   0x18
> >  
> >  #define SD_FLAG_CMD_WRITE    0x01
> >  #define SD_FLAG_CMD_COW      0x02
> > @@ -167,7 +168,8 @@ typedef struct SheepdogVdiReq {
> >      uint32_t base_vdi_id;
> >      uint8_t copies;
> >      uint8_t copy_policy;
> > -    uint8_t reserved[2];
> > +    uint8_t store_policy;
> > +    uint8_t block_size_shift;
> >      uint32_t snapid;
> >      uint32_t type;
> >      uint32_t pad[2];
> > @@ -186,6 +188,21 @@ typedef struct SheepdogVdiRsp {
> >      uint32_t pad[5];
> >  } SheepdogVdiRsp;
> >  
> > +typedef struct SheepdogClusterRsp {
> > +    uint8_t proto_ver;
> > +    uint8_t opcode;
> > +    uint16_t flags;
> > +    uint32_t epoch;
> > +    uint32_t id;
> > +    uint32_t data_length;
> > +    uint32_t result;
> > +    uint8_t nr_copies;
> > +    uint8_t copy_policy;
> > +    uint8_t block_size_shift;
> > +    uint8_t __pad1;
> > +    uint32_t __pad2[6];
> > +} SheepdogClusterRsp;
> > +
> >  typedef struct SheepdogInode {
> >      char name[SD_MAX_VDI_LEN];
> >      char tag[SD_MAX_VDI_TAG_LEN];
> > @@ -1544,6 +1561,7 @@ static int do_sd_create(BDRVSheepdogState *s, uint32_t *vdi_id, int snapshot,
> >      hdr.vdi_size = s->inode.vdi_size;
> >      hdr.copy_policy = s->inode.copy_policy;
> >      hdr.copies = s->inode.nr_copies;
> > +    hdr.block_size_shift = s->inode.block_size_shift;
> >  
> >      ret = do_req(fd, s->aio_context, (SheepdogReq *)&hdr, buf, &wlen, &rlen);
> >  
> > @@ -1569,9 +1587,12 @@ static int do_sd_create(BDRVSheepdogState *s, uint32_t *vdi_id, int snapshot,
> >  static int sd_prealloc(const char *filename, Error **errp)
> >  {
> >      BlockDriverState *bs = NULL;
> > +    BDRVSheepdogState *base = NULL;
> > +    unsigned long buf_size;
> >      uint32_t idx, max_idx;
> > +    uint32_t object_size;
> >      int64_t vdi_size;
> > -    void *buf = g_malloc0(SD_DATA_OBJ_SIZE);
> > +    void *buf = NULL;
> >      int ret;
> >  
> >      ret = bdrv_open(&bs, filename, NULL, NULL, BDRV_O_RDWR | BDRV_O_PROTOCOL,
> > @@ -1585,18 +1606,24 @@ static int sd_prealloc(const char *filename, Error **errp)
> >          ret = vdi_size;
> >          goto out;
> >      }
> > -    max_idx = DIV_ROUND_UP(vdi_size, SD_DATA_OBJ_SIZE);
> > +
> > +    base = bs->opaque;
> > +    object_size = (UINT32_C(1) << base->inode.block_size_shift);
> > +    buf_size = MIN(object_size, SD_DATA_OBJ_SIZE);
> > +    buf = g_malloc0(buf_size);
> > +
> > +    max_idx = DIV_ROUND_UP(vdi_size, buf_size);
> >  
> >      for (idx = 0; idx < max_idx; idx++) {
> >          /*
> >           * The created image can be a cloned image, so we need to read
> >           * a data from the source image.
> >           */
> > -        ret = bdrv_pread(bs, idx * SD_DATA_OBJ_SIZE, buf, SD_DATA_OBJ_SIZE);
> > +        ret = bdrv_pread(bs, idx * buf_size, buf, buf_size);
> >          if (ret < 0) {
> >              goto out;
> >          }
> > -        ret = bdrv_pwrite(bs, idx * SD_DATA_OBJ_SIZE, buf, SD_DATA_OBJ_SIZE);
> > +        ret = bdrv_pwrite(bs, idx * buf_size, buf, buf_size);
> >          if (ret < 0) {
> >              goto out;
> >          }
> > @@ -1669,6 +1696,21 @@ static int parse_redundancy(BDRVSheepdogState *s, const char *opt)
> >      return 0;
> >  }
> >  
> > +static int parse_block_size_shift(BDRVSheepdogState *s, QemuOpts *opt)
> > +{
> > +    struct SheepdogInode *inode = &s->inode;
> > +    inode->block_size_shift =
> > +        (uint8_t)qemu_opt_get_number_del(opt, "block_size_shift", 0);
> > +    if (inode->block_size_shift == 0) {
> > +        /* block_size_shift is set for cluster default value by sheepdog */
> > +        return 0;
> > +    } else if (inode->block_size_shift < 20 || inode->block_size_shift > 31) {
> > +        return -EINVAL;
> > +    }
> > +
> > +    return 0;
> > +}
> > +
> >  static int sd_create(const char *filename, QemuOpts *opts,
> >                       Error **errp)
> >  {
> > @@ -1679,6 +1721,7 @@ static int sd_create(const char *filename, QemuOpts *opts,
> >      BDRVSheepdogState *s;
> >      char tag[SD_MAX_VDI_TAG_LEN];
> >      uint32_t snapid;
> > +    uint64_t max_vdi_size;
> >      bool prealloc = false;
> >  
> >      s = g_new0(BDRVSheepdogState, 1);
> > @@ -1718,9 +1761,10 @@ static int sd_create(const char *filename, QemuOpts *opts,
> >          }
> >      }
> >  
> > -    if (s->inode.vdi_size > SD_MAX_VDI_SIZE) {
> > -        error_setg(errp, "too big image size");
> > -        ret = -EINVAL;
> > +    ret = parse_block_size_shift(s, opts);
> > +    if (ret < 0) {
> > +        error_setg(errp, "Invalid block_size_shift:"
> > +                         " '%s'. please use 20-31", buf);
> >          goto out;
> >      }
> >  
> > @@ -1757,6 +1801,47 @@ static int sd_create(const char *filename, QemuOpts *opts,
> >      }
> >  
> >      s->aio_context = qemu_get_aio_context();
> > +
> > +    /* if block_size_shift is not specified, get cluster default value */
> > +    if (s->inode.block_size_shift == 0) {
> 
> Seems that we don't keep backward compatibility for new QEMU and old sheep that
> doesn't support SD_OP_GET_CLUSTER_DEFAULT.
> 
> What will happen if old QEMU with new sheep that support block_size_shift? Most
> distributions will ship the old stable qemu that wouldn't be aware of
> block_size_shift.
> 
> I'd suggest we have 0 in block_size_shift represent 4MB to keep backward
> compatiblity.

How about make this additional field as the factor to multiply 4MB? that is,

(x + 1) * 4MB

qemu-img create -o object_size=8MB test 1G

will have x set as 1 and the formula will equal to 8MB.

-o object_size=4MB will set x as 0 and default value for x is 0 too to keep
backward compatiblity.

Thanks
Yuan

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [Qemu-devel] [PATCH v4] sheepdog: selectable object size support
  2015-02-10  3:10 ` Liu Yuan
  2015-02-10  3:18   ` Liu Yuan
@ 2015-02-10  8:22   ` Teruaki Ishizaki
  2015-02-10  8:58     ` Liu Yuan
  1 sibling, 1 reply; 28+ messages in thread
From: Teruaki Ishizaki @ 2015-02-10  8:22 UTC (permalink / raw)
  To: Liu Yuan; +Cc: kwolf, mitake.hitoshi, sheepdog, qemu-devel, stefanha

(2015/02/10 12:10), Liu Yuan wrote:
> On Tue, Jan 27, 2015 at 05:35:27PM +0900, Teruaki Ishizaki wrote:
>> Previously, qemu block driver of sheepdog used hard-coded VDI object size.
>> This patch enables users to handle "block_size_shift" value for
>> calculating VDI object size.
>>
>> When you start qemu, you don't need to specify additional command option.
>>
>> But when you create the VDI which doesn't have default object size
>> with qemu-img command, you specify block_size_shift option.
>>
>> If you want to create a VDI of 8MB(1 << 23) object size,
>> you need to specify following command option.
>>
>>   # qemu-img create -o block_size_shift=23 sheepdog:test1 100M
>>
>> In addition, when you don't specify qemu-img command option,
>> a default value of sheepdog cluster is used for creating VDI.
>>
>>   # qemu-img create sheepdog:test2 100M
>>
>> Signed-off-by: Teruaki Ishizaki <ishizaki.teruaki@lab.ntt.co.jp>
>> ---
>> V4:
>>   - Limit a read/write buffer size for creating a preallocated VDI.
>>   - Replace a parse function for the block_size_shift option.
>>   - Fix an error message.
>>
>> V3:
>>   - Delete the needless operation of buffer.
>>   - Delete the needless operations of request header.
>>     for SD_OP_GET_CLUSTER_DEFAULT.
>>   - Fix coding style problems.
>>
>> V2:
>>   - Fix coding style problem (white space).
>>   - Add members, store_policy and block_size_shift to struct SheepdogVdiReq.
>>   - Initialize request header to use block_size_shift specified by user.
>> ---
>>   block/sheepdog.c          |  138 ++++++++++++++++++++++++++++++++++++++-------
>>   include/block/block_int.h |    1 +
>>   2 files changed, 119 insertions(+), 20 deletions(-)
>>
>> diff --git a/block/sheepdog.c b/block/sheepdog.c
>> index be3176f..a43b947 100644
>> --- a/block/sheepdog.c
>> +++ b/block/sheepdog.c
>> @@ -37,6 +37,7 @@
>>   #define SD_OP_READ_VDIS      0x15
>>   #define SD_OP_FLUSH_VDI      0x16
>>   #define SD_OP_DEL_VDI        0x17
>> +#define SD_OP_GET_CLUSTER_DEFAULT   0x18
>>
>>   #define SD_FLAG_CMD_WRITE    0x01
>>   #define SD_FLAG_CMD_COW      0x02
>> @@ -167,7 +168,8 @@ typedef struct SheepdogVdiReq {
>>       uint32_t base_vdi_id;
>>       uint8_t copies;
>>       uint8_t copy_policy;
>> -    uint8_t reserved[2];
>> +    uint8_t store_policy;
>> +    uint8_t block_size_shift;
>>       uint32_t snapid;
>>       uint32_t type;
>>       uint32_t pad[2];
>> @@ -186,6 +188,21 @@ typedef struct SheepdogVdiRsp {
>>       uint32_t pad[5];
>>   } SheepdogVdiRsp;
>>
>> +typedef struct SheepdogClusterRsp {
>> +    uint8_t proto_ver;
>> +    uint8_t opcode;
>> +    uint16_t flags;
>> +    uint32_t epoch;
>> +    uint32_t id;
>> +    uint32_t data_length;
>> +    uint32_t result;
>> +    uint8_t nr_copies;
>> +    uint8_t copy_policy;
>> +    uint8_t block_size_shift;
>> +    uint8_t __pad1;
>> +    uint32_t __pad2[6];
>> +} SheepdogClusterRsp;
>> +
>>   typedef struct SheepdogInode {
>>       char name[SD_MAX_VDI_LEN];
>>       char tag[SD_MAX_VDI_TAG_LEN];
>> @@ -1544,6 +1561,7 @@ static int do_sd_create(BDRVSheepdogState *s, uint32_t *vdi_id, int snapshot,
>>       hdr.vdi_size = s->inode.vdi_size;
>>       hdr.copy_policy = s->inode.copy_policy;
>>       hdr.copies = s->inode.nr_copies;
>> +    hdr.block_size_shift = s->inode.block_size_shift;
>>
>>       ret = do_req(fd, s->aio_context, (SheepdogReq *)&hdr, buf, &wlen, &rlen);
>>
>> @@ -1569,9 +1587,12 @@ static int do_sd_create(BDRVSheepdogState *s, uint32_t *vdi_id, int snapshot,
>>   static int sd_prealloc(const char *filename, Error **errp)
>>   {
>>       BlockDriverState *bs = NULL;
>> +    BDRVSheepdogState *base = NULL;
>> +    unsigned long buf_size;
>>       uint32_t idx, max_idx;
>> +    uint32_t object_size;
>>       int64_t vdi_size;
>> -    void *buf = g_malloc0(SD_DATA_OBJ_SIZE);
>> +    void *buf = NULL;
>>       int ret;
>>
>>       ret = bdrv_open(&bs, filename, NULL, NULL, BDRV_O_RDWR | BDRV_O_PROTOCOL,
>> @@ -1585,18 +1606,24 @@ static int sd_prealloc(const char *filename, Error **errp)
>>           ret = vdi_size;
>>           goto out;
>>       }
>> -    max_idx = DIV_ROUND_UP(vdi_size, SD_DATA_OBJ_SIZE);
>> +
>> +    base = bs->opaque;
>> +    object_size = (UINT32_C(1) << base->inode.block_size_shift);
>> +    buf_size = MIN(object_size, SD_DATA_OBJ_SIZE);
>> +    buf = g_malloc0(buf_size);
>> +
>> +    max_idx = DIV_ROUND_UP(vdi_size, buf_size);
>>
>>       for (idx = 0; idx < max_idx; idx++) {
>>           /*
>>            * The created image can be a cloned image, so we need to read
>>            * a data from the source image.
>>            */
>> -        ret = bdrv_pread(bs, idx * SD_DATA_OBJ_SIZE, buf, SD_DATA_OBJ_SIZE);
>> +        ret = bdrv_pread(bs, idx * buf_size, buf, buf_size);
>>           if (ret < 0) {
>>               goto out;
>>           }
>> -        ret = bdrv_pwrite(bs, idx * SD_DATA_OBJ_SIZE, buf, SD_DATA_OBJ_SIZE);
>> +        ret = bdrv_pwrite(bs, idx * buf_size, buf, buf_size);
>>           if (ret < 0) {
>>               goto out;
>>           }
>> @@ -1669,6 +1696,21 @@ static int parse_redundancy(BDRVSheepdogState *s, const char *opt)
>>       return 0;
>>   }
>>
>> +static int parse_block_size_shift(BDRVSheepdogState *s, QemuOpts *opt)
>> +{
>> +    struct SheepdogInode *inode = &s->inode;
>> +    inode->block_size_shift =
>> +        (uint8_t)qemu_opt_get_number_del(opt, "block_size_shift", 0);
>> +    if (inode->block_size_shift == 0) {
>> +        /* block_size_shift is set for cluster default value by sheepdog */
>> +        return 0;
>> +    } else if (inode->block_size_shift < 20 || inode->block_size_shift > 31) {
>> +        return -EINVAL;
>> +    }
>> +
>> +    return 0;
>> +}
>> +
>>   static int sd_create(const char *filename, QemuOpts *opts,
>>                        Error **errp)
>>   {
>> @@ -1679,6 +1721,7 @@ static int sd_create(const char *filename, QemuOpts *opts,
>>       BDRVSheepdogState *s;
>>       char tag[SD_MAX_VDI_TAG_LEN];
>>       uint32_t snapid;
>> +    uint64_t max_vdi_size;
>>       bool prealloc = false;
>>
>>       s = g_new0(BDRVSheepdogState, 1);
>> @@ -1718,9 +1761,10 @@ static int sd_create(const char *filename, QemuOpts *opts,
>>           }
>>       }
>>
>> -    if (s->inode.vdi_size > SD_MAX_VDI_SIZE) {
>> -        error_setg(errp, "too big image size");
>> -        ret = -EINVAL;
>> +    ret = parse_block_size_shift(s, opts);
>> +    if (ret < 0) {
>> +        error_setg(errp, "Invalid block_size_shift:"
>> +                         " '%s'. please use 20-31", buf);
>>           goto out;
>>       }
>>
>> @@ -1757,6 +1801,47 @@ static int sd_create(const char *filename, QemuOpts *opts,
>>       }
>>
>>       s->aio_context = qemu_get_aio_context();
>> +
>> +    /* if block_size_shift is not specified, get cluster default value */
>> +    if (s->inode.block_size_shift == 0) {
>
> Seems that we don't keep backward compatibility for new QEMU and old sheep that
> doesn't support SD_OP_GET_CLUSTER_DEFAULT.
Then, I'll add the operation that if block_size_shift from
SD_OP_GET_CLUSTER_DEFAULT is zero, block_size_shift is set to 22.
Is it OK?

>
> What will happen if old QEMU with new sheep that support block_size_shift? Most
> distributions will ship the old stable qemu that wouldn't be aware of
> block_size_shift.
If old QEMU with new sheep is used, VDI is created with cluster default
block_size_shift and accessed as 4MB object_size.
So I think that for backward compatibility, users must do cluster
format command with default block_size_shift 22 equal to
4MB object_size.
If users want to use for iSCSI target with changing block_size_shift,
they must specify the value to create VDI.

>
> I'd suggest we have 0 in block_size_shift represent 4MB to keep backward
> compatiblity.
>

Thanks,
Teruaki Ishizaki

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [Qemu-devel] [PATCH v4] sheepdog: selectable object size support
  2015-02-10  8:22   ` Teruaki Ishizaki
@ 2015-02-10  8:58     ` Liu Yuan
  2015-02-10  9:56       ` Teruaki Ishizaki
  0 siblings, 1 reply; 28+ messages in thread
From: Liu Yuan @ 2015-02-10  8:58 UTC (permalink / raw)
  To: Teruaki Ishizaki; +Cc: kwolf, mitake.hitoshi, sheepdog, qemu-devel, stefanha

On Tue, Feb 10, 2015 at 05:22:02PM +0900, Teruaki Ishizaki wrote:
> (2015/02/10 12:10), Liu Yuan wrote:
> >On Tue, Jan 27, 2015 at 05:35:27PM +0900, Teruaki Ishizaki wrote:
> >>Previously, qemu block driver of sheepdog used hard-coded VDI object size.
> >>This patch enables users to handle "block_size_shift" value for
> >>calculating VDI object size.
> >>
> >>When you start qemu, you don't need to specify additional command option.
> >>
> >>But when you create the VDI which doesn't have default object size
> >>with qemu-img command, you specify block_size_shift option.
> >>
> >>If you want to create a VDI of 8MB(1 << 23) object size,
> >>you need to specify following command option.
> >>
> >>  # qemu-img create -o block_size_shift=23 sheepdog:test1 100M
> >>
> >>In addition, when you don't specify qemu-img command option,
> >>a default value of sheepdog cluster is used for creating VDI.
> >>
> >>  # qemu-img create sheepdog:test2 100M
> >>
> >>Signed-off-by: Teruaki Ishizaki <ishizaki.teruaki@lab.ntt.co.jp>
> >>---
> >>V4:
> >>  - Limit a read/write buffer size for creating a preallocated VDI.
> >>  - Replace a parse function for the block_size_shift option.
> >>  - Fix an error message.
> >>
> >>V3:
> >>  - Delete the needless operation of buffer.
> >>  - Delete the needless operations of request header.
> >>    for SD_OP_GET_CLUSTER_DEFAULT.
> >>  - Fix coding style problems.
> >>
> >>V2:
> >>  - Fix coding style problem (white space).
> >>  - Add members, store_policy and block_size_shift to struct SheepdogVdiReq.
> >>  - Initialize request header to use block_size_shift specified by user.
> >>---
> >>  block/sheepdog.c          |  138 ++++++++++++++++++++++++++++++++++++++-------
> >>  include/block/block_int.h |    1 +
> >>  2 files changed, 119 insertions(+), 20 deletions(-)
> >>
> >>diff --git a/block/sheepdog.c b/block/sheepdog.c
> >>index be3176f..a43b947 100644
> >>--- a/block/sheepdog.c
> >>+++ b/block/sheepdog.c
> >>@@ -37,6 +37,7 @@
> >>  #define SD_OP_READ_VDIS      0x15
> >>  #define SD_OP_FLUSH_VDI      0x16
> >>  #define SD_OP_DEL_VDI        0x17
> >>+#define SD_OP_GET_CLUSTER_DEFAULT   0x18
> >>
> >>  #define SD_FLAG_CMD_WRITE    0x01
> >>  #define SD_FLAG_CMD_COW      0x02
> >>@@ -167,7 +168,8 @@ typedef struct SheepdogVdiReq {
> >>      uint32_t base_vdi_id;
> >>      uint8_t copies;
> >>      uint8_t copy_policy;
> >>-    uint8_t reserved[2];
> >>+    uint8_t store_policy;
> >>+    uint8_t block_size_shift;
> >>      uint32_t snapid;
> >>      uint32_t type;
> >>      uint32_t pad[2];
> >>@@ -186,6 +188,21 @@ typedef struct SheepdogVdiRsp {
> >>      uint32_t pad[5];
> >>  } SheepdogVdiRsp;
> >>
> >>+typedef struct SheepdogClusterRsp {
> >>+    uint8_t proto_ver;
> >>+    uint8_t opcode;
> >>+    uint16_t flags;
> >>+    uint32_t epoch;
> >>+    uint32_t id;
> >>+    uint32_t data_length;
> >>+    uint32_t result;
> >>+    uint8_t nr_copies;
> >>+    uint8_t copy_policy;
> >>+    uint8_t block_size_shift;
> >>+    uint8_t __pad1;
> >>+    uint32_t __pad2[6];
> >>+} SheepdogClusterRsp;
> >>+
> >>  typedef struct SheepdogInode {
> >>      char name[SD_MAX_VDI_LEN];
> >>      char tag[SD_MAX_VDI_TAG_LEN];
> >>@@ -1544,6 +1561,7 @@ static int do_sd_create(BDRVSheepdogState *s, uint32_t *vdi_id, int snapshot,
> >>      hdr.vdi_size = s->inode.vdi_size;
> >>      hdr.copy_policy = s->inode.copy_policy;
> >>      hdr.copies = s->inode.nr_copies;
> >>+    hdr.block_size_shift = s->inode.block_size_shift;
> >>
> >>      ret = do_req(fd, s->aio_context, (SheepdogReq *)&hdr, buf, &wlen, &rlen);
> >>
> >>@@ -1569,9 +1587,12 @@ static int do_sd_create(BDRVSheepdogState *s, uint32_t *vdi_id, int snapshot,
> >>  static int sd_prealloc(const char *filename, Error **errp)
> >>  {
> >>      BlockDriverState *bs = NULL;
> >>+    BDRVSheepdogState *base = NULL;
> >>+    unsigned long buf_size;
> >>      uint32_t idx, max_idx;
> >>+    uint32_t object_size;
> >>      int64_t vdi_size;
> >>-    void *buf = g_malloc0(SD_DATA_OBJ_SIZE);
> >>+    void *buf = NULL;
> >>      int ret;
> >>
> >>      ret = bdrv_open(&bs, filename, NULL, NULL, BDRV_O_RDWR | BDRV_O_PROTOCOL,
> >>@@ -1585,18 +1606,24 @@ static int sd_prealloc(const char *filename, Error **errp)
> >>          ret = vdi_size;
> >>          goto out;
> >>      }
> >>-    max_idx = DIV_ROUND_UP(vdi_size, SD_DATA_OBJ_SIZE);
> >>+
> >>+    base = bs->opaque;
> >>+    object_size = (UINT32_C(1) << base->inode.block_size_shift);
> >>+    buf_size = MIN(object_size, SD_DATA_OBJ_SIZE);
> >>+    buf = g_malloc0(buf_size);
> >>+
> >>+    max_idx = DIV_ROUND_UP(vdi_size, buf_size);
> >>
> >>      for (idx = 0; idx < max_idx; idx++) {
> >>          /*
> >>           * The created image can be a cloned image, so we need to read
> >>           * a data from the source image.
> >>           */
> >>-        ret = bdrv_pread(bs, idx * SD_DATA_OBJ_SIZE, buf, SD_DATA_OBJ_SIZE);
> >>+        ret = bdrv_pread(bs, idx * buf_size, buf, buf_size);
> >>          if (ret < 0) {
> >>              goto out;
> >>          }
> >>-        ret = bdrv_pwrite(bs, idx * SD_DATA_OBJ_SIZE, buf, SD_DATA_OBJ_SIZE);
> >>+        ret = bdrv_pwrite(bs, idx * buf_size, buf, buf_size);
> >>          if (ret < 0) {
> >>              goto out;
> >>          }
> >>@@ -1669,6 +1696,21 @@ static int parse_redundancy(BDRVSheepdogState *s, const char *opt)
> >>      return 0;
> >>  }
> >>
> >>+static int parse_block_size_shift(BDRVSheepdogState *s, QemuOpts *opt)
> >>+{
> >>+    struct SheepdogInode *inode = &s->inode;
> >>+    inode->block_size_shift =
> >>+        (uint8_t)qemu_opt_get_number_del(opt, "block_size_shift", 0);
> >>+    if (inode->block_size_shift == 0) {
> >>+        /* block_size_shift is set for cluster default value by sheepdog */
> >>+        return 0;
> >>+    } else if (inode->block_size_shift < 20 || inode->block_size_shift > 31) {
> >>+        return -EINVAL;
> >>+    }
> >>+
> >>+    return 0;
> >>+}
> >>+
> >>  static int sd_create(const char *filename, QemuOpts *opts,
> >>                       Error **errp)
> >>  {
> >>@@ -1679,6 +1721,7 @@ static int sd_create(const char *filename, QemuOpts *opts,
> >>      BDRVSheepdogState *s;
> >>      char tag[SD_MAX_VDI_TAG_LEN];
> >>      uint32_t snapid;
> >>+    uint64_t max_vdi_size;
> >>      bool prealloc = false;
> >>
> >>      s = g_new0(BDRVSheepdogState, 1);
> >>@@ -1718,9 +1761,10 @@ static int sd_create(const char *filename, QemuOpts *opts,
> >>          }
> >>      }
> >>
> >>-    if (s->inode.vdi_size > SD_MAX_VDI_SIZE) {
> >>-        error_setg(errp, "too big image size");
> >>-        ret = -EINVAL;
> >>+    ret = parse_block_size_shift(s, opts);
> >>+    if (ret < 0) {
> >>+        error_setg(errp, "Invalid block_size_shift:"
> >>+                         " '%s'. please use 20-31", buf);
> >>          goto out;
> >>      }
> >>
> >>@@ -1757,6 +1801,47 @@ static int sd_create(const char *filename, QemuOpts *opts,
> >>      }
> >>
> >>      s->aio_context = qemu_get_aio_context();
> >>+
> >>+    /* if block_size_shift is not specified, get cluster default value */
> >>+    if (s->inode.block_size_shift == 0) {
> >
> >Seems that we don't keep backward compatibility for new QEMU and old sheep that
> >doesn't support SD_OP_GET_CLUSTER_DEFAULT.
> Then, I'll add the operation that if block_size_shift from
> SD_OP_GET_CLUSTER_DEFAULT is zero, block_size_shift is set to 22.
> Is it OK?

If sheep doesn't support SD_OP_GET_CLUSTER_DEFAULT, sheep will return
SD_RES_INVALID_PARMS. So to keep backward compatibility, we shouldn't issue
SD_OP_GET_CLUSTER_DEFAULT to sheep daemon.

My point is that, after user upgrading the sheep and still stick with old QEMU,
(I guess many users will), any operations in the past sholudn't fail right after
upgrading. 

> 
> >
> >What will happen if old QEMU with new sheep that support block_size_shift? Most
> >distributions will ship the old stable qemu that wouldn't be aware of
> >block_size_shift.
> If old QEMU with new sheep is used, VDI is created with cluster default
> block_size_shift and accessed as 4MB object_size.
> So I think that for backward compatibility, users must do cluster
> format command with default block_size_shift 22 equal to
> 4MB object_size.

how old QEMU know about block_size_shift? For old QEMU, block_size_shift is
encoded as 0 and then send the create request to sheep. Does sheep can handle
block_size_shift = 0? You know, we can't pass any value to old QEMU for
block_size_shift.

Thanks
Yuan

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [Qemu-devel] [PATCH v4] sheepdog: selectable object size support
  2015-02-10  8:58     ` Liu Yuan
@ 2015-02-10  9:56       ` Teruaki Ishizaki
  2015-02-10 10:35         ` Liu Yuan
  0 siblings, 1 reply; 28+ messages in thread
From: Teruaki Ishizaki @ 2015-02-10  9:56 UTC (permalink / raw)
  To: Liu Yuan; +Cc: kwolf, mitake.hitoshi, sheepdog, qemu-devel, stefanha

(2015/02/10 17:58), Liu Yuan wrote:
> On Tue, Feb 10, 2015 at 05:22:02PM +0900, Teruaki Ishizaki wrote:
>> (2015/02/10 12:10), Liu Yuan wrote:
>>> On Tue, Jan 27, 2015 at 05:35:27PM +0900, Teruaki Ishizaki wrote:
>>>> Previously, qemu block driver of sheepdog used hard-coded VDI object size.
>>>> This patch enables users to handle "block_size_shift" value for
>>>> calculating VDI object size.
>>>>
>>>> When you start qemu, you don't need to specify additional command option.
>>>>
>>>> But when you create the VDI which doesn't have default object size
>>>> with qemu-img command, you specify block_size_shift option.
>>>>
>>>> If you want to create a VDI of 8MB(1 << 23) object size,
>>>> you need to specify following command option.
>>>>
>>>>   # qemu-img create -o block_size_shift=23 sheepdog:test1 100M
>>>>
>>>> In addition, when you don't specify qemu-img command option,
>>>> a default value of sheepdog cluster is used for creating VDI.
>>>>
>>>>   # qemu-img create sheepdog:test2 100M
>>>>
>>>> Signed-off-by: Teruaki Ishizaki <ishizaki.teruaki@lab.ntt.co.jp>
>>>> ---
>>>> V4:
>>>>   - Limit a read/write buffer size for creating a preallocated VDI.
>>>>   - Replace a parse function for the block_size_shift option.
>>>>   - Fix an error message.
>>>>
>>>> V3:
>>>>   - Delete the needless operation of buffer.
>>>>   - Delete the needless operations of request header.
>>>>     for SD_OP_GET_CLUSTER_DEFAULT.
>>>>   - Fix coding style problems.
>>>>
>>>> V2:
>>>>   - Fix coding style problem (white space).
>>>>   - Add members, store_policy and block_size_shift to struct SheepdogVdiReq.
>>>>   - Initialize request header to use block_size_shift specified by user.
>>>> ---
>>>>   block/sheepdog.c          |  138 ++++++++++++++++++++++++++++++++++++++-------
>>>>   include/block/block_int.h |    1 +
>>>>   2 files changed, 119 insertions(+), 20 deletions(-)
>>>>
>>>> diff --git a/block/sheepdog.c b/block/sheepdog.c
>>>> index be3176f..a43b947 100644
>>>> --- a/block/sheepdog.c
>>>> +++ b/block/sheepdog.c
>>>> @@ -37,6 +37,7 @@
>>>>   #define SD_OP_READ_VDIS      0x15
>>>>   #define SD_OP_FLUSH_VDI      0x16
>>>>   #define SD_OP_DEL_VDI        0x17
>>>> +#define SD_OP_GET_CLUSTER_DEFAULT   0x18
>>>>
>>>>   #define SD_FLAG_CMD_WRITE    0x01
>>>>   #define SD_FLAG_CMD_COW      0x02
>>>> @@ -167,7 +168,8 @@ typedef struct SheepdogVdiReq {
>>>>       uint32_t base_vdi_id;
>>>>       uint8_t copies;
>>>>       uint8_t copy_policy;
>>>> -    uint8_t reserved[2];
>>>> +    uint8_t store_policy;
>>>> +    uint8_t block_size_shift;
>>>>       uint32_t snapid;
>>>>       uint32_t type;
>>>>       uint32_t pad[2];
>>>> @@ -186,6 +188,21 @@ typedef struct SheepdogVdiRsp {
>>>>       uint32_t pad[5];
>>>>   } SheepdogVdiRsp;
>>>>
>>>> +typedef struct SheepdogClusterRsp {
>>>> +    uint8_t proto_ver;
>>>> +    uint8_t opcode;
>>>> +    uint16_t flags;
>>>> +    uint32_t epoch;
>>>> +    uint32_t id;
>>>> +    uint32_t data_length;
>>>> +    uint32_t result;
>>>> +    uint8_t nr_copies;
>>>> +    uint8_t copy_policy;
>>>> +    uint8_t block_size_shift;
>>>> +    uint8_t __pad1;
>>>> +    uint32_t __pad2[6];
>>>> +} SheepdogClusterRsp;
>>>> +
>>>>   typedef struct SheepdogInode {
>>>>       char name[SD_MAX_VDI_LEN];
>>>>       char tag[SD_MAX_VDI_TAG_LEN];
>>>> @@ -1544,6 +1561,7 @@ static int do_sd_create(BDRVSheepdogState *s, uint32_t *vdi_id, int snapshot,
>>>>       hdr.vdi_size = s->inode.vdi_size;
>>>>       hdr.copy_policy = s->inode.copy_policy;
>>>>       hdr.copies = s->inode.nr_copies;
>>>> +    hdr.block_size_shift = s->inode.block_size_shift;
>>>>
>>>>       ret = do_req(fd, s->aio_context, (SheepdogReq *)&hdr, buf, &wlen, &rlen);
>>>>
>>>> @@ -1569,9 +1587,12 @@ static int do_sd_create(BDRVSheepdogState *s, uint32_t *vdi_id, int snapshot,
>>>>   static int sd_prealloc(const char *filename, Error **errp)
>>>>   {
>>>>       BlockDriverState *bs = NULL;
>>>> +    BDRVSheepdogState *base = NULL;
>>>> +    unsigned long buf_size;
>>>>       uint32_t idx, max_idx;
>>>> +    uint32_t object_size;
>>>>       int64_t vdi_size;
>>>> -    void *buf = g_malloc0(SD_DATA_OBJ_SIZE);
>>>> +    void *buf = NULL;
>>>>       int ret;
>>>>
>>>>       ret = bdrv_open(&bs, filename, NULL, NULL, BDRV_O_RDWR | BDRV_O_PROTOCOL,
>>>> @@ -1585,18 +1606,24 @@ static int sd_prealloc(const char *filename, Error **errp)
>>>>           ret = vdi_size;
>>>>           goto out;
>>>>       }
>>>> -    max_idx = DIV_ROUND_UP(vdi_size, SD_DATA_OBJ_SIZE);
>>>> +
>>>> +    base = bs->opaque;
>>>> +    object_size = (UINT32_C(1) << base->inode.block_size_shift);
>>>> +    buf_size = MIN(object_size, SD_DATA_OBJ_SIZE);
>>>> +    buf = g_malloc0(buf_size);
>>>> +
>>>> +    max_idx = DIV_ROUND_UP(vdi_size, buf_size);
>>>>
>>>>       for (idx = 0; idx < max_idx; idx++) {
>>>>           /*
>>>>            * The created image can be a cloned image, so we need to read
>>>>            * a data from the source image.
>>>>            */
>>>> -        ret = bdrv_pread(bs, idx * SD_DATA_OBJ_SIZE, buf, SD_DATA_OBJ_SIZE);
>>>> +        ret = bdrv_pread(bs, idx * buf_size, buf, buf_size);
>>>>           if (ret < 0) {
>>>>               goto out;
>>>>           }
>>>> -        ret = bdrv_pwrite(bs, idx * SD_DATA_OBJ_SIZE, buf, SD_DATA_OBJ_SIZE);
>>>> +        ret = bdrv_pwrite(bs, idx * buf_size, buf, buf_size);
>>>>           if (ret < 0) {
>>>>               goto out;
>>>>           }
>>>> @@ -1669,6 +1696,21 @@ static int parse_redundancy(BDRVSheepdogState *s, const char *opt)
>>>>       return 0;
>>>>   }
>>>>
>>>> +static int parse_block_size_shift(BDRVSheepdogState *s, QemuOpts *opt)
>>>> +{
>>>> +    struct SheepdogInode *inode = &s->inode;
>>>> +    inode->block_size_shift =
>>>> +        (uint8_t)qemu_opt_get_number_del(opt, "block_size_shift", 0);
>>>> +    if (inode->block_size_shift == 0) {
>>>> +        /* block_size_shift is set for cluster default value by sheepdog */
>>>> +        return 0;
>>>> +    } else if (inode->block_size_shift < 20 || inode->block_size_shift > 31) {
>>>> +        return -EINVAL;
>>>> +    }
>>>> +
>>>> +    return 0;
>>>> +}
>>>> +
>>>>   static int sd_create(const char *filename, QemuOpts *opts,
>>>>                        Error **errp)
>>>>   {
>>>> @@ -1679,6 +1721,7 @@ static int sd_create(const char *filename, QemuOpts *opts,
>>>>       BDRVSheepdogState *s;
>>>>       char tag[SD_MAX_VDI_TAG_LEN];
>>>>       uint32_t snapid;
>>>> +    uint64_t max_vdi_size;
>>>>       bool prealloc = false;
>>>>
>>>>       s = g_new0(BDRVSheepdogState, 1);
>>>> @@ -1718,9 +1761,10 @@ static int sd_create(const char *filename, QemuOpts *opts,
>>>>           }
>>>>       }
>>>>
>>>> -    if (s->inode.vdi_size > SD_MAX_VDI_SIZE) {
>>>> -        error_setg(errp, "too big image size");
>>>> -        ret = -EINVAL;
>>>> +    ret = parse_block_size_shift(s, opts);
>>>> +    if (ret < 0) {
>>>> +        error_setg(errp, "Invalid block_size_shift:"
>>>> +                         " '%s'. please use 20-31", buf);
>>>>           goto out;
>>>>       }
>>>>
>>>> @@ -1757,6 +1801,47 @@ static int sd_create(const char *filename, QemuOpts *opts,
>>>>       }
>>>>
>>>>       s->aio_context = qemu_get_aio_context();
>>>> +
>>>> +    /* if block_size_shift is not specified, get cluster default value */
>>>> +    if (s->inode.block_size_shift == 0) {
>>>
>>> Seems that we don't keep backward compatibility for new QEMU and old sheep that
>>> doesn't support SD_OP_GET_CLUSTER_DEFAULT.
>> Then, I'll add the operation that if block_size_shift from
>> SD_OP_GET_CLUSTER_DEFAULT is zero, block_size_shift is set to 22.
>> Is it OK?
>
> If sheep doesn't support SD_OP_GET_CLUSTER_DEFAULT, sheep will return
> SD_RES_INVALID_PARMS. So to keep backward compatibility, we shouldn't issue
> SD_OP_GET_CLUSTER_DEFAULT to sheep daemon.
OK, I think that the way is better.
>
> My point is that, after user upgrading the sheep and still stick with old QEMU,
> (I guess many users will), any operations in the past sholudn't fail right after
> upgrading.
>
>>
>>>
>>> What will happen if old QEMU with new sheep that support block_size_shift? Most
>>> distributions will ship the old stable qemu that wouldn't be aware of
>>> block_size_shift.
>> If old QEMU with new sheep is used, VDI is created with cluster default
>> block_size_shift and accessed as 4MB object_size.
>> So I think that for backward compatibility, users must do cluster
>> format command with default block_size_shift 22 equal to
>> 4MB object_size.
>
> how old QEMU know about block_size_shift? For old QEMU, block_size_shift is
> encoded as 0 and then send the create request to sheep. Does sheep can handle
> block_size_shift = 0? You know, we can't pass any value to old QEMU for
> block_size_shift.
Sheep can handle block_size_shift = 0, when users create new VDI.
Old QEMU does do_sd_create() without setting hdr.block_size_shift and
sends a request SD_OP_NEW_VDI.
If block_size_shift is set to zero, new sheep sets cluster default value
in cluster_new_vdi() like copies and copy_policy.

Thanks,
Teruaki

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [Qemu-devel] [PATCH v4] sheepdog: selectable object size support
  2015-02-10  9:56       ` Teruaki Ishizaki
@ 2015-02-10 10:35         ` Liu Yuan
  2015-02-12  6:19           ` [Qemu-devel] [sheepdog] " Hitoshi Mitake
  0 siblings, 1 reply; 28+ messages in thread
From: Liu Yuan @ 2015-02-10 10:35 UTC (permalink / raw)
  To: Teruaki Ishizaki; +Cc: kwolf, mitake.hitoshi, sheepdog, qemu-devel, stefanha

On Tue, Feb 10, 2015 at 06:56:33PM +0900, Teruaki Ishizaki wrote:
> (2015/02/10 17:58), Liu Yuan wrote:
> >On Tue, Feb 10, 2015 at 05:22:02PM +0900, Teruaki Ishizaki wrote:
> >>(2015/02/10 12:10), Liu Yuan wrote:
> >>>On Tue, Jan 27, 2015 at 05:35:27PM +0900, Teruaki Ishizaki wrote:
> >>>>Previously, qemu block driver of sheepdog used hard-coded VDI object size.
> >>>>This patch enables users to handle "block_size_shift" value for
> >>>>calculating VDI object size.
> >>>>
> >>>>When you start qemu, you don't need to specify additional command option.
> >>>>
> >>>>But when you create the VDI which doesn't have default object size
> >>>>with qemu-img command, you specify block_size_shift option.
> >>>>
> >>>>If you want to create a VDI of 8MB(1 << 23) object size,
> >>>>you need to specify following command option.
> >>>>
> >>>>  # qemu-img create -o block_size_shift=23 sheepdog:test1 100M
> >>>>
> >>>>In addition, when you don't specify qemu-img command option,
> >>>>a default value of sheepdog cluster is used for creating VDI.
> >>>>
> >>>>  # qemu-img create sheepdog:test2 100M
> >>>>
> >>>>Signed-off-by: Teruaki Ishizaki <ishizaki.teruaki@lab.ntt.co.jp>
> >>>>---
> >>>>V4:
> >>>>  - Limit a read/write buffer size for creating a preallocated VDI.
> >>>>  - Replace a parse function for the block_size_shift option.
> >>>>  - Fix an error message.
> >>>>
> >>>>V3:
> >>>>  - Delete the needless operation of buffer.
> >>>>  - Delete the needless operations of request header.
> >>>>    for SD_OP_GET_CLUSTER_DEFAULT.
> >>>>  - Fix coding style problems.
> >>>>
> >>>>V2:
> >>>>  - Fix coding style problem (white space).
> >>>>  - Add members, store_policy and block_size_shift to struct SheepdogVdiReq.
> >>>>  - Initialize request header to use block_size_shift specified by user.
> >>>>---
> >>>>  block/sheepdog.c          |  138 ++++++++++++++++++++++++++++++++++++++-------
> >>>>  include/block/block_int.h |    1 +
> >>>>  2 files changed, 119 insertions(+), 20 deletions(-)
> >>>>
> >>>>diff --git a/block/sheepdog.c b/block/sheepdog.c
> >>>>index be3176f..a43b947 100644
> >>>>--- a/block/sheepdog.c
> >>>>+++ b/block/sheepdog.c
> >>>>@@ -37,6 +37,7 @@
> >>>>  #define SD_OP_READ_VDIS      0x15
> >>>>  #define SD_OP_FLUSH_VDI      0x16
> >>>>  #define SD_OP_DEL_VDI        0x17
> >>>>+#define SD_OP_GET_CLUSTER_DEFAULT   0x18
> >>>>
> >>>>  #define SD_FLAG_CMD_WRITE    0x01
> >>>>  #define SD_FLAG_CMD_COW      0x02
> >>>>@@ -167,7 +168,8 @@ typedef struct SheepdogVdiReq {
> >>>>      uint32_t base_vdi_id;
> >>>>      uint8_t copies;
> >>>>      uint8_t copy_policy;
> >>>>-    uint8_t reserved[2];
> >>>>+    uint8_t store_policy;
> >>>>+    uint8_t block_size_shift;
> >>>>      uint32_t snapid;
> >>>>      uint32_t type;
> >>>>      uint32_t pad[2];
> >>>>@@ -186,6 +188,21 @@ typedef struct SheepdogVdiRsp {
> >>>>      uint32_t pad[5];
> >>>>  } SheepdogVdiRsp;
> >>>>
> >>>>+typedef struct SheepdogClusterRsp {
> >>>>+    uint8_t proto_ver;
> >>>>+    uint8_t opcode;
> >>>>+    uint16_t flags;
> >>>>+    uint32_t epoch;
> >>>>+    uint32_t id;
> >>>>+    uint32_t data_length;
> >>>>+    uint32_t result;
> >>>>+    uint8_t nr_copies;
> >>>>+    uint8_t copy_policy;
> >>>>+    uint8_t block_size_shift;
> >>>>+    uint8_t __pad1;
> >>>>+    uint32_t __pad2[6];
> >>>>+} SheepdogClusterRsp;
> >>>>+
> >>>>  typedef struct SheepdogInode {
> >>>>      char name[SD_MAX_VDI_LEN];
> >>>>      char tag[SD_MAX_VDI_TAG_LEN];
> >>>>@@ -1544,6 +1561,7 @@ static int do_sd_create(BDRVSheepdogState *s, uint32_t *vdi_id, int snapshot,
> >>>>      hdr.vdi_size = s->inode.vdi_size;
> >>>>      hdr.copy_policy = s->inode.copy_policy;
> >>>>      hdr.copies = s->inode.nr_copies;
> >>>>+    hdr.block_size_shift = s->inode.block_size_shift;
> >>>>
> >>>>      ret = do_req(fd, s->aio_context, (SheepdogReq *)&hdr, buf, &wlen, &rlen);
> >>>>
> >>>>@@ -1569,9 +1587,12 @@ static int do_sd_create(BDRVSheepdogState *s, uint32_t *vdi_id, int snapshot,
> >>>>  static int sd_prealloc(const char *filename, Error **errp)
> >>>>  {
> >>>>      BlockDriverState *bs = NULL;
> >>>>+    BDRVSheepdogState *base = NULL;
> >>>>+    unsigned long buf_size;
> >>>>      uint32_t idx, max_idx;
> >>>>+    uint32_t object_size;
> >>>>      int64_t vdi_size;
> >>>>-    void *buf = g_malloc0(SD_DATA_OBJ_SIZE);
> >>>>+    void *buf = NULL;
> >>>>      int ret;
> >>>>
> >>>>      ret = bdrv_open(&bs, filename, NULL, NULL, BDRV_O_RDWR | BDRV_O_PROTOCOL,
> >>>>@@ -1585,18 +1606,24 @@ static int sd_prealloc(const char *filename, Error **errp)
> >>>>          ret = vdi_size;
> >>>>          goto out;
> >>>>      }
> >>>>-    max_idx = DIV_ROUND_UP(vdi_size, SD_DATA_OBJ_SIZE);
> >>>>+
> >>>>+    base = bs->opaque;
> >>>>+    object_size = (UINT32_C(1) << base->inode.block_size_shift);
> >>>>+    buf_size = MIN(object_size, SD_DATA_OBJ_SIZE);
> >>>>+    buf = g_malloc0(buf_size);
> >>>>+
> >>>>+    max_idx = DIV_ROUND_UP(vdi_size, buf_size);
> >>>>
> >>>>      for (idx = 0; idx < max_idx; idx++) {
> >>>>          /*
> >>>>           * The created image can be a cloned image, so we need to read
> >>>>           * a data from the source image.
> >>>>           */
> >>>>-        ret = bdrv_pread(bs, idx * SD_DATA_OBJ_SIZE, buf, SD_DATA_OBJ_SIZE);
> >>>>+        ret = bdrv_pread(bs, idx * buf_size, buf, buf_size);
> >>>>          if (ret < 0) {
> >>>>              goto out;
> >>>>          }
> >>>>-        ret = bdrv_pwrite(bs, idx * SD_DATA_OBJ_SIZE, buf, SD_DATA_OBJ_SIZE);
> >>>>+        ret = bdrv_pwrite(bs, idx * buf_size, buf, buf_size);
> >>>>          if (ret < 0) {
> >>>>              goto out;
> >>>>          }
> >>>>@@ -1669,6 +1696,21 @@ static int parse_redundancy(BDRVSheepdogState *s, const char *opt)
> >>>>      return 0;
> >>>>  }
> >>>>
> >>>>+static int parse_block_size_shift(BDRVSheepdogState *s, QemuOpts *opt)
> >>>>+{
> >>>>+    struct SheepdogInode *inode = &s->inode;
> >>>>+    inode->block_size_shift =
> >>>>+        (uint8_t)qemu_opt_get_number_del(opt, "block_size_shift", 0);
> >>>>+    if (inode->block_size_shift == 0) {
> >>>>+        /* block_size_shift is set for cluster default value by sheepdog */
> >>>>+        return 0;
> >>>>+    } else if (inode->block_size_shift < 20 || inode->block_size_shift > 31) {
> >>>>+        return -EINVAL;
> >>>>+    }
> >>>>+
> >>>>+    return 0;
> >>>>+}
> >>>>+
> >>>>  static int sd_create(const char *filename, QemuOpts *opts,
> >>>>                       Error **errp)
> >>>>  {
> >>>>@@ -1679,6 +1721,7 @@ static int sd_create(const char *filename, QemuOpts *opts,
> >>>>      BDRVSheepdogState *s;
> >>>>      char tag[SD_MAX_VDI_TAG_LEN];
> >>>>      uint32_t snapid;
> >>>>+    uint64_t max_vdi_size;
> >>>>      bool prealloc = false;
> >>>>
> >>>>      s = g_new0(BDRVSheepdogState, 1);
> >>>>@@ -1718,9 +1761,10 @@ static int sd_create(const char *filename, QemuOpts *opts,
> >>>>          }
> >>>>      }
> >>>>
> >>>>-    if (s->inode.vdi_size > SD_MAX_VDI_SIZE) {
> >>>>-        error_setg(errp, "too big image size");
> >>>>-        ret = -EINVAL;
> >>>>+    ret = parse_block_size_shift(s, opts);
> >>>>+    if (ret < 0) {
> >>>>+        error_setg(errp, "Invalid block_size_shift:"
> >>>>+                         " '%s'. please use 20-31", buf);
> >>>>          goto out;
> >>>>      }
> >>>>
> >>>>@@ -1757,6 +1801,47 @@ static int sd_create(const char *filename, QemuOpts *opts,
> >>>>      }
> >>>>
> >>>>      s->aio_context = qemu_get_aio_context();
> >>>>+
> >>>>+    /* if block_size_shift is not specified, get cluster default value */
> >>>>+    if (s->inode.block_size_shift == 0) {
> >>>
> >>>Seems that we don't keep backward compatibility for new QEMU and old sheep that
> >>>doesn't support SD_OP_GET_CLUSTER_DEFAULT.
> >>Then, I'll add the operation that if block_size_shift from
> >>SD_OP_GET_CLUSTER_DEFAULT is zero, block_size_shift is set to 22.
> >>Is it OK?
> >
> >If sheep doesn't support SD_OP_GET_CLUSTER_DEFAULT, sheep will return
> >SD_RES_INVALID_PARMS. So to keep backward compatibility, we shouldn't issue
> >SD_OP_GET_CLUSTER_DEFAULT to sheep daemon.
> OK, I think that the way is better.
> >
> >My point is that, after user upgrading the sheep and still stick with old QEMU,
> >(I guess many users will), any operations in the past sholudn't fail right after
> >upgrading.
> >
> >>
> >>>
> >>>What will happen if old QEMU with new sheep that support block_size_shift? Most
> >>>distributions will ship the old stable qemu that wouldn't be aware of
> >>>block_size_shift.
> >>If old QEMU with new sheep is used, VDI is created with cluster default
> >>block_size_shift and accessed as 4MB object_size.
> >>So I think that for backward compatibility, users must do cluster
> >>format command with default block_size_shift 22 equal to
> >>4MB object_size.
> >
> >how old QEMU know about block_size_shift? For old QEMU, block_size_shift is
> >encoded as 0 and then send the create request to sheep. Does sheep can handle
> >block_size_shift = 0? You know, we can't pass any value to old QEMU for
> >block_size_shift.
> Sheep can handle block_size_shift = 0, when users create new VDI.
> Old QEMU does do_sd_create() without setting hdr.block_size_shift and
> sends a request SD_OP_NEW_VDI.
> If block_size_shift is set to zero, new sheep sets cluster default value
> in cluster_new_vdi() like copies and copy_policy.

Okay, so new sheep can handle old QEMU request. By the way, how about the
suggestion in my last email? (x + 1) * 4MB stuff...

Thanks
Yuan

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [Qemu-devel] [PATCH v4] sheepdog: selectable object size support
  2015-01-27  8:35 [Qemu-devel] [PATCH v4] sheepdog: selectable object size support Teruaki Ishizaki
  2015-02-02  6:52 ` Liu Yuan
  2015-02-10  3:10 ` Liu Yuan
@ 2015-02-10 11:12 ` Liu Yuan
  2015-02-12  1:51   ` Teruaki Ishizaki
  2 siblings, 1 reply; 28+ messages in thread
From: Liu Yuan @ 2015-02-10 11:12 UTC (permalink / raw)
  To: Teruaki Ishizaki; +Cc: kwolf, mitake.hitoshi, sheepdog, qemu-devel, stefanha

On Tue, Jan 27, 2015 at 05:35:27PM +0900, Teruaki Ishizaki wrote:
> Previously, qemu block driver of sheepdog used hard-coded VDI object size.
> This patch enables users to handle "block_size_shift" value for
> calculating VDI object size.
> 
> When you start qemu, you don't need to specify additional command option.
> 
> But when you create the VDI which doesn't have default object size
> with qemu-img command, you specify block_size_shift option.
> 
> If you want to create a VDI of 8MB(1 << 23) object size,
> you need to specify following command option.
> 
>  # qemu-img create -o block_size_shift=23 sheepdog:test1 100M
> 
> In addition, when you don't specify qemu-img command option,
> a default value of sheepdog cluster is used for creating VDI.
> 
>  # qemu-img create sheepdog:test2 100M
> 
> Signed-off-by: Teruaki Ishizaki <ishizaki.teruaki@lab.ntt.co.jp>
> ---
> V4:
>  - Limit a read/write buffer size for creating a preallocated VDI.
>  - Replace a parse function for the block_size_shift option.
>  - Fix an error message.
> 
> V3:
>  - Delete the needless operation of buffer.
>  - Delete the needless operations of request header.
>    for SD_OP_GET_CLUSTER_DEFAULT.
>  - Fix coding style problems.
> 
> V2:
>  - Fix coding style problem (white space).
>  - Add members, store_policy and block_size_shift to struct SheepdogVdiReq.
>  - Initialize request header to use block_size_shift specified by user.
> ---
>  block/sheepdog.c          |  138 ++++++++++++++++++++++++++++++++++++++-------
>  include/block/block_int.h |    1 +
>  2 files changed, 119 insertions(+), 20 deletions(-)
> 
> diff --git a/block/sheepdog.c b/block/sheepdog.c
> index be3176f..a43b947 100644
> --- a/block/sheepdog.c
> +++ b/block/sheepdog.c
> @@ -37,6 +37,7 @@
>  #define SD_OP_READ_VDIS      0x15
>  #define SD_OP_FLUSH_VDI      0x16
>  #define SD_OP_DEL_VDI        0x17
> +#define SD_OP_GET_CLUSTER_DEFAULT   0x18

This might not be necessary. For old qemu or the qemu-img without setting
option, the block_size_shift will be 0.

If we make 0 to represent 4MB object, then we don't need to get the default
cluster object size.

We migth even get rid of the idea of cluster default size. The downsize is that,
if we want to create a vdi with different size not the default 4MB,
we have to write it every time for qemu-img or dog.

If we choose to keep the idea of cluster default size, I think we'd also try to
avoid call this request from QEMU to make backward compatibility easier. In this
scenario, 0 might be used to ask new sheep to decide to use cluster default size.

Both old qemu and new QEMU will send 0 to sheep and both old and new sheep can
handle 0 though it has different meanings. 

Table for this bit as 0:
Qe: qemu
SD: Sheep daemon
CDS: Cluster Default Size
Ign: Ignored by the sheep daemon

Qe/sd   new    old
new     CDS    Ign
old     CDS    NULL

I think this approach is acceptable. The difference to your patch is that
we don't send SD_OP_GET_CLUSTER_DEFAULT to sheep daemon and
SD_OP_GET_CLUSTER_DEFAULT can be removed.

Thanks
Yuan

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [Qemu-devel] [PATCH v4] sheepdog: selectable object size support
  2015-02-10 11:12 ` [Qemu-devel] " Liu Yuan
@ 2015-02-12  1:51   ` Teruaki Ishizaki
  2015-02-12  2:19     ` Liu Yuan
  0 siblings, 1 reply; 28+ messages in thread
From: Teruaki Ishizaki @ 2015-02-12  1:51 UTC (permalink / raw)
  To: Liu Yuan; +Cc: kwolf, mitake.hitoshi, sheepdog, qemu-devel, stefanha

(2015/02/10 20:12), Liu Yuan wrote:
> On Tue, Jan 27, 2015 at 05:35:27PM +0900, Teruaki Ishizaki wrote:
>> Previously, qemu block driver of sheepdog used hard-coded VDI object size.
>> This patch enables users to handle "block_size_shift" value for
>> calculating VDI object size.
>>
>> When you start qemu, you don't need to specify additional command option.
>>
>> But when you create the VDI which doesn't have default object size
>> with qemu-img command, you specify block_size_shift option.
>>
>> If you want to create a VDI of 8MB(1 << 23) object size,
>> you need to specify following command option.
>>
>>   # qemu-img create -o block_size_shift=23 sheepdog:test1 100M
>>
>> In addition, when you don't specify qemu-img command option,
>> a default value of sheepdog cluster is used for creating VDI.
>>
>>   # qemu-img create sheepdog:test2 100M
>>
>> Signed-off-by: Teruaki Ishizaki <ishizaki.teruaki@lab.ntt.co.jp>
>> ---
>> V4:
>>   - Limit a read/write buffer size for creating a preallocated VDI.
>>   - Replace a parse function for the block_size_shift option.
>>   - Fix an error message.
>>
>> V3:
>>   - Delete the needless operation of buffer.
>>   - Delete the needless operations of request header.
>>     for SD_OP_GET_CLUSTER_DEFAULT.
>>   - Fix coding style problems.
>>
>> V2:
>>   - Fix coding style problem (white space).
>>   - Add members, store_policy and block_size_shift to struct SheepdogVdiReq.
>>   - Initialize request header to use block_size_shift specified by user.
>> ---
>>   block/sheepdog.c          |  138 ++++++++++++++++++++++++++++++++++++++-------
>>   include/block/block_int.h |    1 +
>>   2 files changed, 119 insertions(+), 20 deletions(-)
>>
>> diff --git a/block/sheepdog.c b/block/sheepdog.c
>> index be3176f..a43b947 100644
>> --- a/block/sheepdog.c
>> +++ b/block/sheepdog.c
>> @@ -37,6 +37,7 @@
>>   #define SD_OP_READ_VDIS      0x15
>>   #define SD_OP_FLUSH_VDI      0x16
>>   #define SD_OP_DEL_VDI        0x17
>> +#define SD_OP_GET_CLUSTER_DEFAULT   0x18
>
> This might not be necessary. For old qemu or the qemu-img without setting
> option, the block_size_shift will be 0.
>
> If we make 0 to represent 4MB object, then we don't need to get the default
> cluster object size.
>
> We migth even get rid of the idea of cluster default size. The downsize is that,
> if we want to create a vdi with different size not the default 4MB,
> we have to write it every time for qemu-img or dog.
>
> If we choose to keep the idea of cluster default size, I think we'd also try to
> avoid call this request from QEMU to make backward compatibility easier. In this
> scenario, 0 might be used to ask new sheep to decide to use cluster default size.
>
> Both old qemu and new QEMU will send 0 to sheep and both old and new sheep can
> handle 0 though it has different meanings.
>
> Table for this bit as 0:
> Qe: qemu
> SD: Sheep daemon
> CDS: Cluster Default Size
> Ign: Ignored by the sheep daemon
>
> Qe/sd   new    old
> new     CDS    Ign
> old     CDS    NULL
Does Ign mean that VDI is handled as 4MB object size?

>
> I think this approach is acceptable. The difference to your patch is that
> we don't send SD_OP_GET_CLUSTER_DEFAULT to sheep daemon and
> SD_OP_GET_CLUSTER_DEFAULT can be removed.
When users create a new VDI with qemu-img, qemu's Sheepdog backend
driver calculates max limit VDI size.
But if block_size_shift option is not specified, qemu's Sheepdog backend
driver can't calculate max limit VDI size.

So, I think that qemu's Sheepdog backend driver must get cluster default
value from sheep daemon.

Thanks,
Teruaki

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [Qemu-devel] [PATCH v4] sheepdog: selectable object size support
  2015-02-12  1:51   ` Teruaki Ishizaki
@ 2015-02-12  2:19     ` Liu Yuan
  2015-02-12  2:33       ` Teruaki Ishizaki
  0 siblings, 1 reply; 28+ messages in thread
From: Liu Yuan @ 2015-02-12  2:19 UTC (permalink / raw)
  To: Teruaki Ishizaki; +Cc: kwolf, mitake.hitoshi, sheepdog, qemu-devel, stefanha

On Thu, Feb 12, 2015 at 10:51:25AM +0900, Teruaki Ishizaki wrote:
> (2015/02/10 20:12), Liu Yuan wrote:
> >On Tue, Jan 27, 2015 at 05:35:27PM +0900, Teruaki Ishizaki wrote:
> >>Previously, qemu block driver of sheepdog used hard-coded VDI object size.
> >>This patch enables users to handle "block_size_shift" value for
> >>calculating VDI object size.
> >>
> >>When you start qemu, you don't need to specify additional command option.
> >>
> >>But when you create the VDI which doesn't have default object size
> >>with qemu-img command, you specify block_size_shift option.
> >>
> >>If you want to create a VDI of 8MB(1 << 23) object size,
> >>you need to specify following command option.
> >>
> >>  # qemu-img create -o block_size_shift=23 sheepdog:test1 100M
> >>
> >>In addition, when you don't specify qemu-img command option,
> >>a default value of sheepdog cluster is used for creating VDI.
> >>
> >>  # qemu-img create sheepdog:test2 100M
> >>
> >>Signed-off-by: Teruaki Ishizaki <ishizaki.teruaki@lab.ntt.co.jp>
> >>---
> >>V4:
> >>  - Limit a read/write buffer size for creating a preallocated VDI.
> >>  - Replace a parse function for the block_size_shift option.
> >>  - Fix an error message.
> >>
> >>V3:
> >>  - Delete the needless operation of buffer.
> >>  - Delete the needless operations of request header.
> >>    for SD_OP_GET_CLUSTER_DEFAULT.
> >>  - Fix coding style problems.
> >>
> >>V2:
> >>  - Fix coding style problem (white space).
> >>  - Add members, store_policy and block_size_shift to struct SheepdogVdiReq.
> >>  - Initialize request header to use block_size_shift specified by user.
> >>---
> >>  block/sheepdog.c          |  138 ++++++++++++++++++++++++++++++++++++++-------
> >>  include/block/block_int.h |    1 +
> >>  2 files changed, 119 insertions(+), 20 deletions(-)
> >>
> >>diff --git a/block/sheepdog.c b/block/sheepdog.c
> >>index be3176f..a43b947 100644
> >>--- a/block/sheepdog.c
> >>+++ b/block/sheepdog.c
> >>@@ -37,6 +37,7 @@
> >>  #define SD_OP_READ_VDIS      0x15
> >>  #define SD_OP_FLUSH_VDI      0x16
> >>  #define SD_OP_DEL_VDI        0x17
> >>+#define SD_OP_GET_CLUSTER_DEFAULT   0x18
> >
> >This might not be necessary. For old qemu or the qemu-img without setting
> >option, the block_size_shift will be 0.
> >
> >If we make 0 to represent 4MB object, then we don't need to get the default
> >cluster object size.
> >
> >We migth even get rid of the idea of cluster default size. The downsize is that,
> >if we want to create a vdi with different size not the default 4MB,
> >we have to write it every time for qemu-img or dog.
> >
> >If we choose to keep the idea of cluster default size, I think we'd also try to
> >avoid call this request from QEMU to make backward compatibility easier. In this
> >scenario, 0 might be used to ask new sheep to decide to use cluster default size.
> >
> >Both old qemu and new QEMU will send 0 to sheep and both old and new sheep can
> >handle 0 though it has different meanings.
> >
> >Table for this bit as 0:
> >Qe: qemu
> >SD: Sheep daemon
> >CDS: Cluster Default Size
> >Ign: Ignored by the sheep daemon
> >
> >Qe/sd   new    old
> >new     CDS    Ign
> >old     CDS    NULL
> Does Ign mean that VDI is handled as 4MB object size?

Yes, old sheep can only handle 4MB object and doesn't check this field at all.

> 
> >
> >I think this approach is acceptable. The difference to your patch is that
> >we don't send SD_OP_GET_CLUSTER_DEFAULT to sheep daemon and
> >SD_OP_GET_CLUSTER_DEFAULT can be removed.
> When users create a new VDI with qemu-img, qemu's Sheepdog backend
> driver calculates max limit VDI size.

> But if block_size_shift option is not specified, qemu's Sheepdog backend
> driver can't calculate max limit VDI size.

If block_size_shift not specified, this means

1 for old sheep, use 4MB size
2 for new sheep, use cluster wide default value.

And sheep then can calculate it on its own, no?

Thanks
Yuan

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [Qemu-devel] [PATCH v4] sheepdog: selectable object size support
  2015-02-12  2:19     ` Liu Yuan
@ 2015-02-12  2:33       ` Teruaki Ishizaki
  2015-02-12  2:55         ` Liu Yuan
  0 siblings, 1 reply; 28+ messages in thread
From: Teruaki Ishizaki @ 2015-02-12  2:33 UTC (permalink / raw)
  To: Liu Yuan; +Cc: kwolf, mitake.hitoshi, sheepdog, qemu-devel, stefanha

(2015/02/12 11:19), Liu Yuan wrote:
> On Thu, Feb 12, 2015 at 10:51:25AM +0900, Teruaki Ishizaki wrote:
>> (2015/02/10 20:12), Liu Yuan wrote:
>>> On Tue, Jan 27, 2015 at 05:35:27PM +0900, Teruaki Ishizaki wrote:
>>>> Previously, qemu block driver of sheepdog used hard-coded VDI object size.
>>>> This patch enables users to handle "block_size_shift" value for
>>>> calculating VDI object size.
>>>>
>>>> When you start qemu, you don't need to specify additional command option.
>>>>
>>>> But when you create the VDI which doesn't have default object size
>>>> with qemu-img command, you specify block_size_shift option.
>>>>
>>>> If you want to create a VDI of 8MB(1 << 23) object size,
>>>> you need to specify following command option.
>>>>
>>>>   # qemu-img create -o block_size_shift=23 sheepdog:test1 100M
>>>>
>>>> In addition, when you don't specify qemu-img command option,
>>>> a default value of sheepdog cluster is used for creating VDI.
>>>>
>>>>   # qemu-img create sheepdog:test2 100M
>>>>
>>>> Signed-off-by: Teruaki Ishizaki <ishizaki.teruaki@lab.ntt.co.jp>
>>>> ---
>>>> V4:
>>>>   - Limit a read/write buffer size for creating a preallocated VDI.
>>>>   - Replace a parse function for the block_size_shift option.
>>>>   - Fix an error message.
>>>>
>>>> V3:
>>>>   - Delete the needless operation of buffer.
>>>>   - Delete the needless operations of request header.
>>>>     for SD_OP_GET_CLUSTER_DEFAULT.
>>>>   - Fix coding style problems.
>>>>
>>>> V2:
>>>>   - Fix coding style problem (white space).
>>>>   - Add members, store_policy and block_size_shift to struct SheepdogVdiReq.
>>>>   - Initialize request header to use block_size_shift specified by user.
>>>> ---
>>>>   block/sheepdog.c          |  138 ++++++++++++++++++++++++++++++++++++++-------
>>>>   include/block/block_int.h |    1 +
>>>>   2 files changed, 119 insertions(+), 20 deletions(-)
>>>>
>>>> diff --git a/block/sheepdog.c b/block/sheepdog.c
>>>> index be3176f..a43b947 100644
>>>> --- a/block/sheepdog.c
>>>> +++ b/block/sheepdog.c
>>>> @@ -37,6 +37,7 @@
>>>>   #define SD_OP_READ_VDIS      0x15
>>>>   #define SD_OP_FLUSH_VDI      0x16
>>>>   #define SD_OP_DEL_VDI        0x17
>>>> +#define SD_OP_GET_CLUSTER_DEFAULT   0x18
>>>
>>> This might not be necessary. For old qemu or the qemu-img without setting
>>> option, the block_size_shift will be 0.
>>>
>>> If we make 0 to represent 4MB object, then we don't need to get the default
>>> cluster object size.
>>>
>>> We migth even get rid of the idea of cluster default size. The downsize is that,
>>> if we want to create a vdi with different size not the default 4MB,
>>> we have to write it every time for qemu-img or dog.
>>>
>>> If we choose to keep the idea of cluster default size, I think we'd also try to
>>> avoid call this request from QEMU to make backward compatibility easier. In this
>>> scenario, 0 might be used to ask new sheep to decide to use cluster default size.
>>>
>>> Both old qemu and new QEMU will send 0 to sheep and both old and new sheep can
>>> handle 0 though it has different meanings.
>>>
>>> Table for this bit as 0:
>>> Qe: qemu
>>> SD: Sheep daemon
>>> CDS: Cluster Default Size
>>> Ign: Ignored by the sheep daemon
>>>
>>> Qe/sd   new    old
>>> new     CDS    Ign
>>> old     CDS    NULL
>> Does Ign mean that VDI is handled as 4MB object size?
>
> Yes, old sheep can only handle 4MB object and doesn't check this field at all.
>
>>
>>>
>>> I think this approach is acceptable. The difference to your patch is that
>>> we don't send SD_OP_GET_CLUSTER_DEFAULT to sheep daemon and
>>> SD_OP_GET_CLUSTER_DEFAULT can be removed.
>> When users create a new VDI with qemu-img, qemu's Sheepdog backend
>> driver calculates max limit VDI size.
>
>> But if block_size_shift option is not specified, qemu's Sheepdog backend
>> driver can't calculate max limit VDI size.
>
> If block_size_shift not specified, this means
>
> 1 for old sheep, use 4MB size
> 2 for new sheep, use cluster wide default value.
>
> And sheep then can calculate it on its own, no?
>
Dog command(client) calculate max size, so I think
that qemu's Sheepdog backend driver should calculate it
like dog command.

Is that policy changeable?
Is there no policy?

Thanks,
Teruaki

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [Qemu-devel] [PATCH v4] sheepdog: selectable object size support
  2015-02-12  2:33       ` Teruaki Ishizaki
@ 2015-02-12  2:55         ` Liu Yuan
  2015-02-13  1:33           ` Teruaki Ishizaki
  0 siblings, 1 reply; 28+ messages in thread
From: Liu Yuan @ 2015-02-12  2:55 UTC (permalink / raw)
  To: Teruaki Ishizaki; +Cc: kwolf, mitake.hitoshi, sheepdog, qemu-devel, stefanha

On Thu, Feb 12, 2015 at 11:33:16AM +0900, Teruaki Ishizaki wrote:
> (2015/02/12 11:19), Liu Yuan wrote:
> >On Thu, Feb 12, 2015 at 10:51:25AM +0900, Teruaki Ishizaki wrote:
> >>(2015/02/10 20:12), Liu Yuan wrote:
> >>>On Tue, Jan 27, 2015 at 05:35:27PM +0900, Teruaki Ishizaki wrote:
> >>>>Previously, qemu block driver of sheepdog used hard-coded VDI object size.
> >>>>This patch enables users to handle "block_size_shift" value for
> >>>>calculating VDI object size.
> >>>>
> >>>>When you start qemu, you don't need to specify additional command option.
> >>>>
> >>>>But when you create the VDI which doesn't have default object size
> >>>>with qemu-img command, you specify block_size_shift option.
> >>>>
> >>>>If you want to create a VDI of 8MB(1 << 23) object size,
> >>>>you need to specify following command option.
> >>>>
> >>>>  # qemu-img create -o block_size_shift=23 sheepdog:test1 100M
> >>>>
> >>>>In addition, when you don't specify qemu-img command option,
> >>>>a default value of sheepdog cluster is used for creating VDI.
> >>>>
> >>>>  # qemu-img create sheepdog:test2 100M
> >>>>
> >>>>Signed-off-by: Teruaki Ishizaki <ishizaki.teruaki@lab.ntt.co.jp>
> >>>>---
> >>>>V4:
> >>>>  - Limit a read/write buffer size for creating a preallocated VDI.
> >>>>  - Replace a parse function for the block_size_shift option.
> >>>>  - Fix an error message.
> >>>>
> >>>>V3:
> >>>>  - Delete the needless operation of buffer.
> >>>>  - Delete the needless operations of request header.
> >>>>    for SD_OP_GET_CLUSTER_DEFAULT.
> >>>>  - Fix coding style problems.
> >>>>
> >>>>V2:
> >>>>  - Fix coding style problem (white space).
> >>>>  - Add members, store_policy and block_size_shift to struct SheepdogVdiReq.
> >>>>  - Initialize request header to use block_size_shift specified by user.
> >>>>---
> >>>>  block/sheepdog.c          |  138 ++++++++++++++++++++++++++++++++++++++-------
> >>>>  include/block/block_int.h |    1 +
> >>>>  2 files changed, 119 insertions(+), 20 deletions(-)
> >>>>
> >>>>diff --git a/block/sheepdog.c b/block/sheepdog.c
> >>>>index be3176f..a43b947 100644
> >>>>--- a/block/sheepdog.c
> >>>>+++ b/block/sheepdog.c
> >>>>@@ -37,6 +37,7 @@
> >>>>  #define SD_OP_READ_VDIS      0x15
> >>>>  #define SD_OP_FLUSH_VDI      0x16
> >>>>  #define SD_OP_DEL_VDI        0x17
> >>>>+#define SD_OP_GET_CLUSTER_DEFAULT   0x18
> >>>
> >>>This might not be necessary. For old qemu or the qemu-img without setting
> >>>option, the block_size_shift will be 0.
> >>>
> >>>If we make 0 to represent 4MB object, then we don't need to get the default
> >>>cluster object size.
> >>>
> >>>We migth even get rid of the idea of cluster default size. The downsize is that,
> >>>if we want to create a vdi with different size not the default 4MB,
> >>>we have to write it every time for qemu-img or dog.
> >>>
> >>>If we choose to keep the idea of cluster default size, I think we'd also try to
> >>>avoid call this request from QEMU to make backward compatibility easier. In this
> >>>scenario, 0 might be used to ask new sheep to decide to use cluster default size.
> >>>
> >>>Both old qemu and new QEMU will send 0 to sheep and both old and new sheep can
> >>>handle 0 though it has different meanings.
> >>>
> >>>Table for this bit as 0:
> >>>Qe: qemu
> >>>SD: Sheep daemon
> >>>CDS: Cluster Default Size
> >>>Ign: Ignored by the sheep daemon
> >>>
> >>>Qe/sd   new    old
> >>>new     CDS    Ign
> >>>old     CDS    NULL
> >>Does Ign mean that VDI is handled as 4MB object size?
> >
> >Yes, old sheep can only handle 4MB object and doesn't check this field at all.
> >
> >>
> >>>
> >>>I think this approach is acceptable. The difference to your patch is that
> >>>we don't send SD_OP_GET_CLUSTER_DEFAULT to sheep daemon and
> >>>SD_OP_GET_CLUSTER_DEFAULT can be removed.
> >>When users create a new VDI with qemu-img, qemu's Sheepdog backend
> >>driver calculates max limit VDI size.
> >
> >>But if block_size_shift option is not specified, qemu's Sheepdog backend
> >>driver can't calculate max limit VDI size.
> >
> >If block_size_shift not specified, this means
> >
> >1 for old sheep, use 4MB size
> >2 for new sheep, use cluster wide default value.
> >
> >And sheep then can calculate it on its own, no?
> >
> Dog command(client) calculate max size, so I think
> that qemu's Sheepdog backend driver should calculate it
> like dog command.
> 
> Is that policy changeable?

I checked the QEMU code and got your idea. In the past it was fixed size so very
easy to hardcode the check in the client, no communication with sheep needed.

Yes, if it is reasonable, we can change it.

I think we can push the size calculation logic into sheep, if not the right size
return INVALID_PARAMETER to clients. Clients just check this and report error
back to users.

There is no backward compability for this approach, since 4MB is the smallest
size.

OLD QEMU will limit the max_size as 4TB, which is no problem for new sheep.

Thanks
Yuan

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [Qemu-devel] [sheepdog] [PATCH v4] sheepdog: selectable object size support
  2015-02-10 10:35         ` Liu Yuan
@ 2015-02-12  6:19           ` Hitoshi Mitake
  2015-02-12  7:00             ` Liu Yuan
  0 siblings, 1 reply; 28+ messages in thread
From: Hitoshi Mitake @ 2015-02-12  6:19 UTC (permalink / raw)
  To: Liu Yuan
  Cc: kwolf, sheepdog, mitake.hitoshi, qemu-devel, stefanha, Teruaki Ishizaki

At Tue, 10 Feb 2015 18:35:58 +0800,
Liu Yuan wrote:
> 
> On Tue, Feb 10, 2015 at 06:56:33PM +0900, Teruaki Ishizaki wrote:
> > (2015/02/10 17:58), Liu Yuan wrote:
> > >On Tue, Feb 10, 2015 at 05:22:02PM +0900, Teruaki Ishizaki wrote:
> > >>(2015/02/10 12:10), Liu Yuan wrote:
> > >>>On Tue, Jan 27, 2015 at 05:35:27PM +0900, Teruaki Ishizaki wrote:
> > >>>>Previously, qemu block driver of sheepdog used hard-coded VDI object size.
> > >>>>This patch enables users to handle "block_size_shift" value for
> > >>>>calculating VDI object size.
> > >>>>
> > >>>>When you start qemu, you don't need to specify additional command option.
> > >>>>
> > >>>>But when you create the VDI which doesn't have default object size
> > >>>>with qemu-img command, you specify block_size_shift option.
> > >>>>
> > >>>>If you want to create a VDI of 8MB(1 << 23) object size,
> > >>>>you need to specify following command option.
> > >>>>
> > >>>>  # qemu-img create -o block_size_shift=23 sheepdog:test1 100M
> > >>>>
> > >>>>In addition, when you don't specify qemu-img command option,
> > >>>>a default value of sheepdog cluster is used for creating VDI.
> > >>>>
> > >>>>  # qemu-img create sheepdog:test2 100M
> > >>>>
> > >>>>Signed-off-by: Teruaki Ishizaki <ishizaki.teruaki@lab.ntt.co.jp>
> > >>>>---
> > >>>>V4:
> > >>>>  - Limit a read/write buffer size for creating a preallocated VDI.
> > >>>>  - Replace a parse function for the block_size_shift option.
> > >>>>  - Fix an error message.
> > >>>>
> > >>>>V3:
> > >>>>  - Delete the needless operation of buffer.
> > >>>>  - Delete the needless operations of request header.
> > >>>>    for SD_OP_GET_CLUSTER_DEFAULT.
> > >>>>  - Fix coding style problems.
> > >>>>
> > >>>>V2:
> > >>>>  - Fix coding style problem (white space).
> > >>>>  - Add members, store_policy and block_size_shift to struct SheepdogVdiReq.
> > >>>>  - Initialize request header to use block_size_shift specified by user.
> > >>>>---
> > >>>>  block/sheepdog.c          |  138 ++++++++++++++++++++++++++++++++++++++-------
> > >>>>  include/block/block_int.h |    1 +
> > >>>>  2 files changed, 119 insertions(+), 20 deletions(-)
> > >>>>
> > >>>>diff --git a/block/sheepdog.c b/block/sheepdog.c
> > >>>>index be3176f..a43b947 100644
> > >>>>--- a/block/sheepdog.c
> > >>>>+++ b/block/sheepdog.c
> > >>>>@@ -37,6 +37,7 @@
> > >>>>  #define SD_OP_READ_VDIS      0x15
> > >>>>  #define SD_OP_FLUSH_VDI      0x16
> > >>>>  #define SD_OP_DEL_VDI        0x17
> > >>>>+#define SD_OP_GET_CLUSTER_DEFAULT   0x18
> > >>>>
> > >>>>  #define SD_FLAG_CMD_WRITE    0x01
> > >>>>  #define SD_FLAG_CMD_COW      0x02
> > >>>>@@ -167,7 +168,8 @@ typedef struct SheepdogVdiReq {
> > >>>>      uint32_t base_vdi_id;
> > >>>>      uint8_t copies;
> > >>>>      uint8_t copy_policy;
> > >>>>-    uint8_t reserved[2];
> > >>>>+    uint8_t store_policy;
> > >>>>+    uint8_t block_size_shift;
> > >>>>      uint32_t snapid;
> > >>>>      uint32_t type;
> > >>>>      uint32_t pad[2];
> > >>>>@@ -186,6 +188,21 @@ typedef struct SheepdogVdiRsp {
> > >>>>      uint32_t pad[5];
> > >>>>  } SheepdogVdiRsp;
> > >>>>
> > >>>>+typedef struct SheepdogClusterRsp {
> > >>>>+    uint8_t proto_ver;
> > >>>>+    uint8_t opcode;
> > >>>>+    uint16_t flags;
> > >>>>+    uint32_t epoch;
> > >>>>+    uint32_t id;
> > >>>>+    uint32_t data_length;
> > >>>>+    uint32_t result;
> > >>>>+    uint8_t nr_copies;
> > >>>>+    uint8_t copy_policy;
> > >>>>+    uint8_t block_size_shift;
> > >>>>+    uint8_t __pad1;
> > >>>>+    uint32_t __pad2[6];
> > >>>>+} SheepdogClusterRsp;
> > >>>>+
> > >>>>  typedef struct SheepdogInode {
> > >>>>      char name[SD_MAX_VDI_LEN];
> > >>>>      char tag[SD_MAX_VDI_TAG_LEN];
> > >>>>@@ -1544,6 +1561,7 @@ static int do_sd_create(BDRVSheepdogState *s, uint32_t *vdi_id, int snapshot,
> > >>>>      hdr.vdi_size = s->inode.vdi_size;
> > >>>>      hdr.copy_policy = s->inode.copy_policy;
> > >>>>      hdr.copies = s->inode.nr_copies;
> > >>>>+    hdr.block_size_shift = s->inode.block_size_shift;
> > >>>>
> > >>>>      ret = do_req(fd, s->aio_context, (SheepdogReq *)&hdr, buf, &wlen, &rlen);
> > >>>>
> > >>>>@@ -1569,9 +1587,12 @@ static int do_sd_create(BDRVSheepdogState *s, uint32_t *vdi_id, int snapshot,
> > >>>>  static int sd_prealloc(const char *filename, Error **errp)
> > >>>>  {
> > >>>>      BlockDriverState *bs = NULL;
> > >>>>+    BDRVSheepdogState *base = NULL;
> > >>>>+    unsigned long buf_size;
> > >>>>      uint32_t idx, max_idx;
> > >>>>+    uint32_t object_size;
> > >>>>      int64_t vdi_size;
> > >>>>-    void *buf = g_malloc0(SD_DATA_OBJ_SIZE);
> > >>>>+    void *buf = NULL;
> > >>>>      int ret;
> > >>>>
> > >>>>      ret = bdrv_open(&bs, filename, NULL, NULL, BDRV_O_RDWR | BDRV_O_PROTOCOL,
> > >>>>@@ -1585,18 +1606,24 @@ static int sd_prealloc(const char *filename, Error **errp)
> > >>>>          ret = vdi_size;
> > >>>>          goto out;
> > >>>>      }
> > >>>>-    max_idx = DIV_ROUND_UP(vdi_size, SD_DATA_OBJ_SIZE);
> > >>>>+
> > >>>>+    base = bs->opaque;
> > >>>>+    object_size = (UINT32_C(1) << base->inode.block_size_shift);
> > >>>>+    buf_size = MIN(object_size, SD_DATA_OBJ_SIZE);
> > >>>>+    buf = g_malloc0(buf_size);
> > >>>>+
> > >>>>+    max_idx = DIV_ROUND_UP(vdi_size, buf_size);
> > >>>>
> > >>>>      for (idx = 0; idx < max_idx; idx++) {
> > >>>>          /*
> > >>>>           * The created image can be a cloned image, so we need to read
> > >>>>           * a data from the source image.
> > >>>>           */
> > >>>>-        ret = bdrv_pread(bs, idx * SD_DATA_OBJ_SIZE, buf, SD_DATA_OBJ_SIZE);
> > >>>>+        ret = bdrv_pread(bs, idx * buf_size, buf, buf_size);
> > >>>>          if (ret < 0) {
> > >>>>              goto out;
> > >>>>          }
> > >>>>-        ret = bdrv_pwrite(bs, idx * SD_DATA_OBJ_SIZE, buf, SD_DATA_OBJ_SIZE);
> > >>>>+        ret = bdrv_pwrite(bs, idx * buf_size, buf, buf_size);
> > >>>>          if (ret < 0) {
> > >>>>              goto out;
> > >>>>          }
> > >>>>@@ -1669,6 +1696,21 @@ static int parse_redundancy(BDRVSheepdogState *s, const char *opt)
> > >>>>      return 0;
> > >>>>  }
> > >>>>
> > >>>>+static int parse_block_size_shift(BDRVSheepdogState *s, QemuOpts *opt)
> > >>>>+{
> > >>>>+    struct SheepdogInode *inode = &s->inode;
> > >>>>+    inode->block_size_shift =
> > >>>>+        (uint8_t)qemu_opt_get_number_del(opt, "block_size_shift", 0);
> > >>>>+    if (inode->block_size_shift == 0) {
> > >>>>+        /* block_size_shift is set for cluster default value by sheepdog */
> > >>>>+        return 0;
> > >>>>+    } else if (inode->block_size_shift < 20 || inode->block_size_shift > 31) {
> > >>>>+        return -EINVAL;
> > >>>>+    }
> > >>>>+
> > >>>>+    return 0;
> > >>>>+}
> > >>>>+
> > >>>>  static int sd_create(const char *filename, QemuOpts *opts,
> > >>>>                       Error **errp)
> > >>>>  {
> > >>>>@@ -1679,6 +1721,7 @@ static int sd_create(const char *filename, QemuOpts *opts,
> > >>>>      BDRVSheepdogState *s;
> > >>>>      char tag[SD_MAX_VDI_TAG_LEN];
> > >>>>      uint32_t snapid;
> > >>>>+    uint64_t max_vdi_size;
> > >>>>      bool prealloc = false;
> > >>>>
> > >>>>      s = g_new0(BDRVSheepdogState, 1);
> > >>>>@@ -1718,9 +1761,10 @@ static int sd_create(const char *filename, QemuOpts *opts,
> > >>>>          }
> > >>>>      }
> > >>>>
> > >>>>-    if (s->inode.vdi_size > SD_MAX_VDI_SIZE) {
> > >>>>-        error_setg(errp, "too big image size");
> > >>>>-        ret = -EINVAL;
> > >>>>+    ret = parse_block_size_shift(s, opts);
> > >>>>+    if (ret < 0) {
> > >>>>+        error_setg(errp, "Invalid block_size_shift:"
> > >>>>+                         " '%s'. please use 20-31", buf);
> > >>>>          goto out;
> > >>>>      }
> > >>>>
> > >>>>@@ -1757,6 +1801,47 @@ static int sd_create(const char *filename, QemuOpts *opts,
> > >>>>      }
> > >>>>
> > >>>>      s->aio_context = qemu_get_aio_context();
> > >>>>+
> > >>>>+    /* if block_size_shift is not specified, get cluster default value */
> > >>>>+    if (s->inode.block_size_shift == 0) {
> > >>>
> > >>>Seems that we don't keep backward compatibility for new QEMU and old sheep that
> > >>>doesn't support SD_OP_GET_CLUSTER_DEFAULT.
> > >>Then, I'll add the operation that if block_size_shift from
> > >>SD_OP_GET_CLUSTER_DEFAULT is zero, block_size_shift is set to 22.
> > >>Is it OK?
> > >
> > >If sheep doesn't support SD_OP_GET_CLUSTER_DEFAULT, sheep will return
> > >SD_RES_INVALID_PARMS. So to keep backward compatibility, we shouldn't issue
> > >SD_OP_GET_CLUSTER_DEFAULT to sheep daemon.
> > OK, I think that the way is better.
> > >
> > >My point is that, after user upgrading the sheep and still stick with old QEMU,
> > >(I guess many users will), any operations in the past sholudn't fail right after
> > >upgrading.
> > >
> > >>
> > >>>
> > >>>What will happen if old QEMU with new sheep that support block_size_shift? Most
> > >>>distributions will ship the old stable qemu that wouldn't be aware of
> > >>>block_size_shift.
> > >>If old QEMU with new sheep is used, VDI is created with cluster default
> > >>block_size_shift and accessed as 4MB object_size.
> > >>So I think that for backward compatibility, users must do cluster
> > >>format command with default block_size_shift 22 equal to
> > >>4MB object_size.
> > >
> > >how old QEMU know about block_size_shift? For old QEMU, block_size_shift is
> > >encoded as 0 and then send the create request to sheep. Does sheep can handle
> > >block_size_shift = 0? You know, we can't pass any value to old QEMU for
> > >block_size_shift.
> > Sheep can handle block_size_shift = 0, when users create new VDI.
> > Old QEMU does do_sd_create() without setting hdr.block_size_shift and
> > sends a request SD_OP_NEW_VDI.
> > If block_size_shift is set to zero, new sheep sets cluster default value
> > in cluster_new_vdi() like copies and copy_policy.
> 
> Okay, so new sheep can handle old QEMU request. By the way, how about the
> suggestion in my last email? (x + 1) * 4MB stuff...

I think specifying object size in multiple of 4MB is overkill. Because
we can specify both of block_size_shift and a number of objects which
belong to VDI. More precisely,
2 ^ block_size_shift * #objects = VDI size
we can choose the block_size_shift between 20 and 30, and #objects
from 1 < 2 ^ 20.
# #objects is specified via VDI size implicitly
The granualarity of VDI sizes seems to be really fine (even
block_size_shift == 30, step is 1GB). So supporting block size with
multiple of 4MB will not provide practical benefit.

Expression of the block size is another topic. I don't have strong
opinion about this.

Thanks,
Hitoshi

> 
> Thanks
> Yuan
> -- 
> sheepdog mailing list
> sheepdog@lists.wpkg.org
> https://lists.wpkg.org/mailman/listinfo/sheepdog

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [Qemu-devel] [sheepdog] [PATCH v4] sheepdog: selectable object size support
  2015-02-12  6:19           ` [Qemu-devel] [sheepdog] " Hitoshi Mitake
@ 2015-02-12  7:00             ` Liu Yuan
  2015-02-12  7:28               ` Hitoshi Mitake
  0 siblings, 1 reply; 28+ messages in thread
From: Liu Yuan @ 2015-02-12  7:00 UTC (permalink / raw)
  To: Hitoshi Mitake; +Cc: kwolf, sheepdog, Teruaki Ishizaki, qemu-devel, stefanha

On Thu, Feb 12, 2015 at 03:19:21PM +0900, Hitoshi Mitake wrote:
> At Tue, 10 Feb 2015 18:35:58 +0800,
> Liu Yuan wrote:
> > 
> > On Tue, Feb 10, 2015 at 06:56:33PM +0900, Teruaki Ishizaki wrote:
> > > (2015/02/10 17:58), Liu Yuan wrote:
> > > >On Tue, Feb 10, 2015 at 05:22:02PM +0900, Teruaki Ishizaki wrote:
> > > >>(2015/02/10 12:10), Liu Yuan wrote:
> > > >>>On Tue, Jan 27, 2015 at 05:35:27PM +0900, Teruaki Ishizaki wrote:
> > > >>>>Previously, qemu block driver of sheepdog used hard-coded VDI object size.
> > > >>>>This patch enables users to handle "block_size_shift" value for
> > > >>>>calculating VDI object size.
> > > >>>>
> > > >>>>When you start qemu, you don't need to specify additional command option.
> > > >>>>
> > > >>>>But when you create the VDI which doesn't have default object size
> > > >>>>with qemu-img command, you specify block_size_shift option.
> > > >>>>
> > > >>>>If you want to create a VDI of 8MB(1 << 23) object size,
> > > >>>>you need to specify following command option.
> > > >>>>
> > > >>>>  # qemu-img create -o block_size_shift=23 sheepdog:test1 100M
> > > >>>>
> > > >>>>In addition, when you don't specify qemu-img command option,
> > > >>>>a default value of sheepdog cluster is used for creating VDI.
> > > >>>>
> > > >>>>  # qemu-img create sheepdog:test2 100M
> > > >>>>
> > > >>>>Signed-off-by: Teruaki Ishizaki <ishizaki.teruaki@lab.ntt.co.jp>
> > > >>>>---
> > > >>>>V4:
> > > >>>>  - Limit a read/write buffer size for creating a preallocated VDI.
> > > >>>>  - Replace a parse function for the block_size_shift option.
> > > >>>>  - Fix an error message.
> > > >>>>
> > > >>>>V3:
> > > >>>>  - Delete the needless operation of buffer.
> > > >>>>  - Delete the needless operations of request header.
> > > >>>>    for SD_OP_GET_CLUSTER_DEFAULT.
> > > >>>>  - Fix coding style problems.
> > > >>>>
> > > >>>>V2:
> > > >>>>  - Fix coding style problem (white space).
> > > >>>>  - Add members, store_policy and block_size_shift to struct SheepdogVdiReq.
> > > >>>>  - Initialize request header to use block_size_shift specified by user.
> > > >>>>---
> > > >>>>  block/sheepdog.c          |  138 ++++++++++++++++++++++++++++++++++++++-------
> > > >>>>  include/block/block_int.h |    1 +
> > > >>>>  2 files changed, 119 insertions(+), 20 deletions(-)
> > > >>>>
> > > >>>>diff --git a/block/sheepdog.c b/block/sheepdog.c
> > > >>>>index be3176f..a43b947 100644
> > > >>>>--- a/block/sheepdog.c
> > > >>>>+++ b/block/sheepdog.c
> > > >>>>@@ -37,6 +37,7 @@
> > > >>>>  #define SD_OP_READ_VDIS      0x15
> > > >>>>  #define SD_OP_FLUSH_VDI      0x16
> > > >>>>  #define SD_OP_DEL_VDI        0x17
> > > >>>>+#define SD_OP_GET_CLUSTER_DEFAULT   0x18
> > > >>>>
> > > >>>>  #define SD_FLAG_CMD_WRITE    0x01
> > > >>>>  #define SD_FLAG_CMD_COW      0x02
> > > >>>>@@ -167,7 +168,8 @@ typedef struct SheepdogVdiReq {
> > > >>>>      uint32_t base_vdi_id;
> > > >>>>      uint8_t copies;
> > > >>>>      uint8_t copy_policy;
> > > >>>>-    uint8_t reserved[2];
> > > >>>>+    uint8_t store_policy;
> > > >>>>+    uint8_t block_size_shift;
> > > >>>>      uint32_t snapid;
> > > >>>>      uint32_t type;
> > > >>>>      uint32_t pad[2];
> > > >>>>@@ -186,6 +188,21 @@ typedef struct SheepdogVdiRsp {
> > > >>>>      uint32_t pad[5];
> > > >>>>  } SheepdogVdiRsp;
> > > >>>>
> > > >>>>+typedef struct SheepdogClusterRsp {
> > > >>>>+    uint8_t proto_ver;
> > > >>>>+    uint8_t opcode;
> > > >>>>+    uint16_t flags;
> > > >>>>+    uint32_t epoch;
> > > >>>>+    uint32_t id;
> > > >>>>+    uint32_t data_length;
> > > >>>>+    uint32_t result;
> > > >>>>+    uint8_t nr_copies;
> > > >>>>+    uint8_t copy_policy;
> > > >>>>+    uint8_t block_size_shift;
> > > >>>>+    uint8_t __pad1;
> > > >>>>+    uint32_t __pad2[6];
> > > >>>>+} SheepdogClusterRsp;
> > > >>>>+
> > > >>>>  typedef struct SheepdogInode {
> > > >>>>      char name[SD_MAX_VDI_LEN];
> > > >>>>      char tag[SD_MAX_VDI_TAG_LEN];
> > > >>>>@@ -1544,6 +1561,7 @@ static int do_sd_create(BDRVSheepdogState *s, uint32_t *vdi_id, int snapshot,
> > > >>>>      hdr.vdi_size = s->inode.vdi_size;
> > > >>>>      hdr.copy_policy = s->inode.copy_policy;
> > > >>>>      hdr.copies = s->inode.nr_copies;
> > > >>>>+    hdr.block_size_shift = s->inode.block_size_shift;
> > > >>>>
> > > >>>>      ret = do_req(fd, s->aio_context, (SheepdogReq *)&hdr, buf, &wlen, &rlen);
> > > >>>>
> > > >>>>@@ -1569,9 +1587,12 @@ static int do_sd_create(BDRVSheepdogState *s, uint32_t *vdi_id, int snapshot,
> > > >>>>  static int sd_prealloc(const char *filename, Error **errp)
> > > >>>>  {
> > > >>>>      BlockDriverState *bs = NULL;
> > > >>>>+    BDRVSheepdogState *base = NULL;
> > > >>>>+    unsigned long buf_size;
> > > >>>>      uint32_t idx, max_idx;
> > > >>>>+    uint32_t object_size;
> > > >>>>      int64_t vdi_size;
> > > >>>>-    void *buf = g_malloc0(SD_DATA_OBJ_SIZE);
> > > >>>>+    void *buf = NULL;
> > > >>>>      int ret;
> > > >>>>
> > > >>>>      ret = bdrv_open(&bs, filename, NULL, NULL, BDRV_O_RDWR | BDRV_O_PROTOCOL,
> > > >>>>@@ -1585,18 +1606,24 @@ static int sd_prealloc(const char *filename, Error **errp)
> > > >>>>          ret = vdi_size;
> > > >>>>          goto out;
> > > >>>>      }
> > > >>>>-    max_idx = DIV_ROUND_UP(vdi_size, SD_DATA_OBJ_SIZE);
> > > >>>>+
> > > >>>>+    base = bs->opaque;
> > > >>>>+    object_size = (UINT32_C(1) << base->inode.block_size_shift);
> > > >>>>+    buf_size = MIN(object_size, SD_DATA_OBJ_SIZE);
> > > >>>>+    buf = g_malloc0(buf_size);
> > > >>>>+
> > > >>>>+    max_idx = DIV_ROUND_UP(vdi_size, buf_size);
> > > >>>>
> > > >>>>      for (idx = 0; idx < max_idx; idx++) {
> > > >>>>          /*
> > > >>>>           * The created image can be a cloned image, so we need to read
> > > >>>>           * a data from the source image.
> > > >>>>           */
> > > >>>>-        ret = bdrv_pread(bs, idx * SD_DATA_OBJ_SIZE, buf, SD_DATA_OBJ_SIZE);
> > > >>>>+        ret = bdrv_pread(bs, idx * buf_size, buf, buf_size);
> > > >>>>          if (ret < 0) {
> > > >>>>              goto out;
> > > >>>>          }
> > > >>>>-        ret = bdrv_pwrite(bs, idx * SD_DATA_OBJ_SIZE, buf, SD_DATA_OBJ_SIZE);
> > > >>>>+        ret = bdrv_pwrite(bs, idx * buf_size, buf, buf_size);
> > > >>>>          if (ret < 0) {
> > > >>>>              goto out;
> > > >>>>          }
> > > >>>>@@ -1669,6 +1696,21 @@ static int parse_redundancy(BDRVSheepdogState *s, const char *opt)
> > > >>>>      return 0;
> > > >>>>  }
> > > >>>>
> > > >>>>+static int parse_block_size_shift(BDRVSheepdogState *s, QemuOpts *opt)
> > > >>>>+{
> > > >>>>+    struct SheepdogInode *inode = &s->inode;
> > > >>>>+    inode->block_size_shift =
> > > >>>>+        (uint8_t)qemu_opt_get_number_del(opt, "block_size_shift", 0);
> > > >>>>+    if (inode->block_size_shift == 0) {
> > > >>>>+        /* block_size_shift is set for cluster default value by sheepdog */
> > > >>>>+        return 0;
> > > >>>>+    } else if (inode->block_size_shift < 20 || inode->block_size_shift > 31) {
> > > >>>>+        return -EINVAL;
> > > >>>>+    }
> > > >>>>+
> > > >>>>+    return 0;
> > > >>>>+}
> > > >>>>+
> > > >>>>  static int sd_create(const char *filename, QemuOpts *opts,
> > > >>>>                       Error **errp)
> > > >>>>  {
> > > >>>>@@ -1679,6 +1721,7 @@ static int sd_create(const char *filename, QemuOpts *opts,
> > > >>>>      BDRVSheepdogState *s;
> > > >>>>      char tag[SD_MAX_VDI_TAG_LEN];
> > > >>>>      uint32_t snapid;
> > > >>>>+    uint64_t max_vdi_size;
> > > >>>>      bool prealloc = false;
> > > >>>>
> > > >>>>      s = g_new0(BDRVSheepdogState, 1);
> > > >>>>@@ -1718,9 +1761,10 @@ static int sd_create(const char *filename, QemuOpts *opts,
> > > >>>>          }
> > > >>>>      }
> > > >>>>
> > > >>>>-    if (s->inode.vdi_size > SD_MAX_VDI_SIZE) {
> > > >>>>-        error_setg(errp, "too big image size");
> > > >>>>-        ret = -EINVAL;
> > > >>>>+    ret = parse_block_size_shift(s, opts);
> > > >>>>+    if (ret < 0) {
> > > >>>>+        error_setg(errp, "Invalid block_size_shift:"
> > > >>>>+                         " '%s'. please use 20-31", buf);
> > > >>>>          goto out;
> > > >>>>      }
> > > >>>>
> > > >>>>@@ -1757,6 +1801,47 @@ static int sd_create(const char *filename, QemuOpts *opts,
> > > >>>>      }
> > > >>>>
> > > >>>>      s->aio_context = qemu_get_aio_context();
> > > >>>>+
> > > >>>>+    /* if block_size_shift is not specified, get cluster default value */
> > > >>>>+    if (s->inode.block_size_shift == 0) {
> > > >>>
> > > >>>Seems that we don't keep backward compatibility for new QEMU and old sheep that
> > > >>>doesn't support SD_OP_GET_CLUSTER_DEFAULT.
> > > >>Then, I'll add the operation that if block_size_shift from
> > > >>SD_OP_GET_CLUSTER_DEFAULT is zero, block_size_shift is set to 22.
> > > >>Is it OK?
> > > >
> > > >If sheep doesn't support SD_OP_GET_CLUSTER_DEFAULT, sheep will return
> > > >SD_RES_INVALID_PARMS. So to keep backward compatibility, we shouldn't issue
> > > >SD_OP_GET_CLUSTER_DEFAULT to sheep daemon.
> > > OK, I think that the way is better.
> > > >
> > > >My point is that, after user upgrading the sheep and still stick with old QEMU,
> > > >(I guess many users will), any operations in the past sholudn't fail right after
> > > >upgrading.
> > > >
> > > >>
> > > >>>
> > > >>>What will happen if old QEMU with new sheep that support block_size_shift? Most
> > > >>>distributions will ship the old stable qemu that wouldn't be aware of
> > > >>>block_size_shift.
> > > >>If old QEMU with new sheep is used, VDI is created with cluster default
> > > >>block_size_shift and accessed as 4MB object_size.
> > > >>So I think that for backward compatibility, users must do cluster
> > > >>format command with default block_size_shift 22 equal to
> > > >>4MB object_size.
> > > >
> > > >how old QEMU know about block_size_shift? For old QEMU, block_size_shift is
> > > >encoded as 0 and then send the create request to sheep. Does sheep can handle
> > > >block_size_shift = 0? You know, we can't pass any value to old QEMU for
> > > >block_size_shift.
> > > Sheep can handle block_size_shift = 0, when users create new VDI.
> > > Old QEMU does do_sd_create() without setting hdr.block_size_shift and
> > > sends a request SD_OP_NEW_VDI.
> > > If block_size_shift is set to zero, new sheep sets cluster default value
> > > in cluster_new_vdi() like copies and copy_policy.
> > 
> > Okay, so new sheep can handle old QEMU request. By the way, how about the
> > suggestion in my last email? (x + 1) * 4MB stuff...
> 
> I think specifying object size in multiple of 4MB is overkill. Because
> we can specify both of block_size_shift and a number of objects which
> belong to VDI. More precisely,
> 2 ^ block_size_shift * #objects = VDI size
> we can choose the block_size_shift between 20 and 30, and #objects
> from 1 < 2 ^ 20.
> # #objects is specified via VDI size implicitly

I can't understand this paragraph. If object size is fixed, we can calculate
# of object easily. It has nothing to do with block_size_shift.

> The granualarity of VDI sizes seems to be really fine (even
> block_size_shift == 30, step is 1GB). So supporting block size with
> multiple of 4MB will not provide practical benefit.

Prabably yes, but finer granularity doesn't cause trouble and gives more
flexibility. We can allow 12MB at user's will, for example. Even it doesn't
provide practical benefits, what benefit block_size_shift will provide over
(x + 1) * 4MB scheme? My point is, give a wider range won't hurt and will
pose less constaint in the future.

The main reason I suggest (x + 1) * 4MB, is that we can easily keep backward
compatibility because x = 0 means 4MB, but with this patch proposal, x = 22
means 4MB.

> Expression of the block size is another topic. I don't have strong
> opinion about this.

I'm fine with whatever way we choose to represent object size internally,
instead I'm more concerned about the user option form to specify object size.
Even if we choose block_size_shift, we should make the option user friendly as
much as possible. Expose block_size_shift to users is not a good idea.

Thanks
Yuan

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [Qemu-devel] [sheepdog] [PATCH v4] sheepdog: selectable object size support
  2015-02-12  7:00             ` Liu Yuan
@ 2015-02-12  7:28               ` Hitoshi Mitake
  2015-02-12  7:42                 ` Liu Yuan
  0 siblings, 1 reply; 28+ messages in thread
From: Hitoshi Mitake @ 2015-02-12  7:28 UTC (permalink / raw)
  To: Liu Yuan
  Cc: kwolf, Teruaki Ishizaki, Hitoshi Mitake, qemu-devel, stefanha, sheepdog

At Thu, 12 Feb 2015 15:00:49 +0800,
Liu Yuan wrote:
> 
> On Thu, Feb 12, 2015 at 03:19:21PM +0900, Hitoshi Mitake wrote:
> > At Tue, 10 Feb 2015 18:35:58 +0800,
> > Liu Yuan wrote:
> > > 
> > > On Tue, Feb 10, 2015 at 06:56:33PM +0900, Teruaki Ishizaki wrote:
> > > > (2015/02/10 17:58), Liu Yuan wrote:
> > > > >On Tue, Feb 10, 2015 at 05:22:02PM +0900, Teruaki Ishizaki wrote:
> > > > >>(2015/02/10 12:10), Liu Yuan wrote:
> > > > >>>On Tue, Jan 27, 2015 at 05:35:27PM +0900, Teruaki Ishizaki wrote:
> > > > >>>>Previously, qemu block driver of sheepdog used hard-coded VDI object size.
> > > > >>>>This patch enables users to handle "block_size_shift" value for
> > > > >>>>calculating VDI object size.
> > > > >>>>
> > > > >>>>When you start qemu, you don't need to specify additional command option.
> > > > >>>>
> > > > >>>>But when you create the VDI which doesn't have default object size
> > > > >>>>with qemu-img command, you specify block_size_shift option.
> > > > >>>>
> > > > >>>>If you want to create a VDI of 8MB(1 << 23) object size,
> > > > >>>>you need to specify following command option.
> > > > >>>>
> > > > >>>>  # qemu-img create -o block_size_shift=23 sheepdog:test1 100M
> > > > >>>>
> > > > >>>>In addition, when you don't specify qemu-img command option,
> > > > >>>>a default value of sheepdog cluster is used for creating VDI.
> > > > >>>>
> > > > >>>>  # qemu-img create sheepdog:test2 100M
> > > > >>>>
> > > > >>>>Signed-off-by: Teruaki Ishizaki <ishizaki.teruaki@lab.ntt.co.jp>
> > > > >>>>---
> > > > >>>>V4:
> > > > >>>>  - Limit a read/write buffer size for creating a preallocated VDI.
> > > > >>>>  - Replace a parse function for the block_size_shift option.
> > > > >>>>  - Fix an error message.
> > > > >>>>
> > > > >>>>V3:
> > > > >>>>  - Delete the needless operation of buffer.
> > > > >>>>  - Delete the needless operations of request header.
> > > > >>>>    for SD_OP_GET_CLUSTER_DEFAULT.
> > > > >>>>  - Fix coding style problems.
> > > > >>>>
> > > > >>>>V2:
> > > > >>>>  - Fix coding style problem (white space).
> > > > >>>>  - Add members, store_policy and block_size_shift to struct SheepdogVdiReq.
> > > > >>>>  - Initialize request header to use block_size_shift specified by user.
> > > > >>>>---
> > > > >>>>  block/sheepdog.c          |  138 ++++++++++++++++++++++++++++++++++++++-------
> > > > >>>>  include/block/block_int.h |    1 +
> > > > >>>>  2 files changed, 119 insertions(+), 20 deletions(-)
> > > > >>>>
> > > > >>>>diff --git a/block/sheepdog.c b/block/sheepdog.c
> > > > >>>>index be3176f..a43b947 100644
> > > > >>>>--- a/block/sheepdog.c
> > > > >>>>+++ b/block/sheepdog.c
> > > > >>>>@@ -37,6 +37,7 @@
> > > > >>>>  #define SD_OP_READ_VDIS      0x15
> > > > >>>>  #define SD_OP_FLUSH_VDI      0x16
> > > > >>>>  #define SD_OP_DEL_VDI        0x17
> > > > >>>>+#define SD_OP_GET_CLUSTER_DEFAULT   0x18
> > > > >>>>
> > > > >>>>  #define SD_FLAG_CMD_WRITE    0x01
> > > > >>>>  #define SD_FLAG_CMD_COW      0x02
> > > > >>>>@@ -167,7 +168,8 @@ typedef struct SheepdogVdiReq {
> > > > >>>>      uint32_t base_vdi_id;
> > > > >>>>      uint8_t copies;
> > > > >>>>      uint8_t copy_policy;
> > > > >>>>-    uint8_t reserved[2];
> > > > >>>>+    uint8_t store_policy;
> > > > >>>>+    uint8_t block_size_shift;
> > > > >>>>      uint32_t snapid;
> > > > >>>>      uint32_t type;
> > > > >>>>      uint32_t pad[2];
> > > > >>>>@@ -186,6 +188,21 @@ typedef struct SheepdogVdiRsp {
> > > > >>>>      uint32_t pad[5];
> > > > >>>>  } SheepdogVdiRsp;
> > > > >>>>
> > > > >>>>+typedef struct SheepdogClusterRsp {
> > > > >>>>+    uint8_t proto_ver;
> > > > >>>>+    uint8_t opcode;
> > > > >>>>+    uint16_t flags;
> > > > >>>>+    uint32_t epoch;
> > > > >>>>+    uint32_t id;
> > > > >>>>+    uint32_t data_length;
> > > > >>>>+    uint32_t result;
> > > > >>>>+    uint8_t nr_copies;
> > > > >>>>+    uint8_t copy_policy;
> > > > >>>>+    uint8_t block_size_shift;
> > > > >>>>+    uint8_t __pad1;
> > > > >>>>+    uint32_t __pad2[6];
> > > > >>>>+} SheepdogClusterRsp;
> > > > >>>>+
> > > > >>>>  typedef struct SheepdogInode {
> > > > >>>>      char name[SD_MAX_VDI_LEN];
> > > > >>>>      char tag[SD_MAX_VDI_TAG_LEN];
> > > > >>>>@@ -1544,6 +1561,7 @@ static int do_sd_create(BDRVSheepdogState *s, uint32_t *vdi_id, int snapshot,
> > > > >>>>      hdr.vdi_size = s->inode.vdi_size;
> > > > >>>>      hdr.copy_policy = s->inode.copy_policy;
> > > > >>>>      hdr.copies = s->inode.nr_copies;
> > > > >>>>+    hdr.block_size_shift = s->inode.block_size_shift;
> > > > >>>>
> > > > >>>>      ret = do_req(fd, s->aio_context, (SheepdogReq *)&hdr, buf, &wlen, &rlen);
> > > > >>>>
> > > > >>>>@@ -1569,9 +1587,12 @@ static int do_sd_create(BDRVSheepdogState *s, uint32_t *vdi_id, int snapshot,
> > > > >>>>  static int sd_prealloc(const char *filename, Error **errp)
> > > > >>>>  {
> > > > >>>>      BlockDriverState *bs = NULL;
> > > > >>>>+    BDRVSheepdogState *base = NULL;
> > > > >>>>+    unsigned long buf_size;
> > > > >>>>      uint32_t idx, max_idx;
> > > > >>>>+    uint32_t object_size;
> > > > >>>>      int64_t vdi_size;
> > > > >>>>-    void *buf = g_malloc0(SD_DATA_OBJ_SIZE);
> > > > >>>>+    void *buf = NULL;
> > > > >>>>      int ret;
> > > > >>>>
> > > > >>>>      ret = bdrv_open(&bs, filename, NULL, NULL, BDRV_O_RDWR | BDRV_O_PROTOCOL,
> > > > >>>>@@ -1585,18 +1606,24 @@ static int sd_prealloc(const char *filename, Error **errp)
> > > > >>>>          ret = vdi_size;
> > > > >>>>          goto out;
> > > > >>>>      }
> > > > >>>>-    max_idx = DIV_ROUND_UP(vdi_size, SD_DATA_OBJ_SIZE);
> > > > >>>>+
> > > > >>>>+    base = bs->opaque;
> > > > >>>>+    object_size = (UINT32_C(1) << base->inode.block_size_shift);
> > > > >>>>+    buf_size = MIN(object_size, SD_DATA_OBJ_SIZE);
> > > > >>>>+    buf = g_malloc0(buf_size);
> > > > >>>>+
> > > > >>>>+    max_idx = DIV_ROUND_UP(vdi_size, buf_size);
> > > > >>>>
> > > > >>>>      for (idx = 0; idx < max_idx; idx++) {
> > > > >>>>          /*
> > > > >>>>           * The created image can be a cloned image, so we need to read
> > > > >>>>           * a data from the source image.
> > > > >>>>           */
> > > > >>>>-        ret = bdrv_pread(bs, idx * SD_DATA_OBJ_SIZE, buf, SD_DATA_OBJ_SIZE);
> > > > >>>>+        ret = bdrv_pread(bs, idx * buf_size, buf, buf_size);
> > > > >>>>          if (ret < 0) {
> > > > >>>>              goto out;
> > > > >>>>          }
> > > > >>>>-        ret = bdrv_pwrite(bs, idx * SD_DATA_OBJ_SIZE, buf, SD_DATA_OBJ_SIZE);
> > > > >>>>+        ret = bdrv_pwrite(bs, idx * buf_size, buf, buf_size);
> > > > >>>>          if (ret < 0) {
> > > > >>>>              goto out;
> > > > >>>>          }
> > > > >>>>@@ -1669,6 +1696,21 @@ static int parse_redundancy(BDRVSheepdogState *s, const char *opt)
> > > > >>>>      return 0;
> > > > >>>>  }
> > > > >>>>
> > > > >>>>+static int parse_block_size_shift(BDRVSheepdogState *s, QemuOpts *opt)
> > > > >>>>+{
> > > > >>>>+    struct SheepdogInode *inode = &s->inode;
> > > > >>>>+    inode->block_size_shift =
> > > > >>>>+        (uint8_t)qemu_opt_get_number_del(opt, "block_size_shift", 0);
> > > > >>>>+    if (inode->block_size_shift == 0) {
> > > > >>>>+        /* block_size_shift is set for cluster default value by sheepdog */
> > > > >>>>+        return 0;
> > > > >>>>+    } else if (inode->block_size_shift < 20 || inode->block_size_shift > 31) {
> > > > >>>>+        return -EINVAL;
> > > > >>>>+    }
> > > > >>>>+
> > > > >>>>+    return 0;
> > > > >>>>+}
> > > > >>>>+
> > > > >>>>  static int sd_create(const char *filename, QemuOpts *opts,
> > > > >>>>                       Error **errp)
> > > > >>>>  {
> > > > >>>>@@ -1679,6 +1721,7 @@ static int sd_create(const char *filename, QemuOpts *opts,
> > > > >>>>      BDRVSheepdogState *s;
> > > > >>>>      char tag[SD_MAX_VDI_TAG_LEN];
> > > > >>>>      uint32_t snapid;
> > > > >>>>+    uint64_t max_vdi_size;
> > > > >>>>      bool prealloc = false;
> > > > >>>>
> > > > >>>>      s = g_new0(BDRVSheepdogState, 1);
> > > > >>>>@@ -1718,9 +1761,10 @@ static int sd_create(const char *filename, QemuOpts *opts,
> > > > >>>>          }
> > > > >>>>      }
> > > > >>>>
> > > > >>>>-    if (s->inode.vdi_size > SD_MAX_VDI_SIZE) {
> > > > >>>>-        error_setg(errp, "too big image size");
> > > > >>>>-        ret = -EINVAL;
> > > > >>>>+    ret = parse_block_size_shift(s, opts);
> > > > >>>>+    if (ret < 0) {
> > > > >>>>+        error_setg(errp, "Invalid block_size_shift:"
> > > > >>>>+                         " '%s'. please use 20-31", buf);
> > > > >>>>          goto out;
> > > > >>>>      }
> > > > >>>>
> > > > >>>>@@ -1757,6 +1801,47 @@ static int sd_create(const char *filename, QemuOpts *opts,
> > > > >>>>      }
> > > > >>>>
> > > > >>>>      s->aio_context = qemu_get_aio_context();
> > > > >>>>+
> > > > >>>>+    /* if block_size_shift is not specified, get cluster default value */
> > > > >>>>+    if (s->inode.block_size_shift == 0) {
> > > > >>>
> > > > >>>Seems that we don't keep backward compatibility for new QEMU and old sheep that
> > > > >>>doesn't support SD_OP_GET_CLUSTER_DEFAULT.
> > > > >>Then, I'll add the operation that if block_size_shift from
> > > > >>SD_OP_GET_CLUSTER_DEFAULT is zero, block_size_shift is set to 22.
> > > > >>Is it OK?
> > > > >
> > > > >If sheep doesn't support SD_OP_GET_CLUSTER_DEFAULT, sheep will return
> > > > >SD_RES_INVALID_PARMS. So to keep backward compatibility, we shouldn't issue
> > > > >SD_OP_GET_CLUSTER_DEFAULT to sheep daemon.
> > > > OK, I think that the way is better.
> > > > >
> > > > >My point is that, after user upgrading the sheep and still stick with old QEMU,
> > > > >(I guess many users will), any operations in the past sholudn't fail right after
> > > > >upgrading.
> > > > >
> > > > >>
> > > > >>>
> > > > >>>What will happen if old QEMU with new sheep that support block_size_shift? Most
> > > > >>>distributions will ship the old stable qemu that wouldn't be aware of
> > > > >>>block_size_shift.
> > > > >>If old QEMU with new sheep is used, VDI is created with cluster default
> > > > >>block_size_shift and accessed as 4MB object_size.
> > > > >>So I think that for backward compatibility, users must do cluster
> > > > >>format command with default block_size_shift 22 equal to
> > > > >>4MB object_size.
> > > > >
> > > > >how old QEMU know about block_size_shift? For old QEMU, block_size_shift is
> > > > >encoded as 0 and then send the create request to sheep. Does sheep can handle
> > > > >block_size_shift = 0? You know, we can't pass any value to old QEMU for
> > > > >block_size_shift.
> > > > Sheep can handle block_size_shift = 0, when users create new VDI.
> > > > Old QEMU does do_sd_create() without setting hdr.block_size_shift and
> > > > sends a request SD_OP_NEW_VDI.
> > > > If block_size_shift is set to zero, new sheep sets cluster default value
> > > > in cluster_new_vdi() like copies and copy_policy.
> > > 
> > > Okay, so new sheep can handle old QEMU request. By the way, how about the
> > > suggestion in my last email? (x + 1) * 4MB stuff...
> > 
> > I think specifying object size in multiple of 4MB is overkill. Because
> > we can specify both of block_size_shift and a number of objects which
> > belong to VDI. More precisely,
> > 2 ^ block_size_shift * #objects = VDI size
> > we can choose the block_size_shift between 20 and 30, and #objects
> > from 1 < 2 ^ 20.
> > # #objects is specified via VDI size implicitly
> 
> I can't understand this paragraph. If object size is fixed, we can calculate
> # of object easily. It has nothing to do with block_size_shift.

I wanted to say that we can choose both of block_size_shift and # of
objects from wide range, so granualarity is flexible and steps between
VDI sizes are reasonably small.

> 
> > The granualarity of VDI sizes seems to be really fine (even
> > block_size_shift == 30, step is 1GB). So supporting block size with
> > multiple of 4MB will not provide practical benefit.
> 
> Prabably yes, but finer granularity doesn't cause trouble and gives more
> flexibility. We can allow 12MB at user's will, for example. Even it doesn't
> provide practical benefits, what benefit block_size_shift will provide over
> (x + 1) * 4MB scheme? My point is, give a wider range won't hurt and will
> pose less constaint in the future.
> 
> The main reason I suggest (x + 1) * 4MB, is that we can easily keep backward
> compatibility because x = 0 means 4MB, but with this patch proposal, x = 22
> means 4MB.

Utilizing block_size_shift of inode doesn't break
compatibility. Because older versions of sheepdog initialize with
inode->block_size_shift with 22.

Thanks,
Hitoshi

> 
> > Expression of the block size is another topic. I don't have strong
> > opinion about this.
> 
> I'm fine with whatever way we choose to represent object size internally,
> instead I'm more concerned about the user option form to specify object size.
> Even if we choose block_size_shift, we should make the option user friendly as
> much as possible. Expose block_size_shift to users is not a good idea.
> 
> Thanks
> Yuan
> -- 
> sheepdog mailing list
> sheepdog@lists.wpkg.org
> https://lists.wpkg.org/mailman/listinfo/sheepdog

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [Qemu-devel] [sheepdog] [PATCH v4] sheepdog: selectable object size support
  2015-02-12  7:28               ` Hitoshi Mitake
@ 2015-02-12  7:42                 ` Liu Yuan
  2015-02-12  8:01                   ` Teruaki Ishizaki
  2015-02-12  8:13                   ` Hitoshi Mitake
  0 siblings, 2 replies; 28+ messages in thread
From: Liu Yuan @ 2015-02-12  7:42 UTC (permalink / raw)
  To: Hitoshi Mitake; +Cc: kwolf, Teruaki Ishizaki, sheepdog, qemu-devel, stefanha

On Thu, Feb 12, 2015 at 04:28:01PM +0900, Hitoshi Mitake wrote:
> At Thu, 12 Feb 2015 15:00:49 +0800,
> Liu Yuan wrote:
> > 
> > On Thu, Feb 12, 2015 at 03:19:21PM +0900, Hitoshi Mitake wrote:
> > > At Tue, 10 Feb 2015 18:35:58 +0800,
> > > Liu Yuan wrote:
> > > > 
> > > > On Tue, Feb 10, 2015 at 06:56:33PM +0900, Teruaki Ishizaki wrote:
> > > > > (2015/02/10 17:58), Liu Yuan wrote:
> > > > > >On Tue, Feb 10, 2015 at 05:22:02PM +0900, Teruaki Ishizaki wrote:
> > > > > >>(2015/02/10 12:10), Liu Yuan wrote:
> > > > > >>>On Tue, Jan 27, 2015 at 05:35:27PM +0900, Teruaki Ishizaki wrote:
> > > > > >>>>Previously, qemu block driver of sheepdog used hard-coded VDI object size.
> > > > > >>>>This patch enables users to handle "block_size_shift" value for
> > > > > >>>>calculating VDI object size.
> > > > > >>>>
> > > > > >>>>When you start qemu, you don't need to specify additional command option.
> > > > > >>>>
> > > > > >>>>But when you create the VDI which doesn't have default object size
> > > > > >>>>with qemu-img command, you specify block_size_shift option.
> > > > > >>>>
> > > > > >>>>If you want to create a VDI of 8MB(1 << 23) object size,
> > > > > >>>>you need to specify following command option.
> > > > > >>>>
> > > > > >>>>  # qemu-img create -o block_size_shift=23 sheepdog:test1 100M
> > > > > >>>>
> > > > > >>>>In addition, when you don't specify qemu-img command option,
> > > > > >>>>a default value of sheepdog cluster is used for creating VDI.
> > > > > >>>>
> > > > > >>>>  # qemu-img create sheepdog:test2 100M
> > > > > >>>>
> > > > > >>>>Signed-off-by: Teruaki Ishizaki <ishizaki.teruaki@lab.ntt.co.jp>
> > > > > >>>>---
> > > > > >>>>V4:
> > > > > >>>>  - Limit a read/write buffer size for creating a preallocated VDI.
> > > > > >>>>  - Replace a parse function for the block_size_shift option.
> > > > > >>>>  - Fix an error message.
> > > > > >>>>
> > > > > >>>>V3:
> > > > > >>>>  - Delete the needless operation of buffer.
> > > > > >>>>  - Delete the needless operations of request header.
> > > > > >>>>    for SD_OP_GET_CLUSTER_DEFAULT.
> > > > > >>>>  - Fix coding style problems.
> > > > > >>>>
> > > > > >>>>V2:
> > > > > >>>>  - Fix coding style problem (white space).
> > > > > >>>>  - Add members, store_policy and block_size_shift to struct SheepdogVdiReq.
> > > > > >>>>  - Initialize request header to use block_size_shift specified by user.
> > > > > >>>>---
> > > > > >>>>  block/sheepdog.c          |  138 ++++++++++++++++++++++++++++++++++++++-------
> > > > > >>>>  include/block/block_int.h |    1 +
> > > > > >>>>  2 files changed, 119 insertions(+), 20 deletions(-)
> > > > > >>>>
> > > > > >>>>diff --git a/block/sheepdog.c b/block/sheepdog.c
> > > > > >>>>index be3176f..a43b947 100644
> > > > > >>>>--- a/block/sheepdog.c
> > > > > >>>>+++ b/block/sheepdog.c
> > > > > >>>>@@ -37,6 +37,7 @@
> > > > > >>>>  #define SD_OP_READ_VDIS      0x15
> > > > > >>>>  #define SD_OP_FLUSH_VDI      0x16
> > > > > >>>>  #define SD_OP_DEL_VDI        0x17
> > > > > >>>>+#define SD_OP_GET_CLUSTER_DEFAULT   0x18
> > > > > >>>>
> > > > > >>>>  #define SD_FLAG_CMD_WRITE    0x01
> > > > > >>>>  #define SD_FLAG_CMD_COW      0x02
> > > > > >>>>@@ -167,7 +168,8 @@ typedef struct SheepdogVdiReq {
> > > > > >>>>      uint32_t base_vdi_id;
> > > > > >>>>      uint8_t copies;
> > > > > >>>>      uint8_t copy_policy;
> > > > > >>>>-    uint8_t reserved[2];
> > > > > >>>>+    uint8_t store_policy;
> > > > > >>>>+    uint8_t block_size_shift;
> > > > > >>>>      uint32_t snapid;
> > > > > >>>>      uint32_t type;
> > > > > >>>>      uint32_t pad[2];
> > > > > >>>>@@ -186,6 +188,21 @@ typedef struct SheepdogVdiRsp {
> > > > > >>>>      uint32_t pad[5];
> > > > > >>>>  } SheepdogVdiRsp;
> > > > > >>>>
> > > > > >>>>+typedef struct SheepdogClusterRsp {
> > > > > >>>>+    uint8_t proto_ver;
> > > > > >>>>+    uint8_t opcode;
> > > > > >>>>+    uint16_t flags;
> > > > > >>>>+    uint32_t epoch;
> > > > > >>>>+    uint32_t id;
> > > > > >>>>+    uint32_t data_length;
> > > > > >>>>+    uint32_t result;
> > > > > >>>>+    uint8_t nr_copies;
> > > > > >>>>+    uint8_t copy_policy;
> > > > > >>>>+    uint8_t block_size_shift;
> > > > > >>>>+    uint8_t __pad1;
> > > > > >>>>+    uint32_t __pad2[6];
> > > > > >>>>+} SheepdogClusterRsp;
> > > > > >>>>+
> > > > > >>>>  typedef struct SheepdogInode {
> > > > > >>>>      char name[SD_MAX_VDI_LEN];
> > > > > >>>>      char tag[SD_MAX_VDI_TAG_LEN];
> > > > > >>>>@@ -1544,6 +1561,7 @@ static int do_sd_create(BDRVSheepdogState *s, uint32_t *vdi_id, int snapshot,
> > > > > >>>>      hdr.vdi_size = s->inode.vdi_size;
> > > > > >>>>      hdr.copy_policy = s->inode.copy_policy;
> > > > > >>>>      hdr.copies = s->inode.nr_copies;
> > > > > >>>>+    hdr.block_size_shift = s->inode.block_size_shift;
> > > > > >>>>
> > > > > >>>>      ret = do_req(fd, s->aio_context, (SheepdogReq *)&hdr, buf, &wlen, &rlen);
> > > > > >>>>
> > > > > >>>>@@ -1569,9 +1587,12 @@ static int do_sd_create(BDRVSheepdogState *s, uint32_t *vdi_id, int snapshot,
> > > > > >>>>  static int sd_prealloc(const char *filename, Error **errp)
> > > > > >>>>  {
> > > > > >>>>      BlockDriverState *bs = NULL;
> > > > > >>>>+    BDRVSheepdogState *base = NULL;
> > > > > >>>>+    unsigned long buf_size;
> > > > > >>>>      uint32_t idx, max_idx;
> > > > > >>>>+    uint32_t object_size;
> > > > > >>>>      int64_t vdi_size;
> > > > > >>>>-    void *buf = g_malloc0(SD_DATA_OBJ_SIZE);
> > > > > >>>>+    void *buf = NULL;
> > > > > >>>>      int ret;
> > > > > >>>>
> > > > > >>>>      ret = bdrv_open(&bs, filename, NULL, NULL, BDRV_O_RDWR | BDRV_O_PROTOCOL,
> > > > > >>>>@@ -1585,18 +1606,24 @@ static int sd_prealloc(const char *filename, Error **errp)
> > > > > >>>>          ret = vdi_size;
> > > > > >>>>          goto out;
> > > > > >>>>      }
> > > > > >>>>-    max_idx = DIV_ROUND_UP(vdi_size, SD_DATA_OBJ_SIZE);
> > > > > >>>>+
> > > > > >>>>+    base = bs->opaque;
> > > > > >>>>+    object_size = (UINT32_C(1) << base->inode.block_size_shift);
> > > > > >>>>+    buf_size = MIN(object_size, SD_DATA_OBJ_SIZE);
> > > > > >>>>+    buf = g_malloc0(buf_size);
> > > > > >>>>+
> > > > > >>>>+    max_idx = DIV_ROUND_UP(vdi_size, buf_size);
> > > > > >>>>
> > > > > >>>>      for (idx = 0; idx < max_idx; idx++) {
> > > > > >>>>          /*
> > > > > >>>>           * The created image can be a cloned image, so we need to read
> > > > > >>>>           * a data from the source image.
> > > > > >>>>           */
> > > > > >>>>-        ret = bdrv_pread(bs, idx * SD_DATA_OBJ_SIZE, buf, SD_DATA_OBJ_SIZE);
> > > > > >>>>+        ret = bdrv_pread(bs, idx * buf_size, buf, buf_size);
> > > > > >>>>          if (ret < 0) {
> > > > > >>>>              goto out;
> > > > > >>>>          }
> > > > > >>>>-        ret = bdrv_pwrite(bs, idx * SD_DATA_OBJ_SIZE, buf, SD_DATA_OBJ_SIZE);
> > > > > >>>>+        ret = bdrv_pwrite(bs, idx * buf_size, buf, buf_size);
> > > > > >>>>          if (ret < 0) {
> > > > > >>>>              goto out;
> > > > > >>>>          }
> > > > > >>>>@@ -1669,6 +1696,21 @@ static int parse_redundancy(BDRVSheepdogState *s, const char *opt)
> > > > > >>>>      return 0;
> > > > > >>>>  }
> > > > > >>>>
> > > > > >>>>+static int parse_block_size_shift(BDRVSheepdogState *s, QemuOpts *opt)
> > > > > >>>>+{
> > > > > >>>>+    struct SheepdogInode *inode = &s->inode;
> > > > > >>>>+    inode->block_size_shift =
> > > > > >>>>+        (uint8_t)qemu_opt_get_number_del(opt, "block_size_shift", 0);
> > > > > >>>>+    if (inode->block_size_shift == 0) {
> > > > > >>>>+        /* block_size_shift is set for cluster default value by sheepdog */
> > > > > >>>>+        return 0;
> > > > > >>>>+    } else if (inode->block_size_shift < 20 || inode->block_size_shift > 31) {
> > > > > >>>>+        return -EINVAL;
> > > > > >>>>+    }
> > > > > >>>>+
> > > > > >>>>+    return 0;
> > > > > >>>>+}
> > > > > >>>>+
> > > > > >>>>  static int sd_create(const char *filename, QemuOpts *opts,
> > > > > >>>>                       Error **errp)
> > > > > >>>>  {
> > > > > >>>>@@ -1679,6 +1721,7 @@ static int sd_create(const char *filename, QemuOpts *opts,
> > > > > >>>>      BDRVSheepdogState *s;
> > > > > >>>>      char tag[SD_MAX_VDI_TAG_LEN];
> > > > > >>>>      uint32_t snapid;
> > > > > >>>>+    uint64_t max_vdi_size;
> > > > > >>>>      bool prealloc = false;
> > > > > >>>>
> > > > > >>>>      s = g_new0(BDRVSheepdogState, 1);
> > > > > >>>>@@ -1718,9 +1761,10 @@ static int sd_create(const char *filename, QemuOpts *opts,
> > > > > >>>>          }
> > > > > >>>>      }
> > > > > >>>>
> > > > > >>>>-    if (s->inode.vdi_size > SD_MAX_VDI_SIZE) {
> > > > > >>>>-        error_setg(errp, "too big image size");
> > > > > >>>>-        ret = -EINVAL;
> > > > > >>>>+    ret = parse_block_size_shift(s, opts);
> > > > > >>>>+    if (ret < 0) {
> > > > > >>>>+        error_setg(errp, "Invalid block_size_shift:"
> > > > > >>>>+                         " '%s'. please use 20-31", buf);
> > > > > >>>>          goto out;
> > > > > >>>>      }
> > > > > >>>>
> > > > > >>>>@@ -1757,6 +1801,47 @@ static int sd_create(const char *filename, QemuOpts *opts,
> > > > > >>>>      }
> > > > > >>>>
> > > > > >>>>      s->aio_context = qemu_get_aio_context();
> > > > > >>>>+
> > > > > >>>>+    /* if block_size_shift is not specified, get cluster default value */
> > > > > >>>>+    if (s->inode.block_size_shift == 0) {
> > > > > >>>
> > > > > >>>Seems that we don't keep backward compatibility for new QEMU and old sheep that
> > > > > >>>doesn't support SD_OP_GET_CLUSTER_DEFAULT.
> > > > > >>Then, I'll add the operation that if block_size_shift from
> > > > > >>SD_OP_GET_CLUSTER_DEFAULT is zero, block_size_shift is set to 22.
> > > > > >>Is it OK?
> > > > > >
> > > > > >If sheep doesn't support SD_OP_GET_CLUSTER_DEFAULT, sheep will return
> > > > > >SD_RES_INVALID_PARMS. So to keep backward compatibility, we shouldn't issue
> > > > > >SD_OP_GET_CLUSTER_DEFAULT to sheep daemon.
> > > > > OK, I think that the way is better.
> > > > > >
> > > > > >My point is that, after user upgrading the sheep and still stick with old QEMU,
> > > > > >(I guess many users will), any operations in the past sholudn't fail right after
> > > > > >upgrading.
> > > > > >
> > > > > >>
> > > > > >>>
> > > > > >>>What will happen if old QEMU with new sheep that support block_size_shift? Most
> > > > > >>>distributions will ship the old stable qemu that wouldn't be aware of
> > > > > >>>block_size_shift.
> > > > > >>If old QEMU with new sheep is used, VDI is created with cluster default
> > > > > >>block_size_shift and accessed as 4MB object_size.
> > > > > >>So I think that for backward compatibility, users must do cluster
> > > > > >>format command with default block_size_shift 22 equal to
> > > > > >>4MB object_size.
> > > > > >
> > > > > >how old QEMU know about block_size_shift? For old QEMU, block_size_shift is
> > > > > >encoded as 0 and then send the create request to sheep. Does sheep can handle
> > > > > >block_size_shift = 0? You know, we can't pass any value to old QEMU for
> > > > > >block_size_shift.
> > > > > Sheep can handle block_size_shift = 0, when users create new VDI.
> > > > > Old QEMU does do_sd_create() without setting hdr.block_size_shift and
> > > > > sends a request SD_OP_NEW_VDI.
> > > > > If block_size_shift is set to zero, new sheep sets cluster default value
> > > > > in cluster_new_vdi() like copies and copy_policy.
> > > > 
> > > > Okay, so new sheep can handle old QEMU request. By the way, how about the
> > > > suggestion in my last email? (x + 1) * 4MB stuff...
> > > 
> > > I think specifying object size in multiple of 4MB is overkill. Because
> > > we can specify both of block_size_shift and a number of objects which
> > > belong to VDI. More precisely,
> > > 2 ^ block_size_shift * #objects = VDI size
> > > we can choose the block_size_shift between 20 and 30, and #objects
> > > from 1 < 2 ^ 20.
> > > # #objects is specified via VDI size implicitly
> > 
> > I can't understand this paragraph. If object size is fixed, we can calculate
> > # of object easily. It has nothing to do with block_size_shift.
> 
> I wanted to say that we can choose both of block_size_shift and # of
> objects from wide range, so granualarity is flexible and steps between
> VDI sizes are reasonably small.
> 
> > 
> > > The granualarity of VDI sizes seems to be really fine (even
> > > block_size_shift == 30, step is 1GB). So supporting block size with
> > > multiple of 4MB will not provide practical benefit.
> > 
> > Prabably yes, but finer granularity doesn't cause trouble and gives more
> > flexibility. We can allow 12MB at user's will, for example. Even it doesn't
> > provide practical benefits, what benefit block_size_shift will provide over
> > (x + 1) * 4MB scheme? My point is, give a wider range won't hurt and will
> > pose less constaint in the future.
> > 
> > The main reason I suggest (x + 1) * 4MB, is that we can easily keep backward
> > compatibility because x = 0 means 4MB, but with this patch proposal, x = 22
> > means 4MB.
> 
> Utilizing block_size_shift of inode doesn't break
> compatibility. Because older versions of sheepdog initialize with
> inode->block_size_shift with 22.

Older version? What about the old sheep that doesn't support block_size_shift?
If I remember well, block_size_shift is a new thing introduced by
fd9e4a28fad7c16b2237f4f48c9d264af192e40e, at Dec 12, 2014, very new. This means
all our stable sheep won't have any idea of what block_size_shift is.

The main trouble is old QEMU, which will always set inode->block_size_shift as 0
and expect the object size is 4MB.

Think about following case:

1 Old qemu and old sheep runs a long time with many vdis that has 4MB object.
2 Users upgrade sheep into new sheep but doesn't upgrade QEMU. This is quit
  normal case because many users use stable QEMU branch.
3 He still use qemu-img to create new vdi and expect the object size is 4MB
  If new sheep doesn't set object size as 4MB, this qemu block driver will
  malfunction.

Thanks
Yuan

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [Qemu-devel] [sheepdog] [PATCH v4] sheepdog: selectable object size support
  2015-02-12  7:42                 ` Liu Yuan
@ 2015-02-12  8:01                   ` Teruaki Ishizaki
  2015-02-12  8:11                     ` Liu Yuan
  2015-02-12  8:13                   ` Hitoshi Mitake
  1 sibling, 1 reply; 28+ messages in thread
From: Teruaki Ishizaki @ 2015-02-12  8:01 UTC (permalink / raw)
  To: Liu Yuan, Hitoshi Mitake; +Cc: kwolf, sheepdog, qemu-devel, stefanha

(2015/02/12 16:42), Liu Yuan wrote:
> On Thu, Feb 12, 2015 at 04:28:01PM +0900, Hitoshi Mitake wrote:
>> At Thu, 12 Feb 2015 15:00:49 +0800,
>> Liu Yuan wrote:
>>>
>>> On Thu, Feb 12, 2015 at 03:19:21PM +0900, Hitoshi Mitake wrote:
>>>> At Tue, 10 Feb 2015 18:35:58 +0800,
>>>> Liu Yuan wrote:
>>>>>
>>>>> On Tue, Feb 10, 2015 at 06:56:33PM +0900, Teruaki Ishizaki wrote:
>>>>>> (2015/02/10 17:58), Liu Yuan wrote:
>>>>>>> On Tue, Feb 10, 2015 at 05:22:02PM +0900, Teruaki Ishizaki wrote:
>>>>>>>> (2015/02/10 12:10), Liu Yuan wrote:
>>>>>>>>> On Tue, Jan 27, 2015 at 05:35:27PM +0900, Teruaki Ishizaki wrote:
>>>>>>>>>> Previously, qemu block driver of sheepdog used hard-coded VDI object size.
>>>>>>>>>> This patch enables users to handle "block_size_shift" value for
>>>>>>>>>> calculating VDI object size.
>>>>>>>>>>
>>>>>>>>>> When you start qemu, you don't need to specify additional command option.
>>>>>>>>>>
>>>>>>>>>> But when you create the VDI which doesn't have default object size
>>>>>>>>>> with qemu-img command, you specify block_size_shift option.
>>>>>>>>>>
>>>>>>>>>> If you want to create a VDI of 8MB(1 << 23) object size,
>>>>>>>>>> you need to specify following command option.
>>>>>>>>>>
>>>>>>>>>>   # qemu-img create -o block_size_shift=23 sheepdog:test1 100M
>>>>>>>>>>
>>>>>>>>>> In addition, when you don't specify qemu-img command option,
>>>>>>>>>> a default value of sheepdog cluster is used for creating VDI.
>>>>>>>>>>
>>>>>>>>>>   # qemu-img create sheepdog:test2 100M
>>>>>>>>>>
>>>>>>>>>> Signed-off-by: Teruaki Ishizaki <ishizaki.teruaki@lab.ntt.co.jp>
>>>>>>>>>> ---
>>>>>>>>>> V4:
>>>>>>>>>>   - Limit a read/write buffer size for creating a preallocated VDI.
>>>>>>>>>>   - Replace a parse function for the block_size_shift option.
>>>>>>>>>>   - Fix an error message.
>>>>>>>>>>
>>>>>>>>>> V3:
>>>>>>>>>>   - Delete the needless operation of buffer.
>>>>>>>>>>   - Delete the needless operations of request header.
>>>>>>>>>>     for SD_OP_GET_CLUSTER_DEFAULT.
>>>>>>>>>>   - Fix coding style problems.
>>>>>>>>>>
>>>>>>>>>> V2:
>>>>>>>>>>   - Fix coding style problem (white space).
>>>>>>>>>>   - Add members, store_policy and block_size_shift to struct SheepdogVdiReq.
>>>>>>>>>>   - Initialize request header to use block_size_shift specified by user.
>>>>>>>>>> ---
>>>>>>>>>>   block/sheepdog.c          |  138 ++++++++++++++++++++++++++++++++++++++-------
>>>>>>>>>>   include/block/block_int.h |    1 +
>>>>>>>>>>   2 files changed, 119 insertions(+), 20 deletions(-)
>>>>>>>>>>
>>>>>>>>>> diff --git a/block/sheepdog.c b/block/sheepdog.c
>>>>>>>>>> index be3176f..a43b947 100644
>>>>>>>>>> --- a/block/sheepdog.c
>>>>>>>>>> +++ b/block/sheepdog.c
>>>>>>>>>> @@ -37,6 +37,7 @@
>>>>>>>>>>   #define SD_OP_READ_VDIS      0x15
>>>>>>>>>>   #define SD_OP_FLUSH_VDI      0x16
>>>>>>>>>>   #define SD_OP_DEL_VDI        0x17
>>>>>>>>>> +#define SD_OP_GET_CLUSTER_DEFAULT   0x18
>>>>>>>>>>
>>>>>>>>>>   #define SD_FLAG_CMD_WRITE    0x01
>>>>>>>>>>   #define SD_FLAG_CMD_COW      0x02
>>>>>>>>>> @@ -167,7 +168,8 @@ typedef struct SheepdogVdiReq {
>>>>>>>>>>       uint32_t base_vdi_id;
>>>>>>>>>>       uint8_t copies;
>>>>>>>>>>       uint8_t copy_policy;
>>>>>>>>>> -    uint8_t reserved[2];
>>>>>>>>>> +    uint8_t store_policy;
>>>>>>>>>> +    uint8_t block_size_shift;
>>>>>>>>>>       uint32_t snapid;
>>>>>>>>>>       uint32_t type;
>>>>>>>>>>       uint32_t pad[2];
>>>>>>>>>> @@ -186,6 +188,21 @@ typedef struct SheepdogVdiRsp {
>>>>>>>>>>       uint32_t pad[5];
>>>>>>>>>>   } SheepdogVdiRsp;
>>>>>>>>>>
>>>>>>>>>> +typedef struct SheepdogClusterRsp {
>>>>>>>>>> +    uint8_t proto_ver;
>>>>>>>>>> +    uint8_t opcode;
>>>>>>>>>> +    uint16_t flags;
>>>>>>>>>> +    uint32_t epoch;
>>>>>>>>>> +    uint32_t id;
>>>>>>>>>> +    uint32_t data_length;
>>>>>>>>>> +    uint32_t result;
>>>>>>>>>> +    uint8_t nr_copies;
>>>>>>>>>> +    uint8_t copy_policy;
>>>>>>>>>> +    uint8_t block_size_shift;
>>>>>>>>>> +    uint8_t __pad1;
>>>>>>>>>> +    uint32_t __pad2[6];
>>>>>>>>>> +} SheepdogClusterRsp;
>>>>>>>>>> +
>>>>>>>>>>   typedef struct SheepdogInode {
>>>>>>>>>>       char name[SD_MAX_VDI_LEN];
>>>>>>>>>>       char tag[SD_MAX_VDI_TAG_LEN];
>>>>>>>>>> @@ -1544,6 +1561,7 @@ static int do_sd_create(BDRVSheepdogState *s, uint32_t *vdi_id, int snapshot,
>>>>>>>>>>       hdr.vdi_size = s->inode.vdi_size;
>>>>>>>>>>       hdr.copy_policy = s->inode.copy_policy;
>>>>>>>>>>       hdr.copies = s->inode.nr_copies;
>>>>>>>>>> +    hdr.block_size_shift = s->inode.block_size_shift;
>>>>>>>>>>
>>>>>>>>>>       ret = do_req(fd, s->aio_context, (SheepdogReq *)&hdr, buf, &wlen, &rlen);
>>>>>>>>>>
>>>>>>>>>> @@ -1569,9 +1587,12 @@ static int do_sd_create(BDRVSheepdogState *s, uint32_t *vdi_id, int snapshot,
>>>>>>>>>>   static int sd_prealloc(const char *filename, Error **errp)
>>>>>>>>>>   {
>>>>>>>>>>       BlockDriverState *bs = NULL;
>>>>>>>>>> +    BDRVSheepdogState *base = NULL;
>>>>>>>>>> +    unsigned long buf_size;
>>>>>>>>>>       uint32_t idx, max_idx;
>>>>>>>>>> +    uint32_t object_size;
>>>>>>>>>>       int64_t vdi_size;
>>>>>>>>>> -    void *buf = g_malloc0(SD_DATA_OBJ_SIZE);
>>>>>>>>>> +    void *buf = NULL;
>>>>>>>>>>       int ret;
>>>>>>>>>>
>>>>>>>>>>       ret = bdrv_open(&bs, filename, NULL, NULL, BDRV_O_RDWR | BDRV_O_PROTOCOL,
>>>>>>>>>> @@ -1585,18 +1606,24 @@ static int sd_prealloc(const char *filename, Error **errp)
>>>>>>>>>>           ret = vdi_size;
>>>>>>>>>>           goto out;
>>>>>>>>>>       }
>>>>>>>>>> -    max_idx = DIV_ROUND_UP(vdi_size, SD_DATA_OBJ_SIZE);
>>>>>>>>>> +
>>>>>>>>>> +    base = bs->opaque;
>>>>>>>>>> +    object_size = (UINT32_C(1) << base->inode.block_size_shift);
>>>>>>>>>> +    buf_size = MIN(object_size, SD_DATA_OBJ_SIZE);
>>>>>>>>>> +    buf = g_malloc0(buf_size);
>>>>>>>>>> +
>>>>>>>>>> +    max_idx = DIV_ROUND_UP(vdi_size, buf_size);
>>>>>>>>>>
>>>>>>>>>>       for (idx = 0; idx < max_idx; idx++) {
>>>>>>>>>>           /*
>>>>>>>>>>            * The created image can be a cloned image, so we need to read
>>>>>>>>>>            * a data from the source image.
>>>>>>>>>>            */
>>>>>>>>>> -        ret = bdrv_pread(bs, idx * SD_DATA_OBJ_SIZE, buf, SD_DATA_OBJ_SIZE);
>>>>>>>>>> +        ret = bdrv_pread(bs, idx * buf_size, buf, buf_size);
>>>>>>>>>>           if (ret < 0) {
>>>>>>>>>>               goto out;
>>>>>>>>>>           }
>>>>>>>>>> -        ret = bdrv_pwrite(bs, idx * SD_DATA_OBJ_SIZE, buf, SD_DATA_OBJ_SIZE);
>>>>>>>>>> +        ret = bdrv_pwrite(bs, idx * buf_size, buf, buf_size);
>>>>>>>>>>           if (ret < 0) {
>>>>>>>>>>               goto out;
>>>>>>>>>>           }
>>>>>>>>>> @@ -1669,6 +1696,21 @@ static int parse_redundancy(BDRVSheepdogState *s, const char *opt)
>>>>>>>>>>       return 0;
>>>>>>>>>>   }
>>>>>>>>>>
>>>>>>>>>> +static int parse_block_size_shift(BDRVSheepdogState *s, QemuOpts *opt)
>>>>>>>>>> +{
>>>>>>>>>> +    struct SheepdogInode *inode = &s->inode;
>>>>>>>>>> +    inode->block_size_shift =
>>>>>>>>>> +        (uint8_t)qemu_opt_get_number_del(opt, "block_size_shift", 0);
>>>>>>>>>> +    if (inode->block_size_shift == 0) {
>>>>>>>>>> +        /* block_size_shift is set for cluster default value by sheepdog */
>>>>>>>>>> +        return 0;
>>>>>>>>>> +    } else if (inode->block_size_shift < 20 || inode->block_size_shift > 31) {
>>>>>>>>>> +        return -EINVAL;
>>>>>>>>>> +    }
>>>>>>>>>> +
>>>>>>>>>> +    return 0;
>>>>>>>>>> +}
>>>>>>>>>> +
>>>>>>>>>>   static int sd_create(const char *filename, QemuOpts *opts,
>>>>>>>>>>                        Error **errp)
>>>>>>>>>>   {
>>>>>>>>>> @@ -1679,6 +1721,7 @@ static int sd_create(const char *filename, QemuOpts *opts,
>>>>>>>>>>       BDRVSheepdogState *s;
>>>>>>>>>>       char tag[SD_MAX_VDI_TAG_LEN];
>>>>>>>>>>       uint32_t snapid;
>>>>>>>>>> +    uint64_t max_vdi_size;
>>>>>>>>>>       bool prealloc = false;
>>>>>>>>>>
>>>>>>>>>>       s = g_new0(BDRVSheepdogState, 1);
>>>>>>>>>> @@ -1718,9 +1761,10 @@ static int sd_create(const char *filename, QemuOpts *opts,
>>>>>>>>>>           }
>>>>>>>>>>       }
>>>>>>>>>>
>>>>>>>>>> -    if (s->inode.vdi_size > SD_MAX_VDI_SIZE) {
>>>>>>>>>> -        error_setg(errp, "too big image size");
>>>>>>>>>> -        ret = -EINVAL;
>>>>>>>>>> +    ret = parse_block_size_shift(s, opts);
>>>>>>>>>> +    if (ret < 0) {
>>>>>>>>>> +        error_setg(errp, "Invalid block_size_shift:"
>>>>>>>>>> +                         " '%s'. please use 20-31", buf);
>>>>>>>>>>           goto out;
>>>>>>>>>>       }
>>>>>>>>>>
>>>>>>>>>> @@ -1757,6 +1801,47 @@ static int sd_create(const char *filename, QemuOpts *opts,
>>>>>>>>>>       }
>>>>>>>>>>
>>>>>>>>>>       s->aio_context = qemu_get_aio_context();
>>>>>>>>>> +
>>>>>>>>>> +    /* if block_size_shift is not specified, get cluster default value */
>>>>>>>>>> +    if (s->inode.block_size_shift == 0) {
>>>>>>>>>
>>>>>>>>> Seems that we don't keep backward compatibility for new QEMU and old sheep that
>>>>>>>>> doesn't support SD_OP_GET_CLUSTER_DEFAULT.
>>>>>>>> Then, I'll add the operation that if block_size_shift from
>>>>>>>> SD_OP_GET_CLUSTER_DEFAULT is zero, block_size_shift is set to 22.
>>>>>>>> Is it OK?
>>>>>>>
>>>>>>> If sheep doesn't support SD_OP_GET_CLUSTER_DEFAULT, sheep will return
>>>>>>> SD_RES_INVALID_PARMS. So to keep backward compatibility, we shouldn't issue
>>>>>>> SD_OP_GET_CLUSTER_DEFAULT to sheep daemon.
>>>>>> OK, I think that the way is better.
>>>>>>>
>>>>>>> My point is that, after user upgrading the sheep and still stick with old QEMU,
>>>>>>> (I guess many users will), any operations in the past sholudn't fail right after
>>>>>>> upgrading.
>>>>>>>
>>>>>>>>
>>>>>>>>>
>>>>>>>>> What will happen if old QEMU with new sheep that support block_size_shift? Most
>>>>>>>>> distributions will ship the old stable qemu that wouldn't be aware of
>>>>>>>>> block_size_shift.
>>>>>>>> If old QEMU with new sheep is used, VDI is created with cluster default
>>>>>>>> block_size_shift and accessed as 4MB object_size.
>>>>>>>> So I think that for backward compatibility, users must do cluster
>>>>>>>> format command with default block_size_shift 22 equal to
>>>>>>>> 4MB object_size.
>>>>>>>
>>>>>>> how old QEMU know about block_size_shift? For old QEMU, block_size_shift is
>>>>>>> encoded as 0 and then send the create request to sheep. Does sheep can handle
>>>>>>> block_size_shift = 0? You know, we can't pass any value to old QEMU for
>>>>>>> block_size_shift.
>>>>>> Sheep can handle block_size_shift = 0, when users create new VDI.
>>>>>> Old QEMU does do_sd_create() without setting hdr.block_size_shift and
>>>>>> sends a request SD_OP_NEW_VDI.
>>>>>> If block_size_shift is set to zero, new sheep sets cluster default value
>>>>>> in cluster_new_vdi() like copies and copy_policy.
>>>>>
>>>>> Okay, so new sheep can handle old QEMU request. By the way, how about the
>>>>> suggestion in my last email? (x + 1) * 4MB stuff...
>>>>
>>>> I think specifying object size in multiple of 4MB is overkill. Because
>>>> we can specify both of block_size_shift and a number of objects which
>>>> belong to VDI. More precisely,
>>>> 2 ^ block_size_shift * #objects = VDI size
>>>> we can choose the block_size_shift between 20 and 30, and #objects
>>>> from 1 < 2 ^ 20.
>>>> # #objects is specified via VDI size implicitly
>>>
>>> I can't understand this paragraph. If object size is fixed, we can calculate
>>> # of object easily. It has nothing to do with block_size_shift.
>>
>> I wanted to say that we can choose both of block_size_shift and # of
>> objects from wide range, so granualarity is flexible and steps between
>> VDI sizes are reasonably small.
>>
>>>
>>>> The granualarity of VDI sizes seems to be really fine (even
>>>> block_size_shift == 30, step is 1GB). So supporting block size with
>>>> multiple of 4MB will not provide practical benefit.
>>>
>>> Prabably yes, but finer granularity doesn't cause trouble and gives more
>>> flexibility. We can allow 12MB at user's will, for example. Even it doesn't
>>> provide practical benefits, what benefit block_size_shift will provide over
>>> (x + 1) * 4MB scheme? My point is, give a wider range won't hurt and will
>>> pose less constaint in the future.
>>>
>>> The main reason I suggest (x + 1) * 4MB, is that we can easily keep backward
>>> compatibility because x = 0 means 4MB, but with this patch proposal, x = 22
>>> means 4MB.
>>
>> Utilizing block_size_shift of inode doesn't break
>> compatibility. Because older versions of sheepdog initialize with
>> inode->block_size_shift with 22.
>
> Older version? What about the old sheep that doesn't support block_size_shift?
> If I remember well, block_size_shift is a new thing introduced by
> fd9e4a28fad7c16b2237f4f48c9d264af192e40e, at Dec 12, 2014, very new. This means
> all our stable sheep won't have any idea of what block_size_shift is.

Old sheepdog initialize with inode->block_size_shift with 22.
The implementation of Sheepdog 0.9 is following

=======
static struct sd_inode *alloc_inode(const struct vdi_iocb *iocb,
                                     uint32_t new_snapid, uint32_t new_vid,
                                     uint32_t *data_vdi_id,
                                     struct generation_reference *gref)
{
         struct sd_inode *new = xzalloc(sizeof(*new));
         unsigned long block_size = SD_DATA_OBJ_SIZE;

...(snip)
         new->block_size_shift = find_next_bit(&block_size, 
BITS_PER_LONG, 0);
=======

It means that block_size_shift is initialized with 22.

>
> The main trouble is old QEMU, which will always set inode->block_size_shift as 0
> and expect the object size is 4MB.
>
> Think about following case:
>
> 1 Old qemu and old sheep runs a long time with many vdis that has 4MB object.
> 2 Users upgrade sheep into new sheep but doesn't upgrade QEMU. This is quit
>    normal case because many users use stable QEMU branch.
> 3 He still use qemu-img to create new vdi and expect the object size is 4MB
>    If new sheep doesn't set object size as 4MB, this qemu block driver will
>    malfunction.

As I said,

Qe/sd   new    old
new     CDS    Ign
old     CDS    NULL

So, when we use old Qemu with new Sheepdog, we should do cluster format
with default block_size_shift 22.

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [Qemu-devel] [sheepdog] [PATCH v4] sheepdog: selectable object size support
  2015-02-12  8:01                   ` Teruaki Ishizaki
@ 2015-02-12  8:11                     ` Liu Yuan
  0 siblings, 0 replies; 28+ messages in thread
From: Liu Yuan @ 2015-02-12  8:11 UTC (permalink / raw)
  To: Teruaki Ishizaki; +Cc: Hitoshi Mitake, kwolf, sheepdog, qemu-devel, stefanha

On Thu, Feb 12, 2015 at 05:01:05PM +0900, Teruaki Ishizaki wrote:
> (2015/02/12 16:42), Liu Yuan wrote:
> >On Thu, Feb 12, 2015 at 04:28:01PM +0900, Hitoshi Mitake wrote:
> >>At Thu, 12 Feb 2015 15:00:49 +0800,
> >>Liu Yuan wrote:
> >>>
> >>>On Thu, Feb 12, 2015 at 03:19:21PM +0900, Hitoshi Mitake wrote:
> >>>>At Tue, 10 Feb 2015 18:35:58 +0800,
> >>>>Liu Yuan wrote:
> >>>>>
> >>>>>On Tue, Feb 10, 2015 at 06:56:33PM +0900, Teruaki Ishizaki wrote:
> >>>>>>(2015/02/10 17:58), Liu Yuan wrote:
> >>>>>>>On Tue, Feb 10, 2015 at 05:22:02PM +0900, Teruaki Ishizaki wrote:
> >>>>>>>>(2015/02/10 12:10), Liu Yuan wrote:
> >>>>>>>>>On Tue, Jan 27, 2015 at 05:35:27PM +0900, Teruaki Ishizaki wrote:
> >>>>>>>>>>Previously, qemu block driver of sheepdog used hard-coded VDI object size.
> >>>>>>>>>>This patch enables users to handle "block_size_shift" value for
> >>>>>>>>>>calculating VDI object size.
> >>>>>>>>>>
> >>>>>>>>>>When you start qemu, you don't need to specify additional command option.
> >>>>>>>>>>
> >>>>>>>>>>But when you create the VDI which doesn't have default object size
> >>>>>>>>>>with qemu-img command, you specify block_size_shift option.
> >>>>>>>>>>
> >>>>>>>>>>If you want to create a VDI of 8MB(1 << 23) object size,
> >>>>>>>>>>you need to specify following command option.
> >>>>>>>>>>
> >>>>>>>>>>  # qemu-img create -o block_size_shift=23 sheepdog:test1 100M
> >>>>>>>>>>
> >>>>>>>>>>In addition, when you don't specify qemu-img command option,
> >>>>>>>>>>a default value of sheepdog cluster is used for creating VDI.
> >>>>>>>>>>
> >>>>>>>>>>  # qemu-img create sheepdog:test2 100M
> >>>>>>>>>>
> >>>>>>>>>>Signed-off-by: Teruaki Ishizaki <ishizaki.teruaki@lab.ntt.co.jp>
> >>>>>>>>>>---
> >>>>>>>>>>V4:
> >>>>>>>>>>  - Limit a read/write buffer size for creating a preallocated VDI.
> >>>>>>>>>>  - Replace a parse function for the block_size_shift option.
> >>>>>>>>>>  - Fix an error message.
> >>>>>>>>>>
> >>>>>>>>>>V3:
> >>>>>>>>>>  - Delete the needless operation of buffer.
> >>>>>>>>>>  - Delete the needless operations of request header.
> >>>>>>>>>>    for SD_OP_GET_CLUSTER_DEFAULT.
> >>>>>>>>>>  - Fix coding style problems.
> >>>>>>>>>>
> >>>>>>>>>>V2:
> >>>>>>>>>>  - Fix coding style problem (white space).
> >>>>>>>>>>  - Add members, store_policy and block_size_shift to struct SheepdogVdiReq.
> >>>>>>>>>>  - Initialize request header to use block_size_shift specified by user.
> >>>>>>>>>>---
> >>>>>>>>>>  block/sheepdog.c          |  138 ++++++++++++++++++++++++++++++++++++++-------
> >>>>>>>>>>  include/block/block_int.h |    1 +
> >>>>>>>>>>  2 files changed, 119 insertions(+), 20 deletions(-)
> >>>>>>>>>>
> >>>>>>>>>>diff --git a/block/sheepdog.c b/block/sheepdog.c
> >>>>>>>>>>index be3176f..a43b947 100644
> >>>>>>>>>>--- a/block/sheepdog.c
> >>>>>>>>>>+++ b/block/sheepdog.c
> >>>>>>>>>>@@ -37,6 +37,7 @@
> >>>>>>>>>>  #define SD_OP_READ_VDIS      0x15
> >>>>>>>>>>  #define SD_OP_FLUSH_VDI      0x16
> >>>>>>>>>>  #define SD_OP_DEL_VDI        0x17
> >>>>>>>>>>+#define SD_OP_GET_CLUSTER_DEFAULT   0x18
> >>>>>>>>>>
> >>>>>>>>>>  #define SD_FLAG_CMD_WRITE    0x01
> >>>>>>>>>>  #define SD_FLAG_CMD_COW      0x02
> >>>>>>>>>>@@ -167,7 +168,8 @@ typedef struct SheepdogVdiReq {
> >>>>>>>>>>      uint32_t base_vdi_id;
> >>>>>>>>>>      uint8_t copies;
> >>>>>>>>>>      uint8_t copy_policy;
> >>>>>>>>>>-    uint8_t reserved[2];
> >>>>>>>>>>+    uint8_t store_policy;
> >>>>>>>>>>+    uint8_t block_size_shift;
> >>>>>>>>>>      uint32_t snapid;
> >>>>>>>>>>      uint32_t type;
> >>>>>>>>>>      uint32_t pad[2];
> >>>>>>>>>>@@ -186,6 +188,21 @@ typedef struct SheepdogVdiRsp {
> >>>>>>>>>>      uint32_t pad[5];
> >>>>>>>>>>  } SheepdogVdiRsp;
> >>>>>>>>>>
> >>>>>>>>>>+typedef struct SheepdogClusterRsp {
> >>>>>>>>>>+    uint8_t proto_ver;
> >>>>>>>>>>+    uint8_t opcode;
> >>>>>>>>>>+    uint16_t flags;
> >>>>>>>>>>+    uint32_t epoch;
> >>>>>>>>>>+    uint32_t id;
> >>>>>>>>>>+    uint32_t data_length;
> >>>>>>>>>>+    uint32_t result;
> >>>>>>>>>>+    uint8_t nr_copies;
> >>>>>>>>>>+    uint8_t copy_policy;
> >>>>>>>>>>+    uint8_t block_size_shift;
> >>>>>>>>>>+    uint8_t __pad1;
> >>>>>>>>>>+    uint32_t __pad2[6];
> >>>>>>>>>>+} SheepdogClusterRsp;
> >>>>>>>>>>+
> >>>>>>>>>>  typedef struct SheepdogInode {
> >>>>>>>>>>      char name[SD_MAX_VDI_LEN];
> >>>>>>>>>>      char tag[SD_MAX_VDI_TAG_LEN];
> >>>>>>>>>>@@ -1544,6 +1561,7 @@ static int do_sd_create(BDRVSheepdogState *s, uint32_t *vdi_id, int snapshot,
> >>>>>>>>>>      hdr.vdi_size = s->inode.vdi_size;
> >>>>>>>>>>      hdr.copy_policy = s->inode.copy_policy;
> >>>>>>>>>>      hdr.copies = s->inode.nr_copies;
> >>>>>>>>>>+    hdr.block_size_shift = s->inode.block_size_shift;
> >>>>>>>>>>
> >>>>>>>>>>      ret = do_req(fd, s->aio_context, (SheepdogReq *)&hdr, buf, &wlen, &rlen);
> >>>>>>>>>>
> >>>>>>>>>>@@ -1569,9 +1587,12 @@ static int do_sd_create(BDRVSheepdogState *s, uint32_t *vdi_id, int snapshot,
> >>>>>>>>>>  static int sd_prealloc(const char *filename, Error **errp)
> >>>>>>>>>>  {
> >>>>>>>>>>      BlockDriverState *bs = NULL;
> >>>>>>>>>>+    BDRVSheepdogState *base = NULL;
> >>>>>>>>>>+    unsigned long buf_size;
> >>>>>>>>>>      uint32_t idx, max_idx;
> >>>>>>>>>>+    uint32_t object_size;
> >>>>>>>>>>      int64_t vdi_size;
> >>>>>>>>>>-    void *buf = g_malloc0(SD_DATA_OBJ_SIZE);
> >>>>>>>>>>+    void *buf = NULL;
> >>>>>>>>>>      int ret;
> >>>>>>>>>>
> >>>>>>>>>>      ret = bdrv_open(&bs, filename, NULL, NULL, BDRV_O_RDWR | BDRV_O_PROTOCOL,
> >>>>>>>>>>@@ -1585,18 +1606,24 @@ static int sd_prealloc(const char *filename, Error **errp)
> >>>>>>>>>>          ret = vdi_size;
> >>>>>>>>>>          goto out;
> >>>>>>>>>>      }
> >>>>>>>>>>-    max_idx = DIV_ROUND_UP(vdi_size, SD_DATA_OBJ_SIZE);
> >>>>>>>>>>+
> >>>>>>>>>>+    base = bs->opaque;
> >>>>>>>>>>+    object_size = (UINT32_C(1) << base->inode.block_size_shift);
> >>>>>>>>>>+    buf_size = MIN(object_size, SD_DATA_OBJ_SIZE);
> >>>>>>>>>>+    buf = g_malloc0(buf_size);
> >>>>>>>>>>+
> >>>>>>>>>>+    max_idx = DIV_ROUND_UP(vdi_size, buf_size);
> >>>>>>>>>>
> >>>>>>>>>>      for (idx = 0; idx < max_idx; idx++) {
> >>>>>>>>>>          /*
> >>>>>>>>>>           * The created image can be a cloned image, so we need to read
> >>>>>>>>>>           * a data from the source image.
> >>>>>>>>>>           */
> >>>>>>>>>>-        ret = bdrv_pread(bs, idx * SD_DATA_OBJ_SIZE, buf, SD_DATA_OBJ_SIZE);
> >>>>>>>>>>+        ret = bdrv_pread(bs, idx * buf_size, buf, buf_size);
> >>>>>>>>>>          if (ret < 0) {
> >>>>>>>>>>              goto out;
> >>>>>>>>>>          }
> >>>>>>>>>>-        ret = bdrv_pwrite(bs, idx * SD_DATA_OBJ_SIZE, buf, SD_DATA_OBJ_SIZE);
> >>>>>>>>>>+        ret = bdrv_pwrite(bs, idx * buf_size, buf, buf_size);
> >>>>>>>>>>          if (ret < 0) {
> >>>>>>>>>>              goto out;
> >>>>>>>>>>          }
> >>>>>>>>>>@@ -1669,6 +1696,21 @@ static int parse_redundancy(BDRVSheepdogState *s, const char *opt)
> >>>>>>>>>>      return 0;
> >>>>>>>>>>  }
> >>>>>>>>>>
> >>>>>>>>>>+static int parse_block_size_shift(BDRVSheepdogState *s, QemuOpts *opt)
> >>>>>>>>>>+{
> >>>>>>>>>>+    struct SheepdogInode *inode = &s->inode;
> >>>>>>>>>>+    inode->block_size_shift =
> >>>>>>>>>>+        (uint8_t)qemu_opt_get_number_del(opt, "block_size_shift", 0);
> >>>>>>>>>>+    if (inode->block_size_shift == 0) {
> >>>>>>>>>>+        /* block_size_shift is set for cluster default value by sheepdog */
> >>>>>>>>>>+        return 0;
> >>>>>>>>>>+    } else if (inode->block_size_shift < 20 || inode->block_size_shift > 31) {
> >>>>>>>>>>+        return -EINVAL;
> >>>>>>>>>>+    }
> >>>>>>>>>>+
> >>>>>>>>>>+    return 0;
> >>>>>>>>>>+}
> >>>>>>>>>>+
> >>>>>>>>>>  static int sd_create(const char *filename, QemuOpts *opts,
> >>>>>>>>>>                       Error **errp)
> >>>>>>>>>>  {
> >>>>>>>>>>@@ -1679,6 +1721,7 @@ static int sd_create(const char *filename, QemuOpts *opts,
> >>>>>>>>>>      BDRVSheepdogState *s;
> >>>>>>>>>>      char tag[SD_MAX_VDI_TAG_LEN];
> >>>>>>>>>>      uint32_t snapid;
> >>>>>>>>>>+    uint64_t max_vdi_size;
> >>>>>>>>>>      bool prealloc = false;
> >>>>>>>>>>
> >>>>>>>>>>      s = g_new0(BDRVSheepdogState, 1);
> >>>>>>>>>>@@ -1718,9 +1761,10 @@ static int sd_create(const char *filename, QemuOpts *opts,
> >>>>>>>>>>          }
> >>>>>>>>>>      }
> >>>>>>>>>>
> >>>>>>>>>>-    if (s->inode.vdi_size > SD_MAX_VDI_SIZE) {
> >>>>>>>>>>-        error_setg(errp, "too big image size");
> >>>>>>>>>>-        ret = -EINVAL;
> >>>>>>>>>>+    ret = parse_block_size_shift(s, opts);
> >>>>>>>>>>+    if (ret < 0) {
> >>>>>>>>>>+        error_setg(errp, "Invalid block_size_shift:"
> >>>>>>>>>>+                         " '%s'. please use 20-31", buf);
> >>>>>>>>>>          goto out;
> >>>>>>>>>>      }
> >>>>>>>>>>
> >>>>>>>>>>@@ -1757,6 +1801,47 @@ static int sd_create(const char *filename, QemuOpts *opts,
> >>>>>>>>>>      }
> >>>>>>>>>>
> >>>>>>>>>>      s->aio_context = qemu_get_aio_context();
> >>>>>>>>>>+
> >>>>>>>>>>+    /* if block_size_shift is not specified, get cluster default value */
> >>>>>>>>>>+    if (s->inode.block_size_shift == 0) {
> >>>>>>>>>
> >>>>>>>>>Seems that we don't keep backward compatibility for new QEMU and old sheep that
> >>>>>>>>>doesn't support SD_OP_GET_CLUSTER_DEFAULT.
> >>>>>>>>Then, I'll add the operation that if block_size_shift from
> >>>>>>>>SD_OP_GET_CLUSTER_DEFAULT is zero, block_size_shift is set to 22.
> >>>>>>>>Is it OK?
> >>>>>>>
> >>>>>>>If sheep doesn't support SD_OP_GET_CLUSTER_DEFAULT, sheep will return
> >>>>>>>SD_RES_INVALID_PARMS. So to keep backward compatibility, we shouldn't issue
> >>>>>>>SD_OP_GET_CLUSTER_DEFAULT to sheep daemon.
> >>>>>>OK, I think that the way is better.
> >>>>>>>
> >>>>>>>My point is that, after user upgrading the sheep and still stick with old QEMU,
> >>>>>>>(I guess many users will), any operations in the past sholudn't fail right after
> >>>>>>>upgrading.
> >>>>>>>
> >>>>>>>>
> >>>>>>>>>
> >>>>>>>>>What will happen if old QEMU with new sheep that support block_size_shift? Most
> >>>>>>>>>distributions will ship the old stable qemu that wouldn't be aware of
> >>>>>>>>>block_size_shift.
> >>>>>>>>If old QEMU with new sheep is used, VDI is created with cluster default
> >>>>>>>>block_size_shift and accessed as 4MB object_size.
> >>>>>>>>So I think that for backward compatibility, users must do cluster
> >>>>>>>>format command with default block_size_shift 22 equal to
> >>>>>>>>4MB object_size.
> >>>>>>>
> >>>>>>>how old QEMU know about block_size_shift? For old QEMU, block_size_shift is
> >>>>>>>encoded as 0 and then send the create request to sheep. Does sheep can handle
> >>>>>>>block_size_shift = 0? You know, we can't pass any value to old QEMU for
> >>>>>>>block_size_shift.
> >>>>>>Sheep can handle block_size_shift = 0, when users create new VDI.
> >>>>>>Old QEMU does do_sd_create() without setting hdr.block_size_shift and
> >>>>>>sends a request SD_OP_NEW_VDI.
> >>>>>>If block_size_shift is set to zero, new sheep sets cluster default value
> >>>>>>in cluster_new_vdi() like copies and copy_policy.
> >>>>>
> >>>>>Okay, so new sheep can handle old QEMU request. By the way, how about the
> >>>>>suggestion in my last email? (x + 1) * 4MB stuff...
> >>>>
> >>>>I think specifying object size in multiple of 4MB is overkill. Because
> >>>>we can specify both of block_size_shift and a number of objects which
> >>>>belong to VDI. More precisely,
> >>>>2 ^ block_size_shift * #objects = VDI size
> >>>>we can choose the block_size_shift between 20 and 30, and #objects
> >>>>from 1 < 2 ^ 20.
> >>>># #objects is specified via VDI size implicitly
> >>>
> >>>I can't understand this paragraph. If object size is fixed, we can calculate
> >>># of object easily. It has nothing to do with block_size_shift.
> >>
> >>I wanted to say that we can choose both of block_size_shift and # of
> >>objects from wide range, so granualarity is flexible and steps between
> >>VDI sizes are reasonably small.
> >>
> >>>
> >>>>The granualarity of VDI sizes seems to be really fine (even
> >>>>block_size_shift == 30, step is 1GB). So supporting block size with
> >>>>multiple of 4MB will not provide practical benefit.
> >>>
> >>>Prabably yes, but finer granularity doesn't cause trouble and gives more
> >>>flexibility. We can allow 12MB at user's will, for example. Even it doesn't
> >>>provide practical benefits, what benefit block_size_shift will provide over
> >>>(x + 1) * 4MB scheme? My point is, give a wider range won't hurt and will
> >>>pose less constaint in the future.
> >>>
> >>>The main reason I suggest (x + 1) * 4MB, is that we can easily keep backward
> >>>compatibility because x = 0 means 4MB, but with this patch proposal, x = 22
> >>>means 4MB.
> >>
> >>Utilizing block_size_shift of inode doesn't break
> >>compatibility. Because older versions of sheepdog initialize with
> >>inode->block_size_shift with 22.
> >
> >Older version? What about the old sheep that doesn't support block_size_shift?
> >If I remember well, block_size_shift is a new thing introduced by
> >fd9e4a28fad7c16b2237f4f48c9d264af192e40e, at Dec 12, 2014, very new. This means
> >all our stable sheep won't have any idea of what block_size_shift is.
> 
> Old sheepdog initialize with inode->block_size_shift with 22.
> The implementation of Sheepdog 0.9 is following
> 
> =======
> static struct sd_inode *alloc_inode(const struct vdi_iocb *iocb,
>                                     uint32_t new_snapid, uint32_t new_vid,
>                                     uint32_t *data_vdi_id,
>                                     struct generation_reference *gref)
> {
>         struct sd_inode *new = xzalloc(sizeof(*new));
>         unsigned long block_size = SD_DATA_OBJ_SIZE;
> 
> ...(snip)
>         new->block_size_shift = find_next_bit(&block_size,
> BITS_PER_LONG, 0);
> =======
> 
> It means that block_size_shift is initialized with 22.

Thanks for your code snappet, it works as you said.

> >
> >The main trouble is old QEMU, which will always set inode->block_size_shift as 0
> >and expect the object size is 4MB.
> >
> >Think about following case:
> >
> >1 Old qemu and old sheep runs a long time with many vdis that has 4MB object.
> >2 Users upgrade sheep into new sheep but doesn't upgrade QEMU. This is quit
> >   normal case because many users use stable QEMU branch.
> >3 He still use qemu-img to create new vdi and expect the object size is 4MB
> >   If new sheep doesn't set object size as 4MB, this qemu block driver will
> >   malfunction.
> 
> As I said,
> 
> Qe/sd   new    old
> new     CDS    Ign
> old     CDS    NULL
> 
> So, when we use old Qemu with new Sheepdog, we should do cluster format
> with default block_size_shift 22.

Okay, it looks our sheep can handle it. It is fine to me to use block_size_shift
though it pose some constraint to the size range. But considering sheepdog
already have the block_size_shift in the inode, it is okay to make use of it for
QEMU.

But for user option, I stick with the 'object_size' and translate it into
block_size_shift internally in the driver.

Thanks
Yuan

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [Qemu-devel] [sheepdog] [PATCH v4] sheepdog: selectable object size support
  2015-02-12  7:42                 ` Liu Yuan
  2015-02-12  8:01                   ` Teruaki Ishizaki
@ 2015-02-12  8:13                   ` Hitoshi Mitake
  2015-02-12  8:16                     ` Liu Yuan
  1 sibling, 1 reply; 28+ messages in thread
From: Hitoshi Mitake @ 2015-02-12  8:13 UTC (permalink / raw)
  To: Liu Yuan
  Cc: kwolf, sheepdog, Hitoshi Mitake, qemu-devel, stefanha, Teruaki Ishizaki

At Thu, 12 Feb 2015 15:42:17 +0800,
Liu Yuan wrote:
> 
> On Thu, Feb 12, 2015 at 04:28:01PM +0900, Hitoshi Mitake wrote:
> > At Thu, 12 Feb 2015 15:00:49 +0800,
> > Liu Yuan wrote:
> > > 
> > > On Thu, Feb 12, 2015 at 03:19:21PM +0900, Hitoshi Mitake wrote:
> > > > At Tue, 10 Feb 2015 18:35:58 +0800,
> > > > Liu Yuan wrote:
> > > > > 
> > > > > On Tue, Feb 10, 2015 at 06:56:33PM +0900, Teruaki Ishizaki wrote:
> > > > > > (2015/02/10 17:58), Liu Yuan wrote:
> > > > > > >On Tue, Feb 10, 2015 at 05:22:02PM +0900, Teruaki Ishizaki wrote:
> > > > > > >>(2015/02/10 12:10), Liu Yuan wrote:
> > > > > > >>>On Tue, Jan 27, 2015 at 05:35:27PM +0900, Teruaki Ishizaki wrote:
> > > > > > >>>>Previously, qemu block driver of sheepdog used hard-coded VDI object size.
> > > > > > >>>>This patch enables users to handle "block_size_shift" value for
> > > > > > >>>>calculating VDI object size.
> > > > > > >>>>
> > > > > > >>>>When you start qemu, you don't need to specify additional command option.
> > > > > > >>>>
> > > > > > >>>>But when you create the VDI which doesn't have default object size
> > > > > > >>>>with qemu-img command, you specify block_size_shift option.
> > > > > > >>>>
> > > > > > >>>>If you want to create a VDI of 8MB(1 << 23) object size,
> > > > > > >>>>you need to specify following command option.
> > > > > > >>>>
> > > > > > >>>>  # qemu-img create -o block_size_shift=23 sheepdog:test1 100M
> > > > > > >>>>
> > > > > > >>>>In addition, when you don't specify qemu-img command option,
> > > > > > >>>>a default value of sheepdog cluster is used for creating VDI.
> > > > > > >>>>
> > > > > > >>>>  # qemu-img create sheepdog:test2 100M
> > > > > > >>>>
> > > > > > >>>>Signed-off-by: Teruaki Ishizaki <ishizaki.teruaki@lab.ntt.co.jp>
> > > > > > >>>>---
> > > > > > >>>>V4:
> > > > > > >>>>  - Limit a read/write buffer size for creating a preallocated VDI.
> > > > > > >>>>  - Replace a parse function for the block_size_shift option.
> > > > > > >>>>  - Fix an error message.
> > > > > > >>>>
> > > > > > >>>>V3:
> > > > > > >>>>  - Delete the needless operation of buffer.
> > > > > > >>>>  - Delete the needless operations of request header.
> > > > > > >>>>    for SD_OP_GET_CLUSTER_DEFAULT.
> > > > > > >>>>  - Fix coding style problems.
> > > > > > >>>>
> > > > > > >>>>V2:
> > > > > > >>>>  - Fix coding style problem (white space).
> > > > > > >>>>  - Add members, store_policy and block_size_shift to struct SheepdogVdiReq.
> > > > > > >>>>  - Initialize request header to use block_size_shift specified by user.
> > > > > > >>>>---
> > > > > > >>>>  block/sheepdog.c          |  138 ++++++++++++++++++++++++++++++++++++++-------
> > > > > > >>>>  include/block/block_int.h |    1 +
> > > > > > >>>>  2 files changed, 119 insertions(+), 20 deletions(-)
> > > > > > >>>>
> > > > > > >>>>diff --git a/block/sheepdog.c b/block/sheepdog.c
> > > > > > >>>>index be3176f..a43b947 100644
> > > > > > >>>>--- a/block/sheepdog.c
> > > > > > >>>>+++ b/block/sheepdog.c
> > > > > > >>>>@@ -37,6 +37,7 @@
> > > > > > >>>>  #define SD_OP_READ_VDIS      0x15
> > > > > > >>>>  #define SD_OP_FLUSH_VDI      0x16
> > > > > > >>>>  #define SD_OP_DEL_VDI        0x17
> > > > > > >>>>+#define SD_OP_GET_CLUSTER_DEFAULT   0x18
> > > > > > >>>>
> > > > > > >>>>  #define SD_FLAG_CMD_WRITE    0x01
> > > > > > >>>>  #define SD_FLAG_CMD_COW      0x02
> > > > > > >>>>@@ -167,7 +168,8 @@ typedef struct SheepdogVdiReq {
> > > > > > >>>>      uint32_t base_vdi_id;
> > > > > > >>>>      uint8_t copies;
> > > > > > >>>>      uint8_t copy_policy;
> > > > > > >>>>-    uint8_t reserved[2];
> > > > > > >>>>+    uint8_t store_policy;
> > > > > > >>>>+    uint8_t block_size_shift;
> > > > > > >>>>      uint32_t snapid;
> > > > > > >>>>      uint32_t type;
> > > > > > >>>>      uint32_t pad[2];
> > > > > > >>>>@@ -186,6 +188,21 @@ typedef struct SheepdogVdiRsp {
> > > > > > >>>>      uint32_t pad[5];
> > > > > > >>>>  } SheepdogVdiRsp;
> > > > > > >>>>
> > > > > > >>>>+typedef struct SheepdogClusterRsp {
> > > > > > >>>>+    uint8_t proto_ver;
> > > > > > >>>>+    uint8_t opcode;
> > > > > > >>>>+    uint16_t flags;
> > > > > > >>>>+    uint32_t epoch;
> > > > > > >>>>+    uint32_t id;
> > > > > > >>>>+    uint32_t data_length;
> > > > > > >>>>+    uint32_t result;
> > > > > > >>>>+    uint8_t nr_copies;
> > > > > > >>>>+    uint8_t copy_policy;
> > > > > > >>>>+    uint8_t block_size_shift;
> > > > > > >>>>+    uint8_t __pad1;
> > > > > > >>>>+    uint32_t __pad2[6];
> > > > > > >>>>+} SheepdogClusterRsp;
> > > > > > >>>>+
> > > > > > >>>>  typedef struct SheepdogInode {
> > > > > > >>>>      char name[SD_MAX_VDI_LEN];
> > > > > > >>>>      char tag[SD_MAX_VDI_TAG_LEN];
> > > > > > >>>>@@ -1544,6 +1561,7 @@ static int do_sd_create(BDRVSheepdogState *s, uint32_t *vdi_id, int snapshot,
> > > > > > >>>>      hdr.vdi_size = s->inode.vdi_size;
> > > > > > >>>>      hdr.copy_policy = s->inode.copy_policy;
> > > > > > >>>>      hdr.copies = s->inode.nr_copies;
> > > > > > >>>>+    hdr.block_size_shift = s->inode.block_size_shift;
> > > > > > >>>>
> > > > > > >>>>      ret = do_req(fd, s->aio_context, (SheepdogReq *)&hdr, buf, &wlen, &rlen);
> > > > > > >>>>
> > > > > > >>>>@@ -1569,9 +1587,12 @@ static int do_sd_create(BDRVSheepdogState *s, uint32_t *vdi_id, int snapshot,
> > > > > > >>>>  static int sd_prealloc(const char *filename, Error **errp)
> > > > > > >>>>  {
> > > > > > >>>>      BlockDriverState *bs = NULL;
> > > > > > >>>>+    BDRVSheepdogState *base = NULL;
> > > > > > >>>>+    unsigned long buf_size;
> > > > > > >>>>      uint32_t idx, max_idx;
> > > > > > >>>>+    uint32_t object_size;
> > > > > > >>>>      int64_t vdi_size;
> > > > > > >>>>-    void *buf = g_malloc0(SD_DATA_OBJ_SIZE);
> > > > > > >>>>+    void *buf = NULL;
> > > > > > >>>>      int ret;
> > > > > > >>>>
> > > > > > >>>>      ret = bdrv_open(&bs, filename, NULL, NULL, BDRV_O_RDWR | BDRV_O_PROTOCOL,
> > > > > > >>>>@@ -1585,18 +1606,24 @@ static int sd_prealloc(const char *filename, Error **errp)
> > > > > > >>>>          ret = vdi_size;
> > > > > > >>>>          goto out;
> > > > > > >>>>      }
> > > > > > >>>>-    max_idx = DIV_ROUND_UP(vdi_size, SD_DATA_OBJ_SIZE);
> > > > > > >>>>+
> > > > > > >>>>+    base = bs->opaque;
> > > > > > >>>>+    object_size = (UINT32_C(1) << base->inode.block_size_shift);
> > > > > > >>>>+    buf_size = MIN(object_size, SD_DATA_OBJ_SIZE);
> > > > > > >>>>+    buf = g_malloc0(buf_size);
> > > > > > >>>>+
> > > > > > >>>>+    max_idx = DIV_ROUND_UP(vdi_size, buf_size);
> > > > > > >>>>
> > > > > > >>>>      for (idx = 0; idx < max_idx; idx++) {
> > > > > > >>>>          /*
> > > > > > >>>>           * The created image can be a cloned image, so we need to read
> > > > > > >>>>           * a data from the source image.
> > > > > > >>>>           */
> > > > > > >>>>-        ret = bdrv_pread(bs, idx * SD_DATA_OBJ_SIZE, buf, SD_DATA_OBJ_SIZE);
> > > > > > >>>>+        ret = bdrv_pread(bs, idx * buf_size, buf, buf_size);
> > > > > > >>>>          if (ret < 0) {
> > > > > > >>>>              goto out;
> > > > > > >>>>          }
> > > > > > >>>>-        ret = bdrv_pwrite(bs, idx * SD_DATA_OBJ_SIZE, buf, SD_DATA_OBJ_SIZE);
> > > > > > >>>>+        ret = bdrv_pwrite(bs, idx * buf_size, buf, buf_size);
> > > > > > >>>>          if (ret < 0) {
> > > > > > >>>>              goto out;
> > > > > > >>>>          }
> > > > > > >>>>@@ -1669,6 +1696,21 @@ static int parse_redundancy(BDRVSheepdogState *s, const char *opt)
> > > > > > >>>>      return 0;
> > > > > > >>>>  }
> > > > > > >>>>
> > > > > > >>>>+static int parse_block_size_shift(BDRVSheepdogState *s, QemuOpts *opt)
> > > > > > >>>>+{
> > > > > > >>>>+    struct SheepdogInode *inode = &s->inode;
> > > > > > >>>>+    inode->block_size_shift =
> > > > > > >>>>+        (uint8_t)qemu_opt_get_number_del(opt, "block_size_shift", 0);
> > > > > > >>>>+    if (inode->block_size_shift == 0) {
> > > > > > >>>>+        /* block_size_shift is set for cluster default value by sheepdog */
> > > > > > >>>>+        return 0;
> > > > > > >>>>+    } else if (inode->block_size_shift < 20 || inode->block_size_shift > 31) {
> > > > > > >>>>+        return -EINVAL;
> > > > > > >>>>+    }
> > > > > > >>>>+
> > > > > > >>>>+    return 0;
> > > > > > >>>>+}
> > > > > > >>>>+
> > > > > > >>>>  static int sd_create(const char *filename, QemuOpts *opts,
> > > > > > >>>>                       Error **errp)
> > > > > > >>>>  {
> > > > > > >>>>@@ -1679,6 +1721,7 @@ static int sd_create(const char *filename, QemuOpts *opts,
> > > > > > >>>>      BDRVSheepdogState *s;
> > > > > > >>>>      char tag[SD_MAX_VDI_TAG_LEN];
> > > > > > >>>>      uint32_t snapid;
> > > > > > >>>>+    uint64_t max_vdi_size;
> > > > > > >>>>      bool prealloc = false;
> > > > > > >>>>
> > > > > > >>>>      s = g_new0(BDRVSheepdogState, 1);
> > > > > > >>>>@@ -1718,9 +1761,10 @@ static int sd_create(const char *filename, QemuOpts *opts,
> > > > > > >>>>          }
> > > > > > >>>>      }
> > > > > > >>>>
> > > > > > >>>>-    if (s->inode.vdi_size > SD_MAX_VDI_SIZE) {
> > > > > > >>>>-        error_setg(errp, "too big image size");
> > > > > > >>>>-        ret = -EINVAL;
> > > > > > >>>>+    ret = parse_block_size_shift(s, opts);
> > > > > > >>>>+    if (ret < 0) {
> > > > > > >>>>+        error_setg(errp, "Invalid block_size_shift:"
> > > > > > >>>>+                         " '%s'. please use 20-31", buf);
> > > > > > >>>>          goto out;
> > > > > > >>>>      }
> > > > > > >>>>
> > > > > > >>>>@@ -1757,6 +1801,47 @@ static int sd_create(const char *filename, QemuOpts *opts,
> > > > > > >>>>      }
> > > > > > >>>>
> > > > > > >>>>      s->aio_context = qemu_get_aio_context();
> > > > > > >>>>+
> > > > > > >>>>+    /* if block_size_shift is not specified, get cluster default value */
> > > > > > >>>>+    if (s->inode.block_size_shift == 0) {
> > > > > > >>>
> > > > > > >>>Seems that we don't keep backward compatibility for new QEMU and old sheep that
> > > > > > >>>doesn't support SD_OP_GET_CLUSTER_DEFAULT.
> > > > > > >>Then, I'll add the operation that if block_size_shift from
> > > > > > >>SD_OP_GET_CLUSTER_DEFAULT is zero, block_size_shift is set to 22.
> > > > > > >>Is it OK?
> > > > > > >
> > > > > > >If sheep doesn't support SD_OP_GET_CLUSTER_DEFAULT, sheep will return
> > > > > > >SD_RES_INVALID_PARMS. So to keep backward compatibility, we shouldn't issue
> > > > > > >SD_OP_GET_CLUSTER_DEFAULT to sheep daemon.
> > > > > > OK, I think that the way is better.
> > > > > > >
> > > > > > >My point is that, after user upgrading the sheep and still stick with old QEMU,
> > > > > > >(I guess many users will), any operations in the past sholudn't fail right after
> > > > > > >upgrading.
> > > > > > >
> > > > > > >>
> > > > > > >>>
> > > > > > >>>What will happen if old QEMU with new sheep that support block_size_shift? Most
> > > > > > >>>distributions will ship the old stable qemu that wouldn't be aware of
> > > > > > >>>block_size_shift.
> > > > > > >>If old QEMU with new sheep is used, VDI is created with cluster default
> > > > > > >>block_size_shift and accessed as 4MB object_size.
> > > > > > >>So I think that for backward compatibility, users must do cluster
> > > > > > >>format command with default block_size_shift 22 equal to
> > > > > > >>4MB object_size.
> > > > > > >
> > > > > > >how old QEMU know about block_size_shift? For old QEMU, block_size_shift is
> > > > > > >encoded as 0 and then send the create request to sheep. Does sheep can handle
> > > > > > >block_size_shift = 0? You know, we can't pass any value to old QEMU for
> > > > > > >block_size_shift.
> > > > > > Sheep can handle block_size_shift = 0, when users create new VDI.
> > > > > > Old QEMU does do_sd_create() without setting hdr.block_size_shift and
> > > > > > sends a request SD_OP_NEW_VDI.
> > > > > > If block_size_shift is set to zero, new sheep sets cluster default value
> > > > > > in cluster_new_vdi() like copies and copy_policy.
> > > > > 
> > > > > Okay, so new sheep can handle old QEMU request. By the way, how about the
> > > > > suggestion in my last email? (x + 1) * 4MB stuff...
> > > > 
> > > > I think specifying object size in multiple of 4MB is overkill. Because
> > > > we can specify both of block_size_shift and a number of objects which
> > > > belong to VDI. More precisely,
> > > > 2 ^ block_size_shift * #objects = VDI size
> > > > we can choose the block_size_shift between 20 and 30, and #objects
> > > > from 1 < 2 ^ 20.
> > > > # #objects is specified via VDI size implicitly
> > > 
> > > I can't understand this paragraph. If object size is fixed, we can calculate
> > > # of object easily. It has nothing to do with block_size_shift.
> > 
> > I wanted to say that we can choose both of block_size_shift and # of
> > objects from wide range, so granualarity is flexible and steps between
> > VDI sizes are reasonably small.
> > 
> > > 
> > > > The granualarity of VDI sizes seems to be really fine (even
> > > > block_size_shift == 30, step is 1GB). So supporting block size with
> > > > multiple of 4MB will not provide practical benefit.
> > > 
> > > Prabably yes, but finer granularity doesn't cause trouble and gives more
> > > flexibility. We can allow 12MB at user's will, for example. Even it doesn't
> > > provide practical benefits, what benefit block_size_shift will provide over
> > > (x + 1) * 4MB scheme? My point is, give a wider range won't hurt and will
> > > pose less constaint in the future.
> > > 
> > > The main reason I suggest (x + 1) * 4MB, is that we can easily keep backward
> > > compatibility because x = 0 means 4MB, but with this patch proposal, x = 22
> > > means 4MB.
> > 
> > Utilizing block_size_shift of inode doesn't break
> > compatibility. Because older versions of sheepdog initialize with
> > inode->block_size_shift with 22.
> 
> Older version? What about the old sheep that doesn't support block_size_shift?
> If I remember well, block_size_shift is a new thing introduced by
> fd9e4a28fad7c16b2237f4f48c9d264af192e40e, at Dec 12, 2014, very new. This means
> all our stable sheep won't have any idea of what block_size_shift is.
> 
> The main trouble is old QEMU, which will always set inode->block_size_shift as 0
> and expect the object size is 4MB.
> 
> Think about following case:
> 
> 1 Old qemu and old sheep runs a long time with many vdis that has 4MB object.
> 2 Users upgrade sheep into new sheep but doesn't upgrade QEMU. This is quit
>   normal case because many users use stable QEMU branch.
> 3 He still use qemu-img to create new vdi and expect the object size is 4MB
>   If new sheep doesn't set object size as 4MB, this qemu block driver will
>   malfunction.

Older qemu drivers doesn't use inode->block_size_shift at all even for
VDI creation (parameters for creation are passed via
SheepdogVdiReq). So it doesn't break compatibility.

I tested qemu-img create with qemu 1.0 and the master branch of
sheepdog but there's no problem. Am I missing something?

Thanks,
Hitoshi

> 
> Thanks
> Yuan
> -- 
> sheepdog mailing list
> sheepdog@lists.wpkg.org
> https://lists.wpkg.org/mailman/listinfo/sheepdog

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [Qemu-devel] [sheepdog] [PATCH v4] sheepdog: selectable object size support
  2015-02-12  8:13                   ` Hitoshi Mitake
@ 2015-02-12  8:16                     ` Liu Yuan
  0 siblings, 0 replies; 28+ messages in thread
From: Liu Yuan @ 2015-02-12  8:16 UTC (permalink / raw)
  To: Hitoshi Mitake; +Cc: kwolf, sheepdog, Teruaki Ishizaki, qemu-devel, stefanha

On Thu, Feb 12, 2015 at 05:13:55PM +0900, Hitoshi Mitake wrote:
> At Thu, 12 Feb 2015 15:42:17 +0800,
> Liu Yuan wrote:
> > 
> > On Thu, Feb 12, 2015 at 04:28:01PM +0900, Hitoshi Mitake wrote:
> > > At Thu, 12 Feb 2015 15:00:49 +0800,
> > > Liu Yuan wrote:
> > > > 
> > > > On Thu, Feb 12, 2015 at 03:19:21PM +0900, Hitoshi Mitake wrote:
> > > > > At Tue, 10 Feb 2015 18:35:58 +0800,
> > > > > Liu Yuan wrote:
> > > > > > 
> > > > > > On Tue, Feb 10, 2015 at 06:56:33PM +0900, Teruaki Ishizaki wrote:
> > > > > > > (2015/02/10 17:58), Liu Yuan wrote:
> > > > > > > >On Tue, Feb 10, 2015 at 05:22:02PM +0900, Teruaki Ishizaki wrote:
> > > > > > > >>(2015/02/10 12:10), Liu Yuan wrote:
> > > > > > > >>>On Tue, Jan 27, 2015 at 05:35:27PM +0900, Teruaki Ishizaki wrote:
> > > > > > > >>>>Previously, qemu block driver of sheepdog used hard-coded VDI object size.
> > > > > > > >>>>This patch enables users to handle "block_size_shift" value for
> > > > > > > >>>>calculating VDI object size.
> > > > > > > >>>>
> > > > > > > >>>>When you start qemu, you don't need to specify additional command option.
> > > > > > > >>>>
> > > > > > > >>>>But when you create the VDI which doesn't have default object size
> > > > > > > >>>>with qemu-img command, you specify block_size_shift option.
> > > > > > > >>>>
> > > > > > > >>>>If you want to create a VDI of 8MB(1 << 23) object size,
> > > > > > > >>>>you need to specify following command option.
> > > > > > > >>>>
> > > > > > > >>>>  # qemu-img create -o block_size_shift=23 sheepdog:test1 100M
> > > > > > > >>>>
> > > > > > > >>>>In addition, when you don't specify qemu-img command option,
> > > > > > > >>>>a default value of sheepdog cluster is used for creating VDI.
> > > > > > > >>>>
> > > > > > > >>>>  # qemu-img create sheepdog:test2 100M
> > > > > > > >>>>
> > > > > > > >>>>Signed-off-by: Teruaki Ishizaki <ishizaki.teruaki@lab.ntt.co.jp>
> > > > > > > >>>>---
> > > > > > > >>>>V4:
> > > > > > > >>>>  - Limit a read/write buffer size for creating a preallocated VDI.
> > > > > > > >>>>  - Replace a parse function for the block_size_shift option.
> > > > > > > >>>>  - Fix an error message.
> > > > > > > >>>>
> > > > > > > >>>>V3:
> > > > > > > >>>>  - Delete the needless operation of buffer.
> > > > > > > >>>>  - Delete the needless operations of request header.
> > > > > > > >>>>    for SD_OP_GET_CLUSTER_DEFAULT.
> > > > > > > >>>>  - Fix coding style problems.
> > > > > > > >>>>
> > > > > > > >>>>V2:
> > > > > > > >>>>  - Fix coding style problem (white space).
> > > > > > > >>>>  - Add members, store_policy and block_size_shift to struct SheepdogVdiReq.
> > > > > > > >>>>  - Initialize request header to use block_size_shift specified by user.
> > > > > > > >>>>---
> > > > > > > >>>>  block/sheepdog.c          |  138 ++++++++++++++++++++++++++++++++++++++-------
> > > > > > > >>>>  include/block/block_int.h |    1 +
> > > > > > > >>>>  2 files changed, 119 insertions(+), 20 deletions(-)
> > > > > > > >>>>
> > > > > > > >>>>diff --git a/block/sheepdog.c b/block/sheepdog.c
> > > > > > > >>>>index be3176f..a43b947 100644
> > > > > > > >>>>--- a/block/sheepdog.c
> > > > > > > >>>>+++ b/block/sheepdog.c
> > > > > > > >>>>@@ -37,6 +37,7 @@
> > > > > > > >>>>  #define SD_OP_READ_VDIS      0x15
> > > > > > > >>>>  #define SD_OP_FLUSH_VDI      0x16
> > > > > > > >>>>  #define SD_OP_DEL_VDI        0x17
> > > > > > > >>>>+#define SD_OP_GET_CLUSTER_DEFAULT   0x18
> > > > > > > >>>>
> > > > > > > >>>>  #define SD_FLAG_CMD_WRITE    0x01
> > > > > > > >>>>  #define SD_FLAG_CMD_COW      0x02
> > > > > > > >>>>@@ -167,7 +168,8 @@ typedef struct SheepdogVdiReq {
> > > > > > > >>>>      uint32_t base_vdi_id;
> > > > > > > >>>>      uint8_t copies;
> > > > > > > >>>>      uint8_t copy_policy;
> > > > > > > >>>>-    uint8_t reserved[2];
> > > > > > > >>>>+    uint8_t store_policy;
> > > > > > > >>>>+    uint8_t block_size_shift;
> > > > > > > >>>>      uint32_t snapid;
> > > > > > > >>>>      uint32_t type;
> > > > > > > >>>>      uint32_t pad[2];
> > > > > > > >>>>@@ -186,6 +188,21 @@ typedef struct SheepdogVdiRsp {
> > > > > > > >>>>      uint32_t pad[5];
> > > > > > > >>>>  } SheepdogVdiRsp;
> > > > > > > >>>>
> > > > > > > >>>>+typedef struct SheepdogClusterRsp {
> > > > > > > >>>>+    uint8_t proto_ver;
> > > > > > > >>>>+    uint8_t opcode;
> > > > > > > >>>>+    uint16_t flags;
> > > > > > > >>>>+    uint32_t epoch;
> > > > > > > >>>>+    uint32_t id;
> > > > > > > >>>>+    uint32_t data_length;
> > > > > > > >>>>+    uint32_t result;
> > > > > > > >>>>+    uint8_t nr_copies;
> > > > > > > >>>>+    uint8_t copy_policy;
> > > > > > > >>>>+    uint8_t block_size_shift;
> > > > > > > >>>>+    uint8_t __pad1;
> > > > > > > >>>>+    uint32_t __pad2[6];
> > > > > > > >>>>+} SheepdogClusterRsp;
> > > > > > > >>>>+
> > > > > > > >>>>  typedef struct SheepdogInode {
> > > > > > > >>>>      char name[SD_MAX_VDI_LEN];
> > > > > > > >>>>      char tag[SD_MAX_VDI_TAG_LEN];
> > > > > > > >>>>@@ -1544,6 +1561,7 @@ static int do_sd_create(BDRVSheepdogState *s, uint32_t *vdi_id, int snapshot,
> > > > > > > >>>>      hdr.vdi_size = s->inode.vdi_size;
> > > > > > > >>>>      hdr.copy_policy = s->inode.copy_policy;
> > > > > > > >>>>      hdr.copies = s->inode.nr_copies;
> > > > > > > >>>>+    hdr.block_size_shift = s->inode.block_size_shift;
> > > > > > > >>>>
> > > > > > > >>>>      ret = do_req(fd, s->aio_context, (SheepdogReq *)&hdr, buf, &wlen, &rlen);
> > > > > > > >>>>
> > > > > > > >>>>@@ -1569,9 +1587,12 @@ static int do_sd_create(BDRVSheepdogState *s, uint32_t *vdi_id, int snapshot,
> > > > > > > >>>>  static int sd_prealloc(const char *filename, Error **errp)
> > > > > > > >>>>  {
> > > > > > > >>>>      BlockDriverState *bs = NULL;
> > > > > > > >>>>+    BDRVSheepdogState *base = NULL;
> > > > > > > >>>>+    unsigned long buf_size;
> > > > > > > >>>>      uint32_t idx, max_idx;
> > > > > > > >>>>+    uint32_t object_size;
> > > > > > > >>>>      int64_t vdi_size;
> > > > > > > >>>>-    void *buf = g_malloc0(SD_DATA_OBJ_SIZE);
> > > > > > > >>>>+    void *buf = NULL;
> > > > > > > >>>>      int ret;
> > > > > > > >>>>
> > > > > > > >>>>      ret = bdrv_open(&bs, filename, NULL, NULL, BDRV_O_RDWR | BDRV_O_PROTOCOL,
> > > > > > > >>>>@@ -1585,18 +1606,24 @@ static int sd_prealloc(const char *filename, Error **errp)
> > > > > > > >>>>          ret = vdi_size;
> > > > > > > >>>>          goto out;
> > > > > > > >>>>      }
> > > > > > > >>>>-    max_idx = DIV_ROUND_UP(vdi_size, SD_DATA_OBJ_SIZE);
> > > > > > > >>>>+
> > > > > > > >>>>+    base = bs->opaque;
> > > > > > > >>>>+    object_size = (UINT32_C(1) << base->inode.block_size_shift);
> > > > > > > >>>>+    buf_size = MIN(object_size, SD_DATA_OBJ_SIZE);
> > > > > > > >>>>+    buf = g_malloc0(buf_size);
> > > > > > > >>>>+
> > > > > > > >>>>+    max_idx = DIV_ROUND_UP(vdi_size, buf_size);
> > > > > > > >>>>
> > > > > > > >>>>      for (idx = 0; idx < max_idx; idx++) {
> > > > > > > >>>>          /*
> > > > > > > >>>>           * The created image can be a cloned image, so we need to read
> > > > > > > >>>>           * a data from the source image.
> > > > > > > >>>>           */
> > > > > > > >>>>-        ret = bdrv_pread(bs, idx * SD_DATA_OBJ_SIZE, buf, SD_DATA_OBJ_SIZE);
> > > > > > > >>>>+        ret = bdrv_pread(bs, idx * buf_size, buf, buf_size);
> > > > > > > >>>>          if (ret < 0) {
> > > > > > > >>>>              goto out;
> > > > > > > >>>>          }
> > > > > > > >>>>-        ret = bdrv_pwrite(bs, idx * SD_DATA_OBJ_SIZE, buf, SD_DATA_OBJ_SIZE);
> > > > > > > >>>>+        ret = bdrv_pwrite(bs, idx * buf_size, buf, buf_size);
> > > > > > > >>>>          if (ret < 0) {
> > > > > > > >>>>              goto out;
> > > > > > > >>>>          }
> > > > > > > >>>>@@ -1669,6 +1696,21 @@ static int parse_redundancy(BDRVSheepdogState *s, const char *opt)
> > > > > > > >>>>      return 0;
> > > > > > > >>>>  }
> > > > > > > >>>>
> > > > > > > >>>>+static int parse_block_size_shift(BDRVSheepdogState *s, QemuOpts *opt)
> > > > > > > >>>>+{
> > > > > > > >>>>+    struct SheepdogInode *inode = &s->inode;
> > > > > > > >>>>+    inode->block_size_shift =
> > > > > > > >>>>+        (uint8_t)qemu_opt_get_number_del(opt, "block_size_shift", 0);
> > > > > > > >>>>+    if (inode->block_size_shift == 0) {
> > > > > > > >>>>+        /* block_size_shift is set for cluster default value by sheepdog */
> > > > > > > >>>>+        return 0;
> > > > > > > >>>>+    } else if (inode->block_size_shift < 20 || inode->block_size_shift > 31) {
> > > > > > > >>>>+        return -EINVAL;
> > > > > > > >>>>+    }
> > > > > > > >>>>+
> > > > > > > >>>>+    return 0;
> > > > > > > >>>>+}
> > > > > > > >>>>+
> > > > > > > >>>>  static int sd_create(const char *filename, QemuOpts *opts,
> > > > > > > >>>>                       Error **errp)
> > > > > > > >>>>  {
> > > > > > > >>>>@@ -1679,6 +1721,7 @@ static int sd_create(const char *filename, QemuOpts *opts,
> > > > > > > >>>>      BDRVSheepdogState *s;
> > > > > > > >>>>      char tag[SD_MAX_VDI_TAG_LEN];
> > > > > > > >>>>      uint32_t snapid;
> > > > > > > >>>>+    uint64_t max_vdi_size;
> > > > > > > >>>>      bool prealloc = false;
> > > > > > > >>>>
> > > > > > > >>>>      s = g_new0(BDRVSheepdogState, 1);
> > > > > > > >>>>@@ -1718,9 +1761,10 @@ static int sd_create(const char *filename, QemuOpts *opts,
> > > > > > > >>>>          }
> > > > > > > >>>>      }
> > > > > > > >>>>
> > > > > > > >>>>-    if (s->inode.vdi_size > SD_MAX_VDI_SIZE) {
> > > > > > > >>>>-        error_setg(errp, "too big image size");
> > > > > > > >>>>-        ret = -EINVAL;
> > > > > > > >>>>+    ret = parse_block_size_shift(s, opts);
> > > > > > > >>>>+    if (ret < 0) {
> > > > > > > >>>>+        error_setg(errp, "Invalid block_size_shift:"
> > > > > > > >>>>+                         " '%s'. please use 20-31", buf);
> > > > > > > >>>>          goto out;
> > > > > > > >>>>      }
> > > > > > > >>>>
> > > > > > > >>>>@@ -1757,6 +1801,47 @@ static int sd_create(const char *filename, QemuOpts *opts,
> > > > > > > >>>>      }
> > > > > > > >>>>
> > > > > > > >>>>      s->aio_context = qemu_get_aio_context();
> > > > > > > >>>>+
> > > > > > > >>>>+    /* if block_size_shift is not specified, get cluster default value */
> > > > > > > >>>>+    if (s->inode.block_size_shift == 0) {
> > > > > > > >>>
> > > > > > > >>>Seems that we don't keep backward compatibility for new QEMU and old sheep that
> > > > > > > >>>doesn't support SD_OP_GET_CLUSTER_DEFAULT.
> > > > > > > >>Then, I'll add the operation that if block_size_shift from
> > > > > > > >>SD_OP_GET_CLUSTER_DEFAULT is zero, block_size_shift is set to 22.
> > > > > > > >>Is it OK?
> > > > > > > >
> > > > > > > >If sheep doesn't support SD_OP_GET_CLUSTER_DEFAULT, sheep will return
> > > > > > > >SD_RES_INVALID_PARMS. So to keep backward compatibility, we shouldn't issue
> > > > > > > >SD_OP_GET_CLUSTER_DEFAULT to sheep daemon.
> > > > > > > OK, I think that the way is better.
> > > > > > > >
> > > > > > > >My point is that, after user upgrading the sheep and still stick with old QEMU,
> > > > > > > >(I guess many users will), any operations in the past sholudn't fail right after
> > > > > > > >upgrading.
> > > > > > > >
> > > > > > > >>
> > > > > > > >>>
> > > > > > > >>>What will happen if old QEMU with new sheep that support block_size_shift? Most
> > > > > > > >>>distributions will ship the old stable qemu that wouldn't be aware of
> > > > > > > >>>block_size_shift.
> > > > > > > >>If old QEMU with new sheep is used, VDI is created with cluster default
> > > > > > > >>block_size_shift and accessed as 4MB object_size.
> > > > > > > >>So I think that for backward compatibility, users must do cluster
> > > > > > > >>format command with default block_size_shift 22 equal to
> > > > > > > >>4MB object_size.
> > > > > > > >
> > > > > > > >how old QEMU know about block_size_shift? For old QEMU, block_size_shift is
> > > > > > > >encoded as 0 and then send the create request to sheep. Does sheep can handle
> > > > > > > >block_size_shift = 0? You know, we can't pass any value to old QEMU for
> > > > > > > >block_size_shift.
> > > > > > > Sheep can handle block_size_shift = 0, when users create new VDI.
> > > > > > > Old QEMU does do_sd_create() without setting hdr.block_size_shift and
> > > > > > > sends a request SD_OP_NEW_VDI.
> > > > > > > If block_size_shift is set to zero, new sheep sets cluster default value
> > > > > > > in cluster_new_vdi() like copies and copy_policy.
> > > > > > 
> > > > > > Okay, so new sheep can handle old QEMU request. By the way, how about the
> > > > > > suggestion in my last email? (x + 1) * 4MB stuff...
> > > > > 
> > > > > I think specifying object size in multiple of 4MB is overkill. Because
> > > > > we can specify both of block_size_shift and a number of objects which
> > > > > belong to VDI. More precisely,
> > > > > 2 ^ block_size_shift * #objects = VDI size
> > > > > we can choose the block_size_shift between 20 and 30, and #objects
> > > > > from 1 < 2 ^ 20.
> > > > > # #objects is specified via VDI size implicitly
> > > > 
> > > > I can't understand this paragraph. If object size is fixed, we can calculate
> > > > # of object easily. It has nothing to do with block_size_shift.
> > > 
> > > I wanted to say that we can choose both of block_size_shift and # of
> > > objects from wide range, so granualarity is flexible and steps between
> > > VDI sizes are reasonably small.
> > > 
> > > > 
> > > > > The granualarity of VDI sizes seems to be really fine (even
> > > > > block_size_shift == 30, step is 1GB). So supporting block size with
> > > > > multiple of 4MB will not provide practical benefit.
> > > > 
> > > > Prabably yes, but finer granularity doesn't cause trouble and gives more
> > > > flexibility. We can allow 12MB at user's will, for example. Even it doesn't
> > > > provide practical benefits, what benefit block_size_shift will provide over
> > > > (x + 1) * 4MB scheme? My point is, give a wider range won't hurt and will
> > > > pose less constaint in the future.
> > > > 
> > > > The main reason I suggest (x + 1) * 4MB, is that we can easily keep backward
> > > > compatibility because x = 0 means 4MB, but with this patch proposal, x = 22
> > > > means 4MB.
> > > 
> > > Utilizing block_size_shift of inode doesn't break
> > > compatibility. Because older versions of sheepdog initialize with
> > > inode->block_size_shift with 22.
> > 
> > Older version? What about the old sheep that doesn't support block_size_shift?
> > If I remember well, block_size_shift is a new thing introduced by
> > fd9e4a28fad7c16b2237f4f48c9d264af192e40e, at Dec 12, 2014, very new. This means
> > all our stable sheep won't have any idea of what block_size_shift is.
> > 
> > The main trouble is old QEMU, which will always set inode->block_size_shift as 0
> > and expect the object size is 4MB.
> > 
> > Think about following case:
> > 
> > 1 Old qemu and old sheep runs a long time with many vdis that has 4MB object.
> > 2 Users upgrade sheep into new sheep but doesn't upgrade QEMU. This is quit
> >   normal case because many users use stable QEMU branch.
> > 3 He still use qemu-img to create new vdi and expect the object size is 4MB
> >   If new sheep doesn't set object size as 4MB, this qemu block driver will
> >   malfunction.
> 
> Older qemu drivers doesn't use inode->block_size_shift at all even for
> VDI creation (parameters for creation are passed via
> SheepdogVdiReq). So it doesn't break compatibility.
> 
> I tested qemu-img create with qemu 1.0 and the master branch of
> sheepdog but there's no problem. Am I missing something?

I guess I misunderstand how the master sheep handles block_size_shift = 0 case.
Sorry for the trouble.

Yuan

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [Qemu-devel] [PATCH v4] sheepdog: selectable object size support
  2015-02-12  2:55         ` Liu Yuan
@ 2015-02-13  1:33           ` Teruaki Ishizaki
  2015-02-13  2:01             ` Liu Yuan
  0 siblings, 1 reply; 28+ messages in thread
From: Teruaki Ishizaki @ 2015-02-13  1:33 UTC (permalink / raw)
  To: Liu Yuan; +Cc: kwolf, mitake.hitoshi, sheepdog, qemu-devel, stefanha

(2015/02/12 11:55), Liu Yuan wrote:
> On Thu, Feb 12, 2015 at 11:33:16AM +0900, Teruaki Ishizaki wrote:
>> (2015/02/12 11:19), Liu Yuan wrote:
>>> On Thu, Feb 12, 2015 at 10:51:25AM +0900, Teruaki Ishizaki wrote:
>>>> (2015/02/10 20:12), Liu Yuan wrote:
>>>>> On Tue, Jan 27, 2015 at 05:35:27PM +0900, Teruaki Ishizaki wrote:
>>>>>> Previously, qemu block driver of sheepdog used hard-coded VDI object size.
>>>>>> This patch enables users to handle "block_size_shift" value for
>>>>>> calculating VDI object size.
>>>>>>
>>>>>> When you start qemu, you don't need to specify additional command option.
>>>>>>
>>>>>> But when you create the VDI which doesn't have default object size
>>>>>> with qemu-img command, you specify block_size_shift option.
>>>>>>
>>>>>> If you want to create a VDI of 8MB(1 << 23) object size,
>>>>>> you need to specify following command option.
>>>>>>
>>>>>>   # qemu-img create -o block_size_shift=23 sheepdog:test1 100M
>>>>>>
>>>>>> In addition, when you don't specify qemu-img command option,
>>>>>> a default value of sheepdog cluster is used for creating VDI.
>>>>>>
>>>>>>   # qemu-img create sheepdog:test2 100M
>>>>>>
>>>>>> Signed-off-by: Teruaki Ishizaki <ishizaki.teruaki@lab.ntt.co.jp>
>>>>>> ---
>>>>>> V4:
>>>>>>   - Limit a read/write buffer size for creating a preallocated VDI.
>>>>>>   - Replace a parse function for the block_size_shift option.
>>>>>>   - Fix an error message.
>>>>>>
>>>>>> V3:
>>>>>>   - Delete the needless operation of buffer.
>>>>>>   - Delete the needless operations of request header.
>>>>>>     for SD_OP_GET_CLUSTER_DEFAULT.
>>>>>>   - Fix coding style problems.
>>>>>>
>>>>>> V2:
>>>>>>   - Fix coding style problem (white space).
>>>>>>   - Add members, store_policy and block_size_shift to struct SheepdogVdiReq.
>>>>>>   - Initialize request header to use block_size_shift specified by user.
>>>>>> ---
>>>>>>   block/sheepdog.c          |  138 ++++++++++++++++++++++++++++++++++++++-------
>>>>>>   include/block/block_int.h |    1 +
>>>>>>   2 files changed, 119 insertions(+), 20 deletions(-)
>>>>>>
>>>>>> diff --git a/block/sheepdog.c b/block/sheepdog.c
>>>>>> index be3176f..a43b947 100644
>>>>>> --- a/block/sheepdog.c
>>>>>> +++ b/block/sheepdog.c
>>>>>> @@ -37,6 +37,7 @@
>>>>>>   #define SD_OP_READ_VDIS      0x15
>>>>>>   #define SD_OP_FLUSH_VDI      0x16
>>>>>>   #define SD_OP_DEL_VDI        0x17
>>>>>> +#define SD_OP_GET_CLUSTER_DEFAULT   0x18
>>>>>
>>>>> This might not be necessary. For old qemu or the qemu-img without setting
>>>>> option, the block_size_shift will be 0.
>>>>>
>>>>> If we make 0 to represent 4MB object, then we don't need to get the default
>>>>> cluster object size.
>>>>>
>>>>> We migth even get rid of the idea of cluster default size. The downsize is that,
>>>>> if we want to create a vdi with different size not the default 4MB,
>>>>> we have to write it every time for qemu-img or dog.
>>>>>
>>>>> If we choose to keep the idea of cluster default size, I think we'd also try to
>>>>> avoid call this request from QEMU to make backward compatibility easier. In this
>>>>> scenario, 0 might be used to ask new sheep to decide to use cluster default size.
>>>>>
>>>>> Both old qemu and new QEMU will send 0 to sheep and both old and new sheep can
>>>>> handle 0 though it has different meanings.
>>>>>
>>>>> Table for this bit as 0:
>>>>> Qe: qemu
>>>>> SD: Sheep daemon
>>>>> CDS: Cluster Default Size
>>>>> Ign: Ignored by the sheep daemon
>>>>>
>>>>> Qe/sd   new    old
>>>>> new     CDS    Ign
>>>>> old     CDS    NULL
>>>> Does Ign mean that VDI is handled as 4MB object size?
>>>
>>> Yes, old sheep can only handle 4MB object and doesn't check this field at all.
>>>
>>>>
>>>>>
>>>>> I think this approach is acceptable. The difference to your patch is that
>>>>> we don't send SD_OP_GET_CLUSTER_DEFAULT to sheep daemon and
>>>>> SD_OP_GET_CLUSTER_DEFAULT can be removed.
>>>> When users create a new VDI with qemu-img, qemu's Sheepdog backend
>>>> driver calculates max limit VDI size.
>>>
>>>> But if block_size_shift option is not specified, qemu's Sheepdog backend
>>>> driver can't calculate max limit VDI size.
>>>
>>> If block_size_shift not specified, this means
>>>
>>> 1 for old sheep, use 4MB size
>>> 2 for new sheep, use cluster wide default value.
>>>
>>> And sheep then can calculate it on its own, no?
>>>
>> Dog command(client) calculate max size, so I think
>> that qemu's Sheepdog backend driver should calculate it
>> like dog command.
>>
>> Is that policy changeable?
>
> I checked the QEMU code and got your idea. In the past it was fixed size so very
> easy to hardcode the check in the client, no communication with sheep needed.
>
> Yes, if it is reasonable, we can change it.
>
> I think we can push the size calculation logic into sheep, if not the right size
> return INVALID_PARAMETER to clients. Clients just check this and report error
> back to users.
>
> There is no backward compability for this approach, since 4MB is the smallest
> size.
>
> OLD QEMU will limit the max_size as 4TB, which is no problem for new sheep.

I have checked the Qemu and sheepdog code.
When we resize VDI, sd_truncate() is called and
resize value is handled by Qemu.
(Sorry I haven't noticed this operation)

Then, sd_truncate() writes Sheepdog inode object directly.
So Sheepdog server can't handle maximum VDI size.

As I thought, should we use SD_OP_GET_CLUSTER_DEFAULT?
Should maxmimum VDI size be calculated on client program?

Thanks,
Teruaki

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [Qemu-devel] [PATCH v4] sheepdog: selectable object size support
  2015-02-13  1:33           ` Teruaki Ishizaki
@ 2015-02-13  2:01             ` Liu Yuan
  2015-02-13  4:28               ` Teruaki Ishizaki
  0 siblings, 1 reply; 28+ messages in thread
From: Liu Yuan @ 2015-02-13  2:01 UTC (permalink / raw)
  To: Teruaki Ishizaki; +Cc: kwolf, mitake.hitoshi, sheepdog, qemu-devel, stefanha

On Fri, Feb 13, 2015 at 10:33:04AM +0900, Teruaki Ishizaki wrote:
> (2015/02/12 11:55), Liu Yuan wrote:
> >On Thu, Feb 12, 2015 at 11:33:16AM +0900, Teruaki Ishizaki wrote:
> >>(2015/02/12 11:19), Liu Yuan wrote:
> >>>On Thu, Feb 12, 2015 at 10:51:25AM +0900, Teruaki Ishizaki wrote:
> >>>>(2015/02/10 20:12), Liu Yuan wrote:
> >>>>>On Tue, Jan 27, 2015 at 05:35:27PM +0900, Teruaki Ishizaki wrote:
> >>>>>>Previously, qemu block driver of sheepdog used hard-coded VDI object size.
> >>>>>>This patch enables users to handle "block_size_shift" value for
> >>>>>>calculating VDI object size.
> >>>>>>
> >>>>>>When you start qemu, you don't need to specify additional command option.
> >>>>>>
> >>>>>>But when you create the VDI which doesn't have default object size
> >>>>>>with qemu-img command, you specify block_size_shift option.
> >>>>>>
> >>>>>>If you want to create a VDI of 8MB(1 << 23) object size,
> >>>>>>you need to specify following command option.
> >>>>>>
> >>>>>>  # qemu-img create -o block_size_shift=23 sheepdog:test1 100M
> >>>>>>
> >>>>>>In addition, when you don't specify qemu-img command option,
> >>>>>>a default value of sheepdog cluster is used for creating VDI.
> >>>>>>
> >>>>>>  # qemu-img create sheepdog:test2 100M
> >>>>>>
> >>>>>>Signed-off-by: Teruaki Ishizaki <ishizaki.teruaki@lab.ntt.co.jp>
> >>>>>>---
> >>>>>>V4:
> >>>>>>  - Limit a read/write buffer size for creating a preallocated VDI.
> >>>>>>  - Replace a parse function for the block_size_shift option.
> >>>>>>  - Fix an error message.
> >>>>>>
> >>>>>>V3:
> >>>>>>  - Delete the needless operation of buffer.
> >>>>>>  - Delete the needless operations of request header.
> >>>>>>    for SD_OP_GET_CLUSTER_DEFAULT.
> >>>>>>  - Fix coding style problems.
> >>>>>>
> >>>>>>V2:
> >>>>>>  - Fix coding style problem (white space).
> >>>>>>  - Add members, store_policy and block_size_shift to struct SheepdogVdiReq.
> >>>>>>  - Initialize request header to use block_size_shift specified by user.
> >>>>>>---
> >>>>>>  block/sheepdog.c          |  138 ++++++++++++++++++++++++++++++++++++++-------
> >>>>>>  include/block/block_int.h |    1 +
> >>>>>>  2 files changed, 119 insertions(+), 20 deletions(-)
> >>>>>>
> >>>>>>diff --git a/block/sheepdog.c b/block/sheepdog.c
> >>>>>>index be3176f..a43b947 100644
> >>>>>>--- a/block/sheepdog.c
> >>>>>>+++ b/block/sheepdog.c
> >>>>>>@@ -37,6 +37,7 @@
> >>>>>>  #define SD_OP_READ_VDIS      0x15
> >>>>>>  #define SD_OP_FLUSH_VDI      0x16
> >>>>>>  #define SD_OP_DEL_VDI        0x17
> >>>>>>+#define SD_OP_GET_CLUSTER_DEFAULT   0x18
> >>>>>
> >>>>>This might not be necessary. For old qemu or the qemu-img without setting
> >>>>>option, the block_size_shift will be 0.
> >>>>>
> >>>>>If we make 0 to represent 4MB object, then we don't need to get the default
> >>>>>cluster object size.
> >>>>>
> >>>>>We migth even get rid of the idea of cluster default size. The downsize is that,
> >>>>>if we want to create a vdi with different size not the default 4MB,
> >>>>>we have to write it every time for qemu-img or dog.
> >>>>>
> >>>>>If we choose to keep the idea of cluster default size, I think we'd also try to
> >>>>>avoid call this request from QEMU to make backward compatibility easier. In this
> >>>>>scenario, 0 might be used to ask new sheep to decide to use cluster default size.
> >>>>>
> >>>>>Both old qemu and new QEMU will send 0 to sheep and both old and new sheep can
> >>>>>handle 0 though it has different meanings.
> >>>>>
> >>>>>Table for this bit as 0:
> >>>>>Qe: qemu
> >>>>>SD: Sheep daemon
> >>>>>CDS: Cluster Default Size
> >>>>>Ign: Ignored by the sheep daemon
> >>>>>
> >>>>>Qe/sd   new    old
> >>>>>new     CDS    Ign
> >>>>>old     CDS    NULL
> >>>>Does Ign mean that VDI is handled as 4MB object size?
> >>>
> >>>Yes, old sheep can only handle 4MB object and doesn't check this field at all.
> >>>
> >>>>
> >>>>>
> >>>>>I think this approach is acceptable. The difference to your patch is that
> >>>>>we don't send SD_OP_GET_CLUSTER_DEFAULT to sheep daemon and
> >>>>>SD_OP_GET_CLUSTER_DEFAULT can be removed.
> >>>>When users create a new VDI with qemu-img, qemu's Sheepdog backend
> >>>>driver calculates max limit VDI size.
> >>>
> >>>>But if block_size_shift option is not specified, qemu's Sheepdog backend
> >>>>driver can't calculate max limit VDI size.
> >>>
> >>>If block_size_shift not specified, this means
> >>>
> >>>1 for old sheep, use 4MB size
> >>>2 for new sheep, use cluster wide default value.
> >>>
> >>>And sheep then can calculate it on its own, no?
> >>>
> >>Dog command(client) calculate max size, so I think
> >>that qemu's Sheepdog backend driver should calculate it
> >>like dog command.
> >>
> >>Is that policy changeable?
> >
> >I checked the QEMU code and got your idea. In the past it was fixed size so very
> >easy to hardcode the check in the client, no communication with sheep needed.
> >
> >Yes, if it is reasonable, we can change it.
> >
> >I think we can push the size calculation logic into sheep, if not the right size
> >return INVALID_PARAMETER to clients. Clients just check this and report error
> >back to users.
> >
> >There is no backward compability for this approach, since 4MB is the smallest
> >size.
> >
> >OLD QEMU will limit the max_size as 4TB, which is no problem for new sheep.
> 
> I have checked the Qemu and sheepdog code.
> When we resize VDI, sd_truncate() is called and
> resize value is handled by Qemu.
> (Sorry I haven't noticed this operation)
> 
> Then, sd_truncate() writes Sheepdog inode object directly.
> So Sheepdog server can't handle maximum VDI size.
> 
> As I thought, should we use SD_OP_GET_CLUSTER_DEFAULT?
> Should maxmimum VDI size be calculated on client program?

Based on your description, yes, we have to use it. I'd suggest rename
SD_OP_GET_CLUSTER_DEFAULT as SD_OP_GET_DEFAULT_OBJECT_SIZE. If we use it, you
need to take care of old sheep that will return INVALID_PARAMETER and handle it.

Thanks
Yuan

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [Qemu-devel] [PATCH v4] sheepdog: selectable object size support
  2015-02-13  2:01             ` Liu Yuan
@ 2015-02-13  4:28               ` Teruaki Ishizaki
  0 siblings, 0 replies; 28+ messages in thread
From: Teruaki Ishizaki @ 2015-02-13  4:28 UTC (permalink / raw)
  To: Liu Yuan; +Cc: kwolf, mitake.hitoshi, sheepdog, qemu-devel, stefanha

(2015/02/13 11:01), Liu Yuan wrote:
> On Fri, Feb 13, 2015 at 10:33:04AM +0900, Teruaki Ishizaki wrote:
>> (2015/02/12 11:55), Liu Yuan wrote:
>>> On Thu, Feb 12, 2015 at 11:33:16AM +0900, Teruaki Ishizaki wrote:
>>>> (2015/02/12 11:19), Liu Yuan wrote:
>>>>> On Thu, Feb 12, 2015 at 10:51:25AM +0900, Teruaki Ishizaki wrote:
>>>>>> (2015/02/10 20:12), Liu Yuan wrote:
>>>>>>> On Tue, Jan 27, 2015 at 05:35:27PM +0900, Teruaki Ishizaki wrote:
>>>>>>>> Previously, qemu block driver of sheepdog used hard-coded VDI object size.
>>>>>>>> This patch enables users to handle "block_size_shift" value for
>>>>>>>> calculating VDI object size.
>>>>>>>>
>>>>>>>> When you start qemu, you don't need to specify additional command option.
>>>>>>>>
>>>>>>>> But when you create the VDI which doesn't have default object size
>>>>>>>> with qemu-img command, you specify block_size_shift option.
>>>>>>>>
>>>>>>>> If you want to create a VDI of 8MB(1 << 23) object size,
>>>>>>>> you need to specify following command option.
>>>>>>>>
>>>>>>>>   # qemu-img create -o block_size_shift=23 sheepdog:test1 100M
>>>>>>>>
>>>>>>>> In addition, when you don't specify qemu-img command option,
>>>>>>>> a default value of sheepdog cluster is used for creating VDI.
>>>>>>>>
>>>>>>>>   # qemu-img create sheepdog:test2 100M
>>>>>>>>
>>>>>>>> Signed-off-by: Teruaki Ishizaki <ishizaki.teruaki@lab.ntt.co.jp>
>>>>>>>> ---
>>>>>>>> V4:
>>>>>>>>   - Limit a read/write buffer size for creating a preallocated VDI.
>>>>>>>>   - Replace a parse function for the block_size_shift option.
>>>>>>>>   - Fix an error message.
>>>>>>>>
>>>>>>>> V3:
>>>>>>>>   - Delete the needless operation of buffer.
>>>>>>>>   - Delete the needless operations of request header.
>>>>>>>>     for SD_OP_GET_CLUSTER_DEFAULT.
>>>>>>>>   - Fix coding style problems.
>>>>>>>>
>>>>>>>> V2:
>>>>>>>>   - Fix coding style problem (white space).
>>>>>>>>   - Add members, store_policy and block_size_shift to struct SheepdogVdiReq.
>>>>>>>>   - Initialize request header to use block_size_shift specified by user.
>>>>>>>> ---
>>>>>>>>   block/sheepdog.c          |  138 ++++++++++++++++++++++++++++++++++++++-------
>>>>>>>>   include/block/block_int.h |    1 +
>>>>>>>>   2 files changed, 119 insertions(+), 20 deletions(-)
>>>>>>>>
>>>>>>>> diff --git a/block/sheepdog.c b/block/sheepdog.c
>>>>>>>> index be3176f..a43b947 100644
>>>>>>>> --- a/block/sheepdog.c
>>>>>>>> +++ b/block/sheepdog.c
>>>>>>>> @@ -37,6 +37,7 @@
>>>>>>>>   #define SD_OP_READ_VDIS      0x15
>>>>>>>>   #define SD_OP_FLUSH_VDI      0x16
>>>>>>>>   #define SD_OP_DEL_VDI        0x17
>>>>>>>> +#define SD_OP_GET_CLUSTER_DEFAULT   0x18
>>>>>>>
>>>>>>> This might not be necessary. For old qemu or the qemu-img without setting
>>>>>>> option, the block_size_shift will be 0.
>>>>>>>
>>>>>>> If we make 0 to represent 4MB object, then we don't need to get the default
>>>>>>> cluster object size.
>>>>>>>
>>>>>>> We migth even get rid of the idea of cluster default size. The downsize is that,
>>>>>>> if we want to create a vdi with different size not the default 4MB,
>>>>>>> we have to write it every time for qemu-img or dog.
>>>>>>>
>>>>>>> If we choose to keep the idea of cluster default size, I think we'd also try to
>>>>>>> avoid call this request from QEMU to make backward compatibility easier. In this
>>>>>>> scenario, 0 might be used to ask new sheep to decide to use cluster default size.
>>>>>>>
>>>>>>> Both old qemu and new QEMU will send 0 to sheep and both old and new sheep can
>>>>>>> handle 0 though it has different meanings.
>>>>>>>
>>>>>>> Table for this bit as 0:
>>>>>>> Qe: qemu
>>>>>>> SD: Sheep daemon
>>>>>>> CDS: Cluster Default Size
>>>>>>> Ign: Ignored by the sheep daemon
>>>>>>>
>>>>>>> Qe/sd   new    old
>>>>>>> new     CDS    Ign
>>>>>>> old     CDS    NULL
>>>>>> Does Ign mean that VDI is handled as 4MB object size?
>>>>>
>>>>> Yes, old sheep can only handle 4MB object and doesn't check this field at all.
>>>>>
>>>>>>
>>>>>>>
>>>>>>> I think this approach is acceptable. The difference to your patch is that
>>>>>>> we don't send SD_OP_GET_CLUSTER_DEFAULT to sheep daemon and
>>>>>>> SD_OP_GET_CLUSTER_DEFAULT can be removed.
>>>>>> When users create a new VDI with qemu-img, qemu's Sheepdog backend
>>>>>> driver calculates max limit VDI size.
>>>>>
>>>>>> But if block_size_shift option is not specified, qemu's Sheepdog backend
>>>>>> driver can't calculate max limit VDI size.
>>>>>
>>>>> If block_size_shift not specified, this means
>>>>>
>>>>> 1 for old sheep, use 4MB size
>>>>> 2 for new sheep, use cluster wide default value.
>>>>>
>>>>> And sheep then can calculate it on its own, no?
>>>>>
>>>> Dog command(client) calculate max size, so I think
>>>> that qemu's Sheepdog backend driver should calculate it
>>>> like dog command.
>>>>
>>>> Is that policy changeable?
>>>
>>> I checked the QEMU code and got your idea. In the past it was fixed size so very
>>> easy to hardcode the check in the client, no communication with sheep needed.
>>>
>>> Yes, if it is reasonable, we can change it.
>>>
>>> I think we can push the size calculation logic into sheep, if not the right size
>>> return INVALID_PARAMETER to clients. Clients just check this and report error
>>> back to users.
>>>
>>> There is no backward compability for this approach, since 4MB is the smallest
>>> size.
>>>
>>> OLD QEMU will limit the max_size as 4TB, which is no problem for new sheep.
>>
>> I have checked the Qemu and sheepdog code.
>> When we resize VDI, sd_truncate() is called and
>> resize value is handled by Qemu.
>> (Sorry I haven't noticed this operation)
>>
>> Then, sd_truncate() writes Sheepdog inode object directly.
>> So Sheepdog server can't handle maximum VDI size.
>>
>> As I thought, should we use SD_OP_GET_CLUSTER_DEFAULT?
>> Should maxmimum VDI size be calculated on client program?
>
> Based on your description, yes, we have to use it. I'd suggest rename
> SD_OP_GET_CLUSTER_DEFAULT as SD_OP_GET_DEFAULT_OBJECT_SIZE. If we use it, you
> need to take care of old sheep that will return INVALID_PARAMETER and handle it.
>
Now, SD_OP_GET_CLUSTER_DEFAULT can get block_size_shift, copy_policy
and copies informations.
I think that SD_OP_GET_DEFAULT_OBJECT_SIZE doesn't fit in.

I'll change to handle INVALID_PARAMETER.

Thanks,
Teruaki

^ permalink raw reply	[flat|nested] 28+ messages in thread

end of thread, other threads:[~2015-02-13  4:28 UTC | newest]

Thread overview: 28+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2015-01-27  8:35 [Qemu-devel] [PATCH v4] sheepdog: selectable object size support Teruaki Ishizaki
2015-02-02  6:52 ` Liu Yuan
2015-02-04  4:54   ` Teruaki Ishizaki
2015-02-06  2:18     ` Liu Yuan
2015-02-06  7:57       ` Teruaki Ishizaki
2015-02-09  3:08         ` Liu Yuan
2015-02-10  3:10 ` Liu Yuan
2015-02-10  3:18   ` Liu Yuan
2015-02-10  8:22   ` Teruaki Ishizaki
2015-02-10  8:58     ` Liu Yuan
2015-02-10  9:56       ` Teruaki Ishizaki
2015-02-10 10:35         ` Liu Yuan
2015-02-12  6:19           ` [Qemu-devel] [sheepdog] " Hitoshi Mitake
2015-02-12  7:00             ` Liu Yuan
2015-02-12  7:28               ` Hitoshi Mitake
2015-02-12  7:42                 ` Liu Yuan
2015-02-12  8:01                   ` Teruaki Ishizaki
2015-02-12  8:11                     ` Liu Yuan
2015-02-12  8:13                   ` Hitoshi Mitake
2015-02-12  8:16                     ` Liu Yuan
2015-02-10 11:12 ` [Qemu-devel] " Liu Yuan
2015-02-12  1:51   ` Teruaki Ishizaki
2015-02-12  2:19     ` Liu Yuan
2015-02-12  2:33       ` Teruaki Ishizaki
2015-02-12  2:55         ` Liu Yuan
2015-02-13  1:33           ` Teruaki Ishizaki
2015-02-13  2:01             ` Liu Yuan
2015-02-13  4:28               ` Teruaki Ishizaki

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.