All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 0/7] block/rbd: migrate to coroutines and add write zeroes support
@ 2020-12-27 16:42 Peter Lieven
  2020-12-27 16:42 ` [PATCH 1/7] block/rbd: bump librbd requirement to luminous release Peter Lieven
                   ` (6 more replies)
  0 siblings, 7 replies; 28+ messages in thread
From: Peter Lieven @ 2020-12-27 16:42 UTC (permalink / raw)
  To: qemu-block; +Cc: kwolf, Peter Lieven, qemu-devel, ct, mreitz, dillaman

this series migrates the qemu rbd driver from the old aio emulation
to native coroutines and adds write zeroes support which is important
for block operations.

To archive this we first bump the librbd requirement to the already
outdated luminous release of ceph to get rid of some wrappers and
ifdef'ry in the code.

Peter Lieven (7):
  block/rbd: bump librbd requirement to luminous release
  block/rbd: store object_size in BDRVRBDState
  block/rbd: use stored image_size in qemu_rbd_getlength
  block/rbd: add bdrv_{attach,detach}_aio_context
  block/rbd: migrate from aio to coroutines
  block/rbd: add write zeroes support
  block/rbd: change request alignment to 1 byte

 block/rbd.c | 421 +++++++++++++++++-----------------------------------
 configure   |   7 +-
 2 files changed, 139 insertions(+), 289 deletions(-)

-- 
2.17.1




^ permalink raw reply	[flat|nested] 28+ messages in thread

* [PATCH 1/7] block/rbd: bump librbd requirement to luminous release
  2020-12-27 16:42 [PATCH 0/7] block/rbd: migrate to coroutines and add write zeroes support Peter Lieven
@ 2020-12-27 16:42 ` Peter Lieven
  2020-12-27 16:42 ` [PATCH 2/7] block/rbd: store object_size in BDRVRBDState Peter Lieven
                   ` (5 subsequent siblings)
  6 siblings, 0 replies; 28+ messages in thread
From: Peter Lieven @ 2020-12-27 16:42 UTC (permalink / raw)
  To: qemu-block; +Cc: kwolf, Peter Lieven, qemu-devel, ct, mreitz, dillaman

even luminous (version 12.2) is unmaintained for over 3 years now.
Bump the requirement to get rid of the ifdef'ry in the code.

Signed-off-by: Peter Lieven <pl@kamp.de>
---
 block/rbd.c | 120 ++++------------------------------------------------
 configure   |   7 +--
 2 files changed, 12 insertions(+), 115 deletions(-)

diff --git a/block/rbd.c b/block/rbd.c
index 9bd2bce716..650e27c351 100644
--- a/block/rbd.c
+++ b/block/rbd.c
@@ -55,24 +55,10 @@
  * leading "\".
  */
 
-/* rbd_aio_discard added in 0.1.2 */
-#if LIBRBD_VERSION_CODE >= LIBRBD_VERSION(0, 1, 2)
-#define LIBRBD_SUPPORTS_DISCARD
-#else
-#undef LIBRBD_SUPPORTS_DISCARD
-#endif
-
 #define OBJ_MAX_SIZE (1UL << OBJ_DEFAULT_OBJ_ORDER)
 
 #define RBD_MAX_SNAPS 100
 
-/* The LIBRBD_SUPPORTS_IOVEC is defined in librbd.h */
-#ifdef LIBRBD_SUPPORTS_IOVEC
-#define LIBRBD_USE_IOVEC 1
-#else
-#define LIBRBD_USE_IOVEC 0
-#endif
-
 typedef enum {
     RBD_AIO_READ,
     RBD_AIO_WRITE,
@@ -84,7 +70,6 @@ typedef struct RBDAIOCB {
     BlockAIOCB common;
     int64_t ret;
     QEMUIOVector *qiov;
-    char *bounce;
     RBDAIOCmd cmd;
     int error;
     struct BDRVRBDState *s;
@@ -94,7 +79,6 @@ typedef struct RADOSCB {
     RBDAIOCB *acb;
     struct BDRVRBDState *s;
     int64_t size;
-    char *buf;
     int64_t ret;
 } RADOSCB;
 
@@ -332,13 +316,9 @@ static int qemu_rbd_set_keypairs(rados_t cluster, const char *keypairs_json,
 
 static void qemu_rbd_memset(RADOSCB *rcb, int64_t offs)
 {
-    if (LIBRBD_USE_IOVEC) {
-        RBDAIOCB *acb = rcb->acb;
-        iov_memset(acb->qiov->iov, acb->qiov->niov, offs, 0,
-                   acb->qiov->size - offs);
-    } else {
-        memset(rcb->buf + offs, 0, rcb->size - offs);
-    }
+    RBDAIOCB *acb = rcb->acb;
+    iov_memset(acb->qiov->iov, acb->qiov->niov, offs, 0,
+               acb->qiov->size - offs);
 }
 
 /* FIXME Deprecate and remove keypairs or make it available in QMP. */
@@ -493,13 +473,6 @@ static void qemu_rbd_complete_aio(RADOSCB *rcb)
 
     g_free(rcb);
 
-    if (!LIBRBD_USE_IOVEC) {
-        if (acb->cmd == RBD_AIO_READ) {
-            qemu_iovec_from_buf(acb->qiov, 0, acb->bounce, acb->qiov->size);
-        }
-        qemu_vfree(acb->bounce);
-    }
-
     acb->common.cb(acb->common.opaque, (acb->ret > 0 ? 0 : acb->ret));
 
     qemu_aio_unref(acb);
@@ -866,28 +839,6 @@ static void rbd_finish_aiocb(rbd_completion_t c, RADOSCB *rcb)
                                      rbd_finish_bh, rcb);
 }
 
-static int rbd_aio_discard_wrapper(rbd_image_t image,
-                                   uint64_t off,
-                                   uint64_t len,
-                                   rbd_completion_t comp)
-{
-#ifdef LIBRBD_SUPPORTS_DISCARD
-    return rbd_aio_discard(image, off, len, comp);
-#else
-    return -ENOTSUP;
-#endif
-}
-
-static int rbd_aio_flush_wrapper(rbd_image_t image,
-                                 rbd_completion_t comp)
-{
-#ifdef LIBRBD_SUPPORTS_AIO_FLUSH
-    return rbd_aio_flush(image, comp);
-#else
-    return -ENOTSUP;
-#endif
-}
-
 static BlockAIOCB *rbd_start_aio(BlockDriverState *bs,
                                  int64_t off,
                                  QEMUIOVector *qiov,
@@ -910,21 +861,6 @@ static BlockAIOCB *rbd_start_aio(BlockDriverState *bs,
 
     rcb = g_new(RADOSCB, 1);
 
-    if (!LIBRBD_USE_IOVEC) {
-        if (cmd == RBD_AIO_DISCARD || cmd == RBD_AIO_FLUSH) {
-            acb->bounce = NULL;
-        } else {
-            acb->bounce = qemu_try_blockalign(bs, qiov->size);
-            if (acb->bounce == NULL) {
-                goto failed;
-            }
-        }
-        if (cmd == RBD_AIO_WRITE) {
-            qemu_iovec_to_buf(acb->qiov, 0, acb->bounce, qiov->size);
-        }
-        rcb->buf = acb->bounce;
-    }
-
     acb->ret = 0;
     acb->error = 0;
     acb->s = s;
@@ -938,7 +874,7 @@ static BlockAIOCB *rbd_start_aio(BlockDriverState *bs,
     }
 
     switch (cmd) {
-    case RBD_AIO_WRITE: {
+    case RBD_AIO_WRITE:
         /*
          * RBD APIs don't allow us to write more than actual size, so in order
          * to support growing images, we resize the image before write
@@ -950,25 +886,16 @@ static BlockAIOCB *rbd_start_aio(BlockDriverState *bs,
                 goto failed_completion;
             }
         }
-#ifdef LIBRBD_SUPPORTS_IOVEC
-            r = rbd_aio_writev(s->image, qiov->iov, qiov->niov, off, c);
-#else
-            r = rbd_aio_write(s->image, off, size, rcb->buf, c);
-#endif
+        r = rbd_aio_writev(s->image, qiov->iov, qiov->niov, off, c);
         break;
-    }
     case RBD_AIO_READ:
-#ifdef LIBRBD_SUPPORTS_IOVEC
-            r = rbd_aio_readv(s->image, qiov->iov, qiov->niov, off, c);
-#else
-            r = rbd_aio_read(s->image, off, size, rcb->buf, c);
-#endif
+        r = rbd_aio_readv(s->image, qiov->iov, qiov->niov, off, c);
         break;
     case RBD_AIO_DISCARD:
-        r = rbd_aio_discard_wrapper(s->image, off, size, c);
+        r = rbd_aio_discard(s->image, off, size, c);
         break;
     case RBD_AIO_FLUSH:
-        r = rbd_aio_flush_wrapper(s->image, c);
+        r = rbd_aio_flush(s->image, c);
         break;
     default:
         r = -EINVAL;
@@ -983,9 +910,6 @@ failed_completion:
     rbd_aio_release(c);
 failed:
     g_free(rcb);
-    if (!LIBRBD_USE_IOVEC) {
-        qemu_vfree(acb->bounce);
-    }
 
     qemu_aio_unref(acb);
     return NULL;
@@ -1011,7 +935,6 @@ static BlockAIOCB *qemu_rbd_aio_pwritev(BlockDriverState *bs,
                          RBD_AIO_WRITE);
 }
 
-#ifdef LIBRBD_SUPPORTS_AIO_FLUSH
 static BlockAIOCB *qemu_rbd_aio_flush(BlockDriverState *bs,
                                       BlockCompletionFunc *cb,
                                       void *opaque)
@@ -1019,20 +942,6 @@ static BlockAIOCB *qemu_rbd_aio_flush(BlockDriverState *bs,
     return rbd_start_aio(bs, 0, NULL, 0, cb, opaque, RBD_AIO_FLUSH);
 }
 
-#else
-
-static int qemu_rbd_co_flush(BlockDriverState *bs)
-{
-#if LIBRBD_VERSION_CODE >= LIBRBD_VERSION(0, 1, 1)
-    /* rbd_flush added in 0.1.1 */
-    BDRVRBDState *s = bs->opaque;
-    return rbd_flush(s->image);
-#else
-    return 0;
-#endif
-}
-#endif
-
 static int qemu_rbd_getinfo(BlockDriverState *bs, BlockDriverInfo *bdi)
 {
     BDRVRBDState *s = bs->opaque;
@@ -1198,7 +1107,6 @@ static int qemu_rbd_snap_list(BlockDriverState *bs,
     return snap_count;
 }
 
-#ifdef LIBRBD_SUPPORTS_DISCARD
 static BlockAIOCB *qemu_rbd_aio_pdiscard(BlockDriverState *bs,
                                          int64_t offset,
                                          int bytes,
@@ -1208,9 +1116,7 @@ static BlockAIOCB *qemu_rbd_aio_pdiscard(BlockDriverState *bs,
     return rbd_start_aio(bs, offset, NULL, bytes, cb, opaque,
                          RBD_AIO_DISCARD);
 }
-#endif
 
-#ifdef LIBRBD_SUPPORTS_INVALIDATE
 static void coroutine_fn qemu_rbd_co_invalidate_cache(BlockDriverState *bs,
                                                       Error **errp)
 {
@@ -1220,7 +1126,6 @@ static void coroutine_fn qemu_rbd_co_invalidate_cache(BlockDriverState *bs,
         error_setg_errno(errp, -r, "Failed to invalidate the cache");
     }
 }
-#endif
 
 static QemuOptsList qemu_rbd_create_opts = {
     .name = "rbd-create-opts",
@@ -1278,23 +1183,14 @@ static BlockDriver bdrv_rbd = {
     .bdrv_aio_preadv        = qemu_rbd_aio_preadv,
     .bdrv_aio_pwritev       = qemu_rbd_aio_pwritev,
 
-#ifdef LIBRBD_SUPPORTS_AIO_FLUSH
     .bdrv_aio_flush         = qemu_rbd_aio_flush,
-#else
-    .bdrv_co_flush_to_disk  = qemu_rbd_co_flush,
-#endif
-
-#ifdef LIBRBD_SUPPORTS_DISCARD
     .bdrv_aio_pdiscard      = qemu_rbd_aio_pdiscard,
-#endif
 
     .bdrv_snapshot_create   = qemu_rbd_snap_create,
     .bdrv_snapshot_delete   = qemu_rbd_snap_remove,
     .bdrv_snapshot_list     = qemu_rbd_snap_list,
     .bdrv_snapshot_goto     = qemu_rbd_snap_rollback,
-#ifdef LIBRBD_SUPPORTS_INVALIDATE
     .bdrv_co_invalidate_cache = qemu_rbd_co_invalidate_cache,
-#endif
 
     .strong_runtime_opts    = qemu_rbd_strong_runtime_opts,
 };
diff --git a/configure b/configure
index c228f7c21e..e07f80cab4 100755
--- a/configure
+++ b/configure
@@ -3663,8 +3663,9 @@ if test "$rbd" != "no" ; then
 #include <stdio.h>
 #include <rbd/librbd.h>
 int main(void) {
-    rados_t cluster;
-    rados_create(&cluster, NULL);
+#if LIBRBD_VERSION_CODE < LIBRBD_VERSION(1, 12, 0)
+#error
+#endif
     return 0;
 }
 EOF
@@ -3673,7 +3674,7 @@ EOF
     rbd=yes
   else
     if test "$rbd" = "yes" ; then
-      feature_not_found "rados block device" "Install librbd/ceph devel"
+      feature_not_found "rados block device" "Install librbd/ceph devel (>= 12.0.0)"
     fi
     rbd=no
   fi
-- 
2.17.1




^ permalink raw reply related	[flat|nested] 28+ messages in thread

* [PATCH 2/7] block/rbd: store object_size in BDRVRBDState
  2020-12-27 16:42 [PATCH 0/7] block/rbd: migrate to coroutines and add write zeroes support Peter Lieven
  2020-12-27 16:42 ` [PATCH 1/7] block/rbd: bump librbd requirement to luminous release Peter Lieven
@ 2020-12-27 16:42 ` Peter Lieven
  2020-12-27 16:42 ` [PATCH 3/7] block/rbd: use stored image_size in qemu_rbd_getlength Peter Lieven
                   ` (4 subsequent siblings)
  6 siblings, 0 replies; 28+ messages in thread
From: Peter Lieven @ 2020-12-27 16:42 UTC (permalink / raw)
  To: qemu-block; +Cc: kwolf, Peter Lieven, qemu-devel, ct, mreitz, dillaman

Signed-off-by: Peter Lieven <pl@kamp.de>
---
 block/rbd.c | 18 +++++++-----------
 1 file changed, 7 insertions(+), 11 deletions(-)

diff --git a/block/rbd.c b/block/rbd.c
index 650e27c351..bc8cf8af9b 100644
--- a/block/rbd.c
+++ b/block/rbd.c
@@ -90,6 +90,7 @@ typedef struct BDRVRBDState {
     char *snap;
     char *namespace;
     uint64_t image_size;
+    uint64_t object_size;
 } BDRVRBDState;
 
 static int qemu_rbd_connect(rados_t *cluster, rados_ioctx_t *io_ctx,
@@ -663,6 +664,7 @@ static int qemu_rbd_open(BlockDriverState *bs, QDict *options, int flags,
     const QDictEntry *e;
     Error *local_err = NULL;
     char *keypairs, *secretid;
+    rbd_image_info_t info;
     int r;
 
     keypairs = g_strdup(qdict_get_try_str(options, "=keyvalue-pairs"));
@@ -727,13 +729,15 @@ static int qemu_rbd_open(BlockDriverState *bs, QDict *options, int flags,
         goto failed_open;
     }
 
-    r = rbd_get_size(s->image, &s->image_size);
+    r = rbd_stat(s->image, &info, sizeof(info));
     if (r < 0) {
-        error_setg_errno(errp, -r, "error getting image size from %s",
+        error_setg_errno(errp, -r, "error getting image info from %s",
                          s->image_name);
         rbd_close(s->image);
         goto failed_open;
     }
+    s->image_size = info.size;
+    s->object_size = info.obj_size;
 
     /* If we are using an rbd snapshot, we must be r/o, otherwise
      * leave as-is */
@@ -945,15 +949,7 @@ static BlockAIOCB *qemu_rbd_aio_flush(BlockDriverState *bs,
 static int qemu_rbd_getinfo(BlockDriverState *bs, BlockDriverInfo *bdi)
 {
     BDRVRBDState *s = bs->opaque;
-    rbd_image_info_t info;
-    int r;
-
-    r = rbd_stat(s->image, &info, sizeof(info));
-    if (r < 0) {
-        return r;
-    }
-
-    bdi->cluster_size = info.obj_size;
+    bdi->cluster_size = s->object_size;
     return 0;
 }
 
-- 
2.17.1




^ permalink raw reply related	[flat|nested] 28+ messages in thread

* [PATCH 3/7] block/rbd: use stored image_size in qemu_rbd_getlength
  2020-12-27 16:42 [PATCH 0/7] block/rbd: migrate to coroutines and add write zeroes support Peter Lieven
  2020-12-27 16:42 ` [PATCH 1/7] block/rbd: bump librbd requirement to luminous release Peter Lieven
  2020-12-27 16:42 ` [PATCH 2/7] block/rbd: store object_size in BDRVRBDState Peter Lieven
@ 2020-12-27 16:42 ` Peter Lieven
  2021-01-14 19:18   ` Jason Dillaman
  2020-12-27 16:42 ` [PATCH 4/7] block/rbd: add bdrv_{attach,detach}_aio_context Peter Lieven
                   ` (3 subsequent siblings)
  6 siblings, 1 reply; 28+ messages in thread
From: Peter Lieven @ 2020-12-27 16:42 UTC (permalink / raw)
  To: qemu-block; +Cc: kwolf, Peter Lieven, qemu-devel, ct, mreitz, dillaman

Signed-off-by: Peter Lieven <pl@kamp.de>
---
 block/rbd.c | 10 +---------
 1 file changed, 1 insertion(+), 9 deletions(-)

diff --git a/block/rbd.c b/block/rbd.c
index bc8cf8af9b..a2da70e37f 100644
--- a/block/rbd.c
+++ b/block/rbd.c
@@ -956,15 +956,7 @@ static int qemu_rbd_getinfo(BlockDriverState *bs, BlockDriverInfo *bdi)
 static int64_t qemu_rbd_getlength(BlockDriverState *bs)
 {
     BDRVRBDState *s = bs->opaque;
-    rbd_image_info_t info;
-    int r;
-
-    r = rbd_stat(s->image, &info, sizeof(info));
-    if (r < 0) {
-        return r;
-    }
-
-    return info.size;
+    return s->image_size;
 }
 
 static int coroutine_fn qemu_rbd_co_truncate(BlockDriverState *bs,
-- 
2.17.1




^ permalink raw reply related	[flat|nested] 28+ messages in thread

* [PATCH 4/7] block/rbd: add bdrv_{attach,detach}_aio_context
  2020-12-27 16:42 [PATCH 0/7] block/rbd: migrate to coroutines and add write zeroes support Peter Lieven
                   ` (2 preceding siblings ...)
  2020-12-27 16:42 ` [PATCH 3/7] block/rbd: use stored image_size in qemu_rbd_getlength Peter Lieven
@ 2020-12-27 16:42 ` Peter Lieven
  2021-01-14 19:18   ` Jason Dillaman
  2020-12-27 16:42 ` [PATCH 5/7] block/rbd: migrate from aio to coroutines Peter Lieven
                   ` (2 subsequent siblings)
  6 siblings, 1 reply; 28+ messages in thread
From: Peter Lieven @ 2020-12-27 16:42 UTC (permalink / raw)
  To: qemu-block; +Cc: kwolf, Peter Lieven, qemu-devel, ct, mreitz, dillaman

Signed-off-by: Peter Lieven <pl@kamp.de>
---
 block/rbd.c | 21 +++++++++++++++++++--
 1 file changed, 19 insertions(+), 2 deletions(-)

diff --git a/block/rbd.c b/block/rbd.c
index a2da70e37f..27b232f4d8 100644
--- a/block/rbd.c
+++ b/block/rbd.c
@@ -91,6 +91,7 @@ typedef struct BDRVRBDState {
     char *namespace;
     uint64_t image_size;
     uint64_t object_size;
+    AioContext *aio_context;
 } BDRVRBDState;
 
 static int qemu_rbd_connect(rados_t *cluster, rados_ioctx_t *io_ctx,
@@ -749,6 +750,8 @@ static int qemu_rbd_open(BlockDriverState *bs, QDict *options, int flags,
         }
     }
 
+    s->aio_context = bdrv_get_aio_context(bs);
+
     /* When extending regular files, we get zeros from the OS */
     bs->supported_truncate_flags = BDRV_REQ_ZERO_WRITE;
 
@@ -839,8 +842,7 @@ static void rbd_finish_aiocb(rbd_completion_t c, RADOSCB *rcb)
     rcb->ret = rbd_aio_get_return_value(c);
     rbd_aio_release(c);
 
-    replay_bh_schedule_oneshot_event(bdrv_get_aio_context(acb->common.bs),
-                                     rbd_finish_bh, rcb);
+    replay_bh_schedule_oneshot_event(acb->s->aio_context, rbd_finish_bh, rcb);
 }
 
 static BlockAIOCB *rbd_start_aio(BlockDriverState *bs,
@@ -1151,6 +1153,18 @@ static const char *const qemu_rbd_strong_runtime_opts[] = {
     NULL
 };
 
+static void qemu_rbd_attach_aio_context(BlockDriverState *bs,
+                                       AioContext *new_context)
+{
+    BDRVRBDState *s = bs->opaque;
+    s->aio_context = new_context;
+}
+
+static void qemu_rbd_detach_aio_context(BlockDriverState *bs)
+{
+
+}
+
 static BlockDriver bdrv_rbd = {
     .format_name            = "rbd",
     .instance_size          = sizeof(BDRVRBDState),
@@ -1180,6 +1194,9 @@ static BlockDriver bdrv_rbd = {
     .bdrv_snapshot_goto     = qemu_rbd_snap_rollback,
     .bdrv_co_invalidate_cache = qemu_rbd_co_invalidate_cache,
 
+    .bdrv_attach_aio_context  = qemu_rbd_attach_aio_context,
+    .bdrv_detach_aio_context  = qemu_rbd_detach_aio_context,
+
     .strong_runtime_opts    = qemu_rbd_strong_runtime_opts,
 };
 
-- 
2.17.1




^ permalink raw reply related	[flat|nested] 28+ messages in thread

* [PATCH 5/7] block/rbd: migrate from aio to coroutines
  2020-12-27 16:42 [PATCH 0/7] block/rbd: migrate to coroutines and add write zeroes support Peter Lieven
                   ` (3 preceding siblings ...)
  2020-12-27 16:42 ` [PATCH 4/7] block/rbd: add bdrv_{attach,detach}_aio_context Peter Lieven
@ 2020-12-27 16:42 ` Peter Lieven
  2021-01-14 19:19   ` Jason Dillaman
  2020-12-27 16:42 ` [PATCH 6/7] block/rbd: add write zeroes support Peter Lieven
  2020-12-27 16:42 ` [PATCH 7/7] block/rbd: change request alignment to 1 byte Peter Lieven
  6 siblings, 1 reply; 28+ messages in thread
From: Peter Lieven @ 2020-12-27 16:42 UTC (permalink / raw)
  To: qemu-block; +Cc: kwolf, Peter Lieven, qemu-devel, ct, mreitz, dillaman

Signed-off-by: Peter Lieven <pl@kamp.de>
---
 block/rbd.c | 247 ++++++++++++++++++----------------------------------
 1 file changed, 84 insertions(+), 163 deletions(-)

diff --git a/block/rbd.c b/block/rbd.c
index 27b232f4d8..2d77d0007f 100644
--- a/block/rbd.c
+++ b/block/rbd.c
@@ -66,22 +66,6 @@ typedef enum {
     RBD_AIO_FLUSH
 } RBDAIOCmd;
 
-typedef struct RBDAIOCB {
-    BlockAIOCB common;
-    int64_t ret;
-    QEMUIOVector *qiov;
-    RBDAIOCmd cmd;
-    int error;
-    struct BDRVRBDState *s;
-} RBDAIOCB;
-
-typedef struct RADOSCB {
-    RBDAIOCB *acb;
-    struct BDRVRBDState *s;
-    int64_t size;
-    int64_t ret;
-} RADOSCB;
-
 typedef struct BDRVRBDState {
     rados_t cluster;
     rados_ioctx_t io_ctx;
@@ -94,6 +78,13 @@ typedef struct BDRVRBDState {
     AioContext *aio_context;
 } BDRVRBDState;
 
+typedef struct RBDTask {
+    BDRVRBDState *s;
+    Coroutine *co;
+    bool complete;
+    int64_t ret;
+} RBDTask;
+
 static int qemu_rbd_connect(rados_t *cluster, rados_ioctx_t *io_ctx,
                             BlockdevOptionsRbd *opts, bool cache,
                             const char *keypairs, const char *secretid,
@@ -316,13 +307,6 @@ static int qemu_rbd_set_keypairs(rados_t cluster, const char *keypairs_json,
     return ret;
 }
 
-static void qemu_rbd_memset(RADOSCB *rcb, int64_t offs)
-{
-    RBDAIOCB *acb = rcb->acb;
-    iov_memset(acb->qiov->iov, acb->qiov->niov, offs, 0,
-               acb->qiov->size - offs);
-}
-
 /* FIXME Deprecate and remove keypairs or make it available in QMP. */
 static int qemu_rbd_do_create(BlockdevCreateOptions *options,
                               const char *keypairs, const char *password_secret,
@@ -440,46 +424,6 @@ exit:
     return ret;
 }
 
-/*
- * This aio completion is being called from rbd_finish_bh() and runs in qemu
- * BH context.
- */
-static void qemu_rbd_complete_aio(RADOSCB *rcb)
-{
-    RBDAIOCB *acb = rcb->acb;
-    int64_t r;
-
-    r = rcb->ret;
-
-    if (acb->cmd != RBD_AIO_READ) {
-        if (r < 0) {
-            acb->ret = r;
-            acb->error = 1;
-        } else if (!acb->error) {
-            acb->ret = rcb->size;
-        }
-    } else {
-        if (r < 0) {
-            qemu_rbd_memset(rcb, 0);
-            acb->ret = r;
-            acb->error = 1;
-        } else if (r < rcb->size) {
-            qemu_rbd_memset(rcb, r);
-            if (!acb->error) {
-                acb->ret = rcb->size;
-            }
-        } else if (!acb->error) {
-            acb->ret = r;
-        }
-    }
-
-    g_free(rcb);
-
-    acb->common.cb(acb->common.opaque, (acb->ret > 0 ? 0 : acb->ret));
-
-    qemu_aio_unref(acb);
-}
-
 static char *qemu_rbd_mon_host(BlockdevOptionsRbd *opts, Error **errp)
 {
     const char **vals;
@@ -817,88 +761,49 @@ static int qemu_rbd_resize(BlockDriverState *bs, uint64_t size)
     return 0;
 }
 
-static const AIOCBInfo rbd_aiocb_info = {
-    .aiocb_size = sizeof(RBDAIOCB),
-};
-
-static void rbd_finish_bh(void *opaque)
+static void qemu_rbd_finish_bh(void *opaque)
 {
-    RADOSCB *rcb = opaque;
-    qemu_rbd_complete_aio(rcb);
+    RBDTask *task = opaque;
+    task->complete = 1;
+    aio_co_wake(task->co);
 }
 
-/*
- * This is the callback function for rbd_aio_read and _write
- *
- * Note: this function is being called from a non qemu thread so
- * we need to be careful about what we do here. Generally we only
- * schedule a BH, and do the rest of the io completion handling
- * from rbd_finish_bh() which runs in a qemu context.
- */
-static void rbd_finish_aiocb(rbd_completion_t c, RADOSCB *rcb)
+static void qemu_rbd_completion_cb(rbd_completion_t c, RBDTask *task)
 {
-    RBDAIOCB *acb = rcb->acb;
-
-    rcb->ret = rbd_aio_get_return_value(c);
+    task->ret = rbd_aio_get_return_value(c);
     rbd_aio_release(c);
-
-    replay_bh_schedule_oneshot_event(acb->s->aio_context, rbd_finish_bh, rcb);
+    aio_bh_schedule_oneshot(task->s->aio_context, qemu_rbd_finish_bh, task);
 }
 
-static BlockAIOCB *rbd_start_aio(BlockDriverState *bs,
-                                 int64_t off,
-                                 QEMUIOVector *qiov,
-                                 int64_t size,
-                                 BlockCompletionFunc *cb,
-                                 void *opaque,
-                                 RBDAIOCmd cmd)
+static int coroutine_fn qemu_rbd_start_co(BlockDriverState *bs,
+                                          uint64_t offset,
+                                          uint64_t bytes,
+                                          QEMUIOVector *qiov,
+                                          int flags,
+                                          RBDAIOCmd cmd)
 {
-    RBDAIOCB *acb;
-    RADOSCB *rcb = NULL;
-    rbd_completion_t c;
-    int r;
-
     BDRVRBDState *s = bs->opaque;
+    RBDTask task = { .s = s, .co = qemu_coroutine_self() };
+    rbd_completion_t c;
+    int r, ret = -EIO;
 
-    acb = qemu_aio_get(&rbd_aiocb_info, bs, cb, opaque);
-    acb->cmd = cmd;
-    acb->qiov = qiov;
-    assert(!qiov || qiov->size == size);
-
-    rcb = g_new(RADOSCB, 1);
-
-    acb->ret = 0;
-    acb->error = 0;
-    acb->s = s;
+    assert(!qiov || qiov->size == bytes);
 
-    rcb->acb = acb;
-    rcb->s = acb->s;
-    rcb->size = size;
-    r = rbd_aio_create_completion(rcb, (rbd_callback_t) rbd_finish_aiocb, &c);
+    r = rbd_aio_create_completion(&task,
+                                  (rbd_callback_t) qemu_rbd_completion_cb, &c);
     if (r < 0) {
         goto failed;
     }
 
     switch (cmd) {
-    case RBD_AIO_WRITE:
-        /*
-         * RBD APIs don't allow us to write more than actual size, so in order
-         * to support growing images, we resize the image before write
-         * operations that exceed the current size.
-         */
-        if (off + size > s->image_size) {
-            r = qemu_rbd_resize(bs, off + size);
-            if (r < 0) {
-                goto failed_completion;
-            }
-        }
-        r = rbd_aio_writev(s->image, qiov->iov, qiov->niov, off, c);
-        break;
     case RBD_AIO_READ:
-        r = rbd_aio_readv(s->image, qiov->iov, qiov->niov, off, c);
+        r = rbd_aio_readv(s->image, qiov->iov, qiov->niov, offset, c);
+        break;
+    case RBD_AIO_WRITE:
+        r = rbd_aio_writev(s->image, qiov->iov, qiov->niov, offset, c);
         break;
     case RBD_AIO_DISCARD:
-        r = rbd_aio_discard(s->image, off, size, c);
+        r = rbd_aio_discard(s->image, offset, bytes, c);
         break;
     case RBD_AIO_FLUSH:
         r = rbd_aio_flush(s->image, c);
@@ -908,44 +813,71 @@ static BlockAIOCB *rbd_start_aio(BlockDriverState *bs,
     }
 
     if (r < 0) {
+        task.complete = 1;
+        task.ret = r;
+    }
+
+    while (!task.complete) {
+        qemu_coroutine_yield();
+    }
+
+    if (task.ret < 0) {
+        error_report("rbd request failed: cmd %d offset %" PRIu64 " bytes %"
+                     PRIu64 " flags %d task.ret %" PRIi64 " (%s)", cmd, offset,
+                     bytes, flags, task.ret, strerror(-task.ret));
         goto failed_completion;
     }
-    return &acb->common;
+
+    /* zero pad short reads */
+    if (cmd == RBD_AIO_READ && task.ret < qiov->size) {
+        qemu_iovec_memset(qiov, task.ret, 0, qiov->size - task.ret);
+    }
+
+    return 0;
 
 failed_completion:
     rbd_aio_release(c);
 failed:
-    g_free(rcb);
+    return ret;
+}
 
-    qemu_aio_unref(acb);
-    return NULL;
+static int
+coroutine_fn qemu_rbd_co_preadv(BlockDriverState *bs, uint64_t offset,
+                               uint64_t bytes, QEMUIOVector *qiov,
+                               int flags)
+{
+    return qemu_rbd_start_co(bs, offset, bytes, qiov, flags, RBD_AIO_READ);
 }
 
-static BlockAIOCB *qemu_rbd_aio_preadv(BlockDriverState *bs,
-                                       uint64_t offset, uint64_t bytes,
-                                       QEMUIOVector *qiov, int flags,
-                                       BlockCompletionFunc *cb,
-                                       void *opaque)
+static int
+coroutine_fn qemu_rbd_co_pwritev(BlockDriverState *bs, uint64_t offset,
+                                 uint64_t bytes, QEMUIOVector *qiov,
+                                 int flags)
 {
-    return rbd_start_aio(bs, offset, qiov, bytes, cb, opaque,
-                         RBD_AIO_READ);
+    BDRVRBDState *s = bs->opaque;
+    /*
+     * RBD APIs don't allow us to write more than actual size, so in order
+     * to support growing images, we resize the image before write
+     * operations that exceed the current size.
+     */
+    if (offset + bytes > s->image_size) {
+        int r = qemu_rbd_resize(bs, offset + bytes);
+        if (r < 0) {
+            return r;
+        }
+    }
+    return qemu_rbd_start_co(bs, offset, bytes, qiov, flags, RBD_AIO_WRITE);
 }
 
-static BlockAIOCB *qemu_rbd_aio_pwritev(BlockDriverState *bs,
-                                        uint64_t offset, uint64_t bytes,
-                                        QEMUIOVector *qiov, int flags,
-                                        BlockCompletionFunc *cb,
-                                        void *opaque)
+static int coroutine_fn qemu_rbd_co_flush(BlockDriverState *bs)
 {
-    return rbd_start_aio(bs, offset, qiov, bytes, cb, opaque,
-                         RBD_AIO_WRITE);
+    return qemu_rbd_start_co(bs, 0, 0, NULL, 0, RBD_AIO_FLUSH);
 }
 
-static BlockAIOCB *qemu_rbd_aio_flush(BlockDriverState *bs,
-                                      BlockCompletionFunc *cb,
-                                      void *opaque)
+static int coroutine_fn qemu_rbd_co_pdiscard(BlockDriverState *bs,
+                                             int64_t offset, int count)
 {
-    return rbd_start_aio(bs, 0, NULL, 0, cb, opaque, RBD_AIO_FLUSH);
+    return qemu_rbd_start_co(bs, offset, count, NULL, 0, RBD_AIO_DISCARD);
 }
 
 static int qemu_rbd_getinfo(BlockDriverState *bs, BlockDriverInfo *bdi)
@@ -1097,16 +1029,6 @@ static int qemu_rbd_snap_list(BlockDriverState *bs,
     return snap_count;
 }
 
-static BlockAIOCB *qemu_rbd_aio_pdiscard(BlockDriverState *bs,
-                                         int64_t offset,
-                                         int bytes,
-                                         BlockCompletionFunc *cb,
-                                         void *opaque)
-{
-    return rbd_start_aio(bs, offset, NULL, bytes, cb, opaque,
-                         RBD_AIO_DISCARD);
-}
-
 static void coroutine_fn qemu_rbd_co_invalidate_cache(BlockDriverState *bs,
                                                       Error **errp)
 {
@@ -1182,11 +1104,10 @@ static BlockDriver bdrv_rbd = {
     .bdrv_co_truncate       = qemu_rbd_co_truncate,
     .protocol_name          = "rbd",
 
-    .bdrv_aio_preadv        = qemu_rbd_aio_preadv,
-    .bdrv_aio_pwritev       = qemu_rbd_aio_pwritev,
-
-    .bdrv_aio_flush         = qemu_rbd_aio_flush,
-    .bdrv_aio_pdiscard      = qemu_rbd_aio_pdiscard,
+    .bdrv_co_preadv         = qemu_rbd_co_preadv,
+    .bdrv_co_pwritev        = qemu_rbd_co_pwritev,
+    .bdrv_co_flush_to_disk  = qemu_rbd_co_flush,
+    .bdrv_co_pdiscard       = qemu_rbd_co_pdiscard,
 
     .bdrv_snapshot_create   = qemu_rbd_snap_create,
     .bdrv_snapshot_delete   = qemu_rbd_snap_remove,
-- 
2.17.1




^ permalink raw reply related	[flat|nested] 28+ messages in thread

* [PATCH 6/7] block/rbd: add write zeroes support
  2020-12-27 16:42 [PATCH 0/7] block/rbd: migrate to coroutines and add write zeroes support Peter Lieven
                   ` (4 preceding siblings ...)
  2020-12-27 16:42 ` [PATCH 5/7] block/rbd: migrate from aio to coroutines Peter Lieven
@ 2020-12-27 16:42 ` Peter Lieven
  2021-01-14 19:19   ` Jason Dillaman
  2020-12-27 16:42 ` [PATCH 7/7] block/rbd: change request alignment to 1 byte Peter Lieven
  6 siblings, 1 reply; 28+ messages in thread
From: Peter Lieven @ 2020-12-27 16:42 UTC (permalink / raw)
  To: qemu-block; +Cc: kwolf, Peter Lieven, qemu-devel, ct, mreitz, dillaman

Signed-off-by: Peter Lieven <pl@kamp.de>
---
 block/rbd.c | 31 ++++++++++++++++++++++++++++++-
 1 file changed, 30 insertions(+), 1 deletion(-)

diff --git a/block/rbd.c b/block/rbd.c
index 2d77d0007f..27b4404adf 100644
--- a/block/rbd.c
+++ b/block/rbd.c
@@ -63,7 +63,8 @@ typedef enum {
     RBD_AIO_READ,
     RBD_AIO_WRITE,
     RBD_AIO_DISCARD,
-    RBD_AIO_FLUSH
+    RBD_AIO_FLUSH,
+    RBD_AIO_WRITE_ZEROES
 } RBDAIOCmd;
 
 typedef struct BDRVRBDState {
@@ -221,8 +222,12 @@ done:
 
 static void qemu_rbd_refresh_limits(BlockDriverState *bs, Error **errp)
 {
+    BDRVRBDState *s = bs->opaque;
     /* XXX Does RBD support AIO on less than 512-byte alignment? */
     bs->bl.request_alignment = 512;
+#ifdef LIBRBD_SUPPORTS_WRITE_ZEROES
+    bs->bl.pwrite_zeroes_alignment = s->object_size;
+#endif
 }
 
 
@@ -695,6 +700,9 @@ static int qemu_rbd_open(BlockDriverState *bs, QDict *options, int flags,
     }
 
     s->aio_context = bdrv_get_aio_context(bs);
+#ifdef LIBRBD_SUPPORTS_WRITE_ZEROES
+    bs->supported_zero_flags = BDRV_REQ_MAY_UNMAP;
+#endif
 
     /* When extending regular files, we get zeros from the OS */
     bs->supported_truncate_flags = BDRV_REQ_ZERO_WRITE;
@@ -808,6 +816,11 @@ static int coroutine_fn qemu_rbd_start_co(BlockDriverState *bs,
     case RBD_AIO_FLUSH:
         r = rbd_aio_flush(s->image, c);
         break;
+#ifdef LIBRBD_SUPPORTS_WRITE_ZEROES
+    case RBD_AIO_WRITE_ZEROES:
+        r = rbd_aio_write_zeroes(s->image, offset, bytes, c, 0, 0);
+        break;
+#endif
     default:
         r = -EINVAL;
     }
@@ -880,6 +893,19 @@ static int coroutine_fn qemu_rbd_co_pdiscard(BlockDriverState *bs,
     return qemu_rbd_start_co(bs, offset, count, NULL, 0, RBD_AIO_DISCARD);
 }
 
+#ifdef LIBRBD_SUPPORTS_WRITE_ZEROES
+static int
+coroutine_fn qemu_rbd_co_pwrite_zeroes(BlockDriverState *bs, int64_t offset,
+                                      int count, BdrvRequestFlags flags)
+{
+    if (!(flags & BDRV_REQ_MAY_UNMAP)) {
+        return -ENOTSUP;
+    }
+    return qemu_rbd_start_co(bs, offset, count, NULL, flags,
+                             RBD_AIO_WRITE_ZEROES);
+}
+#endif
+
 static int qemu_rbd_getinfo(BlockDriverState *bs, BlockDriverInfo *bdi)
 {
     BDRVRBDState *s = bs->opaque;
@@ -1108,6 +1134,9 @@ static BlockDriver bdrv_rbd = {
     .bdrv_co_pwritev        = qemu_rbd_co_pwritev,
     .bdrv_co_flush_to_disk  = qemu_rbd_co_flush,
     .bdrv_co_pdiscard       = qemu_rbd_co_pdiscard,
+#ifdef LIBRBD_SUPPORTS_WRITE_ZEROES
+    .bdrv_co_pwrite_zeroes  = qemu_rbd_co_pwrite_zeroes,
+#endif
 
     .bdrv_snapshot_create   = qemu_rbd_snap_create,
     .bdrv_snapshot_delete   = qemu_rbd_snap_remove,
-- 
2.17.1




^ permalink raw reply related	[flat|nested] 28+ messages in thread

* [PATCH 7/7] block/rbd: change request alignment to 1 byte
  2020-12-27 16:42 [PATCH 0/7] block/rbd: migrate to coroutines and add write zeroes support Peter Lieven
                   ` (5 preceding siblings ...)
  2020-12-27 16:42 ` [PATCH 6/7] block/rbd: add write zeroes support Peter Lieven
@ 2020-12-27 16:42 ` Peter Lieven
  2021-01-14 19:19   ` Jason Dillaman
  6 siblings, 1 reply; 28+ messages in thread
From: Peter Lieven @ 2020-12-27 16:42 UTC (permalink / raw)
  To: qemu-block; +Cc: kwolf, Peter Lieven, qemu-devel, ct, mreitz, dillaman

since we implement byte interfaces and librbd supports aio on byte granularity we can lift
the 512 byte alignment.

Signed-off-by: Peter Lieven <pl@kamp.de>
---
 block/rbd.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/block/rbd.c b/block/rbd.c
index 27b4404adf..8673e8f553 100644
--- a/block/rbd.c
+++ b/block/rbd.c
@@ -223,8 +223,6 @@ done:
 static void qemu_rbd_refresh_limits(BlockDriverState *bs, Error **errp)
 {
     BDRVRBDState *s = bs->opaque;
-    /* XXX Does RBD support AIO on less than 512-byte alignment? */
-    bs->bl.request_alignment = 512;
 #ifdef LIBRBD_SUPPORTS_WRITE_ZEROES
     bs->bl.pwrite_zeroes_alignment = s->object_size;
 #endif
-- 
2.17.1




^ permalink raw reply related	[flat|nested] 28+ messages in thread

* Re: [PATCH 4/7] block/rbd: add bdrv_{attach,detach}_aio_context
  2020-12-27 16:42 ` [PATCH 4/7] block/rbd: add bdrv_{attach,detach}_aio_context Peter Lieven
@ 2021-01-14 19:18   ` Jason Dillaman
  2021-01-14 19:49     ` Peter Lieven
  0 siblings, 1 reply; 28+ messages in thread
From: Jason Dillaman @ 2021-01-14 19:18 UTC (permalink / raw)
  To: Peter Lieven
  Cc: Kevin Wolf, Christian Theune, qemu-devel, qemu-block, Max Reitz

On Sun, Dec 27, 2020 at 11:42 AM Peter Lieven <pl@kamp.de> wrote:
>
> Signed-off-by: Peter Lieven <pl@kamp.de>
> ---
>  block/rbd.c | 21 +++++++++++++++++++--
>  1 file changed, 19 insertions(+), 2 deletions(-)
>
> diff --git a/block/rbd.c b/block/rbd.c
> index a2da70e37f..27b232f4d8 100644
> --- a/block/rbd.c
> +++ b/block/rbd.c
> @@ -91,6 +91,7 @@ typedef struct BDRVRBDState {
>      char *namespace;
>      uint64_t image_size;
>      uint64_t object_size;
> +    AioContext *aio_context;
>  } BDRVRBDState;
>
>  static int qemu_rbd_connect(rados_t *cluster, rados_ioctx_t *io_ctx,
> @@ -749,6 +750,8 @@ static int qemu_rbd_open(BlockDriverState *bs, QDict *options, int flags,
>          }
>      }
>
> +    s->aio_context = bdrv_get_aio_context(bs);
> +
>      /* When extending regular files, we get zeros from the OS */
>      bs->supported_truncate_flags = BDRV_REQ_ZERO_WRITE;
>
> @@ -839,8 +842,7 @@ static void rbd_finish_aiocb(rbd_completion_t c, RADOSCB *rcb)
>      rcb->ret = rbd_aio_get_return_value(c);
>      rbd_aio_release(c);
>
> -    replay_bh_schedule_oneshot_event(bdrv_get_aio_context(acb->common.bs),
> -                                     rbd_finish_bh, rcb);
> +    replay_bh_schedule_oneshot_event(acb->s->aio_context, rbd_finish_bh, rcb);
>  }
>
>  static BlockAIOCB *rbd_start_aio(BlockDriverState *bs,
> @@ -1151,6 +1153,18 @@ static const char *const qemu_rbd_strong_runtime_opts[] = {
>      NULL
>  };
>
> +static void qemu_rbd_attach_aio_context(BlockDriverState *bs,
> +                                       AioContext *new_context)
> +{
> +    BDRVRBDState *s = bs->opaque;
> +    s->aio_context = new_context;
> +}
> +
> +static void qemu_rbd_detach_aio_context(BlockDriverState *bs)
> +{

I don't know enough about the internals of QEMU, but this seems
suspicious to be a no-op.


> +}
> +
>  static BlockDriver bdrv_rbd = {
>      .format_name            = "rbd",
>      .instance_size          = sizeof(BDRVRBDState),
> @@ -1180,6 +1194,9 @@ static BlockDriver bdrv_rbd = {
>      .bdrv_snapshot_goto     = qemu_rbd_snap_rollback,
>      .bdrv_co_invalidate_cache = qemu_rbd_co_invalidate_cache,
>
> +    .bdrv_attach_aio_context  = qemu_rbd_attach_aio_context,
> +    .bdrv_detach_aio_context  = qemu_rbd_detach_aio_context,
> +
>      .strong_runtime_opts    = qemu_rbd_strong_runtime_opts,
>  };
>
> --
> 2.17.1
>
>


--
Jason



^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 3/7] block/rbd: use stored image_size in qemu_rbd_getlength
  2020-12-27 16:42 ` [PATCH 3/7] block/rbd: use stored image_size in qemu_rbd_getlength Peter Lieven
@ 2021-01-14 19:18   ` Jason Dillaman
  2021-01-14 19:32     ` Peter Lieven
  0 siblings, 1 reply; 28+ messages in thread
From: Jason Dillaman @ 2021-01-14 19:18 UTC (permalink / raw)
  To: Peter Lieven
  Cc: Kevin Wolf, Christian Theune, qemu-devel, qemu-block, Max Reitz

On Sun, Dec 27, 2020 at 11:42 AM Peter Lieven <pl@kamp.de> wrote:
>
> Signed-off-by: Peter Lieven <pl@kamp.de>
> ---
>  block/rbd.c | 10 +---------
>  1 file changed, 1 insertion(+), 9 deletions(-)
>
> diff --git a/block/rbd.c b/block/rbd.c
> index bc8cf8af9b..a2da70e37f 100644
> --- a/block/rbd.c
> +++ b/block/rbd.c
> @@ -956,15 +956,7 @@ static int qemu_rbd_getinfo(BlockDriverState *bs, BlockDriverInfo *bdi)
>  static int64_t qemu_rbd_getlength(BlockDriverState *bs)
>  {
>      BDRVRBDState *s = bs->opaque;
> -    rbd_image_info_t info;
> -    int r;
> -
> -    r = rbd_stat(s->image, &info, sizeof(info));
> -    if (r < 0) {
> -        return r;
> -    }
> -
> -    return info.size;
> +    return s->image_size;
>  }
>
>  static int coroutine_fn qemu_rbd_co_truncate(BlockDriverState *bs,
> --
> 2.17.1

An RBD image can technically change size dynamically while in-use. The
original code would provide the most up-to-date length but this
version will always return the size of the image when it was opened.


--
Jason



^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 5/7] block/rbd: migrate from aio to coroutines
  2020-12-27 16:42 ` [PATCH 5/7] block/rbd: migrate from aio to coroutines Peter Lieven
@ 2021-01-14 19:19   ` Jason Dillaman
  2021-01-14 19:39     ` Peter Lieven
  0 siblings, 1 reply; 28+ messages in thread
From: Jason Dillaman @ 2021-01-14 19:19 UTC (permalink / raw)
  To: Peter Lieven
  Cc: Kevin Wolf, Christian Theune, qemu-devel, qemu-block, Max Reitz

On Sun, Dec 27, 2020 at 11:42 AM Peter Lieven <pl@kamp.de> wrote:
>
> Signed-off-by: Peter Lieven <pl@kamp.de>
> ---
>  block/rbd.c | 247 ++++++++++++++++++----------------------------------
>  1 file changed, 84 insertions(+), 163 deletions(-)
>
> diff --git a/block/rbd.c b/block/rbd.c
> index 27b232f4d8..2d77d0007f 100644
> --- a/block/rbd.c
> +++ b/block/rbd.c
> @@ -66,22 +66,6 @@ typedef enum {
>      RBD_AIO_FLUSH
>  } RBDAIOCmd;
>
> -typedef struct RBDAIOCB {
> -    BlockAIOCB common;
> -    int64_t ret;
> -    QEMUIOVector *qiov;
> -    RBDAIOCmd cmd;
> -    int error;
> -    struct BDRVRBDState *s;
> -} RBDAIOCB;
> -
> -typedef struct RADOSCB {
> -    RBDAIOCB *acb;
> -    struct BDRVRBDState *s;
> -    int64_t size;
> -    int64_t ret;
> -} RADOSCB;
> -
>  typedef struct BDRVRBDState {
>      rados_t cluster;
>      rados_ioctx_t io_ctx;
> @@ -94,6 +78,13 @@ typedef struct BDRVRBDState {
>      AioContext *aio_context;
>  } BDRVRBDState;
>
> +typedef struct RBDTask {
> +    BDRVRBDState *s;
> +    Coroutine *co;
> +    bool complete;
> +    int64_t ret;
> +} RBDTask;
> +
>  static int qemu_rbd_connect(rados_t *cluster, rados_ioctx_t *io_ctx,
>                              BlockdevOptionsRbd *opts, bool cache,
>                              const char *keypairs, const char *secretid,
> @@ -316,13 +307,6 @@ static int qemu_rbd_set_keypairs(rados_t cluster, const char *keypairs_json,
>      return ret;
>  }
>
> -static void qemu_rbd_memset(RADOSCB *rcb, int64_t offs)
> -{
> -    RBDAIOCB *acb = rcb->acb;
> -    iov_memset(acb->qiov->iov, acb->qiov->niov, offs, 0,
> -               acb->qiov->size - offs);
> -}
> -
>  /* FIXME Deprecate and remove keypairs or make it available in QMP. */
>  static int qemu_rbd_do_create(BlockdevCreateOptions *options,
>                                const char *keypairs, const char *password_secret,
> @@ -440,46 +424,6 @@ exit:
>      return ret;
>  }
>
> -/*
> - * This aio completion is being called from rbd_finish_bh() and runs in qemu
> - * BH context.
> - */
> -static void qemu_rbd_complete_aio(RADOSCB *rcb)
> -{
> -    RBDAIOCB *acb = rcb->acb;
> -    int64_t r;
> -
> -    r = rcb->ret;
> -
> -    if (acb->cmd != RBD_AIO_READ) {
> -        if (r < 0) {
> -            acb->ret = r;
> -            acb->error = 1;
> -        } else if (!acb->error) {
> -            acb->ret = rcb->size;
> -        }
> -    } else {
> -        if (r < 0) {
> -            qemu_rbd_memset(rcb, 0);
> -            acb->ret = r;
> -            acb->error = 1;
> -        } else if (r < rcb->size) {
> -            qemu_rbd_memset(rcb, r);
> -            if (!acb->error) {
> -                acb->ret = rcb->size;
> -            }
> -        } else if (!acb->error) {
> -            acb->ret = r;
> -        }
> -    }
> -
> -    g_free(rcb);
> -
> -    acb->common.cb(acb->common.opaque, (acb->ret > 0 ? 0 : acb->ret));
> -
> -    qemu_aio_unref(acb);
> -}
> -
>  static char *qemu_rbd_mon_host(BlockdevOptionsRbd *opts, Error **errp)
>  {
>      const char **vals;
> @@ -817,88 +761,49 @@ static int qemu_rbd_resize(BlockDriverState *bs, uint64_t size)
>      return 0;
>  }
>
> -static const AIOCBInfo rbd_aiocb_info = {
> -    .aiocb_size = sizeof(RBDAIOCB),
> -};
> -
> -static void rbd_finish_bh(void *opaque)
> +static void qemu_rbd_finish_bh(void *opaque)
>  {
> -    RADOSCB *rcb = opaque;
> -    qemu_rbd_complete_aio(rcb);
> +    RBDTask *task = opaque;
> +    task->complete = 1;
> +    aio_co_wake(task->co);
>  }
>
> -/*
> - * This is the callback function for rbd_aio_read and _write
> - *
> - * Note: this function is being called from a non qemu thread so
> - * we need to be careful about what we do here. Generally we only
> - * schedule a BH, and do the rest of the io completion handling
> - * from rbd_finish_bh() which runs in a qemu context.
> - */
> -static void rbd_finish_aiocb(rbd_completion_t c, RADOSCB *rcb)
> +static void qemu_rbd_completion_cb(rbd_completion_t c, RBDTask *task)
>  {
> -    RBDAIOCB *acb = rcb->acb;
> -
> -    rcb->ret = rbd_aio_get_return_value(c);
> +    task->ret = rbd_aio_get_return_value(c);
>      rbd_aio_release(c);
> -
> -    replay_bh_schedule_oneshot_event(acb->s->aio_context, rbd_finish_bh, rcb);
> +    aio_bh_schedule_oneshot(task->s->aio_context, qemu_rbd_finish_bh, task);
>  }
>
> -static BlockAIOCB *rbd_start_aio(BlockDriverState *bs,
> -                                 int64_t off,
> -                                 QEMUIOVector *qiov,
> -                                 int64_t size,
> -                                 BlockCompletionFunc *cb,
> -                                 void *opaque,
> -                                 RBDAIOCmd cmd)
> +static int coroutine_fn qemu_rbd_start_co(BlockDriverState *bs,
> +                                          uint64_t offset,
> +                                          uint64_t bytes,
> +                                          QEMUIOVector *qiov,
> +                                          int flags,
> +                                          RBDAIOCmd cmd)
>  {
> -    RBDAIOCB *acb;
> -    RADOSCB *rcb = NULL;
> -    rbd_completion_t c;
> -    int r;
> -
>      BDRVRBDState *s = bs->opaque;
> +    RBDTask task = { .s = s, .co = qemu_coroutine_self() };
> +    rbd_completion_t c;
> +    int r, ret = -EIO;
>
> -    acb = qemu_aio_get(&rbd_aiocb_info, bs, cb, opaque);
> -    acb->cmd = cmd;
> -    acb->qiov = qiov;
> -    assert(!qiov || qiov->size == size);
> -
> -    rcb = g_new(RADOSCB, 1);
> -
> -    acb->ret = 0;
> -    acb->error = 0;
> -    acb->s = s;
> +    assert(!qiov || qiov->size == bytes);
>
> -    rcb->acb = acb;
> -    rcb->s = acb->s;
> -    rcb->size = size;
> -    r = rbd_aio_create_completion(rcb, (rbd_callback_t) rbd_finish_aiocb, &c);
> +    r = rbd_aio_create_completion(&task,
> +                                  (rbd_callback_t) qemu_rbd_completion_cb, &c);
>      if (r < 0) {
>          goto failed;
>      }
>
>      switch (cmd) {
> -    case RBD_AIO_WRITE:
> -        /*
> -         * RBD APIs don't allow us to write more than actual size, so in order
> -         * to support growing images, we resize the image before write
> -         * operations that exceed the current size.
> -         */
> -        if (off + size > s->image_size) {
> -            r = qemu_rbd_resize(bs, off + size);

I realize this is inherited code that is being refactored, but is it
really possible for QEMU to issue IOs outside the block extents? This
also has the same issue as the previous commit comment where the
"image_size" is only initialized at image open time and therefore this
path will keep getting hit if IOs are issued beyond the original block
device extents.

> -            if (r < 0) {
> -                goto failed_completion;
> -            }
> -        }
> -        r = rbd_aio_writev(s->image, qiov->iov, qiov->niov, off, c);
> -        break;
>      case RBD_AIO_READ:
> -        r = rbd_aio_readv(s->image, qiov->iov, qiov->niov, off, c);
> +        r = rbd_aio_readv(s->image, qiov->iov, qiov->niov, offset, c);
> +        break;
> +    case RBD_AIO_WRITE:
> +        r = rbd_aio_writev(s->image, qiov->iov, qiov->niov, offset, c);
>          break;
>      case RBD_AIO_DISCARD:
> -        r = rbd_aio_discard(s->image, off, size, c);
> +        r = rbd_aio_discard(s->image, offset, bytes, c);
>          break;
>      case RBD_AIO_FLUSH:
>          r = rbd_aio_flush(s->image, c);
> @@ -908,44 +813,71 @@ static BlockAIOCB *rbd_start_aio(BlockDriverState *bs,
>      }
>
>      if (r < 0) {
> +        task.complete = 1;
> +        task.ret = r;
> +    }
> +
> +    while (!task.complete) {
> +        qemu_coroutine_yield();
> +    }
> +
> +    if (task.ret < 0) {
> +        error_report("rbd request failed: cmd %d offset %" PRIu64 " bytes %"
> +                     PRIu64 " flags %d task.ret %" PRIi64 " (%s)", cmd, offset,
> +                     bytes, flags, task.ret, strerror(-task.ret));
>          goto failed_completion;
>      }
> -    return &acb->common;
> +
> +    /* zero pad short reads */
> +    if (cmd == RBD_AIO_READ && task.ret < qiov->size) {
> +        qemu_iovec_memset(qiov, task.ret, 0, qiov->size - task.ret);
> +    }
> +
> +    return 0;
>
>  failed_completion:
>      rbd_aio_release(c);

Wasn't this already released in "qemu_rbd_completion_cb"?


>  failed:
> -    g_free(rcb);
> +    return ret;
> +}
>
> -    qemu_aio_unref(acb);
> -    return NULL;
> +static int
> +coroutine_fn qemu_rbd_co_preadv(BlockDriverState *bs, uint64_t offset,
> +                               uint64_t bytes, QEMUIOVector *qiov,
> +                               int flags)
> +{
> +    return qemu_rbd_start_co(bs, offset, bytes, qiov, flags, RBD_AIO_READ);
>  }
>
> -static BlockAIOCB *qemu_rbd_aio_preadv(BlockDriverState *bs,
> -                                       uint64_t offset, uint64_t bytes,
> -                                       QEMUIOVector *qiov, int flags,
> -                                       BlockCompletionFunc *cb,
> -                                       void *opaque)
> +static int
> +coroutine_fn qemu_rbd_co_pwritev(BlockDriverState *bs, uint64_t offset,
> +                                 uint64_t bytes, QEMUIOVector *qiov,
> +                                 int flags)
>  {
> -    return rbd_start_aio(bs, offset, qiov, bytes, cb, opaque,
> -                         RBD_AIO_READ);
> +    BDRVRBDState *s = bs->opaque;
> +    /*
> +     * RBD APIs don't allow us to write more than actual size, so in order
> +     * to support growing images, we resize the image before write
> +     * operations that exceed the current size.
> +     */
> +    if (offset + bytes > s->image_size) {
> +        int r = qemu_rbd_resize(bs, offset + bytes);
> +        if (r < 0) {
> +            return r;
> +        }
> +    }
> +    return qemu_rbd_start_co(bs, offset, bytes, qiov, flags, RBD_AIO_WRITE);
>  }
>
> -static BlockAIOCB *qemu_rbd_aio_pwritev(BlockDriverState *bs,
> -                                        uint64_t offset, uint64_t bytes,
> -                                        QEMUIOVector *qiov, int flags,
> -                                        BlockCompletionFunc *cb,
> -                                        void *opaque)
> +static int coroutine_fn qemu_rbd_co_flush(BlockDriverState *bs)
>  {
> -    return rbd_start_aio(bs, offset, qiov, bytes, cb, opaque,
> -                         RBD_AIO_WRITE);
> +    return qemu_rbd_start_co(bs, 0, 0, NULL, 0, RBD_AIO_FLUSH);
>  }
>
> -static BlockAIOCB *qemu_rbd_aio_flush(BlockDriverState *bs,
> -                                      BlockCompletionFunc *cb,
> -                                      void *opaque)
> +static int coroutine_fn qemu_rbd_co_pdiscard(BlockDriverState *bs,
> +                                             int64_t offset, int count)
>  {
> -    return rbd_start_aio(bs, 0, NULL, 0, cb, opaque, RBD_AIO_FLUSH);
> +    return qemu_rbd_start_co(bs, offset, count, NULL, 0, RBD_AIO_DISCARD);
>  }
>
>  static int qemu_rbd_getinfo(BlockDriverState *bs, BlockDriverInfo *bdi)
> @@ -1097,16 +1029,6 @@ static int qemu_rbd_snap_list(BlockDriverState *bs,
>      return snap_count;
>  }
>
> -static BlockAIOCB *qemu_rbd_aio_pdiscard(BlockDriverState *bs,
> -                                         int64_t offset,
> -                                         int bytes,
> -                                         BlockCompletionFunc *cb,
> -                                         void *opaque)
> -{
> -    return rbd_start_aio(bs, offset, NULL, bytes, cb, opaque,
> -                         RBD_AIO_DISCARD);
> -}
> -
>  static void coroutine_fn qemu_rbd_co_invalidate_cache(BlockDriverState *bs,
>                                                        Error **errp)
>  {
> @@ -1182,11 +1104,10 @@ static BlockDriver bdrv_rbd = {
>      .bdrv_co_truncate       = qemu_rbd_co_truncate,
>      .protocol_name          = "rbd",
>
> -    .bdrv_aio_preadv        = qemu_rbd_aio_preadv,
> -    .bdrv_aio_pwritev       = qemu_rbd_aio_pwritev,
> -
> -    .bdrv_aio_flush         = qemu_rbd_aio_flush,
> -    .bdrv_aio_pdiscard      = qemu_rbd_aio_pdiscard,
> +    .bdrv_co_preadv         = qemu_rbd_co_preadv,
> +    .bdrv_co_pwritev        = qemu_rbd_co_pwritev,
> +    .bdrv_co_flush_to_disk  = qemu_rbd_co_flush,
> +    .bdrv_co_pdiscard       = qemu_rbd_co_pdiscard,
>
>      .bdrv_snapshot_create   = qemu_rbd_snap_create,
>      .bdrv_snapshot_delete   = qemu_rbd_snap_remove,
> --
> 2.17.1
>
>


--
Jason



^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 6/7] block/rbd: add write zeroes support
  2020-12-27 16:42 ` [PATCH 6/7] block/rbd: add write zeroes support Peter Lieven
@ 2021-01-14 19:19   ` Jason Dillaman
  2021-01-14 19:41     ` Peter Lieven
  0 siblings, 1 reply; 28+ messages in thread
From: Jason Dillaman @ 2021-01-14 19:19 UTC (permalink / raw)
  To: Peter Lieven
  Cc: Kevin Wolf, Christian Theune, qemu-devel, qemu-block, Max Reitz

On Sun, Dec 27, 2020 at 11:42 AM Peter Lieven <pl@kamp.de> wrote:
>
> Signed-off-by: Peter Lieven <pl@kamp.de>
> ---
>  block/rbd.c | 31 ++++++++++++++++++++++++++++++-
>  1 file changed, 30 insertions(+), 1 deletion(-)
>
> diff --git a/block/rbd.c b/block/rbd.c
> index 2d77d0007f..27b4404adf 100644
> --- a/block/rbd.c
> +++ b/block/rbd.c
> @@ -63,7 +63,8 @@ typedef enum {
>      RBD_AIO_READ,
>      RBD_AIO_WRITE,
>      RBD_AIO_DISCARD,
> -    RBD_AIO_FLUSH
> +    RBD_AIO_FLUSH,
> +    RBD_AIO_WRITE_ZEROES
>  } RBDAIOCmd;
>
>  typedef struct BDRVRBDState {
> @@ -221,8 +222,12 @@ done:
>
>  static void qemu_rbd_refresh_limits(BlockDriverState *bs, Error **errp)
>  {
> +    BDRVRBDState *s = bs->opaque;
>      /* XXX Does RBD support AIO on less than 512-byte alignment? */
>      bs->bl.request_alignment = 512;
> +#ifdef LIBRBD_SUPPORTS_WRITE_ZEROES
> +    bs->bl.pwrite_zeroes_alignment = s->object_size;

The optimal alignment is 512 bytes -- but it can safely work just fine
down to 1 byte alignment since it will pad the request with additional
writes if needed.

> +#endif
>  }
>
>
> @@ -695,6 +700,9 @@ static int qemu_rbd_open(BlockDriverState *bs, QDict *options, int flags,
>      }
>
>      s->aio_context = bdrv_get_aio_context(bs);
> +#ifdef LIBRBD_SUPPORTS_WRITE_ZEROES
> +    bs->supported_zero_flags = BDRV_REQ_MAY_UNMAP;
> +#endif
>
>      /* When extending regular files, we get zeros from the OS */
>      bs->supported_truncate_flags = BDRV_REQ_ZERO_WRITE;
> @@ -808,6 +816,11 @@ static int coroutine_fn qemu_rbd_start_co(BlockDriverState *bs,
>      case RBD_AIO_FLUSH:
>          r = rbd_aio_flush(s->image, c);
>          break;
> +#ifdef LIBRBD_SUPPORTS_WRITE_ZEROES
> +    case RBD_AIO_WRITE_ZEROES:
> +        r = rbd_aio_write_zeroes(s->image, offset, bytes, c, 0, 0);
> +        break;
> +#endif
>      default:
>          r = -EINVAL;
>      }
> @@ -880,6 +893,19 @@ static int coroutine_fn qemu_rbd_co_pdiscard(BlockDriverState *bs,
>      return qemu_rbd_start_co(bs, offset, count, NULL, 0, RBD_AIO_DISCARD);
>  }
>
> +#ifdef LIBRBD_SUPPORTS_WRITE_ZEROES
> +static int
> +coroutine_fn qemu_rbd_co_pwrite_zeroes(BlockDriverState *bs, int64_t offset,
> +                                      int count, BdrvRequestFlags flags)
> +{
> +    if (!(flags & BDRV_REQ_MAY_UNMAP)) {
> +        return -ENOTSUP;
> +    }

There is a "RBD_WRITE_ZEROES_FLAG_THICK_PROVISION" flag that you can
use to optionally disable unmap.


> +    return qemu_rbd_start_co(bs, offset, count, NULL, flags,
> +                             RBD_AIO_WRITE_ZEROES);
> +}
> +#endif
> +
>  static int qemu_rbd_getinfo(BlockDriverState *bs, BlockDriverInfo *bdi)
>  {
>      BDRVRBDState *s = bs->opaque;
> @@ -1108,6 +1134,9 @@ static BlockDriver bdrv_rbd = {
>      .bdrv_co_pwritev        = qemu_rbd_co_pwritev,
>      .bdrv_co_flush_to_disk  = qemu_rbd_co_flush,
>      .bdrv_co_pdiscard       = qemu_rbd_co_pdiscard,
> +#ifdef LIBRBD_SUPPORTS_WRITE_ZEROES
> +    .bdrv_co_pwrite_zeroes  = qemu_rbd_co_pwrite_zeroes,
> +#endif
>
>      .bdrv_snapshot_create   = qemu_rbd_snap_create,
>      .bdrv_snapshot_delete   = qemu_rbd_snap_remove,
> --
> 2.17.1
>
>


--
Jason



^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 7/7] block/rbd: change request alignment to 1 byte
  2020-12-27 16:42 ` [PATCH 7/7] block/rbd: change request alignment to 1 byte Peter Lieven
@ 2021-01-14 19:19   ` Jason Dillaman
  2021-01-14 19:59     ` Peter Lieven
  0 siblings, 1 reply; 28+ messages in thread
From: Jason Dillaman @ 2021-01-14 19:19 UTC (permalink / raw)
  To: Peter Lieven
  Cc: Kevin Wolf, Christian Theune, qemu-devel, qemu-block, Max Reitz

On Sun, Dec 27, 2020 at 11:42 AM Peter Lieven <pl@kamp.de> wrote:
>
> since we implement byte interfaces and librbd supports aio on byte granularity we can lift
> the 512 byte alignment.
>
> Signed-off-by: Peter Lieven <pl@kamp.de>
> ---
>  block/rbd.c | 2 --
>  1 file changed, 2 deletions(-)
>
> diff --git a/block/rbd.c b/block/rbd.c
> index 27b4404adf..8673e8f553 100644
> --- a/block/rbd.c
> +++ b/block/rbd.c
> @@ -223,8 +223,6 @@ done:
>  static void qemu_rbd_refresh_limits(BlockDriverState *bs, Error **errp)
>  {
>      BDRVRBDState *s = bs->opaque;
> -    /* XXX Does RBD support AIO on less than 512-byte alignment? */
> -    bs->bl.request_alignment = 512;

Just a suggestion, but perhaps improve discard alignment, max discard,
optimal alignment (if that's something QEMU handles internally) if not
overridden by the user.


>  #ifdef LIBRBD_SUPPORTS_WRITE_ZEROES
>      bs->bl.pwrite_zeroes_alignment = s->object_size;
>  #endif
> --
> 2.17.1
>
>


--
Jason



^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 3/7] block/rbd: use stored image_size in qemu_rbd_getlength
  2021-01-14 19:18   ` Jason Dillaman
@ 2021-01-14 19:32     ` Peter Lieven
  0 siblings, 0 replies; 28+ messages in thread
From: Peter Lieven @ 2021-01-14 19:32 UTC (permalink / raw)
  To: dillaman; +Cc: Kevin Wolf, Christian Theune, qemu-devel, qemu-block, Max Reitz

Am 14.01.21 um 20:18 schrieb Jason Dillaman:
> On Sun, Dec 27, 2020 at 11:42 AM Peter Lieven <pl@kamp.de> wrote:
>> Signed-off-by: Peter Lieven <pl@kamp.de>
>> ---
>>  block/rbd.c | 10 +---------
>>  1 file changed, 1 insertion(+), 9 deletions(-)
>>
>> diff --git a/block/rbd.c b/block/rbd.c
>> index bc8cf8af9b..a2da70e37f 100644
>> --- a/block/rbd.c
>> +++ b/block/rbd.c
>> @@ -956,15 +956,7 @@ static int qemu_rbd_getinfo(BlockDriverState *bs, BlockDriverInfo *bdi)
>>  static int64_t qemu_rbd_getlength(BlockDriverState *bs)
>>  {
>>      BDRVRBDState *s = bs->opaque;
>> -    rbd_image_info_t info;
>> -    int r;
>> -
>> -    r = rbd_stat(s->image, &info, sizeof(info));
>> -    if (r < 0) {
>> -        return r;
>> -    }
>> -
>> -    return info.size;
>> +    return s->image_size;
>>  }
>>
>>  static int coroutine_fn qemu_rbd_co_truncate(BlockDriverState *bs,
>> --
>> 2.17.1
> An RBD image can technically change size dynamically while in-use. The
> original code would provide the most up-to-date length but this
> version will always return the size of the image when it was opened.


Agreed, but Qemu won't propagate this info to the guest unless Qemu truncate is called with length 0.

Anyway, if we want support this we should adjust s->image_size if the size has changed.


Peter





^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 5/7] block/rbd: migrate from aio to coroutines
  2021-01-14 19:19   ` Jason Dillaman
@ 2021-01-14 19:39     ` Peter Lieven
  0 siblings, 0 replies; 28+ messages in thread
From: Peter Lieven @ 2021-01-14 19:39 UTC (permalink / raw)
  To: dillaman; +Cc: Kevin Wolf, Christian Theune, qemu-devel, qemu-block, Max Reitz

Am 14.01.21 um 20:19 schrieb Jason Dillaman:
> On Sun, Dec 27, 2020 at 11:42 AM Peter Lieven <pl@kamp.de> wrote:
>> Signed-off-by: Peter Lieven <pl@kamp.de>
>> ---
>>  block/rbd.c | 247 ++++++++++++++++++----------------------------------
>>  1 file changed, 84 insertions(+), 163 deletions(-)
>>
>> diff --git a/block/rbd.c b/block/rbd.c
>> index 27b232f4d8..2d77d0007f 100644
>> --- a/block/rbd.c
>> +++ b/block/rbd.c
>> @@ -66,22 +66,6 @@ typedef enum {
>>      RBD_AIO_FLUSH
>>  } RBDAIOCmd;
>>
>> -typedef struct RBDAIOCB {
>> -    BlockAIOCB common;
>> -    int64_t ret;
>> -    QEMUIOVector *qiov;
>> -    RBDAIOCmd cmd;
>> -    int error;
>> -    struct BDRVRBDState *s;
>> -} RBDAIOCB;
>> -
>> -typedef struct RADOSCB {
>> -    RBDAIOCB *acb;
>> -    struct BDRVRBDState *s;
>> -    int64_t size;
>> -    int64_t ret;
>> -} RADOSCB;
>> -
>>  typedef struct BDRVRBDState {
>>      rados_t cluster;
>>      rados_ioctx_t io_ctx;
>> @@ -94,6 +78,13 @@ typedef struct BDRVRBDState {
>>      AioContext *aio_context;
>>  } BDRVRBDState;
>>
>> +typedef struct RBDTask {
>> +    BDRVRBDState *s;
>> +    Coroutine *co;
>> +    bool complete;
>> +    int64_t ret;
>> +} RBDTask;
>> +
>>  static int qemu_rbd_connect(rados_t *cluster, rados_ioctx_t *io_ctx,
>>                              BlockdevOptionsRbd *opts, bool cache,
>>                              const char *keypairs, const char *secretid,
>> @@ -316,13 +307,6 @@ static int qemu_rbd_set_keypairs(rados_t cluster, const char *keypairs_json,
>>      return ret;
>>  }
>>
>> -static void qemu_rbd_memset(RADOSCB *rcb, int64_t offs)
>> -{
>> -    RBDAIOCB *acb = rcb->acb;
>> -    iov_memset(acb->qiov->iov, acb->qiov->niov, offs, 0,
>> -               acb->qiov->size - offs);
>> -}
>> -
>>  /* FIXME Deprecate and remove keypairs or make it available in QMP. */
>>  static int qemu_rbd_do_create(BlockdevCreateOptions *options,
>>                                const char *keypairs, const char *password_secret,
>> @@ -440,46 +424,6 @@ exit:
>>      return ret;
>>  }
>>
>> -/*
>> - * This aio completion is being called from rbd_finish_bh() and runs in qemu
>> - * BH context.
>> - */
>> -static void qemu_rbd_complete_aio(RADOSCB *rcb)
>> -{
>> -    RBDAIOCB *acb = rcb->acb;
>> -    int64_t r;
>> -
>> -    r = rcb->ret;
>> -
>> -    if (acb->cmd != RBD_AIO_READ) {
>> -        if (r < 0) {
>> -            acb->ret = r;
>> -            acb->error = 1;
>> -        } else if (!acb->error) {
>> -            acb->ret = rcb->size;
>> -        }
>> -    } else {
>> -        if (r < 0) {
>> -            qemu_rbd_memset(rcb, 0);
>> -            acb->ret = r;
>> -            acb->error = 1;
>> -        } else if (r < rcb->size) {
>> -            qemu_rbd_memset(rcb, r);
>> -            if (!acb->error) {
>> -                acb->ret = rcb->size;
>> -            }
>> -        } else if (!acb->error) {
>> -            acb->ret = r;
>> -        }
>> -    }
>> -
>> -    g_free(rcb);
>> -
>> -    acb->common.cb(acb->common.opaque, (acb->ret > 0 ? 0 : acb->ret));
>> -
>> -    qemu_aio_unref(acb);
>> -}
>> -
>>  static char *qemu_rbd_mon_host(BlockdevOptionsRbd *opts, Error **errp)
>>  {
>>      const char **vals;
>> @@ -817,88 +761,49 @@ static int qemu_rbd_resize(BlockDriverState *bs, uint64_t size)
>>      return 0;
>>  }
>>
>> -static const AIOCBInfo rbd_aiocb_info = {
>> -    .aiocb_size = sizeof(RBDAIOCB),
>> -};
>> -
>> -static void rbd_finish_bh(void *opaque)
>> +static void qemu_rbd_finish_bh(void *opaque)
>>  {
>> -    RADOSCB *rcb = opaque;
>> -    qemu_rbd_complete_aio(rcb);
>> +    RBDTask *task = opaque;
>> +    task->complete = 1;
>> +    aio_co_wake(task->co);
>>  }
>>
>> -/*
>> - * This is the callback function for rbd_aio_read and _write
>> - *
>> - * Note: this function is being called from a non qemu thread so
>> - * we need to be careful about what we do here. Generally we only
>> - * schedule a BH, and do the rest of the io completion handling
>> - * from rbd_finish_bh() which runs in a qemu context.
>> - */
>> -static void rbd_finish_aiocb(rbd_completion_t c, RADOSCB *rcb)
>> +static void qemu_rbd_completion_cb(rbd_completion_t c, RBDTask *task)
>>  {
>> -    RBDAIOCB *acb = rcb->acb;
>> -
>> -    rcb->ret = rbd_aio_get_return_value(c);
>> +    task->ret = rbd_aio_get_return_value(c);
>>      rbd_aio_release(c);
>> -
>> -    replay_bh_schedule_oneshot_event(acb->s->aio_context, rbd_finish_bh, rcb);
>> +    aio_bh_schedule_oneshot(task->s->aio_context, qemu_rbd_finish_bh, task);
>>  }
>>
>> -static BlockAIOCB *rbd_start_aio(BlockDriverState *bs,
>> -                                 int64_t off,
>> -                                 QEMUIOVector *qiov,
>> -                                 int64_t size,
>> -                                 BlockCompletionFunc *cb,
>> -                                 void *opaque,
>> -                                 RBDAIOCmd cmd)
>> +static int coroutine_fn qemu_rbd_start_co(BlockDriverState *bs,
>> +                                          uint64_t offset,
>> +                                          uint64_t bytes,
>> +                                          QEMUIOVector *qiov,
>> +                                          int flags,
>> +                                          RBDAIOCmd cmd)
>>  {
>> -    RBDAIOCB *acb;
>> -    RADOSCB *rcb = NULL;
>> -    rbd_completion_t c;
>> -    int r;
>> -
>>      BDRVRBDState *s = bs->opaque;
>> +    RBDTask task = { .s = s, .co = qemu_coroutine_self() };
>> +    rbd_completion_t c;
>> +    int r, ret = -EIO;
>>
>> -    acb = qemu_aio_get(&rbd_aiocb_info, bs, cb, opaque);
>> -    acb->cmd = cmd;
>> -    acb->qiov = qiov;
>> -    assert(!qiov || qiov->size == size);
>> -
>> -    rcb = g_new(RADOSCB, 1);
>> -
>> -    acb->ret = 0;
>> -    acb->error = 0;
>> -    acb->s = s;
>> +    assert(!qiov || qiov->size == bytes);
>>
>> -    rcb->acb = acb;
>> -    rcb->s = acb->s;
>> -    rcb->size = size;
>> -    r = rbd_aio_create_completion(rcb, (rbd_callback_t) rbd_finish_aiocb, &c);
>> +    r = rbd_aio_create_completion(&task,
>> +                                  (rbd_callback_t) qemu_rbd_completion_cb, &c);
>>      if (r < 0) {
>>          goto failed;
>>      }
>>
>>      switch (cmd) {
>> -    case RBD_AIO_WRITE:
>> -        /*
>> -         * RBD APIs don't allow us to write more than actual size, so in order
>> -         * to support growing images, we resize the image before write
>> -         * operations that exceed the current size.
>> -         */
>> -        if (off + size > s->image_size) {
>> -            r = qemu_rbd_resize(bs, off + size);
> I realize this is inherited code that is being refactored, but is it
> really possible for QEMU to issue IOs outside the block extents? This
> also has the same issue as the previous commit comment where the
> "image_size" is only initialized at image open time and therefore this
> path will keep getting hit if IOs are issued beyond the original block
> device extents.


This is for the case that you came to the idea that putting e.g. a qcow2 file on rbd which actually can grow during runtime.

If you put a raw image on an rbd image the only way to grow is with block_resize (which invoces truncate) or with qemu-img resize...


I personally was unsure if it makes sense at all to supprt anything except raw on rbd, but it was supported before so I left this in.


>
>> -            if (r < 0) {
>> -                goto failed_completion;
>> -            }
>> -        }
>> -        r = rbd_aio_writev(s->image, qiov->iov, qiov->niov, off, c);
>> -        break;
>>      case RBD_AIO_READ:
>> -        r = rbd_aio_readv(s->image, qiov->iov, qiov->niov, off, c);
>> +        r = rbd_aio_readv(s->image, qiov->iov, qiov->niov, offset, c);
>> +        break;
>> +    case RBD_AIO_WRITE:
>> +        r = rbd_aio_writev(s->image, qiov->iov, qiov->niov, offset, c);
>>          break;
>>      case RBD_AIO_DISCARD:
>> -        r = rbd_aio_discard(s->image, off, size, c);
>> +        r = rbd_aio_discard(s->image, offset, bytes, c);
>>          break;
>>      case RBD_AIO_FLUSH:
>>          r = rbd_aio_flush(s->image, c);
>> @@ -908,44 +813,71 @@ static BlockAIOCB *rbd_start_aio(BlockDriverState *bs,
>>      }
>>
>>      if (r < 0) {
>> +        task.complete = 1;
>> +        task.ret = r;
>> +    }
>> +
>> +    while (!task.complete) {
>> +        qemu_coroutine_yield();
>> +    }
>> +
>> +    if (task.ret < 0) {
>> +        error_report("rbd request failed: cmd %d offset %" PRIu64 " bytes %"
>> +                     PRIu64 " flags %d task.ret %" PRIi64 " (%s)", cmd, offset,
>> +                     bytes, flags, task.ret, strerror(-task.ret));
>>          goto failed_completion;
>>      }
>> -    return &acb->common;
>> +
>> +    /* zero pad short reads */
>> +    if (cmd == RBD_AIO_READ && task.ret < qiov->size) {
>> +        qemu_iovec_memset(qiov, task.ret, 0, qiov->size - task.ret);
>> +    }
>> +
>> +    return 0;
>>
>>  failed_completion:
>>      rbd_aio_release(c);
> Wasn't this already released in "qemu_rbd_completion_cb"?


You are right, in case of task.ret < 0 we need to go to "failed" and not "failed_completion".

If the I/O is successful we leave the coroutine with "return 0" just above the label.


Peter



^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 6/7] block/rbd: add write zeroes support
  2021-01-14 19:19   ` Jason Dillaman
@ 2021-01-14 19:41     ` Peter Lieven
  2021-01-15 15:09       ` Jason Dillaman
  0 siblings, 1 reply; 28+ messages in thread
From: Peter Lieven @ 2021-01-14 19:41 UTC (permalink / raw)
  To: dillaman; +Cc: Kevin Wolf, Christian Theune, qemu-devel, qemu-block, Max Reitz

Am 14.01.21 um 20:19 schrieb Jason Dillaman:
> On Sun, Dec 27, 2020 at 11:42 AM Peter Lieven <pl@kamp.de> wrote:
>> Signed-off-by: Peter Lieven <pl@kamp.de>
>> ---
>>  block/rbd.c | 31 ++++++++++++++++++++++++++++++-
>>  1 file changed, 30 insertions(+), 1 deletion(-)
>>
>> diff --git a/block/rbd.c b/block/rbd.c
>> index 2d77d0007f..27b4404adf 100644
>> --- a/block/rbd.c
>> +++ b/block/rbd.c
>> @@ -63,7 +63,8 @@ typedef enum {
>>      RBD_AIO_READ,
>>      RBD_AIO_WRITE,
>>      RBD_AIO_DISCARD,
>> -    RBD_AIO_FLUSH
>> +    RBD_AIO_FLUSH,
>> +    RBD_AIO_WRITE_ZEROES
>>  } RBDAIOCmd;
>>
>>  typedef struct BDRVRBDState {
>> @@ -221,8 +222,12 @@ done:
>>
>>  static void qemu_rbd_refresh_limits(BlockDriverState *bs, Error **errp)
>>  {
>> +    BDRVRBDState *s = bs->opaque;
>>      /* XXX Does RBD support AIO on less than 512-byte alignment? */
>>      bs->bl.request_alignment = 512;
>> +#ifdef LIBRBD_SUPPORTS_WRITE_ZEROES
>> +    bs->bl.pwrite_zeroes_alignment = s->object_size;
> The optimal alignment is 512 bytes -- but it can safely work just fine
> down to 1 byte alignment since it will pad the request with additional
> writes if needed.


Okay and this will likely be faster than having Qemu doing that request split, right?

If we drop the alignment hint Qemu will pass the original request.


>
>> +#endif
>>  }
>>
>>
>> @@ -695,6 +700,9 @@ static int qemu_rbd_open(BlockDriverState *bs, QDict *options, int flags,
>>      }
>>
>>      s->aio_context = bdrv_get_aio_context(bs);
>> +#ifdef LIBRBD_SUPPORTS_WRITE_ZEROES
>> +    bs->supported_zero_flags = BDRV_REQ_MAY_UNMAP;
>> +#endif
>>
>>      /* When extending regular files, we get zeros from the OS */
>>      bs->supported_truncate_flags = BDRV_REQ_ZERO_WRITE;
>> @@ -808,6 +816,11 @@ static int coroutine_fn qemu_rbd_start_co(BlockDriverState *bs,
>>      case RBD_AIO_FLUSH:
>>          r = rbd_aio_flush(s->image, c);
>>          break;
>> +#ifdef LIBRBD_SUPPORTS_WRITE_ZEROES
>> +    case RBD_AIO_WRITE_ZEROES:
>> +        r = rbd_aio_write_zeroes(s->image, offset, bytes, c, 0, 0);
>> +        break;
>> +#endif
>>      default:
>>          r = -EINVAL;
>>      }
>> @@ -880,6 +893,19 @@ static int coroutine_fn qemu_rbd_co_pdiscard(BlockDriverState *bs,
>>      return qemu_rbd_start_co(bs, offset, count, NULL, 0, RBD_AIO_DISCARD);
>>  }
>>
>> +#ifdef LIBRBD_SUPPORTS_WRITE_ZEROES
>> +static int
>> +coroutine_fn qemu_rbd_co_pwrite_zeroes(BlockDriverState *bs, int64_t offset,
>> +                                      int count, BdrvRequestFlags flags)
>> +{
>> +    if (!(flags & BDRV_REQ_MAY_UNMAP)) {
>> +        return -ENOTSUP;
>> +    }
> There is a "RBD_WRITE_ZEROES_FLAG_THICK_PROVISION" flag that you can
> use to optionally disable unmap.


I have seen that. If you want I can support for this, too. But afaik this

is only available since Octopus release?


Peter




^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 4/7] block/rbd: add bdrv_{attach,detach}_aio_context
  2021-01-14 19:18   ` Jason Dillaman
@ 2021-01-14 19:49     ` Peter Lieven
  0 siblings, 0 replies; 28+ messages in thread
From: Peter Lieven @ 2021-01-14 19:49 UTC (permalink / raw)
  To: dillaman; +Cc: Kevin Wolf, Christian Theune, qemu-devel, qemu-block, Max Reitz

Am 14.01.21 um 20:18 schrieb Jason Dillaman:
> On Sun, Dec 27, 2020 at 11:42 AM Peter Lieven <pl@kamp.de> wrote:
>> Signed-off-by: Peter Lieven <pl@kamp.de>
>> ---
>>  block/rbd.c | 21 +++++++++++++++++++--
>>  1 file changed, 19 insertions(+), 2 deletions(-)
>>
>> diff --git a/block/rbd.c b/block/rbd.c
>> index a2da70e37f..27b232f4d8 100644
>> --- a/block/rbd.c
>> +++ b/block/rbd.c
>> @@ -91,6 +91,7 @@ typedef struct BDRVRBDState {
>>      char *namespace;
>>      uint64_t image_size;
>>      uint64_t object_size;
>> +    AioContext *aio_context;
>>  } BDRVRBDState;
>>
>>  static int qemu_rbd_connect(rados_t *cluster, rados_ioctx_t *io_ctx,
>> @@ -749,6 +750,8 @@ static int qemu_rbd_open(BlockDriverState *bs, QDict *options, int flags,
>>          }
>>      }
>>
>> +    s->aio_context = bdrv_get_aio_context(bs);
>> +
>>      /* When extending regular files, we get zeros from the OS */
>>      bs->supported_truncate_flags = BDRV_REQ_ZERO_WRITE;
>>
>> @@ -839,8 +842,7 @@ static void rbd_finish_aiocb(rbd_completion_t c, RADOSCB *rcb)
>>      rcb->ret = rbd_aio_get_return_value(c);
>>      rbd_aio_release(c);
>>
>> -    replay_bh_schedule_oneshot_event(bdrv_get_aio_context(acb->common.bs),
>> -                                     rbd_finish_bh, rcb);
>> +    replay_bh_schedule_oneshot_event(acb->s->aio_context, rbd_finish_bh, rcb);
>>  }
>>
>>  static BlockAIOCB *rbd_start_aio(BlockDriverState *bs,
>> @@ -1151,6 +1153,18 @@ static const char *const qemu_rbd_strong_runtime_opts[] = {
>>      NULL
>>  };
>>
>> +static void qemu_rbd_attach_aio_context(BlockDriverState *bs,
>> +                                       AioContext *new_context)
>> +{
>> +    BDRVRBDState *s = bs->opaque;
>> +    s->aio_context = new_context;
>> +}
>> +
>> +static void qemu_rbd_detach_aio_context(BlockDriverState *bs)
>> +{
> I don't know enough about the internals of QEMU, but this seems
> suspicious to be a no-op.


You are right, I was believing attach and detach aio_context functions always needs to be implemented both at the same time.

Normally this is the point where internal timers will be deleted or polling an fd will be stopped.

We can leave it completely out or set s->aio_context = NULL if we don't want to omit it.


Peter





^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 7/7] block/rbd: change request alignment to 1 byte
  2021-01-14 19:19   ` Jason Dillaman
@ 2021-01-14 19:59     ` Peter Lieven
  2021-01-15 15:27       ` Jason Dillaman
  0 siblings, 1 reply; 28+ messages in thread
From: Peter Lieven @ 2021-01-14 19:59 UTC (permalink / raw)
  To: dillaman; +Cc: Kevin Wolf, Christian Theune, qemu-devel, qemu-block, Max Reitz

Am 14.01.21 um 20:19 schrieb Jason Dillaman:
> On Sun, Dec 27, 2020 at 11:42 AM Peter Lieven <pl@kamp.de> wrote:
>> since we implement byte interfaces and librbd supports aio on byte granularity we can lift
>> the 512 byte alignment.
>>
>> Signed-off-by: Peter Lieven <pl@kamp.de>
>> ---
>>  block/rbd.c | 2 --
>>  1 file changed, 2 deletions(-)
>>
>> diff --git a/block/rbd.c b/block/rbd.c
>> index 27b4404adf..8673e8f553 100644
>> --- a/block/rbd.c
>> +++ b/block/rbd.c
>> @@ -223,8 +223,6 @@ done:
>>  static void qemu_rbd_refresh_limits(BlockDriverState *bs, Error **errp)
>>  {
>>      BDRVRBDState *s = bs->opaque;
>> -    /* XXX Does RBD support AIO on less than 512-byte alignment? */
>> -    bs->bl.request_alignment = 512;
> Just a suggestion, but perhaps improve discard alignment, max discard,
> optimal alignment (if that's something QEMU handles internally) if not
> overridden by the user.


Qemu supports max_discard and discard_alignment. Is there a call to get these limits

from librbd?


What do you mean by optimal_alignment? The object size?


Peter





^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 6/7] block/rbd: add write zeroes support
  2021-01-14 19:41     ` Peter Lieven
@ 2021-01-15 15:09       ` Jason Dillaman
  0 siblings, 0 replies; 28+ messages in thread
From: Jason Dillaman @ 2021-01-15 15:09 UTC (permalink / raw)
  To: Peter Lieven
  Cc: Kevin Wolf, qemu-block, Christian Theune, qemu-devel, Max Reitz,
	dillaman

On Thu, Jan 14, 2021 at 2:41 PM Peter Lieven <pl@kamp.de> wrote:
>
> Am 14.01.21 um 20:19 schrieb Jason Dillaman:
> > On Sun, Dec 27, 2020 at 11:42 AM Peter Lieven <pl@kamp.de> wrote:
> >> Signed-off-by: Peter Lieven <pl@kamp.de>
> >> ---
> >>  block/rbd.c | 31 ++++++++++++++++++++++++++++++-
> >>  1 file changed, 30 insertions(+), 1 deletion(-)
> >>
> >> diff --git a/block/rbd.c b/block/rbd.c
> >> index 2d77d0007f..27b4404adf 100644
> >> --- a/block/rbd.c
> >> +++ b/block/rbd.c
> >> @@ -63,7 +63,8 @@ typedef enum {
> >>      RBD_AIO_READ,
> >>      RBD_AIO_WRITE,
> >>      RBD_AIO_DISCARD,
> >> -    RBD_AIO_FLUSH
> >> +    RBD_AIO_FLUSH,
> >> +    RBD_AIO_WRITE_ZEROES
> >>  } RBDAIOCmd;
> >>
> >>  typedef struct BDRVRBDState {
> >> @@ -221,8 +222,12 @@ done:
> >>
> >>  static void qemu_rbd_refresh_limits(BlockDriverState *bs, Error **errp)
> >>  {
> >> +    BDRVRBDState *s = bs->opaque;
> >>      /* XXX Does RBD support AIO on less than 512-byte alignment? */
> >>      bs->bl.request_alignment = 512;
> >> +#ifdef LIBRBD_SUPPORTS_WRITE_ZEROES
> >> +    bs->bl.pwrite_zeroes_alignment = s->object_size;
> > The optimal alignment is 512 bytes -- but it can safely work just fine
> > down to 1 byte alignment since it will pad the request with additional
> > writes if needed.
>
>
> Okay and this will likely be faster than having Qemu doing that request split, right?
>
> If we drop the alignment hint Qemu will pass the original request.
>
>
> >
> >> +#endif
> >>  }
> >>
> >>
> >> @@ -695,6 +700,9 @@ static int qemu_rbd_open(BlockDriverState *bs, QDict *options, int flags,
> >>      }
> >>
> >>      s->aio_context = bdrv_get_aio_context(bs);
> >> +#ifdef LIBRBD_SUPPORTS_WRITE_ZEROES
> >> +    bs->supported_zero_flags = BDRV_REQ_MAY_UNMAP;
> >> +#endif
> >>
> >>      /* When extending regular files, we get zeros from the OS */
> >>      bs->supported_truncate_flags = BDRV_REQ_ZERO_WRITE;
> >> @@ -808,6 +816,11 @@ static int coroutine_fn qemu_rbd_start_co(BlockDriverState *bs,
> >>      case RBD_AIO_FLUSH:
> >>          r = rbd_aio_flush(s->image, c);
> >>          break;
> >> +#ifdef LIBRBD_SUPPORTS_WRITE_ZEROES
> >> +    case RBD_AIO_WRITE_ZEROES:
> >> +        r = rbd_aio_write_zeroes(s->image, offset, bytes, c, 0, 0);
> >> +        break;
> >> +#endif
> >>      default:
> >>          r = -EINVAL;
> >>      }
> >> @@ -880,6 +893,19 @@ static int coroutine_fn qemu_rbd_co_pdiscard(BlockDriverState *bs,
> >>      return qemu_rbd_start_co(bs, offset, count, NULL, 0, RBD_AIO_DISCARD);
> >>  }
> >>
> >> +#ifdef LIBRBD_SUPPORTS_WRITE_ZEROES
> >> +static int
> >> +coroutine_fn qemu_rbd_co_pwrite_zeroes(BlockDriverState *bs, int64_t offset,
> >> +                                      int count, BdrvRequestFlags flags)
> >> +{
> >> +    if (!(flags & BDRV_REQ_MAY_UNMAP)) {
> >> +        return -ENOTSUP;
> >> +    }
> > There is a "RBD_WRITE_ZEROES_FLAG_THICK_PROVISION" flag that you can
> > use to optionally disable unmap.
>
>
> I have seen that. If you want I can support for this, too. But afaik this
>
> is only available since Octopus release?

True -- I didn't backport that support to Nautilus since it was a new
feature vs the bug-fix that the write-zeroes API was introduced to
fix.

>
> Peter
>
>


-- 
Jason



^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 7/7] block/rbd: change request alignment to 1 byte
  2021-01-14 19:59     ` Peter Lieven
@ 2021-01-15 15:27       ` Jason Dillaman
  2021-01-15 15:39         ` Peter Lieven
  0 siblings, 1 reply; 28+ messages in thread
From: Jason Dillaman @ 2021-01-15 15:27 UTC (permalink / raw)
  To: Peter Lieven
  Cc: Kevin Wolf, Christian Theune, qemu-devel, qemu-block, Max Reitz

On Thu, Jan 14, 2021 at 2:59 PM Peter Lieven <pl@kamp.de> wrote:
>
> Am 14.01.21 um 20:19 schrieb Jason Dillaman:
> > On Sun, Dec 27, 2020 at 11:42 AM Peter Lieven <pl@kamp.de> wrote:
> >> since we implement byte interfaces and librbd supports aio on byte granularity we can lift
> >> the 512 byte alignment.
> >>
> >> Signed-off-by: Peter Lieven <pl@kamp.de>
> >> ---
> >>  block/rbd.c | 2 --
> >>  1 file changed, 2 deletions(-)
> >>
> >> diff --git a/block/rbd.c b/block/rbd.c
> >> index 27b4404adf..8673e8f553 100644
> >> --- a/block/rbd.c
> >> +++ b/block/rbd.c
> >> @@ -223,8 +223,6 @@ done:
> >>  static void qemu_rbd_refresh_limits(BlockDriverState *bs, Error **errp)
> >>  {
> >>      BDRVRBDState *s = bs->opaque;
> >> -    /* XXX Does RBD support AIO on less than 512-byte alignment? */
> >> -    bs->bl.request_alignment = 512;
> > Just a suggestion, but perhaps improve discard alignment, max discard,
> > optimal alignment (if that's something QEMU handles internally) if not
> > overridden by the user.
>
>
> Qemu supports max_discard and discard_alignment. Is there a call to get these limits
>
> from librbd?
>
>
> What do you mean by optimal_alignment? The object size?

krbd does a good job of initializing defaults [1] where optimal and
discard alignment is 64KiB (can actually be 4KiB now), max IO size for
writes, discards, and write-zeroes is the object size * the stripe
count.

> Peter
>
>
>

[1] https://github.com/torvalds/linux/blob/master/drivers/block/rbd.c#L4981

-- 
Jason



^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 7/7] block/rbd: change request alignment to 1 byte
  2021-01-15 15:27       ` Jason Dillaman
@ 2021-01-15 15:39         ` Peter Lieven
  2021-01-18 22:33           ` Jason Dillaman
  0 siblings, 1 reply; 28+ messages in thread
From: Peter Lieven @ 2021-01-15 15:39 UTC (permalink / raw)
  To: dillaman; +Cc: Kevin Wolf, Christian Theune, qemu-devel, qemu-block, Max Reitz

Am 15.01.21 um 16:27 schrieb Jason Dillaman:
> On Thu, Jan 14, 2021 at 2:59 PM Peter Lieven <pl@kamp.de> wrote:
>> Am 14.01.21 um 20:19 schrieb Jason Dillaman:
>>> On Sun, Dec 27, 2020 at 11:42 AM Peter Lieven <pl@kamp.de> wrote:
>>>> since we implement byte interfaces and librbd supports aio on byte granularity we can lift
>>>> the 512 byte alignment.
>>>>
>>>> Signed-off-by: Peter Lieven <pl@kamp.de>
>>>> ---
>>>>  block/rbd.c | 2 --
>>>>  1 file changed, 2 deletions(-)
>>>>
>>>> diff --git a/block/rbd.c b/block/rbd.c
>>>> index 27b4404adf..8673e8f553 100644
>>>> --- a/block/rbd.c
>>>> +++ b/block/rbd.c
>>>> @@ -223,8 +223,6 @@ done:
>>>>  static void qemu_rbd_refresh_limits(BlockDriverState *bs, Error **errp)
>>>>  {
>>>>      BDRVRBDState *s = bs->opaque;
>>>> -    /* XXX Does RBD support AIO on less than 512-byte alignment? */
>>>> -    bs->bl.request_alignment = 512;
>>> Just a suggestion, but perhaps improve discard alignment, max discard,
>>> optimal alignment (if that's something QEMU handles internally) if not
>>> overridden by the user.
>>
>> Qemu supports max_discard and discard_alignment. Is there a call to get these limits
>>
>> from librbd?
>>
>>
>> What do you mean by optimal_alignment? The object size?
> krbd does a good job of initializing defaults [1] where optimal and
> discard alignment is 64KiB (can actually be 4KiB now), max IO size for
> writes, discards, and write-zeroes is the object size * the stripe
> count.


Okay, I will have a look at it. If qemu issues a write, discard, write_zero greater than

obj_size  * stripe count will librbd split it internally or will the request fail?


Regarding the alignment it seems that rbd_dev->opts->alloc_size is something that comes from the device

configuration and not from rbd? I don't have that information inside the Qemu RBD driver.


Peter




^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 7/7] block/rbd: change request alignment to 1 byte
  2021-01-15 15:39         ` Peter Lieven
@ 2021-01-18 22:33           ` Jason Dillaman
  2021-01-19  9:36             ` Peter Lieven
  0 siblings, 1 reply; 28+ messages in thread
From: Jason Dillaman @ 2021-01-18 22:33 UTC (permalink / raw)
  To: Peter Lieven
  Cc: Kevin Wolf, Christian Theune, qemu-devel, qemu-block, Max Reitz

On Fri, Jan 15, 2021 at 10:39 AM Peter Lieven <pl@kamp.de> wrote:
>
> Am 15.01.21 um 16:27 schrieb Jason Dillaman:
> > On Thu, Jan 14, 2021 at 2:59 PM Peter Lieven <pl@kamp.de> wrote:
> >> Am 14.01.21 um 20:19 schrieb Jason Dillaman:
> >>> On Sun, Dec 27, 2020 at 11:42 AM Peter Lieven <pl@kamp.de> wrote:
> >>>> since we implement byte interfaces and librbd supports aio on byte granularity we can lift
> >>>> the 512 byte alignment.
> >>>>
> >>>> Signed-off-by: Peter Lieven <pl@kamp.de>
> >>>> ---
> >>>>  block/rbd.c | 2 --
> >>>>  1 file changed, 2 deletions(-)
> >>>>
> >>>> diff --git a/block/rbd.c b/block/rbd.c
> >>>> index 27b4404adf..8673e8f553 100644
> >>>> --- a/block/rbd.c
> >>>> +++ b/block/rbd.c
> >>>> @@ -223,8 +223,6 @@ done:
> >>>>  static void qemu_rbd_refresh_limits(BlockDriverState *bs, Error **errp)
> >>>>  {
> >>>>      BDRVRBDState *s = bs->opaque;
> >>>> -    /* XXX Does RBD support AIO on less than 512-byte alignment? */
> >>>> -    bs->bl.request_alignment = 512;
> >>> Just a suggestion, but perhaps improve discard alignment, max discard,
> >>> optimal alignment (if that's something QEMU handles internally) if not
> >>> overridden by the user.
> >>
> >> Qemu supports max_discard and discard_alignment. Is there a call to get these limits
> >>
> >> from librbd?
> >>
> >>
> >> What do you mean by optimal_alignment? The object size?
> > krbd does a good job of initializing defaults [1] where optimal and
> > discard alignment is 64KiB (can actually be 4KiB now), max IO size for
> > writes, discards, and write-zeroes is the object size * the stripe
> > count.
>
>
> Okay, I will have a look at it. If qemu issues a write, discard, write_zero greater than
>
> obj_size  * stripe count will librbd split it internally or will the request fail?

librbd will handle it as needed. My goal is really just to get the
hints down the guest OS.

> Regarding the alignment it seems that rbd_dev->opts->alloc_size is something that comes from the device
>
> configuration and not from rbd? I don't have that information inside the Qemu RBD driver.

librbd doesn't really have the information either. The 64KiB guess
that krbd uses was a compromise since that was the default OSD
allocation size for HDDs since Luminous. Starting with Pacific that
default is going down to 4KiB.

>
> Peter
>
>


-- 
Jason



^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 7/7] block/rbd: change request alignment to 1 byte
  2021-01-18 22:33           ` Jason Dillaman
@ 2021-01-19  9:36             ` Peter Lieven
  2021-01-19 14:20               ` Jason Dillaman
  0 siblings, 1 reply; 28+ messages in thread
From: Peter Lieven @ 2021-01-19  9:36 UTC (permalink / raw)
  To: dillaman; +Cc: Kevin Wolf, Christian Theune, qemu-devel, qemu-block, Max Reitz

Am 18.01.21 um 23:33 schrieb Jason Dillaman:
> On Fri, Jan 15, 2021 at 10:39 AM Peter Lieven <pl@kamp.de> wrote:
>> Am 15.01.21 um 16:27 schrieb Jason Dillaman:
>>> On Thu, Jan 14, 2021 at 2:59 PM Peter Lieven <pl@kamp.de> wrote:
>>>> Am 14.01.21 um 20:19 schrieb Jason Dillaman:
>>>>> On Sun, Dec 27, 2020 at 11:42 AM Peter Lieven <pl@kamp.de> wrote:
>>>>>> since we implement byte interfaces and librbd supports aio on byte granularity we can lift
>>>>>> the 512 byte alignment.
>>>>>>
>>>>>> Signed-off-by: Peter Lieven <pl@kamp.de>
>>>>>> ---
>>>>>>  block/rbd.c | 2 --
>>>>>>  1 file changed, 2 deletions(-)
>>>>>>
>>>>>> diff --git a/block/rbd.c b/block/rbd.c
>>>>>> index 27b4404adf..8673e8f553 100644
>>>>>> --- a/block/rbd.c
>>>>>> +++ b/block/rbd.c
>>>>>> @@ -223,8 +223,6 @@ done:
>>>>>>  static void qemu_rbd_refresh_limits(BlockDriverState *bs, Error **errp)
>>>>>>  {
>>>>>>      BDRVRBDState *s = bs->opaque;
>>>>>> -    /* XXX Does RBD support AIO on less than 512-byte alignment? */
>>>>>> -    bs->bl.request_alignment = 512;
>>>>> Just a suggestion, but perhaps improve discard alignment, max discard,
>>>>> optimal alignment (if that's something QEMU handles internally) if not
>>>>> overridden by the user.
>>>> Qemu supports max_discard and discard_alignment. Is there a call to get these limits
>>>>
>>>> from librbd?
>>>>
>>>>
>>>> What do you mean by optimal_alignment? The object size?
>>> krbd does a good job of initializing defaults [1] where optimal and
>>> discard alignment is 64KiB (can actually be 4KiB now), max IO size for
>>> writes, discards, and write-zeroes is the object size * the stripe
>>> count.
>>
>> Okay, I will have a look at it. If qemu issues a write, discard, write_zero greater than
>>
>> obj_size  * stripe count will librbd split it internally or will the request fail?
> librbd will handle it as needed. My goal is really just to get the
> hints down the guest OS.
>
>> Regarding the alignment it seems that rbd_dev->opts->alloc_size is something that comes from the device
>>
>> configuration and not from rbd? I don't have that information inside the Qemu RBD driver.
> librbd doesn't really have the information either. The 64KiB guess
> that krbd uses was a compromise since that was the default OSD
> allocation size for HDDs since Luminous. Starting with Pacific that
> default is going down to 4KiB.


I will try to adjust these values as far as it is possible and makes sense.


Is there a way to check the minimum supported OSD release in the backend from librbd / librados?


Anyway, I want to sent a V2 by the end of this week.


Peter




^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 7/7] block/rbd: change request alignment to 1 byte
  2021-01-19  9:36             ` Peter Lieven
@ 2021-01-19 14:20               ` Jason Dillaman
  0 siblings, 0 replies; 28+ messages in thread
From: Jason Dillaman @ 2021-01-19 14:20 UTC (permalink / raw)
  To: Peter Lieven
  Cc: Kevin Wolf, Christian Theune, qemu-devel, qemu-block, Max Reitz

On Tue, Jan 19, 2021 at 4:36 AM Peter Lieven <pl@kamp.de> wrote:
>
> Am 18.01.21 um 23:33 schrieb Jason Dillaman:
> > On Fri, Jan 15, 2021 at 10:39 AM Peter Lieven <pl@kamp.de> wrote:
> >> Am 15.01.21 um 16:27 schrieb Jason Dillaman:
> >>> On Thu, Jan 14, 2021 at 2:59 PM Peter Lieven <pl@kamp.de> wrote:
> >>>> Am 14.01.21 um 20:19 schrieb Jason Dillaman:
> >>>>> On Sun, Dec 27, 2020 at 11:42 AM Peter Lieven <pl@kamp.de> wrote:
> >>>>>> since we implement byte interfaces and librbd supports aio on byte granularity we can lift
> >>>>>> the 512 byte alignment.
> >>>>>>
> >>>>>> Signed-off-by: Peter Lieven <pl@kamp.de>
> >>>>>> ---
> >>>>>>  block/rbd.c | 2 --
> >>>>>>  1 file changed, 2 deletions(-)
> >>>>>>
> >>>>>> diff --git a/block/rbd.c b/block/rbd.c
> >>>>>> index 27b4404adf..8673e8f553 100644
> >>>>>> --- a/block/rbd.c
> >>>>>> +++ b/block/rbd.c
> >>>>>> @@ -223,8 +223,6 @@ done:
> >>>>>>  static void qemu_rbd_refresh_limits(BlockDriverState *bs, Error **errp)
> >>>>>>  {
> >>>>>>      BDRVRBDState *s = bs->opaque;
> >>>>>> -    /* XXX Does RBD support AIO on less than 512-byte alignment? */
> >>>>>> -    bs->bl.request_alignment = 512;
> >>>>> Just a suggestion, but perhaps improve discard alignment, max discard,
> >>>>> optimal alignment (if that's something QEMU handles internally) if not
> >>>>> overridden by the user.
> >>>> Qemu supports max_discard and discard_alignment. Is there a call to get these limits
> >>>>
> >>>> from librbd?
> >>>>
> >>>>
> >>>> What do you mean by optimal_alignment? The object size?
> >>> krbd does a good job of initializing defaults [1] where optimal and
> >>> discard alignment is 64KiB (can actually be 4KiB now), max IO size for
> >>> writes, discards, and write-zeroes is the object size * the stripe
> >>> count.
> >>
> >> Okay, I will have a look at it. If qemu issues a write, discard, write_zero greater than
> >>
> >> obj_size  * stripe count will librbd split it internally or will the request fail?
> > librbd will handle it as needed. My goal is really just to get the
> > hints down the guest OS.
> >
> >> Regarding the alignment it seems that rbd_dev->opts->alloc_size is something that comes from the device
> >>
> >> configuration and not from rbd? I don't have that information inside the Qemu RBD driver.
> > librbd doesn't really have the information either. The 64KiB guess
> > that krbd uses was a compromise since that was the default OSD
> > allocation size for HDDs since Luminous. Starting with Pacific that
> > default is going down to 4KiB.
>
>
> I will try to adjust these values as far as it is possible and makes sense.
>
>
> Is there a way to check the minimum supported OSD release in the backend from librbd / librados?

It's not a minimum -- RADOS will gladly access 1 byte writes as well.
It's really just the optimal (performance and space-wise). Sadly,
there is no realistic way to query this data from the backend.

>
> Anyway, I want to sent a V2 by the end of this week.
>
>
> Peter
>
>


-- 
Jason



^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 7/7] block/rbd: change request alignment to 1 byte
  2021-01-21 20:29   ` Peter Lieven
@ 2021-01-21 20:55     ` Jason Dillaman
  0 siblings, 0 replies; 28+ messages in thread
From: Jason Dillaman @ 2021-01-21 20:55 UTC (permalink / raw)
  To: Peter Lieven
  Cc: Kevin Wolf, Christian Theune, qemu-devel, qemu-block, Max Reitz

On Thu, Jan 21, 2021 at 3:29 PM Peter Lieven <pl@kamp.de> wrote:
>
> Am 21.01.21 um 20:42 schrieb Jason Dillaman:
> > On Wed, Jan 20, 2021 at 6:01 PM Peter Lieven <pl@kamp.de> wrote:
> >>
> >>> Am 19.01.2021 um 15:20 schrieb Jason Dillaman <jdillama@redhat.com>:
> >>>
> >>> On Tue, Jan 19, 2021 at 4:36 AM Peter Lieven <pl@kamp.de> wrote:
> >>>>> Am 18.01.21 um 23:33 schrieb Jason Dillaman:
> >>>>> On Fri, Jan 15, 2021 at 10:39 AM Peter Lieven <pl@kamp.de> wrote:
> >>>>>> Am 15.01.21 um 16:27 schrieb Jason Dillaman:
> >>>>>>> On Thu, Jan 14, 2021 at 2:59 PM Peter Lieven <pl@kamp.de> wrote:
> >>>>>>>> Am 14.01.21 um 20:19 schrieb Jason Dillaman:
> >>>>>>>>> On Sun, Dec 27, 2020 at 11:42 AM Peter Lieven <pl@kamp.de> wrote:
> >>>>>>>>>> since we implement byte interfaces and librbd supports aio on byte granularity we can lift
> >>>>>>>>>> the 512 byte alignment.
> >>>>>>>>>> Signed-off-by: Peter Lieven <pl@kamp.de>
> >>>>>>>>>> ---
> >>>>>>>>>> block/rbd.c | 2 --
> >>>>>>>>>> 1 file changed, 2 deletions(-)
> >>>>>>>>>> diff --git a/block/rbd.c b/block/rbd.c
> >>>>>>>>>> index 27b4404adf..8673e8f553 100644
> >>>>>>>>>> --- a/block/rbd.c
> >>>>>>>>>> +++ b/block/rbd.c
> >>>>>>>>>> @@ -223,8 +223,6 @@ done:
> >>>>>>>>>> static void qemu_rbd_refresh_limits(BlockDriverState *bs, Error **errp)
> >>>>>>>>>> {
> >>>>>>>>>>    BDRVRBDState *s = bs->opaque;
> >>>>>>>>>> -    /* XXX Does RBD support AIO on less than 512-byte alignment? */
> >>>>>>>>>> -    bs->bl.request_alignment = 512;
> >>>>>>>>> Just a suggestion, but perhaps improve discard alignment, max discard,
> >>>>>>>>> optimal alignment (if that's something QEMU handles internally) if not
> >>>>>>>>> overridden by the user.
> >>>>>>>> Qemu supports max_discard and discard_alignment. Is there a call to get these limits
> >>>>>>>> from librbd?
> >>>>>>>> What do you mean by optimal_alignment? The object size?
> >>>>>>> krbd does a good job of initializing defaults [1] where optimal and
> >>>>>>> discard alignment is 64KiB (can actually be 4KiB now), max IO size for
> >>>>>>> writes, discards, and write-zeroes is the object size * the stripe
> >>>>>>> count.
> >>>>>> Okay, I will have a look at it. If qemu issues a write, discard, write_zero greater than
> >>>>>> obj_size  * stripe count will librbd split it internally or will the request fail?
> >>>>> librbd will handle it as needed. My goal is really just to get the
> >>>>> hints down the guest OS.
> >>>>>> Regarding the alignment it seems that rbd_dev->opts->alloc_size is something that comes from the device
> >>>>>> configuration and not from rbd? I don't have that information inside the Qemu RBD driver.
> >>>>> librbd doesn't really have the information either. The 64KiB guess
> >>>>> that krbd uses was a compromise since that was the default OSD
> >>>>> allocation size for HDDs since Luminous. Starting with Pacific that
> >>>>> default is going down to 4KiB.
> >>>> I will try to adjust these values as far as it is possible and makes sense.
> >>>> Is there a way to check the minimum supported OSD release in the backend from librbd / librados?
> >>> It's not a minimum -- RADOS will gladly access 1 byte writes as well.
> >>> It's really just the optimal (performance and space-wise). Sadly,
> >>> there is no realistic way to query this data from the backend.
> >> So you would suggest to advertise an optimal transfer length of 64k and max transfer length of obj size * stripe count to the guest unless we have an API in the future to query these limits from the backend?
> > I'll open a Ceph tracker ticket to expose these via the API in a future release.
>
>
> That would be good to have!
>
>
> >
> >> I would leave request alignment at 1 byte as otherwise Qemu will issue RMWs for all write requests that do not align. Everything that comes from a guest OS is very likely 4k aligned anyway.
> > My goal is definitely not to have QEMU do any extra work for splitting
> > or aligning IOs. I am really only trying to get hints passed down the
> > guest via the virtio drivers. If there isn't the plumbing in QEMU for
> > that yet, disregard.
>
>
> From what I read from the code Qemu emulates the Block Limits VPD Page for virtio-scsi, but the limits there are not taken from the backend driver. They can be passed as config to the virtio-scsi device.
>
> However, Qemu uses all the Block Limit that can be found in include/block/block_int.h in the BlockLimits struct to generate optimal requests if it comes to block operations like mirroring, or zeroing of a whole
>
> device etc. Some of the alignments (e.g. the request alignment) have direct influence and make Qemu split requests from the guest or even make RMW cycles.
>
> If my understanding is incorrect please anyone correct me.
>
>
> It would indeed be nice to have an option to propagate the settings from the backend driver into the Guest. But from my understanding this is not there yet.
>
>
> So I would leave it as it. Drop the request_alignment = 512 (like in the patch) and just advertise the cluster_size at obj_size. This is already in the rbd driver today.
>
> The cluster_size e.g. is used in any qemu-img convert operation to align read / write requests and size them apropiately.

+1 to leave as-is until we can pass those hints down.

> Peter
>
>

Thanks!

-- 
Jason



^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 7/7] block/rbd: change request alignment to 1 byte
  2021-01-21 19:42 ` Jason Dillaman
@ 2021-01-21 20:29   ` Peter Lieven
  2021-01-21 20:55     ` Jason Dillaman
  0 siblings, 1 reply; 28+ messages in thread
From: Peter Lieven @ 2021-01-21 20:29 UTC (permalink / raw)
  To: dillaman; +Cc: Kevin Wolf, Christian Theune, qemu-devel, qemu-block, Max Reitz

Am 21.01.21 um 20:42 schrieb Jason Dillaman:
> On Wed, Jan 20, 2021 at 6:01 PM Peter Lieven <pl@kamp.de> wrote:
>>
>>> Am 19.01.2021 um 15:20 schrieb Jason Dillaman <jdillama@redhat.com>:
>>>
>>> On Tue, Jan 19, 2021 at 4:36 AM Peter Lieven <pl@kamp.de> wrote:
>>>>> Am 18.01.21 um 23:33 schrieb Jason Dillaman:
>>>>> On Fri, Jan 15, 2021 at 10:39 AM Peter Lieven <pl@kamp.de> wrote:
>>>>>> Am 15.01.21 um 16:27 schrieb Jason Dillaman:
>>>>>>> On Thu, Jan 14, 2021 at 2:59 PM Peter Lieven <pl@kamp.de> wrote:
>>>>>>>> Am 14.01.21 um 20:19 schrieb Jason Dillaman:
>>>>>>>>> On Sun, Dec 27, 2020 at 11:42 AM Peter Lieven <pl@kamp.de> wrote:
>>>>>>>>>> since we implement byte interfaces and librbd supports aio on byte granularity we can lift
>>>>>>>>>> the 512 byte alignment.
>>>>>>>>>> Signed-off-by: Peter Lieven <pl@kamp.de>
>>>>>>>>>> ---
>>>>>>>>>> block/rbd.c | 2 --
>>>>>>>>>> 1 file changed, 2 deletions(-)
>>>>>>>>>> diff --git a/block/rbd.c b/block/rbd.c
>>>>>>>>>> index 27b4404adf..8673e8f553 100644
>>>>>>>>>> --- a/block/rbd.c
>>>>>>>>>> +++ b/block/rbd.c
>>>>>>>>>> @@ -223,8 +223,6 @@ done:
>>>>>>>>>> static void qemu_rbd_refresh_limits(BlockDriverState *bs, Error **errp)
>>>>>>>>>> {
>>>>>>>>>>    BDRVRBDState *s = bs->opaque;
>>>>>>>>>> -    /* XXX Does RBD support AIO on less than 512-byte alignment? */
>>>>>>>>>> -    bs->bl.request_alignment = 512;
>>>>>>>>> Just a suggestion, but perhaps improve discard alignment, max discard,
>>>>>>>>> optimal alignment (if that's something QEMU handles internally) if not
>>>>>>>>> overridden by the user.
>>>>>>>> Qemu supports max_discard and discard_alignment. Is there a call to get these limits
>>>>>>>> from librbd?
>>>>>>>> What do you mean by optimal_alignment? The object size?
>>>>>>> krbd does a good job of initializing defaults [1] where optimal and
>>>>>>> discard alignment is 64KiB (can actually be 4KiB now), max IO size for
>>>>>>> writes, discards, and write-zeroes is the object size * the stripe
>>>>>>> count.
>>>>>> Okay, I will have a look at it. If qemu issues a write, discard, write_zero greater than
>>>>>> obj_size  * stripe count will librbd split it internally or will the request fail?
>>>>> librbd will handle it as needed. My goal is really just to get the
>>>>> hints down the guest OS.
>>>>>> Regarding the alignment it seems that rbd_dev->opts->alloc_size is something that comes from the device
>>>>>> configuration and not from rbd? I don't have that information inside the Qemu RBD driver.
>>>>> librbd doesn't really have the information either. The 64KiB guess
>>>>> that krbd uses was a compromise since that was the default OSD
>>>>> allocation size for HDDs since Luminous. Starting with Pacific that
>>>>> default is going down to 4KiB.
>>>> I will try to adjust these values as far as it is possible and makes sense.
>>>> Is there a way to check the minimum supported OSD release in the backend from librbd / librados?
>>> It's not a minimum -- RADOS will gladly access 1 byte writes as well.
>>> It's really just the optimal (performance and space-wise). Sadly,
>>> there is no realistic way to query this data from the backend.
>> So you would suggest to advertise an optimal transfer length of 64k and max transfer length of obj size * stripe count to the guest unless we have an API in the future to query these limits from the backend?
> I'll open a Ceph tracker ticket to expose these via the API in a future release.


That would be good to have!


>
>> I would leave request alignment at 1 byte as otherwise Qemu will issue RMWs for all write requests that do not align. Everything that comes from a guest OS is very likely 4k aligned anyway.
> My goal is definitely not to have QEMU do any extra work for splitting
> or aligning IOs. I am really only trying to get hints passed down the
> guest via the virtio drivers. If there isn't the plumbing in QEMU for
> that yet, disregard.


From what I read from the code Qemu emulates the Block Limits VPD Page for virtio-scsi, but the limits there are not taken from the backend driver. They can be passed as config to the virtio-scsi device.

However, Qemu uses all the Block Limit that can be found in include/block/block_int.h in the BlockLimits struct to generate optimal requests if it comes to block operations like mirroring, or zeroing of a whole

device etc. Some of the alignments (e.g. the request alignment) have direct influence and make Qemu split requests from the guest or even make RMW cycles.

If my understanding is incorrect please anyone correct me.


It would indeed be nice to have an option to propagate the settings from the backend driver into the Guest. But from my understanding this is not there yet.


So I would leave it as it. Drop the request_alignment = 512 (like in the patch) and just advertise the cluster_size at obj_size. This is already in the rbd driver today.

The cluster_size e.g. is used in any qemu-img convert operation to align read / write requests and size them apropiately.


Peter




^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 7/7] block/rbd: change request alignment to 1 byte
  2021-01-20 23:01 Peter Lieven
@ 2021-01-21 19:42 ` Jason Dillaman
  2021-01-21 20:29   ` Peter Lieven
  0 siblings, 1 reply; 28+ messages in thread
From: Jason Dillaman @ 2021-01-21 19:42 UTC (permalink / raw)
  To: Peter Lieven
  Cc: Kevin Wolf, Christian Theune, qemu-devel, qemu-block, Max Reitz

On Wed, Jan 20, 2021 at 6:01 PM Peter Lieven <pl@kamp.de> wrote:
>
>
> > Am 19.01.2021 um 15:20 schrieb Jason Dillaman <jdillama@redhat.com>:
> >
> > On Tue, Jan 19, 2021 at 4:36 AM Peter Lieven <pl@kamp.de> wrote:
> >>> Am 18.01.21 um 23:33 schrieb Jason Dillaman:
> >>> On Fri, Jan 15, 2021 at 10:39 AM Peter Lieven <pl@kamp.de> wrote:
> >>>> Am 15.01.21 um 16:27 schrieb Jason Dillaman:
> >>>>> On Thu, Jan 14, 2021 at 2:59 PM Peter Lieven <pl@kamp.de> wrote:
> >>>>>> Am 14.01.21 um 20:19 schrieb Jason Dillaman:
> >>>>>>> On Sun, Dec 27, 2020 at 11:42 AM Peter Lieven <pl@kamp.de> wrote:
> >>>>>>>> since we implement byte interfaces and librbd supports aio on byte granularity we can lift
> >>>>>>>> the 512 byte alignment.
> >>>>>>>> Signed-off-by: Peter Lieven <pl@kamp.de>
> >>>>>>>> ---
> >>>>>>>> block/rbd.c | 2 --
> >>>>>>>> 1 file changed, 2 deletions(-)
> >>>>>>>> diff --git a/block/rbd.c b/block/rbd.c
> >>>>>>>> index 27b4404adf..8673e8f553 100644
> >>>>>>>> --- a/block/rbd.c
> >>>>>>>> +++ b/block/rbd.c
> >>>>>>>> @@ -223,8 +223,6 @@ done:
> >>>>>>>> static void qemu_rbd_refresh_limits(BlockDriverState *bs, Error **errp)
> >>>>>>>> {
> >>>>>>>>    BDRVRBDState *s = bs->opaque;
> >>>>>>>> -    /* XXX Does RBD support AIO on less than 512-byte alignment? */
> >>>>>>>> -    bs->bl.request_alignment = 512;
> >>>>>>> Just a suggestion, but perhaps improve discard alignment, max discard,
> >>>>>>> optimal alignment (if that's something QEMU handles internally) if not
> >>>>>>> overridden by the user.
> >>>>>> Qemu supports max_discard and discard_alignment. Is there a call to get these limits
> >>>>>> from librbd?
> >>>>>> What do you mean by optimal_alignment? The object size?
> >>>>> krbd does a good job of initializing defaults [1] where optimal and
> >>>>> discard alignment is 64KiB (can actually be 4KiB now), max IO size for
> >>>>> writes, discards, and write-zeroes is the object size * the stripe
> >>>>> count.
> >>>> Okay, I will have a look at it. If qemu issues a write, discard, write_zero greater than
> >>>> obj_size  * stripe count will librbd split it internally or will the request fail?
> >>> librbd will handle it as needed. My goal is really just to get the
> >>> hints down the guest OS.
> >>>> Regarding the alignment it seems that rbd_dev->opts->alloc_size is something that comes from the device
> >>>> configuration and not from rbd? I don't have that information inside the Qemu RBD driver.
> >>> librbd doesn't really have the information either. The 64KiB guess
> >>> that krbd uses was a compromise since that was the default OSD
> >>> allocation size for HDDs since Luminous. Starting with Pacific that
> >>> default is going down to 4KiB.
> >> I will try to adjust these values as far as it is possible and makes sense.
> >> Is there a way to check the minimum supported OSD release in the backend from librbd / librados?
> >
> > It's not a minimum -- RADOS will gladly access 1 byte writes as well.
> > It's really just the optimal (performance and space-wise). Sadly,
> > there is no realistic way to query this data from the backend.
>
> So you would suggest to advertise an optimal transfer length of 64k and max transfer length of obj size * stripe count to the guest unless we have an API in the future to query these limits from the backend?

I'll open a Ceph tracker ticket to expose these via the API in a future release.

> I would leave request alignment at 1 byte as otherwise Qemu will issue RMWs for all write requests that do not align. Everything that comes from a guest OS is very likely 4k aligned anyway.

My goal is definitely not to have QEMU do any extra work for splitting
or aligning IOs. I am really only trying to get hints passed down the
guest via the virtio drivers. If there isn't the plumbing in QEMU for
that yet, disregard.

> Peter
>
>


-- 
Jason



^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 7/7] block/rbd: change request alignment to 1 byte
@ 2021-01-20 23:01 Peter Lieven
  2021-01-21 19:42 ` Jason Dillaman
  0 siblings, 1 reply; 28+ messages in thread
From: Peter Lieven @ 2021-01-20 23:01 UTC (permalink / raw)
  To: dillaman; +Cc: Kevin Wolf, Christian Theune, qemu-devel, qemu-block, Max Reitz


> Am 19.01.2021 um 15:20 schrieb Jason Dillaman <jdillama@redhat.com>:
> 
> On Tue, Jan 19, 2021 at 4:36 AM Peter Lieven <pl@kamp.de> wrote:
>>> Am 18.01.21 um 23:33 schrieb Jason Dillaman:
>>> On Fri, Jan 15, 2021 at 10:39 AM Peter Lieven <pl@kamp.de> wrote:
>>>> Am 15.01.21 um 16:27 schrieb Jason Dillaman:
>>>>> On Thu, Jan 14, 2021 at 2:59 PM Peter Lieven <pl@kamp.de> wrote:
>>>>>> Am 14.01.21 um 20:19 schrieb Jason Dillaman:
>>>>>>> On Sun, Dec 27, 2020 at 11:42 AM Peter Lieven <pl@kamp.de> wrote:
>>>>>>>> since we implement byte interfaces and librbd supports aio on byte granularity we can lift
>>>>>>>> the 512 byte alignment.
>>>>>>>> Signed-off-by: Peter Lieven <pl@kamp.de>
>>>>>>>> ---
>>>>>>>> block/rbd.c | 2 --
>>>>>>>> 1 file changed, 2 deletions(-)
>>>>>>>> diff --git a/block/rbd.c b/block/rbd.c
>>>>>>>> index 27b4404adf..8673e8f553 100644
>>>>>>>> --- a/block/rbd.c
>>>>>>>> +++ b/block/rbd.c
>>>>>>>> @@ -223,8 +223,6 @@ done:
>>>>>>>> static void qemu_rbd_refresh_limits(BlockDriverState *bs, Error **errp)
>>>>>>>> {
>>>>>>>>    BDRVRBDState *s = bs->opaque;
>>>>>>>> -    /* XXX Does RBD support AIO on less than 512-byte alignment? */
>>>>>>>> -    bs->bl.request_alignment = 512;
>>>>>>> Just a suggestion, but perhaps improve discard alignment, max discard,
>>>>>>> optimal alignment (if that's something QEMU handles internally) if not
>>>>>>> overridden by the user.
>>>>>> Qemu supports max_discard and discard_alignment. Is there a call to get these limits
>>>>>> from librbd?
>>>>>> What do you mean by optimal_alignment? The object size?
>>>>> krbd does a good job of initializing defaults [1] where optimal and
>>>>> discard alignment is 64KiB (can actually be 4KiB now), max IO size for
>>>>> writes, discards, and write-zeroes is the object size * the stripe
>>>>> count.
>>>> Okay, I will have a look at it. If qemu issues a write, discard, write_zero greater than
>>>> obj_size  * stripe count will librbd split it internally or will the request fail?
>>> librbd will handle it as needed. My goal is really just to get the
>>> hints down the guest OS.
>>>> Regarding the alignment it seems that rbd_dev->opts->alloc_size is something that comes from the device
>>>> configuration and not from rbd? I don't have that information inside the Qemu RBD driver.
>>> librbd doesn't really have the information either. The 64KiB guess
>>> that krbd uses was a compromise since that was the default OSD
>>> allocation size for HDDs since Luminous. Starting with Pacific that
>>> default is going down to 4KiB.
>> I will try to adjust these values as far as it is possible and makes sense.
>> Is there a way to check the minimum supported OSD release in the backend from librbd / librados?
> 
> It's not a minimum -- RADOS will gladly access 1 byte writes as well.
> It's really just the optimal (performance and space-wise). Sadly,
> there is no realistic way to query this data from the backend.

So you would suggest to advertise an optimal transfer length of 64k and max transfer length of obj size * stripe count to the guest unless we have an API in the future to query these limits from the backend?

I would leave request alignment at 1 byte as otherwise Qemu will issue RMWs for all write requests that do not align. Everything that comes from a guest OS is very likely 4k aligned anyway.

Peter




^ permalink raw reply	[flat|nested] 28+ messages in thread

end of thread, other threads:[~2021-01-21 20:56 UTC | newest]

Thread overview: 28+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2020-12-27 16:42 [PATCH 0/7] block/rbd: migrate to coroutines and add write zeroes support Peter Lieven
2020-12-27 16:42 ` [PATCH 1/7] block/rbd: bump librbd requirement to luminous release Peter Lieven
2020-12-27 16:42 ` [PATCH 2/7] block/rbd: store object_size in BDRVRBDState Peter Lieven
2020-12-27 16:42 ` [PATCH 3/7] block/rbd: use stored image_size in qemu_rbd_getlength Peter Lieven
2021-01-14 19:18   ` Jason Dillaman
2021-01-14 19:32     ` Peter Lieven
2020-12-27 16:42 ` [PATCH 4/7] block/rbd: add bdrv_{attach,detach}_aio_context Peter Lieven
2021-01-14 19:18   ` Jason Dillaman
2021-01-14 19:49     ` Peter Lieven
2020-12-27 16:42 ` [PATCH 5/7] block/rbd: migrate from aio to coroutines Peter Lieven
2021-01-14 19:19   ` Jason Dillaman
2021-01-14 19:39     ` Peter Lieven
2020-12-27 16:42 ` [PATCH 6/7] block/rbd: add write zeroes support Peter Lieven
2021-01-14 19:19   ` Jason Dillaman
2021-01-14 19:41     ` Peter Lieven
2021-01-15 15:09       ` Jason Dillaman
2020-12-27 16:42 ` [PATCH 7/7] block/rbd: change request alignment to 1 byte Peter Lieven
2021-01-14 19:19   ` Jason Dillaman
2021-01-14 19:59     ` Peter Lieven
2021-01-15 15:27       ` Jason Dillaman
2021-01-15 15:39         ` Peter Lieven
2021-01-18 22:33           ` Jason Dillaman
2021-01-19  9:36             ` Peter Lieven
2021-01-19 14:20               ` Jason Dillaman
2021-01-20 23:01 Peter Lieven
2021-01-21 19:42 ` Jason Dillaman
2021-01-21 20:29   ` Peter Lieven
2021-01-21 20:55     ` Jason Dillaman

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.