* [PATCH RFC 0/4] hw/block/nvme: convert ad-hoc aio tracking to aiocbs
@ 2021-03-02 11:10 Klaus Jensen
  2021-03-02 11:10 ` [PATCH RFC 1/4] hw/block/nvme: convert dsm to aiocb Klaus Jensen
                   ` (4 more replies)
  0 siblings, 5 replies; 10+ messages in thread
From: Klaus Jensen @ 2021-03-02 11:10 UTC (permalink / raw)
  To: qemu-devel
  Cc: Kevin Wolf, qemu-block, Klaus Jensen, Max Reitz, Klaus Jensen,
	Stefan Hajnoczi, Keith Busch

From: Klaus Jensen <k.jensen@samsung.com>

The nvme device currently uses an ad-hoc approach to tracking AIO
completion when a request results in multiple issued AIOs.

This series converts those operations (DSM, Copy, Flush and Zone Reset)
to use "proper" QEMU AIOCB processing instead. This requires more code,
but the end result is that we gain proper cancellation support
(something that the device does not do correctly with the existing
ad-hoc approach, and something that would have required more code anyway).

This series makes SQ deletions "just work" and allows Abort to be
implemented such that it actually does something.

Marking this RFC since I've not really done anything with QEMU AIOs and
BHs at this level before, and I'd really like some block-layer eyes on it.
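
For reference, each conversion follows roughly the same shape. The
sketch below is illustrative only (names like NvmeExampleAIOCB and
nvme_example_* are made up, and it assumes the usual qemu/osdep.h and
sysemu/block-backend.h includes); see the individual patches for the
real code:

  typedef struct NvmeExampleAIOCB {
      BlockAIOCB common;    /* handed back to the core via req->aiocb */
      BlockAIOCB *aiocb;    /* currently outstanding "inner" aio, if any */
      QEMUBH *bh;           /* completes the request outside the aio cb */
      int ret;
      int idx, nr;          /* iteration state over ranges/namespaces */
  } NvmeExampleAIOCB;

  static void nvme_example_cancel(BlockAIOCB *acb)
  {
      NvmeExampleAIOCB *iocb = container_of(acb, NvmeExampleAIOCB, common);

      iocb->idx = iocb->nr;             /* break the re-issue loop */
      iocb->ret = -ECANCELED;

      if (iocb->aiocb) {
          blk_aio_cancel_async(iocb->aiocb);
          iocb->aiocb = NULL;
      }
  }

  static const AIOCBInfo nvme_example_aiocb_info = {
      .aiocb_size   = sizeof(NvmeExampleAIOCB),
      .cancel_async = nvme_example_cancel,
  };

  static void nvme_example_bh(void *opaque)
  {
      NvmeExampleAIOCB *iocb = opaque;

      /* common.cb is nvme_misc_cb(), which enqueues the completion */
      iocb->common.cb(iocb->common.opaque, iocb->ret);
      qemu_bh_delete(iocb->bh);
      iocb->bh = NULL;
      qemu_aio_unref(iocb);
  }

  static void nvme_example_aio_cb(void *opaque, int ret)
  {
      NvmeExampleAIOCB *iocb = opaque;

      if (ret < 0) {
          iocb->ret = ret;
          iocb->idx = iocb->nr;
      }

      if (iocb->idx == iocb->nr) {
          /* all inner aios done (or failed/cancelled); finish in a BH */
          iocb->aiocb = NULL;
          qemu_bh_schedule(iocb->bh);
          return;
      }

      /* issue the next inner aio; it re-enters this callback when done */
      iocb->idx++;
      /* iocb->aiocb = blk_aio_pdiscard(..., nvme_example_aio_cb, iocb); */
  }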

Klaus Jensen (4):
  hw/block/nvme: convert dsm to aiocb
  hw/block/nvme: convert copy to aiocb
  hw/block/nvme: convert flush to aiocb
  hw/block/nvme: convert zone reset to aiocb

 hw/block/nvme.c       | 945 ++++++++++++++++++++++++------------------
 hw/block/trace-events |   2 +-
 2 files changed, 537 insertions(+), 410 deletions(-)

-- 
2.30.1




* [PATCH RFC 1/4] hw/block/nvme: convert dsm to aiocb
  2021-03-02 11:10 [PATCH RFC 0/4] hw/block/nvme: convert ad-hoc aio tracking to aiocbs Klaus Jensen
@ 2021-03-02 11:10 ` Klaus Jensen
  2021-03-08 16:37   ` Stefan Hajnoczi
  2021-03-02 11:10 ` [PATCH RFC 2/4] hw/block/nvme: convert copy " Klaus Jensen
                   ` (3 subsequent siblings)
  4 siblings, 1 reply; 10+ messages in thread
From: Klaus Jensen @ 2021-03-02 11:10 UTC (permalink / raw)
  To: qemu-devel
  Cc: Kevin Wolf, qemu-block, Klaus Jensen, Max Reitz, Klaus Jensen,
	Stefan Hajnoczi, Keith Busch

From: Klaus Jensen <k.jensen@samsung.com>

Convert dataset management from ad-hoc multi-aio tracking to standard
QEMU AIOCB processing.

Signed-off-by: Klaus Jensen <k.jensen@samsung.com>
---
 hw/block/nvme.c       | 187 ++++++++++++++++++++++++++++--------------
 hw/block/trace-events |   2 +-
 2 files changed, 125 insertions(+), 64 deletions(-)

diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index 54c87c8f5fe3..8830d72b959f 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -1494,23 +1494,16 @@ static void nvme_aio_flush_cb(void *opaque, int ret)
     nvme_enqueue_req_completion(nvme_cq(req), req);
 }
 
-static void nvme_aio_discard_cb(void *opaque, int ret)
+static void nvme_misc_cb(void *opaque, int ret)
 {
     NvmeRequest *req = opaque;
-    uintptr_t *discards = (uintptr_t *)&req->opaque;
 
-    trace_pci_nvme_aio_discard_cb(nvme_cid(req));
+    trace_pci_nvme_misc_cb(nvme_cid(req));
 
     if (ret) {
         nvme_aio_err(req, ret);
     }
 
-    (*discards)--;
-
-    if (*discards) {
-        return;
-    }
-
     nvme_enqueue_req_completion(nvme_cq(req), req);
 }
 
@@ -1736,78 +1729,146 @@ out:
     nvme_enqueue_req_completion(nvme_cq(req), req);
 }
 
+typedef struct NvmeDSMAIOCB {
+    BlockAIOCB common;
+    BlockAIOCB *aiocb;
+    NvmeRequest *req;
+    QEMUBH *bh;
+    int ret;
+
+    NvmeDsmRange *range;
+    int nr;
+    struct {
+        int64_t offset;
+        size_t  len;
+        int     idx;
+    } curr;
+} NvmeDSMAIOCB;
+
+static void nvme_dsm_cancel(BlockAIOCB *aiocb)
+{
+    NvmeDSMAIOCB *iocb = container_of(aiocb, NvmeDSMAIOCB, common);
+
+    /* break loop */
+    iocb->curr.len = 0;
+    iocb->curr.idx = iocb->nr;
+
+    iocb->ret = -ECANCELED;
+
+    if (iocb->aiocb) {
+        blk_aio_cancel_async(iocb->aiocb);
+        iocb->aiocb = NULL;
+    }
+}
+
+static const AIOCBInfo nvme_dsm_aiocb_info = {
+    .aiocb_size   = sizeof(NvmeDSMAIOCB),
+    .cancel_async = nvme_dsm_cancel,
+};
+
+static void nvme_dsm_bh(void *opaque)
+{
+    NvmeDSMAIOCB *iocb = opaque;
+
+    iocb->common.cb(iocb->common.opaque, iocb->ret);
+
+    qemu_bh_delete(iocb->bh);
+    iocb->bh = NULL;
+    qemu_aio_unref(iocb);
+}
+
+static void nvme_dsm_aio_cb(void *opaque, int ret)
+{
+    NvmeDSMAIOCB *iocb = opaque;
+    NvmeRequest *req = iocb->req;
+    NvmeCtrl *n = nvme_ctrl(req);
+    NvmeNamespace *ns = req->ns;
+    NvmeDsmRange *range;
+    uint64_t slba;
+    uint32_t nlb;
+    size_t bytes;
+
+    if (ret < 0) {
+        iocb->ret = ret;
+        goto done;
+    }
+
+    if (iocb->curr.len == 0) {
+next:
+        if (iocb->curr.idx == iocb->nr) {
+            goto done;
+        }
+
+        range = &iocb->range[iocb->curr.idx++];
+        slba = le64_to_cpu(range->slba);
+        nlb = le32_to_cpu(range->nlb);
+
+        trace_pci_nvme_dsm_deallocate(nvme_cid(req), nvme_nsid(ns), slba,
+                                      nlb);
+
+        if (nlb > n->dmrsl) {
+            trace_pci_nvme_dsm_single_range_limit_exceeded(nlb, n->dmrsl);
+        }
+
+        if (nvme_check_bounds(ns, slba, nlb)) {
+            trace_pci_nvme_err_invalid_lba_range(slba, nlb,
+                                                 ns->id_ns.nsze);
+            goto next;
+        }
+
+        iocb->curr.offset = nvme_l2b(ns, slba);
+        iocb->curr.len = nvme_l2b(ns, nlb);
+    }
+
+    bytes = MIN(BDRV_REQUEST_MAX_BYTES, iocb->curr.len);
+
+    iocb->aiocb = blk_aio_pdiscard(ns->blkconf.blk, iocb->curr.offset, bytes,
+                                   nvme_dsm_aio_cb, iocb);
+
+    iocb->curr.offset += bytes;
+    iocb->curr.len -= bytes;
+
+    return;
+
+done:
+    iocb->aiocb = NULL;
+    if (iocb->bh) {
+        qemu_bh_schedule(iocb->bh);
+    }
+}
+
 static uint16_t nvme_dsm(NvmeCtrl *n, NvmeRequest *req)
 {
     NvmeNamespace *ns = req->ns;
     NvmeDsmCmd *dsm = (NvmeDsmCmd *) &req->cmd;
-
     uint32_t attr = le32_to_cpu(dsm->attributes);
     uint32_t nr = (le32_to_cpu(dsm->nr) & 0xff) + 1;
-
     uint16_t status = NVME_SUCCESS;
 
     trace_pci_nvme_dsm(nvme_cid(req), nvme_nsid(ns), nr, attr);
 
     if (attr & NVME_DSMGMT_AD) {
-        int64_t offset;
-        size_t len;
-        NvmeDsmRange range[nr];
-        uintptr_t *discards = (uintptr_t *)&req->opaque;
+        NvmeDSMAIOCB *iocb = blk_aio_get(&nvme_dsm_aiocb_info, ns->blkconf.blk,
+                                         nvme_misc_cb, req);
 
-        status = nvme_dma(n, (uint8_t *)range, sizeof(range),
+        iocb->req = req;
+        iocb->bh = qemu_bh_new(nvme_dsm_bh, iocb);
+        iocb->ret = 0;
+        iocb->range = g_new(NvmeDsmRange, nr);
+        iocb->nr = nr;
+        iocb->curr.len = 0;
+        iocb->curr.idx = 0;
+
+        status = nvme_dma(n, (uint8_t *)iocb->range, sizeof(NvmeDsmRange) * nr,
                           DMA_DIRECTION_TO_DEVICE, req);
         if (status) {
             return status;
         }
 
-        /*
-         * AIO callbacks may be called immediately, so initialize discards to 1
-         * to make sure the the callback does not complete the request before
-         * all discards have been issued.
-         */
-        *discards = 1;
+        nvme_dsm_aio_cb(iocb, 0);
+        req->aiocb = &iocb->common;
 
-        for (int i = 0; i < nr; i++) {
-            uint64_t slba = le64_to_cpu(range[i].slba);
-            uint32_t nlb = le32_to_cpu(range[i].nlb);
-
-            if (nvme_check_bounds(ns, slba, nlb)) {
-                trace_pci_nvme_err_invalid_lba_range(slba, nlb,
-                                                     ns->id_ns.nsze);
-                continue;
-            }
-
-            trace_pci_nvme_dsm_deallocate(nvme_cid(req), nvme_nsid(ns), slba,
-                                          nlb);
-
-            if (nlb > n->dmrsl) {
-                trace_pci_nvme_dsm_single_range_limit_exceeded(nlb, n->dmrsl);
-            }
-
-            offset = nvme_l2b(ns, slba);
-            len = nvme_l2b(ns, nlb);
-
-            while (len) {
-                size_t bytes = MIN(BDRV_REQUEST_MAX_BYTES, len);
-
-                (*discards)++;
-
-                blk_aio_pdiscard(ns->blkconf.blk, offset, bytes,
-                                 nvme_aio_discard_cb, req);
-
-                offset += bytes;
-                len -= bytes;
-            }
-        }
-
-        /* account for the 1-initialization */
-        (*discards)--;
-
-        if (*discards) {
-            status = NVME_NO_COMPLETE;
-        } else {
-            status = req->status;
-        }
+        return NVME_NO_COMPLETE;
     }
 
     return status;
diff --git a/hw/block/trace-events b/hw/block/trace-events
index 8deeacc8c35c..0e5bddbdd48b 100644
--- a/hw/block/trace-events
+++ b/hw/block/trace-events
@@ -54,7 +54,7 @@ pci_nvme_dsm_deallocate(uint16_t cid, uint32_t nsid, uint64_t slba, uint32_t nlb
 pci_nvme_dsm_single_range_limit_exceeded(uint32_t nlb, uint32_t dmrsl) "nlb %"PRIu32" dmrsl %"PRIu32""
 pci_nvme_compare(uint16_t cid, uint32_t nsid, uint64_t slba, uint32_t nlb) "cid %"PRIu16" nsid %"PRIu32" slba 0x%"PRIx64" nlb %"PRIu32""
 pci_nvme_compare_cb(uint16_t cid) "cid %"PRIu16""
-pci_nvme_aio_discard_cb(uint16_t cid) "cid %"PRIu16""
+pci_nvme_misc_cb(uint16_t cid) "cid %"PRIu16""
 pci_nvme_aio_copy_in_cb(uint16_t cid) "cid %"PRIu16""
 pci_nvme_aio_zone_reset_cb(uint16_t cid, uint64_t zslba) "cid %"PRIu16" zslba 0x%"PRIx64""
 pci_nvme_aio_flush_cb(uint16_t cid, const char *blkname) "cid %"PRIu16" blk '%s'"
-- 
2.30.1




* [PATCH RFC 2/4] hw/block/nvme: convert copy to aiocb
  2021-03-02 11:10 [PATCH RFC 0/4] hw/block/nvme: convert ad-hoc aio tracking to aiocbs Klaus Jensen
  2021-03-02 11:10 ` [PATCH RFC 1/4] hw/block/nvme: convert dsm to aiocb Klaus Jensen
@ 2021-03-02 11:10 ` Klaus Jensen
  2021-03-02 11:10 ` [PATCH RFC 3/4] hw/block/nvme: convert flush " Klaus Jensen
                   ` (2 subsequent siblings)
  4 siblings, 0 replies; 10+ messages in thread
From: Klaus Jensen @ 2021-03-02 11:10 UTC (permalink / raw)
  To: qemu-devel
  Cc: Kevin Wolf, qemu-block, Klaus Jensen, Max Reitz, Klaus Jensen,
	Stefan Hajnoczi, Keith Busch

From: Klaus Jensen <k.jensen@samsung.com>

Convert copy from ad-hoc multi-aio tracking to standard QEMU AIOCB
processing.

Signed-off-by: Klaus Jensen <k.jensen@samsung.com>
---
 hw/block/nvme.c | 382 +++++++++++++++++++++++++-----------------------
 1 file changed, 199 insertions(+), 183 deletions(-)

diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index 8830d72b959f..48a1abe52787 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -1554,136 +1554,6 @@ static void nvme_aio_zone_reset_cb(void *opaque, int ret)
     nvme_enqueue_req_completion(nvme_cq(req), req);
 }
 
-struct nvme_copy_ctx {
-    int copies;
-    uint8_t *bounce;
-    uint32_t nlb;
-};
-
-struct nvme_copy_in_ctx {
-    NvmeRequest *req;
-    QEMUIOVector iov;
-};
-
-static void nvme_copy_cb(void *opaque, int ret)
-{
-    NvmeRequest *req = opaque;
-    NvmeNamespace *ns = req->ns;
-    struct nvme_copy_ctx *ctx = req->opaque;
-
-    trace_pci_nvme_copy_cb(nvme_cid(req));
-
-    if (ns->params.zoned) {
-        NvmeCopyCmd *copy = (NvmeCopyCmd *)&req->cmd;
-        uint64_t sdlba = le64_to_cpu(copy->sdlba);
-        NvmeZone *zone = nvme_get_zone_by_slba(ns, sdlba);
-
-        __nvme_advance_zone_wp(ns, zone, ctx->nlb);
-    }
-
-    if (!ret) {
-        block_acct_done(blk_get_stats(ns->blkconf.blk), &req->acct);
-    } else {
-        block_acct_failed(blk_get_stats(ns->blkconf.blk), &req->acct);
-        nvme_aio_err(req, ret);
-    }
-
-    g_free(ctx->bounce);
-    g_free(ctx);
-
-    nvme_enqueue_req_completion(nvme_cq(req), req);
-}
-
-static void nvme_copy_in_complete(NvmeRequest *req)
-{
-    NvmeNamespace *ns = req->ns;
-    NvmeCopyCmd *copy = (NvmeCopyCmd *)&req->cmd;
-    struct nvme_copy_ctx *ctx = req->opaque;
-    uint64_t sdlba = le64_to_cpu(copy->sdlba);
-    uint16_t status;
-
-    trace_pci_nvme_copy_in_complete(nvme_cid(req));
-
-    block_acct_done(blk_get_stats(ns->blkconf.blk), &req->acct);
-
-    status = nvme_check_bounds(ns, sdlba, ctx->nlb);
-    if (status) {
-        trace_pci_nvme_err_invalid_lba_range(sdlba, ctx->nlb, ns->id_ns.nsze);
-        goto invalid;
-    }
-
-    if (ns->params.zoned) {
-        NvmeZone *zone = nvme_get_zone_by_slba(ns, sdlba);
-
-        status = nvme_check_zone_write(ns, zone, sdlba, ctx->nlb);
-        if (status) {
-            goto invalid;
-        }
-
-        status = nvme_zrm_auto(ns, zone);
-        if (status) {
-            goto invalid;
-        }
-
-        zone->w_ptr += ctx->nlb;
-    }
-
-    qemu_iovec_init(&req->iov, 1);
-    qemu_iovec_add(&req->iov, ctx->bounce, nvme_l2b(ns, ctx->nlb));
-
-    block_acct_start(blk_get_stats(ns->blkconf.blk), &req->acct, 0,
-                     BLOCK_ACCT_WRITE);
-
-    req->aiocb = blk_aio_pwritev(ns->blkconf.blk, nvme_l2b(ns, sdlba),
-                                 &req->iov, 0, nvme_copy_cb, req);
-
-    return;
-
-invalid:
-    req->status = status;
-
-    g_free(ctx->bounce);
-    g_free(ctx);
-
-    nvme_enqueue_req_completion(nvme_cq(req), req);
-}
-
-static void nvme_aio_copy_in_cb(void *opaque, int ret)
-{
-    struct nvme_copy_in_ctx *in_ctx = opaque;
-    NvmeRequest *req = in_ctx->req;
-    NvmeNamespace *ns = req->ns;
-    struct nvme_copy_ctx *ctx = req->opaque;
-
-    qemu_iovec_destroy(&in_ctx->iov);
-    g_free(in_ctx);
-
-    trace_pci_nvme_aio_copy_in_cb(nvme_cid(req));
-
-    if (ret) {
-        nvme_aio_err(req, ret);
-    }
-
-    ctx->copies--;
-
-    if (ctx->copies) {
-        return;
-    }
-
-    if (req->status) {
-        block_acct_failed(blk_get_stats(ns->blkconf.blk), &req->acct);
-
-        g_free(ctx->bounce);
-        g_free(ctx);
-
-        nvme_enqueue_req_completion(nvme_cq(req), req);
-
-        return;
-    }
-
-    nvme_copy_in_complete(req);
-}
-
 struct nvme_compare_ctx {
     QEMUIOVector iov;
     uint8_t *bounce;
@@ -1874,18 +1744,184 @@ static uint16_t nvme_dsm(NvmeCtrl *n, NvmeRequest *req)
     return status;
 }
 
+typedef struct NvmeCopyAIOCB {
+    BlockAIOCB common;
+    BlockAIOCB *aiocb;
+    NvmeRequest *req;
+    QEMUBH *bh;
+    int ret;
+
+    NvmeCopySourceRange *range;
+    int nr;
+    uint32_t nlb;
+    uint8_t *bounce;
+    bool done;
+    struct {
+        int idx;
+        uint8_t *p;
+    } copy_in;
+} NvmeCopyAIOCB;
+
+static void nvme_copy_cancel(BlockAIOCB *aiocb)
+{
+    NvmeCopyAIOCB *iocb = container_of(aiocb, NvmeCopyAIOCB, common);
+
+    iocb->ret = -ECANCELED;
+    iocb->copy_in.idx = iocb->nr;
+    iocb->done = true;
+
+    if (iocb->aiocb) {
+        blk_aio_cancel_async(iocb->aiocb);
+        iocb->aiocb = NULL;
+    }
+}
+
+static const AIOCBInfo nvme_copy_aiocb_info = {
+    .aiocb_size   = sizeof(NvmeCopyAIOCB),
+    .cancel_async = nvme_copy_cancel,
+};
+
+static void nvme_copy_bh(void *opaque)
+{
+    NvmeCopyAIOCB *iocb = opaque;
+    NvmeRequest *req = iocb->req;
+    NvmeNamespace *ns = req->ns;
+
+    if (ns->params.zoned) {
+        NvmeCopyCmd *copy = (NvmeCopyCmd *)&req->cmd;
+        uint64_t sdlba = le64_to_cpu(copy->sdlba);
+        NvmeZone *zone = nvme_get_zone_by_slba(ns, sdlba);
+
+        __nvme_advance_zone_wp(ns, zone, iocb->nlb);
+    }
+
+    if (iocb->ret < 0) {
+        block_acct_failed(blk_get_stats(ns->blkconf.blk), &req->acct);
+    } else {
+        block_acct_done(blk_get_stats(ns->blkconf.blk), &req->acct);
+    }
+
+
+    iocb->common.cb(iocb->common.opaque, iocb->ret);
+
+    qemu_bh_delete(iocb->bh);
+    iocb->bh = NULL;
+
+    g_free(iocb->bounce);
+
+    qemu_aio_unref(iocb);
+}
+
+static void nvme_copy_aio_cb(void *opaque, int ret);
+
+static uint16_t nvme_copy_in_complete(NvmeCopyAIOCB *iocb)
+{
+    NvmeRequest *req = iocb->req;
+    NvmeNamespace *ns = req->ns;
+    NvmeCopyCmd *copy = (NvmeCopyCmd *)&req->cmd;
+    uint64_t sdlba = le64_to_cpu(copy->sdlba);
+    uint16_t status;
+
+    block_acct_done(blk_get_stats(ns->blkconf.blk), &req->acct);
+
+    status = nvme_check_bounds(ns, sdlba, iocb->nlb);
+    if (status) {
+        trace_pci_nvme_err_invalid_lba_range(sdlba, iocb->nlb, ns->id_ns.nsze);
+        return status;
+    }
+
+    if (ns->params.zoned) {
+        NvmeZone *zone = nvme_get_zone_by_slba(ns, sdlba);
+
+        status = nvme_check_zone_write(ns, zone, sdlba, iocb->nlb);
+        if (status) {
+            return status;
+        }
+
+        status = nvme_zrm_auto(ns, zone);
+        if (status) {
+            return status;
+        }
+
+        zone->w_ptr += iocb->nlb;
+    }
+
+    iocb->done = true;
+
+    qemu_iovec_reset(&req->iov);
+    qemu_iovec_add(&req->iov, iocb->bounce, nvme_l2b(ns, iocb->nlb));
+
+    block_acct_start(blk_get_stats(ns->blkconf.blk), &req->acct, 0,
+                     BLOCK_ACCT_WRITE);
+
+    iocb->aiocb = blk_aio_pwritev(ns->blkconf.blk, nvme_l2b(ns, sdlba),
+                                  &req->iov, 0, nvme_copy_aio_cb, iocb);
+
+    return NVME_SUCCESS;
+}
+
+static void nvme_copy_aio_cb(void *opaque, int ret)
+{
+    NvmeCopyAIOCB *iocb = opaque;
+    NvmeRequest *req = iocb->req;
+    NvmeNamespace *ns = req->ns;
+    NvmeCopySourceRange *range;
+    uint64_t slba;
+    uint32_t nlb;
+    size_t bytes;
+    uint16_t status;
+
+    if (ret < 0) {
+        iocb->ret = ret;
+        goto done;
+    }
+
+    if (iocb->copy_in.idx == iocb->nr) {
+        if (iocb->done) {
+            goto done;
+        }
+
+        status = nvme_copy_in_complete(iocb);
+        if (status) {
+            req->status = status;
+            goto done;
+        }
+
+        return;
+    }
+
+    range = &iocb->range[iocb->copy_in.idx++];
+    slba = le64_to_cpu(range->slba);
+    nlb = le32_to_cpu(range->nlb);
+    bytes = nvme_l2b(ns, nlb);
+
+    trace_pci_nvme_copy_source_range(slba, nlb);
+
+    qemu_iovec_reset(&req->iov);
+    qemu_iovec_add(&req->iov, iocb->copy_in.p, bytes);
+    iocb->copy_in.p += bytes;
+
+    iocb->aiocb = blk_aio_preadv(ns->blkconf.blk, nvme_l2b(ns, slba),
+                                 &req->iov, 0, nvme_copy_aio_cb, iocb);
+    return;
+
+done:
+    iocb->aiocb = NULL;
+    if (iocb->bh) {
+        qemu_bh_schedule(iocb->bh);
+    }
+}
+
+
 static uint16_t nvme_copy(NvmeCtrl *n, NvmeRequest *req)
 {
     NvmeNamespace *ns = req->ns;
     NvmeCopyCmd *copy = (NvmeCopyCmd *)&req->cmd;
-    g_autofree NvmeCopySourceRange *range = NULL;
-
+    NvmeCopyAIOCB *iocb = blk_aio_get(&nvme_copy_aiocb_info, ns->blkconf.blk,
+                                      nvme_misc_cb, req);
     uint16_t nr = copy->nr + 1;
     uint8_t format = copy->control[0] & 0xf;
     uint32_t nlb = 0;
-
-    uint8_t *bounce = NULL, *bouncep = NULL;
-    struct nvme_copy_ctx *ctx;
     uint16_t status;
     int i;
 
@@ -1900,39 +1936,46 @@ static uint16_t nvme_copy(NvmeCtrl *n, NvmeRequest *req)
         return NVME_CMD_SIZE_LIMIT | NVME_DNR;
     }
 
-    range = g_new(NvmeCopySourceRange, nr);
+    iocb->req = req;
+    iocb->bh = qemu_bh_new(nvme_copy_bh, iocb);
+    iocb->ret = 0;
+    iocb->done = false;
+    iocb->range = g_new(NvmeCopySourceRange, nr);
+    iocb->nr = nr;
+    iocb->copy_in.idx = 0;
 
-    status = nvme_dma(n, (uint8_t *)range, nr * sizeof(NvmeCopySourceRange),
+    status = nvme_dma(n, (uint8_t *)iocb->range,
+                      sizeof(NvmeCopySourceRange) * nr,
                       DMA_DIRECTION_TO_DEVICE, req);
     if (status) {
-        return status;
+        goto invalid;
     }
 
     for (i = 0; i < nr; i++) {
-        uint64_t slba = le64_to_cpu(range[i].slba);
-        uint32_t _nlb = le16_to_cpu(range[i].nlb) + 1;
+        uint64_t slba = le64_to_cpu(iocb->range[i].slba);
+        uint32_t _nlb = le16_to_cpu(iocb->range[i].nlb) + 1;
 
         if (_nlb > le16_to_cpu(ns->id_ns.mssrl)) {
-            return NVME_CMD_SIZE_LIMIT | NVME_DNR;
+            status = NVME_CMD_SIZE_LIMIT | NVME_DNR;
         }
 
         status = nvme_check_bounds(ns, slba, _nlb);
         if (status) {
             trace_pci_nvme_err_invalid_lba_range(slba, _nlb, ns->id_ns.nsze);
-            return status;
+            goto invalid;
         }
 
         if (NVME_ERR_REC_DULBE(ns->features.err_rec)) {
             status = nvme_check_dulbe(ns, slba, _nlb);
             if (status) {
-                return status;
+                goto invalid;
             }
         }
 
         if (ns->params.zoned) {
             status = nvme_check_zone_read(ns, slba, _nlb);
             if (status) {
-                return status;
+                goto invalid;
             }
         }
 
@@ -1940,53 +1983,26 @@ static uint16_t nvme_copy(NvmeCtrl *n, NvmeRequest *req)
     }
 
     if (nlb > le32_to_cpu(ns->id_ns.mcl)) {
-        return NVME_CMD_SIZE_LIMIT | NVME_DNR;
+        status = NVME_CMD_SIZE_LIMIT | NVME_DNR;
+        goto invalid;
     }
 
-    bounce = bouncep = g_malloc(nvme_l2b(ns, nlb));
+    iocb->nlb = nlb;
+    iocb->bounce = g_malloc(nvme_l2b(ns, nlb));
+    iocb->copy_in.p = iocb->bounce;
+
+    qemu_iovec_init(&req->iov, 1);
 
     block_acct_start(blk_get_stats(ns->blkconf.blk), &req->acct, 0,
                      BLOCK_ACCT_READ);
 
-    ctx = g_new(struct nvme_copy_ctx, 1);
-
-    ctx->bounce = bounce;
-    ctx->nlb = nlb;
-    ctx->copies = 1;
-
-    req->opaque = ctx;
-
-    for (i = 0; i < nr; i++) {
-        uint64_t slba = le64_to_cpu(range[i].slba);
-        uint32_t nlb = le16_to_cpu(range[i].nlb) + 1;
-
-        size_t len = nvme_l2b(ns, nlb);
-        int64_t offset = nvme_l2b(ns, slba);
-
-        trace_pci_nvme_copy_source_range(slba, nlb);
-
-        struct nvme_copy_in_ctx *in_ctx = g_new(struct nvme_copy_in_ctx, 1);
-        in_ctx->req = req;
-
-        qemu_iovec_init(&in_ctx->iov, 1);
-        qemu_iovec_add(&in_ctx->iov, bouncep, len);
-
-        ctx->copies++;
-
-        blk_aio_preadv(ns->blkconf.blk, offset, &in_ctx->iov, 0,
-                       nvme_aio_copy_in_cb, in_ctx);
-
-        bouncep += len;
-    }
-
-    /* account for the 1-initialization */
-    ctx->copies--;
-
-    if (!ctx->copies) {
-        nvme_copy_in_complete(req);
-    }
+    nvme_copy_aio_cb(iocb, 0);
+    req->aiocb = &iocb->common;
 
     return NVME_NO_COMPLETE;
+
+invalid:
+    return status;
 }
 
 static uint16_t nvme_compare(NvmeCtrl *n, NvmeRequest *req)
-- 
2.30.1




* [PATCH RFC 3/4] hw/block/nvme: convert flush to aiocb
  2021-03-02 11:10 [PATCH RFC 0/4] hw/block/nvme: convert ad-hoc aio tracking to aiocbs Klaus Jensen
  2021-03-02 11:10 ` [PATCH RFC 1/4] hw/block/nvme: convert dsm to aiocb Klaus Jensen
  2021-03-02 11:10 ` [PATCH RFC 2/4] hw/block/nvme: convert copy " Klaus Jensen
@ 2021-03-02 11:10 ` Klaus Jensen
  2021-03-02 11:10 ` [PATCH RFC 4/4] hw/block/nvme: convert zone reset " Klaus Jensen
  2021-03-08 16:38 ` [PATCH RFC 0/4] hw/block/nvme: convert ad-hoc aio tracking to aiocbs Stefan Hajnoczi
  4 siblings, 0 replies; 10+ messages in thread
From: Klaus Jensen @ 2021-03-02 11:10 UTC (permalink / raw)
  To: qemu-devel
  Cc: Kevin Wolf, qemu-block, Klaus Jensen, Max Reitz, Klaus Jensen,
	Stefan Hajnoczi, Keith Busch

From: Klaus Jensen <k.jensen@samsung.com>

Convert flush from ad-hoc multi-aio tracking to standard QEMU AIOCB
processing.

Signed-off-by: Klaus Jensen <k.jensen@samsung.com>
---
 hw/block/nvme.c | 143 +++++++++++++++++++++++++-----------------------
 1 file changed, 74 insertions(+), 69 deletions(-)

diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index 48a1abe52787..773b41527c79 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -1459,41 +1459,6 @@ static void nvme_rw_cb(void *opaque, int ret)
     nvme_enqueue_req_completion(nvme_cq(req), req);
 }
 
-struct nvme_aio_flush_ctx {
-    NvmeRequest     *req;
-    NvmeNamespace   *ns;
-    BlockAcctCookie acct;
-};
-
-static void nvme_aio_flush_cb(void *opaque, int ret)
-{
-    struct nvme_aio_flush_ctx *ctx = opaque;
-    NvmeRequest *req = ctx->req;
-    uintptr_t *num_flushes = (uintptr_t *)&req->opaque;
-
-    BlockBackend *blk = ctx->ns->blkconf.blk;
-    BlockAcctCookie *acct = &ctx->acct;
-    BlockAcctStats *stats = blk_get_stats(blk);
-
-    trace_pci_nvme_aio_flush_cb(nvme_cid(req), blk_name(blk));
-
-    if (!ret) {
-        block_acct_done(stats, acct);
-    } else {
-        block_acct_failed(stats, acct);
-        nvme_aio_err(req, ret);
-    }
-
-    (*num_flushes)--;
-    g_free(ctx);
-
-    if (*num_flushes) {
-        return;
-    }
-
-    nvme_enqueue_req_completion(nvme_cq(req), req);
-}
-
 static void nvme_misc_cb(void *opaque, int ret)
 {
     NvmeRequest *req = opaque;
@@ -2055,13 +2020,74 @@ static uint16_t nvme_compare(NvmeCtrl *n, NvmeRequest *req)
     return NVME_NO_COMPLETE;
 }
 
+typedef struct NvmeFlushAIOCB {
+    BlockAIOCB common;
+    BlockAIOCB *aiocb;
+    NvmeRequest *req;
+    QEMUBH *bh;
+    int ret;
+
+    int nsid;
+} NvmeFlushAIOCB;
+
+static void nvme_flush_cancel(BlockAIOCB *acb)
+{
+    NvmeFlushAIOCB *iocb = container_of(acb, NvmeFlushAIOCB, common);
+    NvmeCtrl *n = nvme_ctrl(iocb->req);
+
+    iocb->nsid = n->num_namespaces + 1;
+    iocb->ret = -ECANCELED;
+
+    if (iocb->aiocb) {
+        blk_aio_cancel_async(iocb->aiocb);
+        iocb->aiocb = NULL;
+    }
+}
+
+static const AIOCBInfo nvme_flush_aiocb_info = {
+    .aiocb_size = sizeof(NvmeFlushAIOCB),
+    .cancel_async = nvme_flush_cancel,
+};
+
+static void nvme_flush_bh(void *opaque)
+{
+    NvmeFlushAIOCB *iocb = opaque;
+
+    iocb->common.cb(iocb->common.opaque, iocb->ret);
+    qemu_bh_delete(iocb->bh);
+    iocb->bh = NULL;
+    qemu_aio_unref(iocb);
+}
+
+static void nvme_flush_aio_cb(void *opaque, int ret)
+{
+    NvmeFlushAIOCB *iocb = opaque;
+    NvmeRequest *req = iocb->req;
+    NvmeCtrl *n = nvme_ctrl(req);
+
+    if (ret < 0) {
+        iocb->ret = ret;
+        goto done;
+    }
+
+    while (iocb->nsid <= n->num_namespaces) {
+        NvmeNamespace *ns = nvme_ns(n, iocb->nsid++);
+        if (ns) {
+            iocb->aiocb = blk_aio_flush(ns->blkconf.blk, nvme_flush_aio_cb,
+                                        iocb);
+            return;
+        }
+    }
+
+done:
+    iocb->aiocb = NULL;
+    qemu_bh_schedule(iocb->bh);
+}
+
 static uint16_t nvme_flush(NvmeCtrl *n, NvmeRequest *req)
 {
+    NvmeFlushAIOCB *iocb;
     uint32_t nsid = le32_to_cpu(req->cmd.nsid);
-    uintptr_t *num_flushes = (uintptr_t *)&req->opaque;
-    uint16_t status;
-    struct nvme_aio_flush_ctx *ctx;
-    NvmeNamespace *ns;
 
     trace_pci_nvme_flush(nvme_cid(req), nsid);
 
@@ -2071,42 +2097,21 @@ static uint16_t nvme_flush(NvmeCtrl *n, NvmeRequest *req)
             return NVME_INVALID_FIELD | NVME_DNR;
         }
 
-        block_acct_start(blk_get_stats(req->ns->blkconf.blk), &req->acct, 0,
-                         BLOCK_ACCT_FLUSH);
-        req->aiocb = blk_aio_flush(req->ns->blkconf.blk, nvme_rw_cb, req);
+        req->aiocb = blk_aio_flush(req->ns->blkconf.blk, nvme_misc_cb, req);
         return NVME_NO_COMPLETE;
     }
 
-    /* 1-initialize; see comment in nvme_dsm */
-    *num_flushes = 1;
+    iocb = qemu_aio_get(&nvme_flush_aiocb_info, NULL, nvme_misc_cb, req);
 
-    for (int i = 1; i <= n->num_namespaces; i++) {
-        ns = nvme_ns(n, i);
-        if (!ns) {
-            continue;
-        }
+    iocb->req = req;
+    iocb->bh = qemu_bh_new(nvme_flush_bh, iocb);
+    iocb->ret = 0;
+    iocb->nsid = 1;
 
-        ctx = g_new(struct nvme_aio_flush_ctx, 1);
-        ctx->req = req;
-        ctx->ns = ns;
+    nvme_flush_aio_cb(iocb, 0);
+    req->aiocb = &iocb->common;
 
-        (*num_flushes)++;
-
-        block_acct_start(blk_get_stats(ns->blkconf.blk), &ctx->acct, 0,
-                         BLOCK_ACCT_FLUSH);
-        blk_aio_flush(ns->blkconf.blk, nvme_aio_flush_cb, ctx);
-    }
-
-    /* account for the 1-initialization */
-    (*num_flushes)--;
-
-    if (*num_flushes) {
-        status = NVME_NO_COMPLETE;
-    } else {
-        status = req->status;
-    }
-
-    return status;
+    return NVME_NO_COMPLETE;
 }
 
 static uint16_t nvme_read(NvmeCtrl *n, NvmeRequest *req)
-- 
2.30.1




* [PATCH RFC 4/4] hw/block/nvme: convert zone reset to aiocb
  2021-03-02 11:10 [PATCH RFC 0/4] hw/block/nvme: convert ad-hoc aio tracking to aiocbs Klaus Jensen
                   ` (2 preceding siblings ...)
  2021-03-02 11:10 ` [PATCH RFC 3/4] hw/block/nvme: convert flush " Klaus Jensen
@ 2021-03-02 11:10 ` Klaus Jensen
  2021-03-08 16:38 ` [PATCH RFC 0/4] hw/block/nvme: convert ad-hoc aio tracking to aiocbs Stefan Hajnoczi
  4 siblings, 0 replies; 10+ messages in thread
From: Klaus Jensen @ 2021-03-02 11:10 UTC (permalink / raw)
  To: qemu-devel
  Cc: Kevin Wolf, qemu-block, Klaus Jensen, Max Reitz, Klaus Jensen,
	Stefan Hajnoczi, Keith Busch

From: Klaus Jensen <k.jensen@samsung.com>

Convert zone reset from ad-hoc multi-aio tracking to standard QEMU
AIOCB processing.

Signed-off-by: Klaus Jensen <k.jensen@samsung.com>
---
 hw/block/nvme.c | 233 +++++++++++++++++++++++++++++-------------------
 1 file changed, 139 insertions(+), 94 deletions(-)

diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index 773b41527c79..0d067f186ed7 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -1325,6 +1325,29 @@ static uint16_t nvme_zrm_close(NvmeNamespace *ns, NvmeZone *zone)
     }
 }
 
+static uint16_t nvme_zrm_reset(NvmeNamespace *ns, NvmeZone *zone)
+{
+    switch (nvme_get_zone_state(zone)) {
+    case NVME_ZONE_STATE_EXPLICITLY_OPEN:
+    case NVME_ZONE_STATE_IMPLICITLY_OPEN:
+        nvme_aor_dec_open(ns);
+        /* fallthrough */
+    case NVME_ZONE_STATE_CLOSED:
+        nvme_aor_dec_active(ns);
+        /* fallthrough */
+    case NVME_ZONE_STATE_FULL:
+        zone->w_ptr = zone->d.zslba;
+        zone->d.wp = zone->w_ptr;
+        nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_EMPTY);
+        /* fallthrough */
+    case NVME_ZONE_STATE_EMPTY:
+        return NVME_SUCCESS;
+
+    default:
+        return NVME_ZONE_INVAL_TRANSITION;
+    }
+}
+
 static void nvme_zrm_auto_transition_zone(NvmeNamespace *ns)
 {
     NvmeZone *zone;
@@ -1472,53 +1495,6 @@ static void nvme_misc_cb(void *opaque, int ret)
     nvme_enqueue_req_completion(nvme_cq(req), req);
 }
 
-struct nvme_zone_reset_ctx {
-    NvmeRequest *req;
-    NvmeZone    *zone;
-};
-
-static void nvme_aio_zone_reset_cb(void *opaque, int ret)
-{
-    struct nvme_zone_reset_ctx *ctx = opaque;
-    NvmeRequest *req = ctx->req;
-    NvmeNamespace *ns = req->ns;
-    NvmeZone *zone = ctx->zone;
-    uintptr_t *resets = (uintptr_t *)&req->opaque;
-
-    g_free(ctx);
-
-    trace_pci_nvme_aio_zone_reset_cb(nvme_cid(req), zone->d.zslba);
-
-    if (!ret) {
-        switch (nvme_get_zone_state(zone)) {
-        case NVME_ZONE_STATE_EXPLICITLY_OPEN:
-        case NVME_ZONE_STATE_IMPLICITLY_OPEN:
-            nvme_aor_dec_open(ns);
-            /* fall through */
-        case NVME_ZONE_STATE_CLOSED:
-            nvme_aor_dec_active(ns);
-            /* fall through */
-        case NVME_ZONE_STATE_FULL:
-            zone->w_ptr = zone->d.zslba;
-            zone->d.wp = zone->w_ptr;
-            nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_EMPTY);
-            /* fall through */
-        default:
-            break;
-        }
-    } else {
-        nvme_aio_err(req, ret);
-    }
-
-    (*resets)--;
-
-    if (*resets) {
-        return;
-    }
-
-    nvme_enqueue_req_completion(nvme_cq(req), req);
-}
-
 struct nvme_compare_ctx {
     QEMUIOVector iov;
     uint8_t *bounce;
@@ -2336,41 +2312,6 @@ static uint16_t nvme_finish_zone(NvmeNamespace *ns, NvmeZone *zone,
     return nvme_zrm_finish(ns, zone);
 }
 
-static uint16_t nvme_reset_zone(NvmeNamespace *ns, NvmeZone *zone,
-                                NvmeZoneState state, NvmeRequest *req)
-{
-    uintptr_t *resets = (uintptr_t *)&req->opaque;
-    struct nvme_zone_reset_ctx *ctx;
-
-    switch (state) {
-    case NVME_ZONE_STATE_EMPTY:
-        return NVME_SUCCESS;
-    case NVME_ZONE_STATE_EXPLICITLY_OPEN:
-    case NVME_ZONE_STATE_IMPLICITLY_OPEN:
-    case NVME_ZONE_STATE_CLOSED:
-    case NVME_ZONE_STATE_FULL:
-        break;
-    default:
-        return NVME_ZONE_INVAL_TRANSITION;
-    }
-
-    /*
-     * The zone reset aio callback needs to know the zone that is being reset
-     * in order to transition the zone on completion.
-     */
-    ctx = g_new(struct nvme_zone_reset_ctx, 1);
-    ctx->req = req;
-    ctx->zone = zone;
-
-    (*resets)++;
-
-    blk_aio_pwrite_zeroes(ns->blkconf.blk, nvme_l2b(ns, zone->d.zslba),
-                          nvme_l2b(ns, ns->zone_size), BDRV_REQ_MAY_UNMAP,
-                          nvme_aio_zone_reset_cb, ctx);
-
-    return NVME_NO_COMPLETE;
-}
-
 static uint16_t nvme_offline_zone(NvmeNamespace *ns, NvmeZone *zone,
                                   NvmeZoneState state, NvmeRequest *req)
 {
@@ -2499,12 +2440,115 @@ out:
     return status;
 }
 
+typedef struct NvmeZoneResetAIOCB {
+    BlockAIOCB common;
+    BlockAIOCB *aiocb;
+    NvmeRequest *req;
+    QEMUBH *bh;
+    int ret;
+
+    bool all;
+
+    struct {
+        int idx;
+        NvmeZone *zone;
+    } curr;
+} NvmeZoneResetAIOCB;
+
+static void nvme_zone_reset_cancel(BlockAIOCB *aiocb)
+{
+    NvmeZoneResetAIOCB *iocb = container_of(aiocb, NvmeZoneResetAIOCB, common);
+    NvmeRequest *req = iocb->req;
+    NvmeNamespace *ns = req->ns;
+
+    iocb->curr.idx = ns->num_zones;
+
+    iocb->ret = -ECANCELED;
+
+    if (iocb->aiocb) {
+        blk_aio_cancel_async(iocb->aiocb);
+        iocb->aiocb = NULL;
+    }
+}
+
+static const AIOCBInfo nvme_zone_reset_aiocb_info = {
+    .aiocb_size = sizeof(NvmeZoneResetAIOCB),
+    .cancel_async = nvme_zone_reset_cancel,
+};
+
+static void nvme_zone_reset_bh(void *opaque)
+{
+    NvmeZoneResetAIOCB *iocb = opaque;
+
+    iocb->common.cb(iocb->common.opaque, iocb->ret);
+
+    qemu_bh_delete(iocb->bh);
+    iocb->bh = NULL;
+    qemu_aio_unref(iocb);
+}
+
+static void nvme_zone_reset_aio_cb(void *opaque, int ret)
+{
+    NvmeZoneResetAIOCB *iocb = opaque;
+    NvmeRequest *req = iocb->req;
+    NvmeNamespace *ns = req->ns;
+
+    if (ret < 0) {
+        iocb->ret = ret;
+        goto done;
+    }
+
+    if (iocb->curr.zone) {
+        nvme_zrm_reset(ns, iocb->curr.zone);
+
+        if (!iocb->all) {
+            goto done;
+        }
+    }
+
+    while (iocb->curr.idx < ns->num_zones) {
+        NvmeZone *zone = &ns->zone_array[iocb->curr.idx++];
+
+        switch (nvme_get_zone_state(zone)) {
+        case NVME_ZONE_STATE_EMPTY:
+            if (!iocb->all) {
+                goto done;
+            }
+
+            continue;
+
+        case NVME_ZONE_STATE_EXPLICITLY_OPEN:
+        case NVME_ZONE_STATE_IMPLICITLY_OPEN:
+        case NVME_ZONE_STATE_CLOSED:
+        case NVME_ZONE_STATE_FULL:
+            iocb->curr.zone = zone;
+            break;
+
+        default:
+            continue;
+        }
+
+        iocb->aiocb = blk_aio_pwrite_zeroes(ns->blkconf.blk,
+                                            nvme_l2b(ns, zone->d.zslba),
+                                            nvme_l2b(ns, ns->zone_size),
+                                            BDRV_REQ_MAY_UNMAP,
+                                            nvme_zone_reset_aio_cb, iocb);
+        return;
+    }
+
+done:
+    iocb->aiocb = NULL;
+    if (iocb->bh) {
+        qemu_bh_schedule(iocb->bh);
+    }
+}
+
 static uint16_t nvme_zone_mgmt_send(NvmeCtrl *n, NvmeRequest *req)
 {
     NvmeCmd *cmd = (NvmeCmd *)&req->cmd;
     NvmeNamespace *ns = req->ns;
     NvmeZone *zone;
-    uintptr_t *resets;
+    NvmeZoneResetAIOCB *iocb;
     uint8_t *zd_ext;
     uint32_t dw13 = le32_to_cpu(cmd->cdw13);
     uint64_t slba = 0;
@@ -2515,7 +2559,7 @@ static uint16_t nvme_zone_mgmt_send(NvmeCtrl *n, NvmeRequest *req)
     enum NvmeZoneProcessingMask proc_mask = NVME_PROC_CURRENT_ZONE;
 
     action = dw13 & 0xff;
-    all = dw13 & 0x100;
+    all = !!(dw13 & 0x100);
 
     req->status = NVME_SUCCESS;
 
@@ -2559,21 +2603,22 @@ static uint16_t nvme_zone_mgmt_send(NvmeCtrl *n, NvmeRequest *req)
         break;
 
     case NVME_ZONE_ACTION_RESET:
-        resets = (uintptr_t *)&req->opaque;
-
-        if (all) {
-            proc_mask = NVME_PROC_OPENED_ZONES | NVME_PROC_CLOSED_ZONES |
-                NVME_PROC_FULL_ZONES;
-        }
         trace_pci_nvme_reset_zone(slba, zone_idx, all);
 
-        *resets = 1;
+        iocb = blk_aio_get(&nvme_zone_reset_aiocb_info, ns->blkconf.blk,
+                           nvme_misc_cb, req);
 
-        status = nvme_do_zone_op(ns, zone, proc_mask, nvme_reset_zone, req);
+        iocb->req = req;
+        iocb->bh = qemu_bh_new(nvme_zone_reset_bh, iocb);
+        iocb->ret = 0;
+        iocb->all = all;
+        iocb->curr.idx = zone_idx;
+        iocb->curr.zone = NULL;
 
-        (*resets)--;
+        nvme_zone_reset_aio_cb(iocb, 0);
+        req->aiocb = &iocb->common;
 
-        return *resets ? NVME_NO_COMPLETE : req->status;
+        return NVME_NO_COMPLETE;
 
     case NVME_ZONE_ACTION_OFFLINE:
         if (all) {
-- 
2.30.1




* Re: [PATCH RFC 1/4] hw/block/nvme: convert dsm to aiocb
  2021-03-02 11:10 ` [PATCH RFC 1/4] hw/block/nvme: convert dsm to aiocb Klaus Jensen
@ 2021-03-08 16:37   ` Stefan Hajnoczi
  2021-03-08 18:05     ` Klaus Jensen
  0 siblings, 1 reply; 10+ messages in thread
From: Stefan Hajnoczi @ 2021-03-08 16:37 UTC (permalink / raw)
  To: Klaus Jensen
  Cc: Kevin Wolf, qemu-block, Klaus Jensen, qemu-devel, Max Reitz, Keith Busch

[-- Attachment #1: Type: text/plain, Size: 2414 bytes --]

On Tue, Mar 02, 2021 at 12:10:37PM +0100, Klaus Jensen wrote:
> +static void nvme_dsm_cancel(BlockAIOCB *aiocb)
> +{
> +    NvmeDSMAIOCB *iocb = container_of(aiocb, NvmeDSMAIOCB, common);
> +
> +    /* break loop */
> +    iocb->curr.len = 0;
> +    iocb->curr.idx = iocb->nr;
> +
> +    iocb->ret = -ECANCELED;
> +
> +    if (iocb->aiocb) {
> +        blk_aio_cancel_async(iocb->aiocb);
> +        iocb->aiocb = NULL;
> +    }
> +}

Is the case where iocb->aiocb == NULL just in case nvme_dsm_cancel() is
called after the last discard has completed but before the BH runs? I
want to make sure there are no other cases because nothing would call
iocb->common.cb().

>  static uint16_t nvme_dsm(NvmeCtrl *n, NvmeRequest *req)
>  {
>      NvmeNamespace *ns = req->ns;
>      NvmeDsmCmd *dsm = (NvmeDsmCmd *) &req->cmd;
> -
>      uint32_t attr = le32_to_cpu(dsm->attributes);
>      uint32_t nr = (le32_to_cpu(dsm->nr) & 0xff) + 1;
> -
>      uint16_t status = NVME_SUCCESS;
>  
>      trace_pci_nvme_dsm(nvme_cid(req), nvme_nsid(ns), nr, attr);
>  
>      if (attr & NVME_DSMGMT_AD) {
> -        int64_t offset;
> -        size_t len;
> -        NvmeDsmRange range[nr];
> -        uintptr_t *discards = (uintptr_t *)&req->opaque;
> +        NvmeDSMAIOCB *iocb = blk_aio_get(&nvme_dsm_aiocb_info, ns->blkconf.blk,
> +                                         nvme_misc_cb, req);
>  
> -        status = nvme_dma(n, (uint8_t *)range, sizeof(range),
> +        iocb->req = req;
> +        iocb->bh = qemu_bh_new(nvme_dsm_bh, iocb);
> +        iocb->ret = 0;
> +        iocb->range = g_new(NvmeDsmRange, nr);
> +        iocb->nr = nr;
> +        iocb->curr.len = 0;
> +        iocb->curr.idx = 0;
> +
> +        status = nvme_dma(n, (uint8_t *)iocb->range, sizeof(NvmeDsmRange) * nr,
>                            DMA_DIRECTION_TO_DEVICE, req);
>          if (status) {
>              return status;
>          }
>  
> -        /*
> -         * AIO callbacks may be called immediately, so initialize discards to 1
> -         * to make sure the the callback does not complete the request before
> -         * all discards have been issued.
> -         */
> -        *discards = 1;
> +        nvme_dsm_aio_cb(iocb, 0);
> +        req->aiocb = &iocb->common;

Want to move this line up one just in case something in
nvme_dsm_aio_cb() accesses req->aiocb?
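
I.e., roughly this ordering (untested sketch, no other changes assumed):

          req->aiocb = &iocb->common;
          nvme_dsm_aio_cb(iocb, 0);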

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 488 bytes --]


* Re: [PATCH RFC 0/4] hw/block/nvme: convert ad-hoc aio tracking to aiocbs
  2021-03-02 11:10 [PATCH RFC 0/4] hw/block/nvme: convert ad-hoc aio tracking to aiocbs Klaus Jensen
                   ` (3 preceding siblings ...)
  2021-03-02 11:10 ` [PATCH RFC 4/4] hw/block/nvme: convert zone reset " Klaus Jensen
@ 2021-03-08 16:38 ` Stefan Hajnoczi
  4 siblings, 0 replies; 10+ messages in thread
From: Stefan Hajnoczi @ 2021-03-08 16:38 UTC (permalink / raw)
  To: Klaus Jensen
  Cc: Kevin Wolf, qemu-block, Klaus Jensen, qemu-devel, Max Reitz, Keith Busch

[-- Attachment #1: Type: text/plain, Size: 286 bytes --]

On Tue, Mar 02, 2021 at 12:10:36PM +0100, Klaus Jensen wrote:
> Marking RFC, since I've not really done anything with QEMU AIOs and BHs
> on this level before, so I'd really like some block-layer eyes on it.

I took a brief look and it seems like a nice conversion of the code.

Stefan

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 488 bytes --]


* Re: [PATCH RFC 1/4] hw/block/nvme: convert dsm to aiocb
  2021-03-08 16:37   ` Stefan Hajnoczi
@ 2021-03-08 18:05     ` Klaus Jensen
  2021-03-09 16:03       ` Stefan Hajnoczi
  0 siblings, 1 reply; 10+ messages in thread
From: Klaus Jensen @ 2021-03-08 18:05 UTC (permalink / raw)
  To: Stefan Hajnoczi
  Cc: Kevin Wolf, qemu-block, Klaus Jensen, qemu-devel, Max Reitz, Keith Busch

[-- Attachment #1: Type: text/plain, Size: 2737 bytes --]

On Mar  8 16:37, Stefan Hajnoczi wrote:
> On Tue, Mar 02, 2021 at 12:10:37PM +0100, Klaus Jensen wrote:
> > +static void nvme_dsm_cancel(BlockAIOCB *aiocb)
> > +{
> > +    NvmeDSMAIOCB *iocb = container_of(aiocb, NvmeDSMAIOCB, common);
> > +
> > +    /* break loop */
> > +    iocb->curr.len = 0;
> > +    iocb->curr.idx = iocb->nr;
> > +
> > +    iocb->ret = -ECANCELED;
> > +
> > +    if (iocb->aiocb) {
> > +        blk_aio_cancel_async(iocb->aiocb);
> > +        iocb->aiocb = NULL;
> > +    }
> > +}
> 
> Is the case where iocb->aiocb == NULL just in case nvme_dsm_cancel() is
> called after the last discard has completed but before the BH runs? I
> want to make sure there are no other cases because nothing would call
> iocb->common.cb().
> 

Yes - that case *can* happen, right?

I modeled this after the approach in the ide trim code (hw/ide/core.c).

> >  static uint16_t nvme_dsm(NvmeCtrl *n, NvmeRequest *req)
> >  {
> >      NvmeNamespace *ns = req->ns;
> >      NvmeDsmCmd *dsm = (NvmeDsmCmd *) &req->cmd;
> > -
> >      uint32_t attr = le32_to_cpu(dsm->attributes);
> >      uint32_t nr = (le32_to_cpu(dsm->nr) & 0xff) + 1;
> > -
> >      uint16_t status = NVME_SUCCESS;
> >  
> >      trace_pci_nvme_dsm(nvme_cid(req), nvme_nsid(ns), nr, attr);
> >  
> >      if (attr & NVME_DSMGMT_AD) {
> > -        int64_t offset;
> > -        size_t len;
> > -        NvmeDsmRange range[nr];
> > -        uintptr_t *discards = (uintptr_t *)&req->opaque;
> > +        NvmeDSMAIOCB *iocb = blk_aio_get(&nvme_dsm_aiocb_info, ns->blkconf.blk,
> > +                                         nvme_misc_cb, req);
> >  
> > -        status = nvme_dma(n, (uint8_t *)range, sizeof(range),
> > +        iocb->req = req;
> > +        iocb->bh = qemu_bh_new(nvme_dsm_bh, iocb);
> > +        iocb->ret = 0;
> > +        iocb->range = g_new(NvmeDsmRange, nr);
> > +        iocb->nr = nr;
> > +        iocb->curr.len = 0;
> > +        iocb->curr.idx = 0;
> > +
> > +        status = nvme_dma(n, (uint8_t *)iocb->range, sizeof(NvmeDsmRange) * nr,
> >                            DMA_DIRECTION_TO_DEVICE, req);
> >          if (status) {
> >              return status;
> >          }
> >  
> > -        /*
> > -         * AIO callbacks may be called immediately, so initialize discards to 1
> > -         * to make sure the the callback does not complete the request before
> > -         * all discards have been issued.
> > -         */
> > -        *discards = 1;
> > +        nvme_dsm_aio_cb(iocb, 0);
> > +        req->aiocb = &iocb->common;
> 
> Want to move this line up one just in case something in
> nvme_dsm_aio_cb() accesses req->aiocb?

Sounds reasonable! Thanks!

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 488 bytes --]


* Re: [PATCH RFC 1/4] hw/block/nvme: convert dsm to aiocb
  2021-03-08 18:05     ` Klaus Jensen
@ 2021-03-09 16:03       ` Stefan Hajnoczi
  2021-03-09 18:27         ` Klaus Jensen
  0 siblings, 1 reply; 10+ messages in thread
From: Stefan Hajnoczi @ 2021-03-09 16:03 UTC (permalink / raw)
  To: Klaus Jensen
  Cc: Kevin Wolf, qemu-block, Klaus Jensen, qemu-devel, Max Reitz, Keith Busch

[-- Attachment #1: Type: text/plain, Size: 2301 bytes --]

On Mon, Mar 08, 2021 at 07:05:40PM +0100, Klaus Jensen wrote:
> On Mar  8 16:37, Stefan Hajnoczi wrote:
> > On Tue, Mar 02, 2021 at 12:10:37PM +0100, Klaus Jensen wrote:
> > > +static void nvme_dsm_cancel(BlockAIOCB *aiocb)
> > > +{
> > > +    NvmeDSMAIOCB *iocb = container_of(aiocb, NvmeDSMAIOCB, common);
> > > +
> > > +    /* break loop */
> > > +    iocb->curr.len = 0;
> > > +    iocb->curr.idx = iocb->nr;
> > > +
> > > +    iocb->ret = -ECANCELED;
> > > +
> > > +    if (iocb->aiocb) {
> > > +        blk_aio_cancel_async(iocb->aiocb);
> > > +        iocb->aiocb = NULL;
> > > +    }
> > > +}
> > 
> > Is the case where iocb->aiocb == NULL just in case nvme_dsm_cancel() is
> > called after the last discard has completed but before the BH runs? I
> > want to make sure there are no other cases because nothing would call
> > iocb->common.cb().
> > 
> 
> Yes - that case *can* happen, right?
> 
> I modeled this after the approach in the ide trim code (hw/ide/core.c).

Yes, nvme_dsm_bh() may run after other event loop activity. Therefore we
have to take the iocb->aiocb == NULL case into account because some
event loop activity could call nvme_dsm_cancel() before the BH runs.

Another (wild?) possibility is that nvme_dsm_cancel() is called twice.
That's okay, nvme_dsm_cancel() supports that nicely.

But I wasn't sure if there are any other cases where iocb->aiocb can be
NULL? It could be nice to include an assertion or comment to clarify
this. For example:

  if (iocb->aiocb) {
      blk_aio_cancel_async(iocb->aiocb);
      iocb->aiocb = NULL;
  } else {
      /*
       * We only get here if nvme_dsm_cancel() was already called or
       * nvme_dsm_bh() is about to run.
       */
      assert(iocb->curr.idx == iocb->nr);
  }

  /* break loop */
  iocb->curr.len = 0;
  iocb->curr.idx = iocb->nr;

  iocb->ret = -ECANCELED;

(I'm not sure if my assert is correct, but hopefully this explains what
I mean.)

The reason why this assertion is important is because nvme_dsm_cancel()
does not support other iocb->aiocb = NULL cases. The cancelled request
could hang if nothing completes it. The assertion will complain loudly
if this ever happens (may not now, but if someone changes the code in
the future).

Stefan

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 488 bytes --]


* Re: [PATCH RFC 1/4] hw/block/nvme: convert dsm to aiocb
  2021-03-09 16:03       ` Stefan Hajnoczi
@ 2021-03-09 18:27         ` Klaus Jensen
  0 siblings, 0 replies; 10+ messages in thread
From: Klaus Jensen @ 2021-03-09 18:27 UTC (permalink / raw)
  To: Stefan Hajnoczi
  Cc: Kevin Wolf, qemu-block, Klaus Jensen, qemu-devel, Max Reitz, Keith Busch

[-- Attachment #1: Type: text/plain, Size: 2792 bytes --]

On Mar  9 16:03, Stefan Hajnoczi wrote:
> On Mon, Mar 08, 2021 at 07:05:40PM +0100, Klaus Jensen wrote:
> > On Mar  8 16:37, Stefan Hajnoczi wrote:
> > > On Tue, Mar 02, 2021 at 12:10:37PM +0100, Klaus Jensen wrote:
> > > > +static void nvme_dsm_cancel(BlockAIOCB *aiocb)
> > > > +{
> > > > +    NvmeDSMAIOCB *iocb = container_of(aiocb, NvmeDSMAIOCB, common);
> > > > +
> > > > +    /* break loop */
> > > > +    iocb->curr.len = 0;
> > > > +    iocb->curr.idx = iocb->nr;
> > > > +
> > > > +    iocb->ret = -ECANCELED;
> > > > +
> > > > +    if (iocb->aiocb) {
> > > > +        blk_aio_cancel_async(iocb->aiocb);
> > > > +        iocb->aiocb = NULL;
> > > > +    }
> > > > +}
> > > 
> > > Is the case where iocb->aiocb == NULL just in case nvme_dsm_cancel() is
> > > called after the last discard has completed but before the BH runs? I
> > > want to make sure there are no other cases because nothing would call
> > > iocb->common.cb().
> > > 
> > 
> > Yes - that case *can* happen, right?
> > 
> > I modeled this after the approach in the ide trim code (hw/ide/core.c).
> 
> Yes, nvme_dsm_bh() may run after other event loop activity. Therefore we
> have to take the iocb->aiocb == NULL case into account because some
> event loop activity could call nvme_dsm_cancel() before the BH runs.
> 
> Another (wild?) possibility is that nvme_dsm_cancel() is called twice.
> That's okay, nvme_dsm_cancel() supports that nicely.
> 
> But I wasn't sure if there are any other cases where iocb->aiocb can be
> NULL? It could be nice to include an assertion or comment to clarify
> this. For example:
> 
>   if (iocb->aiocb) {
>       blk_aio_cancel_async(iocb->aiocb);
>       iocb->aiocb = NULL;
>   } else {
>       /*
>        * We only get here if nvme_dsm_cancel() was already called or
>        * nvme_dsm_bh() is about to run.
>        */
>       assert(iocb->curr.idx == iocb->nr);
>   }
> 
>   /* break loop */
>   iocb->curr.len = 0;
>   iocb->curr.idx = iocb->nr;
> 
>   iocb->ret = -ECANCELED;
> 
> (I'm not sure if my assert is correct, but hopefully this explains what
> I mean.)
> 

Understood! I'll fix that up.

> The reason why this assertion is important is because nvme_dsm_cancel()
> does not support other iocb->aiocb = NULL cases. The cancelled request
> could hang if nothing completes it. The assertion will complain loudly
> if this ever happens (may not now, but if someone changes the code in
> the future).
> 

Yeah, I understand that there is a risk of dead-lock due to "weird"
scheduling if one is not careful.

Thanks Stefan, these kinds of comments are super helpful when trying to
wrap one's head around this!

I'll give it another spin and post a v2 taking your comments into
account :)

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 488 bytes --]


Thread overview: 10+ messages
2021-03-02 11:10 [PATCH RFC 0/4] hw/block/nvme: convert ad-hoc aio tracking to aiocbs Klaus Jensen
2021-03-02 11:10 ` [PATCH RFC 1/4] hw/block/nvme: convert dsm to aiocb Klaus Jensen
2021-03-08 16:37   ` Stefan Hajnoczi
2021-03-08 18:05     ` Klaus Jensen
2021-03-09 16:03       ` Stefan Hajnoczi
2021-03-09 18:27         ` Klaus Jensen
2021-03-02 11:10 ` [PATCH RFC 2/4] hw/block/nvme: convert copy " Klaus Jensen
2021-03-02 11:10 ` [PATCH RFC 3/4] hw/block/nvme: convert flush " Klaus Jensen
2021-03-02 11:10 ` [PATCH RFC 4/4] hw/block/nvme: convert zone reset " Klaus Jensen
2021-03-08 16:38 ` [PATCH RFC 0/4] hw/block/nvme: convert ad-hoc aio tracking to aiocbs Stefan Hajnoczi
