All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH v2 00/11] hw/nvme: reimplement all multi-aio commands with custom aiocbs
@ 2021-06-17 19:06 Klaus Jensen
  2021-06-17 19:06 ` [PATCH v2 01/11] hw/nvme: reimplement flush to allow cancellation Klaus Jensen
                   ` (12 more replies)
  0 siblings, 13 replies; 16+ messages in thread
From: Klaus Jensen @ 2021-06-17 19:06 UTC (permalink / raw)
  To: qemu-devel
  Cc: Fam Zheng, Kevin Wolf, Vladimir Sementsov-Ogievskiy, qemu-block,
	Klaus Jensen, Max Reitz, Keith Busch, Stefan Hajnoczi,
	Klaus Jensen

From: Klaus Jensen <k.jensen@samsung.com>

This series reimplements flush, dsm, copy, zone reset and format nvm to
allow cancellation. I posted an RFC back in March ("hw/block/nvme:
convert ad-hoc aio tracking to aiocb") and I've applied some feedback
from Stefan and reimplemented the remaining commands.

The basic idea is to define custom AIOCBs for these commands. The custom
AIOCB takes care of issuing all the "nested" AIOs one by one instead of
blindly sending them off simultaneously without tracking the returned
aiocbs.

v2:
  - dropped RFC
  - fixed flush cancel from being unintentially a noop (Vladimir)

Klaus Jensen (11):
  hw/nvme: reimplement flush to allow cancellation
  hw/nvme: add nvme_block_status_all helper
  hw/nvme: reimplement dsm to allow cancellation
  hw/nvme: save reftag when generating pi
  hw/nvme: remove assert from nvme_get_zone_by_slba
  hw/nvme: use prinfo directly in nvme_check_prinfo and nvme_dif_check
  hw/nvme: add dw0/1 to the req completion trace event
  hw/nvme: reimplement the copy command to allow aio cancellation
  hw/nvme: reimplement zone reset to allow cancellation
  hw/nvme: reimplement format nvm to allow cancellation
  Partially revert "hw/block/nvme: drain namespaces on sq deletion"

 hw/nvme/nvme.h       |   10 +-
 include/block/nvme.h |    8 +
 hw/nvme/ctrl.c       | 1883 ++++++++++++++++++++++++------------------
 hw/nvme/dif.c        |   64 +-
 hw/nvme/trace-events |   21 +-
 5 files changed, 1124 insertions(+), 862 deletions(-)

-- 
2.32.0



^ permalink raw reply	[flat|nested] 16+ messages in thread

* [PATCH v2 01/11] hw/nvme: reimplement flush to allow cancellation
  2021-06-17 19:06 [PATCH v2 00/11] hw/nvme: reimplement all multi-aio commands with custom aiocbs Klaus Jensen
@ 2021-06-17 19:06 ` Klaus Jensen
  2021-06-17 19:06 ` [PATCH v2 02/11] hw/nvme: add nvme_block_status_all helper Klaus Jensen
                   ` (11 subsequent siblings)
  12 siblings, 0 replies; 16+ messages in thread
From: Klaus Jensen @ 2021-06-17 19:06 UTC (permalink / raw)
  To: qemu-devel
  Cc: Fam Zheng, Kevin Wolf, Vladimir Sementsov-Ogievskiy, qemu-block,
	Klaus Jensen, Max Reitz, Keith Busch, Stefan Hajnoczi,
	Klaus Jensen

From: Klaus Jensen <k.jensen@samsung.com>

Prior to this patch, a broadcast flush would result in submitting
multiple "fire and forget" aios (no reference saved to the aiocbs
returned from the blk_aio_flush calls).

Fix this by issuing the flushes one after another.

Signed-off-by: Klaus Jensen <k.jensen@samsung.com>
---
 hw/nvme/nvme.h       |   2 +
 hw/nvme/ctrl.c       | 206 ++++++++++++++++++++++++++-----------------
 hw/nvme/trace-events |   6 +-
 3 files changed, 130 insertions(+), 84 deletions(-)

diff --git a/hw/nvme/nvme.h b/hw/nvme/nvme.h
index 93a7e0e5380e..19376570c91e 100644
--- a/hw/nvme/nvme.h
+++ b/hw/nvme/nvme.h
@@ -27,6 +27,8 @@
 #define NVME_MAX_CONTROLLERS 32
 #define NVME_MAX_NAMESPACES  256
 
+QEMU_BUILD_BUG_ON(NVME_MAX_NAMESPACES > NVME_NSID_BROADCAST - 1);
+
 typedef struct NvmeCtrl NvmeCtrl;
 typedef struct NvmeNamespace NvmeNamespace;
 
diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c
index 7dea64b72e6a..9f81ab99d484 100644
--- a/hw/nvme/ctrl.c
+++ b/hw/nvme/ctrl.c
@@ -1788,22 +1788,19 @@ static inline bool nvme_is_write(NvmeRequest *req)
            rw->opcode == NVME_CMD_WRITE_ZEROES;
 }
 
+static AioContext *nvme_get_aio_context(BlockAIOCB *acb)
+{
+    return qemu_get_aio_context();
+}
+
 static void nvme_misc_cb(void *opaque, int ret)
 {
     NvmeRequest *req = opaque;
-    NvmeNamespace *ns = req->ns;
 
-    BlockBackend *blk = ns->blkconf.blk;
-    BlockAcctCookie *acct = &req->acct;
-    BlockAcctStats *stats = blk_get_stats(blk);
-
-    trace_pci_nvme_misc_cb(nvme_cid(req), blk_name(blk));
+    trace_pci_nvme_misc_cb(nvme_cid(req));
 
     if (ret) {
-        block_acct_failed(stats, acct);
         nvme_aio_err(req, ret);
-    } else {
-        block_acct_done(stats, acct);
     }
 
     nvme_enqueue_req_completion(nvme_cq(req), req);
@@ -1919,41 +1916,6 @@ static void nvme_aio_format_cb(void *opaque, int ret)
     nvme_enqueue_req_completion(nvme_cq(req), req);
 }
 
-struct nvme_aio_flush_ctx {
-    NvmeRequest     *req;
-    NvmeNamespace   *ns;
-    BlockAcctCookie acct;
-};
-
-static void nvme_aio_flush_cb(void *opaque, int ret)
-{
-    struct nvme_aio_flush_ctx *ctx = opaque;
-    NvmeRequest *req = ctx->req;
-    uintptr_t *num_flushes = (uintptr_t *)&req->opaque;
-
-    BlockBackend *blk = ctx->ns->blkconf.blk;
-    BlockAcctCookie *acct = &ctx->acct;
-    BlockAcctStats *stats = blk_get_stats(blk);
-
-    trace_pci_nvme_aio_flush_cb(nvme_cid(req), blk_name(blk));
-
-    if (!ret) {
-        block_acct_done(stats, acct);
-    } else {
-        block_acct_failed(stats, acct);
-        nvme_aio_err(req, ret);
-    }
-
-    (*num_flushes)--;
-    g_free(ctx);
-
-    if (*num_flushes) {
-        return;
-    }
-
-    nvme_enqueue_req_completion(nvme_cq(req), req);
-}
-
 static void nvme_verify_cb(void *opaque, int ret)
 {
     NvmeBounceContext *ctx = opaque;
@@ -2868,56 +2830,138 @@ static uint16_t nvme_compare(NvmeCtrl *n, NvmeRequest *req)
     return NVME_NO_COMPLETE;
 }
 
+typedef struct NvmeFlushAIOCB {
+    BlockAIOCB common;
+    BlockAIOCB *aiocb;
+    NvmeRequest *req;
+    QEMUBH *bh;
+    int ret;
+
+    NvmeNamespace *ns;
+    uint32_t nsid;
+    bool broadcast;
+} NvmeFlushAIOCB;
+
+static void nvme_flush_cancel(BlockAIOCB *acb)
+{
+    NvmeFlushAIOCB *iocb = container_of(acb, NvmeFlushAIOCB, common);
+
+    iocb->ret = -ECANCELED;
+
+    if (iocb->aiocb) {
+        blk_aio_cancel_async(iocb->aiocb);
+    }
+}
+
+static const AIOCBInfo nvme_flush_aiocb_info = {
+    .aiocb_size = sizeof(NvmeFlushAIOCB),
+    .cancel_async = nvme_flush_cancel,
+    .get_aio_context = nvme_get_aio_context,
+};
+
+static void nvme_flush_ns_cb(void *opaque, int ret)
+{
+    NvmeFlushAIOCB *iocb = opaque;
+    NvmeNamespace *ns = iocb->ns;
+
+    if (ret < 0) {
+        iocb->ret = ret;
+        goto out;
+    } else if (iocb->ret < 0) {
+        goto out;
+    }
+
+    if (ns) {
+        trace_pci_nvme_flush_ns(iocb->nsid);
+
+        iocb->ns = NULL;
+        iocb->aiocb = blk_aio_flush(ns->blkconf.blk, nvme_flush_ns_cb, iocb);
+        return;
+    }
+
+out:
+    iocb->aiocb = NULL;
+    qemu_bh_schedule(iocb->bh);
+}
+
+static void nvme_flush_bh(void *opaque)
+{
+    NvmeFlushAIOCB *iocb = opaque;
+    NvmeRequest *req = iocb->req;
+    NvmeCtrl *n = nvme_ctrl(req);
+    int i;
+
+    if (iocb->ret < 0) {
+        goto done;
+    }
+
+    if (iocb->broadcast) {
+        for (i = iocb->nsid + 1; i <= NVME_MAX_NAMESPACES; i++) {
+            iocb->ns = nvme_ns(n, i);
+            if (iocb->ns) {
+                iocb->nsid = i;
+                break;
+            }
+        }
+    }
+
+    if (!iocb->ns) {
+        goto done;
+    }
+
+    nvme_flush_ns_cb(iocb, 0);
+    return;
+
+done:
+    qemu_bh_delete(iocb->bh);
+    iocb->bh = NULL;
+
+    iocb->common.cb(iocb->common.opaque, iocb->ret);
+
+    qemu_aio_unref(iocb);
+
+    return;
+}
+
 static uint16_t nvme_flush(NvmeCtrl *n, NvmeRequest *req)
 {
+    NvmeFlushAIOCB *iocb;
     uint32_t nsid = le32_to_cpu(req->cmd.nsid);
-    uintptr_t *num_flushes = (uintptr_t *)&req->opaque;
     uint16_t status;
-    struct nvme_aio_flush_ctx *ctx;
-    NvmeNamespace *ns;
 
-    trace_pci_nvme_flush(nvme_cid(req), nsid);
+    iocb = qemu_aio_get(&nvme_flush_aiocb_info, NULL, nvme_misc_cb, req);
 
-    if (nsid != NVME_NSID_BROADCAST) {
-        req->ns = nvme_ns(n, nsid);
-        if (unlikely(!req->ns)) {
-            return NVME_INVALID_FIELD | NVME_DNR;
+    iocb->req = req;
+    iocb->bh = qemu_bh_new(nvme_flush_bh, iocb);
+    iocb->ret = 0;
+    iocb->ns = NULL;
+    iocb->nsid = 0;
+    iocb->broadcast = (nsid == NVME_NSID_BROADCAST);
+
+    if (!iocb->broadcast) {
+        if (!nvme_nsid_valid(n, nsid)) {
+            status = NVME_INVALID_NSID | NVME_DNR;
+            goto out;
         }
 
-        block_acct_start(blk_get_stats(req->ns->blkconf.blk), &req->acct, 0,
-                         BLOCK_ACCT_FLUSH);
-        req->aiocb = blk_aio_flush(req->ns->blkconf.blk, nvme_misc_cb, req);
-        return NVME_NO_COMPLETE;
-    }
-
-    /* 1-initialize; see comment in nvme_dsm */
-    *num_flushes = 1;
-
-    for (int i = 1; i <= NVME_MAX_NAMESPACES; i++) {
-        ns = nvme_ns(n, i);
-        if (!ns) {
-            continue;
+        iocb->ns = nvme_ns(n, nsid);
+        if (!iocb->ns) {
+            status = NVME_INVALID_FIELD | NVME_DNR;
+            goto out;
         }
 
-        ctx = g_new(struct nvme_aio_flush_ctx, 1);
-        ctx->req = req;
-        ctx->ns = ns;
-
-        (*num_flushes)++;
-
-        block_acct_start(blk_get_stats(ns->blkconf.blk), &ctx->acct, 0,
-                         BLOCK_ACCT_FLUSH);
-        blk_aio_flush(ns->blkconf.blk, nvme_aio_flush_cb, ctx);
+        iocb->nsid = nsid;
     }
 
-    /* account for the 1-initialization */
-    (*num_flushes)--;
+    req->aiocb = &iocb->common;
+    qemu_bh_schedule(iocb->bh);
 
-    if (*num_flushes) {
-        status = NVME_NO_COMPLETE;
-    } else {
-        status = req->status;
-    }
+    return NVME_NO_COMPLETE;
+
+out:
+    qemu_bh_delete(iocb->bh);
+    iocb->bh = NULL;
+    qemu_aio_unref(iocb);
 
     return status;
 }
diff --git a/hw/nvme/trace-events b/hw/nvme/trace-events
index ea33d0ccc383..ce6b6ffe9604 100644
--- a/hw/nvme/trace-events
+++ b/hw/nvme/trace-events
@@ -7,16 +7,16 @@ pci_nvme_map_addr(uint64_t addr, uint64_t len) "addr 0x%"PRIx64" len %"PRIu64""
 pci_nvme_map_addr_cmb(uint64_t addr, uint64_t len) "addr 0x%"PRIx64" len %"PRIu64""
 pci_nvme_map_prp(uint64_t trans_len, uint32_t len, uint64_t prp1, uint64_t prp2, int num_prps) "trans_len %"PRIu64" len %"PRIu32" prp1 0x%"PRIx64" prp2 0x%"PRIx64" num_prps %d"
 pci_nvme_map_sgl(uint8_t typ, uint64_t len) "type 0x%"PRIx8" len %"PRIu64""
-pci_nvme_io_cmd(uint16_t cid, uint32_t nsid, uint16_t sqid, uint8_t opcode, const char *opname) "cid %"PRIu16" nsid %"PRIu32" sqid %"PRIu16" opc 0x%"PRIx8" opname '%s'"
+pci_nvme_io_cmd(uint16_t cid, uint32_t nsid, uint16_t sqid, uint8_t opcode, const char *opname) "cid %"PRIu16" nsid 0x%"PRIx32" sqid %"PRIu16" opc 0x%"PRIx8" opname '%s'"
 pci_nvme_admin_cmd(uint16_t cid, uint16_t sqid, uint8_t opcode, const char *opname) "cid %"PRIu16" sqid %"PRIu16" opc 0x%"PRIx8" opname '%s'"
-pci_nvme_flush(uint16_t cid, uint32_t nsid) "cid %"PRIu16" nsid %"PRIu32""
+pci_nvme_flush_ns(uint32_t nsid) "nsid 0x%"PRIx32""
 pci_nvme_format(uint16_t cid, uint32_t nsid, uint8_t lbaf, uint8_t mset, uint8_t pi, uint8_t pil) "cid %"PRIu16" nsid %"PRIu32" lbaf %"PRIu8" mset %"PRIu8" pi %"PRIu8" pil %"PRIu8""
 pci_nvme_format_ns(uint16_t cid, uint32_t nsid, uint8_t lbaf, uint8_t mset, uint8_t pi, uint8_t pil) "cid %"PRIu16" nsid %"PRIu32" lbaf %"PRIu8" mset %"PRIu8" pi %"PRIu8" pil %"PRIu8""
 pci_nvme_format_cb(uint16_t cid, uint32_t nsid) "cid %"PRIu16" nsid %"PRIu32""
 pci_nvme_read(uint16_t cid, uint32_t nsid, uint32_t nlb, uint64_t count, uint64_t lba) "cid %"PRIu16" nsid %"PRIu32" nlb %"PRIu32" count %"PRIu64" lba 0x%"PRIx64""
 pci_nvme_write(uint16_t cid, const char *verb, uint32_t nsid, uint32_t nlb, uint64_t count, uint64_t lba) "cid %"PRIu16" opname '%s' nsid %"PRIu32" nlb %"PRIu32" count %"PRIu64" lba 0x%"PRIx64""
 pci_nvme_rw_cb(uint16_t cid, const char *blkname) "cid %"PRIu16" blk '%s'"
-pci_nvme_misc_cb(uint16_t cid, const char *blkname) "cid %"PRIu16" blk '%s'"
+pci_nvme_misc_cb(uint16_t cid) "cid %"PRIu16""
 pci_nvme_dif_rw(uint8_t pract, uint8_t prinfo) "pract 0x%"PRIx8" prinfo 0x%"PRIx8""
 pci_nvme_dif_rw_cb(uint16_t cid, const char *blkname) "cid %"PRIu16" blk '%s'"
 pci_nvme_dif_rw_mdata_in_cb(uint16_t cid, const char *blkname) "cid %"PRIu16" blk '%s'"
-- 
2.32.0



^ permalink raw reply related	[flat|nested] 16+ messages in thread

* [PATCH v2 02/11] hw/nvme: add nvme_block_status_all helper
  2021-06-17 19:06 [PATCH v2 00/11] hw/nvme: reimplement all multi-aio commands with custom aiocbs Klaus Jensen
  2021-06-17 19:06 ` [PATCH v2 01/11] hw/nvme: reimplement flush to allow cancellation Klaus Jensen
@ 2021-06-17 19:06 ` Klaus Jensen
  2021-06-17 19:06 ` [PATCH v2 03/11] hw/nvme: reimplement dsm to allow cancellation Klaus Jensen
                   ` (10 subsequent siblings)
  12 siblings, 0 replies; 16+ messages in thread
From: Klaus Jensen @ 2021-06-17 19:06 UTC (permalink / raw)
  To: qemu-devel
  Cc: Fam Zheng, Kevin Wolf, Vladimir Sementsov-Ogievskiy, qemu-block,
	Klaus Jensen, Max Reitz, Keith Busch, Stefan Hajnoczi,
	Klaus Jensen

From: Klaus Jensen <k.jensen@samsung.com>

Pull the gist of nvme_check_dulbe() into a helper function. This is in
preparation for dsm refactoring.

Signed-off-by: Klaus Jensen <k.jensen@samsung.com>
---
 hw/nvme/ctrl.c | 41 ++++++++++++++++++++++++++++-------------
 1 file changed, 28 insertions(+), 13 deletions(-)

diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c
index 9f81ab99d484..2e36349ced28 100644
--- a/hw/nvme/ctrl.c
+++ b/hw/nvme/ctrl.c
@@ -1438,18 +1438,15 @@ static inline uint16_t nvme_check_bounds(NvmeNamespace *ns, uint64_t slba,
     return NVME_SUCCESS;
 }
 
-static uint16_t nvme_check_dulbe(NvmeNamespace *ns, uint64_t slba,
-                                 uint32_t nlb)
+static int nvme_block_status_all(NvmeNamespace *ns, uint64_t slba,
+                                 uint32_t nlb, int flags)
 {
     BlockDriverState *bs = blk_bs(ns->blkconf.blk);
 
     int64_t pnum = 0, bytes = nvme_l2b(ns, nlb);
     int64_t offset = nvme_l2b(ns, slba);
-    bool zeroed;
     int ret;
 
-    Error *local_err = NULL;
-
     /*
      * `pnum` holds the number of bytes after offset that shares the same
      * allocation status as the byte at offset. If `pnum` is different from
@@ -1461,23 +1458,41 @@ static uint16_t nvme_check_dulbe(NvmeNamespace *ns, uint64_t slba,
 
         ret = bdrv_block_status(bs, offset, bytes, &pnum, NULL, NULL);
         if (ret < 0) {
-            error_setg_errno(&local_err, -ret, "unable to get block status");
-            error_report_err(local_err);
-
-            return NVME_INTERNAL_DEV_ERROR;
+            return ret;
         }
 
-        zeroed = !!(ret & BDRV_BLOCK_ZERO);
 
-        trace_pci_nvme_block_status(offset, bytes, pnum, ret, zeroed);
+        trace_pci_nvme_block_status(offset, bytes, pnum, ret,
+                                    !!(ret & BDRV_BLOCK_ZERO));
 
-        if (zeroed) {
-            return NVME_DULB;
+        if (!(ret & flags)) {
+            return 1;
         }
 
         offset += pnum;
     } while (pnum != bytes);
 
+    return 0;
+}
+
+static uint16_t nvme_check_dulbe(NvmeNamespace *ns, uint64_t slba,
+                                 uint32_t nlb)
+{
+    int ret;
+    Error *err = NULL;
+
+    ret = nvme_block_status_all(ns, slba, nlb, BDRV_BLOCK_DATA);
+    if (ret) {
+        if (ret < 0) {
+            error_setg_errno(&err, -ret, "unable to get block status");
+            error_report_err(err);
+
+            return NVME_INTERNAL_DEV_ERROR;
+        }
+
+        return NVME_DULB;
+    }
+
     return NVME_SUCCESS;
 }
 
-- 
2.32.0



^ permalink raw reply related	[flat|nested] 16+ messages in thread

* [PATCH v2 03/11] hw/nvme: reimplement dsm to allow cancellation
  2021-06-17 19:06 [PATCH v2 00/11] hw/nvme: reimplement all multi-aio commands with custom aiocbs Klaus Jensen
  2021-06-17 19:06 ` [PATCH v2 01/11] hw/nvme: reimplement flush to allow cancellation Klaus Jensen
  2021-06-17 19:06 ` [PATCH v2 02/11] hw/nvme: add nvme_block_status_all helper Klaus Jensen
@ 2021-06-17 19:06 ` Klaus Jensen
  2021-06-17 19:06 ` [PATCH v2 04/11] hw/nvme: save reftag when generating pi Klaus Jensen
                   ` (9 subsequent siblings)
  12 siblings, 0 replies; 16+ messages in thread
From: Klaus Jensen @ 2021-06-17 19:06 UTC (permalink / raw)
  To: qemu-devel
  Cc: Fam Zheng, Kevin Wolf, Vladimir Sementsov-Ogievskiy, qemu-block,
	Klaus Jensen, Max Reitz, Keith Busch, Stefan Hajnoczi,
	Klaus Jensen

From: Klaus Jensen <k.jensen@samsung.com>

Prior to this patch, a loop was used to issue multiple "fire and forget"
aios for each range in the command. Without a reference to the aiocb
returned from the blk_aio_pdiscard calls, the aios cannot be canceled.

Fix this by processing the ranges one after another.

As a bonus, this fixes how metadata is cleared (i.e. we only zero it out
if the data was succesfully discarded).

Signed-off-by: Klaus Jensen <k.jensen@samsung.com>
---
 hw/nvme/ctrl.c       | 233 +++++++++++++++++++++++++++++--------------
 hw/nvme/trace-events |   4 +-
 2 files changed, 162 insertions(+), 75 deletions(-)

diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c
index 2e36349ced28..1626ed29c76b 100644
--- a/hw/nvme/ctrl.c
+++ b/hw/nvme/ctrl.c
@@ -2015,26 +2015,6 @@ out:
     nvme_verify_cb(ctx, ret);
 }
 
-static void nvme_aio_discard_cb(void *opaque, int ret)
-{
-    NvmeRequest *req = opaque;
-    uintptr_t *discards = (uintptr_t *)&req->opaque;
-
-    trace_pci_nvme_aio_discard_cb(nvme_cid(req));
-
-    if (ret) {
-        nvme_aio_err(req, ret);
-    }
-
-    (*discards)--;
-
-    if (*discards) {
-        return;
-    }
-
-    nvme_enqueue_req_completion(nvme_cq(req), req);
-}
-
 struct nvme_zone_reset_ctx {
     NvmeRequest *req;
     NvmeZone    *zone;
@@ -2495,75 +2475,182 @@ out:
     nvme_enqueue_req_completion(nvme_cq(req), req);
 }
 
+typedef struct NvmeDSMAIOCB {
+    BlockAIOCB common;
+    BlockAIOCB *aiocb;
+    NvmeRequest *req;
+    QEMUBH *bh;
+    int ret;
+
+    NvmeDsmRange *range;
+    unsigned int nr;
+    unsigned int idx;
+} NvmeDSMAIOCB;
+
+static void nvme_dsm_cancel(BlockAIOCB *aiocb)
+{
+    NvmeDSMAIOCB *iocb = container_of(aiocb, NvmeDSMAIOCB, common);
+
+    /* break nvme_dsm_cb loop */
+    iocb->idx = iocb->nr;
+    iocb->ret = -ECANCELED;
+
+    if (iocb->aiocb) {
+        blk_aio_cancel_async(iocb->aiocb);
+        iocb->aiocb = NULL;
+    } else {
+        /*
+         * We only reach this if nvme_dsm_cancel() has already been called or
+         * the command ran to completion and nvme_dsm_bh is scheduled to run.
+         */
+        assert(iocb->idx == iocb->nr);
+    }
+}
+
+static const AIOCBInfo nvme_dsm_aiocb_info = {
+    .aiocb_size   = sizeof(NvmeDSMAIOCB),
+    .cancel_async = nvme_dsm_cancel,
+};
+
+static void nvme_dsm_bh(void *opaque)
+{
+    NvmeDSMAIOCB *iocb = opaque;
+
+    iocb->common.cb(iocb->common.opaque, iocb->ret);
+
+    qemu_bh_delete(iocb->bh);
+    iocb->bh = NULL;
+    qemu_aio_unref(iocb);
+}
+
+static void nvme_dsm_cb(void *opaque, int ret);
+
+static void nvme_dsm_md_cb(void *opaque, int ret)
+{
+    NvmeDSMAIOCB *iocb = opaque;
+    NvmeRequest *req = iocb->req;
+    NvmeNamespace *ns = req->ns;
+    NvmeDsmRange *range;
+    uint64_t slba;
+    uint32_t nlb;
+
+    if (ret < 0) {
+        iocb->ret = ret;
+        goto done;
+    }
+
+    if (!ns->lbaf.ms) {
+        nvme_dsm_cb(iocb, 0);
+        return;
+    }
+
+    range = &iocb->range[iocb->idx - 1];
+    slba = le64_to_cpu(range->slba);
+    nlb = le32_to_cpu(range->nlb);
+
+    /*
+     * Check that all block were discarded (zeroed); otherwise we do not zero
+     * the metadata.
+     */
+
+    ret = nvme_block_status_all(ns, slba, nlb, BDRV_BLOCK_ZERO);
+    if (ret) {
+        if (ret < 0) {
+            iocb->ret = ret;
+            goto done;
+        }
+
+        nvme_dsm_cb(iocb, 0);
+    }
+
+    iocb->aiocb = blk_aio_pwrite_zeroes(ns->blkconf.blk, nvme_moff(ns, slba),
+                                        nvme_m2b(ns, nlb), BDRV_REQ_MAY_UNMAP,
+                                        nvme_dsm_cb, iocb);
+    return;
+
+done:
+    iocb->aiocb = NULL;
+    qemu_bh_schedule(iocb->bh);
+}
+
+static void nvme_dsm_cb(void *opaque, int ret)
+{
+    NvmeDSMAIOCB *iocb = opaque;
+    NvmeRequest *req = iocb->req;
+    NvmeCtrl *n = nvme_ctrl(req);
+    NvmeNamespace *ns = req->ns;
+    NvmeDsmRange *range;
+    uint64_t slba;
+    uint32_t nlb;
+
+    if (ret < 0) {
+        iocb->ret = ret;
+        goto done;
+    }
+
+next:
+    if (iocb->idx == iocb->nr) {
+        goto done;
+    }
+
+    range = &iocb->range[iocb->idx++];
+    slba = le64_to_cpu(range->slba);
+    nlb = le32_to_cpu(range->nlb);
+
+    trace_pci_nvme_dsm_deallocate(slba, nlb);
+
+    if (nlb > n->dmrsl) {
+        trace_pci_nvme_dsm_single_range_limit_exceeded(nlb, n->dmrsl);
+        goto next;
+    }
+
+    if (nvme_check_bounds(ns, slba, nlb)) {
+        trace_pci_nvme_err_invalid_lba_range(slba, nlb,
+                                             ns->id_ns.nsze);
+        goto next;
+    }
+
+    iocb->aiocb = blk_aio_pdiscard(ns->blkconf.blk, nvme_l2b(ns, slba),
+                                   nvme_l2b(ns, nlb),
+                                   nvme_dsm_md_cb, iocb);
+    return;
+
+done:
+    iocb->aiocb = NULL;
+    qemu_bh_schedule(iocb->bh);
+}
+
 static uint16_t nvme_dsm(NvmeCtrl *n, NvmeRequest *req)
 {
     NvmeNamespace *ns = req->ns;
     NvmeDsmCmd *dsm = (NvmeDsmCmd *) &req->cmd;
-
     uint32_t attr = le32_to_cpu(dsm->attributes);
     uint32_t nr = (le32_to_cpu(dsm->nr) & 0xff) + 1;
-
     uint16_t status = NVME_SUCCESS;
 
-    trace_pci_nvme_dsm(nvme_cid(req), nvme_nsid(ns), nr, attr);
+    trace_pci_nvme_dsm(nr, attr);
 
     if (attr & NVME_DSMGMT_AD) {
-        int64_t offset;
-        size_t len;
-        NvmeDsmRange range[nr];
-        uintptr_t *discards = (uintptr_t *)&req->opaque;
+        NvmeDSMAIOCB *iocb = blk_aio_get(&nvme_dsm_aiocb_info, ns->blkconf.blk,
+                                         nvme_misc_cb, req);
 
-        status = nvme_h2c(n, (uint8_t *)range, sizeof(range), req);
+        iocb->req = req;
+        iocb->bh = qemu_bh_new(nvme_dsm_bh, iocb);
+        iocb->ret = 0;
+        iocb->range = g_new(NvmeDsmRange, nr);
+        iocb->nr = nr;
+        iocb->idx = 0;
+
+        status = nvme_h2c(n, (uint8_t *)iocb->range, sizeof(NvmeDsmRange) * nr,
+                          req);
         if (status) {
             return status;
         }
 
-        /*
-         * AIO callbacks may be called immediately, so initialize discards to 1
-         * to make sure the the callback does not complete the request before
-         * all discards have been issued.
-         */
-        *discards = 1;
+        req->aiocb = &iocb->common;
+        nvme_dsm_cb(iocb, 0);
 
-        for (int i = 0; i < nr; i++) {
-            uint64_t slba = le64_to_cpu(range[i].slba);
-            uint32_t nlb = le32_to_cpu(range[i].nlb);
-
-            if (nvme_check_bounds(ns, slba, nlb)) {
-                continue;
-            }
-
-            trace_pci_nvme_dsm_deallocate(nvme_cid(req), nvme_nsid(ns), slba,
-                                          nlb);
-
-            if (nlb > n->dmrsl) {
-                trace_pci_nvme_dsm_single_range_limit_exceeded(nlb, n->dmrsl);
-            }
-
-            offset = nvme_l2b(ns, slba);
-            len = nvme_l2b(ns, nlb);
-
-            while (len) {
-                size_t bytes = MIN(BDRV_REQUEST_MAX_BYTES, len);
-
-                (*discards)++;
-
-                blk_aio_pdiscard(ns->blkconf.blk, offset, bytes,
-                                 nvme_aio_discard_cb, req);
-
-                offset += bytes;
-                len -= bytes;
-            }
-        }
-
-        /* account for the 1-initialization */
-        (*discards)--;
-
-        if (*discards) {
-            status = NVME_NO_COMPLETE;
-        } else {
-            status = req->status;
-        }
+        return NVME_NO_COMPLETE;
     }
 
     return status;
diff --git a/hw/nvme/trace-events b/hw/nvme/trace-events
index ce6b6ffe9604..eea4e31e46c4 100644
--- a/hw/nvme/trace-events
+++ b/hw/nvme/trace-events
@@ -37,8 +37,8 @@ pci_nvme_verify_mdata_in_cb(uint16_t cid, const char *blkname) "cid %"PRIu16" bl
 pci_nvme_verify_cb(uint16_t cid, uint8_t prinfo, uint16_t apptag, uint16_t appmask, uint32_t reftag) "cid %"PRIu16" prinfo 0x%"PRIx8" apptag 0x%"PRIx16" appmask 0x%"PRIx16" reftag 0x%"PRIx32""
 pci_nvme_rw_complete_cb(uint16_t cid, const char *blkname) "cid %"PRIu16" blk '%s'"
 pci_nvme_block_status(int64_t offset, int64_t bytes, int64_t pnum, int ret, bool zeroed) "offset %"PRId64" bytes %"PRId64" pnum %"PRId64" ret 0x%x zeroed %d"
-pci_nvme_dsm(uint16_t cid, uint32_t nsid, uint32_t nr, uint32_t attr) "cid %"PRIu16" nsid %"PRIu32" nr %"PRIu32" attr 0x%"PRIx32""
-pci_nvme_dsm_deallocate(uint16_t cid, uint32_t nsid, uint64_t slba, uint32_t nlb) "cid %"PRIu16" nsid %"PRIu32" slba %"PRIu64" nlb %"PRIu32""
+pci_nvme_dsm(uint32_t nr, uint32_t attr) "nr %"PRIu32" attr 0x%"PRIx32""
+pci_nvme_dsm_deallocate(uint64_t slba, uint32_t nlb) "slba %"PRIu64" nlb %"PRIu32""
 pci_nvme_dsm_single_range_limit_exceeded(uint32_t nlb, uint32_t dmrsl) "nlb %"PRIu32" dmrsl %"PRIu32""
 pci_nvme_compare(uint16_t cid, uint32_t nsid, uint64_t slba, uint32_t nlb) "cid %"PRIu16" nsid %"PRIu32" slba 0x%"PRIx64" nlb %"PRIu32""
 pci_nvme_compare_data_cb(uint16_t cid) "cid %"PRIu16""
-- 
2.32.0



^ permalink raw reply related	[flat|nested] 16+ messages in thread

* [PATCH v2 04/11] hw/nvme: save reftag when generating pi
  2021-06-17 19:06 [PATCH v2 00/11] hw/nvme: reimplement all multi-aio commands with custom aiocbs Klaus Jensen
                   ` (2 preceding siblings ...)
  2021-06-17 19:06 ` [PATCH v2 03/11] hw/nvme: reimplement dsm to allow cancellation Klaus Jensen
@ 2021-06-17 19:06 ` Klaus Jensen
  2021-06-17 19:06 ` [PATCH v2 05/11] hw/nvme: remove assert from nvme_get_zone_by_slba Klaus Jensen
                   ` (8 subsequent siblings)
  12 siblings, 0 replies; 16+ messages in thread
From: Klaus Jensen @ 2021-06-17 19:06 UTC (permalink / raw)
  To: qemu-devel
  Cc: Fam Zheng, Kevin Wolf, Vladimir Sementsov-Ogievskiy, qemu-block,
	Klaus Jensen, Max Reitz, Keith Busch, Stefan Hajnoczi,
	Klaus Jensen

From: Klaus Jensen <k.jensen@samsung.com>

Prepare nvme_dif_pract_generate_dif() and nvme_dif_check() to be
callable in smaller increments by making the reftag a pointer parameter
updated by the function.

Signed-off-by: Klaus Jensen <k.jensen@samsung.com>
---
 hw/nvme/nvme.h |  4 ++--
 hw/nvme/ctrl.c | 10 +++++-----
 hw/nvme/dif.c  | 22 +++++++++++-----------
 3 files changed, 18 insertions(+), 18 deletions(-)

diff --git a/hw/nvme/nvme.h b/hw/nvme/nvme.h
index 19376570c91e..6e43f3f3d128 100644
--- a/hw/nvme/nvme.h
+++ b/hw/nvme/nvme.h
@@ -539,11 +539,11 @@ uint16_t nvme_dif_mangle_mdata(NvmeNamespace *ns, uint8_t *mbuf, size_t mlen,
                                uint64_t slba);
 void nvme_dif_pract_generate_dif(NvmeNamespace *ns, uint8_t *buf, size_t len,
                                  uint8_t *mbuf, size_t mlen, uint16_t apptag,
-                                 uint32_t reftag);
+                                 uint32_t *reftag);
 uint16_t nvme_dif_check(NvmeNamespace *ns, uint8_t *buf, size_t len,
                         uint8_t *mbuf, size_t mlen, uint16_t ctrl,
                         uint64_t slba, uint16_t apptag,
-                        uint16_t appmask, uint32_t reftag);
+                        uint16_t appmask, uint32_t *reftag);
 uint16_t nvme_dif_rw(NvmeCtrl *n, NvmeRequest *req);
 
 
diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c
index 1626ed29c76b..07e40107e86b 100644
--- a/hw/nvme/ctrl.c
+++ b/hw/nvme/ctrl.c
@@ -1968,7 +1968,7 @@ static void nvme_verify_cb(void *opaque, int ret)
 
         req->status = nvme_dif_check(ns, ctx->data.bounce, ctx->data.iov.size,
                                      ctx->mdata.bounce, ctx->mdata.iov.size,
-                                     ctrl, slba, apptag, appmask, reftag);
+                                     ctrl, slba, apptag, appmask, &reftag);
     }
 
 out:
@@ -2204,7 +2204,7 @@ static void nvme_copy_in_complete(NvmeRequest *req)
             reftag = le32_to_cpu(range->reftag);
 
             status = nvme_dif_check(ns, buf, len, mbuf, mlen, prinfor, slba,
-                                    apptag, appmask, reftag);
+                                    apptag, appmask, &reftag);
             if (status) {
                 goto invalid;
             }
@@ -2227,10 +2227,10 @@ static void nvme_copy_in_complete(NvmeRequest *req)
             }
 
             nvme_dif_pract_generate_dif(ns, ctx->bounce, len, ctx->mbounce,
-                                        mlen, apptag, reftag);
+                                        mlen, apptag, &reftag);
         } else {
             status = nvme_dif_check(ns, ctx->bounce, len, ctx->mbounce, mlen,
-                                    prinfow, sdlba, apptag, appmask, reftag);
+                                    prinfow, sdlba, apptag, appmask, &reftag);
             if (status) {
                 goto invalid;
             }
@@ -2370,7 +2370,7 @@ static void nvme_compare_mdata_cb(void *opaque, int ret)
 
         status = nvme_dif_check(ns, ctx->data.bounce, ctx->data.iov.size,
                                 ctx->mdata.bounce, ctx->mdata.iov.size, ctrl,
-                                slba, apptag, appmask, reftag);
+                                slba, apptag, appmask, &reftag);
         if (status) {
             req->status = status;
             goto out;
diff --git a/hw/nvme/dif.c b/hw/nvme/dif.c
index 88efcbe9bd60..f5f86f1c26ad 100644
--- a/hw/nvme/dif.c
+++ b/hw/nvme/dif.c
@@ -41,7 +41,7 @@ static uint16_t crc_t10dif(uint16_t crc, const unsigned char *buffer,
 
 void nvme_dif_pract_generate_dif(NvmeNamespace *ns, uint8_t *buf, size_t len,
                                  uint8_t *mbuf, size_t mlen, uint16_t apptag,
-                                 uint32_t reftag)
+                                 uint32_t *reftag)
 {
     uint8_t *end = buf + len;
     int16_t pil = 0;
@@ -51,7 +51,7 @@ void nvme_dif_pract_generate_dif(NvmeNamespace *ns, uint8_t *buf, size_t len,
     }
 
     trace_pci_nvme_dif_pract_generate_dif(len, ns->lbasz, ns->lbasz + pil,
-                                          apptag, reftag);
+                                          apptag, *reftag);
 
     for (; buf < end; buf += ns->lbasz, mbuf += ns->lbaf.ms) {
         NvmeDifTuple *dif = (NvmeDifTuple *)(mbuf + pil);
@@ -63,10 +63,10 @@ void nvme_dif_pract_generate_dif(NvmeNamespace *ns, uint8_t *buf, size_t len,
 
         dif->guard = cpu_to_be16(crc);
         dif->apptag = cpu_to_be16(apptag);
-        dif->reftag = cpu_to_be32(reftag);
+        dif->reftag = cpu_to_be32(*reftag);
 
         if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps) != NVME_ID_NS_DPS_TYPE_3) {
-            reftag++;
+            (*reftag)++;
         }
     }
 }
@@ -132,13 +132,13 @@ static uint16_t nvme_dif_prchk(NvmeNamespace *ns, NvmeDifTuple *dif,
 uint16_t nvme_dif_check(NvmeNamespace *ns, uint8_t *buf, size_t len,
                         uint8_t *mbuf, size_t mlen, uint16_t ctrl,
                         uint64_t slba, uint16_t apptag,
-                        uint16_t appmask, uint32_t reftag)
+                        uint16_t appmask, uint32_t *reftag)
 {
     uint8_t *end = buf + len;
     int16_t pil = 0;
     uint16_t status;
 
-    status = nvme_check_prinfo(ns, ctrl, slba, reftag);
+    status = nvme_check_prinfo(ns, ctrl, slba, *reftag);
     if (status) {
         return status;
     }
@@ -153,13 +153,13 @@ uint16_t nvme_dif_check(NvmeNamespace *ns, uint8_t *buf, size_t len,
         NvmeDifTuple *dif = (NvmeDifTuple *)(mbuf + pil);
 
         status = nvme_dif_prchk(ns, dif, buf, mbuf, pil, ctrl, apptag,
-                                appmask, reftag);
+                                appmask, *reftag);
         if (status) {
             return status;
         }
 
         if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps) != NVME_ID_NS_DPS_TYPE_3) {
-            reftag++;
+            (*reftag)++;
         }
     }
 
@@ -270,7 +270,7 @@ static void nvme_dif_rw_check_cb(void *opaque, int ret)
 
     status = nvme_dif_check(ns, ctx->data.bounce, ctx->data.iov.size,
                             ctx->mdata.bounce, ctx->mdata.iov.size, ctrl,
-                            slba, apptag, appmask, reftag);
+                            slba, apptag, appmask, &reftag);
     if (status) {
         req->status = status;
         goto out;
@@ -478,11 +478,11 @@ uint16_t nvme_dif_rw(NvmeCtrl *n, NvmeRequest *req)
         /* splice generated protection information into the buffer */
         nvme_dif_pract_generate_dif(ns, ctx->data.bounce, ctx->data.iov.size,
                                     ctx->mdata.bounce, ctx->mdata.iov.size,
-                                    apptag, reftag);
+                                    apptag, &reftag);
     } else {
         status = nvme_dif_check(ns, ctx->data.bounce, ctx->data.iov.size,
                                 ctx->mdata.bounce, ctx->mdata.iov.size, ctrl,
-                                slba, apptag, appmask, reftag);
+                                slba, apptag, appmask, &reftag);
         if (status) {
             goto err;
         }
-- 
2.32.0



^ permalink raw reply related	[flat|nested] 16+ messages in thread

* [PATCH v2 05/11] hw/nvme: remove assert from nvme_get_zone_by_slba
  2021-06-17 19:06 [PATCH v2 00/11] hw/nvme: reimplement all multi-aio commands with custom aiocbs Klaus Jensen
                   ` (3 preceding siblings ...)
  2021-06-17 19:06 ` [PATCH v2 04/11] hw/nvme: save reftag when generating pi Klaus Jensen
@ 2021-06-17 19:06 ` Klaus Jensen
  2021-06-17 19:06 ` [PATCH v2 06/11] hw/nvme: use prinfo directly in nvme_check_prinfo and nvme_dif_check Klaus Jensen
                   ` (7 subsequent siblings)
  12 siblings, 0 replies; 16+ messages in thread
From: Klaus Jensen @ 2021-06-17 19:06 UTC (permalink / raw)
  To: qemu-devel
  Cc: Fam Zheng, Kevin Wolf, Vladimir Sementsov-Ogievskiy, qemu-block,
	Klaus Jensen, Max Reitz, Keith Busch, Stefan Hajnoczi,
	Klaus Jensen

From: Klaus Jensen <k.jensen@samsung.com>

Make nvme_get_zone_by_slba() return NULL if the slba is out of range.
This allows the function to be used without guarding the call with a
call to nvme_check_bounds(), in preparation for the next patch.

Add asserts after calling nvme_get_zone_by_slba() instead.

Signed-off-by: Klaus Jensen <k.jensen@samsung.com>
---
 hw/nvme/ctrl.c | 18 ++++++++++++++----
 1 file changed, 14 insertions(+), 4 deletions(-)

diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c
index 07e40107e86b..f1a36c2052d0 100644
--- a/hw/nvme/ctrl.c
+++ b/hw/nvme/ctrl.c
@@ -1542,7 +1542,10 @@ static inline NvmeZone *nvme_get_zone_by_slba(NvmeNamespace *ns, uint64_t slba)
 {
     uint32_t zone_idx = nvme_zone_idx(ns, slba);
 
-    assert(zone_idx < ns->num_zones);
+    if (zone_idx >= ns->num_zones) {
+        return NULL;
+    }
+
     return &ns->zone_array[zone_idx];
 }
 
@@ -1619,11 +1622,16 @@ static uint16_t nvme_check_zone_state_for_read(NvmeZone *zone)
 static uint16_t nvme_check_zone_read(NvmeNamespace *ns, uint64_t slba,
                                      uint32_t nlb)
 {
-    NvmeZone *zone = nvme_get_zone_by_slba(ns, slba);
-    uint64_t bndry = nvme_zone_rd_boundary(ns, zone);
-    uint64_t end = slba + nlb;
+    NvmeZone *zone;
+    uint64_t bndry, end;
     uint16_t status;
 
+    zone = nvme_get_zone_by_slba(ns, slba);
+    assert(zone);
+
+    bndry = nvme_zone_rd_boundary(ns, zone);
+    end = slba + nlb;
+
     status = nvme_check_zone_state_for_read(zone);
     if (status) {
         ;
@@ -1790,6 +1798,7 @@ static void nvme_finalize_zoned_write(NvmeNamespace *ns, NvmeRequest *req)
     slba = le64_to_cpu(rw->slba);
     nlb = le16_to_cpu(rw->nlb) + 1;
     zone = nvme_get_zone_by_slba(ns, slba);
+    assert(zone);
 
     nvme_advance_zone_wp(ns, zone, nlb);
 }
@@ -3186,6 +3195,7 @@ static uint16_t nvme_do_write(NvmeCtrl *n, NvmeRequest *req, bool append,
 
     if (ns->params.zoned) {
         zone = nvme_get_zone_by_slba(ns, slba);
+        assert(zone);
 
         if (append) {
             bool piremap = !!(ctrl & NVME_RW_PIREMAP);
-- 
2.32.0



^ permalink raw reply related	[flat|nested] 16+ messages in thread

* [PATCH v2 06/11] hw/nvme: use prinfo directly in nvme_check_prinfo and nvme_dif_check
  2021-06-17 19:06 [PATCH v2 00/11] hw/nvme: reimplement all multi-aio commands with custom aiocbs Klaus Jensen
                   ` (4 preceding siblings ...)
  2021-06-17 19:06 ` [PATCH v2 05/11] hw/nvme: remove assert from nvme_get_zone_by_slba Klaus Jensen
@ 2021-06-17 19:06 ` Klaus Jensen
  2021-06-17 19:06 ` [PATCH v2 07/11] hw/nvme: add dw0/1 to the req completion trace event Klaus Jensen
                   ` (6 subsequent siblings)
  12 siblings, 0 replies; 16+ messages in thread
From: Klaus Jensen @ 2021-06-17 19:06 UTC (permalink / raw)
  To: qemu-devel
  Cc: Fam Zheng, Kevin Wolf, Vladimir Sementsov-Ogievskiy, qemu-block,
	Klaus Jensen, Max Reitz, Keith Busch, Stefan Hajnoczi,
	Klaus Jensen

From: Klaus Jensen <k.jensen@samsung.com>

The nvme_check_prinfo() and nvme_dif_check() functions operate on the
16 bit "control" member of the NvmeCmd. These functions do not otherwise
operate on an NvmeCmd or an NvmeRequest, so change them to expect the
actual 4 bit PRINFO field and add constants that work on this field as
well.

Signed-off-by: Klaus Jensen <k.jensen@samsung.com>
---
 hw/nvme/nvme.h       |  4 +--
 include/block/nvme.h |  8 ++++++
 hw/nvme/ctrl.c       | 67 +++++++++++++++++---------------------------
 hw/nvme/dif.c        | 44 ++++++++++++++---------------
 4 files changed, 57 insertions(+), 66 deletions(-)

diff --git a/hw/nvme/nvme.h b/hw/nvme/nvme.h
index 6e43f3f3d128..7491157f56f6 100644
--- a/hw/nvme/nvme.h
+++ b/hw/nvme/nvme.h
@@ -533,7 +533,7 @@ static const uint16_t t10_dif_crc_table[256] = {
     0xF0D8, 0x7B6F, 0x6C01, 0xE7B6, 0x42DD, 0xC96A, 0xDE04, 0x55B3
 };
 
-uint16_t nvme_check_prinfo(NvmeNamespace *ns, uint16_t ctrl, uint64_t slba,
+uint16_t nvme_check_prinfo(NvmeNamespace *ns, uint8_t prinfo, uint64_t slba,
                            uint32_t reftag);
 uint16_t nvme_dif_mangle_mdata(NvmeNamespace *ns, uint8_t *mbuf, size_t mlen,
                                uint64_t slba);
@@ -541,7 +541,7 @@ void nvme_dif_pract_generate_dif(NvmeNamespace *ns, uint8_t *buf, size_t len,
                                  uint8_t *mbuf, size_t mlen, uint16_t apptag,
                                  uint32_t *reftag);
 uint16_t nvme_dif_check(NvmeNamespace *ns, uint8_t *buf, size_t len,
-                        uint8_t *mbuf, size_t mlen, uint16_t ctrl,
+                        uint8_t *mbuf, size_t mlen, uint8_t prinfo,
                         uint64_t slba, uint16_t apptag,
                         uint16_t appmask, uint32_t *reftag);
 uint16_t nvme_dif_rw(NvmeCtrl *n, NvmeRequest *req);
diff --git a/include/block/nvme.h b/include/block/nvme.h
index 333affdb8534..0fabe28b7bdd 100644
--- a/include/block/nvme.h
+++ b/include/block/nvme.h
@@ -708,6 +708,14 @@ enum {
 
 #define NVME_RW_PRINFO(control) ((control >> 10) & 0xf)
 
+enum {
+    NVME_PRINFO_PRACT       = 1 << 3,
+    NVME_PRINFO_PRCHK_GUARD = 1 << 2,
+    NVME_PRINFO_PRCHK_APP   = 1 << 1,
+    NVME_PRINFO_PRCHK_REF   = 1 << 0,
+    NVME_PRINFO_PRCHK_MASK  = 7 << 0,
+};
+
 typedef struct QEMU_PACKED NvmeDsmCmd {
     uint8_t     opcode;
     uint8_t     flags;
diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c
index f1a36c2052d0..aca6cf068e2b 100644
--- a/hw/nvme/ctrl.c
+++ b/hw/nvme/ctrl.c
@@ -1010,16 +1010,12 @@ static uint16_t nvme_map_data(NvmeCtrl *n, uint32_t nlb, NvmeRequest *req)
 {
     NvmeNamespace *ns = req->ns;
     NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
-    uint16_t ctrl = le16_to_cpu(rw->control);
+    bool pi = !!NVME_ID_NS_DPS_TYPE(ns->id_ns.dps);
+    bool pract = !!(le16_to_cpu(rw->control) & NVME_RW_PRINFO_PRACT);
     size_t len = nvme_l2b(ns, nlb);
     uint16_t status;
 
-    if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps) &&
-        (ctrl & NVME_RW_PRINFO_PRACT && ns->lbaf.ms == 8)) {
-        goto out;
-    }
-
-    if (nvme_ns_ext(ns)) {
+    if (nvme_ns_ext(ns) && !(pi && pract && ns->lbaf.ms == 8)) {
         NvmeSg sg;
 
         len += nvme_m2b(ns, nlb);
@@ -1036,7 +1032,6 @@ static uint16_t nvme_map_data(NvmeCtrl *n, uint32_t nlb, NvmeRequest *req)
         return NVME_SUCCESS;
     }
 
-out:
     return nvme_map_dptr(n, &req->sg, len, &req->cmd);
 }
 
@@ -1195,10 +1190,10 @@ uint16_t nvme_bounce_data(NvmeCtrl *n, uint8_t *ptr, uint32_t len,
 {
     NvmeNamespace *ns = req->ns;
     NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
-    uint16_t ctrl = le16_to_cpu(rw->control);
+    bool pi = !!NVME_ID_NS_DPS_TYPE(ns->id_ns.dps);
+    bool pract = !!(le16_to_cpu(rw->control) & NVME_RW_PRINFO_PRACT);
 
-    if (nvme_ns_ext(ns) &&
-        !(ctrl & NVME_RW_PRINFO_PRACT && ns->lbaf.ms == 8)) {
+    if (nvme_ns_ext(ns) && !(pi && pract && ns->lbaf.ms == 8)) {
         return nvme_tx_interleaved(n, &req->sg, ptr, len, ns->lbasz,
                                    ns->lbaf.ms, 0, dir);
     }
@@ -1950,14 +1945,13 @@ static void nvme_verify_cb(void *opaque, int ret)
     BlockAcctStats *stats = blk_get_stats(blk);
     NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
     uint64_t slba = le64_to_cpu(rw->slba);
-    uint16_t ctrl = le16_to_cpu(rw->control);
+    uint8_t prinfo = NVME_RW_PRINFO(le16_to_cpu(rw->control));
     uint16_t apptag = le16_to_cpu(rw->apptag);
     uint16_t appmask = le16_to_cpu(rw->appmask);
     uint32_t reftag = le32_to_cpu(rw->reftag);
     uint16_t status;
 
-    trace_pci_nvme_verify_cb(nvme_cid(req), NVME_RW_PRINFO(ctrl), apptag,
-                             appmask, reftag);
+    trace_pci_nvme_verify_cb(nvme_cid(req), prinfo, apptag, appmask, reftag);
 
     if (ret) {
         block_acct_failed(stats, acct);
@@ -1977,7 +1971,7 @@ static void nvme_verify_cb(void *opaque, int ret)
 
         req->status = nvme_dif_check(ns, ctx->data.bounce, ctx->data.iov.size,
                                      ctx->mdata.bounce, ctx->mdata.iov.size,
-                                     ctrl, slba, apptag, appmask, &reftag);
+                                     prinfo, slba, apptag, appmask, &reftag);
     }
 
 out:
@@ -2183,8 +2177,8 @@ static void nvme_copy_in_complete(NvmeRequest *req)
     block_acct_done(blk_get_stats(ns->blkconf.blk), &req->acct);
 
     if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
-        uint16_t prinfor = (copy->control[0] >> 4) & 0xf;
-        uint16_t prinfow = (copy->control[2] >> 2) & 0xf;
+        uint8_t prinfor = (copy->control[0] >> 4) & 0xf;
+        uint8_t prinfow = (copy->control[2] >> 2) & 0xf;
         uint16_t nr = copy->nr + 1;
         NvmeCopySourceRange *range;
         uint64_t slba;
@@ -2195,13 +2189,6 @@ static void nvme_copy_in_complete(NvmeRequest *req)
         size_t len, mlen;
         int i;
 
-        /*
-         * The dif helpers expects prinfo to be similar to the control field of
-         * the NvmeRwCmd, so shift by 10 to fake it.
-         */
-        prinfor = prinfor << 10;
-        prinfow = prinfow << 10;
-
         for (i = 0; i < nr; i++) {
             range = &ctx->ranges[i];
             slba = le64_to_cpu(range->slba);
@@ -2342,7 +2329,7 @@ static void nvme_compare_mdata_cb(void *opaque, int ret)
     NvmeNamespace *ns = req->ns;
     NvmeCtrl *n = nvme_ctrl(req);
     NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
-    uint16_t ctrl = le16_to_cpu(rw->control);
+    uint8_t prinfo = NVME_RW_PRINFO(le16_to_cpu(rw->control));
     uint16_t apptag = le16_to_cpu(rw->apptag);
     uint16_t appmask = le16_to_cpu(rw->appmask);
     uint32_t reftag = le32_to_cpu(rw->reftag);
@@ -2378,7 +2365,7 @@ static void nvme_compare_mdata_cb(void *opaque, int ret)
         int16_t pil = 0;
 
         status = nvme_dif_check(ns, ctx->data.bounce, ctx->data.iov.size,
-                                ctx->mdata.bounce, ctx->mdata.iov.size, ctrl,
+                                ctx->mdata.bounce, ctx->mdata.iov.size, prinfo,
                                 slba, apptag, appmask, &reftag);
         if (status) {
             req->status = status;
@@ -2674,7 +2661,7 @@ static uint16_t nvme_verify(NvmeCtrl *n, NvmeRequest *req)
     uint32_t nlb = le16_to_cpu(rw->nlb) + 1;
     size_t len = nvme_l2b(ns, nlb);
     int64_t offset = nvme_l2b(ns, slba);
-    uint16_t ctrl = le16_to_cpu(rw->control);
+    uint8_t prinfo = NVME_RW_PRINFO(le16_to_cpu(rw->control));
     uint32_t reftag = le32_to_cpu(rw->reftag);
     NvmeBounceContext *ctx = NULL;
     uint16_t status;
@@ -2682,12 +2669,12 @@ static uint16_t nvme_verify(NvmeCtrl *n, NvmeRequest *req)
     trace_pci_nvme_verify(nvme_cid(req), nvme_nsid(ns), slba, nlb);
 
     if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
-        status = nvme_check_prinfo(ns, ctrl, slba, reftag);
+        status = nvme_check_prinfo(ns, prinfo, slba, reftag);
         if (status) {
             return status;
         }
 
-        if (ctrl & NVME_RW_PRINFO_PRACT) {
+        if (prinfo & NVME_PRINFO_PRACT) {
             return NVME_INVALID_PROT_INFO | NVME_DNR;
         }
     }
@@ -2731,13 +2718,8 @@ static uint16_t nvme_copy(NvmeCtrl *n, NvmeRequest *req)
 
     uint16_t nr = copy->nr + 1;
     uint8_t format = copy->control[0] & 0xf;
-
-    /*
-     * Shift the PRINFOR/PRINFOW values by 10 to allow reusing the
-     * NVME_RW_PRINFO constants.
-     */
-    uint16_t prinfor = ((copy->control[0] >> 4) & 0xf) << 10;
-    uint16_t prinfow = ((copy->control[2] >> 2) & 0xf) << 10;
+    uint8_t prinfor = (copy->control[0] >> 4) & 0xf;
+    uint8_t prinfow = (copy->control[2] >> 2) & 0xf;
 
     uint32_t nlb = 0;
     uint8_t *bounce = NULL, *bouncep = NULL;
@@ -2749,7 +2731,7 @@ static uint16_t nvme_copy(NvmeCtrl *n, NvmeRequest *req)
     trace_pci_nvme_copy(nvme_cid(req), nvme_nsid(ns), nr, format);
 
     if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps) &&
-        ((prinfor & NVME_RW_PRINFO_PRACT) != (prinfow & NVME_RW_PRINFO_PRACT))) {
+        ((prinfor & NVME_PRINFO_PRACT) != (prinfow & NVME_PRINFO_PRACT))) {
         return NVME_INVALID_FIELD | NVME_DNR;
     }
 
@@ -2886,7 +2868,7 @@ static uint16_t nvme_compare(NvmeCtrl *n, NvmeRequest *req)
     BlockBackend *blk = ns->blkconf.blk;
     uint64_t slba = le64_to_cpu(rw->slba);
     uint32_t nlb = le16_to_cpu(rw->nlb) + 1;
-    uint16_t ctrl = le16_to_cpu(rw->control);
+    uint8_t prinfo = NVME_RW_PRINFO(le16_to_cpu(rw->control));
     size_t data_len = nvme_l2b(ns, nlb);
     size_t len = data_len;
     int64_t offset = nvme_l2b(ns, slba);
@@ -2895,7 +2877,7 @@ static uint16_t nvme_compare(NvmeCtrl *n, NvmeRequest *req)
 
     trace_pci_nvme_compare(nvme_cid(req), nvme_nsid(ns), slba, nlb);
 
-    if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps) && (ctrl & NVME_RW_PRINFO_PRACT)) {
+    if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps) && (prinfo & NVME_PRINFO_PRACT)) {
         return NVME_INVALID_PROT_INFO | NVME_DNR;
     }
 
@@ -3083,7 +3065,7 @@ static uint16_t nvme_read(NvmeCtrl *n, NvmeRequest *req)
     NvmeNamespace *ns = req->ns;
     uint64_t slba = le64_to_cpu(rw->slba);
     uint32_t nlb = (uint32_t)le16_to_cpu(rw->nlb) + 1;
-    uint16_t ctrl = le16_to_cpu(rw->control);
+    uint8_t prinfo = NVME_RW_PRINFO(le16_to_cpu(rw->control));
     uint64_t data_size = nvme_l2b(ns, nlb);
     uint64_t mapped_size = data_size;
     uint64_t data_offset;
@@ -3094,7 +3076,7 @@ static uint16_t nvme_read(NvmeCtrl *n, NvmeRequest *req)
         mapped_size += nvme_m2b(ns, nlb);
 
         if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
-            bool pract = ctrl & NVME_RW_PRINFO_PRACT;
+            bool pract = prinfo & NVME_PRINFO_PRACT;
 
             if (pract && ns->lbaf.ms == 8) {
                 mapped_size = data_size;
@@ -3158,6 +3140,7 @@ static uint16_t nvme_do_write(NvmeCtrl *n, NvmeRequest *req, bool append,
     uint64_t slba = le64_to_cpu(rw->slba);
     uint32_t nlb = (uint32_t)le16_to_cpu(rw->nlb) + 1;
     uint16_t ctrl = le16_to_cpu(rw->control);
+    uint8_t prinfo = NVME_RW_PRINFO(ctrl);
     uint64_t data_size = nvme_l2b(ns, nlb);
     uint64_t mapped_size = data_size;
     uint64_t data_offset;
@@ -3170,7 +3153,7 @@ static uint16_t nvme_do_write(NvmeCtrl *n, NvmeRequest *req, bool append,
         mapped_size += nvme_m2b(ns, nlb);
 
         if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
-            bool pract = ctrl & NVME_RW_PRINFO_PRACT;
+            bool pract = prinfo & NVME_PRINFO_PRACT;
 
             if (pract && ns->lbaf.ms == 8) {
                 mapped_size -= nvme_m2b(ns, nlb);
diff --git a/hw/nvme/dif.c b/hw/nvme/dif.c
index f5f86f1c26ad..5dbd18b2a4a5 100644
--- a/hw/nvme/dif.c
+++ b/hw/nvme/dif.c
@@ -15,11 +15,11 @@
 #include "nvme.h"
 #include "trace.h"
 
-uint16_t nvme_check_prinfo(NvmeNamespace *ns, uint16_t ctrl, uint64_t slba,
+uint16_t nvme_check_prinfo(NvmeNamespace *ns, uint8_t prinfo, uint64_t slba,
                            uint32_t reftag)
 {
     if ((NVME_ID_NS_DPS_TYPE(ns->id_ns.dps) == NVME_ID_NS_DPS_TYPE_1) &&
-        (ctrl & NVME_RW_PRINFO_PRCHK_REF) && (slba & 0xffffffff) != reftag) {
+        (prinfo & NVME_PRINFO_PRCHK_REF) && (slba & 0xffffffff) != reftag) {
         return NVME_INVALID_PROT_INFO | NVME_DNR;
     }
 
@@ -73,7 +73,7 @@ void nvme_dif_pract_generate_dif(NvmeNamespace *ns, uint8_t *buf, size_t len,
 
 static uint16_t nvme_dif_prchk(NvmeNamespace *ns, NvmeDifTuple *dif,
                                uint8_t *buf, uint8_t *mbuf, size_t pil,
-                               uint16_t ctrl, uint16_t apptag,
+                               uint8_t prinfo, uint16_t apptag,
                                uint16_t appmask, uint32_t reftag)
 {
     switch (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
@@ -95,7 +95,7 @@ static uint16_t nvme_dif_prchk(NvmeNamespace *ns, NvmeDifTuple *dif,
         return NVME_SUCCESS;
     }
 
-    if (ctrl & NVME_RW_PRINFO_PRCHK_GUARD) {
+    if (prinfo & NVME_PRINFO_PRCHK_GUARD) {
         uint16_t crc = crc_t10dif(0x0, buf, ns->lbasz);
 
         if (pil) {
@@ -109,7 +109,7 @@ static uint16_t nvme_dif_prchk(NvmeNamespace *ns, NvmeDifTuple *dif,
         }
     }
 
-    if (ctrl & NVME_RW_PRINFO_PRCHK_APP) {
+    if (prinfo & NVME_PRINFO_PRCHK_APP) {
         trace_pci_nvme_dif_prchk_apptag(be16_to_cpu(dif->apptag), apptag,
                                         appmask);
 
@@ -118,7 +118,7 @@ static uint16_t nvme_dif_prchk(NvmeNamespace *ns, NvmeDifTuple *dif,
         }
     }
 
-    if (ctrl & NVME_RW_PRINFO_PRCHK_REF) {
+    if (prinfo & NVME_PRINFO_PRCHK_REF) {
         trace_pci_nvme_dif_prchk_reftag(be32_to_cpu(dif->reftag), reftag);
 
         if (be32_to_cpu(dif->reftag) != reftag) {
@@ -130,7 +130,7 @@ static uint16_t nvme_dif_prchk(NvmeNamespace *ns, NvmeDifTuple *dif,
 }
 
 uint16_t nvme_dif_check(NvmeNamespace *ns, uint8_t *buf, size_t len,
-                        uint8_t *mbuf, size_t mlen, uint16_t ctrl,
+                        uint8_t *mbuf, size_t mlen, uint8_t prinfo,
                         uint64_t slba, uint16_t apptag,
                         uint16_t appmask, uint32_t *reftag)
 {
@@ -138,7 +138,7 @@ uint16_t nvme_dif_check(NvmeNamespace *ns, uint8_t *buf, size_t len,
     int16_t pil = 0;
     uint16_t status;
 
-    status = nvme_check_prinfo(ns, ctrl, slba, *reftag);
+    status = nvme_check_prinfo(ns, prinfo, slba, *reftag);
     if (status) {
         return status;
     }
@@ -147,12 +147,12 @@ uint16_t nvme_dif_check(NvmeNamespace *ns, uint8_t *buf, size_t len,
         pil = ns->lbaf.ms - sizeof(NvmeDifTuple);
     }
 
-    trace_pci_nvme_dif_check(NVME_RW_PRINFO(ctrl), ns->lbasz + pil);
+    trace_pci_nvme_dif_check(prinfo, ns->lbasz + pil);
 
     for (; buf < end; buf += ns->lbasz, mbuf += ns->lbaf.ms) {
         NvmeDifTuple *dif = (NvmeDifTuple *)(mbuf + pil);
 
-        status = nvme_dif_prchk(ns, dif, buf, mbuf, pil, ctrl, apptag,
+        status = nvme_dif_prchk(ns, dif, buf, mbuf, pil, prinfo, apptag,
                                 appmask, *reftag);
         if (status) {
             return status;
@@ -248,14 +248,14 @@ static void nvme_dif_rw_check_cb(void *opaque, int ret)
     NvmeCtrl *n = nvme_ctrl(req);
     NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
     uint64_t slba = le64_to_cpu(rw->slba);
-    uint16_t ctrl = le16_to_cpu(rw->control);
+    uint8_t prinfo = NVME_RW_PRINFO(le16_to_cpu(rw->control));
     uint16_t apptag = le16_to_cpu(rw->apptag);
     uint16_t appmask = le16_to_cpu(rw->appmask);
     uint32_t reftag = le32_to_cpu(rw->reftag);
     uint16_t status;
 
-    trace_pci_nvme_dif_rw_check_cb(nvme_cid(req), NVME_RW_PRINFO(ctrl), apptag,
-                                   appmask, reftag);
+    trace_pci_nvme_dif_rw_check_cb(nvme_cid(req), prinfo, apptag, appmask,
+                                   reftag);
 
     if (ret) {
         goto out;
@@ -269,7 +269,7 @@ static void nvme_dif_rw_check_cb(void *opaque, int ret)
     }
 
     status = nvme_dif_check(ns, ctx->data.bounce, ctx->data.iov.size,
-                            ctx->mdata.bounce, ctx->mdata.iov.size, ctrl,
+                            ctx->mdata.bounce, ctx->mdata.iov.size, prinfo,
                             slba, apptag, appmask, &reftag);
     if (status) {
         req->status = status;
@@ -283,7 +283,7 @@ static void nvme_dif_rw_check_cb(void *opaque, int ret)
         goto out;
     }
 
-    if (ctrl & NVME_RW_PRINFO_PRACT && ns->lbaf.ms == 8) {
+    if (prinfo & NVME_PRINFO_PRACT && ns->lbaf.ms == 8) {
         goto out;
     }
 
@@ -364,15 +364,15 @@ uint16_t nvme_dif_rw(NvmeCtrl *n, NvmeRequest *req)
     size_t mlen = nvme_m2b(ns, nlb);
     size_t mapped_len = len;
     int64_t offset = nvme_l2b(ns, slba);
-    uint16_t ctrl = le16_to_cpu(rw->control);
+    uint8_t prinfo = NVME_RW_PRINFO(le16_to_cpu(rw->control));
     uint16_t apptag = le16_to_cpu(rw->apptag);
     uint16_t appmask = le16_to_cpu(rw->appmask);
     uint32_t reftag = le32_to_cpu(rw->reftag);
-    bool pract = !!(ctrl & NVME_RW_PRINFO_PRACT);
+    bool pract = !!(prinfo & NVME_PRINFO_PRACT);
     NvmeBounceContext *ctx;
     uint16_t status;
 
-    trace_pci_nvme_dif_rw(pract, NVME_RW_PRINFO(ctrl));
+    trace_pci_nvme_dif_rw(pract, prinfo);
 
     ctx = g_new0(NvmeBounceContext, 1);
     ctx->req = req;
@@ -380,7 +380,7 @@ uint16_t nvme_dif_rw(NvmeCtrl *n, NvmeRequest *req)
     if (wrz) {
         BdrvRequestFlags flags = BDRV_REQ_MAY_UNMAP;
 
-        if (ctrl & NVME_RW_PRINFO_PRCHK_MASK) {
+        if (prinfo & NVME_PRINFO_PRCHK_MASK) {
             status = NVME_INVALID_PROT_INFO | NVME_DNR;
             goto err;
         }
@@ -389,7 +389,7 @@ uint16_t nvme_dif_rw(NvmeCtrl *n, NvmeRequest *req)
             uint8_t *mbuf, *end;
             int16_t pil = ns->lbaf.ms - sizeof(NvmeDifTuple);
 
-            status = nvme_check_prinfo(ns, ctrl, slba, reftag);
+            status = nvme_check_prinfo(ns, prinfo, slba, reftag);
             if (status) {
                 goto err;
             }
@@ -469,7 +469,7 @@ uint16_t nvme_dif_rw(NvmeCtrl *n, NvmeRequest *req)
         }
     }
 
-    status = nvme_check_prinfo(ns, ctrl, slba, reftag);
+    status = nvme_check_prinfo(ns, prinfo, slba, reftag);
     if (status) {
         goto err;
     }
@@ -481,7 +481,7 @@ uint16_t nvme_dif_rw(NvmeCtrl *n, NvmeRequest *req)
                                     apptag, &reftag);
     } else {
         status = nvme_dif_check(ns, ctx->data.bounce, ctx->data.iov.size,
-                                ctx->mdata.bounce, ctx->mdata.iov.size, ctrl,
+                                ctx->mdata.bounce, ctx->mdata.iov.size, prinfo,
                                 slba, apptag, appmask, &reftag);
         if (status) {
             goto err;
-- 
2.32.0



^ permalink raw reply related	[flat|nested] 16+ messages in thread

* [PATCH v2 07/11] hw/nvme: add dw0/1 to the req completion trace event
  2021-06-17 19:06 [PATCH v2 00/11] hw/nvme: reimplement all multi-aio commands with custom aiocbs Klaus Jensen
                   ` (5 preceding siblings ...)
  2021-06-17 19:06 ` [PATCH v2 06/11] hw/nvme: use prinfo directly in nvme_check_prinfo and nvme_dif_check Klaus Jensen
@ 2021-06-17 19:06 ` Klaus Jensen
  2021-06-17 19:06 ` [PATCH v2 08/11] hw/nvme: reimplement the copy command to allow aio cancellation Klaus Jensen
                   ` (5 subsequent siblings)
  12 siblings, 0 replies; 16+ messages in thread
From: Klaus Jensen @ 2021-06-17 19:06 UTC (permalink / raw)
  To: qemu-devel
  Cc: Fam Zheng, Kevin Wolf, Vladimir Sementsov-Ogievskiy, qemu-block,
	Klaus Jensen, Max Reitz, Keith Busch, Stefan Hajnoczi,
	Klaus Jensen

From: Klaus Jensen <k.jensen@samsung.com>

Some commands report additional useful information in dw0 and dw1 of the
completion queue entry.

Add them to the trace.

Signed-off-by: Klaus Jensen <k.jensen@samsung.com>
---
 hw/nvme/ctrl.c       | 2 ++
 hw/nvme/trace-events | 2 +-
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c
index aca6cf068e2b..cfacf70c1979 100644
--- a/hw/nvme/ctrl.c
+++ b/hw/nvme/ctrl.c
@@ -1290,6 +1290,8 @@ static void nvme_enqueue_req_completion(NvmeCQueue *cq, NvmeRequest *req)
 {
     assert(cq->cqid == req->sq->cqid);
     trace_pci_nvme_enqueue_req_completion(nvme_cid(req), cq->cqid,
+                                          le32_to_cpu(req->cqe.result),
+                                          le32_to_cpu(req->cqe.dw1),
                                           req->status);
 
     if (req->status) {
diff --git a/hw/nvme/trace-events b/hw/nvme/trace-events
index eea4e31e46c4..a3e11346865e 100644
--- a/hw/nvme/trace-events
+++ b/hw/nvme/trace-events
@@ -80,7 +80,7 @@ pci_nvme_enqueue_event(uint8_t typ, uint8_t info, uint8_t log_page) "type 0x%"PR
 pci_nvme_enqueue_event_noqueue(int queued) "queued %d"
 pci_nvme_enqueue_event_masked(uint8_t typ) "type 0x%"PRIx8""
 pci_nvme_no_outstanding_aers(void) "ignoring event; no outstanding AERs"
-pci_nvme_enqueue_req_completion(uint16_t cid, uint16_t cqid, uint16_t status) "cid %"PRIu16" cqid %"PRIu16" status 0x%"PRIx16""
+pci_nvme_enqueue_req_completion(uint16_t cid, uint16_t cqid, uint32_t dw0, uint32_t dw1, uint16_t status) "cid %"PRIu16" cqid %"PRIu16" dw0 0x%"PRIx32" dw1 0x%"PRIx32" status 0x%"PRIx16""
 pci_nvme_mmio_read(uint64_t addr, unsigned size) "addr 0x%"PRIx64" size %d"
 pci_nvme_mmio_write(uint64_t addr, uint64_t data, unsigned size) "addr 0x%"PRIx64" data 0x%"PRIx64" size %d"
 pci_nvme_mmio_doorbell_cq(uint16_t cqid, uint16_t new_head) "cqid %"PRIu16" new_head %"PRIu16""
-- 
2.32.0



^ permalink raw reply related	[flat|nested] 16+ messages in thread

* [PATCH v2 08/11] hw/nvme: reimplement the copy command to allow aio cancellation
  2021-06-17 19:06 [PATCH v2 00/11] hw/nvme: reimplement all multi-aio commands with custom aiocbs Klaus Jensen
                   ` (6 preceding siblings ...)
  2021-06-17 19:06 ` [PATCH v2 07/11] hw/nvme: add dw0/1 to the req completion trace event Klaus Jensen
@ 2021-06-17 19:06 ` Klaus Jensen
  2021-06-17 19:06 ` [PATCH v2 09/11] hw/nvme: reimplement zone reset to allow cancellation Klaus Jensen
                   ` (4 subsequent siblings)
  12 siblings, 0 replies; 16+ messages in thread
From: Klaus Jensen @ 2021-06-17 19:06 UTC (permalink / raw)
  To: qemu-devel
  Cc: Fam Zheng, Kevin Wolf, Vladimir Sementsov-Ogievskiy, qemu-block,
	Klaus Jensen, Max Reitz, Keith Busch, Stefan Hajnoczi,
	Klaus Jensen

From: Klaus Jensen <k.jensen@samsung.com>

Before this patch the code would issue several aios simultaneously
without saving a reference to the aiocb. Without the aiocb reference the
individual copies cannot be canceled.

Fix this by issuing copies of the ranges one after another.

Signed-off-by: Klaus Jensen <k.jensen@samsung.com>
---
 hw/nvme/ctrl.c       | 726 +++++++++++++++++++++++--------------------
 hw/nvme/trace-events |   3 +-
 2 files changed, 394 insertions(+), 335 deletions(-)

diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c
index cfacf70c1979..012b0126057b 100644
--- a/hw/nvme/ctrl.c
+++ b/hw/nvme/ctrl.c
@@ -2093,226 +2093,6 @@ out:
     nvme_aio_zone_reset_complete_cb(opaque, ret);
 }
 
-struct nvme_copy_ctx {
-    int copies;
-    uint8_t *bounce;
-    uint8_t *mbounce;
-    uint32_t nlb;
-    NvmeCopySourceRange *ranges;
-};
-
-struct nvme_copy_in_ctx {
-    NvmeRequest *req;
-    QEMUIOVector iov;
-    NvmeCopySourceRange *range;
-};
-
-static void nvme_copy_complete_cb(void *opaque, int ret)
-{
-    NvmeRequest *req = opaque;
-    NvmeNamespace *ns = req->ns;
-    struct nvme_copy_ctx *ctx = req->opaque;
-
-    if (ret) {
-        block_acct_failed(blk_get_stats(ns->blkconf.blk), &req->acct);
-        nvme_aio_err(req, ret);
-        goto out;
-    }
-
-    block_acct_done(blk_get_stats(ns->blkconf.blk), &req->acct);
-
-out:
-    if (ns->params.zoned) {
-        NvmeCopyCmd *copy = (NvmeCopyCmd *)&req->cmd;
-        uint64_t sdlba = le64_to_cpu(copy->sdlba);
-        NvmeZone *zone = nvme_get_zone_by_slba(ns, sdlba);
-
-        nvme_advance_zone_wp(ns, zone, ctx->nlb);
-    }
-
-    g_free(ctx->bounce);
-    g_free(ctx->mbounce);
-    g_free(ctx);
-
-    nvme_enqueue_req_completion(nvme_cq(req), req);
-}
-
-static void nvme_copy_cb(void *opaque, int ret)
-{
-    NvmeRequest *req = opaque;
-    NvmeNamespace *ns = req->ns;
-    struct nvme_copy_ctx *ctx = req->opaque;
-
-    trace_pci_nvme_copy_cb(nvme_cid(req));
-
-    if (ret) {
-        goto out;
-    }
-
-    if (ns->lbaf.ms) {
-        NvmeCopyCmd *copy = (NvmeCopyCmd *)&req->cmd;
-        uint64_t sdlba = le64_to_cpu(copy->sdlba);
-        int64_t offset = nvme_moff(ns, sdlba);
-
-        qemu_iovec_reset(&req->sg.iov);
-        qemu_iovec_add(&req->sg.iov, ctx->mbounce, nvme_m2b(ns, ctx->nlb));
-
-        req->aiocb = blk_aio_pwritev(ns->blkconf.blk, offset, &req->sg.iov, 0,
-                                     nvme_copy_complete_cb, req);
-        return;
-    }
-
-out:
-    nvme_copy_complete_cb(opaque, ret);
-}
-
-static void nvme_copy_in_complete(NvmeRequest *req)
-{
-    NvmeNamespace *ns = req->ns;
-    NvmeCopyCmd *copy = (NvmeCopyCmd *)&req->cmd;
-    struct nvme_copy_ctx *ctx = req->opaque;
-    uint64_t sdlba = le64_to_cpu(copy->sdlba);
-    uint16_t status;
-
-    trace_pci_nvme_copy_in_complete(nvme_cid(req));
-
-    block_acct_done(blk_get_stats(ns->blkconf.blk), &req->acct);
-
-    if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
-        uint8_t prinfor = (copy->control[0] >> 4) & 0xf;
-        uint8_t prinfow = (copy->control[2] >> 2) & 0xf;
-        uint16_t nr = copy->nr + 1;
-        NvmeCopySourceRange *range;
-        uint64_t slba;
-        uint32_t nlb;
-        uint16_t apptag, appmask;
-        uint32_t reftag;
-        uint8_t *buf = ctx->bounce, *mbuf = ctx->mbounce;
-        size_t len, mlen;
-        int i;
-
-        for (i = 0; i < nr; i++) {
-            range = &ctx->ranges[i];
-            slba = le64_to_cpu(range->slba);
-            nlb = le16_to_cpu(range->nlb) + 1;
-            len = nvme_l2b(ns, nlb);
-            mlen = nvme_m2b(ns, nlb);
-            apptag = le16_to_cpu(range->apptag);
-            appmask = le16_to_cpu(range->appmask);
-            reftag = le32_to_cpu(range->reftag);
-
-            status = nvme_dif_check(ns, buf, len, mbuf, mlen, prinfor, slba,
-                                    apptag, appmask, &reftag);
-            if (status) {
-                goto invalid;
-            }
-
-            buf += len;
-            mbuf += mlen;
-        }
-
-        apptag = le16_to_cpu(copy->apptag);
-        appmask = le16_to_cpu(copy->appmask);
-        reftag = le32_to_cpu(copy->reftag);
-
-        if (prinfow & NVME_RW_PRINFO_PRACT) {
-            size_t len = nvme_l2b(ns, ctx->nlb);
-            size_t mlen = nvme_m2b(ns, ctx->nlb);
-
-            status = nvme_check_prinfo(ns, prinfow, sdlba, reftag);
-            if (status) {
-                goto invalid;
-            }
-
-            nvme_dif_pract_generate_dif(ns, ctx->bounce, len, ctx->mbounce,
-                                        mlen, apptag, &reftag);
-        } else {
-            status = nvme_dif_check(ns, ctx->bounce, len, ctx->mbounce, mlen,
-                                    prinfow, sdlba, apptag, appmask, &reftag);
-            if (status) {
-                goto invalid;
-            }
-        }
-    }
-
-    status = nvme_check_bounds(ns, sdlba, ctx->nlb);
-    if (status) {
-        goto invalid;
-    }
-
-    if (ns->params.zoned) {
-        NvmeZone *zone = nvme_get_zone_by_slba(ns, sdlba);
-
-        status = nvme_check_zone_write(ns, zone, sdlba, ctx->nlb);
-        if (status) {
-            goto invalid;
-        }
-
-        status = nvme_zrm_auto(nvme_ctrl(req), ns, zone);
-        if (status) {
-            goto invalid;
-        }
-
-        zone->w_ptr += ctx->nlb;
-    }
-
-    qemu_iovec_init(&req->sg.iov, 1);
-    qemu_iovec_add(&req->sg.iov, ctx->bounce, nvme_l2b(ns, ctx->nlb));
-
-    block_acct_start(blk_get_stats(ns->blkconf.blk), &req->acct, 0,
-                     BLOCK_ACCT_WRITE);
-
-    req->aiocb = blk_aio_pwritev(ns->blkconf.blk, nvme_l2b(ns, sdlba),
-                                 &req->sg.iov, 0, nvme_copy_cb, req);
-
-    return;
-
-invalid:
-    req->status = status;
-
-    g_free(ctx->bounce);
-    g_free(ctx);
-
-    nvme_enqueue_req_completion(nvme_cq(req), req);
-}
-
-static void nvme_aio_copy_in_cb(void *opaque, int ret)
-{
-    struct nvme_copy_in_ctx *in_ctx = opaque;
-    NvmeRequest *req = in_ctx->req;
-    NvmeNamespace *ns = req->ns;
-    struct nvme_copy_ctx *ctx = req->opaque;
-
-    qemu_iovec_destroy(&in_ctx->iov);
-    g_free(in_ctx);
-
-    trace_pci_nvme_aio_copy_in_cb(nvme_cid(req));
-
-    if (ret) {
-        nvme_aio_err(req, ret);
-    }
-
-    ctx->copies--;
-
-    if (ctx->copies) {
-        return;
-    }
-
-    if (req->status) {
-        block_acct_failed(blk_get_stats(ns->blkconf.blk), &req->acct);
-
-        g_free(ctx->bounce);
-        g_free(ctx->mbounce);
-        g_free(ctx);
-
-        nvme_enqueue_req_completion(nvme_cq(req), req);
-
-        return;
-    }
-
-    nvme_copy_in_complete(req);
-}
-
 struct nvme_compare_ctx {
     struct {
         QEMUIOVector iov;
@@ -2713,153 +2493,433 @@ static uint16_t nvme_verify(NvmeCtrl *n, NvmeRequest *req)
     return NVME_NO_COMPLETE;
 }
 
+typedef struct NvmeCopyAIOCB {
+    BlockAIOCB common;
+    BlockAIOCB *aiocb;
+    NvmeRequest *req;
+    QEMUBH *bh;
+    int ret;
+
+    NvmeCopySourceRange *ranges;
+    int nr;
+    int idx;
+
+    uint8_t *bounce;
+    QEMUIOVector iov;
+    struct {
+        BlockAcctCookie read;
+        BlockAcctCookie write;
+    } acct;
+
+    uint32_t reftag;
+    uint64_t slba;
+
+    NvmeZone *zone;
+} NvmeCopyAIOCB;
+
+static void nvme_copy_cancel(BlockAIOCB *aiocb)
+{
+    NvmeCopyAIOCB *iocb = container_of(aiocb, NvmeCopyAIOCB, common);
+
+    iocb->ret = -ECANCELED;
+
+    if (iocb->aiocb) {
+        blk_aio_cancel_async(iocb->aiocb);
+        iocb->aiocb = NULL;
+    }
+}
+
+static const AIOCBInfo nvme_copy_aiocb_info = {
+    .aiocb_size   = sizeof(NvmeCopyAIOCB),
+    .cancel_async = nvme_copy_cancel,
+};
+
+static void nvme_copy_bh(void *opaque)
+{
+    NvmeCopyAIOCB *iocb = opaque;
+    NvmeRequest *req = iocb->req;
+    NvmeNamespace *ns = req->ns;
+    BlockAcctStats *stats = blk_get_stats(ns->blkconf.blk);
+
+    if (iocb->idx != iocb->nr) {
+        req->cqe.result = cpu_to_le32(iocb->idx);
+    }
+
+    qemu_iovec_destroy(&iocb->iov);
+    g_free(iocb->bounce);
+
+    qemu_bh_delete(iocb->bh);
+    iocb->bh = NULL;
+
+    if (iocb->ret < 0) {
+        block_acct_failed(stats, &iocb->acct.read);
+        block_acct_failed(stats, &iocb->acct.write);
+    } else {
+        block_acct_done(stats, &iocb->acct.read);
+        block_acct_done(stats, &iocb->acct.write);
+    }
+
+    iocb->common.cb(iocb->common.opaque, iocb->ret);
+    qemu_aio_unref(iocb);
+}
+
+static void nvme_copy_cb(void *opaque, int ret);
+
+static void nvme_copy_out_completed_cb(void *opaque, int ret)
+{
+    NvmeCopyAIOCB *iocb = opaque;
+    NvmeRequest *req = iocb->req;
+    NvmeNamespace *ns = req->ns;
+    NvmeCopySourceRange *range = &iocb->ranges[iocb->idx];
+    uint32_t nlb = le32_to_cpu(range->nlb) + 1;
+
+    if (ret < 0) {
+        iocb->ret = ret;
+        goto out;
+    } else if (iocb->ret < 0) {
+        goto out;
+    }
+
+    if (ns->params.zoned) {
+        nvme_advance_zone_wp(ns, iocb->zone, nlb);
+    }
+
+    iocb->idx++;
+    iocb->slba += nlb;
+out:
+    nvme_copy_cb(iocb, iocb->ret);
+}
+
+static void nvme_copy_out_cb(void *opaque, int ret)
+{
+    NvmeCopyAIOCB *iocb = opaque;
+    NvmeRequest *req = iocb->req;
+    NvmeNamespace *ns = req->ns;
+    NvmeCopySourceRange *range;
+    uint32_t nlb;
+    size_t mlen;
+    uint8_t *mbounce;
+
+    if (ret < 0) {
+        iocb->ret = ret;
+        goto out;
+    } else if (iocb->ret < 0) {
+        goto out;
+    }
+
+    if (!ns->lbaf.ms) {
+        nvme_copy_out_completed_cb(iocb, 0);
+        return;
+    }
+
+    range = &iocb->ranges[iocb->idx];
+    nlb = le32_to_cpu(range->nlb) + 1;
+
+    mlen = nvme_m2b(ns, nlb);
+    mbounce = iocb->bounce + nvme_l2b(ns, nlb);
+
+    qemu_iovec_reset(&iocb->iov);
+    qemu_iovec_add(&iocb->iov, mbounce, mlen);
+
+    iocb->aiocb = blk_aio_pwritev(ns->blkconf.blk, nvme_moff(ns, iocb->slba),
+                                  &iocb->iov, 0, nvme_copy_out_completed_cb,
+                                  iocb);
+
+    return;
+
+out:
+    nvme_copy_cb(iocb, ret);
+}
+
+static void nvme_copy_in_completed_cb(void *opaque, int ret)
+{
+    NvmeCopyAIOCB *iocb = opaque;
+    NvmeRequest *req = iocb->req;
+    NvmeNamespace *ns = req->ns;
+    NvmeCopySourceRange *range;
+    uint32_t nlb;
+    size_t len;
+    uint16_t status;
+
+    if (ret < 0) {
+        iocb->ret = ret;
+        goto out;
+    } else if (iocb->ret < 0) {
+        goto out;
+    }
+
+    range = &iocb->ranges[iocb->idx];
+    nlb = le32_to_cpu(range->nlb) + 1;
+    len = nvme_l2b(ns, nlb);
+
+    trace_pci_nvme_copy_out(iocb->slba, nlb);
+
+    if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
+        NvmeCopyCmd *copy = (NvmeCopyCmd *)&req->cmd;
+
+        uint16_t prinfor = ((copy->control[0] >> 4) & 0xf);
+        uint16_t prinfow = ((copy->control[2] >> 2) & 0xf);
+
+        uint16_t apptag = le16_to_cpu(range->apptag);
+        uint16_t appmask = le16_to_cpu(range->appmask);
+        uint32_t reftag = le32_to_cpu(range->reftag);
+
+        uint64_t slba = le64_to_cpu(range->slba);
+        size_t mlen = nvme_m2b(ns, nlb);
+        uint8_t *mbounce = iocb->bounce + nvme_l2b(ns, nlb);
+
+        status = nvme_dif_check(ns, iocb->bounce, len, mbounce, mlen, prinfor,
+                                slba, apptag, appmask, &reftag);
+        if (status) {
+            goto invalid;
+        }
+
+        apptag = le16_to_cpu(copy->apptag);
+        appmask = le16_to_cpu(copy->appmask);
+
+        if (prinfow & NVME_PRINFO_PRACT) {
+            status = nvme_check_prinfo(ns, prinfow, iocb->slba, iocb->reftag);
+            if (status) {
+                goto invalid;
+            }
+
+            nvme_dif_pract_generate_dif(ns, iocb->bounce, len, mbounce, mlen,
+                                        apptag, &iocb->reftag);
+        } else {
+            status = nvme_dif_check(ns, iocb->bounce, len, mbounce, mlen,
+                                    prinfow, iocb->slba, apptag, appmask,
+                                    &iocb->reftag);
+            if (status) {
+                goto invalid;
+            }
+        }
+    }
+
+    status = nvme_check_bounds(ns, iocb->slba, nlb);
+    if (status) {
+        goto invalid;
+    }
+
+    if (ns->params.zoned) {
+        status = nvme_check_zone_write(ns, iocb->zone, iocb->slba, nlb);
+        if (status) {
+            goto invalid;
+        }
+
+        iocb->zone->w_ptr += nlb;
+    }
+
+    qemu_iovec_reset(&iocb->iov);
+    qemu_iovec_add(&iocb->iov, iocb->bounce, len);
+
+    iocb->aiocb = blk_aio_pwritev(ns->blkconf.blk, nvme_l2b(ns, iocb->slba),
+                                  &iocb->iov, 0, nvme_copy_out_cb, iocb);
+
+    return;
+
+invalid:
+    req->status = status;
+    iocb->aiocb = NULL;
+    if (iocb->bh) {
+        qemu_bh_schedule(iocb->bh);
+    }
+
+    return;
+
+out:
+    nvme_copy_cb(iocb, ret);
+}
+
+static void nvme_copy_in_cb(void *opaque, int ret)
+{
+    NvmeCopyAIOCB *iocb = opaque;
+    NvmeRequest *req = iocb->req;
+    NvmeNamespace *ns = req->ns;
+    NvmeCopySourceRange *range;
+    uint64_t slba;
+    uint32_t nlb;
+
+    if (ret < 0) {
+        iocb->ret = ret;
+        goto out;
+    } else if (iocb->ret < 0) {
+        goto out;
+    }
+
+    if (!ns->lbaf.ms) {
+        nvme_copy_in_completed_cb(iocb, 0);
+        return;
+    }
+
+    range = &iocb->ranges[iocb->idx];
+    slba = le64_to_cpu(range->slba);
+    nlb = le32_to_cpu(range->nlb) + 1;
+
+    qemu_iovec_reset(&iocb->iov);
+    qemu_iovec_add(&iocb->iov, iocb->bounce + nvme_l2b(ns, nlb),
+                   nvme_m2b(ns, nlb));
+
+    iocb->aiocb = blk_aio_preadv(ns->blkconf.blk, nvme_moff(ns, slba),
+                                 &iocb->iov, 0, nvme_copy_in_completed_cb,
+                                 iocb);
+    return;
+
+out:
+    nvme_copy_cb(iocb, iocb->ret);
+}
+
+static void nvme_copy_cb(void *opaque, int ret)
+{
+    NvmeCopyAIOCB *iocb = opaque;
+    NvmeRequest *req = iocb->req;
+    NvmeNamespace *ns = req->ns;
+    NvmeCopySourceRange *range;
+    uint64_t slba;
+    uint32_t nlb;
+    size_t len;
+    uint16_t status;
+
+    if (ret < 0) {
+        iocb->ret = ret;
+        goto done;
+    } else if (iocb->ret < 0) {
+        goto done;
+    }
+
+    if (iocb->idx == iocb->nr) {
+        goto done;
+    }
+
+    range = &iocb->ranges[iocb->idx];
+    slba = le64_to_cpu(range->slba);
+    nlb = le32_to_cpu(range->nlb) + 1;
+    len = nvme_l2b(ns, nlb);
+
+    trace_pci_nvme_copy_source_range(slba, nlb);
+
+    if (nlb > le16_to_cpu(ns->id_ns.mssrl)) {
+        status = NVME_CMD_SIZE_LIMIT | NVME_DNR;
+        goto invalid;
+    }
+
+    status = nvme_check_bounds(ns, slba, nlb);
+    if (status) {
+        goto invalid;
+    }
+
+    if (NVME_ERR_REC_DULBE(ns->features.err_rec)) {
+        status = nvme_check_dulbe(ns, slba, nlb);
+        if (status) {
+            goto invalid;
+        }
+    }
+
+    if (ns->params.zoned) {
+        status = nvme_check_zone_read(ns, slba, nlb);
+        if (status) {
+            goto invalid;
+        }
+    }
+
+    qemu_iovec_reset(&iocb->iov);
+    qemu_iovec_add(&iocb->iov, iocb->bounce, len);
+
+    iocb->aiocb = blk_aio_preadv(ns->blkconf.blk, nvme_l2b(ns, slba),
+                                 &iocb->iov, 0, nvme_copy_in_cb, iocb);
+    return;
+
+invalid:
+    req->status = status;
+done:
+    iocb->aiocb = NULL;
+    if (iocb->bh) {
+        qemu_bh_schedule(iocb->bh);
+    }
+}
+
+
 static uint16_t nvme_copy(NvmeCtrl *n, NvmeRequest *req)
 {
     NvmeNamespace *ns = req->ns;
     NvmeCopyCmd *copy = (NvmeCopyCmd *)&req->cmd;
-
+    NvmeCopyAIOCB *iocb = blk_aio_get(&nvme_copy_aiocb_info, ns->blkconf.blk,
+                                      nvme_misc_cb, req);
     uint16_t nr = copy->nr + 1;
     uint8_t format = copy->control[0] & 0xf;
-    uint8_t prinfor = (copy->control[0] >> 4) & 0xf;
-    uint8_t prinfow = (copy->control[2] >> 2) & 0xf;
+    uint16_t prinfor = ((copy->control[0] >> 4) & 0xf);
+    uint16_t prinfow = ((copy->control[2] >> 2) & 0xf);
 
-    uint32_t nlb = 0;
-    uint8_t *bounce = NULL, *bouncep = NULL;
-    uint8_t *mbounce = NULL, *mbouncep = NULL;
-    struct nvme_copy_ctx *ctx;
     uint16_t status;
-    int i;
 
     trace_pci_nvme_copy(nvme_cid(req), nvme_nsid(ns), nr, format);
 
+    iocb->ranges = NULL;
+    iocb->zone = NULL;
+
     if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps) &&
         ((prinfor & NVME_PRINFO_PRACT) != (prinfow & NVME_PRINFO_PRACT))) {
-        return NVME_INVALID_FIELD | NVME_DNR;
+        status = NVME_INVALID_FIELD | NVME_DNR;
+        goto invalid;
     }
 
     if (!(n->id_ctrl.ocfs & (1 << format))) {
         trace_pci_nvme_err_copy_invalid_format(format);
-        return NVME_INVALID_FIELD | NVME_DNR;
+        status = NVME_INVALID_FIELD | NVME_DNR;
+        goto invalid;
     }
 
     if (nr > ns->id_ns.msrc + 1) {
-        return NVME_CMD_SIZE_LIMIT | NVME_DNR;
-    }
-
-    ctx = g_new(struct nvme_copy_ctx, 1);
-    ctx->ranges = g_new(NvmeCopySourceRange, nr);
-
-    status = nvme_h2c(n, (uint8_t *)ctx->ranges,
-                      nr * sizeof(NvmeCopySourceRange), req);
-    if (status) {
-        goto out;
-    }
-
-    for (i = 0; i < nr; i++) {
-        uint64_t slba = le64_to_cpu(ctx->ranges[i].slba);
-        uint32_t _nlb = le16_to_cpu(ctx->ranges[i].nlb) + 1;
-
-        if (_nlb > le16_to_cpu(ns->id_ns.mssrl)) {
-            status = NVME_CMD_SIZE_LIMIT | NVME_DNR;
-            goto out;
-        }
-
-        status = nvme_check_bounds(ns, slba, _nlb);
-        if (status) {
-            goto out;
-        }
-
-        if (NVME_ERR_REC_DULBE(ns->features.err_rec)) {
-            status = nvme_check_dulbe(ns, slba, _nlb);
-            if (status) {
-                goto out;
-            }
-        }
-
-        if (ns->params.zoned) {
-            status = nvme_check_zone_read(ns, slba, _nlb);
-            if (status) {
-                goto out;
-            }
-        }
-
-        nlb += _nlb;
-    }
-
-    if (nlb > le32_to_cpu(ns->id_ns.mcl)) {
         status = NVME_CMD_SIZE_LIMIT | NVME_DNR;
-        goto out;
+        goto invalid;
     }
 
-    bounce = bouncep = g_malloc(nvme_l2b(ns, nlb));
-    if (ns->lbaf.ms) {
-        mbounce = mbouncep = g_malloc(nvme_m2b(ns, nlb));
+    iocb->ranges = g_new(NvmeCopySourceRange, nr);
+
+    status = nvme_h2c(n, (uint8_t *)iocb->ranges,
+                      sizeof(NvmeCopySourceRange) * nr, req);
+    if (status) {
+        goto invalid;
     }
 
-    block_acct_start(blk_get_stats(ns->blkconf.blk), &req->acct, 0,
-                     BLOCK_ACCT_READ);
+    iocb->slba = le64_to_cpu(copy->sdlba);
 
-    ctx->bounce = bounce;
-    ctx->mbounce = mbounce;
-    ctx->nlb = nlb;
-    ctx->copies = 1;
+    if (ns->params.zoned) {
+        iocb->zone = nvme_get_zone_by_slba(ns, iocb->slba);
+        if (!iocb->zone) {
+            status = NVME_LBA_RANGE | NVME_DNR;
+            goto invalid;
+        }
 
-    req->opaque = ctx;
-
-    for (i = 0; i < nr; i++) {
-        uint64_t slba = le64_to_cpu(ctx->ranges[i].slba);
-        uint32_t nlb = le16_to_cpu(ctx->ranges[i].nlb) + 1;
-
-        size_t len = nvme_l2b(ns, nlb);
-        int64_t offset = nvme_l2b(ns, slba);
-
-        trace_pci_nvme_copy_source_range(slba, nlb);
-
-        struct nvme_copy_in_ctx *in_ctx = g_new(struct nvme_copy_in_ctx, 1);
-        in_ctx->req = req;
-
-        qemu_iovec_init(&in_ctx->iov, 1);
-        qemu_iovec_add(&in_ctx->iov, bouncep, len);
-
-        ctx->copies++;
-
-        blk_aio_preadv(ns->blkconf.blk, offset, &in_ctx->iov, 0,
-                       nvme_aio_copy_in_cb, in_ctx);
-
-        bouncep += len;
-
-        if (ns->lbaf.ms) {
-            len = nvme_m2b(ns, nlb);
-            offset = nvme_moff(ns, slba);
-
-            in_ctx = g_new(struct nvme_copy_in_ctx, 1);
-            in_ctx->req = req;
-
-            qemu_iovec_init(&in_ctx->iov, 1);
-            qemu_iovec_add(&in_ctx->iov, mbouncep, len);
-
-            ctx->copies++;
-
-            blk_aio_preadv(ns->blkconf.blk, offset, &in_ctx->iov, 0,
-                           nvme_aio_copy_in_cb, in_ctx);
-
-            mbouncep += len;
+        status = nvme_zrm_auto(n, ns, iocb->zone);
+        if (status) {
+            goto invalid;
         }
     }
 
-    /* account for the 1-initialization */
-    ctx->copies--;
+    iocb->req = req;
+    iocb->bh = qemu_bh_new(nvme_copy_bh, iocb);
+    iocb->ret = 0;
+    iocb->nr = nr;
+    iocb->idx = 0;
+    iocb->reftag = le32_to_cpu(copy->reftag);
+    iocb->bounce = g_malloc_n(le16_to_cpu(ns->id_ns.mssrl),
+                              ns->lbasz + ns->lbaf.ms);
 
-    if (!ctx->copies) {
-        nvme_copy_in_complete(req);
-    }
+    qemu_iovec_init(&iocb->iov, 1);
+
+    block_acct_start(blk_get_stats(ns->blkconf.blk), &iocb->acct.read, 0,
+                     BLOCK_ACCT_READ);
+    block_acct_start(blk_get_stats(ns->blkconf.blk), &iocb->acct.write, 0,
+                     BLOCK_ACCT_WRITE);
+
+    req->aiocb = &iocb->common;
+    nvme_copy_cb(iocb, 0);
 
     return NVME_NO_COMPLETE;
 
-out:
-    g_free(ctx->ranges);
-    g_free(ctx);
-
+invalid:
+    g_free(iocb->ranges);
+    qemu_aio_unref(iocb);
     return status;
 }
 
diff --git a/hw/nvme/trace-events b/hw/nvme/trace-events
index a3e11346865e..cd65f8b28895 100644
--- a/hw/nvme/trace-events
+++ b/hw/nvme/trace-events
@@ -30,8 +30,7 @@ pci_nvme_dif_prchk_apptag(uint16_t apptag, uint16_t elbat, uint16_t elbatm) "app
 pci_nvme_dif_prchk_reftag(uint32_t reftag, uint32_t elbrt) "reftag 0x%"PRIx32" elbrt 0x%"PRIx32""
 pci_nvme_copy(uint16_t cid, uint32_t nsid, uint16_t nr, uint8_t format) "cid %"PRIu16" nsid %"PRIu32" nr %"PRIu16" format 0x%"PRIx8""
 pci_nvme_copy_source_range(uint64_t slba, uint32_t nlb) "slba 0x%"PRIx64" nlb %"PRIu32""
-pci_nvme_copy_in_complete(uint16_t cid) "cid %"PRIu16""
-pci_nvme_copy_cb(uint16_t cid) "cid %"PRIu16""
+pci_nvme_copy_out(uint64_t slba, uint32_t nlb) "slba 0x%"PRIx64" nlb %"PRIu32""
 pci_nvme_verify(uint16_t cid, uint32_t nsid, uint64_t slba, uint32_t nlb) "cid %"PRIu16" nsid %"PRIu32" slba 0x%"PRIx64" nlb %"PRIu32""
 pci_nvme_verify_mdata_in_cb(uint16_t cid, const char *blkname) "cid %"PRIu16" blk '%s'"
 pci_nvme_verify_cb(uint16_t cid, uint8_t prinfo, uint16_t apptag, uint16_t appmask, uint32_t reftag) "cid %"PRIu16" prinfo 0x%"PRIx8" apptag 0x%"PRIx16" appmask 0x%"PRIx16" reftag 0x%"PRIx32""
-- 
2.32.0



^ permalink raw reply related	[flat|nested] 16+ messages in thread

* [PATCH v2 09/11] hw/nvme: reimplement zone reset to allow cancellation
  2021-06-17 19:06 [PATCH v2 00/11] hw/nvme: reimplement all multi-aio commands with custom aiocbs Klaus Jensen
                   ` (7 preceding siblings ...)
  2021-06-17 19:06 ` [PATCH v2 08/11] hw/nvme: reimplement the copy command to allow aio cancellation Klaus Jensen
@ 2021-06-17 19:06 ` Klaus Jensen
  2021-06-17 19:06 ` [PATCH v2 10/11] hw/nvme: reimplement format nvm " Klaus Jensen
                   ` (3 subsequent siblings)
  12 siblings, 0 replies; 16+ messages in thread
From: Klaus Jensen @ 2021-06-17 19:06 UTC (permalink / raw)
  To: qemu-devel
  Cc: Fam Zheng, Kevin Wolf, Vladimir Sementsov-Ogievskiy, qemu-block,
	Klaus Jensen, Max Reitz, Keith Busch, Stefan Hajnoczi,
	Klaus Jensen

From: Klaus Jensen <k.jensen@samsung.com>

Prior to this patch, the aios associated with zone reset are submitted
anonymously (no reference saved to the aiocb from the blk_aio call).

Fix this by resetting the zones one after another, saving a reference to
the aiocb for each reset.

Signed-off-by: Klaus Jensen <k.jensen@samsung.com>
---
 hw/nvme/ctrl.c       | 288 +++++++++++++++++++++++++------------------
 hw/nvme/trace-events |   2 +-
 2 files changed, 169 insertions(+), 121 deletions(-)

diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c
index 012b0126057b..6a8c1ff2271d 100644
--- a/hw/nvme/ctrl.c
+++ b/hw/nvme/ctrl.c
@@ -1691,6 +1691,29 @@ static uint16_t nvme_zrm_close(NvmeNamespace *ns, NvmeZone *zone)
     }
 }
 
+static uint16_t nvme_zrm_reset(NvmeNamespace *ns, NvmeZone *zone)
+{
+    switch (nvme_get_zone_state(zone)) {
+    case NVME_ZONE_STATE_EXPLICITLY_OPEN:
+    case NVME_ZONE_STATE_IMPLICITLY_OPEN:
+        nvme_aor_dec_open(ns);
+        /* fallthrough */
+    case NVME_ZONE_STATE_CLOSED:
+        nvme_aor_dec_active(ns);
+        /* fallthrough */
+    case NVME_ZONE_STATE_FULL:
+        zone->w_ptr = zone->d.zslba;
+        zone->d.wp = zone->w_ptr;
+        nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_EMPTY);
+        /* fallthrough */
+    case NVME_ZONE_STATE_EMPTY:
+        return NVME_SUCCESS;
+
+    default:
+        return NVME_ZONE_INVAL_TRANSITION;
+    }
+}
+
 static void nvme_zrm_auto_transition_zone(NvmeNamespace *ns)
 {
     NvmeZone *zone;
@@ -2020,79 +2043,6 @@ out:
     nvme_verify_cb(ctx, ret);
 }
 
-struct nvme_zone_reset_ctx {
-    NvmeRequest *req;
-    NvmeZone    *zone;
-};
-
-static void nvme_aio_zone_reset_complete_cb(void *opaque, int ret)
-{
-    struct nvme_zone_reset_ctx *ctx = opaque;
-    NvmeRequest *req = ctx->req;
-    NvmeNamespace *ns = req->ns;
-    NvmeZone *zone = ctx->zone;
-    uintptr_t *resets = (uintptr_t *)&req->opaque;
-
-    if (ret) {
-        nvme_aio_err(req, ret);
-        goto out;
-    }
-
-    switch (nvme_get_zone_state(zone)) {
-    case NVME_ZONE_STATE_EXPLICITLY_OPEN:
-    case NVME_ZONE_STATE_IMPLICITLY_OPEN:
-        nvme_aor_dec_open(ns);
-        /* fall through */
-    case NVME_ZONE_STATE_CLOSED:
-        nvme_aor_dec_active(ns);
-        /* fall through */
-    case NVME_ZONE_STATE_FULL:
-        zone->w_ptr = zone->d.zslba;
-        zone->d.wp = zone->w_ptr;
-        nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_EMPTY);
-        /* fall through */
-    default:
-        break;
-    }
-
-out:
-    g_free(ctx);
-
-    (*resets)--;
-
-    if (*resets) {
-        return;
-    }
-
-    nvme_enqueue_req_completion(nvme_cq(req), req);
-}
-
-static void nvme_aio_zone_reset_cb(void *opaque, int ret)
-{
-    struct nvme_zone_reset_ctx *ctx = opaque;
-    NvmeRequest *req = ctx->req;
-    NvmeNamespace *ns = req->ns;
-    NvmeZone *zone = ctx->zone;
-
-    trace_pci_nvme_aio_zone_reset_cb(nvme_cid(req), zone->d.zslba);
-
-    if (ret) {
-        goto out;
-    }
-
-    if (ns->lbaf.ms) {
-        int64_t offset = nvme_moff(ns, zone->d.zslba);
-
-        blk_aio_pwrite_zeroes(ns->blkconf.blk, offset,
-                              nvme_m2b(ns, ns->zone_size), BDRV_REQ_MAY_UNMAP,
-                              nvme_aio_zone_reset_complete_cb, ctx);
-        return;
-    }
-
-out:
-    nvme_aio_zone_reset_complete_cb(opaque, ret);
-}
-
 struct nvme_compare_ctx {
     struct {
         QEMUIOVector iov;
@@ -3395,41 +3345,6 @@ static uint16_t nvme_finish_zone(NvmeNamespace *ns, NvmeZone *zone,
     return nvme_zrm_finish(ns, zone);
 }
 
-static uint16_t nvme_reset_zone(NvmeNamespace *ns, NvmeZone *zone,
-                                NvmeZoneState state, NvmeRequest *req)
-{
-    uintptr_t *resets = (uintptr_t *)&req->opaque;
-    struct nvme_zone_reset_ctx *ctx;
-
-    switch (state) {
-    case NVME_ZONE_STATE_EMPTY:
-        return NVME_SUCCESS;
-    case NVME_ZONE_STATE_EXPLICITLY_OPEN:
-    case NVME_ZONE_STATE_IMPLICITLY_OPEN:
-    case NVME_ZONE_STATE_CLOSED:
-    case NVME_ZONE_STATE_FULL:
-        break;
-    default:
-        return NVME_ZONE_INVAL_TRANSITION;
-    }
-
-    /*
-     * The zone reset aio callback needs to know the zone that is being reset
-     * in order to transition the zone on completion.
-     */
-    ctx = g_new(struct nvme_zone_reset_ctx, 1);
-    ctx->req = req;
-    ctx->zone = zone;
-
-    (*resets)++;
-
-    blk_aio_pwrite_zeroes(ns->blkconf.blk, nvme_l2b(ns, zone->d.zslba),
-                          nvme_l2b(ns, ns->zone_size), BDRV_REQ_MAY_UNMAP,
-                          nvme_aio_zone_reset_cb, ctx);
-
-    return NVME_NO_COMPLETE;
-}
-
 static uint16_t nvme_offline_zone(NvmeNamespace *ns, NvmeZone *zone,
                                   NvmeZoneState state, NvmeRequest *req)
 {
@@ -3558,12 +3473,144 @@ out:
     return status;
 }
 
+typedef struct NvmeZoneResetAIOCB {
+    BlockAIOCB common;
+    BlockAIOCB *aiocb;
+    NvmeRequest *req;
+    QEMUBH *bh;
+    int ret;
+
+    bool all;
+    int idx;
+    NvmeZone *zone;
+} NvmeZoneResetAIOCB;
+
+static void nvme_zone_reset_cancel(BlockAIOCB *aiocb)
+{
+    NvmeZoneResetAIOCB *iocb = container_of(aiocb, NvmeZoneResetAIOCB, common);
+    NvmeRequest *req = iocb->req;
+    NvmeNamespace *ns = req->ns;
+
+    iocb->idx = ns->num_zones;
+
+    iocb->ret = -ECANCELED;
+
+    if (iocb->aiocb) {
+        blk_aio_cancel_async(iocb->aiocb);
+        iocb->aiocb = NULL;
+    }
+}
+
+static const AIOCBInfo nvme_zone_reset_aiocb_info = {
+    .aiocb_size = sizeof(NvmeZoneResetAIOCB),
+    .cancel_async = nvme_zone_reset_cancel,
+};
+
+static void nvme_zone_reset_bh(void *opaque)
+{
+    NvmeZoneResetAIOCB *iocb = opaque;
+
+    iocb->common.cb(iocb->common.opaque, iocb->ret);
+
+    qemu_bh_delete(iocb->bh);
+    iocb->bh = NULL;
+    qemu_aio_unref(iocb);
+}
+
+static void nvme_zone_reset_cb(void *opaque, int ret);
+
+static void nvme_zone_reset_epilogue_cb(void *opaque, int ret)
+{
+    NvmeZoneResetAIOCB *iocb = opaque;
+    NvmeRequest *req = iocb->req;
+    NvmeNamespace *ns = req->ns;
+    int64_t moff;
+    int count;
+
+    if (ret < 0) {
+        nvme_zone_reset_cb(iocb, ret);
+        return;
+    }
+
+    if (!ns->lbaf.ms) {
+        nvme_zone_reset_cb(iocb, 0);
+        return;
+    }
+
+    moff = nvme_moff(ns, iocb->zone->d.zslba);
+    count = nvme_m2b(ns, ns->zone_size);
+
+    iocb->aiocb = blk_aio_pwrite_zeroes(ns->blkconf.blk, moff, count,
+                                        BDRV_REQ_MAY_UNMAP,
+                                        nvme_zone_reset_cb, iocb);
+    return;
+}
+
+static void nvme_zone_reset_cb(void *opaque, int ret)
+{
+    NvmeZoneResetAIOCB *iocb = opaque;
+    NvmeRequest *req = iocb->req;
+    NvmeNamespace *ns = req->ns;
+
+    if (ret < 0) {
+        iocb->ret = ret;
+        goto done;
+    }
+
+    if (iocb->zone) {
+        nvme_zrm_reset(ns, iocb->zone);
+
+        if (!iocb->all) {
+            goto done;
+        }
+    }
+
+    while (iocb->idx < ns->num_zones) {
+        NvmeZone *zone = &ns->zone_array[iocb->idx++];
+
+        switch (nvme_get_zone_state(zone)) {
+        case NVME_ZONE_STATE_EMPTY:
+            if (!iocb->all) {
+                goto done;
+            }
+
+            continue;
+
+        case NVME_ZONE_STATE_EXPLICITLY_OPEN:
+        case NVME_ZONE_STATE_IMPLICITLY_OPEN:
+        case NVME_ZONE_STATE_CLOSED:
+        case NVME_ZONE_STATE_FULL:
+            iocb->zone = zone;
+            break;
+
+        default:
+            continue;
+        }
+
+        trace_pci_nvme_zns_zone_reset(zone->d.zslba);
+
+        iocb->aiocb = blk_aio_pwrite_zeroes(ns->blkconf.blk,
+                                            nvme_l2b(ns, zone->d.zslba),
+                                            nvme_l2b(ns, ns->zone_size),
+                                            BDRV_REQ_MAY_UNMAP,
+                                            nvme_zone_reset_epilogue_cb,
+                                            iocb);
+        return;
+    }
+
+done:
+    iocb->aiocb = NULL;
+    if (iocb->bh) {
+        qemu_bh_schedule(iocb->bh);
+    }
+}
+
 static uint16_t nvme_zone_mgmt_send(NvmeCtrl *n, NvmeRequest *req)
 {
     NvmeCmd *cmd = (NvmeCmd *)&req->cmd;
     NvmeNamespace *ns = req->ns;
     NvmeZone *zone;
-    uintptr_t *resets;
+    NvmeZoneResetAIOCB *iocb;
     uint8_t *zd_ext;
     uint32_t dw13 = le32_to_cpu(cmd->cdw13);
     uint64_t slba = 0;
@@ -3574,7 +3621,7 @@ static uint16_t nvme_zone_mgmt_send(NvmeCtrl *n, NvmeRequest *req)
     enum NvmeZoneProcessingMask proc_mask = NVME_PROC_CURRENT_ZONE;
 
     action = dw13 & 0xff;
-    all = dw13 & 0x100;
+    all = !!(dw13 & 0x100);
 
     req->status = NVME_SUCCESS;
 
@@ -3618,21 +3665,22 @@ static uint16_t nvme_zone_mgmt_send(NvmeCtrl *n, NvmeRequest *req)
         break;
 
     case NVME_ZONE_ACTION_RESET:
-        resets = (uintptr_t *)&req->opaque;
-
-        if (all) {
-            proc_mask = NVME_PROC_OPENED_ZONES | NVME_PROC_CLOSED_ZONES |
-                NVME_PROC_FULL_ZONES;
-        }
         trace_pci_nvme_reset_zone(slba, zone_idx, all);
 
-        *resets = 1;
+        iocb = blk_aio_get(&nvme_zone_reset_aiocb_info, ns->blkconf.blk,
+                           nvme_misc_cb, req);
 
-        status = nvme_do_zone_op(ns, zone, proc_mask, nvme_reset_zone, req);
+        iocb->req = req;
+        iocb->bh = qemu_bh_new(nvme_zone_reset_bh, iocb);
+        iocb->ret = 0;
+        iocb->all = all;
+        iocb->idx = zone_idx;
+        iocb->zone = NULL;
 
-        (*resets)--;
+        req->aiocb = &iocb->common;
+        nvme_zone_reset_cb(iocb, 0);
 
-        return *resets ? NVME_NO_COMPLETE : req->status;
+        return NVME_NO_COMPLETE;
 
     case NVME_ZONE_ACTION_OFFLINE:
         if (all) {
diff --git a/hw/nvme/trace-events b/hw/nvme/trace-events
index cd65f8b28895..dc00c2860db7 100644
--- a/hw/nvme/trace-events
+++ b/hw/nvme/trace-events
@@ -44,7 +44,6 @@ pci_nvme_compare_data_cb(uint16_t cid) "cid %"PRIu16""
 pci_nvme_compare_mdata_cb(uint16_t cid) "cid %"PRIu16""
 pci_nvme_aio_discard_cb(uint16_t cid) "cid %"PRIu16""
 pci_nvme_aio_copy_in_cb(uint16_t cid) "cid %"PRIu16""
-pci_nvme_aio_zone_reset_cb(uint16_t cid, uint64_t zslba) "cid %"PRIu16" zslba 0x%"PRIx64""
 pci_nvme_aio_flush_cb(uint16_t cid, const char *blkname) "cid %"PRIu16" blk '%s'"
 pci_nvme_create_sq(uint64_t addr, uint16_t sqid, uint16_t cqid, uint16_t qsize, uint16_t qflags) "create submission queue, addr=0x%"PRIx64", sqid=%"PRIu16", cqid=%"PRIu16", qsize=%"PRIu16", qflags=%"PRIu16""
 pci_nvme_create_cq(uint64_t addr, uint16_t cqid, uint16_t vector, uint16_t size, uint16_t qflags, int ien) "create completion queue, addr=0x%"PRIx64", cqid=%"PRIu16", vector=%"PRIu16", qsize=%"PRIu16", qflags=%"PRIu16", ien=%d"
@@ -100,6 +99,7 @@ pci_nvme_open_zone(uint64_t slba, uint32_t zone_idx, int all) "open zone, slba=%
 pci_nvme_close_zone(uint64_t slba, uint32_t zone_idx, int all) "close zone, slba=%"PRIu64", idx=%"PRIu32", all=%"PRIi32""
 pci_nvme_finish_zone(uint64_t slba, uint32_t zone_idx, int all) "finish zone, slba=%"PRIu64", idx=%"PRIu32", all=%"PRIi32""
 pci_nvme_reset_zone(uint64_t slba, uint32_t zone_idx, int all) "reset zone, slba=%"PRIu64", idx=%"PRIu32", all=%"PRIi32""
+pci_nvme_zns_zone_reset(uint64_t zslba) "zslba 0x%"PRIx64""
 pci_nvme_offline_zone(uint64_t slba, uint32_t zone_idx, int all) "offline zone, slba=%"PRIu64", idx=%"PRIu32", all=%"PRIi32""
 pci_nvme_set_descriptor_extension(uint64_t slba, uint32_t zone_idx) "set zone descriptor extension, slba=%"PRIu64", idx=%"PRIu32""
 pci_nvme_zd_extension_set(uint32_t zone_idx) "set descriptor extension for zone_idx=%"PRIu32""
-- 
2.32.0



^ permalink raw reply related	[flat|nested] 16+ messages in thread

* [PATCH v2 10/11] hw/nvme: reimplement format nvm to allow cancellation
  2021-06-17 19:06 [PATCH v2 00/11] hw/nvme: reimplement all multi-aio commands with custom aiocbs Klaus Jensen
                   ` (8 preceding siblings ...)
  2021-06-17 19:06 ` [PATCH v2 09/11] hw/nvme: reimplement zone reset to allow cancellation Klaus Jensen
@ 2021-06-17 19:06 ` Klaus Jensen
  2021-06-17 19:06 ` [PATCH v2 11/11] Partially revert "hw/block/nvme: drain namespaces on sq deletion" Klaus Jensen
                   ` (2 subsequent siblings)
  12 siblings, 0 replies; 16+ messages in thread
From: Klaus Jensen @ 2021-06-17 19:06 UTC (permalink / raw)
  To: qemu-devel
  Cc: Fam Zheng, Kevin Wolf, Vladimir Sementsov-Ogievskiy, qemu-block,
	Klaus Jensen, Max Reitz, Keith Busch, Stefan Hajnoczi,
	Klaus Jensen

From: Klaus Jensen <k.jensen@samsung.com>

Prior to this patch, the aios associated with broadcast format are
submitted anonymously (no aiocb reference saved from the blk_aio call).

Fix this by formatting the namespaces one after another, saving a
reference to the aiocb for each.

Signed-off-by: Klaus Jensen <k.jensen@samsung.com>
---
 hw/nvme/ctrl.c       | 289 +++++++++++++++++++++++--------------------
 hw/nvme/trace-events |   4 +-
 2 files changed, 156 insertions(+), 137 deletions(-)

diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c
index 6a8c1ff2271d..ec8ddb76cd39 100644
--- a/hw/nvme/ctrl.c
+++ b/hw/nvme/ctrl.c
@@ -1924,42 +1924,6 @@ out:
     nvme_rw_complete_cb(req, ret);
 }
 
-struct nvme_aio_format_ctx {
-    NvmeRequest   *req;
-    NvmeNamespace *ns;
-
-    /* number of outstanding write zeroes for this namespace */
-    int *count;
-};
-
-static void nvme_aio_format_cb(void *opaque, int ret)
-{
-    struct nvme_aio_format_ctx *ctx = opaque;
-    NvmeRequest *req = ctx->req;
-    NvmeNamespace *ns = ctx->ns;
-    uintptr_t *num_formats = (uintptr_t *)&req->opaque;
-    int *count = ctx->count;
-
-    g_free(ctx);
-
-    if (ret) {
-        nvme_aio_err(req, ret);
-    }
-
-    if (--(*count)) {
-        return;
-    }
-
-    g_free(count);
-    ns->status = 0x0;
-
-    if (--(*num_formats)) {
-        return;
-    }
-
-    nvme_enqueue_req_completion(nvme_cq(req), req);
-}
-
 static void nvme_verify_cb(void *opaque, int ret)
 {
     NvmeBounceContext *ctx = opaque;
@@ -5260,30 +5224,98 @@ static uint16_t nvme_ns_attachment(NvmeCtrl *n, NvmeRequest *req)
     return NVME_SUCCESS;
 }
 
-static uint16_t nvme_format_ns(NvmeCtrl *n, NvmeNamespace *ns, uint8_t lbaf,
-                               uint8_t mset, uint8_t pi, uint8_t pil,
-                               NvmeRequest *req)
-{
-    int64_t len, offset;
-    struct nvme_aio_format_ctx *ctx;
-    BlockBackend *blk = ns->blkconf.blk;
-    uint16_t ms;
-    uintptr_t *num_formats = (uintptr_t *)&req->opaque;
-    int *count;
+typedef struct NvmeFormatAIOCB {
+    BlockAIOCB common;
+    BlockAIOCB *aiocb;
+    QEMUBH *bh;
+    NvmeRequest *req;
+    int ret;
 
+    NvmeNamespace *ns;
+    uint32_t nsid;
+    bool broadcast;
+    int64_t offset;
+} NvmeFormatAIOCB;
+
+static void nvme_format_bh(void *opaque);
+
+static void nvme_format_cancel(BlockAIOCB *aiocb)
+{
+    NvmeFormatAIOCB *iocb = container_of(aiocb, NvmeFormatAIOCB, common);
+
+    if (iocb->aiocb) {
+        blk_aio_cancel_async(iocb->aiocb);
+    }
+}
+
+static const AIOCBInfo nvme_format_aiocb_info = {
+    .aiocb_size = sizeof(NvmeFormatAIOCB),
+    .cancel_async = nvme_format_cancel,
+    .get_aio_context = nvme_get_aio_context,
+};
+
+static void nvme_format_set(NvmeNamespace *ns, NvmeCmd *cmd)
+{
+    uint32_t dw10 = le32_to_cpu(cmd->cdw10);
+    uint8_t lbaf = dw10 & 0xf;
+    uint8_t pi = (dw10 >> 5) & 0x7;
+    uint8_t mset = (dw10 >> 4) & 0x1;
+    uint8_t pil = (dw10 >> 8) & 0x1;
+
+    trace_pci_nvme_format_set(ns->params.nsid, lbaf, mset, pi, pil);
+
+    ns->id_ns.dps = (pil << 3) | pi;
+    ns->id_ns.flbas = lbaf | (mset << 4);
+
+    nvme_ns_init_format(ns);
+}
+
+static void nvme_format_ns_cb(void *opaque, int ret)
+{
+    NvmeFormatAIOCB *iocb = opaque;
+    NvmeRequest *req = iocb->req;
+    NvmeNamespace *ns = iocb->ns;
+    int bytes;
+
+    if (ret < 0) {
+        iocb->ret = ret;
+        goto done;
+    }
+
+    assert(ns);
+
+    if (iocb->offset < ns->size) {
+        bytes = MIN(BDRV_REQUEST_MAX_BYTES, ns->size - iocb->offset);
+
+        iocb->aiocb = blk_aio_pwrite_zeroes(ns->blkconf.blk, iocb->offset,
+                                            bytes, BDRV_REQ_MAY_UNMAP,
+                                            nvme_format_ns_cb, iocb);
+
+        iocb->offset += bytes;
+        return;
+    }
+
+    nvme_format_set(ns, &req->cmd);
+    ns->status = 0x0;
+    iocb->ns = NULL;
+    iocb->offset = 0;
+
+done:
+    iocb->aiocb = NULL;
+    qemu_bh_schedule(iocb->bh);
+}
+
+static uint16_t nvme_format_check(NvmeNamespace *ns, uint8_t lbaf, uint8_t pi)
+{
     if (ns->params.zoned) {
         return NVME_INVALID_FORMAT | NVME_DNR;
     }
 
-    trace_pci_nvme_format_ns(nvme_cid(req), nvme_nsid(ns), lbaf, mset, pi, pil);
-
     if (lbaf > ns->id_ns.nlbaf) {
         return NVME_INVALID_FORMAT | NVME_DNR;
     }
 
-    ms = ns->id_ns.lbaf[lbaf].ms;
-
-    if (pi && (ms < sizeof(NvmeDifTuple))) {
+    if (pi && (ns->id_ns.lbaf[lbaf].ms < sizeof(NvmeDifTuple))) {
         return NVME_INVALID_FORMAT | NVME_DNR;
     }
 
@@ -5291,107 +5323,96 @@ static uint16_t nvme_format_ns(NvmeCtrl *n, NvmeNamespace *ns, uint8_t lbaf,
         return NVME_INVALID_FIELD | NVME_DNR;
     }
 
-    nvme_ns_drain(ns);
-    nvme_ns_shutdown(ns);
-    nvme_ns_cleanup(ns);
-
-    ns->id_ns.dps = (pil << 3) | pi;
-    ns->id_ns.flbas = lbaf | (mset << 4);
-
-    nvme_ns_init_format(ns);
-
-    ns->status = NVME_FORMAT_IN_PROGRESS;
-
-    len = ns->size;
-    offset = 0;
-
-    count = g_new(int, 1);
-    *count = 1;
-
-    (*num_formats)++;
-
-    while (len) {
-        ctx = g_new(struct nvme_aio_format_ctx, 1);
-        ctx->req = req;
-        ctx->ns = ns;
-        ctx->count = count;
-
-        size_t bytes = MIN(BDRV_REQUEST_MAX_BYTES, len);
-
-        (*count)++;
-
-        blk_aio_pwrite_zeroes(blk, offset, bytes, BDRV_REQ_MAY_UNMAP,
-                              nvme_aio_format_cb, ctx);
-
-        offset += bytes;
-        len -= bytes;
-
-    }
-
-    if (--(*count)) {
-        return NVME_NO_COMPLETE;
-    }
-
-    g_free(count);
-    ns->status = 0x0;
-    (*num_formats)--;
-
     return NVME_SUCCESS;
 }
 
-static uint16_t nvme_format(NvmeCtrl *n, NvmeRequest *req)
+static void nvme_format_bh(void *opaque)
 {
-    NvmeNamespace *ns;
+    NvmeFormatAIOCB *iocb = opaque;
+    NvmeRequest *req = iocb->req;
+    NvmeCtrl *n = nvme_ctrl(req);
     uint32_t dw10 = le32_to_cpu(req->cmd.cdw10);
-    uint32_t nsid = le32_to_cpu(req->cmd.nsid);
     uint8_t lbaf = dw10 & 0xf;
-    uint8_t mset = (dw10 >> 4) & 0x1;
     uint8_t pi = (dw10 >> 5) & 0x7;
-    uint8_t pil = (dw10 >> 8) & 0x1;
-    uintptr_t *num_formats = (uintptr_t *)&req->opaque;
     uint16_t status;
     int i;
 
-    trace_pci_nvme_format(nvme_cid(req), nsid, lbaf, mset, pi, pil);
+    if (iocb->ret < 0) {
+        goto done;
+    }
 
-    /* 1-initialize; see the comment in nvme_dsm */
-    *num_formats = 1;
-
-    if (nsid != NVME_NSID_BROADCAST) {
-        if (!nvme_nsid_valid(n, nsid)) {
-            return NVME_INVALID_NSID | NVME_DNR;
-        }
-
-        ns = nvme_ns(n, nsid);
-        if (!ns) {
-            return NVME_INVALID_FIELD | NVME_DNR;
-        }
-
-        status = nvme_format_ns(n, ns, lbaf, mset, pi, pil, req);
-        if (status && status != NVME_NO_COMPLETE) {
-            req->status = status;
-        }
-    } else {
-        for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
-            ns = nvme_ns(n, i);
-            if (!ns) {
-                continue;
-            }
-
-            status = nvme_format_ns(n, ns, lbaf, mset, pi, pil, req);
-            if (status && status != NVME_NO_COMPLETE) {
-                req->status = status;
+    if (iocb->broadcast) {
+        for (i = iocb->nsid + 1; i <= NVME_MAX_NAMESPACES; i++) {
+            iocb->ns = nvme_ns(n, i);
+            if (iocb->ns) {
+                iocb->nsid = i;
                 break;
             }
         }
     }
 
-    /* account for the 1-initialization */
-    if (--(*num_formats)) {
-        return NVME_NO_COMPLETE;
+    if (!iocb->ns) {
+        goto done;
     }
 
-    return req->status;
+    status = nvme_format_check(iocb->ns, lbaf, pi);
+    if (status) {
+        req->status = status;
+        goto done;
+    }
+
+    iocb->ns->status = NVME_FORMAT_IN_PROGRESS;
+    nvme_format_ns_cb(iocb, 0);
+    return;
+
+done:
+    qemu_bh_delete(iocb->bh);
+    iocb->bh = NULL;
+
+    iocb->common.cb(iocb->common.opaque, iocb->ret);
+
+    qemu_aio_unref(iocb);
+}
+
+static uint16_t nvme_format(NvmeCtrl *n, NvmeRequest *req)
+{
+    NvmeFormatAIOCB *iocb;
+    uint32_t nsid = le32_to_cpu(req->cmd.nsid);
+    uint16_t status;
+
+    iocb = qemu_aio_get(&nvme_format_aiocb_info, NULL, nvme_misc_cb, req);
+
+    iocb->req = req;
+    iocb->bh = qemu_bh_new(nvme_format_bh, iocb);
+    iocb->ret = 0;
+    iocb->ns = NULL;
+    iocb->nsid = 0;
+    iocb->broadcast = (nsid == NVME_NSID_BROADCAST);
+    iocb->offset = 0;
+
+    if (!iocb->broadcast) {
+        if (!nvme_nsid_valid(n, nsid)) {
+            status = NVME_INVALID_NSID | NVME_DNR;
+            goto out;
+        }
+
+        iocb->ns = nvme_ns(n, nsid);
+        if (!iocb->ns) {
+            status = NVME_INVALID_FIELD | NVME_DNR;
+            goto out;
+        }
+    }
+
+    req->aiocb = &iocb->common;
+    qemu_bh_schedule(iocb->bh);
+
+    return NVME_NO_COMPLETE;
+
+out:
+    qemu_bh_delete(iocb->bh);
+    iocb->bh = NULL;
+    qemu_aio_unref(iocb);
+    return status;
 }
 
 static uint16_t nvme_admin_cmd(NvmeCtrl *n, NvmeRequest *req)
diff --git a/hw/nvme/trace-events b/hw/nvme/trace-events
index dc00c2860db7..48d10c36e85b 100644
--- a/hw/nvme/trace-events
+++ b/hw/nvme/trace-events
@@ -10,9 +10,7 @@ pci_nvme_map_sgl(uint8_t typ, uint64_t len) "type 0x%"PRIx8" len %"PRIu64""
 pci_nvme_io_cmd(uint16_t cid, uint32_t nsid, uint16_t sqid, uint8_t opcode, const char *opname) "cid %"PRIu16" nsid 0x%"PRIx32" sqid %"PRIu16" opc 0x%"PRIx8" opname '%s'"
 pci_nvme_admin_cmd(uint16_t cid, uint16_t sqid, uint8_t opcode, const char *opname) "cid %"PRIu16" sqid %"PRIu16" opc 0x%"PRIx8" opname '%s'"
 pci_nvme_flush_ns(uint32_t nsid) "nsid 0x%"PRIx32""
-pci_nvme_format(uint16_t cid, uint32_t nsid, uint8_t lbaf, uint8_t mset, uint8_t pi, uint8_t pil) "cid %"PRIu16" nsid %"PRIu32" lbaf %"PRIu8" mset %"PRIu8" pi %"PRIu8" pil %"PRIu8""
-pci_nvme_format_ns(uint16_t cid, uint32_t nsid, uint8_t lbaf, uint8_t mset, uint8_t pi, uint8_t pil) "cid %"PRIu16" nsid %"PRIu32" lbaf %"PRIu8" mset %"PRIu8" pi %"PRIu8" pil %"PRIu8""
-pci_nvme_format_cb(uint16_t cid, uint32_t nsid) "cid %"PRIu16" nsid %"PRIu32""
+pci_nvme_format_set(uint32_t nsid, uint8_t lbaf, uint8_t mset, uint8_t pi, uint8_t pil) "nsid %"PRIu32" lbaf %"PRIu8" mset %"PRIu8" pi %"PRIu8" pil %"PRIu8""
 pci_nvme_read(uint16_t cid, uint32_t nsid, uint32_t nlb, uint64_t count, uint64_t lba) "cid %"PRIu16" nsid %"PRIu32" nlb %"PRIu32" count %"PRIu64" lba 0x%"PRIx64""
 pci_nvme_write(uint16_t cid, const char *verb, uint32_t nsid, uint32_t nlb, uint64_t count, uint64_t lba) "cid %"PRIu16" opname '%s' nsid %"PRIu32" nlb %"PRIu32" count %"PRIu64" lba 0x%"PRIx64""
 pci_nvme_rw_cb(uint16_t cid, const char *blkname) "cid %"PRIu16" blk '%s'"
-- 
2.32.0



^ permalink raw reply related	[flat|nested] 16+ messages in thread

* [PATCH v2 11/11] Partially revert "hw/block/nvme: drain namespaces on sq deletion"
  2021-06-17 19:06 [PATCH v2 00/11] hw/nvme: reimplement all multi-aio commands with custom aiocbs Klaus Jensen
                   ` (9 preceding siblings ...)
  2021-06-17 19:06 ` [PATCH v2 10/11] hw/nvme: reimplement format nvm " Klaus Jensen
@ 2021-06-17 19:06 ` Klaus Jensen
  2021-06-28 16:00   ` Keith Busch
  2021-06-25 18:47 ` [PATCH v2 00/11] hw/nvme: reimplement all multi-aio commands with custom aiocbs Keith Busch
  2021-06-28 17:08 ` Klaus Jensen
  12 siblings, 1 reply; 16+ messages in thread
From: Klaus Jensen @ 2021-06-17 19:06 UTC (permalink / raw)
  To: qemu-devel
  Cc: Fam Zheng, Kevin Wolf, Vladimir Sementsov-Ogievskiy, qemu-block,
	Klaus Jensen, Max Reitz, Keith Busch, Stefan Hajnoczi,
	Klaus Jensen

From: Klaus Jensen <k.jensen@samsung.com>

This partially reverts commit 98f84f5a4eca5c03e32fff20f246d9b4b96d6422.

Since all "multi aio" commands are now reimplemented to properly track
the nested aiocbs, we can revert the "hack" that was introduced to make
sure all requests we're properly drained upon sq deletion.

The revert is partial since we keep the assert that no outstanding
requests remain on the submission queue after the explicit cancellation.

Signed-off-by: Klaus Jensen <k.jensen@samsung.com>
---
 hw/nvme/ctrl.c | 19 ++-----------------
 1 file changed, 2 insertions(+), 17 deletions(-)

diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c
index ec8ddb76cd39..5a1d25265a9d 100644
--- a/hw/nvme/ctrl.c
+++ b/hw/nvme/ctrl.c
@@ -3918,7 +3918,6 @@ static uint16_t nvme_del_sq(NvmeCtrl *n, NvmeRequest *req)
     NvmeSQueue *sq;
     NvmeCQueue *cq;
     uint16_t qid = le16_to_cpu(c->qid);
-    uint32_t nsid;
 
     if (unlikely(!qid || nvme_check_sqid(n, qid))) {
         trace_pci_nvme_err_invalid_del_sq(qid);
@@ -3930,22 +3929,8 @@ static uint16_t nvme_del_sq(NvmeCtrl *n, NvmeRequest *req)
     sq = n->sq[qid];
     while (!QTAILQ_EMPTY(&sq->out_req_list)) {
         r = QTAILQ_FIRST(&sq->out_req_list);
-        if (r->aiocb) {
-            blk_aio_cancel(r->aiocb);
-        }
-    }
-
-    /*
-     * Drain all namespaces if there are still outstanding requests that we
-     * could not cancel explicitly.
-     */
-    if (!QTAILQ_EMPTY(&sq->out_req_list)) {
-        for (nsid = 1; nsid <= NVME_MAX_NAMESPACES; nsid++) {
-            NvmeNamespace *ns = nvme_ns(n, nsid);
-            if (ns) {
-                nvme_ns_drain(ns);
-            }
-        }
+        assert(r->aiocb);
+        blk_aio_cancel(r->aiocb);
     }
 
     assert(QTAILQ_EMPTY(&sq->out_req_list));
-- 
2.32.0



^ permalink raw reply related	[flat|nested] 16+ messages in thread

* Re: [PATCH v2 00/11] hw/nvme: reimplement all multi-aio commands with custom aiocbs
  2021-06-17 19:06 [PATCH v2 00/11] hw/nvme: reimplement all multi-aio commands with custom aiocbs Klaus Jensen
                   ` (10 preceding siblings ...)
  2021-06-17 19:06 ` [PATCH v2 11/11] Partially revert "hw/block/nvme: drain namespaces on sq deletion" Klaus Jensen
@ 2021-06-25 18:47 ` Keith Busch
  2021-06-28 17:08 ` Klaus Jensen
  12 siblings, 0 replies; 16+ messages in thread
From: Keith Busch @ 2021-06-25 18:47 UTC (permalink / raw)
  To: Klaus Jensen
  Cc: Fam Zheng, Kevin Wolf, Vladimir Sementsov-Ogievskiy, qemu-block,
	Klaus Jensen, qemu-devel, Max Reitz, Stefan Hajnoczi

On Thu, Jun 17, 2021 at 09:06:46PM +0200, Klaus Jensen wrote:
> From: Klaus Jensen <k.jensen@samsung.com>
> 
> This series reimplements flush, dsm, copy, zone reset and format nvm to
> allow cancellation. I posted an RFC back in March ("hw/block/nvme:
> convert ad-hoc aio tracking to aiocb") and I've applied some feedback
> from Stefan and reimplemented the remaining commands.
> 
> The basic idea is to define custom AIOCBs for these commands. The custom
> AIOCB takes care of issuing all the "nested" AIOs one by one instead of
> blindly sending them off simultaneously without tracking the returned
> aiocbs.


Klaus,

THis series looks good to me.

Reviewed-by: Keith Busch <kbusch@kernel.org>


^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH v2 11/11] Partially revert "hw/block/nvme: drain namespaces on sq deletion"
  2021-06-17 19:06 ` [PATCH v2 11/11] Partially revert "hw/block/nvme: drain namespaces on sq deletion" Klaus Jensen
@ 2021-06-28 16:00   ` Keith Busch
  2021-06-28 16:33     ` Klaus Jensen
  0 siblings, 1 reply; 16+ messages in thread
From: Keith Busch @ 2021-06-28 16:00 UTC (permalink / raw)
  To: Klaus Jensen
  Cc: Fam Zheng, Kevin Wolf, Vladimir Sementsov-Ogievskiy, qemu-block,
	Klaus Jensen, qemu-devel, Max Reitz, Stefan Hajnoczi

On Thu, Jun 17, 2021 at 09:06:57PM +0200, Klaus Jensen wrote:
> From: Klaus Jensen <k.jensen@samsung.com>
> 
> This partially reverts commit 98f84f5a4eca5c03e32fff20f246d9b4b96d6422.
> 
> Since all "multi aio" commands are now reimplemented to properly track
> the nested aiocbs, we can revert the "hack" that was introduced to make
> sure all requests we're properly drained upon sq deletion.
> 
> The revert is partial since we keep the assert that no outstanding
> requests remain on the submission queue after the explicit cancellation.
> 
> Signed-off-by: Klaus Jensen <k.jensen@samsung.com>
> ---
>  hw/nvme/ctrl.c | 19 ++-----------------
>  1 file changed, 2 insertions(+), 17 deletions(-)
> 
> diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c
> index ec8ddb76cd39..5a1d25265a9d 100644
> --- a/hw/nvme/ctrl.c
> +++ b/hw/nvme/ctrl.c
> @@ -3918,7 +3918,6 @@ static uint16_t nvme_del_sq(NvmeCtrl *n, NvmeRequest *req)
>      NvmeSQueue *sq;
>      NvmeCQueue *cq;
>      uint16_t qid = le16_to_cpu(c->qid);
> -    uint32_t nsid;
>  
>      if (unlikely(!qid || nvme_check_sqid(n, qid))) {
>          trace_pci_nvme_err_invalid_del_sq(qid);
> @@ -3930,22 +3929,8 @@ static uint16_t nvme_del_sq(NvmeCtrl *n, NvmeRequest *req)
>      sq = n->sq[qid];
>      while (!QTAILQ_EMPTY(&sq->out_req_list)) {
>          r = QTAILQ_FIRST(&sq->out_req_list);
> -        if (r->aiocb) {
> -            blk_aio_cancel(r->aiocb);
> -        }
> -    }
> -
> -    /*
> -     * Drain all namespaces if there are still outstanding requests that we
> -     * could not cancel explicitly.
> -     */
> -    if (!QTAILQ_EMPTY(&sq->out_req_list)) {

Quick question: was this 'if' ever even possible to hit? The preceding
'while' loop doesn't break out until the queue is empty, so this should
have been unreachable.

> -        for (nsid = 1; nsid <= NVME_MAX_NAMESPACES; nsid++) {
> -            NvmeNamespace *ns = nvme_ns(n, nsid);
> -            if (ns) {
> -                nvme_ns_drain(ns);
> -            }
> -        }
> +        assert(r->aiocb);
> +        blk_aio_cancel(r->aiocb);
>      }
>  
>      assert(QTAILQ_EMPTY(&sq->out_req_list));
> -- 


^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH v2 11/11] Partially revert "hw/block/nvme: drain namespaces on sq deletion"
  2021-06-28 16:00   ` Keith Busch
@ 2021-06-28 16:33     ` Klaus Jensen
  0 siblings, 0 replies; 16+ messages in thread
From: Klaus Jensen @ 2021-06-28 16:33 UTC (permalink / raw)
  To: Keith Busch
  Cc: Fam Zheng, Kevin Wolf, Vladimir Sementsov-Ogievskiy, qemu-block,
	Klaus Jensen, qemu-devel, Max Reitz, Stefan Hajnoczi

[-- Attachment #1: Type: text/plain, Size: 2393 bytes --]

On Jun 28 09:00, Keith Busch wrote:
>On Thu, Jun 17, 2021 at 09:06:57PM +0200, Klaus Jensen wrote:
>> From: Klaus Jensen <k.jensen@samsung.com>
>>
>> This partially reverts commit 98f84f5a4eca5c03e32fff20f246d9b4b96d6422.
>>
>> Since all "multi aio" commands are now reimplemented to properly track
>> the nested aiocbs, we can revert the "hack" that was introduced to make
>> sure all requests we're properly drained upon sq deletion.
>>
>> The revert is partial since we keep the assert that no outstanding
>> requests remain on the submission queue after the explicit cancellation.
>>
>> Signed-off-by: Klaus Jensen <k.jensen@samsung.com>
>> ---
>>  hw/nvme/ctrl.c | 19 ++-----------------
>>  1 file changed, 2 insertions(+), 17 deletions(-)
>>
>> diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c
>> index ec8ddb76cd39..5a1d25265a9d 100644
>> --- a/hw/nvme/ctrl.c
>> +++ b/hw/nvme/ctrl.c
>> @@ -3918,7 +3918,6 @@ static uint16_t nvme_del_sq(NvmeCtrl *n, NvmeRequest *req)
>>      NvmeSQueue *sq;
>>      NvmeCQueue *cq;
>>      uint16_t qid = le16_to_cpu(c->qid);
>> -    uint32_t nsid;
>>
>>      if (unlikely(!qid || nvme_check_sqid(n, qid))) {
>>          trace_pci_nvme_err_invalid_del_sq(qid);
>> @@ -3930,22 +3929,8 @@ static uint16_t nvme_del_sq(NvmeCtrl *n, NvmeRequest *req)
>>      sq = n->sq[qid];
>>      while (!QTAILQ_EMPTY(&sq->out_req_list)) {
>>          r = QTAILQ_FIRST(&sq->out_req_list);
>> -        if (r->aiocb) {
>> -            blk_aio_cancel(r->aiocb);
>> -        }
>> -    }
>> -
>> -    /*
>> -     * Drain all namespaces if there are still outstanding requests that we
>> -     * could not cancel explicitly.
>> -     */
>> -    if (!QTAILQ_EMPTY(&sq->out_req_list)) {
>
>Quick question: was this 'if' ever even possible to hit? The preceding
>'while' loop doesn't break out until the queue is empty, so this should
>have been unreachable.
>

Hi Keith,

Good question ;) But yes. The requirement of that 'if' is the primary 
motivation for this series. The problem is that some commands (zone 
reset, copy, aka the commands reimplemented in this series) would not 
track the AIOCB in r->aiocb, so there would be no way to cancel/wait for 
them.

See 98f84f5a4eca ("hw/block/nvme: drain namespaces on sq deletion") for 
a more elaborate description of the issue and the bandaid that the above 
fix was.

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 488 bytes --]

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH v2 00/11] hw/nvme: reimplement all multi-aio commands with custom aiocbs
  2021-06-17 19:06 [PATCH v2 00/11] hw/nvme: reimplement all multi-aio commands with custom aiocbs Klaus Jensen
                   ` (11 preceding siblings ...)
  2021-06-25 18:47 ` [PATCH v2 00/11] hw/nvme: reimplement all multi-aio commands with custom aiocbs Keith Busch
@ 2021-06-28 17:08 ` Klaus Jensen
  12 siblings, 0 replies; 16+ messages in thread
From: Klaus Jensen @ 2021-06-28 17:08 UTC (permalink / raw)
  To: qemu-devel
  Cc: Fam Zheng, Kevin Wolf, Vladimir Sementsov-Ogievskiy, qemu-block,
	Klaus Jensen, Max Reitz, Stefan Hajnoczi, Keith Busch

[-- Attachment #1: Type: text/plain, Size: 1656 bytes --]

On Jun 17 21:06, Klaus Jensen wrote:
>From: Klaus Jensen <k.jensen@samsung.com>
>
>This series reimplements flush, dsm, copy, zone reset and format nvm to
>allow cancellation. I posted an RFC back in March ("hw/block/nvme:
>convert ad-hoc aio tracking to aiocb") and I've applied some feedback
>from Stefan and reimplemented the remaining commands.
>
>The basic idea is to define custom AIOCBs for these commands. The custom
>AIOCB takes care of issuing all the "nested" AIOs one by one instead of
>blindly sending them off simultaneously without tracking the returned
>aiocbs.
>
>v2:
>  - dropped RFC
>  - fixed flush cancel from being unintentially a noop (Vladimir)
>
>Klaus Jensen (11):
>  hw/nvme: reimplement flush to allow cancellation
>  hw/nvme: add nvme_block_status_all helper
>  hw/nvme: reimplement dsm to allow cancellation
>  hw/nvme: save reftag when generating pi
>  hw/nvme: remove assert from nvme_get_zone_by_slba
>  hw/nvme: use prinfo directly in nvme_check_prinfo and nvme_dif_check
>  hw/nvme: add dw0/1 to the req completion trace event
>  hw/nvme: reimplement the copy command to allow aio cancellation
>  hw/nvme: reimplement zone reset to allow cancellation
>  hw/nvme: reimplement format nvm to allow cancellation
>  Partially revert "hw/block/nvme: drain namespaces on sq deletion"
>
> hw/nvme/nvme.h       |   10 +-
> include/block/nvme.h |    8 +
> hw/nvme/ctrl.c       | 1883 ++++++++++++++++++++++++------------------
> hw/nvme/dif.c        |   64 +-
> hw/nvme/trace-events |   21 +-
> 5 files changed, 1124 insertions(+), 862 deletions(-)
>
>-- 
>2.32.0
>

Applied to nvme-next.

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 488 bytes --]

^ permalink raw reply	[flat|nested] 16+ messages in thread

end of thread, other threads:[~2021-06-28 17:17 UTC | newest]

Thread overview: 16+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-06-17 19:06 [PATCH v2 00/11] hw/nvme: reimplement all multi-aio commands with custom aiocbs Klaus Jensen
2021-06-17 19:06 ` [PATCH v2 01/11] hw/nvme: reimplement flush to allow cancellation Klaus Jensen
2021-06-17 19:06 ` [PATCH v2 02/11] hw/nvme: add nvme_block_status_all helper Klaus Jensen
2021-06-17 19:06 ` [PATCH v2 03/11] hw/nvme: reimplement dsm to allow cancellation Klaus Jensen
2021-06-17 19:06 ` [PATCH v2 04/11] hw/nvme: save reftag when generating pi Klaus Jensen
2021-06-17 19:06 ` [PATCH v2 05/11] hw/nvme: remove assert from nvme_get_zone_by_slba Klaus Jensen
2021-06-17 19:06 ` [PATCH v2 06/11] hw/nvme: use prinfo directly in nvme_check_prinfo and nvme_dif_check Klaus Jensen
2021-06-17 19:06 ` [PATCH v2 07/11] hw/nvme: add dw0/1 to the req completion trace event Klaus Jensen
2021-06-17 19:06 ` [PATCH v2 08/11] hw/nvme: reimplement the copy command to allow aio cancellation Klaus Jensen
2021-06-17 19:06 ` [PATCH v2 09/11] hw/nvme: reimplement zone reset to allow cancellation Klaus Jensen
2021-06-17 19:06 ` [PATCH v2 10/11] hw/nvme: reimplement format nvm " Klaus Jensen
2021-06-17 19:06 ` [PATCH v2 11/11] Partially revert "hw/block/nvme: drain namespaces on sq deletion" Klaus Jensen
2021-06-28 16:00   ` Keith Busch
2021-06-28 16:33     ` Klaus Jensen
2021-06-25 18:47 ` [PATCH v2 00/11] hw/nvme: reimplement all multi-aio commands with custom aiocbs Keith Busch
2021-06-28 17:08 ` Klaus Jensen

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.