From: Klaus Jensen <its@irrelevant.dk>
To: qemu-devel@nongnu.org
Cc: Fam Zheng <fam@euphon.net>, Kevin Wolf <kwolf@redhat.com>,
	Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>,
	qemu-block@nongnu.org, Klaus Jensen <k.jensen@samsung.com>,
	Max Reitz <mreitz@redhat.com>, Keith Busch <kbusch@kernel.org>,
	Stefan Hajnoczi <stefanha@redhat.com>,
	Klaus Jensen <its@irrelevant.dk>
Subject: [PATCH v2 03/11] hw/nvme: reimplement dsm to allow cancellation
Date: Thu, 17 Jun 2021 21:06:49 +0200
Message-ID: <20210617190657.110823-4-its@irrelevant.dk>
In-Reply-To: <20210617190657.110823-1-its@irrelevant.dk>

From: Klaus Jensen <k.jensen@samsung.com>

Prior to this patch, a loop was used to issue multiple "fire and forget"
aios for each range in the command. Without a reference to the aiocb
returned from the blk_aio_pdiscard calls, the aios cannot be canceled.

Fix this by processing the ranges one after another.
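
To sketch the idea outside of QEMU (all names below are made up for
illustration; this is not the QEMU block layer API): a single job object
walks the ranges, keeps a reference to the one request in flight, and
lets each completion callback issue the next range, so cancellation
always has exactly one handle to act on.

  #include <errno.h>
  #include <stdbool.h>
  #include <stdio.h>

  typedef struct Range {
      unsigned long long slba;
      unsigned int nlb;
  } Range;

  typedef struct DsmJob {
      Range *ranges;
      unsigned int nr;   /* total number of ranges in the command */
      unsigned int idx;  /* next range to issue */
      bool inflight;     /* stand-in for the reference to the pending aiocb */
      int ret;
  } DsmJob;

  /* Stand-in for an asynchronous discard submission. */
  static void aio_discard(const Range *r)
  {
      printf("discard slba=%llu nlb=%u\n", r->slba, r->nlb);
  }

  /* Issue the next range, or complete the command if none are left. */
  static void dsm_issue_next(DsmJob *job)
  {
      if (job->idx == job->nr) {
          job->inflight = false;
          printf("command complete, status %d\n", job->ret);
          return;
      }
      job->inflight = true;
      aio_discard(&job->ranges[job->idx++]);
  }

  /* Per-range completion callback: record errors and chain to the next. */
  static void dsm_range_done(DsmJob *job, int ret)
  {
      if (ret < 0) {
          job->ret = ret;
          job->idx = job->nr;   /* skip the remaining ranges */
      }
      dsm_issue_next(job);
  }

  /*
   * Cancellation only ever has one outstanding request to worry about;
   * the real code would additionally cancel the in-flight aiocb here.
   */
  static void dsm_cancel(DsmJob *job)
  {
      job->idx = job->nr;       /* break the chain */
      job->ret = -ECANCELED;
  }

  int main(void)
  {
      Range ranges[] = { { 0, 8 }, { 16, 8 }, { 64, 32 } };
      DsmJob job = { .ranges = ranges, .nr = 3 };

      dsm_issue_next(&job);     /* issue the first range */
      dsm_range_done(&job, 0);  /* backend completes it; second range issued */
      dsm_cancel(&job);         /* cancel while the second range is in flight */
      dsm_range_done(&job, 0);  /* its completion now finishes the command */

      return 0;
  }

With at most one outstanding request per command, req->aiocb can refer
to something meaningful for the whole lifetime of the command.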

As a bonus, this fixes how metadata is cleared (i.e. we only zero it out
if the data was successfully discarded).
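
Since a backend is free to ignore a discard, the data has to be checked
before the metadata is wiped. A toy sketch of that rule (plain in-memory
buffers and made-up helpers, nothing from the QEMU block layer):

  #include <stdbool.h>
  #include <stdint.h>
  #include <stdio.h>
  #include <string.h>

  #define NLB  8     /* logical blocks in the range */
  #define LBSZ 512   /* data bytes per block */
  #define MSZ  16    /* metadata bytes per block */

  static uint8_t data[NLB * LBSZ];
  static uint8_t meta[NLB * MSZ];

  /* Stand-in for a block status query: does the range read back as zeroes? */
  static bool reads_as_zero(const uint8_t *buf, size_t len)
  {
      for (size_t i = 0; i < len; i++) {
          if (buf[i]) {
              return false;
          }
      }
      return true;
  }

  /* 'honoured' models whether the backend actually discarded the blocks. */
  static void deallocate_range(bool honoured)
  {
      if (honoured) {
          memset(data, 0, sizeof(data));
      }

      /* Only wipe the metadata if the data really was zeroed. */
      if (reads_as_zero(data, sizeof(data))) {
          memset(meta, 0, sizeof(meta));
      }
  }

  int main(void)
  {
      memset(data, 0xaa, sizeof(data));
      memset(meta, 0x55, sizeof(meta));

      deallocate_range(false);   /* discard ignored: metadata must survive */
      printf("metadata cleared: %s\n",
             reads_as_zero(meta, sizeof(meta)) ? "yes" : "no");

      deallocate_range(true);    /* discard honoured: metadata goes too */
      printf("metadata cleared: %s\n",
             reads_as_zero(meta, sizeof(meta)) ? "yes" : "no");

      return 0;
  }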

Signed-off-by: Klaus Jensen <k.jensen@samsung.com>
---
 hw/nvme/ctrl.c       | 234 +++++++++++++++++++++++++++++--------------
 hw/nvme/trace-events |   4 +-
 2 files changed, 163 insertions(+), 75 deletions(-)

diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c
index 2e36349ced28..1626ed29c76b 100644
--- a/hw/nvme/ctrl.c
+++ b/hw/nvme/ctrl.c
@@ -2015,26 +2015,6 @@ out:
     nvme_verify_cb(ctx, ret);
 }
 
-static void nvme_aio_discard_cb(void *opaque, int ret)
-{
-    NvmeRequest *req = opaque;
-    uintptr_t *discards = (uintptr_t *)&req->opaque;
-
-    trace_pci_nvme_aio_discard_cb(nvme_cid(req));
-
-    if (ret) {
-        nvme_aio_err(req, ret);
-    }
-
-    (*discards)--;
-
-    if (*discards) {
-        return;
-    }
-
-    nvme_enqueue_req_completion(nvme_cq(req), req);
-}
-
 struct nvme_zone_reset_ctx {
     NvmeRequest *req;
     NvmeZone    *zone;
@@ -2495,75 +2475,183 @@ out:
     nvme_enqueue_req_completion(nvme_cq(req), req);
 }
 
+typedef struct NvmeDSMAIOCB {
+    BlockAIOCB common;
+    BlockAIOCB *aiocb;
+    NvmeRequest *req;
+    QEMUBH *bh;
+    int ret;
+
+    NvmeDsmRange *range;
+    unsigned int nr;
+    unsigned int idx;
+} NvmeDSMAIOCB;
+
+static void nvme_dsm_cancel(BlockAIOCB *aiocb)
+{
+    NvmeDSMAIOCB *iocb = container_of(aiocb, NvmeDSMAIOCB, common);
+
+    /* break nvme_dsm_cb loop */
+    iocb->idx = iocb->nr;
+    iocb->ret = -ECANCELED;
+
+    if (iocb->aiocb) {
+        blk_aio_cancel_async(iocb->aiocb);
+        iocb->aiocb = NULL;
+    } else {
+        /*
+         * We only reach this if nvme_dsm_cancel() has already been called or
+         * the command ran to completion and nvme_dsm_bh is scheduled to run.
+         */
+        assert(iocb->idx == iocb->nr);
+    }
+}
+
+static const AIOCBInfo nvme_dsm_aiocb_info = {
+    .aiocb_size   = sizeof(NvmeDSMAIOCB),
+    .cancel_async = nvme_dsm_cancel,
+};
+
+static void nvme_dsm_bh(void *opaque)
+{
+    NvmeDSMAIOCB *iocb = opaque;
+
+    iocb->common.cb(iocb->common.opaque, iocb->ret);
+
+    qemu_bh_delete(iocb->bh);
+    iocb->bh = NULL;
+    qemu_aio_unref(iocb);
+}
+
+static void nvme_dsm_cb(void *opaque, int ret);
+
+static void nvme_dsm_md_cb(void *opaque, int ret)
+{
+    NvmeDSMAIOCB *iocb = opaque;
+    NvmeRequest *req = iocb->req;
+    NvmeNamespace *ns = req->ns;
+    NvmeDsmRange *range;
+    uint64_t slba;
+    uint32_t nlb;
+
+    if (ret < 0) {
+        iocb->ret = ret;
+        goto done;
+    }
+
+    if (!ns->lbaf.ms) {
+        nvme_dsm_cb(iocb, 0);
+        return;
+    }
+
+    range = &iocb->range[iocb->idx - 1];
+    slba = le64_to_cpu(range->slba);
+    nlb = le32_to_cpu(range->nlb);
+
+    /*
+     * Check that all blocks were discarded (zeroed); otherwise we do not zero
+     * the metadata.
+     */
+
+    ret = nvme_block_status_all(ns, slba, nlb, BDRV_BLOCK_ZERO);
+    if (ret) {
+        if (ret < 0) {
+            iocb->ret = ret;
+            goto done;
+        }
+
+        nvme_dsm_cb(iocb, 0);
+        return;
+    }
+
+    iocb->aiocb = blk_aio_pwrite_zeroes(ns->blkconf.blk, nvme_moff(ns, slba),
+                                        nvme_m2b(ns, nlb), BDRV_REQ_MAY_UNMAP,
+                                        nvme_dsm_cb, iocb);
+    return;
+
+done:
+    iocb->aiocb = NULL;
+    qemu_bh_schedule(iocb->bh);
+}
+
+static void nvme_dsm_cb(void *opaque, int ret)
+{
+    NvmeDSMAIOCB *iocb = opaque;
+    NvmeRequest *req = iocb->req;
+    NvmeCtrl *n = nvme_ctrl(req);
+    NvmeNamespace *ns = req->ns;
+    NvmeDsmRange *range;
+    uint64_t slba;
+    uint32_t nlb;
+
+    if (ret < 0) {
+        iocb->ret = ret;
+        goto done;
+    }
+
+next:
+    if (iocb->idx == iocb->nr) {
+        goto done;
+    }
+
+    range = &iocb->range[iocb->idx++];
+    slba = le64_to_cpu(range->slba);
+    nlb = le32_to_cpu(range->nlb);
+
+    trace_pci_nvme_dsm_deallocate(slba, nlb);
+
+    if (nlb > n->dmrsl) {
+        trace_pci_nvme_dsm_single_range_limit_exceeded(nlb, n->dmrsl);
+        goto next;
+    }
+
+    if (nvme_check_bounds(ns, slba, nlb)) {
+        trace_pci_nvme_err_invalid_lba_range(slba, nlb,
+                                             ns->id_ns.nsze);
+        goto next;
+    }
+
+    iocb->aiocb = blk_aio_pdiscard(ns->blkconf.blk, nvme_l2b(ns, slba),
+                                   nvme_l2b(ns, nlb),
+                                   nvme_dsm_md_cb, iocb);
+    return;
+
+done:
+    iocb->aiocb = NULL;
+    qemu_bh_schedule(iocb->bh);
+}
+
 static uint16_t nvme_dsm(NvmeCtrl *n, NvmeRequest *req)
 {
     NvmeNamespace *ns = req->ns;
     NvmeDsmCmd *dsm = (NvmeDsmCmd *) &req->cmd;
-
     uint32_t attr = le32_to_cpu(dsm->attributes);
     uint32_t nr = (le32_to_cpu(dsm->nr) & 0xff) + 1;
-
     uint16_t status = NVME_SUCCESS;
 
-    trace_pci_nvme_dsm(nvme_cid(req), nvme_nsid(ns), nr, attr);
+    trace_pci_nvme_dsm(nr, attr);
 
     if (attr & NVME_DSMGMT_AD) {
-        int64_t offset;
-        size_t len;
-        NvmeDsmRange range[nr];
-        uintptr_t *discards = (uintptr_t *)&req->opaque;
+        NvmeDSMAIOCB *iocb = blk_aio_get(&nvme_dsm_aiocb_info, ns->blkconf.blk,
+                                         nvme_misc_cb, req);
 
-        status = nvme_h2c(n, (uint8_t *)range, sizeof(range), req);
+        iocb->req = req;
+        iocb->bh = qemu_bh_new(nvme_dsm_bh, iocb);
+        iocb->ret = 0;
+        iocb->range = g_new(NvmeDsmRange, nr);
+        iocb->nr = nr;
+        iocb->idx = 0;
+
+        status = nvme_h2c(n, (uint8_t *)iocb->range, sizeof(NvmeDsmRange) * nr,
+                          req);
         if (status) {
             return status;
         }
 
-        /*
-         * AIO callbacks may be called immediately, so initialize discards to 1
-         * to make sure the the callback does not complete the request before
-         * all discards have been issued.
-         */
-        *discards = 1;
+        req->aiocb = &iocb->common;
+        nvme_dsm_cb(iocb, 0);
 
-        for (int i = 0; i < nr; i++) {
-            uint64_t slba = le64_to_cpu(range[i].slba);
-            uint32_t nlb = le32_to_cpu(range[i].nlb);
-
-            if (nvme_check_bounds(ns, slba, nlb)) {
-                continue;
-            }
-
-            trace_pci_nvme_dsm_deallocate(nvme_cid(req), nvme_nsid(ns), slba,
-                                          nlb);
-
-            if (nlb > n->dmrsl) {
-                trace_pci_nvme_dsm_single_range_limit_exceeded(nlb, n->dmrsl);
-            }
-
-            offset = nvme_l2b(ns, slba);
-            len = nvme_l2b(ns, nlb);
-
-            while (len) {
-                size_t bytes = MIN(BDRV_REQUEST_MAX_BYTES, len);
-
-                (*discards)++;
-
-                blk_aio_pdiscard(ns->blkconf.blk, offset, bytes,
-                                 nvme_aio_discard_cb, req);
-
-                offset += bytes;
-                len -= bytes;
-            }
-        }
-
-        /* account for the 1-initialization */
-        (*discards)--;
-
-        if (*discards) {
-            status = NVME_NO_COMPLETE;
-        } else {
-            status = req->status;
-        }
+        return NVME_NO_COMPLETE;
     }
 
     return status;
diff --git a/hw/nvme/trace-events b/hw/nvme/trace-events
index ce6b6ffe9604..eea4e31e46c4 100644
--- a/hw/nvme/trace-events
+++ b/hw/nvme/trace-events
@@ -37,8 +37,8 @@ pci_nvme_verify_mdata_in_cb(uint16_t cid, const char *blkname) "cid %"PRIu16" bl
 pci_nvme_verify_cb(uint16_t cid, uint8_t prinfo, uint16_t apptag, uint16_t appmask, uint32_t reftag) "cid %"PRIu16" prinfo 0x%"PRIx8" apptag 0x%"PRIx16" appmask 0x%"PRIx16" reftag 0x%"PRIx32""
 pci_nvme_rw_complete_cb(uint16_t cid, const char *blkname) "cid %"PRIu16" blk '%s'"
 pci_nvme_block_status(int64_t offset, int64_t bytes, int64_t pnum, int ret, bool zeroed) "offset %"PRId64" bytes %"PRId64" pnum %"PRId64" ret 0x%x zeroed %d"
-pci_nvme_dsm(uint16_t cid, uint32_t nsid, uint32_t nr, uint32_t attr) "cid %"PRIu16" nsid %"PRIu32" nr %"PRIu32" attr 0x%"PRIx32""
-pci_nvme_dsm_deallocate(uint16_t cid, uint32_t nsid, uint64_t slba, uint32_t nlb) "cid %"PRIu16" nsid %"PRIu32" slba %"PRIu64" nlb %"PRIu32""
+pci_nvme_dsm(uint32_t nr, uint32_t attr) "nr %"PRIu32" attr 0x%"PRIx32""
+pci_nvme_dsm_deallocate(uint64_t slba, uint32_t nlb) "slba %"PRIu64" nlb %"PRIu32""
 pci_nvme_dsm_single_range_limit_exceeded(uint32_t nlb, uint32_t dmrsl) "nlb %"PRIu32" dmrsl %"PRIu32""
 pci_nvme_compare(uint16_t cid, uint32_t nsid, uint64_t slba, uint32_t nlb) "cid %"PRIu16" nsid %"PRIu32" slba 0x%"PRIx64" nlb %"PRIu32""
 pci_nvme_compare_data_cb(uint16_t cid) "cid %"PRIu16""
-- 
2.32.0


