All of lore.kernel.org
 help / color / mirror / Atom feed
From: Klaus Jensen <its@irrelevant.dk>
To: qemu-devel@nongnu.org, Peter Maydell <peter.maydell@linaro.org>
Cc: Fam Zheng <fam@euphon.net>, Kevin Wolf <kwolf@redhat.com>,
	qemu-block@nongnu.org, Klaus Jensen <k.jensen@samsung.com>,
	Max Reitz <mreitz@redhat.com>, Klaus Jensen <its@irrelevant.dk>,
	Stefan Hajnoczi <stefanha@redhat.com>,
	Keith Busch <kbusch@kernel.org>
Subject: [PULL 05/56] hw/block/nvme: add the dataset management command
Date: Tue,  9 Feb 2021 08:30:10 +0100	[thread overview]
Message-ID: <20210209073101.548811-6-its@irrelevant.dk> (raw)
In-Reply-To: <20210209073101.548811-1-its@irrelevant.dk>

From: Klaus Jensen <k.jensen@samsung.com>

Add support for the Dataset Management command and the Deallocate
attribute. Deallocation results in discards being sent to the underlying
block device. Whether of not the blocks are actually deallocated is
affected by the same factors as Write Zeroes (see previous commit).

     format | discard | dsm (512B)  dsm (4KiB)  dsm (64KiB)
    --------------------------------------------------------
      qcow2    ignore   n           n           n
      qcow2    unmap    n           n           y
      raw      ignore   n           n           n
      raw      unmap    n           y           y

Again, a raw format and 4KiB LBAs are preferable.

In order to set the Namespace Preferred Deallocate Granularity and
Alignment fields (NPDG and NPDA), choose a sane minimum discard
granularity of 4KiB. If we are using a passthru device supporting
discard at a 512B granularity, user should set the discard_granularity
property explicitly. NPDG and NPDA will also account for the
cluster_size of the block driver if required (i.e. for QCOW2).

See NVM Express 1.3d, Section 6.7 ("Dataset Management command").

Signed-off-by: Klaus Jensen <k.jensen@samsung.com>
Reviewed-by: Keith Busch <kbusch@kernel.org>
---
 hw/block/nvme.h    |  2 +
 hw/block/nvme-ns.c | 30 ++++++++++++--
 hw/block/nvme.c    | 98 +++++++++++++++++++++++++++++++++++++++++++++-
 3 files changed, 125 insertions(+), 5 deletions(-)

diff --git a/hw/block/nvme.h b/hw/block/nvme.h
index e080a2318a50..574333caa3f9 100644
--- a/hw/block/nvme.h
+++ b/hw/block/nvme.h
@@ -28,6 +28,7 @@ typedef struct NvmeRequest {
     struct NvmeNamespace    *ns;
     BlockAIOCB              *aiocb;
     uint16_t                status;
+    void                    *opaque;
     NvmeCqe                 cqe;
     NvmeCmd                 cmd;
     BlockAcctCookie         acct;
@@ -60,6 +61,7 @@ static inline const char *nvme_io_opc_str(uint8_t opc)
     case NVME_CMD_WRITE:            return "NVME_NVM_CMD_WRITE";
     case NVME_CMD_READ:             return "NVME_NVM_CMD_READ";
     case NVME_CMD_WRITE_ZEROES:     return "NVME_NVM_CMD_WRITE_ZEROES";
+    case NVME_CMD_DSM:              return "NVME_NVM_CMD_DSM";
     default:                        return "NVME_NVM_CMD_UNKNOWN";
     }
 }
diff --git a/hw/block/nvme-ns.c b/hw/block/nvme-ns.c
index 53ded460348e..37f95951a6b8 100644
--- a/hw/block/nvme-ns.c
+++ b/hw/block/nvme-ns.c
@@ -28,10 +28,14 @@
 #include "nvme.h"
 #include "nvme-ns.h"
 
-static void nvme_ns_init(NvmeNamespace *ns)
+#define MIN_DISCARD_GRANULARITY (4 * KiB)
+
+static int nvme_ns_init(NvmeNamespace *ns, Error **errp)
 {
+    BlockDriverInfo bdi;
     NvmeIdNs *id_ns = &ns->id_ns;
     int lba_index = NVME_ID_NS_FLBAS_INDEX(ns->id_ns.flbas);
+    int npdg;
 
     ns->id_ns.dlfeat = 0x9;
 
@@ -43,8 +47,19 @@ static void nvme_ns_init(NvmeNamespace *ns)
     id_ns->ncap = id_ns->nsze;
     id_ns->nuse = id_ns->ncap;
 
-    /* support DULBE */
-    id_ns->nsfeat |= 0x4;
+    /* support DULBE and I/O optimization fields */
+    id_ns->nsfeat |= (0x4 | 0x10);
+
+    npdg = ns->blkconf.discard_granularity / ns->blkconf.logical_block_size;
+
+    if (bdrv_get_info(blk_bs(ns->blkconf.blk), &bdi) >= 0 &&
+        bdi.cluster_size > ns->blkconf.discard_granularity) {
+        npdg = bdi.cluster_size / ns->blkconf.logical_block_size;
+    }
+
+    id_ns->npda = id_ns->npdg = npdg - 1;
+
+    return 0;
 }
 
 static int nvme_ns_init_blk(NvmeCtrl *n, NvmeNamespace *ns, Error **errp)
@@ -60,6 +75,11 @@ static int nvme_ns_init_blk(NvmeCtrl *n, NvmeNamespace *ns, Error **errp)
         return -1;
     }
 
+    if (ns->blkconf.discard_granularity == -1) {
+        ns->blkconf.discard_granularity =
+            MAX(ns->blkconf.logical_block_size, MIN_DISCARD_GRANULARITY);
+    }
+
     ns->size = blk_getlength(ns->blkconf.blk);
     if (ns->size < 0) {
         error_setg_errno(errp, -ns->size, "could not get blockdev size");
@@ -93,7 +113,9 @@ int nvme_ns_setup(NvmeCtrl *n, NvmeNamespace *ns, Error **errp)
         return -1;
     }
 
-    nvme_ns_init(ns);
+    if (nvme_ns_init(ns, errp)) {
+        return -1;
+    }
 
     if (nvme_register_namespace(n, ns, errp)) {
         return -1;
diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index 6e6bdb338ad7..f019d43788ac 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -242,6 +242,7 @@ static void nvme_irq_deassert(NvmeCtrl *n, NvmeCQueue *cq)
 static void nvme_req_clear(NvmeRequest *req)
 {
     req->ns = NULL;
+    req->opaque = NULL;
     memset(&req->cqe, 0x0, sizeof(req->cqe));
     req->status = NVME_SUCCESS;
 }
@@ -978,6 +979,99 @@ static void nvme_rw_cb(void *opaque, int ret)
     nvme_enqueue_req_completion(nvme_cq(req), req);
 }
 
+static void nvme_aio_discard_cb(void *opaque, int ret)
+{
+    NvmeRequest *req = opaque;
+    uintptr_t *discards = (uintptr_t *)&req->opaque;
+
+    trace_pci_nvme_aio_discard_cb(nvme_cid(req));
+
+    if (ret) {
+        nvme_aio_err(req, ret);
+    }
+
+    (*discards)--;
+
+    if (*discards) {
+        return;
+    }
+
+    nvme_enqueue_req_completion(nvme_cq(req), req);
+}
+
+static uint16_t nvme_dsm(NvmeCtrl *n, NvmeRequest *req)
+{
+    NvmeNamespace *ns = req->ns;
+    NvmeDsmCmd *dsm = (NvmeDsmCmd *) &req->cmd;
+
+    uint32_t attr = le32_to_cpu(dsm->attributes);
+    uint32_t nr = (le32_to_cpu(dsm->nr) & 0xff) + 1;
+
+    uint16_t status = NVME_SUCCESS;
+
+    trace_pci_nvme_dsm(nvme_cid(req), nvme_nsid(ns), nr, attr);
+
+    if (attr & NVME_DSMGMT_AD) {
+        int64_t offset;
+        size_t len;
+        NvmeDsmRange range[nr];
+        uintptr_t *discards = (uintptr_t *)&req->opaque;
+
+        status = nvme_dma(n, (uint8_t *)range, sizeof(range),
+                          DMA_DIRECTION_TO_DEVICE, req);
+        if (status) {
+            return status;
+        }
+
+        /*
+         * AIO callbacks may be called immediately, so initialize discards to 1
+         * to make sure the the callback does not complete the request before
+         * all discards have been issued.
+         */
+        *discards = 1;
+
+        for (int i = 0; i < nr; i++) {
+            uint64_t slba = le64_to_cpu(range[i].slba);
+            uint32_t nlb = le32_to_cpu(range[i].nlb);
+
+            if (nvme_check_bounds(ns, slba, nlb)) {
+                trace_pci_nvme_err_invalid_lba_range(slba, nlb,
+                                                     ns->id_ns.nsze);
+                continue;
+            }
+
+            trace_pci_nvme_dsm_deallocate(nvme_cid(req), nvme_nsid(ns), slba,
+                                          nlb);
+
+            offset = nvme_l2b(ns, slba);
+            len = nvme_l2b(ns, nlb);
+
+            while (len) {
+                size_t bytes = MIN(BDRV_REQUEST_MAX_BYTES, len);
+
+                (*discards)++;
+
+                blk_aio_pdiscard(ns->blkconf.blk, offset, bytes,
+                                 nvme_aio_discard_cb, req);
+
+                offset += bytes;
+                len -= bytes;
+            }
+        }
+
+        /* account for the 1-initialization */
+        (*discards)--;
+
+        if (*discards) {
+            status = NVME_NO_COMPLETE;
+        } else {
+            status = req->status;
+        }
+    }
+
+    return status;
+}
+
 static uint16_t nvme_flush(NvmeCtrl *n, NvmeRequest *req)
 {
     block_acct_start(blk_get_stats(req->ns->blkconf.blk), &req->acct, 0,
@@ -1107,6 +1201,8 @@ static uint16_t nvme_io_cmd(NvmeCtrl *n, NvmeRequest *req)
     case NVME_CMD_WRITE:
     case NVME_CMD_READ:
         return nvme_rw(n, req);
+    case NVME_CMD_DSM:
+        return nvme_dsm(n, req);
     default:
         trace_pci_nvme_err_invalid_opc(req->cmd.opcode);
         return NVME_INVALID_OPCODE | NVME_DNR;
@@ -2829,7 +2925,7 @@ static void nvme_init_ctrl(NvmeCtrl *n, PCIDevice *pci_dev)
     id->cqes = (0x4 << 4) | 0x4;
     id->nn = cpu_to_le32(n->num_namespaces);
     id->oncs = cpu_to_le16(NVME_ONCS_WRITE_ZEROES | NVME_ONCS_TIMESTAMP |
-                           NVME_ONCS_FEATURES);
+                           NVME_ONCS_FEATURES | NVME_ONCS_DSM);
 
     id->vwc = 0x1;
     id->sgls = cpu_to_le32(NVME_CTRL_SGLS_SUPPORT_NO_ALIGN |
-- 
2.30.0



  parent reply	other threads:[~2021-02-09  7:52 UTC|newest]

Thread overview: 62+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2021-02-09  7:30 [PULL 00/56] emulated nvme patches Klaus Jensen
2021-02-09  7:30 ` [PULL 01/56] hw/block/nvme: remove superfluous NvmeCtrl parameter Klaus Jensen
2021-02-09  7:30 ` [PULL 02/56] hw/block/nvme: pull aio error handling Klaus Jensen
2021-02-09  7:30 ` [PULL 03/56] hw/block/nvme: add dulbe support Klaus Jensen
2021-02-09  7:30 ` [PULL 04/56] nvme: add namespace I/O optimization fields to shared header Klaus Jensen
2021-02-09  7:30 ` Klaus Jensen [this message]
2021-02-09  7:30 ` [PULL 06/56] hw/block/nvme: add compare command Klaus Jensen
2021-02-09  7:30 ` [PULL 07/56] hw/block/nvme: fix bad clearing of CAP Klaus Jensen
2021-02-09  7:30 ` [PULL 08/56] hw/block/nvme: Process controller reset and shutdown differently Klaus Jensen
2021-02-09  7:30 ` [PULL 09/56] hw/block/nvme: Generate namespace UUIDs Klaus Jensen
2021-02-09  7:30 ` [PULL 10/56] hw/block/nvme: Separate read and write handlers Klaus Jensen
2021-02-09  7:30 ` [PULL 11/56] hw/block/nvme: Combine nvme_write_zeroes() and nvme_write() Klaus Jensen
2021-02-09  7:30 ` [PULL 12/56] hw/block/nvme: Add Commands Supported and Effects log Klaus Jensen
2021-02-09  7:30 ` [PULL 13/56] hw/block/nvme: Add support for Namespace Types Klaus Jensen
2021-02-09  7:30 ` [PULL 14/56] hw/block/nvme: Support allocated CNS command variants Klaus Jensen
2021-02-09  7:30 ` [PULL 15/56] nvme: Make ZNS-related definitions Klaus Jensen
2021-02-09  7:30 ` [PULL 16/56] hw/block/nvme: Support Zoned Namespace Command Set Klaus Jensen
2021-02-09  7:30 ` [PULL 17/56] hw/block/nvme: Introduce max active and open zone limits Klaus Jensen
2021-02-09  7:30 ` [PULL 18/56] hw/block/nvme: Support Zone Descriptor Extensions Klaus Jensen
2021-02-09  7:30 ` [PULL 19/56] hw/block/nvme: Document zoned parameters in usage text Klaus Jensen
2021-02-09  7:30 ` [PULL 20/56] hw/block/nvme: fix for non-msix machines Klaus Jensen
2021-02-09  7:30 ` [PULL 21/56] hw/block/nvme: conditionally enable DULBE for zoned namespaces Klaus Jensen
2021-02-09  7:30 ` [PULL 22/56] hw/block/nvme: fix shutdown/reset logic Klaus Jensen
2021-02-09  7:30 ` [PULL 23/56] hw/block/nvme: merge implicitly/explicitly opened processing masks Klaus Jensen
2021-02-09  7:30 ` [PULL 24/56] hw/block/nvme: enum style fix Klaus Jensen
2021-02-09  7:30 ` [PULL 25/56] hw/block/nvme: zero out zones on reset Klaus Jensen
2021-02-09  7:30 ` [PULL 26/56] hw/block/nvme: add missing string representations for commands Klaus Jensen
2021-02-09  7:30 ` [PULL 27/56] hw/block/nvme: remove unnecessary check for append Klaus Jensen
2021-02-09  7:30 ` [PULL 28/56] hw/block/nvme: Correct error status for unaligned ZA Klaus Jensen
2021-02-09  7:30 ` [PULL 29/56] hw/block/nvme: remove unused argument in nvme_ns_init_zoned Klaus Jensen
2021-02-09  7:30 ` [PULL 30/56] hw/block/nvme: open code for volatile write cache Klaus Jensen
2021-02-09  7:30 ` [PULL 31/56] hw/block/nvme: remove unused argument in nvme_ns_init_blk Klaus Jensen
2021-02-09  7:30 ` [PULL 32/56] hw/block/nvme: split setup and register for namespace Klaus Jensen
2021-02-11  9:53   ` Alexander Graf
2021-02-11 10:41     ` Klaus Jensen
2021-02-11 11:40     ` Philippe Mathieu-Daudé
2021-02-11 11:46       ` Klaus Jensen
2021-02-09  7:30 ` [PULL 33/56] hw/block/nvme: remove unused argument in nvme_ns_setup Klaus Jensen
2021-02-09  7:30 ` [PULL 34/56] hw/block/nvme: fix zone write finalize Klaus Jensen
2021-02-09  7:30 ` [PULL 35/56] nvme: introduce bit 5 for critical warning Klaus Jensen
2021-02-09  7:30 ` [PULL 36/56] hw/block/nvme: add smart_critical_warning property Klaus Jensen
2021-02-09  7:30 ` [PULL 37/56] hw/block/nvme: trigger async event during injecting smart warning Klaus Jensen
2021-02-09  7:30 ` [PULL 38/56] hw/block/nvme: add size to mmio read/write trace events Klaus Jensen
2021-02-09  7:30 ` [PULL 39/56] hw/block/nvme: fix 64 bit register hi/lo split writes Klaus Jensen
2021-02-09  7:30 ` [PULL 40/56] hw/block/nvme: indicate CMB support through controller capabilities register Klaus Jensen
2021-02-09  7:30 ` [PULL 41/56] hw/block/nvme: move msix table and pba to BAR 0 Klaus Jensen
2021-02-09  7:30 ` [PULL 42/56] hw/block/nvme: allow cmb and pmr to coexist Klaus Jensen
2021-02-09  7:30 ` [PULL 43/56] hw/block/nvme: rename PMR/CMB shift/mask fields Klaus Jensen
2021-02-09  7:30 ` [PULL 44/56] hw/block/nvme: remove redundant zeroing of PMR registers Klaus Jensen
2021-02-09  7:30 ` [PULL 45/56] hw/block/nvme: disable PMR at boot up Klaus Jensen
2021-02-09  7:30 ` [PULL 46/56] hw/block/nvme: add PMR RDS/WDS support Klaus Jensen
2021-02-09  7:30 ` [PULL 47/56] hw/block/nvme: move cmb logic to v1.4 Klaus Jensen
2021-02-09  7:30 ` [PULL 48/56] hw/block/nvme: bump " Klaus Jensen
2021-02-09  7:30 ` [PULL 49/56] hw/block/nvme: lift cmb restrictions Klaus Jensen
2021-02-09  7:30 ` [PULL 50/56] hw/block/nvme: error if drive less than a zone size Klaus Jensen
2021-02-09  7:30 ` [PULL 51/56] hw/block/nvme: fix set feature for error recovery Klaus Jensen
2021-02-09  7:30 ` [PULL 52/56] hw/block/nvme: fix set feature save field check Klaus Jensen
2021-02-09  7:30 ` [PULL 53/56] hw/block/nvme: align with existing style Klaus Jensen
2021-02-09  7:30 ` [PULL 54/56] hw/block/nvme: fix wrong parameter name 'cross_read' Klaus Jensen
2021-02-09  7:31 ` [PULL 55/56] hw/block/nvme: fix zone boundary check for append Klaus Jensen
2021-02-09  7:31 ` [PULL 56/56] hw/block/nvme: refactor the logic for zone write checks Klaus Jensen
2021-02-09 14:52 ` [PULL 00/56] emulated nvme patches Peter Maydell

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20210209073101.548811-6-its@irrelevant.dk \
    --to=its@irrelevant.dk \
    --cc=fam@euphon.net \
    --cc=k.jensen@samsung.com \
    --cc=kbusch@kernel.org \
    --cc=kwolf@redhat.com \
    --cc=mreitz@redhat.com \
    --cc=peter.maydell@linaro.org \
    --cc=qemu-block@nongnu.org \
    --cc=qemu-devel@nongnu.org \
    --cc=stefanha@redhat.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.