All of lore.kernel.org
 help / color / mirror / Atom feed
From: Klaus Jensen <its@irrelevant.dk>
To: qemu-devel@nongnu.org
Cc: Fam Zheng <fam@euphon.net>, Kevin Wolf <kwolf@redhat.com>,
	qemu-block@nongnu.org, Klaus Jensen <k.jensen@samsung.com>,
	Max Reitz <mreitz@redhat.com>, Keith Busch <kbusch@kernel.org>,
	Klaus Jensen <its@irrelevant.dk>
Subject: [PATCH 06/16] hw/block/nvme: add support for dulbe and block utilization tracking
Date: Thu, 24 Sep 2020 22:45:06 +0200	[thread overview]
Message-ID: <20200924204516.1881843-7-its@irrelevant.dk> (raw)
In-Reply-To: <20200924204516.1881843-1-its@irrelevant.dk>

From: Klaus Jensen <k.jensen@samsung.com>

This adds support for reporting the Deallocated or Unwritten Logical
Block error (DULBE). This requires tracking the allocated/deallocated
status of all logical blocks.

Introduce a bitmap that does this. The bitmap is persisted on the new
'pstate' blockdev that is associated with a namespace. If no such drive
is attached, the controller will not indicate support for DULBE.

Signed-off-by: Klaus Jensen <k.jensen@samsung.com>
---
 docs/specs/nvme.txt   |   7 +++
 hw/block/nvme-ns.h    |  13 +++++
 include/block/nvme.h  |   5 ++
 hw/block/nvme-ns.c    | 110 ++++++++++++++++++++++++++++++++++++++++
 hw/block/nvme.c       | 113 ++++++++++++++++++++++++++++++++++++++++--
 hw/block/trace-events |   2 +
 6 files changed, 247 insertions(+), 3 deletions(-)

diff --git a/docs/specs/nvme.txt b/docs/specs/nvme.txt
index 438ca50d698c..9a5c67f10b5d 100644
--- a/docs/specs/nvme.txt
+++ b/docs/specs/nvme.txt
@@ -12,6 +12,13 @@ nvme-ns Options
      namespace. It is specified in terms of a power of two. Only values between
      9 and 12 (both inclusive) are supported.
 
+  `pstate`; This parameter specifies another blockdev to be used for storing
+     persistent state such as logical block allocation tracking. Adding this
+     parameter enables various optional features of the device.
+
+       -drive id=pstate,file=pstate.img,format=raw
+       -device nvme-ns,pstate=pstate,...
+
 
 Reference Specifications
 ------------------------
diff --git a/hw/block/nvme-ns.h b/hw/block/nvme-ns.h
index 78b0d1a00672..51141796909f 100644
--- a/hw/block/nvme-ns.h
+++ b/hw/block/nvme-ns.h
@@ -31,7 +31,20 @@ typedef struct NvmeNamespace {
     int64_t      size;
     NvmeIdNs     id_ns;
 
+    struct {
+        BlockBackend *blk;
+
+        struct {
+            unsigned long *map;
+            int64_t       offset;
+        } utilization;
+    } pstate;
+
     NvmeNamespaceParams params;
+
+    struct {
+        uint32_t err_rec;
+    } features;
 } NvmeNamespace;
 
 static inline uint32_t nvme_nsid(NvmeNamespace *ns)
diff --git a/include/block/nvme.h b/include/block/nvme.h
index 999b4f8ae0d4..abd49d371e63 100644
--- a/include/block/nvme.h
+++ b/include/block/nvme.h
@@ -683,6 +683,7 @@ enum NvmeStatusCodes {
     NVME_E2E_REF_ERROR          = 0x0284,
     NVME_CMP_FAILURE            = 0x0285,
     NVME_ACCESS_DENIED          = 0x0286,
+    NVME_DULB                   = 0x0287,
     NVME_MORE                   = 0x2000,
     NVME_DNR                    = 0x4000,
     NVME_NO_COMPLETE            = 0xffff,
@@ -898,6 +899,9 @@ enum NvmeIdCtrlLpa {
 #define NVME_AEC_NS_ATTR(aec)       ((aec >> 8) & 0x1)
 #define NVME_AEC_FW_ACTIVATION(aec) ((aec >> 9) & 0x1)
 
+#define NVME_ERR_REC_TLER(err_rec)  (err_rec & 0xffff)
+#define NVME_ERR_REC_DULBE(err_rec) (err_rec & 0x10000)
+
 enum NvmeFeatureIds {
     NVME_ARBITRATION                = 0x1,
     NVME_POWER_MANAGEMENT           = 0x2,
@@ -1018,6 +1022,7 @@ enum NvmeNsIdentifierType {
 
 
 #define NVME_ID_NS_NSFEAT_THIN(nsfeat)      ((nsfeat & 0x1))
+#define NVME_ID_NS_NSFEAT_DULBE(nsfeat)     ((nsfeat >> 2) & 0x1)
 #define NVME_ID_NS_FLBAS_EXTENDED(flbas)    ((flbas >> 4) & 0x1)
 #define NVME_ID_NS_FLBAS_INDEX(flbas)       ((flbas & 0xf))
 #define NVME_ID_NS_MC_SEPARATE(mc)          ((mc >> 1) & 0x1)
diff --git a/hw/block/nvme-ns.c b/hw/block/nvme-ns.c
index 576c7486f45b..9a63004c000a 100644
--- a/hw/block/nvme-ns.c
+++ b/hw/block/nvme-ns.c
@@ -25,9 +25,36 @@
 #include "hw/qdev-properties.h"
 #include "hw/qdev-core.h"
 
+#include "trace.h"
+
 #include "nvme.h"
 #include "nvme-ns.h"
 
+/*
+ * Resize @blk to @len bytes.
+ *
+ * BLK_PERM_RESIZE is added to the permissions currently held on @blk before
+ * the truncate is issued; once the truncate has succeeded the permissions
+ * that were read at entry are set again.
+ *
+ * Returns 0 on success, or the negative value returned by the first block
+ * layer call that failed. @errp is handed to each of those calls.
+ */
+static int nvme_blk_truncate(BlockBackend *blk, size_t len, Error **errp)
+{
+    uint64_t cur_perm, cur_shared;
+    int rc;
+
+    blk_get_perm(blk, &cur_perm, &cur_shared);
+
+    /* each step only runs if the previous one did not fail */
+    rc = blk_set_perm(blk, cur_perm | BLK_PERM_RESIZE, cur_shared, errp);
+    if (rc >= 0) {
+        rc = blk_truncate(blk, len, false, PREALLOC_MODE_OFF, 0, errp);
+    }
+    if (rc >= 0) {
+        rc = blk_set_perm(blk, cur_perm, cur_shared, errp);
+    }
+
+    return rc < 0 ? rc : 0;
+}
+
 static void nvme_ns_init(NvmeNamespace *ns)
 {
     NvmeIdNs *id_ns = &ns->id_ns;
@@ -45,6 +72,67 @@ static void nvme_ns_init(NvmeNamespace *ns)
     id_ns->nuse = id_ns->ncap;
 }
 
+/*
+ * Set up the persistent state ("pstate") blockdev of a namespace and load the
+ * logical block utilization bitmap from it.
+ *
+ * The image holds one bit per logical block, padded to a multiple of
+ * BDRV_SECTOR_SIZE bytes and stored in little-endian word order. A zero
+ * length image is grown to the expected size and the namespace starts from a
+ * freshly allocated bitmap; an image of any other unexpected size is
+ * rejected.
+ *
+ * The in-memory bitmap is sized to the padded on-disk length (not just to the
+ * number of logical blocks) so that reading the image cannot write past the
+ * end of the allocation.
+ *
+ * Returns 0 on success and a non-zero value on failure; errors are reported
+ * through @errp.
+ */
+static int nvme_ns_setup_blk_pstate(NvmeNamespace *ns, Error **errp)
+{
+    BlockBackend *blk = ns->pstate.blk;
+    uint64_t perm, shared_perm;
+    unsigned long *map, *raw;
+    ssize_t len;
+    size_t pstate_len;
+    long nbits;
+    int ret;
+
+    perm = BLK_PERM_CONSISTENT_READ | BLK_PERM_WRITE;
+    shared_perm = BLK_PERM_ALL;
+
+    ret = blk_set_perm(blk, perm, shared_perm, errp);
+    if (ret) {
+        return ret;
+    }
+
+    pstate_len = ROUND_UP(DIV_ROUND_UP(nvme_ns_nlbas(ns), 8),
+                          BDRV_SECTOR_SIZE);
+
+    len = blk_getlength(blk);
+    if (len < 0) {
+        error_setg_errno(errp, -len, "could not determine pstate size");
+        return len;
+    }
+
+    /* validate the size before allocating anything that would need freeing */
+    if (len && (size_t)len != pstate_len) {
+        error_setg(errp, "pstate size mismatch "
+                   "(expected %zu bytes; was %zd bytes)",
+                   pstate_len, len);
+        return -1;
+    }
+
+    /*
+     * pstate_len is a multiple of BDRV_SECTOR_SIZE and therefore of
+     * sizeof(unsigned long), so this many bits occupy exactly pstate_len
+     * bytes of bitmap storage.
+     */
+    nbits = pstate_len * 8;
+
+    ns->pstate.utilization.offset = 0;
+
+    map = bitmap_new(nbits);
+
+    if (!len) {
+        ret = nvme_blk_truncate(blk, pstate_len, errp);
+        if (ret < 0) {
+            g_free(map);
+            return ret;
+        }
+    } else {
+        raw = bitmap_new(nbits);
+
+        ret = blk_pread(blk, 0, raw, pstate_len);
+        if (ret < 0) {
+            error_setg_errno(errp, -ret, "could not read pstate");
+            g_free(raw);
+            g_free(map);
+            return ret;
+        }
+
+        /* convert from the little-endian on-disk layout to host order */
+        bitmap_from_le(map, raw, nbits);
+        g_free(raw);
+    }
+
+    ns->pstate.utilization.map = map;
+
+    return 0;
+}
+
 static int nvme_ns_init_blk(NvmeCtrl *n, NvmeNamespace *ns, Error **errp)
 {
     if (!blkconf_blocksizes(&ns->blkconf, errp)) {
@@ -96,6 +184,19 @@ int nvme_ns_setup(NvmeCtrl *n, NvmeNamespace *ns, Error **errp)
     }
 
     nvme_ns_init(ns);
+
+    if (ns->pstate.blk) {
+        if (nvme_ns_setup_blk_pstate(ns, errp)) {
+            return -1;
+        }
+
+        /*
+         * With a pstate file in place we can enable the Deallocated or
+         * Unwritten Logical Block Error feature.
+         */
+        ns->id_ns.nsfeat |= 0x4;
+    }
+
     if (nvme_register_namespace(n, ns, errp)) {
         return -1;
     }
@@ -106,11 +207,19 @@ int nvme_ns_setup(NvmeCtrl *n, NvmeNamespace *ns, Error **errp)
 void nvme_ns_drain(NvmeNamespace *ns)
 {
     blk_drain(ns->blkconf.blk);
+
+    if (ns->pstate.blk) {
+        blk_drain(ns->pstate.blk);
+    }
 }
 
 void nvme_ns_flush(NvmeNamespace *ns)
 {
     blk_flush(ns->blkconf.blk);
+
+    if (ns->pstate.blk) {
+        blk_flush(ns->pstate.blk);
+    }
 }
 
 static void nvme_ns_realize(DeviceState *dev, Error **errp)
@@ -131,6 +240,7 @@ static Property nvme_ns_props[] = {
     DEFINE_BLOCK_PROPERTIES(NvmeNamespace, blkconf),
     DEFINE_PROP_UINT32("nsid", NvmeNamespace, params.nsid, 0),
     DEFINE_PROP_UINT8("lbads", NvmeNamespace, params.lbads, BDRV_SECTOR_BITS),
+    DEFINE_PROP_DRIVE("pstate", NvmeNamespace, pstate.blk),
     DEFINE_PROP_END_OF_LIST(),
 };
 
diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index 795c7e7c529f..b16e089bda80 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -105,6 +105,7 @@ static const bool nvme_feature_support[NVME_FID_MAX] = {
 
 static const uint32_t nvme_feature_cap[NVME_FID_MAX] = {
     [NVME_TEMPERATURE_THRESHOLD]    = NVME_FEAT_CAP_CHANGE,
+    [NVME_ERROR_RECOVERY]           = NVME_FEAT_CAP_CHANGE | NVME_FEAT_CAP_NS,
     [NVME_VOLATILE_WRITE_CACHE]     = NVME_FEAT_CAP_CHANGE,
     [NVME_NUMBER_OF_QUEUES]         = NVME_FEAT_CAP_CHANGE,
     [NVME_ASYNCHRONOUS_EVENT_CONF]  = NVME_FEAT_CAP_CHANGE,
@@ -888,6 +889,61 @@ static inline uint16_t nvme_check_bounds(NvmeCtrl *n, NvmeNamespace *ns,
     return NVME_SUCCESS;
 }
 
+/*
+ * Check whether any logical block in [slba, slba + nlb) is deallocated.
+ *
+ * Returns NVME_DULB if the utilization bitmap has a clear bit inside the
+ * range and NVME_SUCCESS otherwise. A namespace without a utilization bitmap
+ * tracks nothing, so it never reports a deallocated or unwritten block.
+ */
+static inline uint16_t nvme_check_dulbe(NvmeNamespace *ns, uint64_t slba,
+                                        uint32_t nlb)
+{
+    uint64_t elba = slba + nlb;
+
+    /*
+     * The DULBE bit of the Error Recovery feature may end up set on a
+     * namespace that has no bitmap; never hand a NULL map to the bit search.
+     */
+    if (!ns->pstate.utilization.map) {
+        return NVME_SUCCESS;
+    }
+
+    if (find_next_zero_bit(ns->pstate.utilization.map, elba, slba) < elba) {
+        return NVME_DULB;
+    }
+
+    return NVME_SUCCESS;
+}
+
+/*
+ * Mark the logical blocks [slba, slba + nlb) as allocated in the utilization
+ * bitmap and write the affected bitmap words back to the pstate blockdev.
+ *
+ * Nothing is done when the namespace has no pstate blockdev or when
+ * nvme_check_dulbe() finds no deallocated block in the range.
+ *
+ * The words are written in little-endian order, so big-endian hosts byte-swap
+ * them into a temporary buffer first.
+ *
+ * Returns 0 if there was nothing to do, otherwise the result of blk_pwrite()
+ * (negative on error).
+ */
+static int nvme_allocate(NvmeNamespace *ns, uint64_t slba, uint32_t nlb)
+{
+    unsigned long *src, *buf;
+    uint64_t first, last;
+    size_t nlongs;
+    int64_t offset;
+    int ret;
+
+    if (!(ns->pstate.blk && nvme_check_dulbe(ns, slba, nlb))) {
+        return 0;
+    }
+
+    trace_pci_nvme_allocate(nvme_nsid(ns), slba, nlb);
+
+    bitmap_set(ns->pstate.utilization.map, slba, nlb);
+
+    /*
+     * Write exactly the words that hold bits of the range. Deriving the
+     * count from nlb alone (BITS_TO_LONGS(nlb) + 1) can reach one word past
+     * the end of the bitmap when the range ends in its last word.
+     */
+    first = slba / BITS_PER_LONG;
+    last = (slba + nlb - 1) / BITS_PER_LONG;
+    nlongs = last - first + 1;
+    offset = ns->pstate.utilization.offset + first * sizeof(unsigned long);
+    src = &ns->pstate.utilization.map[first];
+
+#ifdef HOST_WORDS_BIGENDIAN
+    /* the temporary buffer is indexed from 0, not from 'first' */
+    buf = g_new(unsigned long, nlongs);
+    for (size_t i = 0; i < nlongs; i++) {
+# if HOST_LONG_BITS == 64
+        buf[i] = bswap64(src[i]);
+# else
+        buf[i] = bswap32(src[i]);
+# endif
+    }
+#else
+    buf = src;
+#endif
+
+    ret = blk_pwrite(ns->pstate.blk, offset, buf,
+                     nlongs * sizeof(unsigned long), 0);
+
+#ifdef HOST_WORDS_BIGENDIAN
+    g_free(buf);
+#endif
+    return ret;
+}
+
+
 static void nvme_rw_cb(void *opaque, int ret)
 {
     NvmeRequest *req = opaque;
@@ -1006,6 +1062,7 @@ static uint16_t nvme_rwz(NvmeCtrl *n, NvmeRequest *req)
     uint32_t nlb = (uint32_t)le16_to_cpu(rw->nlb) + 1;
     size_t len = nvme_l2b(ns, nlb);
 
+    bool is_write = nvme_req_is_write(req);
     uint16_t status;
 
     trace_pci_nvme_rwz(nvme_cid(req), nvme_io_opc_str(rw->opcode),
@@ -1017,6 +1074,16 @@ static uint16_t nvme_rwz(NvmeCtrl *n, NvmeRequest *req)
         goto invalid;
     }
 
+    if (!is_write) {
+        if (NVME_ERR_REC_DULBE(ns->features.err_rec)) {
+            status = nvme_check_dulbe(ns, slba, nlb);
+            if (status) {
+                trace_pci_nvme_err_dulbe(nvme_cid(req), slba, nlb);
+                goto invalid;
+            }
+        }
+    }
+
     if (req->cmd.opcode & NVME_CMD_OPCODE_DATA_TRANSFER_MASK) {
         status = nvme_check_mdts(n, len);
         if (status) {
@@ -1030,12 +1097,18 @@ static uint16_t nvme_rwz(NvmeCtrl *n, NvmeRequest *req)
         }
     }
 
+    if (is_write) {
+        if (nvme_allocate(ns, slba, nlb) < 0) {
+            status = NVME_INTERNAL_DEV_ERROR;
+            goto invalid;
+        }
+    }
+
     return nvme_do_aio(ns->blkconf.blk, nvme_l2b(ns, slba), len, req);
 
 invalid:
     block_acct_invalid(blk_get_stats(ns->blkconf.blk),
-                       nvme_req_is_write(req) ? BLOCK_ACCT_WRITE :
-                       BLOCK_ACCT_READ);
+                       is_write ? BLOCK_ACCT_WRITE : BLOCK_ACCT_READ);
 
     return status;
 }
@@ -1638,6 +1711,8 @@ static uint16_t nvme_get_feature_timestamp(NvmeCtrl *n, NvmeRequest *req)
 
 static uint16_t nvme_get_feature(NvmeCtrl *n, NvmeRequest *req)
 {
+    NvmeNamespace *ns;
+
     NvmeCmd *cmd = &req->cmd;
     uint32_t dw10 = le32_to_cpu(cmd->cdw10);
     uint32_t dw11 = le32_to_cpu(cmd->cdw11);
@@ -1708,6 +1783,18 @@ static uint16_t nvme_get_feature(NvmeCtrl *n, NvmeRequest *req)
         }
 
         return NVME_INVALID_FIELD | NVME_DNR;
+    case NVME_ERROR_RECOVERY:
+        if (!nvme_nsid_valid(n, nsid)) {
+            return NVME_INVALID_NSID | NVME_DNR;
+        }
+
+        ns = nvme_ns(n, nsid);
+        if (unlikely(!ns)) {
+            return NVME_INVALID_FIELD | NVME_DNR;
+        }
+
+        result = ns->features.err_rec;
+        goto out;
     case NVME_VOLATILE_WRITE_CACHE:
         result = n->features.vwc;
         trace_pci_nvme_getfeat_vwcache(result ? "enabled" : "disabled");
@@ -1780,7 +1867,7 @@ static uint16_t nvme_set_feature_timestamp(NvmeCtrl *n, NvmeRequest *req)
 
 static uint16_t nvme_set_feature(NvmeCtrl *n, NvmeRequest *req)
 {
-    NvmeNamespace *ns;
+    NvmeNamespace *ns = NULL;
 
     NvmeCmd *cmd = &req->cmd;
     uint32_t dw10 = le32_to_cpu(cmd->cdw10);
@@ -1847,6 +1934,26 @@ static uint16_t nvme_set_feature(NvmeCtrl *n, NvmeRequest *req)
                                NVME_LOG_SMART_INFO);
         }
 
+        break;
+    case NVME_ERROR_RECOVERY:
+        if (nsid == NVME_NSID_BROADCAST) {
+            for (int i = 1; i <= n->num_namespaces; i++) {
+                ns = nvme_ns(n, i);
+
+                if (!ns) {
+                    continue;
+                }
+
+                if (NVME_ID_NS_NSFEAT_DULBE(ns->id_ns.nsfeat)) {
+                    ns->features.err_rec = dw11;
+                }
+            }
+
+            break;
+        }
+
+        assert(ns);
+        ns->features.err_rec = dw11;
         break;
     case NVME_VOLATILE_WRITE_CACHE:
         n->features.vwc = dw11 & 0x1;
diff --git a/hw/block/trace-events b/hw/block/trace-events
index b18056c49836..774513469274 100644
--- a/hw/block/trace-events
+++ b/hw/block/trace-events
@@ -42,6 +42,7 @@ pci_nvme_io_cmd(uint16_t cid, uint32_t nsid, uint16_t sqid, uint8_t opcode, cons
 pci_nvme_admin_cmd(uint16_t cid, uint16_t sqid, uint8_t opcode, const char *opname) "cid %"PRIu16" sqid %"PRIu16" opc 0x%"PRIx8" opname '%s'"
 pci_nvme_rwz(uint16_t cid, const char *verb, uint32_t nsid, uint32_t nlb, uint64_t len, uint64_t lba) "cid %"PRIu16" opname '%s' nsid %"PRIu32" nlb %"PRIu32" len %"PRIu64" lba 0x%"PRIx64""
 pci_nvme_rw_cb(uint16_t cid, const char *blkname) "cid %"PRIu16" blk '%s'"
+pci_nvme_allocate(uint32_t ns, uint64_t slba, uint32_t nlb) "nsid %"PRIu32" slba 0x%"PRIx64" nlb %"PRIu32""
 pci_nvme_do_aio(uint16_t cid, uint8_t opc, const char *opname, const char *blkname, int64_t offset, size_t len) "cid %"PRIu16" opc 0x%"PRIx8" opname '%s' blk '%s' offset %"PRId64" len %zu"
 pci_nvme_create_sq(uint64_t addr, uint16_t sqid, uint16_t cqid, uint16_t qsize, uint16_t qflags) "create submission queue, addr=0x%"PRIx64", sqid=%"PRIu16", cqid=%"PRIu16", qsize=%"PRIu16", qflags=%"PRIu16""
 pci_nvme_create_cq(uint64_t addr, uint16_t cqid, uint16_t vector, uint16_t size, uint16_t qflags, int ien) "create completion queue, addr=0x%"PRIx64", cqid=%"PRIu16", vector=%"PRIu16", qsize=%"PRIu16", qflags=%"PRIu16", ien=%d"
@@ -89,6 +90,7 @@ pci_nvme_mmio_shutdown_cleared(void) "shutdown bit cleared"
 # nvme traces for error conditions
 pci_nvme_err_mdts(uint16_t cid, size_t len) "cid %"PRIu16" len %zu"
 pci_nvme_err_req_status(uint16_t cid, uint32_t nsid, uint16_t status, uint8_t opc) "cid %"PRIu16" nsid %"PRIu32" status 0x%"PRIx16" opc 0x%"PRIx8""
+pci_nvme_err_dulbe(uint16_t cid, uint64_t slba, uint32_t nlb) "cid %"PRIu16" slba 0x%"PRIx64" nlb %"PRIu32""
 pci_nvme_err_addr_read(uint64_t addr) "addr 0x%"PRIx64""
 pci_nvme_err_addr_write(uint64_t addr) "addr 0x%"PRIx64""
 pci_nvme_err_cfs(void) "controller fatal status"
-- 
2.28.0



  parent reply	other threads:[~2020-09-24 21:13 UTC|newest]

Thread overview: 24+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2020-09-24 20:45 [PATCH 00/16] hw/block/nvme: zoned namespace command set Klaus Jensen
2020-09-24 20:45 ` [PATCH 01/16] hw/block/nvme: add nsid to get/setfeat trace events Klaus Jensen
2020-09-24 20:45 ` [PATCH 02/16] hw/block/nvme: add trace event for requests with non-zero status code Klaus Jensen
2020-09-24 20:45 ` [PATCH 03/16] hw/block/nvme: make lba data size configurable Klaus Jensen
2020-09-24 21:13   ` Klaus Jensen
2020-09-24 20:45 ` [PATCH 04/16] hw/block/nvme: reject io commands if only admin command set selected Klaus Jensen
2020-09-24 20:45 ` [PATCH 05/16] hw/block/nvme: consolidate read, write and write zeroes Klaus Jensen
2020-09-24 20:45 ` Klaus Jensen [this message]
2020-09-24 20:45 ` [PATCH 07/16] hw/block/nvme: add commands supported and effects log page Klaus Jensen
2020-09-24 20:45 ` [PATCH 08/16] hw/block/nvme: support namespace types Klaus Jensen
2020-09-24 20:45 ` [PATCH 09/16] hw/block/nvme: add basic read/write for zoned namespaces Klaus Jensen
2020-09-24 20:45 ` [PATCH 10/16] hw/block/nvme: add the zone management receive command Klaus Jensen
2020-09-24 20:45 ` [PATCH 11/16] hw/block/nvme: add the zone management send command Klaus Jensen
2020-09-24 20:45 ` [PATCH 12/16] hw/block/nvme: add the zone append command Klaus Jensen
2020-09-24 20:45 ` [PATCH 13/16] hw/block/nvme: track and enforce zone resources Klaus Jensen
2020-09-24 20:45 ` [PATCH 14/16] hw/block/nvme: allow open to close transitions by controller Klaus Jensen
2020-09-24 20:45 ` [PATCH 15/16] hw/block/nvme: support zone active excursions Klaus Jensen
2020-09-24 20:45 ` [PATCH 16/16] hw/block/nvme: support reset/finish recommended limits Klaus Jensen
2020-09-24 22:43 ` [PATCH 00/16] hw/block/nvme: zoned namespace command set no-reply
2020-09-25  7:55   ` Klaus Jensen
2020-09-25  1:24 ` Keith Busch
2020-09-25  5:27   ` Klaus Jensen
2020-09-25 17:06 ` Dmitry Fomichev
2020-09-25 17:27   ` Klaus Jensen

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20200924204516.1881843-7-its@irrelevant.dk \
    --to=its@irrelevant.dk \
    --cc=fam@euphon.net \
    --cc=k.jensen@samsung.com \
    --cc=kbusch@kernel.org \
    --cc=kwolf@redhat.com \
    --cc=mreitz@redhat.com \
    --cc=qemu-block@nongnu.org \
    --cc=qemu-devel@nongnu.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.