All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 0/2] hw/block/nvme: oncs and write uncorrectable support
@ 2021-02-10  7:06 Klaus Jensen
  2021-02-10  7:06 ` [PATCH 1/2] hw/block/nvme: add oncs device parameter Klaus Jensen
  2021-02-10  7:06 ` [PATCH 2/2] hw/block/nvme: add write uncorrectable command Klaus Jensen
  0 siblings, 2 replies; 11+ messages in thread
From: Klaus Jensen @ 2021-02-10  7:06 UTC (permalink / raw)
  To: qemu-devel
  Cc: Kevin Wolf, qemu-block, Klaus Jensen, Max Reitz, Klaus Jensen,
	Keith Busch

From: Klaus Jensen <k.jensen@samsung.com>

First, add support for toggling optional features through the new `oncs`
nvme device parameter.

Secondly, add support for the Write Uncorrectable command.

Gollu Appalanaidu (2):
  hw/block/nvme: add oncs device parameter
  hw/block/nvme: add write uncorrectable command

 docs/specs/nvme.txt   |   3 +
 hw/block/nvme-ns.h    |   2 +
 hw/block/nvme.h       |   8 ++
 hw/block/nvme-ns.c    |   2 +
 hw/block/nvme.c       | 166 +++++++++++++++++++++++++++++++-----------
 hw/block/trace-events |   1 +
 6 files changed, 140 insertions(+), 42 deletions(-)

-- 
2.30.0



^ permalink raw reply	[flat|nested] 11+ messages in thread

* [PATCH 1/2] hw/block/nvme: add oncs device parameter
  2021-02-10  7:06 [PATCH 0/2] hw/block/nvme: oncs and write uncorrectable support Klaus Jensen
@ 2021-02-10  7:06 ` Klaus Jensen
  2021-02-10 11:06   ` Minwoo Im
  2021-02-10  7:06 ` [PATCH 2/2] hw/block/nvme: add write uncorrectable command Klaus Jensen
  1 sibling, 1 reply; 11+ messages in thread
From: Klaus Jensen @ 2021-02-10  7:06 UTC (permalink / raw)
  To: qemu-devel
  Cc: Kevin Wolf, qemu-block, Klaus Jensen, Gollu Appalanaidu,
	Max Reitz, Klaus Jensen, Keith Busch

From: Gollu Appalanaidu <anaidu.gollu@samsung.com>

Add the 'oncs' nvme device parameter to allow optional features to be
enabled/disabled explicitly. Since most of these are optional commands,
make the CSE log pages dynamic to account for the value of ONCS.

Signed-off-by: Gollu Appalanaidu <anaidu.gollu@samsung.com>
Signed-off-by: Klaus Jensen <k.jensen@samsung.com>
---
 hw/block/nvme.h |   7 ++++
 hw/block/nvme.c | 101 ++++++++++++++++++++++++++++++++----------------
 2 files changed, 74 insertions(+), 34 deletions(-)

diff --git a/hw/block/nvme.h b/hw/block/nvme.h
index cb2b5175f1a1..98082b2dfba3 100644
--- a/hw/block/nvme.h
+++ b/hw/block/nvme.h
@@ -9,6 +9,7 @@
 
 #define NVME_DEFAULT_ZONE_SIZE   (128 * MiB)
 #define NVME_DEFAULT_MAX_ZA_SIZE (128 * KiB)
+#define NVME_MAX_COMMANDS 0x100
 
 typedef struct NvmeParams {
     char     *serial;
@@ -22,6 +23,7 @@ typedef struct NvmeParams {
     bool     use_intel_id;
     uint32_t zasl_bs;
     bool     legacy_cmb;
+    uint16_t oncs;
 } NvmeParams;
 
 typedef struct NvmeAsyncEvent {
@@ -183,6 +185,11 @@ typedef struct NvmeCtrl {
     NvmeCQueue      admin_cq;
     NvmeIdCtrl      id_ctrl;
     NvmeFeatureVal  features;
+
+    struct {
+        uint32_t nvm[NVME_MAX_COMMANDS];
+        uint32_t zoned[NVME_MAX_COMMANDS];
+    } iocs;
 } NvmeCtrl;
 
 static inline NvmeNamespace *nvme_ns(NvmeCtrl *n, uint32_t nsid)
diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index 93345bf3c1fc..e5f6666725d7 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -71,6 +71,11 @@
  *   data size being in effect. By setting this property to 0, users can make
  *   ZASL to be equal to MDTS. This property only affects zoned namespaces.
  *
+ * - `oncs`
+ *   This field indicates the optional NVM commands and features supported
+ *   by the controller. To add support for the optional feature, needs to
+ *   set the corresponding support indicated bit.
+ *
  * nvme namespace device parameters
  * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  * - `subsys`
@@ -165,7 +170,7 @@ static const uint32_t nvme_feature_cap[NVME_FID_MAX] = {
     [NVME_TIMESTAMP]                = NVME_FEAT_CAP_CHANGE,
 };
 
-static const uint32_t nvme_cse_acs[256] = {
+static const uint32_t nvme_cse_acs[NVME_MAX_COMMANDS] = {
     [NVME_ADM_CMD_DELETE_SQ]        = NVME_CMD_EFF_CSUPP,
     [NVME_ADM_CMD_CREATE_SQ]        = NVME_CMD_EFF_CSUPP,
     [NVME_ADM_CMD_GET_LOG_PAGE]     = NVME_CMD_EFF_CSUPP,
@@ -178,30 +183,7 @@ static const uint32_t nvme_cse_acs[256] = {
     [NVME_ADM_CMD_ASYNC_EV_REQ]     = NVME_CMD_EFF_CSUPP,
 };
 
-static const uint32_t nvme_cse_iocs_none[256];
-
-static const uint32_t nvme_cse_iocs_nvm[256] = {
-    [NVME_CMD_FLUSH]                = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
-    [NVME_CMD_WRITE_ZEROES]         = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
-    [NVME_CMD_WRITE]                = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
-    [NVME_CMD_READ]                 = NVME_CMD_EFF_CSUPP,
-    [NVME_CMD_DSM]                  = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
-    [NVME_CMD_COPY]                 = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
-    [NVME_CMD_COMPARE]              = NVME_CMD_EFF_CSUPP,
-};
-
-static const uint32_t nvme_cse_iocs_zoned[256] = {
-    [NVME_CMD_FLUSH]                = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
-    [NVME_CMD_WRITE_ZEROES]         = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
-    [NVME_CMD_WRITE]                = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
-    [NVME_CMD_READ]                 = NVME_CMD_EFF_CSUPP,
-    [NVME_CMD_DSM]                  = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
-    [NVME_CMD_COPY]                 = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
-    [NVME_CMD_COMPARE]              = NVME_CMD_EFF_CSUPP,
-    [NVME_CMD_ZONE_APPEND]          = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
-    [NVME_CMD_ZONE_MGMT_SEND]       = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
-    [NVME_CMD_ZONE_MGMT_RECV]       = NVME_CMD_EFF_CSUPP,
-};
+static const uint32_t nvme_cse_iocs_none[NVME_MAX_COMMANDS];
 
 static void nvme_process_sq(void *opaque);
 
@@ -2884,17 +2866,17 @@ static uint16_t nvme_cmd_effects(NvmeCtrl *n, uint8_t csi, uint32_t buf_len,
 
     switch (NVME_CC_CSS(n->bar.cc)) {
     case NVME_CC_CSS_NVM:
-        src_iocs = nvme_cse_iocs_nvm;
+        src_iocs = n->iocs.nvm;
         /* fall through */
     case NVME_CC_CSS_ADMIN_ONLY:
         break;
     case NVME_CC_CSS_CSI:
         switch (csi) {
         case NVME_CSI_NVM:
-            src_iocs = nvme_cse_iocs_nvm;
+            src_iocs = n->iocs.nvm;
             break;
         case NVME_CSI_ZONED:
-            src_iocs = nvme_cse_iocs_zoned;
+            src_iocs = n->iocs.zoned;
             break;
         }
     }
@@ -3422,6 +3404,10 @@ static uint16_t nvme_get_feature(NvmeCtrl *n, NvmeRequest *req)
         return NVME_INVALID_FIELD | NVME_DNR;
     }
 
+    if (!(le16_to_cpu(n->id_ctrl.oncs) & NVME_ONCS_FEATURES) && sel) {
+        return NVME_INVALID_FIELD | NVME_DNR;
+    }
+
     if (nvme_feature_cap[fid] & NVME_FEAT_CAP_NS) {
         if (!nvme_nsid_valid(n, nsid) || nsid == NVME_NSID_BROADCAST) {
             /*
@@ -3503,6 +3489,9 @@ static uint16_t nvme_get_feature(NvmeCtrl *n, NvmeRequest *req)
         result = n->features.async_config;
         goto out;
     case NVME_TIMESTAMP:
+        if (!(le16_to_cpu(n->id_ctrl.oncs) & NVME_ONCS_TIMESTAMP)) {
+            return NVME_INVALID_FIELD | NVME_DNR;
+        }
         return nvme_get_feature_timestamp(n, req);
     default:
         break;
@@ -3585,6 +3574,10 @@ static uint16_t nvme_set_feature(NvmeCtrl *n, NvmeRequest *req)
         return NVME_FID_NOT_SAVEABLE | NVME_DNR;
     }
 
+    if (!(le16_to_cpu(n->id_ctrl.oncs) & NVME_ONCS_FEATURES) && save) {
+        return NVME_INVALID_FIELD | NVME_DNR;
+    }
+
     if (!nvme_feature_support[fid]) {
         return NVME_INVALID_FIELD | NVME_DNR;
     }
@@ -3697,6 +3690,9 @@ static uint16_t nvme_set_feature(NvmeCtrl *n, NvmeRequest *req)
         n->features.async_config = dw11;
         break;
     case NVME_TIMESTAMP:
+        if (!(le16_to_cpu(n->id_ctrl.oncs) & NVME_ONCS_TIMESTAMP)) {
+            return NVME_INVALID_FIELD | NVME_DNR;
+        }
         return nvme_set_feature_timestamp(n, req);
     case NVME_COMMAND_SET_PROFILE:
         if (dw11 & 0x1ff) {
@@ -3875,14 +3871,14 @@ static void nvme_select_ns_iocs(NvmeCtrl *n)
         switch (ns->csi) {
         case NVME_CSI_NVM:
             if (NVME_CC_CSS(n->bar.cc) != NVME_CC_CSS_ADMIN_ONLY) {
-                ns->iocs = nvme_cse_iocs_nvm;
+                ns->iocs = n->iocs.nvm;
             }
             break;
         case NVME_CSI_ZONED:
             if (NVME_CC_CSS(n->bar.cc) == NVME_CC_CSS_CSI) {
-                ns->iocs = nvme_cse_iocs_zoned;
+                ns->iocs = n->iocs.zoned;
             } else if (NVME_CC_CSS(n->bar.cc) == NVME_CC_CSS_NVM) {
-                ns->iocs = nvme_cse_iocs_nvm;
+                ns->iocs = n->iocs.nvm;
             }
             break;
         }
@@ -4510,6 +4506,40 @@ static void nvme_check_constraints(NvmeCtrl *n, Error **errp)
     }
 }
 
+static void nvme_init_cse_iocs(NvmeCtrl *n)
+{
+    uint16_t oncs = n->params.oncs;
+
+    n->iocs.nvm[NVME_CMD_FLUSH] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC;
+    n->iocs.nvm[NVME_CMD_WRITE] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC;
+    n->iocs.nvm[NVME_CMD_READ]  = NVME_CMD_EFF_CSUPP;
+
+    if (oncs & NVME_ONCS_WRITE_ZEROES) {
+        n->iocs.nvm[NVME_CMD_WRITE_ZEROES] = NVME_CMD_EFF_CSUPP |
+            NVME_CMD_EFF_LBCC;
+    }
+
+    if (oncs & NVME_ONCS_COMPARE) {
+        n->iocs.nvm[NVME_CMD_COMPARE] = NVME_CMD_EFF_CSUPP;
+    }
+
+    if (oncs & NVME_ONCS_DSM) {
+        n->iocs.nvm[NVME_CMD_DSM] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC;
+    }
+
+    if (oncs & NVME_ONCS_COPY) {
+        n->iocs.nvm[NVME_CMD_COPY] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC;
+    }
+
+    memcpy(n->iocs.zoned,  n->iocs.nvm, sizeof(n->iocs.nvm));
+
+    n->iocs.zoned[NVME_CMD_ZONE_APPEND] = NVME_CMD_EFF_CSUPP |
+        NVME_CMD_EFF_LBCC;
+    n->iocs.zoned[NVME_CMD_ZONE_MGMT_SEND] = NVME_CMD_EFF_CSUPP |
+        NVME_CMD_EFF_LBCC;
+    n->iocs.zoned[NVME_CMD_ZONE_MGMT_RECV] = NVME_CMD_EFF_CSUPP;
+}
+
 static void nvme_init_state(NvmeCtrl *n)
 {
     n->num_namespaces = NVME_MAX_NAMESPACES;
@@ -4522,6 +4552,8 @@ static void nvme_init_state(NvmeCtrl *n)
     n->features.temp_thresh_hi = NVME_TEMPERATURE_WARNING;
     n->starttime_ms = qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL);
     n->aer_reqs = g_new0(NvmeRequest *, n->params.aerl + 1);
+
+    nvme_init_cse_iocs(n);
 }
 
 int nvme_register_namespace(NvmeCtrl *n, NvmeNamespace *ns, Error **errp)
@@ -4720,9 +4752,7 @@ static void nvme_init_ctrl(NvmeCtrl *n, PCIDevice *pci_dev)
     id->sqes = (0x6 << 4) | 0x6;
     id->cqes = (0x4 << 4) | 0x4;
     id->nn = cpu_to_le32(n->num_namespaces);
-    id->oncs = cpu_to_le16(NVME_ONCS_WRITE_ZEROES | NVME_ONCS_TIMESTAMP |
-                           NVME_ONCS_FEATURES | NVME_ONCS_DSM |
-                           NVME_ONCS_COMPARE | NVME_ONCS_COPY);
+    id->oncs = cpu_to_le16(n->params.oncs);
 
     id->vwc = (0x2 << 1) | 0x1;
     id->ocfs = cpu_to_le16(NVME_OCFS_COPY_FORMAT_0);
@@ -4857,6 +4887,9 @@ static Property nvme_props[] = {
     DEFINE_PROP_BOOL("legacy-cmb", NvmeCtrl, params.legacy_cmb, false),
     DEFINE_PROP_SIZE32("zoned.append_size_limit", NvmeCtrl, params.zasl_bs,
                        NVME_DEFAULT_MAX_ZA_SIZE),
+    DEFINE_PROP_UINT16("oncs", NvmeCtrl, params.oncs, NVME_ONCS_WRITE_ZEROES |
+                       NVME_ONCS_TIMESTAMP | NVME_ONCS_DSM |
+                       NVME_ONCS_COMPARE | NVME_ONCS_FEATURES | NVME_ONCS_COPY),
     DEFINE_PROP_END_OF_LIST(),
 };
 
-- 
2.30.0



^ permalink raw reply related	[flat|nested] 11+ messages in thread

* [PATCH 2/2] hw/block/nvme: add write uncorrectable command
  2021-02-10  7:06 [PATCH 0/2] hw/block/nvme: oncs and write uncorrectable support Klaus Jensen
  2021-02-10  7:06 ` [PATCH 1/2] hw/block/nvme: add oncs device parameter Klaus Jensen
@ 2021-02-10  7:06 ` Klaus Jensen
  2021-02-10 11:14   ` Minwoo Im
  2021-02-11  3:37   ` Keith Busch
  1 sibling, 2 replies; 11+ messages in thread
From: Klaus Jensen @ 2021-02-10  7:06 UTC (permalink / raw)
  To: qemu-devel
  Cc: Kevin Wolf, qemu-block, Klaus Jensen, Gollu Appalanaidu,
	Max Reitz, Klaus Jensen, Keith Busch

From: Gollu Appalanaidu <anaidu.gollu@samsung.com>

Add support for marking blocks invalid with the Write Uncorrectable
command. Block status is tracked in a (non-persistent) bitmap that is
checked on all reads and written to on all writes. This is potentially
expensive, so keep Write Uncorrectable disabled by default.

Signed-off-by: Gollu Appalanaidu <anaidu.gollu@samsung.com>
Signed-off-by: Klaus Jensen <k.jensen@samsung.com>
---
 docs/specs/nvme.txt   |  3 ++
 hw/block/nvme-ns.h    |  2 ++
 hw/block/nvme.h       |  1 +
 hw/block/nvme-ns.c    |  2 ++
 hw/block/nvme.c       | 65 +++++++++++++++++++++++++++++++++++++------
 hw/block/trace-events |  1 +
 6 files changed, 66 insertions(+), 8 deletions(-)

diff --git a/docs/specs/nvme.txt b/docs/specs/nvme.txt
index 56d393884e7a..88f9cc278d4c 100644
--- a/docs/specs/nvme.txt
+++ b/docs/specs/nvme.txt
@@ -19,5 +19,8 @@ Known issues
 
 * The accounting numbers in the SMART/Health are reset across power cycles
 
+* Marking blocks invalid with the Write Uncorrectable is not persisted across
+  power cycles.
+
 * Interrupt Coalescing is not supported and is disabled by default in volation
   of the specification.
diff --git a/hw/block/nvme-ns.h b/hw/block/nvme-ns.h
index 7af6884862b5..15fa422ded03 100644
--- a/hw/block/nvme-ns.h
+++ b/hw/block/nvme-ns.h
@@ -72,6 +72,8 @@ typedef struct NvmeNamespace {
     struct {
         uint32_t err_rec;
     } features;
+
+    unsigned long *uncorrectable;
 } NvmeNamespace;
 
 static inline uint32_t nvme_nsid(NvmeNamespace *ns)
diff --git a/hw/block/nvme.h b/hw/block/nvme.h
index 98082b2dfba3..9b8f85b9cf16 100644
--- a/hw/block/nvme.h
+++ b/hw/block/nvme.h
@@ -68,6 +68,7 @@ static inline const char *nvme_io_opc_str(uint8_t opc)
     case NVME_CMD_FLUSH:            return "NVME_NVM_CMD_FLUSH";
     case NVME_CMD_WRITE:            return "NVME_NVM_CMD_WRITE";
     case NVME_CMD_READ:             return "NVME_NVM_CMD_READ";
+    case NVME_CMD_WRITE_UNCOR:      return "NVME_CMD_WRITE_UNCOR";
     case NVME_CMD_COMPARE:          return "NVME_NVM_CMD_COMPARE";
     case NVME_CMD_WRITE_ZEROES:     return "NVME_NVM_CMD_WRITE_ZEROES";
     case NVME_CMD_DSM:              return "NVME_NVM_CMD_DSM";
diff --git a/hw/block/nvme-ns.c b/hw/block/nvme-ns.c
index ade46e2f3739..742bbc4b4b62 100644
--- a/hw/block/nvme-ns.c
+++ b/hw/block/nvme-ns.c
@@ -72,6 +72,8 @@ static int nvme_ns_init(NvmeNamespace *ns, Error **errp)
     id_ns->mcl = cpu_to_le32(ns->params.mcl);
     id_ns->msrc = ns->params.msrc;
 
+    ns->uncorrectable = bitmap_new(id_ns->nsze);
+
     return 0;
 }
 
diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index e5f6666725d7..56048046c193 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -1112,6 +1112,20 @@ static uint16_t nvme_check_dulbe(NvmeNamespace *ns, uint64_t slba,
     return NVME_SUCCESS;
 }
 
+static inline uint16_t nvme_check_uncor(NvmeNamespace *ns, uint64_t slba,
+                                        uint32_t nlb)
+{
+    uint64_t elba = nlb + slba;
+
+    if (ns->uncorrectable) {
+        if (find_next_bit(ns->uncorrectable, elba, slba) < elba) {
+            return NVME_UNRECOVERED_READ | NVME_DNR;
+        }
+    }
+
+    return NVME_SUCCESS;
+}
+
 static void nvme_aio_err(NvmeRequest *req, int ret)
 {
     uint16_t status = NVME_SUCCESS;
@@ -1423,14 +1437,24 @@ static void nvme_rw_cb(void *opaque, int ret)
     BlockAcctCookie *acct = &req->acct;
     BlockAcctStats *stats = blk_get_stats(blk);
 
+    bool is_write = nvme_is_write(req);
+
     trace_pci_nvme_rw_cb(nvme_cid(req), blk_name(blk));
 
-    if (ns->params.zoned && nvme_is_write(req)) {
+    if (ns->params.zoned && is_write) {
         nvme_finalize_zoned_write(ns, req);
     }
 
     if (!ret) {
         block_acct_done(stats, acct);
+
+        if (is_write) {
+            NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
+            uint64_t slba = le64_to_cpu(rw->slba);
+            uint32_t nlb = le16_to_cpu(rw->nlb) + 1;
+
+            bitmap_clear(ns->uncorrectable, slba, nlb);
+        }
     } else {
         block_acct_failed(stats, acct);
         nvme_aio_err(req, ret);
@@ -1521,13 +1545,13 @@ static void nvme_copy_cb(void *opaque, int ret)
 {
     NvmeRequest *req = opaque;
     NvmeNamespace *ns = req->ns;
+    NvmeCopyCmd *copy = (NvmeCopyCmd *)&req->cmd;
+    uint64_t sdlba = le64_to_cpu(copy->sdlba);
     struct nvme_copy_ctx *ctx = req->opaque;
 
     trace_pci_nvme_copy_cb(nvme_cid(req));
 
     if (ns->params.zoned) {
-        NvmeCopyCmd *copy = (NvmeCopyCmd *)&req->cmd;
-        uint64_t sdlba = le64_to_cpu(copy->sdlba);
         NvmeZone *zone = nvme_get_zone_by_slba(ns, sdlba);
 
         __nvme_advance_zone_wp(ns, zone, ctx->nlb);
@@ -1535,6 +1559,7 @@ static void nvme_copy_cb(void *opaque, int ret)
 
     if (!ret) {
         block_acct_done(blk_get_stats(ns->blkconf.blk), &req->acct);
+        bitmap_clear(ns->uncorrectable, sdlba, ctx->nlb);
     } else {
         block_acct_failed(blk_get_stats(ns->blkconf.blk), &req->acct);
         nvme_aio_err(req, ret);
@@ -1953,6 +1978,12 @@ static uint16_t nvme_read(NvmeCtrl *n, NvmeRequest *req)
         goto invalid;
     }
 
+    status = nvme_check_uncor(ns, slba, nlb);
+    if (status) {
+        trace_pci_nvme_err_unrecoverable_read(slba, nlb);
+        return status;
+    }
+
     if (ns->params.zoned) {
         status = nvme_check_zone_read(ns, slba, nlb);
         if (status) {
@@ -1992,7 +2023,7 @@ invalid:
 }
 
 static uint16_t nvme_do_write(NvmeCtrl *n, NvmeRequest *req, bool append,
-                              bool wrz)
+                              bool wrz, bool uncor)
 {
     NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
     NvmeNamespace *ns = req->ns;
@@ -2008,7 +2039,7 @@ static uint16_t nvme_do_write(NvmeCtrl *n, NvmeRequest *req, bool append,
     trace_pci_nvme_write(nvme_cid(req), nvme_io_opc_str(rw->opcode),
                          nvme_nsid(ns), nlb, data_size, slba);
 
-    if (!wrz) {
+    if (!wrz && !uncor) {
         status = nvme_check_mdts(n, data_size);
         if (status) {
             trace_pci_nvme_err_mdts(nvme_cid(req), data_size);
@@ -2055,6 +2086,11 @@ static uint16_t nvme_do_write(NvmeCtrl *n, NvmeRequest *req, bool append,
         zone->w_ptr += nlb;
     }
 
+    if (uncor) {
+        bitmap_set(ns->uncorrectable, slba, nlb);
+        return NVME_SUCCESS;
+    }
+
     data_offset = nvme_l2b(ns, slba);
 
     if (!wrz) {
@@ -2087,17 +2123,22 @@ invalid:
 
 static inline uint16_t nvme_write(NvmeCtrl *n, NvmeRequest *req)
 {
-    return nvme_do_write(n, req, false, false);
+    return nvme_do_write(n, req, false, false, false);
 }
 
 static inline uint16_t nvme_write_zeroes(NvmeCtrl *n, NvmeRequest *req)
 {
-    return nvme_do_write(n, req, false, true);
+    return nvme_do_write(n, req, false, true, false);
 }
 
 static inline uint16_t nvme_zone_append(NvmeCtrl *n, NvmeRequest *req)
 {
-    return nvme_do_write(n, req, true, false);
+    return nvme_do_write(n, req, true, false, false);
+}
+
+static inline uint16_t nvme_write_uncor(NvmeCtrl *n, NvmeRequest *req)
+{
+    return nvme_do_write(n, req, false, false, true);
 }
 
 static uint16_t nvme_get_mgmt_zone_slba_idx(NvmeNamespace *ns, NvmeCmd *c,
@@ -2596,6 +2637,8 @@ static uint16_t nvme_io_cmd(NvmeCtrl *n, NvmeRequest *req)
         return nvme_flush(n, req);
     case NVME_CMD_WRITE_ZEROES:
         return nvme_write_zeroes(n, req);
+    case NVME_CMD_WRITE_UNCOR:
+        return nvme_write_uncor(n, req);
     case NVME_CMD_ZONE_APPEND:
         return nvme_zone_append(n, req);
     case NVME_CMD_WRITE:
@@ -4514,6 +4557,11 @@ static void nvme_init_cse_iocs(NvmeCtrl *n)
     n->iocs.nvm[NVME_CMD_WRITE] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC;
     n->iocs.nvm[NVME_CMD_READ]  = NVME_CMD_EFF_CSUPP;
 
+    if (oncs & NVME_ONCS_WRITE_UNCORR) {
+        n->iocs.nvm[NVME_CMD_WRITE_UNCOR] = NVME_CMD_EFF_CSUPP |
+            NVME_CMD_EFF_LBCC;
+    }
+
     if (oncs & NVME_ONCS_WRITE_ZEROES) {
         n->iocs.nvm[NVME_CMD_WRITE_ZEROES] = NVME_CMD_EFF_CSUPP |
             NVME_CMD_EFF_LBCC;
@@ -4853,6 +4901,7 @@ static void nvme_exit(PCIDevice *pci_dev)
         }
 
         nvme_ns_cleanup(ns);
+        g_free(ns->uncorrectable);
     }
 
     g_free(n->cq);
diff --git a/hw/block/trace-events b/hw/block/trace-events
index 4b5ee04024f4..f30ef220c26a 100644
--- a/hw/block/trace-events
+++ b/hw/block/trace-events
@@ -128,6 +128,7 @@ pci_nvme_err_invalid_prp2_align(uint64_t prp2) "PRP2 is not page aligned: 0x%"PR
 pci_nvme_err_invalid_opc(uint8_t opc) "invalid opcode 0x%"PRIx8""
 pci_nvme_err_invalid_admin_opc(uint8_t opc) "invalid admin opcode 0x%"PRIx8""
 pci_nvme_err_invalid_lba_range(uint64_t start, uint64_t len, uint64_t limit) "Invalid LBA start=%"PRIu64" len=%"PRIu64" limit=%"PRIu64""
+pci_nvme_err_unrecoverable_read(uint64_t start, uint32_t nlb) "islba 0x%"PRIx64" nlb %"PRIu32""
 pci_nvme_err_invalid_log_page_offset(uint64_t ofs, uint64_t size) "must be <= %"PRIu64", got %"PRIu64""
 pci_nvme_err_cmb_invalid_cba(uint64_t cmbmsc) "cmbmsc 0x%"PRIx64""
 pci_nvme_err_cmb_not_enabled(uint64_t cmbmsc) "cmbmsc 0x%"PRIx64""
-- 
2.30.0



^ permalink raw reply related	[flat|nested] 11+ messages in thread

* Re: [PATCH 1/2] hw/block/nvme: add oncs device parameter
  2021-02-10  7:06 ` [PATCH 1/2] hw/block/nvme: add oncs device parameter Klaus Jensen
@ 2021-02-10 11:06   ` Minwoo Im
  0 siblings, 0 replies; 11+ messages in thread
From: Minwoo Im @ 2021-02-10 11:06 UTC (permalink / raw)
  To: Klaus Jensen
  Cc: Kevin Wolf, qemu-block, Klaus Jensen, Gollu Appalanaidu,
	qemu-devel, Max Reitz, Keith Busch

On 21-02-10 08:06:45, Klaus Jensen wrote:
> From: Gollu Appalanaidu <anaidu.gollu@samsung.com>
> 
> Add the 'oncs' nvme device parameter to allow optional features to be
> enabled/disabled explicitly. Since most of these are optional commands,
> make the CSE log pages dynamic to account for the value of ONCS.
> 
> Signed-off-by: Gollu Appalanaidu <anaidu.gollu@samsung.com>
> Signed-off-by: Klaus Jensen <k.jensen@samsung.com>

Reviewed-by: Minwoo Im <minwoo.im.dev@gmail.com>


^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH 2/2] hw/block/nvme: add write uncorrectable command
  2021-02-10  7:06 ` [PATCH 2/2] hw/block/nvme: add write uncorrectable command Klaus Jensen
@ 2021-02-10 11:14   ` Minwoo Im
  2021-02-10 11:42     ` Klaus Jensen
  2021-02-11  3:37   ` Keith Busch
  1 sibling, 1 reply; 11+ messages in thread
From: Minwoo Im @ 2021-02-10 11:14 UTC (permalink / raw)
  To: Klaus Jensen
  Cc: Kevin Wolf, qemu-block, Klaus Jensen, Gollu Appalanaidu,
	qemu-devel, Max Reitz, Keith Busch

On 21-02-10 08:06:46, Klaus Jensen wrote:
> From: Gollu Appalanaidu <anaidu.gollu@samsung.com>
> 
> Add support for marking blocks invalid with the Write Uncorrectable
> command. Block status is tracked in a (non-persistent) bitmap that is
> checked on all reads and written to on all writes. This is potentially
> expensive, so keep Write Uncorrectable disabled by default.
> 
> Signed-off-by: Gollu Appalanaidu <anaidu.gollu@samsung.com>
> Signed-off-by: Klaus Jensen <k.jensen@samsung.com>
> ---
>  docs/specs/nvme.txt   |  3 ++
>  hw/block/nvme-ns.h    |  2 ++
>  hw/block/nvme.h       |  1 +
>  hw/block/nvme-ns.c    |  2 ++
>  hw/block/nvme.c       | 65 +++++++++++++++++++++++++++++++++++++------
>  hw/block/trace-events |  1 +
>  6 files changed, 66 insertions(+), 8 deletions(-)
> 
> diff --git a/docs/specs/nvme.txt b/docs/specs/nvme.txt
> index 56d393884e7a..88f9cc278d4c 100644
> --- a/docs/specs/nvme.txt
> +++ b/docs/specs/nvme.txt
> @@ -19,5 +19,8 @@ Known issues
>  
>  * The accounting numbers in the SMART/Health are reset across power cycles
>  
> +* Marking blocks invalid with the Write Uncorrectable is not persisted across
> +  power cycles.
> +
>  * Interrupt Coalescing is not supported and is disabled by default in volation
>    of the specification.
> diff --git a/hw/block/nvme-ns.h b/hw/block/nvme-ns.h
> index 7af6884862b5..15fa422ded03 100644
> --- a/hw/block/nvme-ns.h
> +++ b/hw/block/nvme-ns.h
> @@ -72,6 +72,8 @@ typedef struct NvmeNamespace {
>      struct {
>          uint32_t err_rec;
>      } features;
> +
> +    unsigned long *uncorrectable;
>  } NvmeNamespace;
>  
>  static inline uint32_t nvme_nsid(NvmeNamespace *ns)
> diff --git a/hw/block/nvme.h b/hw/block/nvme.h
> index 98082b2dfba3..9b8f85b9cf16 100644
> --- a/hw/block/nvme.h
> +++ b/hw/block/nvme.h
> @@ -68,6 +68,7 @@ static inline const char *nvme_io_opc_str(uint8_t opc)
>      case NVME_CMD_FLUSH:            return "NVME_NVM_CMD_FLUSH";
>      case NVME_CMD_WRITE:            return "NVME_NVM_CMD_WRITE";
>      case NVME_CMD_READ:             return "NVME_NVM_CMD_READ";
> +    case NVME_CMD_WRITE_UNCOR:      return "NVME_CMD_WRITE_UNCOR";
>      case NVME_CMD_COMPARE:          return "NVME_NVM_CMD_COMPARE";
>      case NVME_CMD_WRITE_ZEROES:     return "NVME_NVM_CMD_WRITE_ZEROES";
>      case NVME_CMD_DSM:              return "NVME_NVM_CMD_DSM";
> diff --git a/hw/block/nvme-ns.c b/hw/block/nvme-ns.c
> index ade46e2f3739..742bbc4b4b62 100644
> --- a/hw/block/nvme-ns.c
> +++ b/hw/block/nvme-ns.c
> @@ -72,6 +72,8 @@ static int nvme_ns_init(NvmeNamespace *ns, Error **errp)
>      id_ns->mcl = cpu_to_le32(ns->params.mcl);
>      id_ns->msrc = ns->params.msrc;
>  
> +    ns->uncorrectable = bitmap_new(id_ns->nsze);
> +
>      return 0;
>  }
>  
> diff --git a/hw/block/nvme.c b/hw/block/nvme.c
> index e5f6666725d7..56048046c193 100644
> --- a/hw/block/nvme.c
> +++ b/hw/block/nvme.c
> @@ -1112,6 +1112,20 @@ static uint16_t nvme_check_dulbe(NvmeNamespace *ns, uint64_t slba,
>      return NVME_SUCCESS;
>  }
>  
> +static inline uint16_t nvme_check_uncor(NvmeNamespace *ns, uint64_t slba,
> +                                        uint32_t nlb)
> +{
> +    uint64_t elba = nlb + slba;
> +
> +    if (ns->uncorrectable) {
> +        if (find_next_bit(ns->uncorrectable, elba, slba) < elba) {
> +            return NVME_UNRECOVERED_READ | NVME_DNR;
> +        }
> +    }
> +
> +    return NVME_SUCCESS;
> +}
> +
>  static void nvme_aio_err(NvmeRequest *req, int ret)
>  {
>      uint16_t status = NVME_SUCCESS;
> @@ -1423,14 +1437,24 @@ static void nvme_rw_cb(void *opaque, int ret)
>      BlockAcctCookie *acct = &req->acct;
>      BlockAcctStats *stats = blk_get_stats(blk);
>  
> +    bool is_write = nvme_is_write(req);
> +
>      trace_pci_nvme_rw_cb(nvme_cid(req), blk_name(blk));
>  
> -    if (ns->params.zoned && nvme_is_write(req)) {
> +    if (ns->params.zoned && is_write) {
>          nvme_finalize_zoned_write(ns, req);
>      }
>  
>      if (!ret) {
>          block_acct_done(stats, acct);
> +
> +        if (is_write) {
> +            NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
> +            uint64_t slba = le64_to_cpu(rw->slba);
> +            uint32_t nlb = le16_to_cpu(rw->nlb) + 1;
> +
> +            bitmap_clear(ns->uncorrectable, slba, nlb);

It might be nitpick, 'nlb' would easily represent the value which is
defined itself in the spec which is zero-based.  Can we have this like:

	uint32_t nlb = le16_to_cpu(rw->nlb);

	bitmap_clear(ns->uncorrectable, slba, nlb + 1);

Otherwise, it looks good to me.

Reviewed-by: Minwoo Im <minwoo.im.dev@gmail.com>


^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH 2/2] hw/block/nvme: add write uncorrectable command
  2021-02-10 11:14   ` Minwoo Im
@ 2021-02-10 11:42     ` Klaus Jensen
  2021-02-10 15:28       ` Minwoo Im
  0 siblings, 1 reply; 11+ messages in thread
From: Klaus Jensen @ 2021-02-10 11:42 UTC (permalink / raw)
  To: Minwoo Im
  Cc: Kevin Wolf, qemu-block, Klaus Jensen, Gollu Appalanaidu,
	qemu-devel, Max Reitz, Keith Busch

[-- Attachment #1: Type: text/plain, Size: 5213 bytes --]

On Feb 10 20:14, Minwoo Im wrote:
> On 21-02-10 08:06:46, Klaus Jensen wrote:
> > From: Gollu Appalanaidu <anaidu.gollu@samsung.com>
> > 
> > Add support for marking blocks invalid with the Write Uncorrectable
> > command. Block status is tracked in a (non-persistent) bitmap that is
> > checked on all reads and written to on all writes. This is potentially
> > expensive, so keep Write Uncorrectable disabled by default.
> > 
> > Signed-off-by: Gollu Appalanaidu <anaidu.gollu@samsung.com>
> > Signed-off-by: Klaus Jensen <k.jensen@samsung.com>
> > ---
> >  docs/specs/nvme.txt   |  3 ++
> >  hw/block/nvme-ns.h    |  2 ++
> >  hw/block/nvme.h       |  1 +
> >  hw/block/nvme-ns.c    |  2 ++
> >  hw/block/nvme.c       | 65 +++++++++++++++++++++++++++++++++++++------
> >  hw/block/trace-events |  1 +
> >  6 files changed, 66 insertions(+), 8 deletions(-)
> > 
> > diff --git a/docs/specs/nvme.txt b/docs/specs/nvme.txt
> > index 56d393884e7a..88f9cc278d4c 100644
> > --- a/docs/specs/nvme.txt
> > +++ b/docs/specs/nvme.txt
> > @@ -19,5 +19,8 @@ Known issues
> >  
> >  * The accounting numbers in the SMART/Health are reset across power cycles
> >  
> > +* Marking blocks invalid with the Write Uncorrectable is not persisted across
> > +  power cycles.
> > +
> >  * Interrupt Coalescing is not supported and is disabled by default in volation
> >    of the specification.
> > diff --git a/hw/block/nvme-ns.h b/hw/block/nvme-ns.h
> > index 7af6884862b5..15fa422ded03 100644
> > --- a/hw/block/nvme-ns.h
> > +++ b/hw/block/nvme-ns.h
> > @@ -72,6 +72,8 @@ typedef struct NvmeNamespace {
> >      struct {
> >          uint32_t err_rec;
> >      } features;
> > +
> > +    unsigned long *uncorrectable;
> >  } NvmeNamespace;
> >  
> >  static inline uint32_t nvme_nsid(NvmeNamespace *ns)
> > diff --git a/hw/block/nvme.h b/hw/block/nvme.h
> > index 98082b2dfba3..9b8f85b9cf16 100644
> > --- a/hw/block/nvme.h
> > +++ b/hw/block/nvme.h
> > @@ -68,6 +68,7 @@ static inline const char *nvme_io_opc_str(uint8_t opc)
> >      case NVME_CMD_FLUSH:            return "NVME_NVM_CMD_FLUSH";
> >      case NVME_CMD_WRITE:            return "NVME_NVM_CMD_WRITE";
> >      case NVME_CMD_READ:             return "NVME_NVM_CMD_READ";
> > +    case NVME_CMD_WRITE_UNCOR:      return "NVME_CMD_WRITE_UNCOR";
> >      case NVME_CMD_COMPARE:          return "NVME_NVM_CMD_COMPARE";
> >      case NVME_CMD_WRITE_ZEROES:     return "NVME_NVM_CMD_WRITE_ZEROES";
> >      case NVME_CMD_DSM:              return "NVME_NVM_CMD_DSM";
> > diff --git a/hw/block/nvme-ns.c b/hw/block/nvme-ns.c
> > index ade46e2f3739..742bbc4b4b62 100644
> > --- a/hw/block/nvme-ns.c
> > +++ b/hw/block/nvme-ns.c
> > @@ -72,6 +72,8 @@ static int nvme_ns_init(NvmeNamespace *ns, Error **errp)
> >      id_ns->mcl = cpu_to_le32(ns->params.mcl);
> >      id_ns->msrc = ns->params.msrc;
> >  
> > +    ns->uncorrectable = bitmap_new(id_ns->nsze);
> > +
> >      return 0;
> >  }
> >  
> > diff --git a/hw/block/nvme.c b/hw/block/nvme.c
> > index e5f6666725d7..56048046c193 100644
> > --- a/hw/block/nvme.c
> > +++ b/hw/block/nvme.c
> > @@ -1112,6 +1112,20 @@ static uint16_t nvme_check_dulbe(NvmeNamespace *ns, uint64_t slba,
> >      return NVME_SUCCESS;
> >  }
> >  
> > +static inline uint16_t nvme_check_uncor(NvmeNamespace *ns, uint64_t slba,
> > +                                        uint32_t nlb)
> > +{
> > +    uint64_t elba = nlb + slba;
> > +
> > +    if (ns->uncorrectable) {
> > +        if (find_next_bit(ns->uncorrectable, elba, slba) < elba) {
> > +            return NVME_UNRECOVERED_READ | NVME_DNR;
> > +        }
> > +    }
> > +
> > +    return NVME_SUCCESS;
> > +}
> > +
> >  static void nvme_aio_err(NvmeRequest *req, int ret)
> >  {
> >      uint16_t status = NVME_SUCCESS;
> > @@ -1423,14 +1437,24 @@ static void nvme_rw_cb(void *opaque, int ret)
> >      BlockAcctCookie *acct = &req->acct;
> >      BlockAcctStats *stats = blk_get_stats(blk);
> >  
> > +    bool is_write = nvme_is_write(req);
> > +
> >      trace_pci_nvme_rw_cb(nvme_cid(req), blk_name(blk));
> >  
> > -    if (ns->params.zoned && nvme_is_write(req)) {
> > +    if (ns->params.zoned && is_write) {
> >          nvme_finalize_zoned_write(ns, req);
> >      }
> >  
> >      if (!ret) {
> >          block_acct_done(stats, acct);
> > +
> > +        if (is_write) {
> > +            NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
> > +            uint64_t slba = le64_to_cpu(rw->slba);
> > +            uint32_t nlb = le16_to_cpu(rw->nlb) + 1;
> > +
> > +            bitmap_clear(ns->uncorrectable, slba, nlb);
> 
> It might be nitpick, 'nlb' would easily represent the value which is
> defined itself in the spec which is zero-based.  Can we have this like:
> 
> 	uint32_t nlb = le16_to_cpu(rw->nlb);
> 
> 	bitmap_clear(ns->uncorrectable, slba, nlb + 1);
> 


I do not disagree, but the `uint32_t nlb = le16_to_cpu(rw->nlb) + 1;`
pattern is already used in several places.

> Otherwise, it looks good to me.
> 
> Reviewed-by: Minwoo Im <minwoo.im.dev@gmail.com>

-- 
One of us - No more doubt, silence or taboo about mental illness.

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 488 bytes --]

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH 2/2] hw/block/nvme: add write uncorrectable command
  2021-02-10 11:42     ` Klaus Jensen
@ 2021-02-10 15:28       ` Minwoo Im
  0 siblings, 0 replies; 11+ messages in thread
From: Minwoo Im @ 2021-02-10 15:28 UTC (permalink / raw)
  To: Klaus Jensen
  Cc: Kevin Wolf, qemu-block, Klaus Jensen, Gollu Appalanaidu,
	qemu-devel, Max Reitz, Keith Busch

> > It might be nitpick, 'nlb' would easily represent the value which is
> > defined itself in the spec which is zero-based.  Can we have this like:
> > 
> > 	uint32_t nlb = le16_to_cpu(rw->nlb);
> > 
> > 	bitmap_clear(ns->uncorrectable, slba, nlb + 1);
> > 
> 
> 
> I do not disagree, but the `uint32_t nlb = le16_to_cpu(rw->nlb) + 1;`
> pattern is already used in several places.

Oh yes, Now I just saw some places.  Then, please take my review tag for
this patch.

Thanks!


^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH 2/2] hw/block/nvme: add write uncorrectable command
  2021-02-10  7:06 ` [PATCH 2/2] hw/block/nvme: add write uncorrectable command Klaus Jensen
  2021-02-10 11:14   ` Minwoo Im
@ 2021-02-11  3:37   ` Keith Busch
  2021-02-11  8:43     ` Klaus Jensen
  1 sibling, 1 reply; 11+ messages in thread
From: Keith Busch @ 2021-02-11  3:37 UTC (permalink / raw)
  To: Klaus Jensen
  Cc: Kevin Wolf, qemu-block, Klaus Jensen, Gollu Appalanaidu,
	qemu-devel, Max Reitz

On Wed, Feb 10, 2021 at 08:06:46AM +0100, Klaus Jensen wrote:
> From: Gollu Appalanaidu <anaidu.gollu@samsung.com>
> 
> Add support for marking blocks invalid with the Write Uncorrectable
> command. Block status is tracked in a (non-persistent) bitmap that is
> checked on all reads and written to on all writes. This is potentially
> expensive, so keep Write Uncorrectable disabled by default.

I really think attempting to emulate all these things is putting a
potentially unnecessary maintenance burden on this device.

The DULBE implementation started off similiar, but I suggested it
leverage support out of the backing file, and I feel it ended up better
for it.

But unlike punching and checking for holes, there's no filesystem
support for Write Uncorrectable in our qemu API, and that's probably
because this is kind of a niche feature. Is there a use case with a
real qemu guest wanting this?


^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH 2/2] hw/block/nvme: add write uncorrectable command
  2021-02-11  3:37   ` Keith Busch
@ 2021-02-11  8:43     ` Klaus Jensen
  2021-02-11 15:37       ` Keith Busch
  0 siblings, 1 reply; 11+ messages in thread
From: Klaus Jensen @ 2021-02-11  8:43 UTC (permalink / raw)
  To: Keith Busch
  Cc: Kevin Wolf, qemu-block, Klaus Jensen, Gollu Appalanaidu,
	qemu-devel, Max Reitz

[-- Attachment #1: Type: text/plain, Size: 2438 bytes --]

On Feb 11 12:37, Keith Busch wrote:
> On Wed, Feb 10, 2021 at 08:06:46AM +0100, Klaus Jensen wrote:
> > From: Gollu Appalanaidu <anaidu.gollu@samsung.com>
> > 
> > Add support for marking blocks invalid with the Write Uncorrectable
> > command. Block status is tracked in a (non-persistent) bitmap that is
> > checked on all reads and written to on all writes. This is potentially
> > expensive, so keep Write Uncorrectable disabled by default.
> 
> I really think attempting to emulate all these things is putting a
> potentially unnecessary maintenance burden on this device.
> 

Even though I myself posted the Telemetry support on behalf of Gollu, I
now agree that it would bloat the device with a "useless" feature. We
totally accept that and in that initial form there was not a lot of bang
for bucks.

This emulated feature comes at a pretty low cost in terms of code and
complexity, but I won't argue beyond that. However, it does comes at a
performance cost, which is why the intention was to disable it by
default.

> The DULBE implementation started off similiar, but I suggested it
> leverage support out of the backing file, and I feel it ended up better
> for it.
> 

And thanks for pushing back against that - the solution we ended up with
was totally superior, no doubt about that!

> But unlike punching and checking for holes, there's no filesystem
> support for Write Uncorrectable in our qemu API, and that's probably
> because this is kind of a niche feature.

True. I don't see any trivial way to support this on a lower level. Even
if we persued implementing this in the QEMU block layer I only think it
could be supported by "integrated formats" like QCOW2 that could
optionally include a persistent bitmap, like the dirty bitmap feature.

> Is there a use case with a real qemu guest wanting this?

Like for the extended metadata case (which also does not have a lot of
"public" exposure, but definitely have enterprise use), our main
motivation here was to ease testing for compliance suites and frameworks
like SPDK. I'm honestly have no clue what so ever what a real world use
of Write Uncorrectable would be. It's been in the spec since 1.0, so
there must have been some reason, Is it just to align with SCSI WRITE
LONG? I'm not SCSI expert at all, but from what I can read it looks like
that was also intended as a feature for testing read error conditions.

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 488 bytes --]

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH 2/2] hw/block/nvme: add write uncorrectable command
  2021-02-11  8:43     ` Klaus Jensen
@ 2021-02-11 15:37       ` Keith Busch
  2021-02-11 17:54         ` Klaus Jensen
  0 siblings, 1 reply; 11+ messages in thread
From: Keith Busch @ 2021-02-11 15:37 UTC (permalink / raw)
  To: Klaus Jensen
  Cc: Kevin Wolf, qemu-block, Klaus Jensen, Gollu Appalanaidu,
	qemu-devel, Max Reitz

On Thu, Feb 11, 2021 at 09:43:05AM +0100, Klaus Jensen wrote:
> On Feb 11 12:37, Keith Busch wrote:
> 
> > Is there a use case with a real qemu guest wanting this?
> 
> Like for the extended metadata case (which also does not have a lot of
> "public" exposure, but definitely have enterprise use), our main
> motivation here was to ease testing for compliance suites and frameworks
> like SPDK. 

I'm okay with the metadata patches.

> I'm honestly have no clue what so ever what a real world use
> of Write Uncorrectable would be. It's been in the spec since 1.0, so
> there must have been some reason, Is it just to align with SCSI WRITE
> LONG? I'm not SCSI expert at all, but from what I can read it looks like
> that was also intended as a feature for testing read error conditions.

I don't think it's for testing purposes.

If you need to send a burst of non-atomic writes (ex: writing a RAID
stripe), a power failure can result in an inconsistent state where you
don't know at a block level which ones have old data or new data. If you
Write Uncorrectable first, you can never read old data, and thus have no
"write hole".

Journalling solves this better, and I'm not aware of any real
implementation relying on uncorrectable.


^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH 2/2] hw/block/nvme: add write uncorrectable command
  2021-02-11 15:37       ` Keith Busch
@ 2021-02-11 17:54         ` Klaus Jensen
  0 siblings, 0 replies; 11+ messages in thread
From: Klaus Jensen @ 2021-02-11 17:54 UTC (permalink / raw)
  To: Keith Busch
  Cc: Kevin Wolf, qemu-block, Klaus Jensen, Gollu Appalanaidu,
	qemu-devel, Max Reitz

[-- Attachment #1: Type: text/plain, Size: 1491 bytes --]

On Feb 12 00:37, Keith Busch wrote:
> On Thu, Feb 11, 2021 at 09:43:05AM +0100, Klaus Jensen wrote:
> > On Feb 11 12:37, Keith Busch wrote:
> > 
> > > Is there a use case with a real qemu guest wanting this?
> > 
> > Like for the extended metadata case (which also does not have a lot of
> > "public" exposure, but definitely have enterprise use), our main
> > motivation here was to ease testing for compliance suites and frameworks
> > like SPDK. 
> 
> I'm okay with the metadata patches.
> 
> > I'm honestly have no clue what so ever what a real world use
> > of Write Uncorrectable would be. It's been in the spec since 1.0, so
> > there must have been some reason, Is it just to align with SCSI WRITE
> > LONG? I'm not SCSI expert at all, but from what I can read it looks like
> > that was also intended as a feature for testing read error conditions.
> 
> I don't think it's for testing purposes.
> 
> If you need to send a burst of non-atomic writes (ex: writing a RAID
> stripe), a power failure can result in an inconsistent state where you
> don't know at a block level which ones have old data or new data. If you
> Write Uncorrectable first, you can never read old data, and thus have no
> "write hole".
> 
> Journalling solves this better, and I'm not aware of any real
> implementation relying on uncorrectable.

Right, thanks! I'm aware of the RAID write hole issue, but I did not
consider Write Uncorrectable as a possible means to solve it.

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 488 bytes --]

^ permalink raw reply	[flat|nested] 11+ messages in thread

end of thread, other threads:[~2021-02-11 18:02 UTC | newest]

Thread overview: 11+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-02-10  7:06 [PATCH 0/2] hw/block/nvme: oncs and write uncorrectable support Klaus Jensen
2021-02-10  7:06 ` [PATCH 1/2] hw/block/nvme: add oncs device parameter Klaus Jensen
2021-02-10 11:06   ` Minwoo Im
2021-02-10  7:06 ` [PATCH 2/2] hw/block/nvme: add write uncorrectable command Klaus Jensen
2021-02-10 11:14   ` Minwoo Im
2021-02-10 11:42     ` Klaus Jensen
2021-02-10 15:28       ` Minwoo Im
2021-02-11  3:37   ` Keith Busch
2021-02-11  8:43     ` Klaus Jensen
2021-02-11 15:37       ` Keith Busch
2021-02-11 17:54         ` Klaus Jensen

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.