From: Klaus Jensen <its@irrelevant.dk>
To: qemu-devel@nongnu.org
Cc: Kevin Wolf <kwolf@redhat.com>, Fam Zheng <fam@euphon.net>,
qemu-block@nongnu.org, Klaus Jensen <k.jensen@samsung.com>,
Max Reitz <mreitz@redhat.com>, Keith Busch <kbusch@kernel.org>,
Stefan Hajnoczi <stefanha@redhat.com>,
Klaus Jensen <its@irrelevant.dk>
Subject: [PATCH RFC v5 12/12] hw/block/nvme: add persistence for zone info
Date: Fri, 27 Nov 2020 00:46:01 +0100 [thread overview]
Message-ID: <20201126234601.689714-13-its@irrelevant.dk> (raw)
In-Reply-To: <20201126234601.689714-1-its@irrelevant.dk>
From: Klaus Jensen <k.jensen@samsung.com>
Signed-off-by: Klaus Jensen <k.jensen@samsung.com>
---
docs/specs/nvme.txt | 15 +++
hw/block/nvme-ns.h | 16 ++++
hw/block/nvme-ns.c | 212 +++++++++++++++++++++++++++++++++++++++++-
hw/block/nvme.c | 87 +++++++++++++++++
hw/block/trace-events | 2 +
5 files changed, 331 insertions(+), 1 deletion(-)
diff --git a/docs/specs/nvme.txt b/docs/specs/nvme.txt
index 03bb4d9516b4..05d81c88ad4e 100644
--- a/docs/specs/nvme.txt
+++ b/docs/specs/nvme.txt
@@ -20,6 +20,21 @@ The nvme device (-device nvme) emulates an NVM Express Controller.
`zns.mor`; Specifies the number of open resources available. This is a 0s
based value.
+ `zns.pstate`; This parameter specifies another blockdev to be used for
+ storing zone state persistently.
+
+ -drive id=zns-pstate,file=zns-pstate.img,format=raw
+ -device nvme-ns,zns.pstate=zns-pstate,...
+
+ To reset (or initialize) state, the blockdev image should be of zero size:
+
+ qemu-img create -f raw zns-pstate.img 0
+
+ The image will be initialized with a file format header and truncated to
+ the required size. If the pstate given is of non-zero size, it will be
+ assumed to already contain zone state information and the header will be
+ checked.
+
Reference Specifications
------------------------
diff --git a/hw/block/nvme-ns.h b/hw/block/nvme-ns.h
index 05a79a214605..5cb4c1da59ce 100644
--- a/hw/block/nvme-ns.h
+++ b/hw/block/nvme-ns.h
@@ -19,6 +19,15 @@
#define NVME_NS(obj) \
OBJECT_CHECK(NvmeNamespace, (obj), TYPE_NVME_NS)
+#define NVME_ZONE_PSTATE_MAGIC ((0x00 << 24) | ('S' << 16) | ('N' << 8) | 'Z')
+#define NVME_ZONE_PSTATE_V1 1
+
+typedef struct NvmeZonePStateHeader {
+ uint32_t magic;
+ uint32_t version;
+ uint8_t rsvd8[4088];
+} QEMU_PACKED NvmeZonePStateHeader;
+
typedef struct NvmeNamespaceParams {
uint32_t nsid;
uint8_t iocs;
@@ -74,6 +83,8 @@ typedef struct NvmeNamespace {
QTAILQ_HEAD(, NvmeZone) lru_open;
QTAILQ_HEAD(, NvmeZone) lru_active;
} resources;
+
+ BlockBackend *pstate;
} zns;
} NvmeNamespace;
@@ -186,4 +197,9 @@ int nvme_ns_setup(NvmeCtrl *n, NvmeNamespace *ns, Error **errp);
void nvme_ns_drain(NvmeNamespace *ns);
void nvme_ns_flush(NvmeNamespace *ns);
+static inline void _nvme_ns_check_size(void)
+{
+ QEMU_BUILD_BUG_ON(sizeof(NvmeZonePStateHeader) != 4096);
+}
+
#endif /* NVME_NS_H */
diff --git a/hw/block/nvme-ns.c b/hw/block/nvme-ns.c
index cd0f075dd281..4f311dd704c0 100644
--- a/hw/block/nvme-ns.c
+++ b/hw/block/nvme-ns.c
@@ -50,6 +50,31 @@ const char *nvme_zs_to_str(NvmeZoneState zs)
return "UNKNOWN";
}
+static int nvme_blk_truncate(BlockBackend *blk, size_t len, Error **errp)
+{
+ int ret;
+ uint64_t perm, shared_perm;
+
+ blk_get_perm(blk, &perm, &shared_perm);
+
+ ret = blk_set_perm(blk, perm | BLK_PERM_RESIZE, shared_perm, errp);
+ if (ret < 0) {
+ return ret;
+ }
+
+ ret = blk_truncate(blk, len, false, PREALLOC_MODE_OFF, 0, errp);
+ if (ret < 0) {
+ return ret;
+ }
+
+ ret = blk_set_perm(blk, perm, shared_perm, errp);
+ if (ret < 0) {
+ return ret;
+ }
+
+ return 0;
+}
+
static void nvme_ns_zns_init_zones(NvmeNamespace *ns)
{
NvmeZone *zone;
@@ -153,6 +178,176 @@ static int nvme_ns_init(NvmeNamespace *ns, Error **errp)
return 0;
}
+static int nvme_ns_zns_restore_zone_state(NvmeNamespace *ns, Error **errp)
+{
+ for (int i = 0; i < ns->zns.num_zones; i++) {
+ NvmeZone *zone = &ns->zns.zones[i];
+ zone->zd = &ns->zns.zd[i];
+ if (ns->params.zns.zdes) {
+ zone->zde = &ns->zns.zde[i];
+ }
+
+ switch (nvme_zs(zone)) {
+ case NVME_ZS_ZSE:
+ case NVME_ZS_ZSF:
+ case NVME_ZS_ZSRO:
+ case NVME_ZS_ZSO:
+ break;
+
+ case NVME_ZS_ZSC:
+ if (nvme_wp(zone) == nvme_zslba(zone) &&
+ !(zone->zd->za & NVME_ZA_ZDEV)) {
+ nvme_zs_set(zone, NVME_ZS_ZSE);
+ break;
+ }
+
+ if (ns->zns.resources.active) {
+ ns->zns.resources.active--;
+ QTAILQ_INSERT_TAIL(&ns->zns.resources.lru_active, zone,
+ lru_entry);
+ break;
+ }
+
+ /* fallthrough */
+
+ case NVME_ZS_ZSIO:
+ case NVME_ZS_ZSEO:
+ zone->zd->wp = zone->zd->zslba;
+ nvme_zs_set(zone, NVME_ZS_ZSF);
+ break;
+
+ default:
+ error_setg(errp, "invalid zone state");
+ return -1;
+ }
+
+ zone->wp_staging = nvme_wp(zone);
+ }
+
+ return 0;
+}
+
+static int nvme_ns_zns_init_pstate(NvmeNamespace *ns, Error **errp)
+{
+ BlockBackend *blk = ns->zns.pstate;
+ NvmeZonePStateHeader header;
+ size_t zd_len, zde_len;
+ int ret;
+
+ zd_len = ns->zns.num_zones * sizeof(NvmeZoneDescriptor);
+ zde_len = ns->zns.num_zones * nvme_ns_zdes_bytes(ns);
+
+ ret = nvme_blk_truncate(blk, zd_len + zde_len + sizeof(header), errp);
+ if (ret < 0) {
+ error_setg_errno(errp, -ret, "could not truncate zone pstate");
+ return ret;
+ }
+
+ nvme_ns_zns_init_zones(ns);
+
+ ret = blk_pwrite(blk, 0, ns->zns.zd, zd_len, 0);
+ if (ret < 0) {
+ error_setg_errno(errp, -ret, "could not write zone descriptors to "
+ "zone pstate");
+ return ret;
+ }
+
+ header = (NvmeZonePStateHeader) {
+ .magic = cpu_to_le32(NVME_ZONE_PSTATE_MAGIC),
+ .version = cpu_to_le32(NVME_ZONE_PSTATE_V1),
+ };
+
+ ret = blk_pwrite(blk, zd_len + zde_len, &header, sizeof(header), 0);
+ if (ret < 0) {
+ error_setg_errno(errp, -ret, "could not write zone pstate header");
+ return ret;
+ }
+
+ return 0;
+}
+
+static int nvme_ns_zns_load_pstate(NvmeNamespace *ns, size_t len, Error **errp)
+{
+ BlockBackend *blk = ns->zns.pstate;
+ NvmeZonePStateHeader header;
+ size_t zd_len, zde_len;
+ int ret;
+
+ ret = blk_pread(blk, len - sizeof(header), &header, sizeof(header));
+ if (ret < 0) {
+ error_setg_errno(errp, -ret, "could not read zone pstate header");
+ return ret;
+ }
+
+ if (le32_to_cpu(header.magic) != NVME_ZONE_PSTATE_MAGIC) {
+ error_setg(errp, "invalid zone pstate header");
+ return -1;
+ } else if (le32_to_cpu(header.version) > NVME_ZONE_PSTATE_V1) {
+ error_setg(errp, "unsupported zone pstate version");
+ return -1;
+ }
+
+ zd_len = ns->zns.num_zones * sizeof(NvmeZoneDescriptor);
+ zde_len = ns->zns.num_zones * nvme_ns_zdes_bytes(ns);
+
+ ret = blk_pread(blk, 0, ns->zns.zd, zd_len);
+ if (ret < 0) {
+ error_setg_errno(errp, -ret, "could not read zone descriptors from "
+ "zone pstate");
+ return ret;
+ }
+
+ if (zde_len) {
+ ret = blk_pread(blk, zd_len, ns->zns.zde, zde_len);
+ if (ret < 0) {
+ error_setg_errno(errp, -ret, "could not read zone descriptor "
+ "extensions from zone pstate");
+ return ret;
+ }
+ }
+
+ if (nvme_ns_zns_restore_zone_state(ns, errp)) {
+ return -1;
+ }
+
+ ret = blk_pwrite(blk, 0, ns->zns.zd, zd_len, 0);
+ if (ret < 0) {
+ error_setg_errno(errp, -ret, "could not write zone descriptors to "
+ "zone pstate");
+ return ret;
+ }
+
+ return 0;
+}
+
+static int nvme_ns_zns_setup_pstate(NvmeNamespace *ns, Error **errp)
+{
+ BlockBackend *blk = ns->zns.pstate;
+ uint64_t perm, shared_perm;
+ ssize_t len;
+ int ret;
+
+ perm = BLK_PERM_CONSISTENT_READ | BLK_PERM_WRITE;
+ shared_perm = BLK_PERM_ALL;
+
+ ret = blk_set_perm(blk, perm, shared_perm, errp);
+ if (ret) {
+ return ret;
+ }
+
+ len = blk_getlength(blk);
+ if (len < 0) {
+ error_setg_errno(errp, -len, "could not determine zone pstate size");
+ return len;
+ }
+
+ if (!len) {
+ return nvme_ns_zns_init_pstate(ns, errp);
+ }
+
+ return nvme_ns_zns_load_pstate(ns, len, errp);
+}
+
static int nvme_ns_init_blk(NvmeCtrl *n, NvmeNamespace *ns, Error **errp)
{
if (!blkconf_blocksizes(&ns->blkconf, errp)) {
@@ -236,7 +431,13 @@ int nvme_ns_setup(NvmeCtrl *n, NvmeNamespace *ns, Error **errp)
}
if (nvme_ns_zoned(ns)) {
- nvme_ns_zns_init_zones(ns);
+ if (ns->zns.pstate) {
+ if (nvme_ns_zns_setup_pstate(ns, errp)) {
+ return -1;
+ }
+ } else {
+ nvme_ns_zns_init_zones(ns);
+ }
}
if (nvme_register_namespace(n, ns, errp)) {
@@ -249,11 +450,19 @@ int nvme_ns_setup(NvmeCtrl *n, NvmeNamespace *ns, Error **errp)
void nvme_ns_drain(NvmeNamespace *ns)
{
blk_drain(ns->blkconf.blk);
+
+ if (ns->zns.pstate) {
+ blk_drain(ns->zns.pstate);
+ }
}
void nvme_ns_flush(NvmeNamespace *ns)
{
blk_flush(ns->blkconf.blk);
+
+ if (ns->zns.pstate) {
+ blk_flush(ns->zns.pstate);
+ }
}
static void nvme_ns_realize(DeviceState *dev, Error **errp)
@@ -283,6 +492,7 @@ static Property nvme_ns_props[] = {
DEFINE_PROP_UINT8("zns.zdes", NvmeNamespace, params.zns.zdes, 0),
DEFINE_PROP_UINT32("zns.mar", NvmeNamespace, params.zns.mar, 0xffffffff),
DEFINE_PROP_UINT32("zns.mor", NvmeNamespace, params.zns.mor, 0xffffffff),
+ DEFINE_PROP_DRIVE("zns.pstate", NvmeNamespace, zns.pstate),
DEFINE_PROP_END_OF_LIST(),
};
diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index e62efd7cf0c4..04ad9f20223d 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -1023,6 +1023,46 @@ static uint16_t nvme_check_dulbe(NvmeNamespace *ns, uint64_t slba,
return NVME_SUCCESS;
}
+static int nvme_zns_commit_zone(NvmeNamespace *ns, NvmeZone *zone)
+{
+ uint64_t zslba;
+ int64_t offset;
+
+ if (!ns->zns.pstate) {
+ return 0;
+ }
+
+ trace_pci_nvme_zns_commit_zone(nvme_nsid(ns), nvme_zslba(zone));
+
+ zslba = nvme_zslba(zone);
+ offset = nvme_ns_zone_idx(ns, zslba) * sizeof(NvmeZoneDescriptor);
+
+ return blk_pwrite(ns->zns.pstate, offset, zone->zd,
+ sizeof(NvmeZoneDescriptor), 0);
+}
+
+static int nvme_zns_commit_zde(NvmeNamespace *ns, NvmeZone *zone)
+{
+ uint64_t zslba;
+ int zidx;
+ size_t zd_len, zdes_bytes;
+ int64_t offset;
+
+ if (!ns->zns.pstate) {
+ return 0;
+ }
+
+ trace_pci_nvme_zns_commit_zde(nvme_nsid(ns), nvme_zslba(zone));
+
+ zd_len = ns->zns.num_zones * sizeof(NvmeZoneDescriptor);
+ zslba = nvme_zslba(zone);
+ zidx = nvme_ns_zone_idx(ns, zslba);
+ zdes_bytes = nvme_ns_zdes_bytes(ns);
+ offset = zd_len + zidx * zdes_bytes;
+
+ return blk_pwrite(ns->zns.pstate, offset, zone->zde, zdes_bytes, 0);
+}
+
static inline void nvme_zone_reset_wp(NvmeZone *zone)
{
zone->zd->wp = zone->zd->zslba;
@@ -1058,6 +1098,10 @@ static uint16_t nvme_zrm_release_open(NvmeNamespace *ns)
return status;
}
+ if (nvme_zns_commit_zone(ns, candidate) < 0) {
+ return NVME_INTERNAL_DEV_ERROR;
+ }
+
return NVME_SUCCESS;
}
@@ -1252,6 +1296,10 @@ static uint16_t __nvme_zns_advance_wp(NvmeNamespace *ns, NvmeZone *zone,
if (status) {
return status;
}
+
+ if (nvme_zns_commit_zone(ns, zone) < 0) {
+ return NVME_INTERNAL_DEV_ERROR;
+ }
}
return NVME_SUCCESS;
@@ -1307,6 +1355,10 @@ static void nvme_aio_err(NvmeRequest *req, int ret, NvmeZone *zone)
NVME_ZS_ZSRO : NVME_ZS_ZSO;
nvme_zrm_transition(ns, zone, zs);
+
+ if (nvme_zns_commit_zone(req->ns, zone) < 0) {
+ req->status = NVME_INTERNAL_DEV_ERROR;
+ }
}
/*
@@ -1618,6 +1670,10 @@ static void nvme_aio_zone_reset_cb(void *opaque, int ret)
nvme_aio_err(req, ret, zone);
}
+ if (nvme_zns_commit_zone(req->ns, zone) < 0) {
+ req->status = NVME_INTERNAL_DEV_ERROR;
+ }
+
(*resets)--;
if (*resets) {
@@ -1657,6 +1713,10 @@ static uint16_t nvme_zone_mgmt_send_close(NvmeCtrl *n, NvmeRequest *req,
return status;
}
+ if (nvme_zns_commit_zone(ns, zone) < 0) {
+ return NVME_INTERNAL_DEV_ERROR;
+ }
+
return NVME_SUCCESS;
}
@@ -1678,6 +1738,10 @@ static uint16_t nvme_zone_mgmt_send_finish(NvmeCtrl *n, NvmeRequest *req,
return status;
}
+ if (nvme_zns_commit_zone(ns, zone) < 0) {
+ return NVME_INTERNAL_DEV_ERROR;
+ }
+
return NVME_SUCCESS;
}
@@ -1699,6 +1763,10 @@ static uint16_t nvme_zone_mgmt_send_open(NvmeCtrl *n, NvmeRequest *req,
return status;
}
+ if (nvme_zns_commit_zone(ns, zone) < 0) {
+ return NVME_INTERNAL_DEV_ERROR;
+ }
+
return NVME_SUCCESS;
}
@@ -1754,6 +1822,10 @@ static uint16_t nvme_zone_mgmt_send_offline(NvmeCtrl *n, NvmeRequest *req,
case NVME_ZS_ZSRO:
nvme_zrm_transition(ns, zone, NVME_ZS_ZSO);
+ if (nvme_zns_commit_zone(ns, zone) < 0) {
+ return NVME_INTERNAL_DEV_ERROR;
+ }
+
/* fallthrough */
case NVME_ZS_ZSO:
@@ -1793,6 +1865,10 @@ static uint16_t nvme_zone_mgmt_send_set_zde(NvmeCtrl *n, NvmeRequest *req,
return status;
}
+ if (nvme_zns_commit_zde(ns, zone) < 0) {
+ return NVME_INTERNAL_DEV_ERROR;
+ }
+
status = nvme_zrm_transition(ns, zone, NVME_ZS_ZSC);
if (status) {
return status;
@@ -1800,6 +1876,10 @@ static uint16_t nvme_zone_mgmt_send_set_zde(NvmeCtrl *n, NvmeRequest *req,
NVME_ZA_SET(zone->zd->za, NVME_ZA_ZDEV);
+ if (nvme_zns_commit_zone(ns, zone) < 0) {
+ return NVME_INTERNAL_DEV_ERROR;
+ }
+
return NVME_SUCCESS;
}
@@ -2502,6 +2582,11 @@ static uint16_t nvme_write(NvmeCtrl *n, NvmeRequest *req)
goto invalid;
}
+ if (nvme_zns_commit_zone(ns, zone) < 0) {
+ status = NVME_INTERNAL_DEV_ERROR;
+ goto invalid;
+ }
+
break;
}
@@ -3778,6 +3863,8 @@ static void nvme_ctrl_shutdown(NvmeCtrl *n)
nvme_zrm_transition(ns, zone, NVME_ZS_ZSE);
}
+ nvme_zns_commit_zone(ns, zone);
+
/* fallthrough */
default:
diff --git a/hw/block/trace-events b/hw/block/trace-events
index 31482bfba1fe..aa5491c398b9 100644
--- a/hw/block/trace-events
+++ b/hw/block/trace-events
@@ -96,6 +96,8 @@ pci_nvme_enqueue_req_completion(uint16_t cid, uint16_t cqid, uint16_t status) "c
pci_nvme_zrm_transition(uint32_t nsid, uint64_t zslba, const char *s_from, uint8_t from, const char *s_to, uint8_t to) "nsid %"PRIu32" zslba 0x%"PRIx64" from '%s' (%"PRIu8") to '%s' (%"PRIu8")"
pci_nvme_zrm_release_open(uint32_t nsid) "nsid %"PRIu32""
pci_nvme_zns_advance_wp(uint32_t nsid, uint64_t zslba, uint64_t wp_orig, uint32_t nlb) "nsid 0x%"PRIx32" zslba 0x%"PRIx64" wp_orig 0x%"PRIx64" nlb %"PRIu32""
+pci_nvme_zns_commit_zone(uint32_t nsid, uint64_t zslba) "nsid 0x%"PRIx32" zslba 0x%"PRIx64""
+pci_nvme_zns_commit_zde(uint32_t nsid, uint64_t zslba) "nsid 0x%"PRIx32" zslba 0x%"PRIx64""
pci_nvme_mmio_read(uint64_t addr) "addr 0x%"PRIx64""
pci_nvme_mmio_write(uint64_t addr, uint64_t data) "addr 0x%"PRIx64" data 0x%"PRIx64""
pci_nvme_mmio_doorbell_cq(uint16_t cqid, uint16_t new_head) "cqid %"PRIu16" new_head %"PRIu16""
--
2.29.2
next prev parent reply other threads:[~2020-11-26 23:55 UTC|newest]
Thread overview: 17+ messages / expand[flat|nested] mbox.gz Atom feed top
2020-11-26 23:45 [PATCH v5 00/12] hw/block/nvme: zoned namespace command set Klaus Jensen
2020-11-26 23:45 ` [PATCH v5 01/12] hw/block/nvme: Separate read and write handlers Klaus Jensen
2020-11-26 23:45 ` [PATCH v5 02/12] hw/block/nvme: Merge nvme_write_zeroes() with nvme_write() Klaus Jensen
2020-11-26 23:45 ` [PATCH v5 03/12] hw/block/nvme: add commands supported and effects log page Klaus Jensen
2020-11-26 23:45 ` [PATCH v5 04/12] hw/block/nvme: Generate namespace UUIDs Klaus Jensen
2020-11-26 23:45 ` [PATCH v5 05/12] hw/block/nvme: support namespace types Klaus Jensen
2020-11-26 23:45 ` [PATCH v5 06/12] hw/block/nvme: add basic read/write for zoned namespaces Klaus Jensen
2020-11-26 23:45 ` [PATCH v5 07/12] hw/block/nvme: add the zone management receive command Klaus Jensen
2020-11-26 23:45 ` [PATCH v5 08/12] hw/block/nvme: add the zone management send command Klaus Jensen
2020-11-26 23:45 ` [PATCH v5 09/12] hw/block/nvme: add the zone append command Klaus Jensen
2020-11-26 23:45 ` [PATCH v5 10/12] hw/block/nvme: track and enforce zone resources Klaus Jensen
2020-11-26 23:46 ` [PATCH v5 11/12] hw/block/nvme: allow open to close zone transitions by controller Klaus Jensen
2020-11-26 23:46 ` Klaus Jensen [this message]
2020-11-30 12:33 ` [PATCH RFC v5 12/12] hw/block/nvme: add persistence for zone info Stefan Hajnoczi
2020-11-30 12:59 ` Klaus Jensen
2020-11-30 13:18 ` Klaus Jensen
2020-11-30 14:58 ` Stefan Hajnoczi
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20201126234601.689714-13-its@irrelevant.dk \
--to=its@irrelevant.dk \
--cc=fam@euphon.net \
--cc=k.jensen@samsung.com \
--cc=kbusch@kernel.org \
--cc=kwolf@redhat.com \
--cc=mreitz@redhat.com \
--cc=qemu-block@nongnu.org \
--cc=qemu-devel@nongnu.org \
--cc=stefanha@redhat.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.