All of lore.kernel.org
 help / color / mirror / Atom feed
* [RFC PATCH 0/5] nvme: ANA support
@ 2018-05-04 11:28 Hannes Reinecke
  2018-05-04 11:28 ` [PATCH 1/5] nvmet: EUI64 support Hannes Reinecke
                   ` (4 more replies)
  0 siblings, 5 replies; 33+ messages in thread
From: Hannes Reinecke @ 2018-05-04 11:28 UTC (permalink / raw)


Hi all,

here's a prototype implementation for ANA support in nvme.
The implementation itself is pretty straightforward; only the ANA group
linking for NVMe target required some fiddling.

As usual, comments and reviews are welcome.

Hannes Reinecke (5):
  nvmet: EUI64 support
  nvme: Add ANA base definitions
  nvmet: Add ANA base support
  block: BLK_STS_NEXUS is a path failure
  nvme: ANA base support

 drivers/nvme/host/core.c        | 122 ++++++++++++++-
 drivers/nvme/host/multipath.c   |  12 +-
 drivers/nvme/host/nvme.h        |   3 +
 drivers/nvme/target/admin-cmd.c |  75 ++++++++-
 drivers/nvme/target/configfs.c  | 328 ++++++++++++++++++++++++++++++++++++++++
 drivers/nvme/target/core.c      |  78 +++++++++-
 drivers/nvme/target/discovery.c |  16 ++
 drivers/nvme/target/io-cmd.c    |  10 ++
 drivers/nvme/target/nvmet.h     |  46 ++++++
 include/linux/blk_types.h       |   1 -
 include/linux/nvme.h            |  47 +++++-
 11 files changed, 723 insertions(+), 15 deletions(-)

-- 
2.12.3

^ permalink raw reply	[flat|nested] 33+ messages in thread

* [PATCH 1/5] nvmet: EUI64 support
  2018-05-04 11:28 [RFC PATCH 0/5] nvme: ANA support Hannes Reinecke
@ 2018-05-04 11:28 ` Hannes Reinecke
  2018-05-07  7:17   ` Johannes Thumshirn
                     ` (2 more replies)
  2018-05-04 11:28 ` [PATCH 2/5] nvme: Add ANA base definitions Hannes Reinecke
                   ` (3 subsequent siblings)
  4 siblings, 3 replies; 33+ messages in thread
From: Hannes Reinecke @ 2018-05-04 11:28 UTC (permalink / raw)


Allow the setting of an IEEE Extended Unique Identifier (EUI64) for
each namespace. As the OUI is per subsystem ensure that the OUI
part of the EUI64 value (ie the top 3 bytes) are matching for each
namespace.

Signed-off-by: Hannes Reinecke <hare at suse.com>
---
 drivers/nvme/target/admin-cmd.c |  6 ++---
 drivers/nvme/target/configfs.c  | 58 +++++++++++++++++++++++++++++++++++++++++
 drivers/nvme/target/nvmet.h     |  2 ++
 3 files changed, 62 insertions(+), 4 deletions(-)

diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c
index 5e0e9fcc0d4d..a53259e1082e 100644
--- a/drivers/nvme/target/admin-cmd.c
+++ b/drivers/nvme/target/admin-cmd.c
@@ -187,10 +187,7 @@ static void nvmet_execute_identify_ctrl(struct nvmet_req *req)
 
 	id->rab = 6;
 
-	/*
-	 * XXX: figure out how we can assign a IEEE OUI, but until then
-	 * the safest is to leave it as zeroes.
-	 */
+	memcpy(id->ieee, &ctrl->subsys->oui, 3);
 
 	/* we support multiple ports and multiples hosts: */
 	id->cmic = (1 << 0) | (1 << 1);
@@ -314,6 +311,7 @@ static void nvmet_execute_identify_ns(struct nvmet_req *req)
 	id->nmic = (1 << 0);
 
 	memcpy(&id->nguid, &ns->nguid, sizeof(uuid_le));
+	memcpy(&id->eui64, &ns->eui64, sizeof(id->eui64));
 
 	id->lbaf[0].ds = ns->blksize_shift;
 
diff --git a/drivers/nvme/target/configfs.c b/drivers/nvme/target/configfs.c
index ad9ff27234b5..3dc2b2ae56e5 100644
--- a/drivers/nvme/target/configfs.c
+++ b/drivers/nvme/target/configfs.c
@@ -378,6 +378,49 @@ static ssize_t nvmet_ns_device_nguid_store(struct config_item *item,
 
 CONFIGFS_ATTR(nvmet_ns_, device_nguid);
 
+static ssize_t nvmet_ns_device_eui64_show(struct config_item *item, char *page)
+{
+	return sprintf(page, "%llx\n",
+		       (unsigned long long)&to_nvmet_ns(item)->eui64);
+}
+
+static ssize_t nvmet_ns_device_eui64_store(struct config_item *item,
+		const char *page, size_t count)
+{
+	struct nvmet_ns *ns = to_nvmet_ns(item), *n;
+	struct nvmet_subsys *subsys = ns->subsys;
+	u64 eui64;
+	int ret = 0;
+
+	mutex_lock(&subsys->lock);
+	if (ns->enabled) {
+		ret = -EBUSY;
+		goto out_unlock;
+	}
+
+	if (sscanf(page, "%llx\n", &eui64) != 1) {
+		ret = -EINVAL;
+		goto out_unlock;
+	}
+
+	list_for_each_entry_rcu(n, &subsys->namespaces, dev_link) {
+		if ((n->eui64 >> 40) != (eui64 >> 40)) {
+			pr_warn("non-matching EUI64 company id %06x\n",
+				(unsigned int)(eui64 >> 40));
+			ret = -EINVAL;
+			goto out_unlock;
+		}
+	}
+	ns->eui64 = eui64;
+	if (!subsys->oui)
+		subsys->oui = (eui64 >> 40);
+out_unlock:
+	mutex_unlock(&subsys->lock);
+	return ret ? ret : count;
+}
+
+CONFIGFS_ATTR(nvmet_ns_, device_eui64);
+
 static ssize_t nvmet_ns_enable_show(struct config_item *item, char *page)
 {
 	return sprintf(page, "%d\n", to_nvmet_ns(item)->enabled);
@@ -407,6 +450,7 @@ static struct configfs_attribute *nvmet_ns_attrs[] = {
 	&nvmet_ns_attr_device_path,
 	&nvmet_ns_attr_device_nguid,
 	&nvmet_ns_attr_device_uuid,
+	&nvmet_ns_attr_device_eui64,
 	&nvmet_ns_attr_enable,
 	NULL,
 };
@@ -704,10 +748,24 @@ static ssize_t nvmet_subsys_attr_serial_store(struct config_item *item,
 }
 CONFIGFS_ATTR(nvmet_subsys_, attr_serial);
 
+/*
+ * OUI is set from the namespace EUI64, so we cannot set it here.
+ */
+static ssize_t nvmet_subsys_attr_oui_show(struct config_item *item,
+					     char *page)
+{
+	struct nvmet_subsys *subsys = to_subsys(item);
+
+	return snprintf(page, PAGE_SIZE, "%06x\n", subsys->oui);
+}
+
+CONFIGFS_ATTR_RO(nvmet_subsys_, attr_oui);
+
 static struct configfs_attribute *nvmet_subsys_attrs[] = {
 	&nvmet_subsys_attr_attr_allow_any_host,
 	&nvmet_subsys_attr_attr_version,
 	&nvmet_subsys_attr_attr_serial,
+	&nvmet_subsys_attr_attr_oui,
 	NULL,
 };
 
diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h
index 15fd84ab21f8..8cd42ed37314 100644
--- a/drivers/nvme/target/nvmet.h
+++ b/drivers/nvme/target/nvmet.h
@@ -47,6 +47,7 @@ struct nvmet_ns {
 	u32			blksize_shift;
 	loff_t			size;
 	u8			nguid[16];
+	u64			eui64;
 	uuid_t			uuid;
 
 	bool			enabled;
@@ -154,6 +155,7 @@ struct nvmet_subsys {
 
 	u64			ver;
 	u64			serial;
+	u32			oui;
 	char			*subsysnqn;
 
 	struct config_group	group;
-- 
2.12.3

^ permalink raw reply related	[flat|nested] 33+ messages in thread

* [PATCH 2/5] nvme: Add ANA base definitions
  2018-05-04 11:28 [RFC PATCH 0/5] nvme: ANA support Hannes Reinecke
  2018-05-04 11:28 ` [PATCH 1/5] nvmet: EUI64 support Hannes Reinecke
@ 2018-05-04 11:28 ` Hannes Reinecke
  2018-05-04 16:17   ` Keith Busch
                     ` (2 more replies)
  2018-05-04 11:28 ` [PATCH 3/5] nvmet: Add ANA base support Hannes Reinecke
                   ` (2 subsequent siblings)
  4 siblings, 3 replies; 33+ messages in thread
From: Hannes Reinecke @ 2018-05-04 11:28 UTC (permalink / raw)


Add definitions from the ANA TP errata.

Signed-off-by: Hannes Reinecke <hare at suse.com>
---
 include/linux/nvme.h | 47 ++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 44 insertions(+), 3 deletions(-)

diff --git a/include/linux/nvme.h b/include/linux/nvme.h
index 4112e2bd747f..40c2d08ee761 100644
--- a/include/linux/nvme.h
+++ b/include/linux/nvme.h
@@ -242,7 +242,11 @@ struct nvme_id_ctrl {
 	__le32			sanicap;
 	__le32			hmminds;
 	__le16			hmmaxd;
-	__u8			rsvd338[174];
+	__u8			anatt;
+	__u8			anacap;
+	__le32			anagrpmax;
+	__le32			nanagrpid;
+	__u8			rsvd338[164];
 	__u8			sqes;
 	__u8			cqes;
 	__le16			maxcmd;
@@ -258,7 +262,8 @@ struct nvme_id_ctrl {
 	__le16			acwu;
 	__u8			rsvd534[2];
 	__le32			sgls;
-	__u8			rsvd540[228];
+	__le32			mnn;
+	__u8			rsvd540[224];
 	char			subnqn[256];
 	__u8			rsvd1024[768];
 	__le32			ioccsz;
@@ -312,7 +317,8 @@ struct nvme_id_ns {
 	__le16			nabspf;
 	__le16			noiob;
 	__u8			nvmcap[16];
-	__u8			rsvd64[40];
+	__u8			rsvd64[36];
+	__le32			anagrpid;
 	__u8			nguid[16];
 	__u8			eui64[8];
 	struct nvme_lbaf	lbaf[16];
@@ -440,6 +446,7 @@ enum {
 	NVME_AER_VS			= 7,
 	NVME_AER_NOTICE_NS_CHANGED	= 0x0002,
 	NVME_AER_NOTICE_FW_ACT_STARTING = 0x0102,
+	NVME_AER_NOTICE_ANA_CHANGE	= 0x0302,
 };
 
 struct nvme_lba_range_type {
@@ -748,11 +755,17 @@ enum {
 	NVME_LOG_SMART		= 0x02,
 	NVME_LOG_FW_SLOT	= 0x03,
 	NVME_LOG_CMD_EFFECTS	= 0x05,
+	NVME_LOG_ANA		= 0x0c,
 	NVME_LOG_DISC		= 0x70,
 	NVME_LOG_RESERVATION	= 0x80,
 	NVME_FWACT_REPL		= (0 << 3),
 	NVME_FWACT_REPL_ACTV	= (1 << 3),
 	NVME_FWACT_ACTV		= (2 << 3),
+	NVME_ANA_STATE_OPTIMIZED = 0x01,
+	NVME_ANA_STATE_NONOPTIMIZED = 0x02,
+	NVME_ANA_STATE_INACCESSIBLE = 0x03,
+	NVME_ANA_STATE_PERSISTENT_LOSS = 0x04,
+	NVME_ANA_STATE_CHANGE_STATE = 0x05,
 };
 
 struct nvme_identify {
@@ -917,6 +930,23 @@ struct nvmf_common_command {
 	__u8	ts[24];
 };
 
+/* Asymmetric Namespace Access log page entry */
+struct nvmf_ana_group_descriptor {
+	__le32 groupid;
+	__le32 nsid_num;
+	__le64 chgcnt;
+	__u8 ana_state;
+	__u8 resv1[7];
+	__le32 nsid[0];
+};
+
+struct nvmf_ana_rsp_page_header {
+	__le64 chgcnt;
+	__le16 grpid_num;
+	__le16 resv[3];
+	struct nvmf_ana_group_descriptor desc[0];
+};
+
 /*
  * The legal cntlid range a NVMe Target will provide.
  * Note that cntlid of value 0 is considered illegal in the fabrics world.
@@ -1135,6 +1165,8 @@ enum {
 	NVME_SC_NS_NOT_ATTACHED		= 0x11a,
 	NVME_SC_THIN_PROV_NOT_SUPP	= 0x11b,
 	NVME_SC_CTRL_LIST_INVALID	= 0x11c,
+	NVME_SC_GRP_ID_INVALID		= 0x123,
+	NVME_SC_ANA_ATTACH_FAILED	= 0x124,
 
 	/*
 	 * I/O Command Set Specific - NVM commands:
@@ -1168,6 +1200,15 @@ enum {
 	NVME_SC_ACCESS_DENIED		= 0x286,
 	NVME_SC_UNWRITTEN_BLOCK		= 0x287,
 
+	/* Path-related Errors */
+	NVME_SC_INTERNAL_PATH_ERROR	= 0x300,
+	NVME_SC_ANA_PERSISTENT_LOSS	= 0x301,
+	NVME_SC_ANA_INACCESSIBLE	= 0x302,
+	NVME_SC_ANA_TRANSITION		= 0x303,
+	NVME_SC_CTRL_PATHING_ERROR	= 0x360,
+	NVME_SC_HOST_PATHING_ERROR	= 0x370,
+	NVME_SC_HOST_ABORTED		= 0x371,
+
 	NVME_SC_DNR			= 0x4000,
 };
 
-- 
2.12.3

^ permalink raw reply related	[flat|nested] 33+ messages in thread

* [PATCH 3/5] nvmet: Add ANA base support
  2018-05-04 11:28 [RFC PATCH 0/5] nvme: ANA support Hannes Reinecke
  2018-05-04 11:28 ` [PATCH 1/5] nvmet: EUI64 support Hannes Reinecke
  2018-05-04 11:28 ` [PATCH 2/5] nvme: Add ANA base definitions Hannes Reinecke
@ 2018-05-04 11:28 ` Hannes Reinecke
  2018-05-06  2:42   ` Guan Junxiong
  2018-05-04 11:28 ` [PATCH 4/5] block: BLK_STS_NEXUS is a path failure Hannes Reinecke
  2018-05-04 11:28 ` [PATCH 5/5] nvme: ANA base support Hannes Reinecke
  4 siblings, 1 reply; 33+ messages in thread
From: Hannes Reinecke @ 2018-05-04 11:28 UTC (permalink / raw)


Add ANA support to the nvme target. The ANA configuration is optional,
and doesn't interfere with existing configurations.
Each subsystem has distinct ANA groups; if ANA groups are created
it is required to link the ANA groups into the individual ports.
Linking the entire subsystem will be refused if ANA groups are
specified.
Also this implementation has a limit of one single ANA state per
groups, irrespective of the path. So when distinct ANA states
are required one needs to create different ANA groups.

Signed-off-by: Hannes Reinecke <hare at suse.com>
---
 drivers/nvme/target/admin-cmd.c |  69 +++++++++-
 drivers/nvme/target/configfs.c  | 272 +++++++++++++++++++++++++++++++++++++++-
 drivers/nvme/target/core.c      |  78 +++++++++++-
 drivers/nvme/target/discovery.c |  16 +++
 drivers/nvme/target/io-cmd.c    |  10 ++
 drivers/nvme/target/nvmet.h     |  44 +++++++
 6 files changed, 483 insertions(+), 6 deletions(-)

diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c
index a53259e1082e..7e47527963e4 100644
--- a/drivers/nvme/target/admin-cmd.c
+++ b/drivers/nvme/target/admin-cmd.c
@@ -102,6 +102,35 @@ static u16 nvmet_get_smart_log(struct nvmet_req *req,
 	return status;
 }
 
+static u16 nvmet_get_ana_log(struct nvmet_req *req,
+			     struct nvmf_ana_rsp_page_header *alog)
+{
+	struct nvmf_ana_group_descriptor *desc;
+	struct nvmet_ns *ns;
+	struct nvmet_ctrl *ctrl;
+	int nsid = 0;
+
+	ctrl = req->sq->ctrl;
+
+	put_unaligned_le64(ctrl->subsys->change_count, &alog->chgcnt);
+	/* We only have one ANA group per controller for now */
+	put_unaligned_le64(1, &alog->grpid_num);
+	desc = &alog->desc[0];
+
+	put_unaligned_le32(ctrl->ag->grpid, &desc->groupid);
+	put_unaligned_le64(ctrl->ag->change_count, &desc->chgcnt);
+	desc->ana_state = ctrl->ag->state;
+	rcu_read_lock();
+	list_for_each_entry_rcu(ns, &ctrl->subsys->namespaces, dev_link) {
+		put_unaligned_le32(ns->nsid, &desc->nsid[nsid]);
+		nsid++;
+	}
+	rcu_read_unlock();
+	put_unaligned_le32(nsid, &desc->nsid_num);
+
+	return NVME_SC_SUCCESS;
+}
+
 static void nvmet_execute_get_log_page(struct nvmet_req *req)
 {
 	struct nvme_smart_log *smart_log;
@@ -149,6 +178,11 @@ static void nvmet_execute_get_log_page(struct nvmet_req *req)
 		 * still claim to fully implement this mandatory log page.
 		 */
 		break;
+	case NVME_LOG_ANA:
+		status = nvmet_get_ana_log(req, buf);
+		if (status)
+			goto err;
+		break;
 	default:
 		BUG();
 	}
@@ -165,6 +199,8 @@ static void nvmet_execute_identify_ctrl(struct nvmet_req *req)
 {
 	struct nvmet_ctrl *ctrl = req->sq->ctrl;
 	struct nvme_id_ctrl *id;
+	struct nvmet_ag *ag;
+	u32 num_ag = 0;
 	u16 status = 0;
 	const char model[] = "Linux";
 
@@ -189,9 +225,26 @@ static void nvmet_execute_identify_ctrl(struct nvmet_req *req)
 
 	memcpy(id->ieee, &ctrl->subsys->oui, 3);
 
-	/* we support multiple ports and multiples hosts: */
+	rcu_read_lock();
+	list_for_each_entry_rcu(ag, &ctrl->subsys->ana_groups, entry)
+		num_ag++;
+	rcu_read_unlock();
+
+	/* we support multiple ports and multiple hosts */
 	id->cmic = (1 << 0) | (1 << 1);
 
+	/* ANA support */
+	if (num_ag > 0) {
+		id->cmic |= (1 << 3);
+
+		/* All ANA states are supported */
+		id->anacap = (1 << 0) | (1 << 1) | (1 << 2) | \
+			(1 << 3) | (1 << 4);
+		id->anatt = ctrl->subsys->ana_tt;
+		id->anagrpmax = cpu_to_le32((u32)-1);
+		id->nanagrpid = cpu_to_le32(num_ag);
+	}
+
 	/* no limit on data transfer sizes for now */
 	id->mdts = 0;
 	id->cntlid = cpu_to_le16(ctrl->cntlid);
@@ -277,6 +330,7 @@ static void nvmet_execute_identify_ns(struct nvmet_req *req)
 	struct nvmet_ns *ns;
 	struct nvme_id_ns *id;
 	u16 status = 0;
+	u8 ag_state = NVME_ANA_STATE_OPTIMIZED;
 
 	ns = nvmet_find_namespace(req->sq->ctrl, req->cmd->identify.nsid);
 	if (!ns) {
@@ -290,12 +344,24 @@ static void nvmet_execute_identify_ns(struct nvmet_req *req)
 		goto out_put_ns;
 	}
 
+	if (req->sq->ctrl->ag) {
+		id->anagrpid = cpu_to_le32(req->sq->ctrl->ag->grpid);
+		ag_state = req->sq->ctrl->ag->state;
+	}
+
 	/*
 	 * nuse = ncap = nsze isn't always true, but we have no way to find
 	 * that out from the underlying device.
 	 */
 	id->ncap = id->nuse = id->nsze =
 		cpu_to_le64(ns->size >> ns->blksize_shift);
+	/*
+	 * nuse and nsze should be zero for inaccessible or
+	 * persistent loss ANA state.
+	 */
+	if ((ag_state == NVME_ANA_STATE_INACCESSIBLE) ||
+	    (ag_state == NVME_ANA_STATE_PERSISTENT_LOSS))
+		id->nuse = id->nsze = 0;
 
 	/*
 	 * We just provide a single LBA format that matches what the
@@ -560,6 +626,7 @@ u16 nvmet_parse_admin_cmd(struct nvmet_req *req)
 		case NVME_LOG_ERROR:
 		case NVME_LOG_SMART:
 		case NVME_LOG_FW_SLOT:
+		case NVME_LOG_ANA:
 			req->execute = nvmet_execute_get_log_page;
 			return 0;
 		}
diff --git a/drivers/nvme/target/configfs.c b/drivers/nvme/target/configfs.c
index 3dc2b2ae56e5..851ad26550c4 100644
--- a/drivers/nvme/target/configfs.c
+++ b/drivers/nvme/target/configfs.c
@@ -22,6 +22,7 @@
 
 static const struct config_item_type nvmet_host_type;
 static const struct config_item_type nvmet_subsys_type;
+static const struct config_item_type nvmet_ag_type;
 
 static const struct nvmet_transport_name {
 	u8		type;
@@ -381,7 +382,7 @@ CONFIGFS_ATTR(nvmet_ns_, device_nguid);
 static ssize_t nvmet_ns_device_eui64_show(struct config_item *item, char *page)
 {
 	return sprintf(page, "%llx\n",
-		       (unsigned long long)&to_nvmet_ns(item)->eui64);
+		       (unsigned long long)to_nvmet_ns(item)->eui64);
 }
 
 static ssize_t nvmet_ns_device_eui64_store(struct config_item *item,
@@ -522,6 +523,10 @@ static int nvmet_port_subsys_allow_link(struct config_item *parent,
 		pr_err("can only link subsystems into the subsystems dir.!\n");
 		return -EINVAL;
 	}
+	if (!list_empty(&port->ags)) {
+		pr_err("can only link subsystems if no ANA groups are active!\n");
+		return -EAGAIN;
+	}
 	subsys = to_subsys(target);
 	link = kmalloc(sizeof(*link), GFP_KERNEL);
 	if (!link)
@@ -586,6 +591,227 @@ static const struct config_item_type nvmet_port_subsys_type = {
 	.ct_owner		= THIS_MODULE,
 };
 
+static int nvmet_port_ags_allow_link(struct config_item *parent,
+		struct config_item *target)
+{
+	struct nvmet_port *port = to_nvmet_port(parent->ci_parent);
+	struct nvmet_ag *ag;
+	struct nvmet_ag_link *link, *p;
+	int ret;
+
+	if (target->ci_type != &nvmet_ag_type) {
+		pr_err("can only link ana_groups into the ana_groups dir.!\n");
+		return -EINVAL;
+	}
+	if (!list_empty(&port->subsystems)) {
+		pr_err("can only link ana_groups if no subsystems are specified!\n");
+		return -EAGAIN;
+	}
+	ag = to_nvmet_ag(target);
+	link = kmalloc(sizeof(*link), GFP_KERNEL);
+	if (!link)
+		return -ENOMEM;
+	link->ag = ag;
+
+	down_write(&nvmet_config_sem);
+	ret = -EEXIST;
+	list_for_each_entry(p, &port->ags, entry) {
+		if (p->ag == ag)
+			goto out_free_link;
+	}
+	if (list_empty(&port->ags)) {
+		ret = nvmet_enable_port(port);
+		if (ret)
+			goto out_free_link;
+	}
+	list_add_tail(&link->entry, &port->ags);
+	list_add_tail(&link->ag_list, &ag->port_link);
+	link->port = port;
+	nvmet_genctr++;
+	/* NVMe-oF requires us to set ana_tt != 0 for ANA support */
+	if (ag->subsys->ana_tt == 0)
+		ag->subsys->ana_tt = 1;
+	up_write(&nvmet_config_sem);
+	return 0;
+
+out_free_link:
+	up_write(&nvmet_config_sem);
+	kfree(link);
+	return ret;
+}
+
+static void nvmet_port_ags_drop_link(struct config_item *parent,
+		struct config_item *target)
+{
+	struct nvmet_port *port = to_nvmet_port(parent->ci_parent);
+	struct nvmet_ag *ag = to_nvmet_ag(target);
+	struct nvmet_subsys *subsys = ag->subsys;
+	struct nvmet_ag_link *p;
+	int ana_active = 0;
+
+	down_write(&nvmet_config_sem);
+	list_for_each_entry(p, &port->ags, entry) {
+		if (p->ag == ag)
+			goto found;
+	}
+	up_write(&nvmet_config_sem);
+	return;
+
+found:
+	list_del(&p->entry);
+	list_del(&p->ag_list);
+	nvmet_genctr++;
+	if (list_empty(&port->ags))
+		nvmet_disable_port(port);
+	list_for_each_entry(ag, &subsys->ana_groups, entry) {
+		if (!list_empty(&ag->port_link))
+			ana_active++;
+	}
+	if (!ana_active)
+		subsys->ana_tt = 0;
+	up_write(&nvmet_config_sem);
+	kfree(p);
+}
+
+static struct configfs_item_operations nvmet_port_ags_item_ops = {
+	.allow_link		= nvmet_port_ags_allow_link,
+	.drop_link		= nvmet_port_ags_drop_link,
+};
+
+static const struct config_item_type nvmet_port_ag_type = {
+	.ct_item_ops		= &nvmet_port_ags_item_ops,
+	.ct_owner		= THIS_MODULE,
+};
+
+static ssize_t nvmet_ag_state_show(struct config_item *item, char *page)
+{
+	struct nvmet_ag *ag = to_nvmet_ag(item);
+
+	switch (ag->state) {
+	case NVME_ANA_STATE_OPTIMIZED:
+		return sprintf(page, "optimized\n");
+	case NVME_ANA_STATE_NONOPTIMIZED:
+		return sprintf(page, "nonoptimized\n");
+	case NVME_ANA_STATE_INACCESSIBLE:
+		return sprintf(page, "inaccessible\n");
+	case NVME_ANA_STATE_PERSISTENT_LOSS:
+		return sprintf(page, "persistent-loss\n");
+	case NVME_ANA_STATE_CHANGE_STATE:
+		return sprintf(page, "change-state\n");
+	default:
+		return sprintf(page, "<invalid>\n");
+	}
+}
+
+static ssize_t nvmet_ag_state_store(struct config_item *item,
+		const char *page, size_t count)
+{
+	struct nvmet_ag *ag = to_nvmet_ag(item);
+	u8 state;
+
+	if (sysfs_streq(page, "optimized")) {
+		state = NVME_ANA_STATE_OPTIMIZED;
+	} else if (sysfs_streq(page, "nonoptimized")) {
+		state = NVME_ANA_STATE_NONOPTIMIZED;
+	} else if (sysfs_streq(page, "inaccessible")) {
+		state = NVME_ANA_STATE_INACCESSIBLE;
+	} else if (sysfs_streq(page, "persistent-loss")) {
+		state = NVME_ANA_STATE_PERSISTENT_LOSS;
+	} else {
+		pr_err("Invalid state '%s' for state\n", page);
+		return -EINVAL;
+	}
+	down_write(&nvmet_config_sem);
+	ag->state = NVME_ANA_STATE_CHANGE_STATE;
+	ag->pending_state = state;
+	ag->subsys->change_count++;
+	ag->change_count++;
+	up_write(&nvmet_config_sem);
+	/*
+	 * Reduce the delay timeout by one second to guarantee that
+	 * the transition is finished by ana_tt.
+	 */
+	if (ag->subsys->ana_tt > 1)
+		schedule_delayed_work(&ag->state_change_work,
+				      (ag->subsys->ana_tt - 1) * HZ);
+
+	return count;
+}
+
+CONFIGFS_ATTR(nvmet_ag_, state);
+
+static struct configfs_attribute *nvmet_ag_attrs[] = {
+	&nvmet_ag_attr_state,
+	NULL,
+};
+
+static void nvmet_ag_release(struct config_item *item)
+{
+	struct nvmet_ag *ag = to_nvmet_ag(item);
+
+	down_write(&nvmet_config_sem);
+	list_del_init(&ag->entry);
+	up_write(&nvmet_config_sem);
+	nvmet_ag_free(ag);
+}
+
+static struct configfs_item_operations nvmet_ag_item_ops = {
+	.release		= nvmet_ag_release,
+};
+
+static const struct config_item_type nvmet_ag_type = {
+	.ct_item_ops		= &nvmet_ag_item_ops,
+	.ct_attrs		= nvmet_ag_attrs,
+	.ct_owner		= THIS_MODULE,
+};
+
+static struct config_group *nvmet_ag_make(struct config_group *group,
+		const char *name)
+{
+	struct nvmet_subsys *subsys = ag_to_subsys(&group->cg_item);
+	struct nvmet_ag *ag, *a;
+	int ret;
+	u32 agid;
+
+	ret = kstrtou32(name, 0, &agid);
+	if (ret)
+		goto out;
+
+	ret = -EINVAL;
+	if (agid == 0)
+		goto out;
+	ret = -ENOMEM;
+	ag = nvmet_ag_alloc(subsys, agid);
+	if (!ag)
+		goto out;
+	ret = -EEXIST;
+	down_write(&nvmet_config_sem);
+	list_for_each_entry_rcu(a, &subsys->ana_groups, entry) {
+		if (a->grpid == agid) {
+			nvmet_ag_free(ag);
+			goto out;
+		}
+	}
+	list_add_tail(&ag->entry, &subsys->ana_groups);
+	up_write(&nvmet_config_sem);
+	config_group_init_type_name(&ag->group, name, &nvmet_ag_type);
+
+	pr_info("adding ana groupid %d to subsystem %s\n",
+		agid, subsys->subsysnqn);
+	return &ag->group;
+out:
+	return ERR_PTR(ret);
+}
+
+static struct configfs_group_operations nvmet_ana_group_group_ops = {
+	.make_group		= nvmet_ag_make,
+};
+
+static const struct config_item_type nvmet_ana_group_group_type = {
+	.ct_group_ops		= &nvmet_ana_group_group_ops,
+	.ct_owner		= THIS_MODULE,
+};
+
 static int nvmet_allowed_hosts_allow_link(struct config_item *parent,
 		struct config_item *target)
 {
@@ -761,11 +987,46 @@ static ssize_t nvmet_subsys_attr_oui_show(struct config_item *item,
 
 CONFIGFS_ATTR_RO(nvmet_subsys_, attr_oui);
 
+static ssize_t nvmet_subsys_attr_anatt_show(struct config_item *item,
+					    char *page)
+{
+	struct nvmet_subsys *subsys = to_subsys(item);
+
+	if (list_empty(&subsys->ana_groups))
+		return snprintf(page, PAGE_SIZE, "0\n");
+	return snprintf(page, PAGE_SIZE, "%d\n", subsys->ana_tt);
+}
+
+static ssize_t nvmet_subsys_attr_anatt_store(struct config_item *item,
+					     const char *page, size_t count)
+{
+	struct nvmet_subsys *subsys = to_subsys(item);
+	int ret;
+	u8 ana_tt;
+
+	down_write(&nvmet_config_sem);
+	if (list_empty(&subsys->ana_groups)) {
+		up_write(&nvmet_config_sem);
+		return -EAGAIN;
+	}
+	ret = kstrtou8(page, 0, &ana_tt);
+	if (ret || ana_tt == 0) {
+		up_write(&nvmet_config_sem);
+		return -EINVAL;
+	}
+	subsys->ana_tt = ana_tt;
+	up_write(&nvmet_config_sem);
+
+	return count;
+}
+CONFIGFS_ATTR(nvmet_subsys_, attr_anatt);
+
 static struct configfs_attribute *nvmet_subsys_attrs[] = {
 	&nvmet_subsys_attr_attr_allow_any_host,
 	&nvmet_subsys_attr_attr_version,
 	&nvmet_subsys_attr_attr_serial,
 	&nvmet_subsys_attr_attr_oui,
+	&nvmet_subsys_attr_attr_anatt,
 	NULL,
 };
 
@@ -810,6 +1071,10 @@ static struct config_group *nvmet_subsys_make(struct config_group *group,
 			"namespaces", &nvmet_namespaces_type);
 	configfs_add_default_group(&subsys->namespaces_group, &subsys->group);
 
+	config_group_init_type_name(&subsys->ana_group,
+			"ana_groups", &nvmet_ana_group_group_type);
+	configfs_add_default_group(&subsys->ana_group, &subsys->group);
+
 	config_group_init_type_name(&subsys->allowed_hosts_group,
 			"allowed_hosts", &nvmet_allowed_hosts_type);
 	configfs_add_default_group(&subsys->allowed_hosts_group,
@@ -957,6 +1222,7 @@ static struct config_group *nvmet_ports_make(struct config_group *group,
 	INIT_LIST_HEAD(&port->entry);
 	INIT_LIST_HEAD(&port->subsystems);
 	INIT_LIST_HEAD(&port->referrals);
+	INIT_LIST_HEAD(&port->ags);
 
 	port->disc_addr.portid = cpu_to_le16(portid);
 	config_group_init_type_name(&port->group, name, &nvmet_port_type);
@@ -965,6 +1231,10 @@ static struct config_group *nvmet_ports_make(struct config_group *group,
 			"subsystems", &nvmet_port_subsys_type);
 	configfs_add_default_group(&port->subsys_group, &port->group);
 
+	config_group_init_type_name(&port->ana_group,
+			"ana_groups", &nvmet_port_ag_type);
+	configfs_add_default_group(&port->ana_group, &port->group);
+
 	config_group_init_type_name(&port->referrals_group,
 			"referrals", &nvmet_referrals_type);
 	configfs_add_default_group(&port->referrals_group, &port->group);
diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c
index e95424f172fd..4d17da2d76bb 100644
--- a/drivers/nvme/target/core.c
+++ b/drivers/nvme/target/core.c
@@ -137,6 +137,22 @@ static void nvmet_add_async_event(struct nvmet_ctrl *ctrl, u8 event_type,
 	schedule_work(&ctrl->async_event_work);
 }
 
+static void nvmet_ana_state_change_work(struct work_struct *work)
+{
+	struct nvmet_ag *ag = container_of(work, struct nvmet_ag,
+					   state_change_work.work);
+	struct nvmet_ctrl *ctrl;
+
+	ag->state = ag->pending_state;
+	down_read(&nvmet_config_sem);
+	list_for_each_entry(ctrl, &ag->subsys->ctrls, subsys_entry) {
+		if (ctrl->ag == ag)
+			nvmet_add_async_event(ctrl, NVME_AER_TYPE_NOTICE,
+					      0x03, 0);
+	}
+	up_read(&nvmet_config_sem);
+}
+
 int nvmet_register_transport(const struct nvmet_fabrics_ops *ops)
 {
 	int ret = 0;
@@ -233,6 +249,30 @@ static void nvmet_stop_keep_alive_timer(struct nvmet_ctrl *ctrl)
 	cancel_delayed_work_sync(&ctrl->ka_work);
 }
 
+void nvmet_ag_free(struct nvmet_ag *ag)
+{
+	kfree(ag);
+}
+
+struct nvmet_ag *nvmet_ag_alloc(struct nvmet_subsys *subsys, u32 agid)
+{
+	struct nvmet_ag *ag;
+
+	ag = kzalloc(sizeof(*ag), GFP_KERNEL);
+	if (!ag)
+		return NULL;
+
+	INIT_LIST_HEAD(&ag->entry);
+	INIT_LIST_HEAD(&ag->port_link);
+	INIT_DELAYED_WORK(&ag->state_change_work, nvmet_ana_state_change_work);
+
+	ag->grpid = agid;
+	ag->subsys = subsys;
+	ag->state = ag->pending_state = NVME_ANA_STATE_OPTIMIZED;
+
+	return ag;
+}
+
 static struct nvmet_ns *__nvmet_find_namespace(struct nvmet_ctrl *ctrl,
 		__le32 nsid)
 {
@@ -744,10 +784,18 @@ static bool nvmet_host_discovery_allowed(struct nvmet_req *req,
 		const char *hostnqn)
 {
 	struct nvmet_subsys_link *s;
+	struct nvmet_ag_link *a;
 
-	list_for_each_entry(s, &req->port->subsystems, entry) {
-		if (__nvmet_host_allowed(s->subsys, hostnqn))
-			return true;
+	if (!list_empty(&req->port->ags)) {
+		list_for_each_entry(a, &req->port->ags, entry) {
+			if (__nvmet_host_allowed(a->ag->subsys, hostnqn))
+				return true;
+		}
+	} else {
+		list_for_each_entry(s, &req->port->subsystems, entry) {
+			if (__nvmet_host_allowed(s->subsys, hostnqn))
+				return true;
+		}
 	}
 
 	return false;
@@ -769,6 +817,8 @@ u16 nvmet_alloc_ctrl(const char *subsysnqn, const char *hostnqn,
 {
 	struct nvmet_subsys *subsys;
 	struct nvmet_ctrl *ctrl;
+	struct nvmet_ag_link *l;
+	struct nvmet_ag *ag = NULL;
 	int ret;
 	u16 status;
 
@@ -783,6 +833,12 @@ u16 nvmet_alloc_ctrl(const char *subsysnqn, const char *hostnqn,
 
 	status = NVME_SC_CONNECT_INVALID_PARAM | NVME_SC_DNR;
 	down_read(&nvmet_config_sem);
+	list_for_each_entry(l, &req->port->ags, entry) {
+		if (l->ag->subsys == subsys) {
+			ag = l->ag;
+			break;
+		}
+	}
 	if (!nvmet_host_allowed(req, subsys, hostnqn)) {
 		pr_info("connect by host %s for subsystem %s not allowed\n",
 			hostnqn, subsysnqn);
@@ -798,7 +854,7 @@ u16 nvmet_alloc_ctrl(const char *subsysnqn, const char *hostnqn,
 	if (!ctrl)
 		goto out_put_subsystem;
 	mutex_init(&ctrl->lock);
-
+	ctrl->ag = ag;
 	nvmet_init_cap(ctrl);
 
 	INIT_WORK(&ctrl->async_event_work, nvmet_async_event_work);
@@ -891,6 +947,8 @@ static void nvmet_ctrl_free(struct kref *ref)
 
 	nvmet_stop_keep_alive_timer(ctrl);
 
+	if (ctrl->ag)
+		cancel_delayed_work(&ctrl->ag->state_change_work);
 	flush_work(&ctrl->async_event_work);
 	cancel_work_sync(&ctrl->fatal_err_work);
 
@@ -933,6 +991,7 @@ static struct nvmet_subsys *nvmet_find_get_subsys(struct nvmet_port *port,
 		const char *subsysnqn)
 {
 	struct nvmet_subsys_link *p;
+	struct nvmet_ag_link *a;
 
 	if (!port)
 		return NULL;
@@ -945,6 +1004,15 @@ static struct nvmet_subsys *nvmet_find_get_subsys(struct nvmet_port *port,
 	}
 
 	down_read(&nvmet_config_sem);
+	list_for_each_entry(a, &port->ags, entry) {
+		if (!strncmp(a->ag->subsys->subsysnqn, subsysnqn,
+			     NVMF_NQN_SIZE)) {
+			if (!kref_get_unless_zero(&a->ag->subsys->ref))
+				break;
+			up_read(&nvmet_config_sem);
+			return a->ag->subsys;
+		}
+	}
 	list_for_each_entry(p, &port->subsystems, entry) {
 		if (!strncmp(p->subsys->subsysnqn, subsysnqn,
 				NVMF_NQN_SIZE)) {
@@ -997,6 +1065,8 @@ struct nvmet_subsys *nvmet_subsys_alloc(const char *subsysnqn,
 	INIT_LIST_HEAD(&subsys->namespaces);
 	INIT_LIST_HEAD(&subsys->ctrls);
 	INIT_LIST_HEAD(&subsys->hosts);
+	INIT_LIST_HEAD(&subsys->ana_groups);
+	subsys->ana_tt = NVMET_DEFAULT_ANA_TT;
 
 	return subsys;
 }
diff --git a/drivers/nvme/target/discovery.c b/drivers/nvme/target/discovery.c
index 231e04e0a496..7caa46691702 100644
--- a/drivers/nvme/target/discovery.c
+++ b/drivers/nvme/target/discovery.c
@@ -90,6 +90,7 @@ static void nvmet_execute_get_disc_log_page(struct nvmet_req *req)
 	size_t alloc_len = max(data_len, sizeof(*hdr));
 	int residual_len = data_len - sizeof(*hdr);
 	struct nvmet_subsys_link *p;
+	struct nvmet_ag_link *a;
 	struct nvmet_port *r;
 	u32 numrec = 0;
 	u16 status = 0;
@@ -106,6 +107,21 @@ static void nvmet_execute_get_disc_log_page(struct nvmet_req *req)
 	}
 
 	down_read(&nvmet_config_sem);
+	list_for_each_entry(a, &req->port->ags, entry) {
+		if (!nvmet_host_allowed(req, a->ag->subsys, ctrl->hostnqn))
+			continue;
+		if (residual_len >= entry_size) {
+			char traddr[NVMF_TRADDR_SIZE];
+
+			nvmet_set_disc_traddr(req, req->port, traddr);
+			nvmet_format_discovery_entry(hdr, req->port,
+					a->ag->subsys->subsysnqn, traddr,
+					NVME_NQN_NVME, numrec);
+			residual_len -= entry_size;
+		}
+		numrec++;
+	}
+
 	list_for_each_entry(p, &req->port->subsystems, entry) {
 		if (!nvmet_host_allowed(req, p->subsys, ctrl->hostnqn))
 			continue;
diff --git a/drivers/nvme/target/io-cmd.c b/drivers/nvme/target/io-cmd.c
index cd2344179673..c25d459bb787 100644
--- a/drivers/nvme/target/io-cmd.c
+++ b/drivers/nvme/target/io-cmd.c
@@ -204,6 +204,16 @@ u16 nvmet_parse_io_cmd(struct nvmet_req *req)
 	if (unlikely(!req->ns))
 		return NVME_SC_INVALID_NS | NVME_SC_DNR;
 
+	if (req->sq->ctrl->ag) {
+		struct nvmet_ag *ag = req->sq->ctrl->ag;
+
+		if (work_pending(&ag->state_change_work.work))
+			return NVME_SC_ANA_TRANSITION | NVME_SC_DNR;
+		if (ag->state == NVME_ANA_STATE_INACCESSIBLE)
+			return NVME_SC_ANA_INACCESSIBLE | NVME_SC_DNR;
+		if (ag->state == NVME_ANA_STATE_PERSISTENT_LOSS)
+			return NVME_SC_ANA_PERSISTENT_LOSS | NVME_SC_DNR;
+	}
 	switch (cmd->common.opcode) {
 	case nvme_cmd_read:
 	case nvme_cmd_write:
diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h
index 8cd42ed37314..6d947cab9bf7 100644
--- a/drivers/nvme/target/nvmet.h
+++ b/drivers/nvme/target/nvmet.h
@@ -29,6 +29,7 @@
 
 #define NVMET_ASYNC_EVENTS		4
 #define NVMET_ERROR_LOG_SLOTS		128
+#define NVMET_DEFAULT_ANA_TT		2
 
 /* Helper Macros when NVMe error is NVME_SC_CONNECT_INVALID_PARAM
  * The 16 bit shift is to set IATTR bit to 1, which means offending
@@ -39,6 +40,23 @@
 #define IPO_IATTR_CONNECT_SQE(x)	\
 	(cpu_to_le32(offsetof(struct nvmf_connect_command, x)))
 
+struct nvmet_ag {
+	struct list_head	entry;
+	struct list_head	port_link;
+	struct config_group	group;
+	struct nvmet_subsys	*subsys;
+	struct delayed_work	state_change_work;
+	u64			change_count;
+	u32			grpid;
+	u8			state;
+	u8			pending_state;
+};
+
+static inline struct nvmet_ag *to_nvmet_ag(struct config_item *item)
+{
+	return container_of(to_config_group(item), struct nvmet_ag, group);
+}
+
 struct nvmet_ns {
 	struct list_head	dev_link;
 	struct percpu_ref	ref;
@@ -97,6 +115,8 @@ struct nvmet_port {
 	struct list_head		subsystems;
 	struct config_group		referrals_group;
 	struct list_head		referrals;
+	struct config_group		ana_group;
+	struct list_head		ags;
 	void				*priv;
 	bool				enabled;
 };
@@ -111,6 +131,7 @@ struct nvmet_ctrl {
 	struct nvmet_subsys	*subsys;
 	struct nvmet_cq		**cqs;
 	struct nvmet_sq		**sqs;
+	struct nvmet_ag		*ag;
 
 	struct mutex		lock;
 	u64			cap;
@@ -146,6 +167,11 @@ struct nvmet_subsys {
 	struct list_head	namespaces;
 	unsigned int		max_nsid;
 
+	struct list_head	ana_groups;
+	u32			max_grpid;
+	u8			ana_tt;
+	u64			change_count;
+
 	struct list_head	ctrls;
 
 	struct list_head	hosts;
@@ -161,6 +187,7 @@ struct nvmet_subsys {
 	struct config_group	group;
 
 	struct config_group	namespaces_group;
+	struct config_group	ana_group;
 	struct config_group	allowed_hosts_group;
 };
 
@@ -176,6 +203,13 @@ static inline struct nvmet_subsys *namespaces_to_subsys(
 			namespaces_group);
 }
 
+static inline struct nvmet_subsys *ag_to_subsys(
+		struct config_item *item)
+{
+	return container_of(to_config_group(item), struct nvmet_subsys,
+			ana_group);
+}
+
 struct nvmet_host {
 	struct config_group	group;
 };
@@ -200,6 +234,13 @@ struct nvmet_subsys_link {
 	struct nvmet_subsys	*subsys;
 };
 
+struct nvmet_ag_link {
+	struct list_head	entry;
+	struct list_head	ag_list;
+	struct nvmet_ag		*ag;
+	struct nvmet_port	*port;
+};
+
 struct nvmet_req;
 struct nvmet_fabrics_ops {
 	struct module *owner;
@@ -293,6 +334,9 @@ u16 nvmet_ctrl_find_get(const char *subsysnqn, const char *hostnqn, u16 cntlid,
 void nvmet_ctrl_put(struct nvmet_ctrl *ctrl);
 u16 nvmet_check_ctrl_status(struct nvmet_req *req, struct nvme_command *cmd);
 
+void nvmet_ag_free(struct nvmet_ag *ag);
+struct nvmet_ag *nvmet_ag_alloc(struct nvmet_subsys *subsys, u32 agid);
+
 struct nvmet_subsys *nvmet_subsys_alloc(const char *subsysnqn,
 		enum nvme_subsys_type type);
 void nvmet_subsys_put(struct nvmet_subsys *subsys);
-- 
2.12.3

^ permalink raw reply related	[flat|nested] 33+ messages in thread

* [PATCH 4/5] block: BLK_STS_NEXUS is a path failure
  2018-05-04 11:28 [RFC PATCH 0/5] nvme: ANA support Hannes Reinecke
                   ` (2 preceding siblings ...)
  2018-05-04 11:28 ` [PATCH 3/5] nvmet: Add ANA base support Hannes Reinecke
@ 2018-05-04 11:28 ` Hannes Reinecke
  2018-05-09  7:31   ` Christoph Hellwig
  2018-05-04 11:28 ` [PATCH 5/5] nvme: ANA base support Hannes Reinecke
  4 siblings, 1 reply; 33+ messages in thread
From: Hannes Reinecke @ 2018-05-04 11:28 UTC (permalink / raw)


A block error of BLK_STS_NEXUS is equivalent to a path failure,
so blk_path_error() should reflect this.

Signed-off-by: Hannes Reinecke <hare at suse.com>
---
 include/linux/blk_types.h | 1 -
 1 file changed, 1 deletion(-)

diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 17b18b91ebac..3c6aeba732f1 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -80,7 +80,6 @@ static inline bool blk_path_error(blk_status_t error)
 	case BLK_STS_NOTSUPP:
 	case BLK_STS_NOSPC:
 	case BLK_STS_TARGET:
-	case BLK_STS_NEXUS:
 	case BLK_STS_MEDIUM:
 	case BLK_STS_PROTECTION:
 		return false;
-- 
2.12.3

^ permalink raw reply related	[flat|nested] 33+ messages in thread

* [PATCH 5/5] nvme: ANA base support
  2018-05-04 11:28 [RFC PATCH 0/5] nvme: ANA support Hannes Reinecke
                   ` (3 preceding siblings ...)
  2018-05-04 11:28 ` [PATCH 4/5] block: BLK_STS_NEXUS is a path failure Hannes Reinecke
@ 2018-05-04 11:28 ` Hannes Reinecke
  2018-05-04 22:11   ` Schremmer, Steven
                     ` (5 more replies)
  4 siblings, 6 replies; 33+ messages in thread
From: Hannes Reinecke @ 2018-05-04 11:28 UTC (permalink / raw)


Add ANA support to the nvme host. If ANA is supported the state
and the group id are displayed in new sysfs attributes 'ana_state' and
'ana_group'.

Signed-off-by: Hannes Reinecke <hare at suse.com>
---
 drivers/nvme/host/core.c      | 123 +++++++++++++++++++++++++++++++++++++++++-
 drivers/nvme/host/multipath.c |  12 ++++-
 drivers/nvme/host/nvme.h      |   3 ++
 3 files changed, 136 insertions(+), 2 deletions(-)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 62262fac7a5d..14dff8e96899 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -99,6 +99,7 @@ static struct class *nvme_subsys_class;
 
 static void nvme_ns_remove(struct nvme_ns *ns);
 static int nvme_revalidate_disk(struct gendisk *disk);
+static void nvme_get_ana_log(struct nvme_ctrl *ctrl, struct nvme_ns *ns);
 
 int nvme_reset_ctrl(struct nvme_ctrl *ctrl)
 {
@@ -1488,6 +1489,9 @@ static int nvme_revalidate_disk(struct gendisk *disk)
 		goto out;
 	}
 
+	if (ctrl->subsys->cmic & (1 << 3))
+		nvme_get_ana_log(ctrl, ns);
+
 	__nvme_revalidate_disk(disk, id);
 	nvme_report_ns_ids(ctrl, ns->head->ns_id, id, &ids);
 	if (!nvme_ns_ids_equal(&ns->head->ids, &ids)) {
@@ -2276,6 +2280,61 @@ static int nvme_get_effects_log(struct nvme_ctrl *ctrl)
 	return ret;
 }
 
+static void nvme_get_ana_log(struct nvme_ctrl *ctrl, struct nvme_ns *ns)
+{
+	int i, j;
+	struct nvmf_ana_rsp_page_header *ana_log;
+	size_t ana_log_size = 4096;
+
+	ana_log = kzalloc(ana_log_size, GFP_KERNEL);
+	if (!ana_log)
+		return;
+
+	if (nvme_get_log(ctrl, NVME_LOG_ANA, ana_log, ana_log_size))
+		dev_warn(ctrl->device,
+			 "Get ANA log error\n");
+	for (i = 0; i < ana_log->grpid_num; i++) {
+		struct nvmf_ana_group_descriptor *desc =
+			&ana_log->desc[i];
+		for (j = 0; j < desc->nsid_num; j++) {
+			if (desc->nsid[j] == ns->head->ns_id) {
+				ns->ana_state = desc->ana_state;
+				ns->ana_group = desc->groupid;
+			}
+		}
+	}
+	kfree(ana_log);
+}
+
+static void nvme_get_full_ana_log(struct nvme_ctrl *ctrl)
+{
+	int i, j;
+	struct nvme_ns *ns;
+	struct nvmf_ana_rsp_page_header *ana_log;
+	size_t ana_log_size = 4096;
+
+	ana_log = kzalloc(ana_log_size, GFP_KERNEL);
+	if (!ana_log)
+		return;
+
+	if (nvme_get_log(ctrl, NVME_LOG_ANA, ana_log, ana_log_size))
+		dev_warn(ctrl->device,
+			 "Get ANA log error\n");
+	list_for_each_entry(ns, &ctrl->namespaces, list) {
+		for (i = 0; i < ana_log->grpid_num; i++) {
+			struct nvmf_ana_group_descriptor *desc =
+				&ana_log->desc[i];
+			for (j = 0; j < desc->nsid_num; j++) {
+				if (desc->nsid[j] == ns->head->ns_id) {
+					ns->ana_state = desc->ana_state;
+					ns->ana_group = desc->groupid;
+				}
+			}
+		}
+	}
+	kfree(ana_log);
+}
+
 /*
  * Initialize the cached copies of the Identify data and various controller
  * register in our nvme_ctrl structure.  This should be called as soon as
@@ -2631,12 +2690,45 @@ static ssize_t nsid_show(struct device *dev, struct device_attribute *attr,
 }
 static DEVICE_ATTR_RO(nsid);
 
+static ssize_t ana_state_show(struct device *dev, struct device_attribute *attr,
+		char *buf)
+{
+	struct nvme_ns *ns = nvme_get_ns_from_dev(dev);
+
+	switch (ns->ana_state & 0x0f) {
+	case NVME_ANA_STATE_OPTIMIZED:
+		return sprintf(buf, "optimized\n");
+	case NVME_ANA_STATE_NONOPTIMIZED:
+		return sprintf(buf, "non-optimized\n");
+	case NVME_ANA_STATE_INACCESSIBLE:
+		return sprintf(buf, "inaccessible\n");
+	case NVME_ANA_STATE_PERSISTENT_LOSS:
+		return sprintf(buf, "persistent-loss\n");
+	case NVME_ANA_STATE_CHANGE_STATE:
+		return sprintf(buf, "change-state\n");
+	default:
+		return sprintf(buf, "<reserved>\n");
+	}
+}
+static DEVICE_ATTR_RO(ana_state);
+
+static ssize_t ana_group_show(struct device *dev, struct device_attribute *attr,
+		char *buf)
+{
+	struct nvme_ns *ns = dev_to_disk(dev)->private_data;
+	return sprintf(buf, "%d\n", ns->ana_group);
+
+}
+static DEVICE_ATTR_RO(ana_group);
+
 static struct attribute *nvme_ns_id_attrs[] = {
 	&dev_attr_wwid.attr,
 	&dev_attr_uuid.attr,
 	&dev_attr_nguid.attr,
 	&dev_attr_eui.attr,
 	&dev_attr_nsid.attr,
+	&dev_attr_ana_state.attr,
+	&dev_attr_ana_group.attr,
 	NULL,
 };
 
@@ -2645,6 +2737,7 @@ static umode_t nvme_ns_id_attrs_are_visible(struct kobject *kobj,
 {
 	struct device *dev = container_of(kobj, struct device, kobj);
 	struct nvme_ns_ids *ids = &dev_to_ns_head(dev)->ids;
+	struct nvme_ns *ns = nvme_get_ns_from_dev(dev);
 
 	if (a == &dev_attr_uuid.attr) {
 		if (uuid_is_null(&ids->uuid) &&
@@ -2659,6 +2752,12 @@ static umode_t nvme_ns_id_attrs_are_visible(struct kobject *kobj,
 		if (!memchr_inv(ids->eui64, 0, sizeof(ids->eui64)))
 			return 0;
 	}
+	if ((a == &dev_attr_ana_state.attr) ||
+	    (a == &dev_attr_ana_group.attr)) {
+		if (!ns->ctrl || !ns->ctrl->subsys ||
+		    !(ns->ctrl->subsys->cmic & (1 << 3)))
+			return 0;
+	}
 	return a->mode;
 }
 
@@ -2984,6 +3083,7 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)
 	blk_queue_flag_set(QUEUE_FLAG_NONROT, ns->queue);
 	ns->queue->queuedata = ns;
 	ns->ctrl = ctrl;
+	ns->ana_state = NVME_ANA_STATE_OPTIMIZED;
 
 	kref_init(&ns->kref);
 	ns->lba_shift = 9; /* set to a default value for 512 until disk is validated */
@@ -3001,7 +3101,9 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)
 	if (nvme_init_ns_head(ns, nsid, id))
 		goto out_free_id;
 	nvme_setup_streams_ns(ctrl, ns);
-	
+
+	if (ctrl->subsys->cmic & (1 << 3))
+		nvme_get_ana_log(ctrl, ns);
 #ifdef CONFIG_NVME_MULTIPATH
 	/*
 	 * If multipathing is enabled we need to always use the subsystem
@@ -3343,6 +3445,19 @@ static void nvme_fw_act_work(struct work_struct *work)
 	nvme_get_fw_slot_info(ctrl);
 }
 
+static void nvme_ana_change_work(struct work_struct *work)
+{
+	struct nvme_ctrl *ctrl = container_of(work,
+				struct nvme_ctrl, ana_change_work);
+
+	if (ctrl->state != NVME_CTRL_LIVE)
+		return;
+
+	down_read(&ctrl->namespaces_rwsem);
+	nvme_get_full_ana_log(ctrl);
+	up_read(&ctrl->namespaces_rwsem);
+}
+
 void nvme_complete_async_event(struct nvme_ctrl *ctrl, __le16 status,
 		union nvme_result *res)
 {
@@ -3370,6 +3485,10 @@ void nvme_complete_async_event(struct nvme_ctrl *ctrl, __le16 status,
 	case NVME_AER_NOTICE_FW_ACT_STARTING:
 		queue_work(nvme_wq, &ctrl->fw_act_work);
 		break;
+	case NVME_AER_NOTICE_ANA_CHANGE:
+		dev_info(ctrl->device, "ANA state change\n");
+		queue_work(nvme_wq, &ctrl->ana_change_work);
+		break;
 	default:
 		dev_warn(ctrl->device, "async event result %08x\n", result);
 	}
@@ -3383,6 +3502,7 @@ void nvme_stop_ctrl(struct nvme_ctrl *ctrl)
 	flush_work(&ctrl->async_event_work);
 	flush_work(&ctrl->scan_work);
 	cancel_work_sync(&ctrl->fw_act_work);
+	cancel_work_sync(&ctrl->ana_change_work);
 	if (ctrl->ops->stop_ctrl)
 		ctrl->ops->stop_ctrl(ctrl);
 }
@@ -3449,6 +3569,7 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev,
 	INIT_WORK(&ctrl->scan_work, nvme_scan_work);
 	INIT_WORK(&ctrl->async_event_work, nvme_async_event_work);
 	INIT_WORK(&ctrl->fw_act_work, nvme_fw_act_work);
+	INIT_WORK(&ctrl->ana_change_work, nvme_ana_change_work);
 	INIT_WORK(&ctrl->delete_work, nvme_delete_ctrl_work);
 
 	ret = ida_simple_get(&nvme_instance_ida, 0, 0, GFP_KERNEL);
diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c
index 956e0b8e9c4d..077b56f1739a 100644
--- a/drivers/nvme/host/multipath.c
+++ b/drivers/nvme/host/multipath.c
@@ -56,8 +56,18 @@ static struct nvme_ns *__nvme_find_path(struct nvme_ns_head *head)
 {
 	struct nvme_ns *ns;
 
+	/* First round: select from all ANA optimized paths */
 	list_for_each_entry_rcu(ns, &head->list, siblings) {
-		if (ns->ctrl->state == NVME_CTRL_LIVE) {
+		if (ns->ctrl->state == NVME_CTRL_LIVE &&
+		    ns->ana_state == NVME_ANA_STATE_OPTIMIZED) {
+			rcu_assign_pointer(head->current_path, ns);
+			return ns;
+		}
+	}
+	/* Second round: select from all ANA non-optimized paths */
+	list_for_each_entry_rcu(ns, &head->list, siblings) {
+		if (ns->ctrl->state == NVME_CTRL_LIVE &&
+		    ns->ana_state == NVME_ANA_STATE_NONOPTIMIZED) {
 			rcu_assign_pointer(head->current_path, ns);
 			return ns;
 		}
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index 061fecfd44f5..75182a5cc166 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -187,6 +187,7 @@ struct nvme_ctrl {
 	struct delayed_work ka_work;
 	struct nvme_command ka_cmd;
 	struct work_struct fw_act_work;
+	struct work_struct ana_change_work;
 
 	/* Power saving configuration */
 	u64 ps_max_latency_us;
@@ -289,6 +290,8 @@ struct nvme_ns {
 	u32 sws;
 	bool ext;
 	u8 pi_type;
+	u8 ana_state;
+	u32 ana_group;
 	unsigned long flags;
 #define NVME_NS_REMOVING 0
 #define NVME_NS_DEAD     1
-- 
2.12.3

^ permalink raw reply related	[flat|nested] 33+ messages in thread

* [PATCH 2/5] nvme: Add ANA base definitions
  2018-05-04 11:28 ` [PATCH 2/5] nvme: Add ANA base definitions Hannes Reinecke
@ 2018-05-04 16:17   ` Keith Busch
  2018-05-04 17:03     ` Meneghini, John
  2018-05-04 21:12   ` Schremmer, Steven
  2018-05-09  7:27   ` Christoph Hellwig
  2 siblings, 1 reply; 33+ messages in thread
From: Keith Busch @ 2018-05-04 16:17 UTC (permalink / raw)


On Fri, May 04, 2018@01:28:42PM +0200, Hannes Reinecke wrote:
> @@ -312,7 +317,8 @@ struct nvme_id_ns {
>  	__le16			nabspf;
>  	__le16			noiob;
>  	__u8			nvmcap[16];
> -	__u8			rsvd64[40];
> +	__u8			rsvd64[36];
> +	__le32			anagrpid;
>  	__u8			nguid[16];
>  	__u8			eui64[8];
>  	struct nvme_lbaf	lbaf[16];

Is this the right place for the new field? Your patch has the group ID
at byte 100. TP4004 has the ANA Group Identifier starting at byte 92,
followed by 8 reserved bytes, then the NGUID.

^ permalink raw reply	[flat|nested] 33+ messages in thread

* [PATCH 2/5] nvme: Add ANA base definitions
  2018-05-04 16:17   ` Keith Busch
@ 2018-05-04 17:03     ` Meneghini, John
  2018-05-04 17:21       ` Knight, Frederick
  2018-05-05 13:17       ` Hannes Reinecke
  0 siblings, 2 replies; 33+ messages in thread
From: Meneghini, John @ 2018-05-04 17:03 UTC (permalink / raw)


Agreed: 

95:92 - ANAGRPID
103:96 - Reserved

Note that this was changed at the last minute.  There are some older drafts of the NVM_TP_4004 out there.  You want to be sure to use the ratified version from:

http://nvmexpress.org/resources/specifications/

NVM Express 1.3 Ratified TPs

/John

?On 5/4/18, 12:16 PM, "Keith Busch" <keith.busch@linux.intel.com> wrote:

    On Fri, May 04, 2018@01:28:42PM +0200, Hannes Reinecke wrote:
    > @@ -312,7 +317,8 @@ struct nvme_id_ns {
    >  	__le16			nabspf;
    >  	__le16			noiob;
    >  	__u8			nvmcap[16];
    > -	__u8			rsvd64[40];
    > +	__u8			rsvd64[36];
    > +	__le32			anagrpid;
    >  	__u8			nguid[16];
    >  	__u8			eui64[8];
    >  	struct nvme_lbaf	lbaf[16];
    
    Is this the right place for the new field? Your patch has the group ID
    at byte 100. TP4004 has the ANA Group Identifier starting at byte 92,
    followed by 8 reserved bytes, then the NGUID.
    

^ permalink raw reply	[flat|nested] 33+ messages in thread

* [PATCH 2/5] nvme: Add ANA base definitions
  2018-05-04 17:03     ` Meneghini, John
@ 2018-05-04 17:21       ` Knight, Frederick
  2018-05-05 13:17       ` Hannes Reinecke
  1 sibling, 0 replies; 33+ messages in thread
From: Knight, Frederick @ 2018-05-04 17:21 UTC (permalink / raw)


And here are several of the other fields around about that same area:

...
47:46 - NOIOB
63:48 - NVMCAP
91:64 - Reserved
95:92 - ANAGRPID
98:96 - Reserved
99       - NSATTR
101:100 - NVMSETID
103:102 - ENDGID
119:104 - NGUID
...

	Fred


-----Original Message-----
From: Meneghini, John 
Sent: Friday, May 04, 2018 1:04 PM
To: Keith Busch <keith.busch at linux.intel.com>; Hannes Reinecke <hare at suse.de>
Cc: Christoph Hellwig <hch at lst.de>; Jens Axboe <axboe at kernel.dk>; Hannes Reinecke <hare at suse.com>; Sagi Grimberg <sagi at grimberg.me>; linux-nvme at lists.infradead.org; Johannes Thumshirn <jth at kernel.org>; Knight, Frederick <Frederick.Knight at netapp.com>; Meneghini, John <John.Meneghini at netapp.com>
Subject: Re: [PATCH 2/5] nvme: Add ANA base definitions

Agreed: 

95:92 - ANAGRPID
103:96 - Reserved

Note that this was changed at the last minute.  There are some older drafts of the NVM_TP_4004 out there.  You want to be sure to use the ratified version from:

http://nvmexpress.org/resources/specifications/

NVM Express 1.3 Ratified TPs

/John

?On 5/4/18, 12:16 PM, "Keith Busch" <keith.busch@linux.intel.com> wrote:

    On Fri, May 04, 2018@01:28:42PM +0200, Hannes Reinecke wrote:
    > @@ -312,7 +317,8 @@ struct nvme_id_ns {
    >  	__le16			nabspf;
    >  	__le16			noiob;
    >  	__u8			nvmcap[16];
    > -	__u8			rsvd64[40];
    > +	__u8			rsvd64[36];
    > +	__le32			anagrpid;
    >  	__u8			nguid[16];
    >  	__u8			eui64[8];
    >  	struct nvme_lbaf	lbaf[16];
    
    Is this the right place for the new field? Your patch has the group ID
    at byte 100. TP4004 has the ANA Group Identifier starting at byte 92,
    followed by 8 reserved bytes, then the NGUID.
    

^ permalink raw reply	[flat|nested] 33+ messages in thread

* [PATCH 2/5] nvme: Add ANA base definitions
  2018-05-04 11:28 ` [PATCH 2/5] nvme: Add ANA base definitions Hannes Reinecke
  2018-05-04 16:17   ` Keith Busch
@ 2018-05-04 21:12   ` Schremmer, Steven
  2018-05-09  7:27   ` Christoph Hellwig
  2 siblings, 0 replies; 33+ messages in thread
From: Schremmer, Steven @ 2018-05-04 21:12 UTC (permalink / raw)


> @@ -242,7 +242,11 @@ struct nvme_id_ctrl {
>  	__le32			sanicap;
>  	__le32			hmminds;
>  	__le16			hmmaxd;
> -	__u8			rsvd338[174];
reserved bytes 338 to 341

> +	__u8			anatt;
> +	__u8			anacap;
> +	__le32			anagrpmax;
> +	__le32			nanagrpid;
> +	__u8			rsvd338[164];
rsvd352[160]

>  	__u8			sqes;
>  	__u8			cqes;
>  	__le16			maxcmd;
> @@ -258,7 +262,8 @@ struct nvme_id_ctrl {
>  	__le16			acwu;
>  	__u8			rsvd534[2];
>  	__le32			sgls;
> -	__u8			rsvd540[228];
> +	__le32			mnn;
mnan

> +	__u8			rsvd540[224];
rsvd544[224]

>  	char			subnqn[256];
>  	__u8			rsvd1024[768];
>  	__le32			ioccsz;
> @@ -312,7 +317,8 @@ struct nvme_id_ns {
>  	__le16			nabspf;
>  	__le16			noiob;
>  	__u8			nvmcap[16];
> -	__u8			rsvd64[40];
> +	__u8			rsvd64[36];
rsvd64[28]

> +	__le32			anagrpid;
rsvd96[8]

>  	__u8			nguid[16];
>  	__u8			eui64[8];
>  	struct nvme_lbaf	lbaf[16];
> @@ -440,6 +446,7 @@ enum {
>  	NVME_AER_VS			= 7,
>  	NVME_AER_NOTICE_NS_CHANGED	= 0x0002,
>  	NVME_AER_NOTICE_FW_ACT_STARTING = 0x0102,
> +	NVME_AER_NOTICE_ANA_CHANGE	= 0x0302,
>  };
> 
>  struct nvme_lba_range_type {
> @@ -748,11 +755,17 @@ enum {
>  	NVME_LOG_SMART		= 0x02,
>  	NVME_LOG_FW_SLOT	= 0x03,
>  	NVME_LOG_CMD_EFFECTS	= 0x05,
> +	NVME_LOG_ANA		= 0x0c,
>  	NVME_LOG_DISC		= 0x70,
>  	NVME_LOG_RESERVATION	= 0x80,
>  	NVME_FWACT_REPL		= (0 << 3),
>  	NVME_FWACT_REPL_ACTV	= (1 << 3),
>  	NVME_FWACT_ACTV		= (2 << 3),
> +	NVME_ANA_STATE_OPTIMIZED = 0x01,
> +	NVME_ANA_STATE_NONOPTIMIZED = 0x02,
> +	NVME_ANA_STATE_INACCESSIBLE = 0x03,
> +	NVME_ANA_STATE_PERSISTENT_LOSS = 0x04,
> +	NVME_ANA_STATE_CHANGE_STATE = 0x05,
_CHANGE_STATE should have value of 0x0F

>  };
> 
>  struct nvme_identify {
> @@ -917,6 +930,23 @@ struct nvmf_common_command {
>  	__u8	ts[24];
>  };
> 
> +/* Asymmetric Namespace Access log page entry */
> +struct nvmf_ana_group_descriptor {
> +	__le32 groupid;
> +	__le32 nsid_num;
> +	__le64 chgcnt;
> +	__u8 ana_state;
> +	__u8 resv1[7];
> +	__le32 nsid[0];
> +};
> +
> +struct nvmf_ana_rsp_page_header {
> +	__le64 chgcnt;
> +	__le16 grpid_num;
grpdesc_num?

> +	__le16 resv[3];
> +	struct nvmf_ana_group_descriptor desc[0];
> +};
> +
>  /*
>   * The legal cntlid range a NVMe Target will provide.
>   * Note that cntlid of value 0 is considered illegal in the fabrics world.
> @@ -1135,6 +1165,8 @@ enum {
>  	NVME_SC_NS_NOT_ATTACHED		= 0x11a,
>  	NVME_SC_THIN_PROV_NOT_SUPP	= 0x11b,
>  	NVME_SC_CTRL_LIST_INVALID	= 0x11c,
> +	NVME_SC_GRP_ID_INVALID		= 0x123,
> +	NVME_SC_ANA_ATTACH_FAILED	= 0x124,
I think GRP_ID_INVALID should be 0x124 and _ANA_ATTACH_FAILED is 0x125.

> 
>  	/*
>  	 * I/O Command Set Specific - NVM commands:
> @@ -1168,6 +1200,15 @@ enum {
>  	NVME_SC_ACCESS_DENIED		= 0x286,
>  	NVME_SC_UNWRITTEN_BLOCK		= 0x287,
> 
> +	/* Path-related Errors */
> +	NVME_SC_INTERNAL_PATH_ERROR	= 0x300,
> +	NVME_SC_ANA_PERSISTENT_LOSS	= 0x301,
> +	NVME_SC_ANA_INACCESSIBLE	= 0x302,
> +	NVME_SC_ANA_TRANSITION		= 0x303,
> +	NVME_SC_CTRL_PATHING_ERROR	= 0x360,
> +	NVME_SC_HOST_PATHING_ERROR	= 0x370,
> +	NVME_SC_HOST_ABORTED		= 0x371,
> +
>  	NVME_SC_DNR			= 0x4000,
>  };
> 
> --
> 2.12.3

^ permalink raw reply	[flat|nested] 33+ messages in thread

* [PATCH 5/5] nvme: ANA base support
  2018-05-04 11:28 ` [PATCH 5/5] nvme: ANA base support Hannes Reinecke
@ 2018-05-04 22:11   ` Schremmer, Steven
  2018-05-05 13:23     ` Hannes Reinecke
  2018-05-06  3:23   ` Guan Junxiong
                     ` (4 subsequent siblings)
  5 siblings, 1 reply; 33+ messages in thread
From: Schremmer, Steven @ 2018-05-04 22:11 UTC (permalink / raw)


>  int nvme_reset_ctrl(struct nvme_ctrl *ctrl)
>  {
> @@ -1488,6 +1489,9 @@ static int nvme_revalidate_disk(struct gendisk *disk)
>  		goto out;
>  	}
> 
> +	if (ctrl->subsys->cmic & (1 << 3))
> +		nvme_get_ana_log(ctrl, ns);
> +
If __nvme_revalidate_disk() would save the groupid from the Identify Namespace data,
then finding the ANA state in the log page could be done more quickly.

>  	__nvme_revalidate_disk(disk, id);
>  	nvme_report_ns_ids(ctrl, ns->head->ns_id, id, &ids);
>  	if (!nvme_ns_ids_equal(&ns->head->ids, &ids)) {
> @@ -2276,6 +2280,61 @@ static int nvme_get_effects_log(struct nvme_ctrl *ctrl)
>  	return ret;
>  }
> 
> +static void nvme_get_ana_log(struct nvme_ctrl *ctrl, struct nvme_ns *ns)
> +{
> +	int i, j;
> +	struct nvmf_ana_rsp_page_header *ana_log;
> +	size_t ana_log_size = 4096;
> +
> +	ana_log = kzalloc(ana_log_size, GFP_KERNEL);
> +	if (!ana_log)
> +		return;
> +
> +	if (nvme_get_log(ctrl, NVME_LOG_ANA, ana_log, ana_log_size))
> +		dev_warn(ctrl->device,
> +			 "Get ANA log error\n");
> +	for (i = 0; i < ana_log->grpid_num; i++) {
> +		struct nvmf_ana_group_descriptor *desc =
> +			&ana_log->desc[i];
> +		for (j = 0; j < desc->nsid_num; j++) {
> +			if (desc->nsid[j] == ns->head->ns_id) {
> +				ns->ana_state = desc->ana_state;
> +				ns->ana_group = desc->groupid;
Maybe jump out of the for loops here?

> @@ -3370,6 +3485,10 @@ void nvme_complete_async_event(struct nvme_ctrl *ctrl, __le16 status,
>  	case NVME_AER_NOTICE_FW_ACT_STARTING:
>  		queue_work(nvme_wq, &ctrl->fw_act_work);
>  		break;
> +	case NVME_AER_NOTICE_ANA_CHANGE:
> +		dev_info(ctrl->device, "ANA state change\n");
> +		queue_work(nvme_wq, &ctrl->ana_change_work);
> +		break;
I think eventually we'll need a way for the AER work to potentially force a call to
__nvme_find_path() to choose a better current_path based on the updated ANA states.

> @@ -56,8 +56,18 @@ static struct nvme_ns *__nvme_find_path(struct nvme_ns_head *head)
>  {
>  	struct nvme_ns *ns;
> 
> +	/* First round: select from all ANA optimized paths */
>  	list_for_each_entry_rcu(ns, &head->list, siblings) {
> -		if (ns->ctrl->state == NVME_CTRL_LIVE) {
> +		if (ns->ctrl->state == NVME_CTRL_LIVE &&
> +		    ns->ana_state == NVME_ANA_STATE_OPTIMIZED) {
> +			rcu_assign_pointer(head->current_path, ns);
> +			return ns;
> +		}
> +	}
> +	/* Second round: select from all ANA non-optimized paths */
> +	list_for_each_entry_rcu(ns, &head->list, siblings) {
> +		if (ns->ctrl->state == NVME_CTRL_LIVE &&
> +		    ns->ana_state == NVME_ANA_STATE_NONOPTIMIZED) {
>  			rcu_assign_pointer(head->current_path, ns);
>  			return ns;
>  		}

^ permalink raw reply	[flat|nested] 33+ messages in thread

* [PATCH 2/5] nvme: Add ANA base definitions
  2018-05-04 17:03     ` Meneghini, John
  2018-05-04 17:21       ` Knight, Frederick
@ 2018-05-05 13:17       ` Hannes Reinecke
  1 sibling, 0 replies; 33+ messages in thread
From: Hannes Reinecke @ 2018-05-05 13:17 UTC (permalink / raw)


On 05/04/2018 07:03 PM,  Meneghini, John  wrote:
> Agreed:
> 
> 95:92 - ANAGRPID
> 103:96 - Reserved
> 
> Note that this was changed at the last minute.  There are some older drafts of the NVM_TP_4004 out there.  You want to be sure to use the ratified version from:
> 
> http://nvmexpress.org/resources/specifications/
> 
> NVM Express 1.3 Ratified TPs
> 
> /John
> 
> ?On 5/4/18, 12:16 PM, "Keith Busch" <keith.busch@linux.intel.com> wrote:
> 
>      On Fri, May 04, 2018@01:28:42PM +0200, Hannes Reinecke wrote:
>      > @@ -312,7 +317,8 @@ struct nvme_id_ns {
>      >  	__le16			nabspf;
>      >  	__le16			noiob;
>      >  	__u8			nvmcap[16];
>      > -	__u8			rsvd64[40];
>      > +	__u8			rsvd64[36];
>      > +	__le32			anagrpid;
>      >  	__u8			nguid[16];
>      >  	__u8			eui64[8];
>      >  	struct nvme_lbaf	lbaf[16];
>      
>      Is this the right place for the new field? Your patch has the group ID
>      at byte 100. TP4004 has the ANA Group Identifier starting at byte 92,
>      followed by 8 reserved bytes, then the NGUID.
>      
> 
Ah. Hence. Must've looked at an older copy then.

Will be fixing it up.

Cheers,

Hannes

^ permalink raw reply	[flat|nested] 33+ messages in thread

* [PATCH 5/5] nvme: ANA base support
  2018-05-04 22:11   ` Schremmer, Steven
@ 2018-05-05 13:23     ` Hannes Reinecke
  0 siblings, 0 replies; 33+ messages in thread
From: Hannes Reinecke @ 2018-05-05 13:23 UTC (permalink / raw)


On 05/05/2018 12:11 AM, Schremmer, Steven wrote:
>>   int nvme_reset_ctrl(struct nvme_ctrl *ctrl)
>>   {
>> @@ -1488,6 +1489,9 @@ static int nvme_revalidate_disk(struct gendisk *disk)
>>   		goto out;
>>   	}
>>
>> +	if (ctrl->subsys->cmic & (1 << 3))
>> +		nvme_get_ana_log(ctrl, ns);
>> +
> If __nvme_revalidate_disk() would save the groupid from the Identify Namespace data,
> then finding the ANA state in the log page could be done more quickly.
> 
Good idea. Will be doing it.

>>   	__nvme_revalidate_disk(disk, id);
>>   	nvme_report_ns_ids(ctrl, ns->head->ns_id, id, &ids);
>>   	if (!nvme_ns_ids_equal(&ns->head->ids, &ids)) {
>> @@ -2276,6 +2280,61 @@ static int nvme_get_effects_log(struct nvme_ctrl *ctrl)
>>   	return ret;
>>   }
>>
>> +static void nvme_get_ana_log(struct nvme_ctrl *ctrl, struct nvme_ns *ns)
>> +{
>> +	int i, j;
>> +	struct nvmf_ana_rsp_page_header *ana_log;
>> +	size_t ana_log_size = 4096;
>> +
>> +	ana_log = kzalloc(ana_log_size, GFP_KERNEL);
>> +	if (!ana_log)
>> +		return;
>> +
>> +	if (nvme_get_log(ctrl, NVME_LOG_ANA, ana_log, ana_log_size))
>> +		dev_warn(ctrl->device,
>> +			 "Get ANA log error\n");
>> +	for (i = 0; i < ana_log->grpid_num; i++) {
>> +		struct nvmf_ana_group_descriptor *desc =
>> +			&ana_log->desc[i];
>> +		for (j = 0; j < desc->nsid_num; j++) {
>> +			if (desc->nsid[j] == ns->head->ns_id) {
>> +				ns->ana_state = desc->ana_state;
>> +				ns->ana_group = desc->groupid;
> Maybe jump out of the for loops here?
> 
Yes, should be doing so.

>> @@ -3370,6 +3485,10 @@ void nvme_complete_async_event(struct nvme_ctrl *ctrl, __le16 status,
>>   	case NVME_AER_NOTICE_FW_ACT_STARTING:
>>   		queue_work(nvme_wq, &ctrl->fw_act_work);
>>   		break;
>> +	case NVME_AER_NOTICE_ANA_CHANGE:
>> +		dev_info(ctrl->device, "ANA state change\n");
>> +		queue_work(nvme_wq, &ctrl->ana_change_work);
>> +		break;
> I think eventually we'll need a way for the AER work to potentially force a call to
> __nvme_find_path() to choose a better current_path based on the updated ANA states.
> 
Actually, it should be sufficient to just unset the current path; the 
algorithm will find the next best path then.
But yes, true, we should be doing it.
(I had the Optimized/Inaccessible scenario in mind here; there the new 
path will be selected automatically as we'd be getting an error from the 
Inaccessible path. But for Optimized/non-optimized the new path won't be 
chosen automatically as the original path remains valid.)

Thanks for the hints.

Cheers,

Hannes

^ permalink raw reply	[flat|nested] 33+ messages in thread

* [PATCH 3/5] nvmet: Add ANA base support
  2018-05-04 11:28 ` [PATCH 3/5] nvmet: Add ANA base support Hannes Reinecke
@ 2018-05-06  2:42   ` Guan Junxiong
  0 siblings, 0 replies; 33+ messages in thread
From: Guan Junxiong @ 2018-05-06  2:42 UTC (permalink / raw)


Hi Hannes,


On Friday, May 04, 2018 07:28 PM, Hannes Reinecke wrote:
> Add ANA support to the nvme target. The ANA configuration is optional,
> and doesn't interfere with existing configurations.
> Each subsystem has distinct ANA groups; if ANA groups are created
> it is required to link the ANA groups into the individual ports.
> Linking the entire subsystem will be refused if ANA groups are
> specified.
> Also this implementation has a limit of one single ANA state per
> groups, irrespective of the path. So when distinct ANA states
> are required one needs to create different ANA groups.
>
> Signed-off-by: Hannes Reinecke <hare at suse.com>
> ---
>
>
> +
> +static void nvmet_port_ags_drop_link(struct config_item *parent,
> +		struct config_item *target)
> +{
> +	struct nvmet_port *port = to_nvmet_port(parent->ci_parent);
> +	struct nvmet_ag *ag = to_nvmet_ag(target);
> +	struct nvmet_subsys *subsys = ag->subsys;
> +	struct nvmet_ag_link *p;
> +	int ana_active = 0;
> +
> +	down_write(&nvmet_config_sem);
> +	list_for_each_entry(p, &port->ags, entry) {
> +		if (p->ag == ag)
> +			goto found;
> +	}
> +	up_write(&nvmet_config_sem);
> +	return;
> +
> +found:
> +	list_del(&p->entry);
> +	list_del(&p->ag_list);
> +	nvmet_genctr++;
Why not nvmet_genctrl-- when dropping link?

> +
> +
> +static struct config_group *nvmet_ag_make(struct config_group *group,
> +		const char *name)
> +{
> +	struct nvmet_subsys *subsys = ag_to_subsys(&group->cg_item);
> +	struct nvmet_ag *ag, *a;
> +	int ret;
> +	u32 agid;
> +
> +	ret = kstrtou32(name, 0, &agid);
> +	if (ret)
> +		goto out;
> +
> +	ret = -EINVAL;
> +	if (agid == 0)
> +		goto out;
> +	ret = -ENOMEM;
> +	ag = nvmet_ag_alloc(subsys, agid);
> +	if (!ag)
> +		goto out;
> +	ret = -EEXIST;
> +	down_write(&nvmet_config_sem);
> +	list_for_each_entry_rcu(a, &subsys->ana_groups, entry) {
> +		if (a->grpid == agid) {
> +			nvmet_ag_free(ag);
> +			goto out;
> +		}
> +	}
> +	list_add_tail(&ag->entry, &subsys->ana_groups);
> +	up_write(&nvmet_config_sem);
> +	config_group_init_type_name(&ag->group, name, &nvmet_ag_type);
> +
> +	pr_info("adding ana groupid %d to subsystem %s\n",
> +		agid, subsys->subsysnqn);
> +	return &ag->group;
> +out:
Here should be added:? up_write(&nvmet_config_sem);
> +	return ERR_PTR(ret);


> +}
> +
> +static struct configfs_group_operations nvmet_ana_group_group_ops = {
> +	.make_group		= nvmet_ag_make,
I guess we should add :
??????????????? .drop_item? = nvmet_ag_drop,

to implement rmdir operation .



Regards

Guan Junxiong

^ permalink raw reply	[flat|nested] 33+ messages in thread

* [PATCH 5/5] nvme: ANA base support
  2018-05-04 11:28 ` [PATCH 5/5] nvme: ANA base support Hannes Reinecke
  2018-05-04 22:11   ` Schremmer, Steven
@ 2018-05-06  3:23   ` Guan Junxiong
  2018-05-07  7:21   ` Johannes Thumshirn
                     ` (3 subsequent siblings)
  5 siblings, 0 replies; 33+ messages in thread
From: Guan Junxiong @ 2018-05-06  3:23 UTC (permalink / raw)




On Friday, May 04, 2018 07:28 PM, Hannes Reinecke wrote:
> +static ssize_t ana_state_show(struct device *dev, struct device_attribute *attr,
> +		char *buf)
> +{
> +	struct nvme_ns *ns = nvme_get_ns_from_dev(dev);
> +
> +	switch (ns->ana_state & 0x0f) {
> +	case NVME_ANA_STATE_OPTIMIZED:
> +		return sprintf(buf, "optimized\n");
> +	case NVME_ANA_STATE_NONOPTIMIZED:
> +		return sprintf(buf, "non-optimized\n");
> +	case NVME_ANA_STATE_INACCESSIBLE:
> +		return sprintf(buf, "inaccessible\n");
> +	case NVME_ANA_STATE_PERSISTENT_LOSS:
> +		return sprintf(buf, "persistent-loss\n");
> +	case NVME_ANA_STATE_CHANGE_STATE:
> +		return sprintf(buf, "change-state\n");
> +	default:
> +		return sprintf(buf, "<reserved>\n");
> +	}
> +}
> +static DEVICE_ATTR_RO(ana_state);
> +
> +static ssize_t ana_group_show(struct device *dev, struct device_attribute *attr,
> +		char *buf)
> +{
> +	struct nvme_ns *ns = dev_to_disk(dev)->private_data;

Nick:   dev_to_disk(dev)->private_data  -->  nvme_get_ns_from_dev(dev);


Except this, IMO, it looks good.

Reviewed-by: Guan Junxiong <guanjunxiong at huawei.com>

^ permalink raw reply	[flat|nested] 33+ messages in thread

* [PATCH 1/5] nvmet: EUI64 support
  2018-05-04 11:28 ` [PATCH 1/5] nvmet: EUI64 support Hannes Reinecke
@ 2018-05-07  7:17   ` Johannes Thumshirn
  2018-05-09  7:08   ` Christoph Hellwig
  2018-05-09 15:43   ` Ewan D. Milne
  2 siblings, 0 replies; 33+ messages in thread
From: Johannes Thumshirn @ 2018-05-07  7:17 UTC (permalink / raw)


Looks good,
Reviewed-by: Johannes Thumshirn <jthumshirn at suse.de>
-- 
Johannes Thumshirn                                          Storage
jthumshirn at suse.de                                +49 911 74053 689
SUSE LINUX GmbH, Maxfeldstr. 5, 90409 N?rnberg
GF: Felix Imend?rffer, Jane Smithard, Graham Norton
HRB 21284 (AG N?rnberg)
Key fingerprint = EC38 9CAB C2C4 F25D 8600 D0D0 0393 969D 2D76 0850

^ permalink raw reply	[flat|nested] 33+ messages in thread

* [PATCH 5/5] nvme: ANA base support
  2018-05-04 11:28 ` [PATCH 5/5] nvme: ANA base support Hannes Reinecke
  2018-05-04 22:11   ` Schremmer, Steven
  2018-05-06  3:23   ` Guan Junxiong
@ 2018-05-07  7:21   ` Johannes Thumshirn
  2018-05-09 18:49   ` Ewan D. Milne
                     ` (2 subsequent siblings)
  5 siblings, 0 replies; 33+ messages in thread
From: Johannes Thumshirn @ 2018-05-07  7:21 UTC (permalink / raw)


On Fri, May 04, 2018@01:28:45PM +0200, Hannes Reinecke wrote:
> Add ANA support to the nvme host. If ANA is supported the state
> and the group id are displayed in new sysfs attributes 'ana_state' and
> 'ana_group'.
> 
> Signed-off-by: Hannes Reinecke <hare at suse.com>
> ---
>  drivers/nvme/host/core.c      | 123 +++++++++++++++++++++++++++++++++++++++++-
>  drivers/nvme/host/multipath.c |  12 ++++-
>  drivers/nvme/host/nvme.h      |   3 ++
>  3 files changed, 136 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
> index 62262fac7a5d..14dff8e96899 100644
> --- a/drivers/nvme/host/core.c
> +++ b/drivers/nvme/host/core.c
> @@ -99,6 +99,7 @@ static struct class *nvme_subsys_class;
>  
>  static void nvme_ns_remove(struct nvme_ns *ns);
>  static int nvme_revalidate_disk(struct gendisk *disk);
> +static void nvme_get_ana_log(struct nvme_ctrl *ctrl, struct nvme_ns *ns);
>  
>  int nvme_reset_ctrl(struct nvme_ctrl *ctrl)
>  {
> @@ -1488,6 +1489,9 @@ static int nvme_revalidate_disk(struct gendisk *disk)
>  		goto out;
>  	}
>  
> +	if (ctrl->subsys->cmic & (1 << 3))
> +		nvme_get_ana_log(ctrl, ns);
> +

Can you move nvme_get_ana_log(), et al above nvme_reset_ctrl() so we
don't need the forward declaration?

      Johannes

-- 
Johannes Thumshirn                                          Storage
jthumshirn at suse.de                                +49 911 74053 689
SUSE LINUX GmbH, Maxfeldstr. 5, 90409 N?rnberg
GF: Felix Imend?rffer, Jane Smithard, Graham Norton
HRB 21284 (AG N?rnberg)
Key fingerprint = EC38 9CAB C2C4 F25D 8600 D0D0 0393 969D 2D76 0850

^ permalink raw reply	[flat|nested] 33+ messages in thread

* [PATCH 1/5] nvmet: EUI64 support
  2018-05-04 11:28 ` [PATCH 1/5] nvmet: EUI64 support Hannes Reinecke
  2018-05-07  7:17   ` Johannes Thumshirn
@ 2018-05-09  7:08   ` Christoph Hellwig
  2018-05-09  7:45     ` Hannes Reinecke
  2018-05-09 15:43   ` Ewan D. Milne
  2 siblings, 1 reply; 33+ messages in thread
From: Christoph Hellwig @ 2018-05-09  7:08 UTC (permalink / raw)


On Fri, May 04, 2018@01:28:41PM +0200, Hannes Reinecke wrote:
> Allow the setting of an IEEE Extended Unique Identifier (EUI64) for
> each namespace. As the OUI is per subsystem ensure that the OUI
> part of the EUI64 value (ie the top 3 bytes) are matching for each
> namespace.

I don't think we should support the EUI64 value.  It doesn't have
a lot of uniqueness, and it requires and IEEE OID to be properly
implemented.  Please use the UUID identifier always, unless you actually
doe havean IEEE OID and really want to use it, in which case you should
use the NGUID.

^ permalink raw reply	[flat|nested] 33+ messages in thread

* [PATCH 2/5] nvme: Add ANA base definitions
  2018-05-04 11:28 ` [PATCH 2/5] nvme: Add ANA base definitions Hannes Reinecke
  2018-05-04 16:17   ` Keith Busch
  2018-05-04 21:12   ` Schremmer, Steven
@ 2018-05-09  7:27   ` Christoph Hellwig
  2 siblings, 0 replies; 33+ messages in thread
From: Christoph Hellwig @ 2018-05-09  7:27 UTC (permalink / raw)


On Fri, May 04, 2018@01:28:42PM +0200, Hannes Reinecke wrote:
> Add definitions from the ANA TP errata.
> 
> Signed-off-by: Hannes Reinecke <hare at suse.com>
> ---
>  include/linux/nvme.h | 47 ++++++++++++++++++++++++++++++++++++++++++++---
>  1 file changed, 44 insertions(+), 3 deletions(-)
> 
> diff --git a/include/linux/nvme.h b/include/linux/nvme.h
> index 4112e2bd747f..40c2d08ee761 100644
> --- a/include/linux/nvme.h
> +++ b/include/linux/nvme.h
> @@ -242,7 +242,11 @@ struct nvme_id_ctrl {
>  	__le32			sanicap;
>  	__le32			hmminds;
>  	__le16			hmmaxd;
> -	__u8			rsvd338[174];
> +	__u8			anatt;
> +	__u8			anacap;
> +	__le32			anagrpmax;
> +	__le32			nanagrpid;
> +	__u8			rsvd338[164];

SANICAP is at bytes 328 to 331, the HMM field go from 332 to 337,
so we really need to reserve bytes 338, 339, and 340 before starting
the ANA fields.  Note that this is already hinted at by the naming
for the rsvd338 field.  So the new post-ana rsvd field should also
be named rsvd352

>  	__u8			rsvd1024[768];
>  	__le32			ioccsz;
> @@ -312,7 +317,8 @@ struct nvme_id_ns {
>  	__le16			nabspf;
>  	__le16			noiob;
>  	__u8			nvmcap[16];
> -	__u8			rsvd64[40];
> +	__u8			rsvd64[36];
> +	__le32			anagrpid;
>  	__u8			nguid[16];

nguid is at offset 104, so if you please anagrip just before
it it is in the wrong offset.

> +	NVME_ANA_STATE_OPTIMIZED = 0x01,
> +	NVME_ANA_STATE_NONOPTIMIZED = 0x02,
> +	NVME_ANA_STATE_INACCESSIBLE = 0x03,
> +	NVME_ANA_STATE_PERSISTENT_LOSS = 0x04,
> +	NVME_ANA_STATE_CHANGE_STATE = 0x05,

Please make this a separate 'enum nvme_ana_state'

> +/* Asymmetric Namespace Access log page entry */
> +struct nvmf_ana_group_descriptor {
> +	__le32 groupid;
> +	__le32 nsid_num;
> +	__le64 chgcnt;
> +	__u8 ana_state;
> +	__u8 resv1[7];

Pleae encode the offset of the reserved field in the name:

	__u8 resv17[7];

> +	__le32 nsid[0];

Please use C99 flexible arrays:

	__le32 nsid[];

> +struct nvmf_ana_rsp_page_header {
> +	__le64 chgcnt;
> +	__le16 grpid_num;
> +	__le16 resv[3];
> +	struct nvmf_ana_group_descriptor desc[0];
> +};

Same here.

also we use _hdr for the discover log name, maybe match that here.

^ permalink raw reply	[flat|nested] 33+ messages in thread

* [PATCH 4/5] block: BLK_STS_NEXUS is a path failure
  2018-05-04 11:28 ` [PATCH 4/5] block: BLK_STS_NEXUS is a path failure Hannes Reinecke
@ 2018-05-09  7:31   ` Christoph Hellwig
  0 siblings, 0 replies; 33+ messages in thread
From: Christoph Hellwig @ 2018-05-09  7:31 UTC (permalink / raw)


On Fri, May 04, 2018@01:28:44PM +0200, Hannes Reinecke wrote:
> A block error of BLK_STS_NEXUS is equivalent to a path failure,
> so blk_path_error() should reflect this.

BLK_STS_NEXUS is the replacement of -EBADE, which in the old
world order was labelled 'critical nexus"' and is claim to be
used for reservation conflicts, in which case we should not rety
on another path.

^ permalink raw reply	[flat|nested] 33+ messages in thread

* [PATCH 1/5] nvmet: EUI64 support
  2018-05-09  7:08   ` Christoph Hellwig
@ 2018-05-09  7:45     ` Hannes Reinecke
  2018-05-09  7:52       ` Christoph Hellwig
  0 siblings, 1 reply; 33+ messages in thread
From: Hannes Reinecke @ 2018-05-09  7:45 UTC (permalink / raw)


On Wed, 9 May 2018 09:08:58 +0200
"Christoph Hellwig" <hch@lst.de> wrote:

> On Fri, May 04, 2018@01:28:41PM +0200, Hannes Reinecke wrote:
> > Allow the setting of an IEEE Extended Unique Identifier (EUI64) for
> > each namespace. As the OUI is per subsystem ensure that the OUI
> > part of the EUI64 value (ie the top 3 bytes) are matching for each
> > namespace.  
> 
> I don't think we should support the EUI64 value.  It doesn't have
> a lot of uniqueness, and it requires and IEEE OID to be properly
> implemented.  Please use the UUID identifier always, unless you
> actually doe havean IEEE OID and really want to use it, in which case
> you should use the NGUID.
> 

It's not so much about usability, it's about choice.
Some users might want to set an EUI64; companies (like ours) have
a valid OUI, so generating an EUI64 is pretty simple.

And the patch itself is pretty simple, so where's the harm?

Cheers,

Hannes

^ permalink raw reply	[flat|nested] 33+ messages in thread

* [PATCH 1/5] nvmet: EUI64 support
  2018-05-09  7:45     ` Hannes Reinecke
@ 2018-05-09  7:52       ` Christoph Hellwig
  0 siblings, 0 replies; 33+ messages in thread
From: Christoph Hellwig @ 2018-05-09  7:52 UTC (permalink / raw)


On Wed, May 09, 2018@09:45:29AM +0200, Hannes Reinecke wrote:
> On Wed, 9 May 2018 09:08:58 +0200
> "Christoph Hellwig" <hch@lst.de> wrote:
> 
> > On Fri, May 04, 2018@01:28:41PM +0200, Hannes Reinecke wrote:
> > > Allow the setting of an IEEE Extended Unique Identifier (EUI64) for
> > > each namespace. As the OUI is per subsystem ensure that the OUI
> > > part of the EUI64 value (ie the top 3 bytes) are matching for each
> > > namespace.  
> > 
> > I don't think we should support the EUI64 value.  It doesn't have
> > a lot of uniqueness, and it requires and IEEE OID to be properly
> > implemented.  Please use the UUID identifier always, unless you
> > actually doe havean IEEE OID and really want to use it, in which case
> > you should use the NGUID.
> > 
> 
> It's not so much about usability, it's about choice.
> Some users might want to set an EUI64; companies (like ours) have
> a valid OUI, so generating an EUI64 is pretty simple.
> 
> And the patch itself is pretty simple, so where's the harm?

Please always use the NGUID then.  Using a 64-bit unique
identifier is a bad idea.  And even when using NGUIDs you have to
be extremely careful to make sure they are never reused.

^ permalink raw reply	[flat|nested] 33+ messages in thread

* [PATCH 1/5] nvmet: EUI64 support
  2018-05-04 11:28 ` [PATCH 1/5] nvmet: EUI64 support Hannes Reinecke
  2018-05-07  7:17   ` Johannes Thumshirn
  2018-05-09  7:08   ` Christoph Hellwig
@ 2018-05-09 15:43   ` Ewan D. Milne
  2 siblings, 0 replies; 33+ messages in thread
From: Ewan D. Milne @ 2018-05-09 15:43 UTC (permalink / raw)


On Fri, 2018-05-04@13:28 +0200, Hannes Reinecke wrote:
> Allow the setting of an IEEE Extended Unique Identifier (EUI64) for
> each namespace. As the OUI is per subsystem ensure that the OUI
> part of the EUI64 value (ie the top 3 bytes) are matching for each
> namespace.
> 
> Signed-off-by: Hannes Reinecke <hare at suse.com>
> ---
>  drivers/nvme/target/admin-cmd.c |  6 ++---
>  drivers/nvme/target/configfs.c  | 58 +++++++++++++++++++++++++++++++++++++++++
>  drivers/nvme/target/nvmet.h     |  2 ++
>  3 files changed, 62 insertions(+), 4 deletions(-)
> 
...
> diff --git a/drivers/nvme/target/configfs.c b/drivers/nvme/target/configfs.c
> index ad9ff27234b5..3dc2b2ae56e5 100644
> --- a/drivers/nvme/target/configfs.c
> +++ b/drivers/nvme/target/configfs.c
> @@ -378,6 +378,49 @@ static ssize_t nvmet_ns_device_nguid_store(struct config_item *item,
>  
>  CONFIGFS_ATTR(nvmet_ns_, device_nguid);
>  
> +static ssize_t nvmet_ns_device_eui64_show(struct config_item *item, char *page)
> +{
> +	return sprintf(page, "%llx\n",
> +		       (unsigned long long)&to_nvmet_ns(item)->eui64);
> +}

Notwithstanding Christoph's comment, patch 3/5 changes the above line
again to remove the "&", so you should do that here in the first place
if you keep it.

-Ewan

^ permalink raw reply	[flat|nested] 33+ messages in thread

* [PATCH 5/5] nvme: ANA base support
  2018-05-04 11:28 ` [PATCH 5/5] nvme: ANA base support Hannes Reinecke
                     ` (2 preceding siblings ...)
  2018-05-07  7:21   ` Johannes Thumshirn
@ 2018-05-09 18:49   ` Ewan D. Milne
  2018-05-09 19:03   ` Ewan D. Milne
  2018-05-12 13:31   ` Christoph Hellwig
  5 siblings, 0 replies; 33+ messages in thread
From: Ewan D. Milne @ 2018-05-09 18:49 UTC (permalink / raw)


On Fri, 2018-05-04@13:28 +0200, Hannes Reinecke wrote:
> Add ANA support to the nvme host. If ANA is supported the state
> and the group id are displayed in new sysfs attributes 'ana_state' and
> 'ana_group'.
> 
> Signed-off-by: Hannes Reinecke <hare at suse.com>
> ---
>  drivers/nvme/host/core.c      | 123 +++++++++++++++++++++++++++++++++++++++++-
>  drivers/nvme/host/multipath.c |  12 ++++-
>  drivers/nvme/host/nvme.h      |   3 ++
>  3 files changed, 136 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
...
> +static void nvme_get_ana_log(struct nvme_ctrl *ctrl, struct nvme_ns *ns)
> +{
> +	int i, j;
> +	struct nvmf_ana_rsp_page_header *ana_log;
> +	size_t ana_log_size = 4096;
> +
> +	ana_log = kzalloc(ana_log_size, GFP_KERNEL);
> +	if (!ana_log)
> +		return;
> +
> +	if (nvme_get_log(ctrl, NVME_LOG_ANA, ana_log, ana_log_size))
> +		dev_warn(ctrl->device,
> +			 "Get ANA log error\n");

If nvme_get_log() above returns an error you don't want to use the data
in the ana_log buffer in the following section of code:

> +	for (i = 0; i < ana_log->grpid_num; i++) {
> +		struct nvmf_ana_group_descriptor *desc =
> +			&ana_log->desc[i];
> +		for (j = 0; j < desc->nsid_num; j++) {
> +			if (desc->nsid[j] == ns->head->ns_id) {
> +				ns->ana_state = desc->ana_state;
> +				ns->ana_group = desc->groupid;
> +			}
> +		}
> +	}
> +	kfree(ana_log);
> +}
> +

^ permalink raw reply	[flat|nested] 33+ messages in thread

* [PATCH 5/5] nvme: ANA base support
  2018-05-04 11:28 ` [PATCH 5/5] nvme: ANA base support Hannes Reinecke
                     ` (3 preceding siblings ...)
  2018-05-09 18:49   ` Ewan D. Milne
@ 2018-05-09 19:03   ` Ewan D. Milne
  2018-05-10  9:16     ` Sriram Popuri
       [not found]     ` <CAGYjvj0Mk0MFAfUEApOOyQ9Prm3CvGcZH14PJzDQT2+Qc+w81w@mail.gmail.com>
  2018-05-12 13:31   ` Christoph Hellwig
  5 siblings, 2 replies; 33+ messages in thread
From: Ewan D. Milne @ 2018-05-09 19:03 UTC (permalink / raw)


On Fri, 2018-05-04@13:28 +0200, Hannes Reinecke wrote:
> Add ANA support to the nvme host. If ANA is supported the state
> and the group id are displayed in new sysfs attributes 'ana_state' and
> 'ana_group'.
> 
> Signed-off-by: Hannes Reinecke <hare at suse.com>
> ---
>  drivers/nvme/host/core.c      | 123 +++++++++++++++++++++++++++++++++++++++++-
>  drivers/nvme/host/multipath.c |  12 ++++-
>  drivers/nvme/host/nvme.h      |   3 ++
>  3 files changed, 136 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
...
> +static void nvme_ana_change_work(struct work_struct *work)
> +{
> +	struct nvme_ctrl *ctrl = container_of(work,
> +				struct nvme_ctrl, ana_change_work);
> +
> +	if (ctrl->state != NVME_CTRL_LIVE)
> +		return;
> +
> +	down_read(&ctrl->namespaces_rwsem);
> +	nvme_get_full_ana_log(ctrl);
> +	up_read(&ctrl->namespaces_rwsem);
> +}
> +

Do we really want to be holding the semaphore while performing the
command to get the log page, particularly in a fabric environment?
Or would it be sufficient to hold it after the log page is fetched
and we are iterating over the ctrl->namespaces list in
nvme_get_full_ana_log()?

(BTW, nvme_get_full_ana_log() has the same issue w/error returned
from nvme_get_log() as nvme_get_ana_log()).

-Ewan

^ permalink raw reply	[flat|nested] 33+ messages in thread

* [PATCH 5/5] nvme: ANA base support
  2018-05-09 19:03   ` Ewan D. Milne
@ 2018-05-10  9:16     ` Sriram Popuri
       [not found]     ` <CAGYjvj0Mk0MFAfUEApOOyQ9Prm3CvGcZH14PJzDQT2+Qc+w81w@mail.gmail.com>
  1 sibling, 0 replies; 33+ messages in thread
From: Sriram Popuri @ 2018-05-10  9:16 UTC (permalink / raw)


Hi,

  I work for NetApp and leading efforts to add ANA support for our
NVMe/FC target. So me and my team are excited with this patch.So
thanks for working on this!
We had some spec comments, but John M, Fred K and others have already
raised it. We have few more comments looking at the overall solution.
Please consider addressing the following:

1) Ana log size
+             size_t ana_log_size = 4096;

This is hard coded. Don?t you expect to have a larger log page? Looks
like the number of ANA groups will be restricted to around 113
assuming one namespace identifier per ana group.
Can we have a larger ana_log_size? Maybe it should be MDTS?

2) Sanity check after reading ana log page
There should be a sanity check to make sure the log page is read
completely. A check to see if all the ana groups are read and all the
num nsids(for the last group) are read otherwise states on few
namespace paths will be stale.

3) ANATT not used.
Didn't see anatt honored. Is there a plan to do that in future?
Looks like you retry forever if there is any path related error.
Let's say there are two paths ANA Optimized and other path ANA
Inaccessible and then the ANA optimized path is transitioning to ANA
Inaccessible.
If my understanding is correct, this is what happens:
* Linux gets back ANA Transition.
* The current controller is reset.
* IO will be re-queued or failover to other path regardless of the
other path being still Inaccessible (or even Persistent loss).
* nvme_find_path will return null so the IO is re-queued again.
* This state continues till the target completes an AER(?)
Can it go forever?

4) Controller reset on path related errors
Looks like any path related errors will result in controller reset.
This looks bad as path change for a single namespace will result in
controller reset.  Didn?t see any special handling for path related
errors. Should that happen as controller reset is a bigger hammer for
these errors?

5) Path related error handling
Again on the same lines as above. All path related errors fall in one
category as of now (BLK_STS_IOERR). Handling of persistent loss or ANA
inaccessible or ANA transition should be according to the spec.

6) Detecting host ANA support.
Didn?t see good way for target to know if host is ANA complaint or not
in the spec itself. One way we can negotiate ANA support is through
set features(0Bh). That helps targets to know whether its ok to return
ANA path related errors or complete AERs with ANA change notice.
Is there a plan to implement set features (0Bh) to negotiate
asynchronous event configuration?

Regards,
~Sriram

On Thu, May 10, 2018@12:33 AM, Ewan D. Milne <emilne@redhat.com> wrote:
> On Fri, 2018-05-04@13:28 +0200, Hannes Reinecke wrote:
>> Add ANA support to the nvme host. If ANA is supported the state
>> and the group id are displayed in new sysfs attributes 'ana_state' and
>> 'ana_group'.
>>
>> Signed-off-by: Hannes Reinecke <hare at suse.com>
>> ---
>>  drivers/nvme/host/core.c      | 123 +++++++++++++++++++++++++++++++++++++++++-
>>  drivers/nvme/host/multipath.c |  12 ++++-
>>  drivers/nvme/host/nvme.h      |   3 ++
>>  3 files changed, 136 insertions(+), 2 deletions(-)
>>
>> diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
> ...
>> +static void nvme_ana_change_work(struct work_struct *work)
>> +{
>> +     struct nvme_ctrl *ctrl = container_of(work,
>> +                             struct nvme_ctrl, ana_change_work);
>> +
>> +     if (ctrl->state != NVME_CTRL_LIVE)
>> +             return;
>> +
>> +     down_read(&ctrl->namespaces_rwsem);
>> +     nvme_get_full_ana_log(ctrl);
>> +     up_read(&ctrl->namespaces_rwsem);
>> +}
>> +
>
> Do we really want to be holding the semaphore while performing the
> command to get the log page, particularly in a fabric environment?
> Or would it be sufficient to hold it after the log page is fetched
> and we are iterating over the ctrl->namespaces list in
> nvme_get_full_ana_log()?
>
> (BTW, nvme_get_full_ana_log() has the same issue w/error returned
> from nvme_get_log() as nvme_get_ana_log()).
>
> -Ewan
>
>
>
> _______________________________________________
> Linux-nvme mailing list
> Linux-nvme at lists.infradead.org
> http://lists.infradead.org/mailman/listinfo/linux-nvme

^ permalink raw reply	[flat|nested] 33+ messages in thread

* [PATCH 5/5] nvme: ANA base support
       [not found]     ` <CAGYjvj0Mk0MFAfUEApOOyQ9Prm3CvGcZH14PJzDQT2+Qc+w81w@mail.gmail.com>
@ 2018-05-10 13:40       ` Hannes Reinecke
  2018-05-11  7:50       ` Hannes Reinecke
  1 sibling, 0 replies; 33+ messages in thread
From: Hannes Reinecke @ 2018-05-10 13:40 UTC (permalink / raw)


On 05/10/2018 10:28 AM, Sriram Popuri wrote:
> Hi,
> 
>  ? I work for NetApp and leading efforts to add ANA support for our 
> NVMe/FC target. So me and my team are excited with this patch.So thanks 
> for working on this!
> We had some spec comments, but John M, Fred K and others have already 
> raised it. We have few more comments looking at the overall solution. 
> Please consider addressing the following:
> 
>  1. Ana log size
> 
> + size_t ana_log_size = 4096;
> 
> This is hard coded. Don?t you expect to have a larger log page? Looks 
> like the number of ANA groups will be restricted to around 113 assuming 
> one namespace identifier per ana group.
> 
> Can we have a larger ana_log_size? Maybe it should be MDTS?
> 
Yeah, 'tis a bit of a cheeky implementation.
I'll see to make it configurable.

>  2. Sanity check after reading ana log page
> 
> There should be a sanity check to make sure the log page is read 
> completely. A check to see if all the ana groups are read and all the 
> num nsids(for the last group) are read otherwise states on few namespace 
> paths will be stale.
> 
Well; this basically is the same issue we had with ALUA on the SCSI 
side. What _are_ the correct actions if the ANA log page couldn't be 
read properly?
Sure, one could to a retry, but even that will be exhausted after some 
iterations. What then?
Fail the controller?
-> probably not the best of options; after all, ANA support is optional.
Set the paths to failed?
-> Again, probably not the best of ideas as the path itself might be 
operational.
So for now it's probably best to just leave the states as they are and 
wait for the situation to resolve itself.

>  3. ANATT not used.
> 
> Didn't see anatt honored. Is there a plan to do that in future?
> 
> Looks like you retry forever if there is any path related error.
> 
True. What we _could_ to is to move the paths to a tentative 
'non-optimized' state, and see if the I/O succeeds. If it fails we 
should be getting an appropriate status code, and should be able to set 
the path status accordingly.
We will not get a notification for the optimized path, but then 
something went wrong on the array, so we don't know the correct status.

Maybe this would be the best solution:

1.) If the ANA log fails set all paths to an invalid ANA state (eg 0), 
thus allowing I/O to continue. If the I/O comes back with an ANA status 
code like 'inaccessible' or 'persistent loss' (or even 'transitioning', 
use that to update the status. Then schedule an ANA log read at a later 
time (eg 1 seconds later or somesuch).
2.) If the paths are in 'transitioning' retry I/O until ANATT.
If we get an AEN signalling a state change or ANATT expired submit an 
ANA log page read to update the ANA state. If that fails, continue at 1.


> Let's say there are two paths ANA Optimized and other path ANA 
> Inaccessible and then the ANA optimized path is transitioning to ANA 
> Inaccessible.
> 
> If my understanding is correct, this is what happens:
> 
> * Linux gets back ANA Transition.
> 
> * The current controller is reset.
> 
> * IO will be requeued or failover to other path regardless of the other 
> path being still Inaccessible (or even Persistent loss).
> 
> * nvme_find_path will return null so the IO is requeued again.
> 
> * This state continues till the target completes an AER(?)
> 
> Can it go forever?
> 
The entire idea here is that the controller does _not_ need to be reset; 
yes, we've returned with an status, but this actually is expected, and 
shouldn't cause the controller to be reset.
Obviously we need to update the ANA state on all paths before attempting 
a failover; otherwise we're running against a not-yet-updated path in eg 
inaccessible and would not be able to issue I/O.

>  4. Controller reset on path related errors
> 
> Looks like any path related errors will result in controller reset. This 
> looks bad as path change for a single namespace will result in 
> controller reset.? Didn?t see any special handling for path related 
> errors. Should that happen as controller reset is a bigger hammer for 
> these errors?
> 
For 'normal' path state change errors like 'ANA inaccessible' we 
shouldn't reset the controller; but we'll need to run some tests here 
how the system actually behaves.

>  5. Path related error handling
> 
> Again on the same lines as above. All path related errors fall in one 
> category as of now (BLK_STS_IOERR). Handling of persistent loss or ANA 
> inaccessible or ANA transition should be according to the spec.
> 
Yeah, that's actually wrong. I've got a patch reclassifying ANA errors 
as path errors, which hopefully resolves this.

>  6. Detecting host ANA support.
> 
> Didn?t see good way for target to know if host is ANA complaint or not 
> in the spec itself. One way we can negotiate ANA support is through set 
> features(0Bh). That helps targets to know whether its ok to return ANA 
> path related errors or complete AERs with ANA change notice.
> 
> Is there a plan to implement set features (0Bh) to negotiate 
> asynchronous event configuration?
> 
Well. Yes, obviously we should be registering for ANA AENs; might be 
that the current code doesn't do it as the linux target implementation 
is somewhat lazy in that respect.

Cheers,

Hannes

^ permalink raw reply	[flat|nested] 33+ messages in thread

* [PATCH 5/5] nvme: ANA base support
       [not found]     ` <CAGYjvj0Mk0MFAfUEApOOyQ9Prm3CvGcZH14PJzDQT2+Qc+w81w@mail.gmail.com>
  2018-05-10 13:40       ` Hannes Reinecke
@ 2018-05-11  7:50       ` Hannes Reinecke
  2018-05-11  8:22         ` Popuri, Sriram
  1 sibling, 1 reply; 33+ messages in thread
From: Hannes Reinecke @ 2018-05-11  7:50 UTC (permalink / raw)


On Thu, 10 May 2018 13:58:05 +0530
Sriram Popuri <sgpopuri@gmail.com> wrote:

> Hi,
> 
>   I work for NetApp and leading efforts to add ANA support for our
> NVMe/FC target. So me and my team are excited with this patch.So
> thanks for working on this!
> We had some spec comments, but John M, Fred K and others have already
> raised it. We have few more comments looking at the overall solution.
> Please consider addressing the following:
> 
> 
>    1. Ana log size
> 
> +             size_t ana_log_size = 4096;
> 
> This is hard coded. Don?t you expect to have a larger log page? Looks
> like the number of ANA groups will be restricted to around 113
> assuming one namespace identifier per ana group.
> 
> Can we have a larger ana_log_size? Maybe it should be MDTS?
> 
After revisiting the ANA Base protocol this indeed is a good question.
The spec explicitly has:

If the RGO bit is cleared to ?0? in Command Dword 10, then the LPOL
field in Command Dword 12 and the LPOU field in Command Dword 13 of the
Get Log Page command should be cleared to zero.
Note: Clearing the RGO bit to ?0? and setting either the LPOL field or
the LPOU field to a non-zero value may return a log page that the host
is unable to parse.

Which I would interpret that we _have_ to use a buffer of the correct
size to retrieve the entire information.

Curiously, we should be able to use the LPOL and LPOU setting when the
RGO bit is set, ie when we're just enumerating ANA groups.

A quick calculation there reveals that the max size for the ANA log
page with the RGO bit set is about 2M (65535 descriptors * 32
bytes per descriptor + 16 byte header), which probably is a bit unwieldy
to use directly.

So this makes it a bit awkward to handle; we first have to get the ANA
log with the RGO bit set and a default size, then (possibly) re-issue
the ANA log with the RGO bit set and the correct size, then calculate
the actual size of the ANA log, and _then_ retrieve the full ANA log.

My, you don't really believe in making things easy for the
programmer ...

Fred: if you ever plan for a revisited ANA spec, _please_ change this.
Make the currently reserved field in the ANA log page header a max nsid
count (ie store the maximum number of NSIDs per ANA group here).
With that we have a max length of the ANA log page, and would have to
retrieve the log page only twice, not three times.

Cheers,

Hannes

^ permalink raw reply	[flat|nested] 33+ messages in thread

* [PATCH 5/5] nvme: ANA base support
  2018-05-11  7:50       ` Hannes Reinecke
@ 2018-05-11  8:22         ` Popuri, Sriram
  2018-05-11  8:36           ` Popuri, Sriram
  0 siblings, 1 reply; 33+ messages in thread
From: Popuri, Sriram @ 2018-05-11  8:22 UTC (permalink / raw)


My comments inline with tag "[Sriram]".

Regards,
~Sriram

-----Original Message-----
From: Hannes Reinecke <hare@suse.de> 
Sent: Friday, May 11, 2018 1:20 PM
To: Sriram Popuri <sgpopuri at gmail.com>
Cc: emilne at redhat.com; linux-nvme at lists.infradead.org; Popuri, Sriram <Sriram.Popuri at netapp.com>; Knight, Frederick <Frederick.Knight at netapp.com>; Meneghini, John <John.Meneghini at netapp.com>
Subject: Re: [PATCH 5/5] nvme: ANA base support

On Thu, 10 May 2018 13:58:05 +0530
Sriram Popuri <sgpopuri@gmail.com> wrote:

> Hi,
> 
>   I work for NetApp and leading efforts to add ANA support for our 
> NVMe/FC target. So me and my team are excited with this patch.So 
> thanks for working on this!
> We had some spec comments, but John M, Fred K and others have already 
> raised it. We have few more comments looking at the overall solution.
> Please consider addressing the following:
> 
> 
>    1. Ana log size
> 
> +             size_t ana_log_size = 4096;
> 
> This is hard coded. Don?t you expect to have a larger log page? Looks 
> like the number of ANA groups will be restricted to around 113 
> assuming one namespace identifier per ana group.
> 
> Can we have a larger ana_log_size? Maybe it should be MDTS?
> 
After revisiting the ANA Base protocol this indeed is a good question.
The spec explicitly has:

If the RGO bit is cleared to ?0? in Command Dword 10, then the LPOL field in Command Dword 12 and the LPOU field in Command Dword 13 of the Get Log Page command should be cleared to zero.
Note: Clearing the RGO bit to ?0? and setting either the LPOL field or the LPOU field to a non-zero value may return a log page that the host is unable to parse.

Which I would interpret that we _have_ to use a buffer of the correct size to retrieve the entire information.

[Sriram] That?s precisely the reason I suggested MDTS. Target would only give you back the size in MDTS and you can't read beyond MDTS since the spec doesn't allow you to read at some offset.

Curiously, we should be able to use the LPOL and LPOU setting when the RGO bit is set, ie when we're just enumerating ANA groups.

[Sriram] I second that and I did suggest this to Fred sometime back.

A quick calculation there reveals that the max size for the ANA log page with the RGO bit set is about 2M (65535 descriptors * 32 bytes per descriptor + 16 byte header), which probably is a bit unwieldy to use directly.

So this makes it a bit awkward to handle; we first have to get the ANA log with the RGO bit set and a default size, then (possibly) re-issue the ANA log with the RGO bit set and the correct size, then calculate the actual size of the ANA log, and _then_ retrieve the full ANA log.

[Sriram] Unfortunately you can't get the correct size with the method above because the spec says if RGO=1, num_nsid field is 0. So you can't figure out the full size of log page.

My, you don't really believe in making things easy for the programmer ...

Fred: if you ever plan for a revisited ANA spec, _please_ change this.
Make the currently reserved field in the ANA log page header a max nsid count (ie store the maximum number of NSIDs per ANA group here).
With that we have a max length of the ANA log page, and would have to retrieve the log page only twice, not three times.

Cheers,

Hannes

^ permalink raw reply	[flat|nested] 33+ messages in thread

* [PATCH 5/5] nvme: ANA base support
  2018-05-11  8:22         ` Popuri, Sriram
@ 2018-05-11  8:36           ` Popuri, Sriram
  2018-05-11 16:24             ` Knight, Frederick
  0 siblings, 1 reply; 33+ messages in thread
From: Popuri, Sriram @ 2018-05-11  8:36 UTC (permalink / raw)


Small corrections inline.

Regards,
~Sriram
-----Original Message-----
From: Linux-nvme <linux-nvme-bounces@lists.infradead.org> On Behalf Of Popuri, Sriram
Sent: Friday, May 11, 2018 1:52 PM
To: Hannes Reinecke <hare at suse.de>; Sriram Popuri <sgpopuri at gmail.com>
Cc: Knight, Frederick <Frederick.Knight at netapp.com>; linux-nvme at lists.infradead.org; emilne at redhat.com; Meneghini, John <John.Meneghini at netapp.com>
Subject: RE: [PATCH 5/5] nvme: ANA base support

My comments inline with tag "[Sriram]".

Regards,
~Sriram

-----Original Message-----
From: Hannes Reinecke <hare@suse.de>
Sent: Friday, May 11, 2018 1:20 PM
To: Sriram Popuri <sgpopuri at gmail.com>
Cc: emilne at redhat.com; linux-nvme at lists.infradead.org; Popuri, Sriram <Sriram.Popuri at netapp.com>; Knight, Frederick <Frederick.Knight at netapp.com>; Meneghini, John <John.Meneghini at netapp.com>
Subject: Re: [PATCH 5/5] nvme: ANA base support

On Thu, 10 May 2018 13:58:05 +0530
Sriram Popuri <sgpopuri@gmail.com> wrote:

> Hi,
> 
>   I work for NetApp and leading efforts to add ANA support for our 
> NVMe/FC target. So me and my team are excited with this patch.So 
> thanks for working on this!
> We had some spec comments, but John M, Fred K and others have already 
> raised it. We have few more comments looking at the overall solution.
> Please consider addressing the following:
> 
> 
>    1. Ana log size
> 
> +             size_t ana_log_size = 4096;
> 
> This is hard coded. Don?t you expect to have a larger log page? Looks 
> like the number of ANA groups will be restricted to around 113 
> assuming one namespace identifier per ana group.
> 
> Can we have a larger ana_log_size? Maybe it should be MDTS?
> 
After revisiting the ANA Base protocol this indeed is a good question.
The spec explicitly has:

If the RGO bit is cleared to ?0? in Command Dword 10, then the LPOL field in Command Dword 12 and the LPOU field in Command Dword 13 of the Get Log Page command should be cleared to zero.
Note: Clearing the RGO bit to ?0? and setting either the LPOL field or the LPOU field to a non-zero value may return a log page that the host is unable to parse.

Which I would interpret that we _have_ to use a buffer of the correct size to retrieve the entire information.

[Sriram] That?s precisely the reason I suggested MDTS. Target would only give you back at max the size in MDTS and you can't read beyond MDTS if the log size is larger since the spec doesn't allow you to read at some offset. So the targets have to restrict the config to be able to adjust the log page within MDTS (bad, but that?s how it is today).

Curiously, we should be able to use the LPOL and LPOU setting when the RGO bit is set, ie when we're just enumerating ANA groups.

A quick calculation there reveals that the max size for the ANA log page with the RGO bit set is about 2M (65535 descriptors * 32 bytes per descriptor + 16 byte header), which probably is a bit unwieldy to use directly.

So this makes it a bit awkward to handle; we first have to get the ANA log with the RGO bit set and a default size, then (possibly) re-issue the ANA log with the RGO bit set and the correct size, then calculate the actual size of the ANA log, and _then_ retrieve the full ANA log.

[Sriram] Unfortunately you can't get the correct size with the method above because the spec says if RGO=1, num_nsid field is 0. So you can't figure out the full size of log page.

My, you don't really believe in making things easy for the programmer ...

Fred: if you ever plan for a revisited ANA spec, _please_ change this.
Make the currently reserved field in the ANA log page header a max nsid count (ie store the maximum number of NSIDs per ANA group here).
With that we have a max length of the ANA log page, and would have to retrieve the log page only twice, not three times.

[Sriram] I second that and I did suggest this to Fred sometime back.

Cheers,

Hannes
_______________________________________________
Linux-nvme mailing list
Linux-nvme at lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-nvme

^ permalink raw reply	[flat|nested] 33+ messages in thread

* [PATCH 5/5] nvme: ANA base support
  2018-05-11  8:36           ` Popuri, Sriram
@ 2018-05-11 16:24             ` Knight, Frederick
  0 siblings, 0 replies; 33+ messages in thread
From: Knight, Frederick @ 2018-05-11 16:24 UTC (permalink / raw)


FK> INLINE

-----Original Message-----
From: Popuri, Sriram 
Sent: Friday, May 11, 2018 4:36 AM
To: Popuri, Sriram <Sriram.Popuri at netapp.com>; Hannes Reinecke <hare at suse.de>; Sriram Popuri <sgpopuri at gmail.com>
Cc: Knight, Frederick <Frederick.Knight at netapp.com>; linux-nvme at lists.infradead.org; emilne at redhat.com; Meneghini, John <John.Meneghini at netapp.com>
Subject: RE: [PATCH 5/5] nvme: ANA base support

Small corrections inline.

Regards,
~Sriram
-----Original Message-----
From: Linux-nvme <linux-nvme-bounces@lists.infradead.org> On Behalf Of Popuri, Sriram
Sent: Friday, May 11, 2018 1:52 PM
To: Hannes Reinecke <hare at suse.de>; Sriram Popuri <sgpopuri at gmail.com>
Cc: Knight, Frederick <Frederick.Knight at netapp.com>; linux-nvme at lists.infradead.org; emilne at redhat.com; Meneghini, John <John.Meneghini at netapp.com>
Subject: RE: [PATCH 5/5] nvme: ANA base support

My comments inline with tag "[Sriram]".

Regards,
~Sriram

-----Original Message-----
From: Hannes Reinecke <hare@suse.de>
Sent: Friday, May 11, 2018 1:20 PM
To: Sriram Popuri <sgpopuri at gmail.com>
Cc: emilne at redhat.com; linux-nvme at lists.infradead.org; Popuri, Sriram <Sriram.Popuri at netapp.com>; Knight, Frederick <Frederick.Knight at netapp.com>; Meneghini, John <John.Meneghini at netapp.com>
Subject: Re: [PATCH 5/5] nvme: ANA base support

On Thu, 10 May 2018 13:58:05 +0530
Sriram Popuri <sgpopuri@gmail.com> wrote:

> Hi,
> 
>   I work for NetApp and leading efforts to add ANA support for our 
> NVMe/FC target. So me and my team are excited with this patch.So 
> thanks for working on this!
> We had some spec comments, but John M, Fred K and others have already 
> raised it. We have few more comments looking at the overall solution.
> Please consider addressing the following:

[] FK> All spec comments that were raised were addressed.  There were no outstanding comments at the time the spec was approved (or now for that matter).

>    1. Ana log size
> 
> +             size_t ana_log_size = 4096;
> 
> This is hard coded. Don?t you expect to have a larger log page? Looks 
> like the number of ANA groups will be restricted to around 113 
> assuming one namespace identifier per ana group.
> 
> Can we have a larger ana_log_size? Maybe it should be MDTS?
> 
After revisiting the ANA Base protocol this indeed is a good question.
The spec explicitly has:

If the RGO bit is cleared to ?0? in Command Dword 10, then the LPOL field in Command Dword 12 and the LPOU field in Command Dword 13 of the Get Log Page command should be cleared to zero.
Note: Clearing the RGO bit to ?0? and setting either the LPOL field or the LPOU field to a non-zero value may return a log page that the host is unable to parse.

Which I would interpret that we _have_ to use a buffer of the correct size to retrieve the entire information.

[] FK> NO, you don't "have_to".  LPOL and LPOU are fine to use when RGO=1 (if you are getting a fixed definition data structure, then you a priory know where things are, and therefore, what offset to use to get to where you want to get).  If however, RGO=0, then the definition of the data structure is variable (the contents of offset X+n is dependent on the contents of the n bytes that come before X).  If RGO=0, the number of NSID values in each group can change (at any time), and therefore, change where things are - and therefore change the offset value you need to use - to get where you think you should be getting.  BUT, if you want to risk it, or if you've invented a unique new algorithm to find stuff in that variable structure - go for it - it is just a "should".

[Sriram] That?s precisely the reason I suggested MDTS. Target would only give you back at max the size in MDTS and you can't read beyond MDTS if the log size is larger since the spec doesn't allow you to read at some offset. So the targets have to restrict the config to be able to adjust the log page within MDTS (bad, but that?s how it is today).

[] FK> Wrong.  With RGO=1, it is fine to use the offset fields to work your way through a long log page.

Curiously, we should be able to use the LPOL and LPOU setting when the RGO bit is set, ie when we're just enumerating ANA groups.

[] FK> That is the way it was designed.

A quick calculation there reveals that the max size for the ANA log page with the RGO bit set is about 2M (65535 descriptors * 32 bytes per descriptor + 16 byte header), which probably is a bit unwieldy to use directly.

[] FK> You missed the MNAN field in that calculation.  The buffer only needs to be large enough to hold the maximum number of NSID values, not up to the maximum NSID value - you don't need to reserve space for a whole bunch of values that are never used.

So this makes it a bit awkward to handle; we first have to get the ANA log with the RGO bit set and a default size, then (possibly) re-issue the ANA log with the RGO bit set and the correct size, then calculate the actual size of the ANA log, and _then_ retrieve the full ANA log.

[] FK> Again, you missed the MNAN field - just use that, and do it once.

[Sriram] Unfortunately you can't get the correct size with the method above because the spec says if RGO=1, num_nsid field is 0. So you can't figure out the full size of log page.

My, you don't really believe in making things easy for the programmer ...

[] FK> This was agreed to by all the people on the call (which included some host programmers).

Fred: if you ever plan for a revisited ANA spec, _please_ change this.
Make the currently reserved field in the ANA log page header a max nsid count (ie store the maximum number of NSIDs per ANA group here).
With that we have a max length of the ANA log page, and would have to retrieve the log page only twice, not three times.

[] FK> You already have that with the MNAN field.

[Sriram] I second that and I did suggest this to Fred sometime back.

[] FK> Which is why we added the MNAN field.

Cheers,

Hannes
_______________________________________________
Linux-nvme mailing list
Linux-nvme at lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-nvme

^ permalink raw reply	[flat|nested] 33+ messages in thread

* [PATCH 5/5] nvme: ANA base support
  2018-05-04 11:28 ` [PATCH 5/5] nvme: ANA base support Hannes Reinecke
                     ` (4 preceding siblings ...)
  2018-05-09 19:03   ` Ewan D. Milne
@ 2018-05-12 13:31   ` Christoph Hellwig
  2018-05-13 10:33     ` Hannes Reinecke
  5 siblings, 1 reply; 33+ messages in thread
From: Christoph Hellwig @ 2018-05-12 13:31 UTC (permalink / raw)


> +static void nvme_get_ana_log(struct nvme_ctrl *ctrl, struct nvme_ns *ns)

Passing a ns to the function reading the ANA log doesn't make any
sense.  The ANA log page is global to the controller, so we should
only read it when rescanning the controller, or when the AER
triggers.

> +static void nvme_get_full_ana_log(struct nvme_ctrl *ctrl)
> +{
> +	int i, j;
> +	struct nvme_ns *ns;
> +	struct nvmf_ana_rsp_page_header *ana_log;
> +	size_t ana_log_size = 4096;

This needs to be size based on the MNAN and NANAGRPID fields.

> +
> +	ana_log = kzalloc(ana_log_size, GFP_KERNEL);
> +	if (!ana_log)
> +		return;

Please preallocate a per-controller buffer at controller scanning
time for this.  By the time path changes hit we might be under
sever memory pressure and want things to just work.

> +
> +	if (nvme_get_log(ctrl, NVME_LOG_ANA, ana_log, ana_log_size))
> +		dev_warn(ctrl->device,
> +			 "Get ANA log error\n");

I think we need some sane error handling here.

> +	list_for_each_entry(ns, &ctrl->namespaces, list) {
> +		for (i = 0; i < ana_log->grpid_num; i++) {
> +			struct nvmf_ana_group_descriptor *desc =
> +				&ana_log->desc[i];
> +			for (j = 0; j < desc->nsid_num; j++) {
> +				if (desc->nsid[j] == ns->head->ns_id) {
> +					ns->ana_state = desc->ana_state;
> +					ns->ana_group = desc->groupid;

What protects us from concurrent updates?  You only hold the
shared semaphore in the caller (which probably should move into
the function to start with).

> +	if ((a == &dev_attr_ana_state.attr) ||
> +	    (a == &dev_attr_ana_group.attr)) {

No need for the inner braces.

> +	/* First round: select from all ANA optimized paths */
>  	list_for_each_entry_rcu(ns, &head->list, siblings) {
> -		if (ns->ctrl->state == NVME_CTRL_LIVE) {
> +		if (ns->ctrl->state == NVME_CTRL_LIVE &&
> +		    ns->ana_state == NVME_ANA_STATE_OPTIMIZED) {
> +			rcu_assign_pointer(head->current_path, ns);
> +			return ns;
> +		}
> +	}
> +	/* Second round: select from all ANA non-optimized paths */
> +	list_for_each_entry_rcu(ns, &head->list, siblings) {
> +		if (ns->ctrl->state == NVME_CTRL_LIVE &&
> +		    ns->ana_state == NVME_ANA_STATE_NONOPTIMIZED) {
>  			rcu_assign_pointer(head->current_path, ns);
>  			return ns;

Based on the last setences in section 8.18.3.3 of the ANA spec
we should also do a last restort attempt at inaccseesible paths.

Also I think we should just pass the state to __nvme_find_path and
retry in the caller to avoid code duplication.

Note that this code seems to miss any handling of the transitioning
state and the error codes related to it, and also doesn't work around
the nasty bit in the spec that allows inaccesible states to report
incorrect capacity, which we'll need to work around by stealing that
data from other namespaces.

^ permalink raw reply	[flat|nested] 33+ messages in thread

* [PATCH 5/5] nvme: ANA base support
  2018-05-12 13:31   ` Christoph Hellwig
@ 2018-05-13 10:33     ` Hannes Reinecke
  0 siblings, 0 replies; 33+ messages in thread
From: Hannes Reinecke @ 2018-05-13 10:33 UTC (permalink / raw)


On 05/12/2018 03:31 PM, Christoph Hellwig wrote:
>> +static void nvme_get_ana_log(struct nvme_ctrl *ctrl, struct nvme_ns *ns)
> 
> Passing a ns to the function reading the ANA log doesn't make any
> sense.  The ANA log page is global to the controller, so we should
> only read it when rescanning the controller, or when the AER
> triggers.
> 
>> +static void nvme_get_full_ana_log(struct nvme_ctrl *ctrl)
>> +{
>> +	int i, j;
>> +	struct nvme_ns *ns;
>> +	struct nvmf_ana_rsp_page_header *ana_log;
>> +	size_t ana_log_size = 4096;
> 
> This needs to be size based on the MNAN and NANAGRPID fields.
> 
Yes, already fixed up for the next round.

>> +
>> +	ana_log = kzalloc(ana_log_size, GFP_KERNEL);
>> +	if (!ana_log)
>> +		return;
> 
> Please preallocate a per-controller buffer at controller scanning
> time for this.  By the time path changes hit we might be under
> sever memory pressure and want things to just work.
> 
Okay, will do.

>> +
>> +	if (nvme_get_log(ctrl, NVME_LOG_ANA, ana_log, ana_log_size))
>> +		dev_warn(ctrl->device,
>> +			 "Get ANA log error\n");
> 
> I think we need some sane error handling here.
> 
True; the next round will have an error handling here.

>> +	list_for_each_entry(ns, &ctrl->namespaces, list) {
>> +		for (i = 0; i < ana_log->grpid_num; i++) {
>> +			struct nvmf_ana_group_descriptor *desc =
>> +				&ana_log->desc[i];
>> +			for (j = 0; j < desc->nsid_num; j++) {
>> +				if (desc->nsid[j] == ns->head->ns_id) {
>> +					ns->ana_state = desc->ana_state;
>> +					ns->ana_group = desc->groupid;
> 
> What protects us from concurrent updates?  You only hold the
> shared semaphore in the caller (which probably should move into
> the function to start with).
> 
I'll be moving the function to get the ANA log page to a workqueue, so 
we'll get an implicit serialisation from there.
And obviously we need to get the namespace mutex when traversing the 
list of namspaces.

>> +	if ((a == &dev_attr_ana_state.attr) ||
>> +	    (a == &dev_attr_ana_group.attr)) {
> 
> No need for the inner braces.
> 
Ok.

>> +	/* First round: select from all ANA optimized paths */
>>   	list_for_each_entry_rcu(ns, &head->list, siblings) {
>> -		if (ns->ctrl->state == NVME_CTRL_LIVE) {
>> +		if (ns->ctrl->state == NVME_CTRL_LIVE &&
>> +		    ns->ana_state == NVME_ANA_STATE_OPTIMIZED) {
>> +			rcu_assign_pointer(head->current_path, ns);
>> +			return ns;
>> +		}
>> +	}
>> +	/* Second round: select from all ANA non-optimized paths */
>> +	list_for_each_entry_rcu(ns, &head->list, siblings) {
>> +		if (ns->ctrl->state == NVME_CTRL_LIVE &&
>> +		    ns->ana_state == NVME_ANA_STATE_NONOPTIMIZED) {
>>   			rcu_assign_pointer(head->current_path, ns);
>>   			return ns;
> 
> Based on the last setences in section 8.18.3.3 of the ANA spec
> we should also do a last restort attempt at inaccseesible paths.
> 
Okay; no problem with that.

> Also I think we should just pass the state to __nvme_find_path and
> retry in the caller to avoid code duplication.
> 
Hmm. Not sure if it that really simplifies things, but I'll have a look.

> Note that this code seems to miss any handling of the transitioning
> state and the error codes related to it, and also doesn't work around
> the nasty bit in the spec that allows inaccesible states to report
> incorrect capacity, which we'll need to work around by stealing that
> data from other namespaces.
> 
Yeah, that's particularly mean. I've added some code the nvmet, though, 
so we should be able to test this one.

Cheers,

Hannes

^ permalink raw reply	[flat|nested] 33+ messages in thread

end of thread, other threads:[~2018-05-13 10:33 UTC | newest]

Thread overview: 33+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2018-05-04 11:28 [RFC PATCH 0/5] nvme: ANA support Hannes Reinecke
2018-05-04 11:28 ` [PATCH 1/5] nvmet: EUI64 support Hannes Reinecke
2018-05-07  7:17   ` Johannes Thumshirn
2018-05-09  7:08   ` Christoph Hellwig
2018-05-09  7:45     ` Hannes Reinecke
2018-05-09  7:52       ` Christoph Hellwig
2018-05-09 15:43   ` Ewan D. Milne
2018-05-04 11:28 ` [PATCH 2/5] nvme: Add ANA base definitions Hannes Reinecke
2018-05-04 16:17   ` Keith Busch
2018-05-04 17:03     ` Meneghini, John
2018-05-04 17:21       ` Knight, Frederick
2018-05-05 13:17       ` Hannes Reinecke
2018-05-04 21:12   ` Schremmer, Steven
2018-05-09  7:27   ` Christoph Hellwig
2018-05-04 11:28 ` [PATCH 3/5] nvmet: Add ANA base support Hannes Reinecke
2018-05-06  2:42   ` Guan Junxiong
2018-05-04 11:28 ` [PATCH 4/5] block: BLK_STS_NEXUS is a path failure Hannes Reinecke
2018-05-09  7:31   ` Christoph Hellwig
2018-05-04 11:28 ` [PATCH 5/5] nvme: ANA base support Hannes Reinecke
2018-05-04 22:11   ` Schremmer, Steven
2018-05-05 13:23     ` Hannes Reinecke
2018-05-06  3:23   ` Guan Junxiong
2018-05-07  7:21   ` Johannes Thumshirn
2018-05-09 18:49   ` Ewan D. Milne
2018-05-09 19:03   ` Ewan D. Milne
2018-05-10  9:16     ` Sriram Popuri
     [not found]     ` <CAGYjvj0Mk0MFAfUEApOOyQ9Prm3CvGcZH14PJzDQT2+Qc+w81w@mail.gmail.com>
2018-05-10 13:40       ` Hannes Reinecke
2018-05-11  7:50       ` Hannes Reinecke
2018-05-11  8:22         ` Popuri, Sriram
2018-05-11  8:36           ` Popuri, Sriram
2018-05-11 16:24             ` Knight, Frederick
2018-05-12 13:31   ` Christoph Hellwig
2018-05-13 10:33     ` Hannes Reinecke

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.