* [PATCH 1/4] nvme-fabrics: add discovery controller type
       [not found] <20220125145956.14746-1-nitram_67@hotmail.com>
@ 2022-01-25 14:59 ` Martin Belanger
  2022-01-27  9:16   ` Hannes Reinecke
  2022-01-27 13:16   ` Sagi Grimberg
  2022-01-25 14:59 ` [PATCH 2/4] nvme: add host symbolic name Martin Belanger
                   ` (2 subsequent siblings)
  3 siblings, 2 replies; 32+ messages in thread
From: Martin Belanger @ 2022-01-25 14:59 UTC (permalink / raw)
  To: linux-nvme; +Cc: kbusch, hare, axboe, hch, sagi, Martin Belanger

From: Martin Belanger <martin.belanger@dell.com>

TP8010 introduces the Central Discovery Controller (CDC). To
differentiate between a CDC and a DDC (Direct DC), the identify
response now carries a DCTYPE (Discovery Controller Type) field.

Signed-off-by: Martin Belanger <martin.belanger@dell.com>
---
 drivers/nvme/host/core.c |  3 +++
 drivers/nvme/host/nvme.h |  2 ++
 include/linux/nvme.h     | 10 +++++++++-
 3 files changed, 14 insertions(+), 1 deletion(-)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 4fc794d9c2f4..cd34b92e8e9d 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -2978,6 +2978,7 @@ static int nvme_init_identify(struct nvme_ctrl *ctrl)
 	ctrl->kas = le16_to_cpu(id->kas);
 	ctrl->max_namespaces = le32_to_cpu(id->mnan);
 	ctrl->ctratt = le32_to_cpu(id->ctratt);
+	ctrl->dctype = id->dctype;
 
 	if (id->rtd3e) {
 		/* us -> s */
@@ -3335,6 +3336,7 @@ nvme_show_int_function(numa_node);
 nvme_show_int_function(queue_count);
 nvme_show_int_function(sqsize);
 nvme_show_int_function(kato);
+nvme_show_int_function(dctype);
 
 static ssize_t nvme_sysfs_delete(struct device *dev,
 				struct device_attribute *attr, const char *buf,
@@ -3533,6 +3535,7 @@ static struct attribute *nvme_dev_attrs[] = {
 	&dev_attr_reconnect_delay.attr,
 	&dev_attr_fast_io_fail_tmo.attr,
 	&dev_attr_kato.attr,
+	&dev_attr_dctype.attr,
 	NULL
 };
 
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index fe224016418e..f8b60f7346fd 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -349,6 +349,8 @@ struct nvme_ctrl {
 	unsigned long discard_page_busy;
 
 	struct nvme_fault_inject fault_inject;
+
+	enum nvme_dctype dctype;
 };
 
 enum nvme_iopolicy {
diff --git a/include/linux/nvme.h b/include/linux/nvme.h
index 855dd9b3e84b..82567d493c51 100644
--- a/include/linux/nvme.h
+++ b/include/linux/nvme.h
@@ -240,6 +240,12 @@ enum nvme_ctrl_attr {
 	NVME_CTRL_ATTR_TBKAS		= (1 << 6),
 };
 
+enum nvme_dctype {
+	NVME_DCTYPE_NOT_REPORTED = 0,
+	NVME_DCTYPE_DDC = 1,
+	NVME_DCTYPE_CDC = 2,
+};
+
 struct nvme_id_ctrl {
 	__le16			vid;
 	__le16			ssvid;
@@ -320,7 +326,9 @@ struct nvme_id_ctrl {
 	__le16			icdoff;
 	__u8			ctrattr;
 	__u8			msdbd;
-	__u8			rsvd1804[244];
+	__le16			ofcs;
+	__u8			dctype;
+	__u8			rsvd1807[241];
 	struct nvme_id_power_state	psd[32];
 	__u8			vs[1024];
 };
-- 
2.34.1

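A minimal user-space sketch (not part of the series) that reads the
new dctype attribute; the controller name nvme0 and the sysfs path are
assumptions:

#include <stdio.h>

int main(void)
{
	char buf[16];
	FILE *f = fopen("/sys/class/nvme/nvme0/dctype", "r"); /* assumed */

	if (!f) {
		perror("dctype");
		return 1;
	}
	if (fgets(buf, sizeof(buf), f))
		printf("dctype=%s", buf); /* 0=not reported, 1=DDC, 2=CDC */
	fclose(f);
	return 0;
}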



* [PATCH 2/4] nvme: add host symbolic name
       [not found] <20220125145956.14746-1-nitram_67@hotmail.com>
  2022-01-25 14:59 ` [PATCH 1/4] nvme-fabrics: add discovery controller type Martin Belanger
@ 2022-01-25 14:59 ` Martin Belanger
  2022-01-27  9:18   ` Hannes Reinecke
  2022-01-27 13:20   ` Sagi Grimberg
  2022-01-25 14:59 ` [PATCH 3/4] nvme-fabrics: add tp8010 support Martin Belanger
  2022-01-25 14:59 ` [PATCH 4/4] nvme-fabrics: add explicit deregistration on disconnect Martin Belanger
  3 siblings, 2 replies; 32+ messages in thread
From: Martin Belanger @ 2022-01-25 14:59 UTC (permalink / raw)
  To: linux-nvme; +Cc: kbusch, hare, axboe, hch, sagi, Martin Belanger

From: Martin Belanger <martin.belanger@dell.com>

TP8010 introduces the 'host symbolic name' as an optional
parameter for explicit registration with a central discovery
controller.

Signed-off-by: Martin Belanger <martin.belanger@dell.com>
---
 drivers/nvme/host/core.c    | 14 ++++++++++++++
 drivers/nvme/host/fabrics.c | 17 +++++++++++++++--
 drivers/nvme/host/fabrics.h |  2 ++
 include/linux/nvme.h        |  2 ++
 4 files changed, 33 insertions(+), 2 deletions(-)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index cd34b92e8e9d..cf5d9984f8f6 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -3404,6 +3404,16 @@ static ssize_t nvme_sysfs_show_hostnqn(struct device *dev,
 }
 static DEVICE_ATTR(hostnqn, S_IRUGO, nvme_sysfs_show_hostnqn, NULL);
 
+static ssize_t nvme_sysfs_show_hostsymname(struct device *dev,
+					struct device_attribute *attr,
+					char *buf)
+{
+	struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
+
+	return sysfs_emit(buf, "%s\n", ctrl->opts->host->symname);
+}
+static DEVICE_ATTR(hostsymname, S_IRUGO, nvme_sysfs_show_hostsymname, NULL);
+
 static ssize_t nvme_sysfs_show_hostid(struct device *dev,
 					struct device_attribute *attr,
 					char *buf)
@@ -3531,6 +3541,7 @@ static struct attribute *nvme_dev_attrs[] = {
 	&dev_attr_sqsize.attr,
 	&dev_attr_hostnqn.attr,
 	&dev_attr_hostid.attr,
+	&dev_attr_hostsymname.attr,
 	&dev_attr_ctrl_loss_tmo.attr,
 	&dev_attr_reconnect_delay.attr,
 	&dev_attr_fast_io_fail_tmo.attr,
@@ -3553,6 +3564,9 @@ static umode_t nvme_dev_attrs_are_visible(struct kobject *kobj,
 		return 0;
 	if (a == &dev_attr_hostid.attr && !ctrl->opts)
 		return 0;
+	if (a == &dev_attr_hostsymname.attr &&
+	    (!ctrl->opts || (ctrl->opts->host->symname[0] == '\0')))
+		return 0;
 	if (a == &dev_attr_ctrl_loss_tmo.attr && !ctrl->opts)
 		return 0;
 	if (a == &dev_attr_reconnect_delay.attr && !ctrl->opts)
diff --git a/drivers/nvme/host/fabrics.c b/drivers/nvme/host/fabrics.c
index 7ae041e2b3fb..040a6cce6afa 100644
--- a/drivers/nvme/host/fabrics.c
+++ b/drivers/nvme/host/fabrics.c
@@ -540,6 +540,7 @@ static const match_table_t opt_tokens = {
 	{ NVMF_OPT_HOST_TRADDR,		"host_traddr=%s"	},
 	{ NVMF_OPT_HOST_IFACE,		"host_iface=%s"		},
 	{ NVMF_OPT_HOST_ID,		"hostid=%s"		},
+	{ NVMF_OPT_SYMNAME,		"hostsymname=%s"	},
 	{ NVMF_OPT_DUP_CONNECT,		"duplicate_connect"	},
 	{ NVMF_OPT_DISABLE_SQFLOW,	"disable_sqflow"	},
 	{ NVMF_OPT_HDR_DIGEST,		"hdr_digest"		},
@@ -556,7 +557,7 @@ static int nvmf_parse_options(struct nvmf_ctrl_options *opts,
 		const char *buf)
 {
 	substring_t args[MAX_OPT_ARGS];
-	char *options, *o, *p;
+	char *options, *o, *p, *hostsymname = NULL;
 	int token, ret = 0;
 	size_t nqnlen  = 0;
 	int ctrl_loss_tmo = NVMF_DEF_CTRL_LOSS_TMO;
@@ -830,6 +831,13 @@ static int nvmf_parse_options(struct nvmf_ctrl_options *opts,
 		case NVMF_OPT_DISCOVERY:
 			opts->discovery_nqn = true;
 			break;
+		case NVMF_OPT_SYMNAME:
+			hostsymname = match_strdup(args);
+			if (!hostsymname) {
+				ret = -ENOMEM;
+				goto out;
+			}
+			break;
 		default:
 			pr_warn("unknown parameter or missing value '%s' in ctrl creation request\n",
 				p);
@@ -864,8 +872,13 @@ static int nvmf_parse_options(struct nvmf_ctrl_options *opts,
 
 	uuid_copy(&opts->host->id, &hostid);
 
+	if (hostsymname)
+		strncpy(opts->host->symname, hostsymname,
+			sizeof(opts->host->symname) - 1);
+
 out:
 	kfree(options);
+	kfree(hostsymname);
 	return ret;
 }
 
@@ -957,7 +970,7 @@ EXPORT_SYMBOL_GPL(nvmf_free_options);
 				 NVMF_OPT_KATO | NVMF_OPT_HOSTNQN | \
 				 NVMF_OPT_HOST_ID | NVMF_OPT_DUP_CONNECT |\
 				 NVMF_OPT_DISABLE_SQFLOW | NVMF_OPT_DISCOVERY |\
-				 NVMF_OPT_FAIL_FAST_TMO)
+				 NVMF_OPT_FAIL_FAST_TMO | NVMF_OPT_SYMNAME)
 
 static struct nvme_ctrl *
 nvmf_create_ctrl(struct device *dev, const char *buf)
diff --git a/drivers/nvme/host/fabrics.h b/drivers/nvme/host/fabrics.h
index c3203ff1c654..494e6dbe233a 100644
--- a/drivers/nvme/host/fabrics.h
+++ b/drivers/nvme/host/fabrics.h
@@ -38,6 +38,7 @@ struct nvmf_host {
 	struct list_head	list;
 	char			nqn[NVMF_NQN_SIZE];
 	uuid_t			id;
+	char			symname[NVMF_HOSTSYMNAME_SIZE+1];
 };
 
 /**
@@ -68,6 +69,7 @@ enum {
 	NVMF_OPT_FAIL_FAST_TMO	= 1 << 20,
 	NVMF_OPT_HOST_IFACE	= 1 << 21,
 	NVMF_OPT_DISCOVERY	= 1 << 22,
+	NVMF_OPT_SYMNAME	= 1 << 23,
 };
 
 /**
diff --git a/include/linux/nvme.h b/include/linux/nvme.h
index 82567d493c51..3b47951342f4 100644
--- a/include/linux/nvme.h
+++ b/include/linux/nvme.h
@@ -16,6 +16,8 @@
 /* However the max length of a qualified name is another size */
 #define NVMF_NQN_SIZE		223
 
+#define NVMF_HOSTSYMNAME_SIZE	256
+
 #define NVMF_TRSVCID_SIZE	32
 #define NVMF_TRADDR_SIZE	256
 #define NVMF_TSAS_SIZE		256
-- 
2.34.1

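A hedged user-space sketch of how the new option could reach the
kernel through the fabrics pseudo-device; the option string mirrors the
hostsymname= token added above, but the address and names are made up:

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	/* Option string assumed; parsed by nvmf_parse_options() */
	const char *args = "transport=tcp,traddr=192.168.1.10,"
			   "nqn=nqn.2014-08.org.nvmexpress.discovery,"
			   "hostsymname=host-42";
	int fd = open("/dev/nvme-fabrics", O_RDWR);

	if (fd < 0 || write(fd, args, strlen(args)) < 0) {
		perror("nvme-fabrics");
		return 1;
	}
	close(fd);
	return 0;
}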



* [PATCH 3/4] nvme-fabrics: add tp8010 support
       [not found] <20220125145956.14746-1-nitram_67@hotmail.com>
  2022-01-25 14:59 ` [PATCH 1/4] nvme-fabrics: add discovery controller type Martin Belanger
  2022-01-25 14:59 ` [PATCH 2/4] nvme: add host symbolic name Martin Belanger
@ 2022-01-25 14:59 ` Martin Belanger
  2022-01-27  9:30   ` Hannes Reinecke
  2022-01-31 17:03   ` John Meneghini
  2022-01-25 14:59 ` [PATCH 4/4] nvme-fabrics: add explicit deregistration on disconnect Martin Belanger
  3 siblings, 2 replies; 32+ messages in thread
From: Martin Belanger @ 2022-01-25 14:59 UTC (permalink / raw)
  To: linux-nvme; +Cc: kbusch, hare, axboe, hch, sagi, Martin Belanger

From: Martin Belanger <martin.belanger@dell.com>

TP8010 introduces Central Discovery Controllers (CDC) and the
Discovery Information Management (DIM) PDU, which is used to
perform explicit registration with Discovery Controllers.

This patch defines the DIM PDU and adds logic to perform explicit
registration with DCs when registration has been requested by the
user.

Signed-off-by: Martin Belanger <martin.belanger@dell.com>
---
 drivers/nvme/host/fabrics.c | 164 +++++++++++++++++++++++
 drivers/nvme/host/fabrics.h |   9 ++
 drivers/nvme/host/tcp.c     |  51 +++++++-
 include/linux/nvme.h        | 255 ++++++++++++++++++++++++++++++++++--
 4 files changed, 467 insertions(+), 12 deletions(-)

diff --git a/drivers/nvme/host/fabrics.c b/drivers/nvme/host/fabrics.c
index 040a6cce6afa..e6fc2f85b670 100644
--- a/drivers/nvme/host/fabrics.c
+++ b/drivers/nvme/host/fabrics.c
@@ -10,6 +10,7 @@
 #include <linux/mutex.h>
 #include <linux/parser.h>
 #include <linux/seq_file.h>
+#include <linux/utsname.h>
 #include "nvme.h"
 #include "fabrics.h"
 
@@ -526,6 +527,165 @@ static struct nvmf_transport_ops *nvmf_lookup_transport(
 	return NULL;
 }
 
+/**
+ * nvmf_get_tel() - Calculate the amount of memory needed for a Discovery
+ * Information Entry. Each Entry must contain at a minimum an Extended
+ * Attribute for the HostID. The Entry may optionally contain an Extended
+ * Attribute for the Symbolic Name.
+ *
+ * @hostsymname - Symbolic name (may be NULL)
+ *
+ * Return Total Entry Length
+ */
+static u32 nvmf_get_tel(const char *hostsymname)
+{
+	u32 tel = SIZEOF_EXT_DIE_WO_EXAT;
+	u16 len;
+
+	/* Host ID is mandatory */
+	tel += exat_size(sizeof(uuid_t));
+
+	/* Symbolic name is optional */
+	len = hostsymname ? strlen(hostsymname) : 0;
+	if (len)
+		tel += exat_size(len);
+
+	return tel;
+}
+
+/**
+ * nvmf_fill_die() - Fill the Discovery Information Entry pointed to by
+ *                   @die.
+ *
+ * @die - Pointer to Discovery Information Entry to be filled
+ * @tel - Length of the DIE
+ * @trtype - Transport type
+ * @adrfam - Address family
+ * @reg_addr - Address to register. Setting this to an empty string tells
+ *       the DC to infer address from the source address of the socket.
+ * @host - Host structure providing the Host NQN, the Host ID,
+ *       and the Host symbolic name that are placed in the
+ *       entry's extended attributes
+ * @tsas - Transport Specific Address Subtype for the address being
+ *       registered.
+ */
+static void nvmf_fill_die(struct nvmf_ext_die  *die,
+			  u32                   tel,
+			  u8                    trtype,
+			  u8                    adrfam,
+			  const char           *reg_addr,
+			  struct nvmf_host     *host,
+			  union nvmf_tsas      *tsas)
+{
+	u16 numexat = 0;
+	size_t symname_len;
+	struct nvmf_ext_attr *exat;
+
+	die->tel = cpu_to_le32(tel);
+	die->trtype = trtype;
+	die->adrfam = adrfam;
+
+	strncpy(die->nqn, host->nqn, sizeof(die->nqn));
+	strncpy(die->traddr, reg_addr, sizeof(die->traddr));
+	memcpy(&die->tsas, tsas, sizeof(die->tsas));
+
+	/* Extended Attribute for the HostID (mandatory) */
+	numexat++;
+	exat = &die->exat;
+	exat->type = cpu_to_le16(NVME_EXATTYPE_HOSTID);
+	exat->len  = cpu_to_le16(exat_len(sizeof(host->id)));
+	uuid_copy((uuid_t *)exat->val, &host->id);
+
+	/* Extended Attribute for the Symbolic Name (optional) */
+	symname_len = strlen(host->symname);
+	if (symname_len) {
+		u16 exatlen = exat_len(symname_len);
+
+		numexat++;
+		exat = exat_ptr_next(exat);
+		exat->type = cpu_to_le16(NVME_EXATTYPE_SYMNAME);
+		exat->len  = cpu_to_le16(exatlen);
+		memcpy(exat->val, host->symname, symname_len);
+		/* Per Base specs, ASCII strings must be padded with spaces */
+		memset(&exat->val[symname_len], ' ', exatlen - symname_len);
+	}
+
+	die->numexat = cpu_to_le16(numexat);
+}
+
+/**
+ * nvmf_disc_info_mgmt() - Perform explicit registration, deregistration,
+ * or registration-update (specified by @tas) by sending a Discovery
+ * Information Management (DIM) command to the Discovery Controller (DC).
+ *
+ * @ctrl: Host NVMe controller instance maintaining the admin queue used to
+ *      submit the DIM command to the DC.
+ * @tas: Task field of the Command Dword 10 (cdw10). Indicates whether to
+ *     perform a Registration, Deregistration, or Registration-update.
+ * @trtype: Transport type (e.g. NVMF_TRTYPE_TCP)
+ * @adrfam: Address family (e.g. NVMF_ADDR_FAMILY_IP6)
+ * @reg_addr - Address to register. Setting this to an empty string tells
+ *       the DC to infer address from the source address of the socket.
+ * @tsas - Transport Specific Address Subtype for the address being
+ *       registered.
+ *
+ * Return:   0: success,
+ *         > 0: NVMe error status code,
+ *         < 0: Linux errno error code
+ */
+int nvmf_disc_info_mgmt(struct nvme_ctrl *ctrl,
+			enum nvmf_dim_tas tas,
+			u8                trtype,
+			u8                adrfam,
+			const char       *reg_addr,
+			union nvmf_tsas  *tsas)
+{
+	int                   ret;
+	u32                   tdl;  /* Total Data Length */
+	u32                   tel;  /* Total Entry Length */
+	struct nvmf_dim_data *dim;  /* Discovery Information Management */
+	struct nvmf_ext_die  *die;  /* Discovery Information Entry */
+	struct nvme_command   cmd = {};
+	union nvme_result     result;
+
+	/* Register 1 Discovery Information Entry (DIE) of size TEL */
+	tel = nvmf_get_tel(ctrl->opts->host->symname);
+	tdl = SIZEOF_DIM_WO_DIE + tel;
+
+	dim = kzalloc(tdl, GFP_KERNEL);
+	if (!dim)
+		return -ENOMEM;
+
+	cmd.dim.opcode = nvme_admin_dim;
+	cmd.dim.cdw10  = cpu_to_le32(tas);
+
+	dim->tdl    = cpu_to_le32(tdl);
+	dim->nument = cpu_to_le64(1);    /* only 1 DIE to register */
+	dim->entfmt = cpu_to_le16(NVME_ENTFMT_EXTENDED);
+	dim->etype  = cpu_to_le16(NVME_REGISTRATION_HOST);
+	dim->ektype = cpu_to_le16(0x5F); /* must be 0x5F per specs */
+
+	strncpy(dim->eid, ctrl->opts->host->nqn, sizeof(dim->eid));
+	strncpy(dim->ename, utsname()->nodename, sizeof(dim->ename));
+	strncpy(dim->ever, utsname()->release, sizeof(dim->ever));
+
+	die = &dim->die.extended;
+	nvmf_fill_die(die, tel, trtype, adrfam, reg_addr,
+		      ctrl->opts->host, tsas);
+
+	ret = __nvme_submit_sync_cmd(ctrl->fabrics_q, &cmd, &result,
+				     dim, tdl, 0, NVME_QID_ANY, 1, 0);
+	if (unlikely(ret)) {
+		dev_err(ctrl->device, "DIM error Result:0x%04X Status:0x%04X\n",
+		   le32_to_cpu(result.u32), ret);
+	}
+
+	kfree(dim);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(nvmf_disc_info_mgmt);
+
 static const match_table_t opt_tokens = {
 	{ NVMF_OPT_TRANSPORT,		"transport=%s"		},
 	{ NVMF_OPT_TRADDR,		"traddr=%s"		},
@@ -550,6 +710,7 @@ static const match_table_t opt_tokens = {
 	{ NVMF_OPT_TOS,			"tos=%d"		},
 	{ NVMF_OPT_FAIL_FAST_TMO,	"fast_io_fail_tmo=%d"	},
 	{ NVMF_OPT_DISCOVERY,		"discovery"		},
+	{ NVMF_OPT_REGISTER,		"register"		},
 	{ NVMF_OPT_ERR,			NULL			}
 };
 
@@ -831,6 +992,9 @@ static int nvmf_parse_options(struct nvmf_ctrl_options *opts,
 		case NVMF_OPT_DISCOVERY:
 			opts->discovery_nqn = true;
 			break;
+		case NVMF_OPT_REGISTER:
+			opts->explicit_registration = true;
+			break;
 		case NVMF_OPT_SYMNAME:
 			hostsymname = match_strdup(args);
 			if (!hostsymname) {
diff --git a/drivers/nvme/host/fabrics.h b/drivers/nvme/host/fabrics.h
index 494e6dbe233a..77f5768f8ff4 100644
--- a/drivers/nvme/host/fabrics.h
+++ b/drivers/nvme/host/fabrics.h
@@ -70,6 +70,7 @@ enum {
 	NVMF_OPT_HOST_IFACE	= 1 << 21,
 	NVMF_OPT_DISCOVERY	= 1 << 22,
 	NVMF_OPT_SYMNAME	= 1 << 23,
+	NVMF_OPT_REGISTER	= 1 << 24,
 };
 
 /**
@@ -106,6 +107,7 @@ enum {
  * @nr_poll_queues: number of queues for polling I/O
  * @tos: type of service
  * @fast_io_fail_tmo: Fast I/O fail timeout in seconds
+ * @explicit_registration: Explicit Registration with DC using the DIM PDU
  */
 struct nvmf_ctrl_options {
 	unsigned		mask;
@@ -130,6 +132,7 @@ struct nvmf_ctrl_options {
 	unsigned int		nr_poll_queues;
 	int			tos;
 	int			fast_io_fail_tmo;
+	bool			explicit_registration;
 };
 
 /*
@@ -200,5 +203,11 @@ int nvmf_get_address(struct nvme_ctrl *ctrl, char *buf, int size);
 bool nvmf_should_reconnect(struct nvme_ctrl *ctrl);
 bool nvmf_ip_options_match(struct nvme_ctrl *ctrl,
 		struct nvmf_ctrl_options *opts);
+int nvmf_disc_info_mgmt(struct nvme_ctrl *ctrl,
+			enum nvmf_dim_tas tas,
+			u8                trtype,
+			u8                adrfam,
+			const char       *reg_addr,
+			union nvmf_tsas  *tsas);
 
 #endif /* _NVME_FABRICS_H */
diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
index 4ceb28675fdf..970b7df574a4 100644
--- a/drivers/nvme/host/tcp.c
+++ b/drivers/nvme/host/tcp.c
@@ -1993,6 +1993,40 @@ static void nvme_tcp_reconnect_or_remove(struct nvme_ctrl *ctrl)
 	}
 }
 
+/**
+ * nvme_get_adrfam() - Get address family for the address we're registering
+ * with the DC. We retrieve this info from the socket itself. If we can't
+ * get the source address from the socket, then we'll infer the address
+ * family from the address of the DC since the DC address has the same
+ * address family.
+ *
+ * @ctrl: Host NVMe controller instance maintaining the admin queue used to
+ *      submit the DIM command to the DC.
+ *
+ * Return: The address family of the source address associated with the
+ * socket connected to the DC.
+ */
+static u8 nvme_get_adrfam(struct nvme_ctrl *ctrl)
+{
+	u8                       adrfam = NVMF_ADDR_FAMILY_IP4;
+	struct sockaddr_storage  sas;
+	struct sockaddr         *sa = (struct sockaddr *)&sas;
+	struct nvme_tcp_queue   *queue = &to_tcp_ctrl(ctrl)->queues[0];
+
+	if (kernel_getsockname(queue->sock, sa) == 0) {
+		if (sa->sa_family == AF_INET6)
+			adrfam = NVMF_ADDR_FAMILY_IP6;
+	} else {
+		unsigned char buf[sizeof(struct in6_addr)];
+
+		if (in6_pton(ctrl->opts->traddr,
+			     strlen(ctrl->opts->traddr), buf, -1, NULL) > 0)
+			adrfam = NVMF_ADDR_FAMILY_IP6;
+	}
+
+	return adrfam;
+}
+
 static int nvme_tcp_setup_ctrl(struct nvme_ctrl *ctrl, bool new)
 {
 	struct nvmf_ctrl_options *opts = ctrl->opts;
@@ -2045,6 +2079,20 @@ static int nvme_tcp_setup_ctrl(struct nvme_ctrl *ctrl, bool new)
 		goto destroy_io;
 	}
 
+	if (ctrl->opts->explicit_registration &&
+	    ((ctrl->dctype == NVME_DCTYPE_DDC) ||
+	     (ctrl->dctype == NVME_DCTYPE_CDC))) {
+		/* We're registering our source address with the DC. To do
+		 * that, we can simply send an empty string. This tells the DC
+		 * to retrieve the source address from the socket and use that
+		 * as the registration address.
+		 */
+		union nvmf_tsas tsas = {.tcp.sectype = NVMF_TCP_SECTYPE_NONE};
+
+		nvmf_disc_info_mgmt(ctrl, NVME_TAS_REGISTER, NVMF_TRTYPE_TCP,
+				    nvme_get_adrfam(ctrl), "", &tsas);
+	}
+
 	nvme_start_ctrl(ctrl);
 	return 0;
 
@@ -2613,7 +2661,8 @@ static struct nvmf_transport_ops nvme_tcp_transport = {
 			  NVMF_OPT_HOST_TRADDR | NVMF_OPT_CTRL_LOSS_TMO |
 			  NVMF_OPT_HDR_DIGEST | NVMF_OPT_DATA_DIGEST |
 			  NVMF_OPT_NR_WRITE_QUEUES | NVMF_OPT_NR_POLL_QUEUES |
-			  NVMF_OPT_TOS | NVMF_OPT_HOST_IFACE,
+			  NVMF_OPT_TOS | NVMF_OPT_HOST_IFACE |
+			  NVMF_OPT_REGISTER,
 	.create_ctrl	= nvme_tcp_create_ctrl,
 };
 
diff --git a/include/linux/nvme.h b/include/linux/nvme.h
index 3b47951342f4..891615876cfa 100644
--- a/include/linux/nvme.h
+++ b/include/linux/nvme.h
@@ -9,6 +9,7 @@
 
 #include <linux/types.h>
 #include <linux/uuid.h>
+#include <linux/kernel.h>
 
 /* NQN names in commands fields specified one size */
 #define NVMF_NQN_FIELD_LEN	256
@@ -102,6 +103,16 @@ enum {
 	NVMF_RDMA_CMS_RDMA_CM	= 1, /* Sockets based endpoint addressing */
 };
 
+/**
+ * enum -
+ * @NVMF_TCP_SECTYPE_NONE: No Security
+ * @NVMF_TCP_SECTYPE_TLS:  Transport Layer Security
+ */
+enum {
+	NVMF_TCP_SECTYPE_NONE	= 0,
+	NVMF_TCP_SECTYPE_TLS	= 1,
+};
+
 #define NVME_AQ_DEPTH		32
 #define NVME_NR_AEN_COMMANDS	1
 #define NVME_AQ_BLK_MQ_DEPTH	(NVME_AQ_DEPTH - NVME_NR_AEN_COMMANDS)
@@ -1036,6 +1047,7 @@ enum nvme_admin_opcode {
 	nvme_admin_sanitize_nvm		= 0x84,
 	nvme_admin_get_lba_status	= 0x86,
 	nvme_admin_vendor_start		= 0xC0,
+	nvme_admin_dim			= 0x21,
 };
 
 #define nvme_admin_opcode_name(opcode)	{ opcode, #opcode }
@@ -1316,6 +1328,29 @@ struct nvmf_common_command {
 	__u8	ts[24];
 };
 
+/**
+ * union nvmf_tsas - Transport Specific Address Subtype
+ * @qptype:  Queue Pair Service Type (NVMF_RDMA_QPTYPE_CONNECTED, NVMF_RDMA_QPTYPE_DATAGRAM)
+ * @prtype:  Provider Type (see enum nvme_rdma_prtype)
+ * @cms:     Connection Management Service (NVMF_RDMA_CMS_RDMA_CM)
+ * @pkey:    Partition Key
+ * @sectype: Security Type (NVMF_TCP_SECTYPE_NONE, NVMF_TCP_SECTYPE_TLS)
+ */
+union nvmf_tsas {
+	char		common[NVMF_TSAS_SIZE];
+	struct rdma {
+		__u8	qptype;
+		__u8	prtype;
+		__u8	cms;
+		__u8	rsvd3[5];
+		__u16	pkey;
+		__u8	rsvd10[246];
+	} rdma;
+	struct tcp {
+		__u8	sectype;
+	} tcp;
+};
+
 /*
  * The legal cntlid range a NVMe Target will provide.
  * Note that cntlid of value 0 is considered illegal in the fabrics world.
@@ -1349,17 +1384,7 @@ struct nvmf_disc_rsp_page_entry {
 	__u8		resv64[192];
 	char		subnqn[NVMF_NQN_FIELD_LEN];
 	char		traddr[NVMF_TRADDR_SIZE];
-	union tsas {
-		char		common[NVMF_TSAS_SIZE];
-		struct rdma {
-			__u8	qptype;
-			__u8	prtype;
-			__u8	cms;
-			__u8	resv3[5];
-			__u16	pkey;
-			__u8	resv10[246];
-		} rdma;
-	} tsas;
+	union nvmf_tsas tsas;
 };
 
 /* Discovery log page header */
@@ -1447,6 +1472,213 @@ struct streams_directive_params {
 	__u8	rsvd2[6];
 };
 
+/**
+ * Discovery Information Management (DIM) command. This is sent by a
+ * host to the Central Discovery Controller (CDC) to perform
+ * explicit registration.
+ */
+enum nvmf_dim_tas {
+	NVME_TAS_REGISTER   = 0x00,
+	NVME_TAS_DEREGISTER = 0x01,
+	NVME_TAS_UPDATE     = 0x02,
+};
+
+struct nvmf_dim_command {
+	__u8                  opcode;     /* nvme_admin_dim */
+	__u8                  flags;
+	__u16                 command_id;
+	__le32                nsid;
+	__u64                 rsvd2[2];
+	union nvme_data_ptr   dptr;
+	__le32                cdw10;      /* enum nvmf_dim_tas */
+	__le32                cdw11;
+	__le32                cdw12;
+	__le32                cdw13;
+	__le32                cdw14;
+	__le32                cdw15;
+};
+
+#define NVMF_ENAME_LEN    256
+#define NVMF_EVER_LEN     64
+
+enum nvmf_dim_entfmt {
+	NVME_ENTFMT_BASIC    = 0x01,
+	NVME_ENTFMT_EXTENDED = 0x02,
+};
+
+enum nvmf_dim_etype {
+	NVME_REGISTRATION_HOST = 0x01,
+	NVME_REGISTRATION_DDC  = 0x02,
+	NVME_REGISTRATION_CDC  = 0x03,
+};
+
+enum nvmf_exattype {
+	NVME_EXATTYPE_HOSTID   = 0x01, /* Host Identifier */
+	NVME_EXATTYPE_SYMNAME  = 0x02, /* Symbolic Name */
+};
+
+/**
+ * struct nvmf_ext_attr - Extended Attribute (EXAT)
+ *
+ *       Bytes       Description
+ * @type 01:00       Extended Attribute Type (EXATTYPE) - enum nvmf_exattype
+ * @len  03:02       Extended Attribute Length (EXATLEN)
+ * @val  (EXATLEN-1) Extended Attribute Value (EXATVAL) - size allocated
+ *       +4:04       for array must be a multiple of 4 bytes
+ */
+struct nvmf_ext_attr {
+	__le16			type;
+	__le16			len;
+	__u8			val[];
+};
+#define SIZEOF_EXT_EXAT_WO_VAL offsetof(struct nvmf_ext_attr, val)
+
+/**
+ * struct nvmf_ext_die - Extended Discovery Information Entry (DIE)
+ *
+ *            Bytes           Description
+ * @trtype    00              Transport Type (enum nvme_trtype)
+ * @adrfam    01              Address Family (enum nvmf_addr_family)
+ * @subtype   02              Subsystem Type (enum nvme_subsys_type)
+ * @treq      03              Transport Requirements (enum nvmf_treq)
+ * @portid    05:04           Port ID
+ * @cntlid    07:06           Controller ID
+ * @asqsz     09:08           Admin Max SQ Size
+ *            31:10           Reserved
+ * @trsvcid   63:32           Transport Service Identifier
+ *            255:64          Reserved
+ * @nqn       511:256         NVM Qualified Name
+ * @traddr    767:512         Transport Address
+ * @tsas      1023:768        Transport Specific Address Subtype
+ * @tel       1027:1024       Total Entry Length
+ * @numexat   1029:1028       Number of Extended Attributes
+ *            1031:1030       Reserved
+ * @exat[0]   ((EXATLEN-1)+   Extended Attributes 0 (see @numexat)
+ *            4)+1032:1032    Cannot access as an array because each EXAT
+ *                            entry has an undetermined size.
+ * @exat[N]   TEL-1:TEL-      Extended Attributes N (w/ N = NUMEXAT-1)
+ *            (EXATLEN+4)
+ */
+struct nvmf_ext_die {
+	__u8			trtype;
+	__u8			adrfam;
+	__u8			subtype;
+	__u8			treq;
+	__le16			portid;
+	__le16			cntlid;
+	__le16			asqsz;
+	__u8			resv10[22];
+	char			trsvcid[NVMF_TRSVCID_SIZE];
+	__u8			resv64[192];
+	char			nqn[NVMF_NQN_FIELD_LEN];
+	char			traddr[NVMF_TRADDR_SIZE];
+	union nvmf_tsas		tsas;
+	__le32			tel;
+	__le16			numexat;
+	__u16			resv1030;
+	struct nvmf_ext_attr	exat;
+};
+#define SIZEOF_EXT_DIE_WO_EXAT offsetof(struct nvmf_ext_die, exat)
+
+/**
+ * union nvmf_die - Discovery Information Entry (DIE)
+ *
+ * Depending on the ENTFMT specified in the DIM, DIEs can be entered with
+ * the Basic or Extended formats. For Basic format, each entry has a fixed
+ * length. Therefore, the "basic" field defined below can be accessed as a
+ * C array. For the Extended format, however, each entry is of variable
+ * length (TEL). Therefore, the "extended" field defined below cannot be
+ * accessed as a C array. Instead, the "extended" field is akin to a
+ * linked-list, where one can "walk" through the list. To move to the next
+ * entry, one simply adds the current entry's length (TEL) to the "walk"
+ * pointer. The number of entries in the list is specified by NUMENT.
+ * Although extended entries are of variable length (TEL), TEL is
+ * always a multiple of 4 bytes.
+ */
+union nvmf_die {
+	struct nvmf_disc_rsp_page_entry	basic[0];
+	struct nvmf_ext_die		extended;
+};
+
+/**
+ * struct nvmf_dim_data - Discovery Information Management (DIM) - Data
+ *
+ *            Bytes       Description
+ * @tdl       03:00       Total Data Length
+ *            07:04       Reserved
+ * @nument    15:08       Number of entries
+ * @entfmt    17:16       Entry Format (enum nvmf_dim_entfmt)
+ * @etype     19:18       Entity Type (enum nvmf_dim_etype)
+ * @portlcl   20          Port Local
+ *            21          Reserved
+ * @ektype    23:22       Entry Key Type
+ * @eid       279:24      Entity Identifier (e.g. Host NQN)
+ * @ename     535:280     Entity Name (e.g. hostname)
+ * @ever      599:536     Entity Version (e.g. OS Name/Version)
+ *            1023:600    Reserved
+ * @die       TDL-1:1024  Discovery Information Entry (see @nument above)
+ */
+struct nvmf_dim_data {
+	__le32			tdl;
+	__u32			rsvd4;
+	__le64			nument;
+	__le16			entfmt;
+	__le16			etype;
+	__u8			portlcl;
+	__u8			rsvd21;
+	__le16			ektype;
+	char			eid[NVMF_NQN_FIELD_LEN];
+	char			ename[NVMF_ENAME_LEN];
+	char			ever[NVMF_EVER_LEN];
+	__u8			rsvd600[424];
+	union nvmf_die		die;
+};
+#define SIZEOF_DIM_WO_DIE offsetof(struct nvmf_dim_data, die)
+
+/**
+ * exat_len() - Return the size in bytes, rounded to a multiple of 4 (i.e.
+ *         size of u32), of the buffer needed to hold the exat value of
+ *         size @val_len.
+ */
+static inline u16 exat_len(size_t val_len)
+{
+	return (u16)round_up(val_len, sizeof(u32));
+}
+
+/**
+ * exat_size - return the size of the "struct nvmf_ext_attr"
+ *   needed to hold a value of size @val_len.
+ *
+ * @val_len: This is the length of the data to be copied to the "val"
+ *         field of a "struct nvmf_ext_attr".
+ *
+ * @return The size in bytes, rounded to a multiple of 4 (i.e. size of
+ *         u32), of the "struct nvmf_ext_attr" required to hold a string of
+ *         length @val_len.
+ */
+static inline u16 exat_size(size_t val_len)
+{
+	return (u16)(SIZEOF_EXT_EXAT_WO_VAL + exat_len(val_len));
+}
+
+/**
+ * exat_ptr_next - Increment @p to the next element in the array. Extended
+ *   attributes are saved to an array of "struct nvmf_ext_attr" where each
+ *   element of the array is of variable size. In order to move to the next
+ *   element in the array one must increment the pointer to the current
+ *   element (@p) by the size of the current element.
+ *
+ * @p: Pointer to an element of an array of "struct nvmf_ext_attr".
+ *
+ * @return Pointer to the next element in the array.
+ */
+static inline struct nvmf_ext_attr *exat_ptr_next(struct nvmf_ext_attr *p)
+{
+	return (struct nvmf_ext_attr *)
+		((uintptr_t)p + (ptrdiff_t)exat_size(le16_to_cpu(p->len)));
+}
+
+
 struct nvme_command {
 	union {
 		struct nvme_common_command common;
@@ -1470,6 +1702,7 @@ struct nvme_command {
 		struct nvmf_property_get_command prop_get;
 		struct nvme_dbbuf dbbuf;
 		struct nvme_directive_cmd directive;
+		struct nvmf_dim_command dim;
 	};
 };
 
-- 
2.34.1

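To make the variable-length layout concrete: for the mandatory 16-byte
HostID, exat_len(16) is 16 and exat_size(16) is 20 bytes (the 4-byte
EXAT header plus the padded value). Below is a sketch, using only the
helpers added above (untested, for illustration), of walking the
NUMEXAT extended attributes of one extended DIE:

static void nvmf_walk_exats(struct nvmf_ext_die *die)
{
	struct nvmf_ext_attr *exat = &die->exat;
	u16 i, numexat = le16_to_cpu(die->numexat);

	for (i = 0; i < numexat; i++) {
		pr_info("EXAT %u: type=0x%x len=%u\n", i,
			le16_to_cpu(exat->type), le16_to_cpu(exat->len));
		/* each entry occupies exat_size() bytes; step past it */
		exat = exat_ptr_next(exat);
	}
}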



* [PATCH 4/4] nvme-fabrics: add explicit deregistration on disconnect.
       [not found] <20220125145956.14746-1-nitram_67@hotmail.com>
                   ` (2 preceding siblings ...)
  2022-01-25 14:59 ` [PATCH 3/4] nvme-fabrics: add tp8010 support Martin Belanger
@ 2022-01-25 14:59 ` Martin Belanger
  2022-01-27  9:31   ` Hannes Reinecke
  3 siblings, 1 reply; 32+ messages in thread
From: Martin Belanger @ 2022-01-25 14:59 UTC (permalink / raw)
  To: linux-nvme; +Cc: kbusch, hare, axboe, hch, sagi, Martin Belanger

From: Martin Belanger <martin.belanger@dell.com>

Per TP8010, a host explicitly registered with a Discovery
Controller (DC) must deregister on disconnect. This patch
sends a DIM PDU to DCs to deregister the host on disconnect.

Signed-off-by: Martin Belanger <martin.belanger@dell.com>
---
 drivers/nvme/host/tcp.c | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
index 970b7df574a4..804d927b4b81 100644
--- a/drivers/nvme/host/tcp.c
+++ b/drivers/nvme/host/tcp.c
@@ -2176,6 +2176,16 @@ static void nvme_tcp_teardown_ctrl(struct nvme_ctrl *ctrl, bool shutdown)
 
 static void nvme_tcp_delete_ctrl(struct nvme_ctrl *ctrl)
 {
+	if (ctrl->opts->explicit_registration &&
+	    ((ctrl->dctype == NVME_DCTYPE_DDC) ||
+	     (ctrl->dctype == NVME_DCTYPE_CDC))) {
+		/* Deregister from discovery controller */
+		union nvmf_tsas tsas = {.tcp.sectype = NVMF_TCP_SECTYPE_NONE};
+
+		nvmf_disc_info_mgmt(ctrl, NVME_TAS_DEREGISTER, NVMF_TRTYPE_TCP,
+				    nvme_get_adrfam(ctrl), "", &tsas);
+	}
+
 	nvme_tcp_teardown_ctrl(ctrl, true);
 }
 
-- 
2.34.1

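With this patch applied, deregistration rides on the existing
disconnect path. A user-space sketch (controller name and attribute
path assumed) that triggers it via the delete mechanism visible in the
patch 1 context (nvme_sysfs_delete):

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	/* Path assumed; writing '1' deletes (disconnects) the controller,
	 * which now sends a DIM deregistration first. */
	int fd = open("/sys/class/nvme/nvme0/delete_controller", O_WRONLY);

	if (fd < 0 || write(fd, "1", 1) < 0) {
		perror("delete_controller");
		return 1;
	}
	close(fd);
	return 0;
}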



* Re: [PATCH 1/4] nvme-fabrics: add discovery controller type
  2022-01-25 14:59 ` [PATCH 1/4] nvme-fabrics: add discovery controller type Martin Belanger
@ 2022-01-27  9:16   ` Hannes Reinecke
  2022-01-27 13:37     ` Belanger, Martin
  2022-01-27 13:16   ` Sagi Grimberg
  1 sibling, 1 reply; 32+ messages in thread
From: Hannes Reinecke @ 2022-01-27  9:16 UTC (permalink / raw)
  To: Martin Belanger, linux-nvme; +Cc: kbusch, axboe, hch, sagi, Martin Belanger

On 1/25/22 15:59, Martin Belanger wrote:
> From: Martin Belanger <martin.belanger@dell.com>
> 
> TP8010 introduces the Central Discovery Controller (CDC). To
> differentiate between a CDC and a DDC (Direct DC), the identify
> response now carries a DCTYPE (Discovery Controller Type) field.
> 
> Signed-off-by: Martin Belanger <martin.belanger@dell.com>
> ---
>   drivers/nvme/host/core.c |  3 +++
>   drivers/nvme/host/nvme.h |  2 ++
>   include/linux/nvme.h     | 10 +++++++++-
>   3 files changed, 14 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
> index 4fc794d9c2f4..cd34b92e8e9d 100644
> --- a/drivers/nvme/host/core.c
> +++ b/drivers/nvme/host/core.c
> @@ -2978,6 +2978,7 @@ static int nvme_init_identify(struct nvme_ctrl *ctrl)
>   	ctrl->kas = le16_to_cpu(id->kas);
>   	ctrl->max_namespaces = le32_to_cpu(id->mnan);
>   	ctrl->ctratt = le32_to_cpu(id->ctratt);
> +	ctrl->dctype = id->dctype;
>   
>   	if (id->rtd3e) {
>   		/* us -> s */
> @@ -3335,6 +3336,7 @@ nvme_show_int_function(numa_node);
>   nvme_show_int_function(queue_count);
>   nvme_show_int_function(sqsize);
>   nvme_show_int_function(kato);
> +nvme_show_int_function(dctype);
>   
>   static ssize_t nvme_sysfs_delete(struct device *dev,
>   				struct device_attribute *attr, const char *buf,
> @@ -3533,6 +3535,7 @@ static struct attribute *nvme_dev_attrs[] = {
>   	&dev_attr_reconnect_delay.attr,
>   	&dev_attr_fast_io_fail_tmo.attr,
>   	&dev_attr_kato.attr,
> +	&dev_attr_dctype.attr,
>   	NULL
>   };
>   
> diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
> index fe224016418e..f8b60f7346fd 100644
> --- a/drivers/nvme/host/nvme.h
> +++ b/drivers/nvme/host/nvme.h
> @@ -349,6 +349,8 @@ struct nvme_ctrl {
>   	unsigned long discard_page_busy;
>   
>   	struct nvme_fault_inject fault_inject;
> +
> +	enum nvme_dctype dctype;
>   };
>   
>   enum nvme_iopolicy {
> diff --git a/include/linux/nvme.h b/include/linux/nvme.h
> index 855dd9b3e84b..82567d493c51 100644
> --- a/include/linux/nvme.h
> +++ b/include/linux/nvme.h
> @@ -240,6 +240,12 @@ enum nvme_ctrl_attr {
>   	NVME_CTRL_ATTR_TBKAS		= (1 << 6),
>   };
>   
> +enum nvme_dctype {
> +	NVME_DCTYPE_NOT_REPORTED = 0,
> +	NVME_DCTYPE_DDC = 1,
> +	NVME_DCTYPE_CDC = 2,
> +};
> +
>   struct nvme_id_ctrl {
>   	__le16			vid;
>   	__le16			ssvid;
> @@ -320,7 +326,9 @@ struct nvme_id_ctrl {
>   	__le16			icdoff;
>   	__u8			ctrattr;
>   	__u8			msdbd;
> -	__u8			rsvd1804[244];
> +	__le16			ofcs;
> +	__u8			dctype;
> +	__u8			rsvd1807[241];
>   	struct nvme_id_power_state	psd[32];
>   	__u8			vs[1024];
>   };

Please decode the fields for the sysfs attribute; 'cdc' is far easier to 
understand than '2'.

Cheers,

Hannes
-- 
Dr. Hannes Reinecke		           Kernel Storage Architect
hare@suse.de			                  +49 911 74053 688
SUSE Software Solutions Germany GmbH, Maxfeldstr. 5, 90409 Nürnberg
HRB 36809 (AG Nürnberg), GF: Felix Imendörffer

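Something along the lines of the sketch below would address Hannes'
point (illustrative only; the helper name and decoded strings are
hypothetical):

static const char * const nvme_dctype_str[] = {
	[NVME_DCTYPE_NOT_REPORTED] = "none",
	[NVME_DCTYPE_DDC]	   = "ddc",
	[NVME_DCTYPE_CDC]	   = "cdc",
};

static ssize_t dctype_show(struct device *dev,
			   struct device_attribute *attr, char *buf)
{
	struct nvme_ctrl *ctrl = dev_get_drvdata(dev);

	if (ctrl->dctype > NVME_DCTYPE_CDC)
		return sysfs_emit(buf, "invalid\n");
	return sysfs_emit(buf, "%s\n", nvme_dctype_str[ctrl->dctype]);
}
static DEVICE_ATTR_RO(dctype);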


* Re: [PATCH 2/4] nvme: add host symbolic name
  2022-01-25 14:59 ` [PATCH 2/4] nvme: add host symbolic name Martin Belanger
@ 2022-01-27  9:18   ` Hannes Reinecke
  2022-01-27 13:20   ` Sagi Grimberg
  1 sibling, 0 replies; 32+ messages in thread
From: Hannes Reinecke @ 2022-01-27  9:18 UTC (permalink / raw)
  To: Martin Belanger, linux-nvme; +Cc: kbusch, axboe, hch, sagi, Martin Belanger

On 1/25/22 15:59, Martin Belanger wrote:
> From: Martin Belanger <martin.belanger@dell.com>
> 
> TP8010 introduces the 'host symbolic name' as an optional
> parameter for explicit registration with a central discovery
> controller.
> 
> Signed-off-by: Martin Belanger <martin.belanger@dell.com>
> ---
>   drivers/nvme/host/core.c    | 14 ++++++++++++++
>   drivers/nvme/host/fabrics.c | 17 +++++++++++++++--
>   drivers/nvme/host/fabrics.h |  2 ++
>   include/linux/nvme.h        |  2 ++
>   4 files changed, 33 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
> index cd34b92e8e9d..cf5d9984f8f6 100644
> --- a/drivers/nvme/host/core.c
> +++ b/drivers/nvme/host/core.c
> @@ -3404,6 +3404,16 @@ static ssize_t nvme_sysfs_show_hostnqn(struct device *dev,
>   }
>   static DEVICE_ATTR(hostnqn, S_IRUGO, nvme_sysfs_show_hostnqn, NULL);
>   
> +static ssize_t nvme_sysfs_show_hostsymname(struct device *dev,
> +					struct device_attribute *attr,
> +					char *buf)
> +{
> +	struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
> +
> +	return sysfs_emit(buf, "%s\n", ctrl->opts->host->symname);
> +}
> +static DEVICE_ATTR(hostsymname, S_IRUGO, nvme_sysfs_show_hostsymname, NULL);
> +
>   static ssize_t nvme_sysfs_show_hostid(struct device *dev,
>   					struct device_attribute *attr,
>   					char *buf)
> @@ -3531,6 +3541,7 @@ static struct attribute *nvme_dev_attrs[] = {
>   	&dev_attr_sqsize.attr,
>   	&dev_attr_hostnqn.attr,
>   	&dev_attr_hostid.attr,
> +	&dev_attr_hostsymname.attr,
>   	&dev_attr_ctrl_loss_tmo.attr,
>   	&dev_attr_reconnect_delay.attr,
>   	&dev_attr_fast_io_fail_tmo.attr,
> @@ -3553,6 +3564,9 @@ static umode_t nvme_dev_attrs_are_visible(struct kobject *kobj,
>   		return 0;
>   	if (a == &dev_attr_hostid.attr && !ctrl->opts)
>   		return 0;
> +	if (a == &dev_attr_hostsymname.attr &&
> +	    (!ctrl->opts || (ctrl->opts->host->symname[0] == '\0')))
> +		return 0;
>   	if (a == &dev_attr_ctrl_loss_tmo.attr && !ctrl->opts)
>   		return 0;
>   	if (a == &dev_attr_reconnect_delay.attr && !ctrl->opts)
> diff --git a/drivers/nvme/host/fabrics.c b/drivers/nvme/host/fabrics.c
> index 7ae041e2b3fb..040a6cce6afa 100644
> --- a/drivers/nvme/host/fabrics.c
> +++ b/drivers/nvme/host/fabrics.c
> @@ -540,6 +540,7 @@ static const match_table_t opt_tokens = {
>   	{ NVMF_OPT_HOST_TRADDR,		"host_traddr=%s"	},
>   	{ NVMF_OPT_HOST_IFACE,		"host_iface=%s"		},
>   	{ NVMF_OPT_HOST_ID,		"hostid=%s"		},
> +	{ NVMF_OPT_SYMNAME,		"hostsymname=%s"	},
>   	{ NVMF_OPT_DUP_CONNECT,		"duplicate_connect"	},
>   	{ NVMF_OPT_DISABLE_SQFLOW,	"disable_sqflow"	},
>   	{ NVMF_OPT_HDR_DIGEST,		"hdr_digest"		},
> @@ -556,7 +557,7 @@ static int nvmf_parse_options(struct nvmf_ctrl_options *opts,
>   		const char *buf)
>   {
>   	substring_t args[MAX_OPT_ARGS];
> -	char *options, *o, *p;
> +	char *options, *o, *p, *hostsymname = NULL;
>   	int token, ret = 0;
>   	size_t nqnlen  = 0;
>   	int ctrl_loss_tmo = NVMF_DEF_CTRL_LOSS_TMO;
> @@ -830,6 +831,13 @@ static int nvmf_parse_options(struct nvmf_ctrl_options *opts,
>   		case NVMF_OPT_DISCOVERY:
>   			opts->discovery_nqn = true;
>   			break;
> +		case NVMF_OPT_SYMNAME:
> +			hostsymname = match_strdup(args);
> +			if (!hostsymname) {
> +				ret = -ENOMEM;
> +				goto out;
> +			}
> +			break;
>   		default:
>   			pr_warn("unknown parameter or missing value '%s' in ctrl creation request\n",
>   				p);
> @@ -864,8 +872,13 @@ static int nvmf_parse_options(struct nvmf_ctrl_options *opts,
>   
>   	uuid_copy(&opts->host->id, &hostid);
>   
> +	if (hostsymname)
> +		strncpy(opts->host->symname, hostsymname,
> +			sizeof(opts->host->symname) - 1);
> +
>   out:
>   	kfree(options);
> +	kfree(hostsymname);
>   	return ret;
>   }
>   
> @@ -957,7 +970,7 @@ EXPORT_SYMBOL_GPL(nvmf_free_options);
>   				 NVMF_OPT_KATO | NVMF_OPT_HOSTNQN | \
>   				 NVMF_OPT_HOST_ID | NVMF_OPT_DUP_CONNECT |\
>   				 NVMF_OPT_DISABLE_SQFLOW | NVMF_OPT_DISCOVERY |\
> -				 NVMF_OPT_FAIL_FAST_TMO)
> +				 NVMF_OPT_FAIL_FAST_TMO | NVMF_OPT_SYMNAME)
>   
>   static struct nvme_ctrl *
>   nvmf_create_ctrl(struct device *dev, const char *buf)
> diff --git a/drivers/nvme/host/fabrics.h b/drivers/nvme/host/fabrics.h
> index c3203ff1c654..494e6dbe233a 100644
> --- a/drivers/nvme/host/fabrics.h
> +++ b/drivers/nvme/host/fabrics.h
> @@ -38,6 +38,7 @@ struct nvmf_host {
>   	struct list_head	list;
>   	char			nqn[NVMF_NQN_SIZE];
>   	uuid_t			id;
> +	char			symname[NVMF_HOSTSYMNAME_SIZE+1];
>   };
>   
>   /**
> @@ -68,6 +69,7 @@ enum {
>   	NVMF_OPT_FAIL_FAST_TMO	= 1 << 20,
>   	NVMF_OPT_HOST_IFACE	= 1 << 21,
>   	NVMF_OPT_DISCOVERY	= 1 << 22,
> +	NVMF_OPT_SYMNAME	= 1 << 23,
>   };
>   
>   /**
> diff --git a/include/linux/nvme.h b/include/linux/nvme.h
> index 82567d493c51..3b47951342f4 100644
> --- a/include/linux/nvme.h
> +++ b/include/linux/nvme.h
> @@ -16,6 +16,8 @@
>   /* However the max length of a qualified name is another size */
>   #define NVMF_NQN_SIZE		223
>   
> +#define NVMF_HOSTSYMNAME_SIZE	256
> +
>   #define NVMF_TRSVCID_SIZE	32
>   #define NVMF_TRADDR_SIZE	256
>   #define NVMF_TSAS_SIZE		256

Reviewed-by: Hannes Reinecke <hare@suse.de>

Cheers,

Hannes
-- 
Dr. Hannes Reinecke		           Kernel Storage Architect
hare@suse.de			                  +49 911 74053 688
SUSE Software Solutions Germany GmbH, Maxfeldstr. 5, 90409 Nürnberg
HRB 36809 (AG Nürnberg), GF: Felix Imendörffer



* Re: [PATCH 3/4] nvme-fabrics: add tp8010 support
  2022-01-25 14:59 ` [PATCH 3/4] nvme-fabrics: add tp8010 support Martin Belanger
@ 2022-01-27  9:30   ` Hannes Reinecke
  2022-01-27 13:12     ` Sagi Grimberg
  2022-01-31 17:03   ` John Meneghini
  1 sibling, 1 reply; 32+ messages in thread
From: Hannes Reinecke @ 2022-01-27  9:30 UTC (permalink / raw)
  To: Martin Belanger, linux-nvme; +Cc: kbusch, axboe, hch, sagi, Martin Belanger

On 1/25/22 15:59, Martin Belanger wrote:
> From: Martin Belanger <martin.belanger@dell.com>
> 
> TP8010 introduces Central Discovery Controllers (CDC) and the
> Discovery Information Management (DIM) PDU, which is used to
> perform explicit registration with Discovery Controllers.
> 
> This patch defines the DIM PDU and adds logic to perform explicit
> registration with DCs when registration has been requested by the
> user.
> 
> Signed-off-by: Martin Belanger <martin.belanger@dell.com>
> ---
>   drivers/nvme/host/fabrics.c | 164 +++++++++++++++++++++++
>   drivers/nvme/host/fabrics.h |   9 ++
>   drivers/nvme/host/tcp.c     |  51 +++++++-
>   include/linux/nvme.h        | 255 ++++++++++++++++++++++++++++++++++--
>   4 files changed, 467 insertions(+), 12 deletions(-)
> 
> diff --git a/drivers/nvme/host/fabrics.c b/drivers/nvme/host/fabrics.c
> index 040a6cce6afa..e6fc2f85b670 100644
> --- a/drivers/nvme/host/fabrics.c
> +++ b/drivers/nvme/host/fabrics.c
> @@ -10,6 +10,7 @@
>   #include <linux/mutex.h>
>   #include <linux/parser.h>
>   #include <linux/seq_file.h>
> +#include <linux/utsname.h>
>   #include "nvme.h"
>   #include "fabrics.h"
>   
> @@ -526,6 +527,165 @@ static struct nvmf_transport_ops *nvmf_lookup_transport(
>   	return NULL;
>   }
>   
> +/**
> + * nvmf_get_tel() - Calculate the amount of memory needed for a Discovery
> + * Information Entry. Each Entry must contain at a minimum an Extended
> + * Attribute for the HostID. The Entry may optionally contain an Extended
> + * Attribute for the Symbolic Name.
> + *
> + * @hostsymname - Symbolic name (may be NULL)
> + *
> + * Return Total Entry Length
> + */
> +static u32 nvmf_get_tel(const char *hostsymname)
> +{
> +	u32 tel = SIZEOF_EXT_DIE_WO_EXAT;
> +	u16 len;
> +
> +	/* Host ID is mandatory */
> +	tel += exat_size(sizeof(uuid_t));
> +
> +	/* Symbolic name is optional */
> +	len = hostsymname ? strlen(hostsymname) : 0;
> +	if (len)
> +		tel += exat_size(len);
> +
> +	return tel;
> +}
> +
> +/**
> + * nvmf_fill_die() - Fill the Discovery Information Entry pointed to by
> + *                   @die.
> + *
> + * @die - Pointer to Discovery Information Entry to be filled
> + * @tel - Length of the DIE
> + * @trtype - Transport type
> + * @adrfam - Address family
> + * @reg_addr - Address to register. Setting this to an empty string tells
> + *       the DC to infer address from the source address of the socket.
> + * @host - Host structure providing the Host NQN, the Host ID,
> + *       and the Host symbolic name that are placed in the
> + *       entry's extended attributes
> + * @tsas - Transport Specific Address Subtype for the address being
> + *       registered.
> + */
> +static void nvmf_fill_die(struct nvmf_ext_die  *die,
> +			  u32                   tel,
> +			  u8                    trtype,
> +			  u8                    adrfam,
> +			  const char           *reg_addr,
> +			  struct nvmf_host     *host,
> +			  union nvmf_tsas      *tsas)
> +{
> +	u16 numexat = 0;
> +	size_t symname_len;
> +	struct nvmf_ext_attr *exat;
> +
> +	die->tel = cpu_to_le32(tel);
> +	die->trtype = trtype;
> +	die->adrfam = adrfam;
> +
> +	strncpy(die->nqn, host->nqn, sizeof(die->nqn));
> +	strncpy(die->traddr, reg_addr, sizeof(die->traddr));
> +	memcpy(&die->tsas, tsas, sizeof(die->tsas));
> +
> +	/* Extended Attribute for the HostID (mandatory) */
> +	numexat++;
> +	exat = &die->exat;
> +	exat->type = cpu_to_le16(NVME_EXATTYPE_HOSTID);
> +	exat->len  = cpu_to_le16(exat_len(sizeof(host->id)));
> +	uuid_copy((uuid_t *)exat->val, &host->id);
> +
> +	/* Extended Attribute for the Symbolic Name (optional) */
> +	symname_len = strlen(host->symname);
> +	if (symname_len) {
> +		u16 exatlen = exat_len(symname_len);
> +
> +		numexat++;
> +		exat = exat_ptr_next(exat);
> +		exat->type = cpu_to_le16(NVME_EXATTYPE_SYMNAME);
> +		exat->len  = cpu_to_le16(exatlen);
> +		memcpy(exat->val, host->symname, symname_len);
> +		/* Per Base specs, ASCII strings must be padded with spaces */
> +		memset(&exat->val[symname_len], ' ', exatlen - symname_len);
> +	}
> +
> +	die->numexat = cpu_to_le16(numexat);
> +}
> +
> +/**
> + * nvmf_disc_info_mgmt() - Perform explicit registration, deregistration,
> + * or registration-update (specified by @tas) by sending a Discovery
> + * Information Management (DIM) command to the Discovery Controller (DC).
> + *
> + * @ctrl: Host NVMe controller instance maintaining the admin queue used to
> + *      submit the DIM command to the DC.
> + * @tas: Task field of the Command Dword 10 (cdw10). Indicates whether to
> + *     perform a Registration, Deregistration, or Registration-update.
> + * @trtype: Transport type (e.g. NVMF_TRTYPE_TCP)
> + * @adrfam: Address family (e.g. NVMF_ADDR_FAMILY_IP6)
> + * @reg_addr - Address to register. Setting this to an empty string tells
> + *       the DC to infer address from the source address of the socket.
> + * @tsas - Transport Specific Address Subtype for the address being
> + *       registered.
> + *
> + * Return:   0: success,
> + *         > 0: NVMe error status code,
> + *         < 0: Linux errno error code
> + */
> +int nvmf_disc_info_mgmt(struct nvme_ctrl *ctrl,
> +			enum nvmf_dim_tas tas,
> +			u8                trtype,
> +			u8                adrfam,
> +			const char       *reg_addr,
> +			union nvmf_tsas  *tsas)
> +{
> +	int                   ret;
> +	u32                   tdl;  /* Total Data Length */
> +	u32                   tel;  /* Total Entry Length */
> +	struct nvmf_dim_data *dim;  /* Discovery Information Management */
> +	struct nvmf_ext_die  *die;  /* Discovery Information Entry */
> +	struct nvme_command   cmd = {};
> +	union nvme_result     result;
> +
> +	/* Register 1 Discovery Information Entry (DIE) of size TEL */
> +	tel = nvmf_get_tel(ctrl->opts->host->symname);
> +	tdl = SIZEOF_DIM_WO_DIE + tel;
> +
> +	dim = kzalloc(tdl, GFP_KERNEL);
> +	if (!dim)
> +		return -ENOMEM;
> +
> +	cmd.dim.opcode = nvme_admin_dim;
> +	cmd.dim.cdw10  = cpu_to_le32(tas);
> +
> +	dim->tdl    = cpu_to_le32(tdl);
> +	dim->nument = cpu_to_le64(1);    /* only 1 DIE to register */
> +	dim->entfmt = cpu_to_le16(NVME_ENTFMT_EXTENDED);
> +	dim->etype  = cpu_to_le16(NVME_REGISTRATION_HOST);
> +	dim->ektype = cpu_to_le16(0x5F); /* must be 0x5F per specs */
> +
> +	strncpy(dim->eid, ctrl->opts->host->nqn, sizeof(dim->eid));
> +	strncpy(dim->ename, utsname()->nodename, sizeof(dim->ename));
> +	strncpy(dim->ever, utsname()->release, sizeof(dim->ever));
> +
> +	die = &dim->die.extended;
> +	nvmf_fill_die(die, tel, trtype, adrfam, reg_addr,
> +		      ctrl->opts->host, tsas);
> +
> +	ret = __nvme_submit_sync_cmd(ctrl->fabrics_q, &cmd, &result,
> +				     dim, tdl, 0, NVME_QID_ANY, 1, 0);
> +	if (unlikely(ret)) {
> +		dev_err(ctrl->device, "DIM error Result:0x%04X Status:0x%04X\n",
> +		   le32_to_cpu(result.u32), ret);
> +	}
> +
> +	kfree(dim);
> +
> +	return ret;
> +}
> +EXPORT_SYMBOL_GPL(nvmf_disc_info_mgmt);
> +
>   static const match_table_t opt_tokens = {
>   	{ NVMF_OPT_TRANSPORT,		"transport=%s"		},
>   	{ NVMF_OPT_TRADDR,		"traddr=%s"		},
> @@ -550,6 +710,7 @@ static const match_table_t opt_tokens = {
>   	{ NVMF_OPT_TOS,			"tos=%d"		},
>   	{ NVMF_OPT_FAIL_FAST_TMO,	"fast_io_fail_tmo=%d"	},
>   	{ NVMF_OPT_DISCOVERY,		"discovery"		},
> +	{ NVMF_OPT_REGISTER,		"register"		},
>   	{ NVMF_OPT_ERR,			NULL			}
>   };
>   
> @@ -831,6 +992,9 @@ static int nvmf_parse_options(struct nvmf_ctrl_options *opts,
>   		case NVMF_OPT_DISCOVERY:
>   			opts->discovery_nqn = true;
>   			break;
> +		case NVMF_OPT_REGISTER:
> +			opts->explicit_registration = true;
> +			break;
>   		case NVMF_OPT_SYMNAME:
>   			hostsymname = match_strdup(args);
>   			if (!hostsymname) {
> diff --git a/drivers/nvme/host/fabrics.h b/drivers/nvme/host/fabrics.h
> index 494e6dbe233a..77f5768f8ff4 100644
> --- a/drivers/nvme/host/fabrics.h
> +++ b/drivers/nvme/host/fabrics.h
> @@ -70,6 +70,7 @@ enum {
>   	NVMF_OPT_HOST_IFACE	= 1 << 21,
>   	NVMF_OPT_DISCOVERY	= 1 << 22,
>   	NVMF_OPT_SYMNAME	= 1 << 23,
> +	NVMF_OPT_REGISTER	= 1 << 24,
>   };
>   
>   /**
> @@ -106,6 +107,7 @@ enum {
>    * @nr_poll_queues: number of queues for polling I/O
>    * @tos: type of service
>    * @fast_io_fail_tmo: Fast I/O fail timeout in seconds
> + * @explicit_registration: Explicit Registration with DC using the DIM PDU
>    */
>   struct nvmf_ctrl_options {
>   	unsigned		mask;
> @@ -130,6 +132,7 @@ struct nvmf_ctrl_options {
>   	unsigned int		nr_poll_queues;
>   	int			tos;
>   	int			fast_io_fail_tmo;
> +	bool			explicit_registration;
>   };
>   
>   /*
> @@ -200,5 +203,11 @@ int nvmf_get_address(struct nvme_ctrl *ctrl, char *buf, int size);
>   bool nvmf_should_reconnect(struct nvme_ctrl *ctrl);
>   bool nvmf_ip_options_match(struct nvme_ctrl *ctrl,
>   		struct nvmf_ctrl_options *opts);
> +int nvmf_disc_info_mgmt(struct nvme_ctrl *ctrl,
> +			enum nvmf_dim_tas tas,
> +			u8                trtype,
> +			u8                adrfam,
> +			const char       *reg_addr,
> +			union nvmf_tsas  *tsas);
>   
>   #endif /* _NVME_FABRICS_H */
> diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
> index 4ceb28675fdf..970b7df574a4 100644
> --- a/drivers/nvme/host/tcp.c
> +++ b/drivers/nvme/host/tcp.c
> @@ -1993,6 +1993,40 @@ static void nvme_tcp_reconnect_or_remove(struct nvme_ctrl *ctrl)
>   	}
>   }
>   
> +/**
> + * nvme_get_adrfam() - Get address family for the address we're registering
> + * with the DC. We retrieve this info from the socket itself. If we can't
> + * get the source address from the socket, then we'll infer the address
> + * family from the address of the DC since the DC address has the same
> + * address family.
> + *
> + * @ctrl: Host NVMe controller instance maintaining the admin queue used to
> + *      submit the DIM command to the DC.
> + *
> + * Return: The address family of the source address associated with the
> + * socket connected to the DC.
> + */
> +static u8 nvme_get_adrfam(struct nvme_ctrl *ctrl)
> +{
> +	u8                       adrfam = NVMF_ADDR_FAMILY_IP4;
> +	struct sockaddr_storage  sas;
> +	struct sockaddr         *sa = (struct sockaddr *)&sas;
> +	struct nvme_tcp_queue   *queue = &to_tcp_ctrl(ctrl)->queues[0];
> +
> +	if (kernel_getsockname(queue->sock, sa) == 0) {
> +		if (sa->sa_family == AF_INET6)
> +			adrfam = NVMF_ADDR_FAMILY_IP6;
> +	} else {
> +		unsigned char buf[sizeof(struct in6_addr)];
> +
> +		if (in6_pton(ctrl->opts->traddr,
> +			     strlen(ctrl->opts->traddr), buf, -1, NULL) > 0)
> +			adrfam = NVMF_ADDR_FAMILY_IP6;
> +	}
> +
> +	return adrfam;
> +}
> +
>   static int nvme_tcp_setup_ctrl(struct nvme_ctrl *ctrl, bool new)
>   {
>   	struct nvmf_ctrl_options *opts = ctrl->opts;
> @@ -2045,6 +2079,20 @@ static int nvme_tcp_setup_ctrl(struct nvme_ctrl *ctrl, bool new)
>   		goto destroy_io;
>   	}
>   
> +	if (ctrl->opts->explicit_registration &&
> +	    ((ctrl->dctype == NVME_DCTYPE_DDC) ||
> +	     (ctrl->dctype == NVME_DCTYPE_CDC))) {
> +		/* We're registering our source address with the DC. To do
> +		 * that, we can simply send an empty string. This tells the DC
> +		 * to retrieve the source address from the socket and use that
> +		 * as the registration address.
> +		 */
> +		union nvmf_tsas tsas = {.tcp.sectype = NVMF_TCP_SECTYPE_NONE};
> +
> +		nvmf_disc_info_mgmt(ctrl, NVME_TAS_REGISTER, NVMF_TRTYPE_TCP,
> +				    nvme_get_adrfam(ctrl), "", &tsas);
> +	}
> +
>   	nvme_start_ctrl(ctrl);
>   	return 0;
>   
> @@ -2613,7 +2661,8 @@ static struct nvmf_transport_ops nvme_tcp_transport = {
>   			  NVMF_OPT_HOST_TRADDR | NVMF_OPT_CTRL_LOSS_TMO |
>   			  NVMF_OPT_HDR_DIGEST | NVMF_OPT_DATA_DIGEST |
>   			  NVMF_OPT_NR_WRITE_QUEUES | NVMF_OPT_NR_POLL_QUEUES |
> -			  NVMF_OPT_TOS | NVMF_OPT_HOST_IFACE,
> +			  NVMF_OPT_TOS | NVMF_OPT_HOST_IFACE |
> +			  NVMF_OPT_REGISTER,
>   	.create_ctrl	= nvme_tcp_create_ctrl,
>   };
>   
> diff --git a/include/linux/nvme.h b/include/linux/nvme.h
> index 3b47951342f4..891615876cfa 100644
> --- a/include/linux/nvme.h
> +++ b/include/linux/nvme.h
> @@ -9,6 +9,7 @@
>   
>   #include <linux/types.h>
>   #include <linux/uuid.h>
> +#include <linux/kernel.h>
>   
>   /* NQN names in commands fields specified one size */
>   #define NVMF_NQN_FIELD_LEN	256
> @@ -102,6 +103,16 @@ enum {
>   	NVMF_RDMA_CMS_RDMA_CM	= 1, /* Sockets based endpoint addressing */
>   };
>   
> +/**
> + * enum -
> + * @NVMF_TCP_SECTYPE_NONE: No Security
> + * @NVMF_TCP_SECTYPE_TLS:  Transport Layer Security
> + */
> +enum {
> +	NVMF_TCP_SECTYPE_NONE	= 0,
> +	NVMF_TCP_SECTYPE_TLS	= 1,
> +};
> +
>   #define NVME_AQ_DEPTH		32
>   #define NVME_NR_AEN_COMMANDS	1
>   #define NVME_AQ_BLK_MQ_DEPTH	(NVME_AQ_DEPTH - NVME_NR_AEN_COMMANDS)
> @@ -1036,6 +1047,7 @@ enum nvme_admin_opcode {
>   	nvme_admin_sanitize_nvm		= 0x84,
>   	nvme_admin_get_lba_status	= 0x86,
>   	nvme_admin_vendor_start		= 0xC0,
> +	nvme_admin_dim			= 0x21,
>   };
>   
>   #define nvme_admin_opcode_name(opcode)	{ opcode, #opcode }
> @@ -1316,6 +1328,29 @@ struct nvmf_common_command {
>   	__u8	ts[24];
>   };
>   
> +/**
> + * union nvmf_tsas - Transport Specific Address Subtype
> + * @qptype:  Queue Pair Service Type (NVMF_RDMA_QPTYPE_CONNECTED, NVMF_RDMA_QPTYPE_DATAGRAM)
> + * @prtype:  Provider Type (see enum nvme_rdma_prtype)
> + * @cms:     Connection Management Service (NVMF_RDMA_CMS_RDMA_CM)
> + * @pkey:    Partition Key
> + * @sectype: Security Type (NVMF_TCP_SECTYPE_NONE, NVMF_TCP_SECTYPE_TLS)
> + */
> +union nvmf_tsas {
> +	char		common[NVMF_TSAS_SIZE];
> +	struct rdma {
> +		__u8	qptype;
> +		__u8	prtype;
> +		__u8	cms;
> +		__u8	rsvd3[5];
> +		__u16	pkey;
> +		__u8	rsvd10[246];
> +	} rdma;
> +	struct tcp {
> +		__u8	sectype;
> +	} tcp;
> +};
> +
>   /*
>    * The legal cntlid range a NVMe Target will provide.
>    * Note that cntlid of value 0 is considered illegal in the fabrics world.
> @@ -1349,17 +1384,7 @@ struct nvmf_disc_rsp_page_entry {
>   	__u8		resv64[192];
>   	char		subnqn[NVMF_NQN_FIELD_LEN];
>   	char		traddr[NVMF_TRADDR_SIZE];
> -	union tsas {
> -		char		common[NVMF_TSAS_SIZE];
> -		struct rdma {
> -			__u8	qptype;
> -			__u8	prtype;
> -			__u8	cms;
> -			__u8	resv3[5];
> -			__u16	pkey;
> -			__u8	resv10[246];
> -		} rdma;
> -	} tsas;
> +	union nvmf_tsas tsas;
>   };
>   
>   /* Discovery log page header */
> @@ -1447,6 +1472,213 @@ struct streams_directive_params {
>   	__u8	rsvd2[6];
>   };
>   
> +/**
> + * Discovery Information Management (DIM) command. This is sent by a
> + * host to the Central Discovery Controller (CDC) to perform
> + * explicit registration.
> + */
> +enum nvmf_dim_tas {
> +	NVME_TAS_REGISTER   = 0x00,
> +	NVME_TAS_DEREGISTER = 0x01,
> +	NVME_TAS_UPDATE     = 0x02,
> +};
> +
> +struct nvmf_dim_command {
> +	__u8                  opcode;     /* nvme_admin_dim */
> +	__u8                  flags;
> +	__u16                 command_id;
> +	__le32                nsid;
> +	__u64                 rsvd2[2];
> +	union nvme_data_ptr   dptr;
> +	__le32                cdw10;      /* enum nvmf_dim_tas */
> +	__le32                cdw11;
> +	__le32                cdw12;
> +	__le32                cdw13;
> +	__le32                cdw14;
> +	__le32                cdw15;
> +};
> +
> +#define NVMF_ENAME_LEN    256
> +#define NVMF_EVER_LEN     64
> +
> +enum nvmf_dim_entfmt {
> +	NVME_ENTFMT_BASIC    = 0x01,
> +	NVME_ENTFMT_EXTENDED = 0x02,
> +};
> +
> +enum nvmf_dim_etype {
> +	NVME_REGISTRATION_HOST = 0x01,
> +	NVME_REGISTRATION_DDC  = 0x02,
> +	NVME_REGISTRATION_CDC  = 0x03,
> +};
> +
> +enum nvmf_exattype {
> +	NVME_EXATTYPE_HOSTID   = 0x01, /* Host Identifier */
> +	NVME_EXATTYPE_SYMNAME  = 0x02, /* Symbolic Name */
> +};
> +
> +/**
> + * struct nvmf_ext_attr - Extended Attribute (EXAT)
> + *
> + *       Bytes       Description
> + * @type 01:00       Extended Attribute Type (EXATTYPE) - enum nvmf_exattype
> + * @len  03:02       Extended Attribute Length (EXATLEN)
> + * @val  (EXATLEN-1) Extended Attribute Value (EXATVAL) - size allocated
> + *       +4:04       for array must be a multiple of 4 bytes
> + */
> +struct nvmf_ext_attr {
> +	__le16			type;
> +	__le16			len;
> +	__u8			val[];
> +};
> +#define SIZEOF_EXT_EXAT_WO_VAL offsetof(struct nvmf_ext_attr, val)
> +
> +/**
> + * struct nvmf_ext_die - Extended Discovery Information Entry (DIE)
> + *
> + *            Bytes           Description
> + * @trtype    00              Transport Type (enum nvme_trtype)
> + * @adrfam    01              Address Family (enum nvmf_addr_family)
> + * @subtype   02              Subsystem Type (enum nvme_subsys_type)
> + * @treq      03              Transport Requirements (enum nvmf_treq)
> + * @portid    05:04           Port ID
> + * @cntlid    07:06           Controller ID
> + * @asqsz     09:08           Admin Max SQ Size
> + *            31:10           Reserved
> + * @trsvcid   63:32           Transport Service Identifier
> + *            255:64          Reserved
> + * @nqn       511:256         NVM Qualified Name
> + * @traddr    767:512         Transport Address
> + * @tsas      1023:768        Transport Specific Address Subtype
> + * @tel       1027:1024       Total Entry Length
> + * @numexat   1029:1028       Number of Extended Attributes
> + *            1031:1030       Reserved
> + * @exat[0]   ((EXATLEN-1)+   Extended Attributes 0 (see @numexat)
> + *            4)+1032:1032    Cannot access as an array because each EXAT
> + *                            entry has an undetermined size.
> + * @exat[N]   TEL-1:TEL-      Extended Attributes N (w/ N = NUMEXAT-1)
> + *            (EXATLEN+4)
> + */
> +struct nvmf_ext_die {
> +	__u8			trtype;
> +	__u8			adrfam;
> +	__u8			subtype;
> +	__u8			treq;
> +	__le16			portid;
> +	__le16			cntlid;
> +	__le16			asqsz;
> +	__u8			resv10[22];
> +	char			trsvcid[NVMF_TRSVCID_SIZE];
> +	__u8			resv64[192];
> +	char			nqn[NVMF_NQN_FIELD_LEN];
> +	char			traddr[NVMF_TRADDR_SIZE];
> +	union nvmf_tsas		tsas;
> +	__le32			tel;
> +	__le16			numexat;
> +	__u16			resv1030;
> +	struct nvmf_ext_attr	exat;
> +};
> +#define SIZEOF_EXT_DIE_WO_EXAT offsetof(struct nvmf_ext_die, exat)
> +
> +/**
> + * union nvmf_die - Discovery Information Entry (DIE)
> + *
> + * Depending on the ENTFMT specified in the DIM, DIEs can be entered with
> + * the Basic or Extended formats. For Basic format, each entry has a fixed
> + * length. Therefore, the "basic" field defined below can be accessed as a
> + * C array. For the Extended format, however, each entry is of variable
> + * length (TEL). Therefore, the "extended" field defined below cannot be
> + * accessed as a C array. Instead, the "extended" field is akin to a
> + * linked-list, where one can "walk" through the list. To move to the next
> + * entry, one simply adds the current entry's length (TEL) to the "walk"
> + * pointer. The number of entries in the list is specified by NUMENT.
> + * Although extended entries are of variable length (TEL), TEL is
> + * always a multiple of 4 bytes.
> + */
> +union nvmf_die {
> +	struct nvmf_disc_rsp_page_entry	basic[0];
> +	struct nvmf_ext_die		extended;
> +};
> +
> +/**
> + * struct nvmf_dim_data - Discovery Information Management (DIM) - Data
> + *
> + *            Bytes       Description
> + * @tdl       03:00       Total Data Length
> + *            07:04       Reserved
> + * @nument    15:08       Number of entries
> + * @entfmt    17:16       Entry Format (enum nvmf_dim_entfmt)
> + * @etype     19:18       Entity Type (enum nvmf_dim_etype)
> + * @portlcl   20          Port Local
> + *            21          Reserved
> + * @ektype    23:22       Entry Key Type
> + * @eid       279:24      Entity Identifier (e.g. Host NQN)
> + * @ename     535:280     Entity Name (e.g. hostname)
> + * @ever      599:536     Entity Version (e.g. OS Name/Version)
> + *            1023:600    Reserved
> + * @die       TDL-1:1024  Discovery Information Entry (see @nument above)
> + */
> +struct nvmf_dim_data {
> +	__le32			tdl;
> +	__u32			rsvd4;
> +	__le64			nument;
> +	__le16			entfmt;
> +	__le16			etype;
> +	__u8			portlcl;
> +	__u8			rsvd21;
> +	__le16			ektype;
> +	char			eid[NVMF_NQN_FIELD_LEN];
> +	char			ename[NVMF_ENAME_LEN];
> +	char			ever[NVMF_EVER_LEN];
> +	__u8			rsvd600[424];
> +	union nvmf_die		die;
> +};
> +#define SIZEOF_DIM_WO_DIE offsetof(struct nvmf_dim_data, die)
> +
> +/**
> + * exat_len() - Return the size in bytes, rounded to a multiple of 4 (i.e.
> + *         size of u32), of the buffer needed to hold the exat value of
> + *         size @val_len.
> + */
> +static inline u16 exat_len(size_t val_len)
> +{
> +	return (u16)round_up(val_len, sizeof(u32));
> +}
> +
> +/**
> + * exat_size - return the size of the "struct nvmf_ext_attr"
> + *   needed to hold a value of size @val_len.
> + *
> + * @val_len: This is the length of the data to be copied to the "val"
> + *         field of a "struct nvmf_ext_attr".
> + *
> + * @return The size in bytes, rounded to a multiple of 4 (i.e. size of
> + *         u32), of the "struct nvmf_ext_attr" required to hold a value of
> + *         length @val_len.
> + */
> +static inline u16 exat_size(size_t val_len)
> +{
> +	return (u16)(SIZEOF_EXT_EXAT_WO_VAL + exat_len(val_len));
> +}
> +
> +/**
> + * exat_ptr_next - Increment @p to the next element in the array. Extended
> + *   attributes are saved to an array of "struct nvmf_ext_attr" where each
> + *   element of the array is of variable size. In order to move to the next
> + *   element in the array one must increment the pointer to the current
> + *   element (@p) by the size of the current element.
> + *
> + * @p: Pointer to an element of an array of "struct nvmf_ext_attr".
> + *
> + * @return Pointer to the next element in the array.
> + */
> +static inline struct nvmf_ext_attr *exat_ptr_next(struct nvmf_ext_attr *p)
> +{
> +	return (struct nvmf_ext_attr *)
> +		((uintptr_t)p + (ptrdiff_t)exat_size(le16_to_cpu(p->len)));
> +}
> +
> +
>   struct nvme_command {
>   	union {
>   		struct nvme_common_command common;
> @@ -1470,6 +1702,7 @@ struct nvme_command {
>   		struct nvmf_property_get_command prop_get;
>   		struct nvme_dbbuf dbbuf;
>   		struct nvme_directive_cmd directive;
> +		struct nvmf_dim_command dim;
>   	};
>   };
>   
I'd rather like to have the TLS bits in a separate patch; they are not 
strictly related to this issue.

And I have a rather general issue with this: Is it required to issue an 
explicit registration directly after sending the 'connect' command?
I.e. do we need to have it as part of the 'connect' routine?

Thing is, the Linux 'connect' routine is doing several things in one go
(like establishing a transport association for the admin queue, sending
the 'connect' command for the admin queue, creating transport
associations for I/O queues and sending 'connect' commands on all I/O
queues, _and_ possibly doing authentication, too).
If we now start overloading it with yet more functionality it becomes
extremely hard to clean up after an error, and that doesn't even touch
on the non-existent error reporting to userspace.

Wouldn't it make more sense to delegate explicit registration to 
userspace (i.e. libnvme/nvme-cli), and leave the kernel out of it?

Cheers,

Hannes
-- 
Dr. Hannes Reinecke		           Kernel Storage Architect
hare@suse.de			                  +49 911 74053 688
SUSE Software Solutions Germany GmbH, Maxfeldstr. 5, 90409 Nürnberg
HRB 36809 (AG Nürnberg), GF: Felix Imendörffer


^ permalink raw reply	[flat|nested] 32+ messages in thread
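
A small standalone sketch may help readers follow the variable-length
walk described in the union nvmf_die comment and implemented by
exat_ptr_next() above. This is not the kernel code: it uses plain
fixed-width types, assumes a little-endian host (the patch uses __le16
and le16_to_cpu()), and the ext_attr/ext_attr_next names are invented
for the example.

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct ext_attr {                   /* mirrors struct nvmf_ext_attr */
	uint16_t type;
	uint16_t len;               /* EXATLEN, padded to a 4-byte multiple */
	uint8_t  val[];
};

/* header size plus padded value size, as in exat_size()/exat_len() */
static uint16_t ext_attr_size(uint16_t val_len)
{
	return (uint16_t)(offsetof(struct ext_attr, val) + val_len);
}

/* step over the current entry, as in exat_ptr_next() */
static struct ext_attr *ext_attr_next(struct ext_attr *p)
{
	return (struct ext_attr *)((uint8_t *)p + ext_attr_size(p->len));
}

int main(void)
{
	uint32_t backing[16] = {0};     /* 4-byte aligned scratch buffer */
	uint8_t *buf = (uint8_t *)backing;
	struct ext_attr *e = (struct ext_attr *)buf;

	e->type = 0x01;                 /* NVME_EXATTYPE_HOSTID */
	e->len  = 16;                   /* sizeof(uuid_t), a multiple of 4 */
	e = ext_attr_next(e);
	e->type = 0x02;                 /* NVME_EXATTYPE_SYMNAME */
	e->len  = 8;
	memcpy(e->val, "host01  ", 8);  /* space padded per the base spec */

	e = (struct ext_attr *)buf;
	for (int i = 0; i < 2; i++, e = ext_attr_next(e))
		printf("exat %d: type=0x%02x len=%u\n",
		       i, (unsigned)e->type, (unsigned)e->len);
	return 0;
}

Each step advances the cursor by the current entry's header plus its
padded value length, which is exactly why the extended entries cannot
be indexed as a plain C array.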

* Re: [PATCH 4/4] nvme-fabrics: add explicit deregistration on disconnect.
  2022-01-25 14:59 ` [PATCH 4/4] nvme-fabrics: add explicit deregistration on disconnect Martin Belanger
@ 2022-01-27  9:31   ` Hannes Reinecke
  2022-01-27 13:14     ` Sagi Grimberg
  0 siblings, 1 reply; 32+ messages in thread
From: Hannes Reinecke @ 2022-01-27  9:31 UTC (permalink / raw)
  To: Martin Belanger, linux-nvme; +Cc: kbusch, axboe, hch, sagi, Martin Belanger

On 1/25/22 15:59, Martin Belanger wrote:
> From: Martin Belanger <martin.belanger@dell.com>
> 
> Per TP8010, a host explicitly registered with a Discovery
> Controller (DC) must deregister on disconnect. This patch
> sends a DIM PDU to DCs to deregister the host on disconnect.
> 
> Signed-off-by: Martin Belanger <martin.belanger@dell.com>
> ---
>   drivers/nvme/host/tcp.c | 10 ++++++++++
>   1 file changed, 10 insertions(+)
> 
> diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
> index 970b7df574a4..804d927b4b81 100644
> --- a/drivers/nvme/host/tcp.c
> +++ b/drivers/nvme/host/tcp.c
> @@ -2176,6 +2176,16 @@ static void nvme_tcp_teardown_ctrl(struct nvme_ctrl *ctrl, bool shutdown)
>   
>   static void nvme_tcp_delete_ctrl(struct nvme_ctrl *ctrl)
>   {
> +	if (ctrl->opts->explicit_registration &&
> +	    ((ctrl->dctype == NVME_DCTYPE_DDC) ||
> +	     (ctrl->dctype == NVME_DCTYPE_CDC))) {
> +		/* Deregister from discovery controller */
> +		union nvmf_tsas tsas = {.tcp.sectype = NVMF_TCP_SECTYPE_NONE};
> +
> +		nvmf_disc_info_mgmt(ctrl, NVME_TAS_DEREGISTER, NVMF_TRTYPE_TCP,
> +				    nvme_get_adrfam(ctrl), "", &tsas);
> +	}
> +
>   	nvme_tcp_teardown_ctrl(ctrl, true);
>   }
>   
Similar argument to the previous patch: Does this need to be in the 
kernel? Can't we delegate this to userspace?

And, more importantly: What happens if the host does _not_ deregister on 
disconnect?

Cheers,

Hannes
-- 
Dr. Hannes Reinecke		           Kernel Storage Architect
hare@suse.de			                  +49 911 74053 688
SUSE Software Solutions Germany GmbH, Maxfeldstr. 5, 90409 Nürnberg
HRB 36809 (AG Nürnberg), GF: Felix Imendörffer


^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH 3/4] nvme-fabrics: add tp8010 support
  2022-01-27  9:30   ` Hannes Reinecke
@ 2022-01-27 13:12     ` Sagi Grimberg
  2022-01-27 13:30       ` Belanger, Martin
  0 siblings, 1 reply; 32+ messages in thread
From: Sagi Grimberg @ 2022-01-27 13:12 UTC (permalink / raw)
  To: Hannes Reinecke, Martin Belanger, linux-nvme
  Cc: kbusch, axboe, hch, Martin Belanger



On 1/27/22 11:30, Hannes Reinecke wrote:
> On 1/25/22 15:59, Martin Belanger wrote:
>> From: Martin Belanger <martin.belanger@dell.com>
>>
>> TP8010 introduces Central Discovery Controllers (CDC) and the
>> Discovery Information Management (DIM) PDU, which is used to
>> perform explicit registration with Discovery Controllers.
>>
>> This patch defines the DIM PDU and adds logic to perform explicit
>> registration with DCs when registration has been requested by the
>> user.
>>
>> Signed-off-by: Martin Belanger <martin.belanger@dell.com>
>> ---
>>   drivers/nvme/host/fabrics.c | 164 +++++++++++++++++++++++
>>   drivers/nvme/host/fabrics.h |   9 ++
>>   drivers/nvme/host/tcp.c     |  51 +++++++-
>>   include/linux/nvme.h        | 255 ++++++++++++++++++++++++++++++++++--
>>   4 files changed, 467 insertions(+), 12 deletions(-)
>>
>> diff --git a/drivers/nvme/host/fabrics.c b/drivers/nvme/host/fabrics.c
>> index 040a6cce6afa..e6fc2f85b670 100644
>> --- a/drivers/nvme/host/fabrics.c
>> +++ b/drivers/nvme/host/fabrics.c
>> @@ -10,6 +10,7 @@
>>   #include <linux/mutex.h>
>>   #include <linux/parser.h>
>>   #include <linux/seq_file.h>
>> +#include <linux/utsname.h>
>>   #include "nvme.h"
>>   #include "fabrics.h"
>> @@ -526,6 +527,165 @@ static struct nvmf_transport_ops 
>> *nvmf_lookup_transport(
>>       return NULL;
>>   }
>> +/**
>> + * nvmf_get_tel() - Calculate the amount of memory needed for a 
>> Discovery
>> + * Information Entry. Each Entry must contain at a minimum an Extended
>> + * Attribute for the HostID. The Entry may optionally contain an 
>> Extended
>> + * Attribute for the Symbolic Name.
>> + *
>> + * @hostsymname - Symbolic name (may be NULL)
>> + *
>> + * Return Total Entry Length
>> + */
>> +static u32 nvmf_get_tel(const char *hostsymname)
>> +{
>> +    u32 tel = SIZEOF_EXT_DIE_WO_EXAT;
>> +    u16 len;
>> +
>> +    /* Host ID is mandatory */
>> +    tel += exat_size(sizeof(uuid_t));
>> +
>> +    /* Symbolic name is optional */
>> +    len = hostsymname ? strlen(hostsymname) : 0;
>> +    if (len)
>> +        tel += exat_size(len);
>> +
>> +    return tel;
>> +}
>> +
>> +/**
>> + * nvmf_fill_die() - Fill the Discovery Information Entry pointed to by
>> + *                   @die.
>> + *
>> + * @die - Pointer to Discovery Information Entry to be filled
>> + * @tel - Length of the DIE
>> + * @trtype - Transport type
>> + * @adrfam - Address family
>> + * @reg_addr - Address to register. Setting this to an empty string 
>> tells
>> + *       the DC to infer address from the source address of the socket.
>> + * @nqn - Host NQN
>> + * @hostid - Host ID
>> + * @hostsymname - Host symbolic name
>> + * @tsas - Transport Specific Address Subtype for the address being
>> + *       registered.
>> + */
>> +static void nvmf_fill_die(struct nvmf_ext_die  *die,
>> +              u32                   tel,
>> +              u8                    trtype,
>> +              u8                    adrfam,
>> +              const char           *reg_addr,
>> +              struct nvmf_host     *host,
>> +              union nvmf_tsas      *tsas)
>> +{
>> +    u16 numexat = 0;
>> +    size_t symname_len;
>> +    struct nvmf_ext_attr *exat;
>> +
>> +    die->tel = cpu_to_le32(tel);
>> +    die->trtype = trtype;
>> +    die->adrfam = adrfam;
>> +
>> +    strncpy(die->nqn, host->nqn, sizeof(die->nqn));
>> +    strncpy(die->traddr, reg_addr, sizeof(die->traddr));
>> +    memcpy(&die->tsas, tsas, sizeof(die->tsas));
>> +
>> +    /* Extended Attribute for the HostID (mandatory) */
>> +    numexat++;
>> +    exat = &die->exat;
>> +    exat->type = cpu_to_le16(NVME_EXATTYPE_HOSTID);
>> +    exat->len  = cpu_to_le16(exat_len(sizeof(host->id)));
>> +    uuid_copy((uuid_t *)exat->val, &host->id);
>> +
>> +    /* Extended Attribute for the Symbolic Name (optional) */
>> +    symname_len = strlen(host->symname);
>> +    if (symname_len) {
>> +        u16 exatlen = exat_len(symname_len);
>> +
>> +        numexat++;
>> +        exat = exat_ptr_next(exat);
>> +        exat->type = cpu_to_le16(NVME_EXATTYPE_SYMNAME);
>> +        exat->len  = cpu_to_le16(exatlen);
>> +        memcpy(exat->val, host->symname, symname_len);
>> +        /* Per Base specs, ASCII strings must be padded with spaces */
>> +        memset(&exat->val[symname_len], ' ', exatlen - symname_len);
>> +    }
>> +
>> +    die->numexat = cpu_to_le16(numexat);
>> +}
>> +
>> +/**
>> + * nvmf_disc_info_mgmt() - Perform explicit registration, 
>> deregistration,
>> + * or registration-update (specified by @tas) by sending a Discovery
>> + * Information Management (DIM) command to the Discovery Controller 
>> (DC).
>> + *
>> + * @ctrl: Host NVMe controller instance maintaining the admin queue 
>> used to
>> + *      submit the DIM command to the DC.
>> + * @tas: Task field of the Command Dword 10 (cdw10). Indicates 
>> whether to
>> + *     perform a Registration, Deregistration, or Registration-update.
>> + * @trtype: Transport type (e.g. NVMF_TRTYPE_TCP)
>> + * @adrfam: Address family (e.g. NVMF_ADDR_FAMILY_IP6)
>> + * @reg_addr - Address to register. Setting this to an empty string 
>> tells
>> + *       the DC to infer address from the source address of the socket.
>> + * @tsas - Transport Specific Address Subtype for the address being
>> + *       registered.
>> + *
>> + * Return:   0: success,
>> + *         > 0: NVMe error status code,
>> + *         < 0: Linux errno error code
>> + */
>> +int nvmf_disc_info_mgmt(struct nvme_ctrl *ctrl,
>> +            enum nvmf_dim_tas tas,
>> +            u8                trtype,
>> +            u8                adrfam,
>> +            const char       *reg_addr,
>> +            union nvmf_tsas  *tsas)
>> +{
>> +    int                   ret;
>> +    u32                   tdl;  /* Total Data Length */
>> +    u32                   tel;  /* Total Entry Length */
>> +    struct nvmf_dim_data *dim;  /* Discovery Information Management */
>> +    struct nvmf_ext_die  *die;  /* Discovery Information Entry */
>> +    struct nvme_command   cmd = {};
>> +    union nvme_result     result;
>> +
>> +    /* Register 1 Discovery Information Entry (DIE) of size TEL */
>> +    tel = nvmf_get_tel(ctrl->opts->host->symname);
>> +    tdl = SIZEOF_DIM_WO_DIE + tel;
>> +
>> +    dim = kzalloc(tdl, GFP_KERNEL);
>> +    if (!dim)
>> +        return -ENOMEM;
>> +
>> +    cmd.dim.opcode = nvme_admin_dim;
>> +    cmd.dim.cdw10  = cpu_to_le32(tas);
>> +
>> +    dim->tdl    = cpu_to_le32(tdl);
>> +    dim->nument = cpu_to_le64(1);    /* only 1 DIE to register */
>> +    dim->entfmt = cpu_to_le16(NVME_ENTFMT_EXTENDED);
>> +    dim->etype  = cpu_to_le16(NVME_REGISTRATION_HOST);
>> +    dim->ektype = cpu_to_le16(0x5F); /* must be 0x5F per specs */
>> +
>> +    strncpy(dim->eid, ctrl->opts->host->nqn, sizeof(dim->eid));
>> +    strncpy(dim->ename, utsname()->nodename, sizeof(dim->ename));
>> +    strncpy(dim->ever, utsname()->release, sizeof(dim->ever));
>> +
>> +    die = &dim->die.extended;
>> +    nvmf_fill_die(die, tel, trtype, adrfam, reg_addr,
>> +              ctrl->opts->host, tsas);
>> +
>> +    ret = __nvme_submit_sync_cmd(ctrl->fabrics_q, &cmd, &result,
>> +                     dim, tdl, 0, NVME_QID_ANY, 1, 0);
>> +    if (unlikely(ret)) {
>> +        dev_err(ctrl->device, "DIM error Result:0x%04X Status:0x%04X\n",
>> +           le32_to_cpu(result.u32), ret);
>> +    }
>> +
>> +    kfree(dim);
>> +
>> +    return ret;
>> +}
>> +EXPORT_SYMBOL_GPL(nvmf_disc_info_mgmt);
>> +
>>   static const match_table_t opt_tokens = {
>>       { NVMF_OPT_TRANSPORT,        "transport=%s"        },
>>       { NVMF_OPT_TRADDR,        "traddr=%s"        },
>> @@ -550,6 +710,7 @@ static const match_table_t opt_tokens = {
>>       { NVMF_OPT_TOS,            "tos=%d"        },
>>       { NVMF_OPT_FAIL_FAST_TMO,    "fast_io_fail_tmo=%d"    },
>>       { NVMF_OPT_DISCOVERY,        "discovery"        },
>> +    { NVMF_OPT_REGISTER,        "register"        },
>>       { NVMF_OPT_ERR,            NULL            }
>>   };
>> @@ -831,6 +992,9 @@ static int nvmf_parse_options(struct 
>> nvmf_ctrl_options *opts,
>>           case NVMF_OPT_DISCOVERY:
>>               opts->discovery_nqn = true;
>>               break;
>> +        case NVMF_OPT_REGISTER:
>> +            opts->explicit_registration = true;
>> +            break;
>>           case NVMF_OPT_SYMNAME:
>>               hostsymname = match_strdup(args);
>>               if (!hostsymname) {
>> diff --git a/drivers/nvme/host/fabrics.h b/drivers/nvme/host/fabrics.h
>> index 494e6dbe233a..77f5768f8ff4 100644
>> --- a/drivers/nvme/host/fabrics.h
>> +++ b/drivers/nvme/host/fabrics.h
>> @@ -70,6 +70,7 @@ enum {
>>       NVMF_OPT_HOST_IFACE    = 1 << 21,
>>       NVMF_OPT_DISCOVERY    = 1 << 22,
>>       NVMF_OPT_SYMNAME    = 1 << 23,
>> +    NVMF_OPT_REGISTER    = 1 << 24,
>>   };
>>   /**
>> @@ -106,6 +107,7 @@ enum {
>>    * @nr_poll_queues: number of queues for polling I/O
>>    * @tos: type of service
>>    * @fast_io_fail_tmo: Fast I/O fail timeout in seconds
>> + * @explicit_registration: Explicit Registration with DC using the 
>> DIM PDU
>>    */
>>   struct nvmf_ctrl_options {
>>       unsigned        mask;
>> @@ -130,6 +132,7 @@ struct nvmf_ctrl_options {
>>       unsigned int        nr_poll_queues;
>>       int            tos;
>>       int            fast_io_fail_tmo;
>> +    bool            explicit_registration;
>>   };
>>   /*
>> @@ -200,5 +203,11 @@ int nvmf_get_address(struct nvme_ctrl *ctrl, char 
>> *buf, int size);
>>   bool nvmf_should_reconnect(struct nvme_ctrl *ctrl);
>>   bool nvmf_ip_options_match(struct nvme_ctrl *ctrl,
>>           struct nvmf_ctrl_options *opts);
>> +int nvmf_disc_info_mgmt(struct nvme_ctrl *ctrl,
>> +            enum nvmf_dim_tas tas,
>> +            u8                trtype,
>> +            u8                adrfam,
>> +            const char       *reg_addr,
>> +            union nvmf_tsas  *tsas);
>>   #endif /* _NVME_FABRICS_H */
>> diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
>> index 4ceb28675fdf..970b7df574a4 100644
>> --- a/drivers/nvme/host/tcp.c
>> +++ b/drivers/nvme/host/tcp.c
>> @@ -1993,6 +1993,40 @@ static void nvme_tcp_reconnect_or_remove(struct 
>> nvme_ctrl *ctrl)
>>       }
>>   }
>> +/**
>> + * nvme_get_adrfam() - Get address family for the address we're 
>> registering
>> + * with the DC. We retrieve this info from the socket itself. If we 
>> can't
>> + * get the source address from the socket, then we'll infer the address
>> + * family from the address of the DC since the DC address has the same
>> + * address family.
>> + *
>> + * @ctrl: Host NVMe controller instance maintaining the admin queue 
>> used to
>> + *      submit the DIM command to the DC.
>> + *
>> + * Return: The address family of the source address associated with the
>> + * socket connected to the DC.
>> + */
>> +static u8 nvme_get_adrfam(struct nvme_ctrl *ctrl)
>> +{
>> +    u8                       adrfam = NVMF_ADDR_FAMILY_IP4;
>> +    struct sockaddr_storage  sas;
>> +    struct sockaddr         *sa = (struct sockaddr *)&sas;
>> +    struct nvme_tcp_queue   *queue = &to_tcp_ctrl(ctrl)->queues[0];
>> +
>> +    if (kernel_getsockname(queue->sock, sa) == 0) {
>> +        if (sa->sa_family == AF_INET6)
>> +            adrfam = NVMF_ADDR_FAMILY_IP6;
>> +    } else {
>> +        unsigned char buf[sizeof(struct in6_addr)];
>> +
>> +        if (in6_pton(ctrl->opts->traddr,
>> +                 strlen(ctrl->opts->traddr), buf, -1, NULL) > 0)
>> +            adrfam = NVMF_ADDR_FAMILY_IP6;
>> +    }
>> +
>> +    return adrfam;
>> +}
>> +
>>   static int nvme_tcp_setup_ctrl(struct nvme_ctrl *ctrl, bool new)
>>   {
>>       struct nvmf_ctrl_options *opts = ctrl->opts;
>> @@ -2045,6 +2079,20 @@ static int nvme_tcp_setup_ctrl(struct nvme_ctrl 
>> *ctrl, bool new)
>>           goto destroy_io;
>>       }
>> +    if (ctrl->opts->explicit_registration &&
>> +        ((ctrl->dctype == NVME_DCTYPE_DDC) ||
>> +         (ctrl->dctype == NVME_DCTYPE_CDC))) {
>> +        /* We're registering our source address with the DC. To do
>> +         * that, we can simply send an empty string. This tells the DC
>> +         * to retrieve the source address from the socket and use that
>> +         * as the registration address.
>> +         */
>> +        union nvmf_tsas tsas = {.tcp.sectype = NVMF_TCP_SECTYPE_NONE};
>> +
>> +        nvmf_disc_info_mgmt(ctrl, NVME_TAS_REGISTER, NVMF_TRTYPE_TCP,
>> +                    nvme_get_adrfam(ctrl), "", &tsas);
>> +    }
>> +
>>       nvme_start_ctrl(ctrl);
>>       return 0;
>> @@ -2613,7 +2661,8 @@ static struct nvmf_transport_ops 
>> nvme_tcp_transport = {
>>                 NVMF_OPT_HOST_TRADDR | NVMF_OPT_CTRL_LOSS_TMO |
>>                 NVMF_OPT_HDR_DIGEST | NVMF_OPT_DATA_DIGEST |
>>                 NVMF_OPT_NR_WRITE_QUEUES | NVMF_OPT_NR_POLL_QUEUES |
>> -              NVMF_OPT_TOS | NVMF_OPT_HOST_IFACE,
>> +              NVMF_OPT_TOS | NVMF_OPT_HOST_IFACE |
>> +              NVMF_OPT_REGISTER,
>>       .create_ctrl    = nvme_tcp_create_ctrl,
>>   };
>> diff --git a/include/linux/nvme.h b/include/linux/nvme.h
>> index 3b47951342f4..891615876cfa 100644
>> --- a/include/linux/nvme.h
>> +++ b/include/linux/nvme.h
>> @@ -9,6 +9,7 @@
>>   #include <linux/types.h>
>>   #include <linux/uuid.h>
>> +#include <linux/kernel.h>
>>   /* NQN names in commands fields specified one size */
>>   #define NVMF_NQN_FIELD_LEN    256
>> @@ -102,6 +103,16 @@ enum {
>>       NVMF_RDMA_CMS_RDMA_CM    = 1, /* Sockets based endpoint 
>> addressing */
>>   };
>> +/**
>> + * enum - TCP Security Type (SECTYPE)
>> + * @NVMF_TCP_SECTYPE_NONE: No Security
>> + * @NVMF_TCP_SECTYPE_TLS:  Transport Layer Security
>> + */
>> +enum {
>> +    NVMF_TCP_SECTYPE_NONE    = 0,
>> +    NVMF_TCP_SECTYPE_TLS    = 1,
>> +};
>> +
>>   #define NVME_AQ_DEPTH        32
>>   #define NVME_NR_AEN_COMMANDS    1
>>   #define NVME_AQ_BLK_MQ_DEPTH    (NVME_AQ_DEPTH - NVME_NR_AEN_COMMANDS)
>> @@ -1036,6 +1047,7 @@ enum nvme_admin_opcode {
>>       nvme_admin_sanitize_nvm        = 0x84,
>>       nvme_admin_get_lba_status    = 0x86,
>>       nvme_admin_vendor_start        = 0xC0,
>> +    nvme_admin_dim            = 0x21,
>>   };
>>   #define nvme_admin_opcode_name(opcode)    { opcode, #opcode }
>> @@ -1316,6 +1328,29 @@ struct nvmf_common_command {
>>       __u8    ts[24];
>>   };
>> +/**
>> + * union nvmf_tsas - Transport Specific Address Subtype
>> + * @qptype:  Queue Pair Service Type (NVMF_RDMA_QPTYPE_CONNECTED, 
>> NVMF_RDMA_QPTYPE_DATAGRAM)
>> + * @prtype:  Provider Type (see enum nvme_rdma_prtype)
>> + * @cms:     Connection Management Service (NVMF_RDMA_CMS_RDMA_CM)
>> + * @pkey:    Partition Key
>> + * @sectype: Security Type (NVMF_TCP_SECTYPE_NONE, NVMF_TCP_SECTYPE_TLS)
>> + */
>> +union nvmf_tsas {
>> +    char        common[NVMF_TSAS_SIZE];
>> +    struct rdma {
>> +        __u8    qptype;
>> +        __u8    prtype;
>> +        __u8    cms;
>> +        __u8    rsvd3[5];
>> +        __u16    pkey;
>> +        __u8    rsvd10[246];
>> +    } rdma;
>> +    struct tcp {
>> +        __u8    sectype;
>> +    } tcp;
>> +};
>> +
>>   /*
>>    * The legal cntlid range a NVMe Target will provide.
>>    * Note that cntlid of value 0 is considered illegal in the fabrics 
>> world.
>> @@ -1349,17 +1384,7 @@ struct nvmf_disc_rsp_page_entry {
>>       __u8        resv64[192];
>>       char        subnqn[NVMF_NQN_FIELD_LEN];
>>       char        traddr[NVMF_TRADDR_SIZE];
>> -    union tsas {
>> -        char        common[NVMF_TSAS_SIZE];
>> -        struct rdma {
>> -            __u8    qptype;
>> -            __u8    prtype;
>> -            __u8    cms;
>> -            __u8    resv3[5];
>> -            __u16    pkey;
>> -            __u8    resv10[246];
>> -        } rdma;
>> -    } tsas;
>> +    union nvmf_tsas tsas;
>>   };
>>   /* Discovery log page header */
>> @@ -1447,6 +1472,213 @@ struct streams_directive_params {
>>       __u8    rsvd2[6];
>>   };
>> +/**
>> + * Discovery Information Management (DIM) command. This is sent by a
>> + * host to the Central Discovery Controller (CDC) to perform
>> + * explicit registration.
>> + */
>> +enum nvmf_dim_tas {
>> +    NVME_TAS_REGISTER   = 0x00,
>> +    NVME_TAS_DEREGISTER = 0x01,
>> +    NVME_TAS_UPDATE     = 0x02,
>> +};
>> +
>> +struct nvmf_dim_command {
>> +    __u8                  opcode;     /* nvme_admin_dim */
>> +    __u8                  flags;
>> +    __u16                 command_id;
>> +    __le32                nsid;
>> +    __u64                 rsvd2[2];
>> +    union nvme_data_ptr   dptr;
>> +    __le32                cdw10;      /* enum nvmf_dim_tas */
>> +    __le32                cdw11;
>> +    __le32                cdw12;
>> +    __le32                cdw13;
>> +    __le32                cdw14;
>> +    __le32                cdw15;
>> +};
>> +
>> +#define NVMF_ENAME_LEN    256
>> +#define NVMF_EVER_LEN     64
>> +
>> +enum nvmf_dim_entfmt {
>> +    NVME_ENTFMT_BASIC    = 0x01,
>> +    NVME_ENTFMT_EXTENDED = 0x02,
>> +};
>> +
>> +enum nvmf_dim_etype {
>> +    NVME_REGISTRATION_HOST = 0x01,
>> +    NVME_REGISTRATION_DDC  = 0x02,
>> +    NVME_REGISTRATION_CDC  = 0x03,
>> +};
>> +
>> +enum nvmf_exattype {
>> +    NVME_EXATTYPE_HOSTID   = 0x01, /* Host Identifier */
>> +    NVME_EXATTYPE_SYMNAME  = 0x02, /* Symbolic Name */
>> +};
>> +
>> +/**
>> + * struct nvmf_ext_attr - Extended Attribute (EXAT)
>> + *
>> + *       Bytes       Description
>> + * @type 01:00       Extended Attribute Type (EXATTYPE) - enum 
>> nvmf_exattype
>> + * @len  03:02       Extended Attribute Length (EXATLEN)
>> + * @val  (EXATLEN-1) Extended Attribute Value (EXATVAL) - size allocated
>> + *       +4:04       for array must be a multiple of 4 bytes
>> + */
>> +struct nvmf_ext_attr {
>> +    __le16            type;
>> +    __le16            len;
>> +    __u8            val[];
>> +};
>> +#define SIZEOF_EXT_EXAT_WO_VAL offsetof(struct nvmf_ext_attr, val)
>> +
>> +/**
>> + * struct nvmf_ext_die - Extended Discovery Information Entry (DIE)
>> + *
>> + *            Bytes           Description
>> + * @trtype    00              Transport Type (enum nvme_trtype)
>> + * @adrfam    01              Address Family (enum nvmf_addr_family)
>> + * @subtype   02              Subsystem Type (enum nvme_subsys_type)
>> + * @treq      03              Transport Requirements (enum nvmf_treq)
>> + * @portid    05:04           Port ID
>> + * @cntlid    07:06           Controller ID
>> + * @asqsz     09:08           Admin Max SQ Size
>> + *            31:10           Reserved
>> + * @trsvcid   63:32           Transport Service Identifier
>> + *            255:64          Reserved
>> + * @nqn       511:256         NVM Qualified Name
>> + * @traddr    767:512         Transport Address
>> + * @tsas      1023:768        Transport Specific Address Subtype
>> + * @tel       1027:1024       Total Entry Length
>> + * @numexat   1029:1028       Number of Extended Attributes
>> + *            1031:1030       Reserved
>> + * @exat[0]   ((EXATLEN-1)+   Extended Attributes 0 (see @numexat)
>> + *            4)+1032:1032    Cannot access as an array because each 
>> EXAT
>> + *                            entry has an undetermined size.
>> + * @exat[N]   TEL-1:TEL-      Extended Attributes N (w/ N = NUMEXAT-1)
>> + *            (EXATLEN+4)
>> + */
>> +struct nvmf_ext_die {
>> +    __u8            trtype;
>> +    __u8            adrfam;
>> +    __u8            subtype;
>> +    __u8            treq;
>> +    __le16            portid;
>> +    __le16            cntlid;
>> +    __le16            asqsz;
>> +    __u8            resv10[22];
>> +    char            trsvcid[NVMF_TRSVCID_SIZE];
>> +    __u8            resv64[192];
>> +    char            nqn[NVMF_NQN_FIELD_LEN];
>> +    char            traddr[NVMF_TRADDR_SIZE];
>> +    union nvmf_tsas        tsas;
>> +    __le32            tel;
>> +    __le16            numexat;
>> +    __u16            resv1030;
>> +    struct nvmf_ext_attr    exat;
>> +};
>> +#define SIZEOF_EXT_DIE_WO_EXAT offsetof(struct nvmf_ext_die, exat)
>> +
>> +/**
>> + * union nvmf_die - Discovery Information Entry (DIE)
>> + *
>> + * Depending on the ENTFMT specified in the DIM, DIEs can be entered 
>> with
>> + * the Basic or Extended formats. For Basic format, each entry has a 
>> fixed
>> + * length. Therefore, the "basic" field defined below can be accessed 
>> as a
>> + * C array. For the Extended format, however, each entry is of variable
>> + * length (TEL). Therefore, the "extended" field defined below cannot be
>> + * accessed as a C array. Instead, the "extended" field is akin to a
>> + * linked-list, where one can "walk" through the list. To move to the 
>> next
>> + * entry, one simply adds the current entry's length (TEL) to the "walk"
>> + * pointer. The number of entries in the list is specified by NUMENT.
>> + * Although extended entries are of variable length (TEL), TEL is
>> + * always a multiple of 4 bytes.
>> + */
>> +union nvmf_die {
>> +    struct nvmf_disc_rsp_page_entry    basic[0];
>> +    struct nvmf_ext_die        extended;
>> +};
>> +
>> +/**
>> + * struct nvmf_dim_data - Discovery Information Management (DIM) - Data
>> + *
>> + *            Bytes       Description
>> + * @tdl       03:00       Total Data Length
>> + *            07:04       Reserved
>> + * @nument    15:08       Number of entries
>> + * @entfmt    17:16       Entry Format (enum nvmf_dim_entfmt)
>> + * @etype     19:18       Entity Type (enum nvmf_dim_etype)
>> + * @portlcl   20          Port Local
>> + *            21          Reserved
>> + * @ektype    23:22       Entry Key Type
>> + * @eid       279:24      Entity Identifier (e.g. Host NQN)
>> + * @ename     535:280     Entity Name (e.g. hostname)
>> + * @ever      599:536     Entity Version (e.g. OS Name/Version)
>> + *            1023:600    Reserved
>> + * @die       TDL-1:1024  Discovery Information Entry (see @nument 
>> above)
>> + */
>> +struct nvmf_dim_data {
>> +    __le32            tdl;
>> +    __u32            rsvd4;
>> +    __le64            nument;
>> +    __le16            entfmt;
>> +    __le16            etype;
>> +    __u8            portlcl;
>> +    __u8            rsvd21;
>> +    __le16            ektype;
>> +    char            eid[NVMF_NQN_FIELD_LEN];
>> +    char            ename[NVMF_ENAME_LEN];
>> +    char            ever[NVMF_EVER_LEN];
>> +    __u8            rsvd600[424];
>> +    union nvmf_die        die;
>> +};
>> +#define SIZEOF_DIM_WO_DIE offsetof(struct nvmf_dim_data, die)
>> +
>> +/**
>> + * exat_len() - Return the size in bytes, rounded to a multiple of 4 
>> (i.e.
>> + *         size of u32), of the buffer needed to hold the exat value of
>> + *         size @val_len.
>> + */
>> +static inline u16 exat_len(size_t val_len)
>> +{
>> +    return (u16)round_up(val_len, sizeof(u32));
>> +}
>> +
>> +/**
>> + * exat_size - return the size of the "struct nvmf_ext_attr"
>> + *   needed to hold a value of size @val_len.
>> + *
>> + * @val_len: This is the length of the data to be copied to the "val"
>> + *         field of a "struct nvmf_ext_attr".
>> + *
>> + * @return The size in bytes, rounded to a multiple of 4 (i.e. size of
>> + *         u32), of the "struct nvmf_ext_attr" required to hold a 
>> value of
>> + *         length @val_len.
>> + */
>> +static inline u16 exat_size(size_t val_len)
>> +{
>> +    return (u16)(SIZEOF_EXT_EXAT_WO_VAL + exat_len(val_len));
>> +}
>> +
>> +/**
>> + * exat_ptr_next - Increment @p to the next element in the array. 
>> Extended
>> + *   attributes are saved to an array of "struct nvmf_ext_attr" where 
>> each
>> + *   element of the array is of variable size. In order to move to 
>> the next
>> + *   element in the array one must increment the pointer to the current
>> + *   element (@p) by the size of the current element.
>> + *
>> + * @p: Pointer to an element of an array of "struct nvmf_ext_attr".
>> + *
>> + * @return Pointer to the next element in the array.
>> + */
>> +static inline struct nvmf_ext_attr *exat_ptr_next(struct 
>> nvmf_ext_attr *p)
>> +{
>> +    return (struct nvmf_ext_attr *)
>> +        ((uintptr_t)p + (ptrdiff_t)exat_size(le16_to_cpu(p->len)));
>> +}
>> +
>> +
>>   struct nvme_command {
>>       union {
>>           struct nvme_common_command common;
>> @@ -1470,6 +1702,7 @@ struct nvme_command {
>>           struct nvmf_property_get_command prop_get;
>>           struct nvme_dbbuf dbbuf;
>>           struct nvme_directive_cmd directive;
>> +        struct nvmf_dim_command dim;
>>       };
>>   };
> I'd rather like to have the TLS bits in a separate patch; they are not 
> strictly related to this issue.
> 
> And I have a rather general issue with this: Is it required to issue an 
> explicit registration directly after sending the 'connect' command?
> I.e. do we need to have it as part of the 'connect' routine?
> 
> Thing is, the Linux 'connect' routine is doing several things in one go
> (like establishing a transport association for the admin queue, sending
> the 'connect' command for the admin queue, creating transport
> associations for I/O queues and sending 'connect' commands on all I/O
> queues, _and_ possibly doing authentication, too).
> If we now start overloading it with yet more functionality it becomes
> extremely hard to clean up after an error, and that doesn't even touch
> on the non-existent error reporting to userspace.
> 
> Wouldn't it make more sense to delegate explicit registration to 
> userspace (i.e. libnvme/nvme-cli), and leave the kernel out of it?

It would. There is no reason whatsoever for all this registration
stuff to live in the kernel. We have a way to passthru commands, so
it can and should move to userspace.


^ permalink raw reply	[flat|nested] 32+ messages in thread
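
For what it's worth, the userspace route suggested here is already
workable with the existing passthru interface: a tool like
nvme-cli/libnvme could build the DIM payload defined in patch 3 and
submit it through NVME_IOCTL_ADMIN_CMD on the discovery controller's
character device. A rough sketch, assuming a persistent discovery
connection exposing a /dev/nvmeX node; dim_send() and the payload
construction are hypothetical, only the opcode and TAS values come
from the patch:

#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/nvme_ioctl.h>

#define NVME_ADMIN_DIM    0x21   /* opcode from patch 3 */
#define NVME_TAS_REGISTER 0x00   /* cdw10 TAS value */

/* Submit a pre-built DIM payload of @tdl bytes to the discovery ctrl. */
static int dim_send(const char *dev, void *payload, uint32_t tdl,
		    uint32_t tas)
{
	struct nvme_admin_cmd cmd = {
		.opcode   = NVME_ADMIN_DIM,
		.addr     = (uintptr_t)payload,
		.data_len = tdl,
		.cdw10    = tas,
	};
	int fd, ret;

	fd = open(dev, O_RDWR);
	if (fd < 0)
		return -1;
	/* returns -1 with errno, or the NVMe status code (0 on success) */
	ret = ioctl(fd, NVME_IOCTL_ADMIN_CMD, &cmd);
	if (ret)
		fprintf(stderr, "DIM failed, status 0x%x\n", ret);
	close(fd);
	return ret;
}

The deregistration from patch 4 then becomes the same call with
NVME_TAS_DEREGISTER issued just before 'nvme disconnect'.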

* Re: [PATCH 4/4] nvme-fabrics: add explicit deregistration on disconnect.
  2022-01-27  9:31   ` Hannes Reinecke
@ 2022-01-27 13:14     ` Sagi Grimberg
  0 siblings, 0 replies; 32+ messages in thread
From: Sagi Grimberg @ 2022-01-27 13:14 UTC (permalink / raw)
  To: Hannes Reinecke, Martin Belanger, linux-nvme
  Cc: kbusch, axboe, hch, Martin Belanger


>> Per TP8010, a host explicitly registered with a Discovery
>> Controller (DC) must deregister on disconnect. This patch
>> sends a DIM PDU to DCs to deregister the host on disconnect.
>>
>> Signed-off-by: Martin Belanger <martin.belanger@dell.com>
>> ---
>>   drivers/nvme/host/tcp.c | 10 ++++++++++
>>   1 file changed, 10 insertions(+)
>>
>> diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
>> index 970b7df574a4..804d927b4b81 100644
>> --- a/drivers/nvme/host/tcp.c
>> +++ b/drivers/nvme/host/tcp.c
>> @@ -2176,6 +2176,16 @@ static void nvme_tcp_teardown_ctrl(struct 
>> nvme_ctrl *ctrl, bool shutdown)
>>   static void nvme_tcp_delete_ctrl(struct nvme_ctrl *ctrl)
>>   {
>> +    if (ctrl->opts->explicit_registration &&
>> +        ((ctrl->dctype == NVME_DCTYPE_DDC) ||
>> +         (ctrl->dctype == NVME_DCTYPE_CDC))) {
>> +        /* Deregister from discovery controller */
>> +        union nvmf_tsas tsas = {.tcp.sectype = NVMF_TCP_SECTYPE_NONE};
>> +
>> +        nvmf_disc_info_mgmt(ctrl, NVME_TAS_DEREGISTER, NVMF_TRTYPE_TCP,
>> +                    nvme_get_adrfam(ctrl), "", &tsas);
>> +    }
>> +
>>       nvme_tcp_teardown_ctrl(ctrl, true);
>>   }
> Similar argument to the previous patch: Does this need to be in the 
> kernel? Can't we delegate this to userspace?

Same agreement here. We shouldn't add any of this to a transport driver.


^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH 1/4] nvme-fabrics: add discovery controller type
  2022-01-25 14:59 ` [PATCH 1/4] nvme-fabrics: add discovery controller type Martin Belanger
  2022-01-27  9:16   ` Hannes Reinecke
@ 2022-01-27 13:16   ` Sagi Grimberg
  1 sibling, 0 replies; 32+ messages in thread
From: Sagi Grimberg @ 2022-01-27 13:16 UTC (permalink / raw)
  To: Martin Belanger, linux-nvme; +Cc: kbusch, hare, axboe, hch, Martin Belanger


> ---
>   drivers/nvme/host/core.c |  3 +++
>   drivers/nvme/host/nvme.h |  2 ++
>   include/linux/nvme.h     | 10 +++++++++-
>   3 files changed, 14 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
> index 4fc794d9c2f4..cd34b92e8e9d 100644
> --- a/drivers/nvme/host/core.c
> +++ b/drivers/nvme/host/core.c
> @@ -2978,6 +2978,7 @@ static int nvme_init_identify(struct nvme_ctrl *ctrl)
>   	ctrl->kas = le16_to_cpu(id->kas);
>   	ctrl->max_namespaces = le32_to_cpu(id->mnan);
>   	ctrl->ctratt = le32_to_cpu(id->ctratt);
> +	ctrl->dctype = id->dctype;
>   
>   	if (id->rtd3e) {
>   		/* us -> s */
> @@ -3335,6 +3336,7 @@ nvme_show_int_function(numa_node);
>   nvme_show_int_function(queue_count);
>   nvme_show_int_function(sqsize);
>   nvme_show_int_function(kato);
> +nvme_show_int_function(dctype);
>   
>   static ssize_t nvme_sysfs_delete(struct device *dev,
>   				struct device_attribute *attr, const char *buf,
> @@ -3533,6 +3535,7 @@ static struct attribute *nvme_dev_attrs[] = {
>   	&dev_attr_reconnect_delay.attr,
>   	&dev_attr_fast_io_fail_tmo.attr,
>   	&dev_attr_kato.attr,
> +	&dev_attr_dctype.attr,
>   	NULL
>   };

This attribute should not be visible to I/O controllers.

>   
> diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
> index fe224016418e..f8b60f7346fd 100644
> --- a/drivers/nvme/host/nvme.h
> +++ b/drivers/nvme/host/nvme.h
> @@ -349,6 +349,8 @@ struct nvme_ctrl {
>   	unsigned long discard_page_busy;
>   
>   	struct nvme_fault_inject fault_inject;
> +
> +	enum nvme_dctype dctype;
>   };
>   
>   enum nvme_iopolicy {
> diff --git a/include/linux/nvme.h b/include/linux/nvme.h
> index 855dd9b3e84b..82567d493c51 100644
> --- a/include/linux/nvme.h
> +++ b/include/linux/nvme.h
> @@ -240,6 +240,12 @@ enum nvme_ctrl_attr {
>   	NVME_CTRL_ATTR_TBKAS		= (1 << 6),
>   };
>   
> +enum nvme_dctype {
> +	NVME_DCTYPE_NOT_REPORTED = 0,
> +	NVME_DCTYPE_DDC = 1,
> +	NVME_DCTYPE_CDC = 2,
> +};
> +
>   struct nvme_id_ctrl {
>   	__le16			vid;
>   	__le16			ssvid;
> @@ -320,7 +326,9 @@ struct nvme_id_ctrl {
>   	__le16			icdoff;
>   	__u8			ctrattr;
>   	__u8			msdbd;
> -	__u8			rsvd1804[244];
> +	__le16			ofcs;

this is unused...

> +	__u8			dctype;
> +	__u8			rsvd1807[241];
>   	struct nvme_id_power_state	psd[32];
>   	__u8			vs[1024];
>   };


^ permalink raw reply	[flat|nested] 32+ messages in thread
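
One possible shape for the visibility fix: the nvme device attribute
group already has an is_visible() callback in core.c, so the new
attribute can be masked there for anything that is not a discovery
controller. A sketch, assuming the existing
nvme_dev_attrs_are_visible() helper and the nvme_discovery_ctrl()
predicate; only the dev_attr_dctype check is new, and the exact
condition (nvme_discovery_ctrl() vs. checking for a zero dctype) is a
judgment call:

static umode_t nvme_dev_attrs_are_visible(struct kobject *kobj,
		struct attribute *a, int n)
{
	struct device *dev = container_of(kobj, struct device, kobj);
	struct nvme_ctrl *ctrl = dev_get_drvdata(dev);

	/* ... existing checks ... */

	/* hide dctype on controllers that are not discovery controllers */
	if (a == &dev_attr_dctype.attr && !nvme_discovery_ctrl(ctrl))
		return 0;

	return a->mode;
}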

* Re: [PATCH 2/4] nvme: add host symbolic name
  2022-01-25 14:59 ` [PATCH 2/4] nvme: add host symbolic name Martin Belanger
  2022-01-27  9:18   ` Hannes Reinecke
@ 2022-01-27 13:20   ` Sagi Grimberg
  1 sibling, 0 replies; 32+ messages in thread
From: Sagi Grimberg @ 2022-01-27 13:20 UTC (permalink / raw)
  To: Martin Belanger, linux-nvme; +Cc: kbusch, hare, axboe, hch, Martin Belanger


> From: Martin Belanger <martin.belanger@dell.com>
> 
> TP8010 introduces the 'host symbolic name' as an optional
> parameter for explicit registration with a central discovery
> controller.
> 

If the other pieces move out, I don't see why the kernel needs
this given that it is not supposed to function as a sysfs kv-store.


^ permalink raw reply	[flat|nested] 32+ messages in thread

* RE: [PATCH 3/4] nvme-fabrics: add tp8010 support
  2022-01-27 13:12     ` Sagi Grimberg
@ 2022-01-27 13:30       ` Belanger, Martin
  2022-01-27 14:28         ` Hannes Reinecke
  2022-01-27 21:59         ` Sagi Grimberg
  0 siblings, 2 replies; 32+ messages in thread
From: Belanger, Martin @ 2022-01-27 13:30 UTC (permalink / raw)
  To: Sagi Grimberg, Hannes Reinecke, Martin Belanger, linux-nvme
  Cc: kbusch, axboe, hch

> >> TP8010 introduces Central Discovery Controllers (CDC) and the
> >> Discovery Information Management (DIM) PDU, which is used to perform
> >> explicit registration with Discovery Controllers.
> >>
> >> This patch defines the DIM PDU and adds logic to perform explicit
> >> registration with DCs when registration has been requested by the
> >> user.
> >>
> >> Signed-off-by: Martin Belanger <martin.belanger@dell.com>
> >> ---
> >>   drivers/nvme/host/fabrics.c | 164 +++++++++++++++++++++++
> >>   drivers/nvme/host/fabrics.h |   9 ++
> >>   drivers/nvme/host/tcp.c     |  51 +++++++-
> >>   include/linux/nvme.h        | 255
> >> ++++++++++++++++++++++++++++++++++--
> >>   4 files changed, 467 insertions(+), 12 deletions(-)
> >>
> >> diff --git a/drivers/nvme/host/fabrics.c
> >> b/drivers/nvme/host/fabrics.c index 040a6cce6afa..e6fc2f85b670 100644
> >> --- a/drivers/nvme/host/fabrics.c
> >> +++ b/drivers/nvme/host/fabrics.c
> >> @@ -10,6 +10,7 @@
> >>   #include <linux/mutex.h>
> >>   #include <linux/parser.h>
> >>   #include <linux/seq_file.h>
> >> +#include <linux/utsname.h>
> >>   #include "nvme.h"
> >>   #include "fabrics.h"
> >> @@ -526,6 +527,165 @@ static struct nvmf_transport_ops
> >> *nvmf_lookup_transport(
> >>       return NULL;
> >>   }
> >> +/**
> >> + * nvmf_get_tel() - Calculate the amount of memory needed for a
> >> Discovery
> >> + * Information Entry. Each Entry must contain at a minimum an
> >> + Extended
> >> + * Attribute for the HostID. The Entry may optionally contain an
> >> Extended
> >> + * Attribute for the Symbolic Name.
> >> + *
> >> + * @hostsymname - Symbolic name (may be NULL)
> >> + *
> >> + * Return Total Entry Length
> >> + */
> >> +static u32 nvmf_get_tel(const char *hostsymname) {
> >> +    u32 tel = SIZEOF_EXT_DIE_WO_EXAT;
> >> +    u16 len;
> >> +
> >> +    /* Host ID is mandatory */
> >> +    tel += exat_size(sizeof(uuid_t));
> >> +
> >> +    /* Symbolic name is optional */
> >> +    len = hostsymname ? strlen(hostsymname) : 0;
> >> +    if (len)
> >> +        tel += exat_size(len);
> >> +
> >> +    return tel;
> >> +}
> >> +
> >> +/**
> >> + * nvmf_fill_die() - Fill the Discovery Information Entry pointed to
> >> +by
> >> + *                   @die.
> >> + *
> >> + * @die - Pointer to Discovery Information Entry to be filled
> >> + * @tel - Length of the DIE
> >> + * @trtype - Transport type
> >> + * @adrfam - Address family
> >> + * @reg_addr - Address to register. Setting this to an empty string
> >> tells
> >> + *       the DC to infer address from the source address of the socket.
> >> + * @nqn - Host NQN
> >> + * @hostid - Host ID
> >> + * @hostsymname - Host symbolic name
> >> + * @tsas - Transport Specific Address Subtype for the address being
> >> + *       registered.
> >> + */
> >> +static void nvmf_fill_die(struct nvmf_ext_die  *die,
> >> +              u32                   tel,
> >> +              u8                    trtype,
> >> +              u8                    adrfam,
> >> +              const char           *reg_addr,
> >> +              struct nvmf_host     *host,
> >> +              union nvmf_tsas      *tsas) {
> >> +    u16 numexat = 0;
> >> +    size_t symname_len;
> >> +    struct nvmf_ext_attr *exat;
> >> +
> >> +    die->tel = cpu_to_le32(tel);
> >> +    die->trtype = trtype;
> >> +    die->adrfam = adrfam;
> >> +
> >> +    strncpy(die->nqn, host->nqn, sizeof(die->nqn));
> >> +    strncpy(die->traddr, reg_addr, sizeof(die->traddr));
> >> +    memcpy(&die->tsas, tsas, sizeof(die->tsas));
> >> +
> >> +    /* Extended Attribute for the HostID (mandatory) */
> >> +    numexat++;
> >> +    exat = &die->exat;
> >> +    exat->type = cpu_to_le16(NVME_EXATTYPE_HOSTID);
> >> +    exat->len  = cpu_to_le16(exat_len(sizeof(host->id)));
> >> +    uuid_copy((uuid_t *)exat->val, &host->id);
> >> +
> >> +    /* Extended Attribute for the Symbolic Name (optional) */
> >> +    symname_len = strlen(host->symname);
> >> +    if (symname_len) {
> >> +        u16 exatlen = exat_len(symname_len);
> >> +
> >> +        numexat++;
> >> +        exat = exat_ptr_next(exat);
> >> +        exat->type = cpu_to_le16(NVME_EXATTYPE_SYMNAME);
> >> +        exat->len  = cpu_to_le16(exatlen);
> >> +        memcpy(exat->val, host->symname, symname_len);
> >> +        /* Per Base specs, ASCII strings must be padded with spaces
> >> +*/
> >> +        memset(&exat->val[symname_len], ' ', exatlen - symname_len);
> >> +    }
> >> +
> >> +    die->numexat = cpu_to_le16(numexat); }
> >> +
> >> +/**
> >> + * nvmf_disc_info_mgmt() - Perform explicit registration,
> >> deregistration,
> >> + * or registration-update (specified by @tas) by sending a Discovery
> >> + * Information Management (DIM) command to the Discovery Controller
> >> (DC).
> >> + *
> >> + * @ctrl: Host NVMe controller instance maintaining the admin queue
> >> used to
> >> + *      submit the DIM command to the DC.
> >> + * @tas: Task field of the Command Dword 10 (cdw10). Indicates
> >> whether to
> >> + *     perform a Registration, Deregistration, or Registration-update.
> >> + * @trtype: Transport type (e.g. NVMF_TRTYPE_TCP)
> >> + * @adrfam: Address family (e.g. NVMF_ADDR_FAMILY_IP6)
> >> + * @reg_addr - Address to register. Setting this to an empty string
> >> tells
> >> + *       the DC to infer address from the source address of the socket.
> >> + * @tsas - Transport Specific Address Subtype for the address being
> >> + *       registered.
> >> + *
> >> + * Return:   0: success,
> >> + *         > 0: NVMe error status code,
> >> + *         < 0: Linux errno error code  */ int
> >> +nvmf_disc_info_mgmt(struct nvme_ctrl *ctrl,
> >> +            enum nvmf_dim_tas tas,
> >> +            u8                trtype,
> >> +            u8                adrfam,
> >> +            const char       *reg_addr,
> >> +            union nvmf_tsas  *tsas)
> >> +{
> >> +    int                   ret;
> >> +    u32                   tdl;  /* Total Data Length */
> >> +    u32                   tel;  /* Total Entry Length */
> >> +    struct nvmf_dim_data *dim;  /* Discovery Information Management
> >> +*/
> >> +    struct nvmf_ext_die  *die;  /* Discovery Information Entry */
> >> +    struct nvme_command   cmd = {};
> >> +    union nvme_result     result;
> >> +
> >> +    /* Register 1 Discovery Information Entry (DIE) of size TEL */
> >> +    tel = nvmf_get_tel(ctrl->opts->host->symname);
> >> +    tdl = SIZEOF_DIM_WO_DIE + tel;
> >> +
> >> +    dim = kzalloc(tdl, GFP_KERNEL);
> >> +    if (!dim)
> >> +        return -ENOMEM;
> >> +
> >> +    cmd.dim.opcode = nvme_admin_dim;
> >> +    cmd.dim.cdw10  = cpu_to_le32(tas);
> >> +
> >> +    dim->tdl    = cpu_to_le32(tdl);
> >> +    dim->nument = cpu_to_le64(1);    /* only 1 DIE to register */
> >> +    dim->entfmt = cpu_to_le16(NVME_ENTFMT_EXTENDED);
> >> +    dim->etype  = cpu_to_le16(NVME_REGISTRATION_HOST);
> >> +    dim->ektype = cpu_to_le16(0x5F); /* must be 0x5F per specs */
> >> +
> >> +    strncpy(dim->eid, ctrl->opts->host->nqn, sizeof(dim->eid));
> >> +    strncpy(dim->ename, utsname()->nodename, sizeof(dim->ename));
> >> +    strncpy(dim->ever, utsname()->release, sizeof(dim->ever));
> >> +
> >> +    die = &dim->die.extended;
> >> +    nvmf_fill_die(die, tel, trtype, adrfam, reg_addr,
> >> +              ctrl->opts->host, tsas);
> >> +
> >> +    ret = __nvme_submit_sync_cmd(ctrl->fabrics_q, &cmd, &result,
> >> +                     dim, tdl, 0, NVME_QID_ANY, 1, 0);
> >> +    if (unlikely(ret)) {
> >> +        dev_err(ctrl->device, "DIM error Result:0x%04X
> >> +Status:0x%04X\n",
> >> +           le32_to_cpu(result.u32), ret);
> >> +    }
> >> +
> >> +    kfree(dim);
> >> +
> >> +    return ret;
> >> +}
> >> +EXPORT_SYMBOL_GPL(nvmf_disc_info_mgmt);
> >> +
> >>   static const match_table_t opt_tokens = {
> >>       { NVMF_OPT_TRANSPORT,        "transport=%s"        },
> >>       { NVMF_OPT_TRADDR,        "traddr=%s"        }, @@ -550,6
> >> +710,7 @@ static const match_table_t opt_tokens = {
> >>       { NVMF_OPT_TOS,            "tos=%d"        },
> >>       { NVMF_OPT_FAIL_FAST_TMO,    "fast_io_fail_tmo=%d"    },
> >>       { NVMF_OPT_DISCOVERY,        "discovery"        },
> >> +    { NVMF_OPT_REGISTER,        "register"        },
> >>       { NVMF_OPT_ERR,            NULL            }
> >>   };
> >> @@ -831,6 +992,9 @@ static int nvmf_parse_options(struct
> >> nvmf_ctrl_options *opts,
> >>           case NVMF_OPT_DISCOVERY:
> >>               opts->discovery_nqn = true;
> >>               break;
> >> +        case NVMF_OPT_REGISTER:
> >> +            opts->explicit_registration = true;
> >> +            break;
> >>           case NVMF_OPT_SYMNAME:
> >>               hostsymname = match_strdup(args);
> >>               if (!hostsymname) {
> >> diff --git a/drivers/nvme/host/fabrics.h
> >> b/drivers/nvme/host/fabrics.h index 494e6dbe233a..77f5768f8ff4 100644
> >> --- a/drivers/nvme/host/fabrics.h
> >> +++ b/drivers/nvme/host/fabrics.h
> >> @@ -70,6 +70,7 @@ enum {
> >>       NVMF_OPT_HOST_IFACE    = 1 << 21,
> >>       NVMF_OPT_DISCOVERY    = 1 << 22,
> >>       NVMF_OPT_SYMNAME    = 1 << 23,
> >> +    NVMF_OPT_REGISTER    = 1 << 24,
> >>   };
> >>   /**
> >> @@ -106,6 +107,7 @@ enum {
> >>    * @nr_poll_queues: number of queues for polling I/O
> >>    * @tos: type of service
> >>    * @fast_io_fail_tmo: Fast I/O fail timeout in seconds
> >> + * @explicit_registration: Explicit Registration with DC using the
> >> DIM PDU
> >>    */
> >>   struct nvmf_ctrl_options {
> >>       unsigned        mask;
> >> @@ -130,6 +132,7 @@ struct nvmf_ctrl_options {
> >>       unsigned int        nr_poll_queues;
> >>       int            tos;
> >>       int            fast_io_fail_tmo;
> >> +    bool            explicit_registration;
> >>   };
> >>   /*
> >> @@ -200,5 +203,11 @@ int nvmf_get_address(struct nvme_ctrl *ctrl,
> >> char *buf, int size);
> >>   bool nvmf_should_reconnect(struct nvme_ctrl *ctrl);
> >>   bool nvmf_ip_options_match(struct nvme_ctrl *ctrl,
> >>           struct nvmf_ctrl_options *opts);
> >> +int nvmf_disc_info_mgmt(struct nvme_ctrl *ctrl,
> >> +            enum nvmf_dim_tas tas,
> >> +            u8                trtype,
> >> +            u8                adrfam,
> >> +            const char       *reg_addr,
> >> +            union nvmf_tsas  *tsas);
> >>   #endif /* _NVME_FABRICS_H */
> >> diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index
> >> 4ceb28675fdf..970b7df574a4 100644
> >> --- a/drivers/nvme/host/tcp.c
> >> +++ b/drivers/nvme/host/tcp.c
> >> @@ -1993,6 +1993,40 @@ static void
> >> nvme_tcp_reconnect_or_remove(struct
> >> nvme_ctrl *ctrl)
> >>       }
> >>   }
> >> +/**
> >> + * nvme_get_adrfam() - Get address family for the address we're
> >> registering
> >> + * with the DC. We retrieve this info from the socket itself. If we
> >> can't
> >> + * get the source address from the socket, then we'll infer the
> >> + address
> >> + * family from the address of the DC since the DC address has the
> >> + same
> >> + * address family.
> >> + *
> >> + * @ctrl: Host NVMe controller instance maintaining the admin queue
> >> used to
> >> + *      submit the DIM command to the DC.
> >> + *
> >> + * Return: The address family of the source address associated with
> >> +the
> >> + * socket connected to the DC.
> >> + */
> >> +static u8 nvme_get_adrfam(struct nvme_ctrl *ctrl) {
> >> +    u8                       adrfam = NVMF_ADDR_FAMILY_IP4;
> >> +    struct sockaddr_storage  sas;
> >> +    struct sockaddr         *sa = (struct sockaddr *)&sas;
> >> +    struct nvme_tcp_queue   *queue = &to_tcp_ctrl(ctrl)->queues[0];
> >> +
> >> +    if (kernel_getsockname(queue->sock, sa) == 0) {
> >> +        if (sa->sa_family == AF_INET6)
> >> +            adrfam = NVMF_ADDR_FAMILY_IP6;
> >> +    } else {
> >> +        unsigned char buf[sizeof(struct in6_addr)];
> >> +
> >> +        if (in6_pton(ctrl->opts->traddr,
> >> +                 strlen(ctrl->opts->traddr), buf, -1, NULL) > 0)
> >> +            adrfam = NVMF_ADDR_FAMILY_IP6;
> >> +    }
> >> +
> >> +    return adrfam;
> >> +}
> >> +
> >>   static int nvme_tcp_setup_ctrl(struct nvme_ctrl *ctrl, bool new)
> >>   {
> >>       struct nvmf_ctrl_options *opts = ctrl->opts; @@ -2045,6
> >> +2079,20 @@ static int nvme_tcp_setup_ctrl(struct nvme_ctrl *ctrl,
> >> bool new)
> >>           goto destroy_io;
> >>       }
> >> +    if (ctrl->opts->explicit_registration &&
> >> +        ((ctrl->dctype == NVME_DCTYPE_DDC) ||
> >> +         (ctrl->dctype == NVME_DCTYPE_CDC))) {
> >> +        /* We're registering our source address with the DC. To do
> >> +         * that, we can simply send an empty string. This tells the
> >> +DC
> >> +         * to retrieve the source address from the socket and use
> >> +that
> >> +         * as the registration address.
> >> +         */
> >> +        union nvmf_tsas tsas = {.tcp.sectype =
> >> +NVMF_TCP_SECTYPE_NONE};
> >> +
> >> +        nvmf_disc_info_mgmt(ctrl, NVME_TAS_REGISTER,
> >> +NVMF_TRTYPE_TCP,
> >> +                    nvme_get_adrfam(ctrl), "", &tsas);
> >> +    }
> >> +
> >>       nvme_start_ctrl(ctrl);
> >>       return 0;
> >> @@ -2613,7 +2661,8 @@ static struct nvmf_transport_ops
> >> nvme_tcp_transport = {
> >>                 NVMF_OPT_HOST_TRADDR | NVMF_OPT_CTRL_LOSS_TMO |
> >>                 NVMF_OPT_HDR_DIGEST | NVMF_OPT_DATA_DIGEST |
> >>                 NVMF_OPT_NR_WRITE_QUEUES |
> NVMF_OPT_NR_POLL_QUEUES |
> >> -              NVMF_OPT_TOS | NVMF_OPT_HOST_IFACE,
> >> +              NVMF_OPT_TOS | NVMF_OPT_HOST_IFACE |
> >> +              NVMF_OPT_REGISTER,
> >>       .create_ctrl    = nvme_tcp_create_ctrl,
> >>   };
> >> diff --git a/include/linux/nvme.h b/include/linux/nvme.h index
> >> 3b47951342f4..891615876cfa 100644
> >> --- a/include/linux/nvme.h
> >> +++ b/include/linux/nvme.h
> >> @@ -9,6 +9,7 @@
> >>   #include <linux/types.h>
> >>   #include <linux/uuid.h>
> >> +#include <linux/kernel.h>
> >>   /* NQN names in commands fields specified one size */
> >>   #define NVMF_NQN_FIELD_LEN    256
> >> @@ -102,6 +103,16 @@ enum {
> >>       NVMF_RDMA_CMS_RDMA_CM    = 1, /* Sockets based endpoint
> >> addressing */
> >>   };
> >> +/**
> >> + * enum - TCP Security Type (SECTYPE)
> >> + * @NVMF_TCP_SECTYPE_NONE: No Security
> >> + * @NVMF_TCP_SECTYPE_TLS:  Transport Layer Security  */ enum {
> >> +    NVMF_TCP_SECTYPE_NONE    = 0,
> >> +    NVMF_TCP_SECTYPE_TLS    = 1,
> >> +};
> >> +
> >>   #define NVME_AQ_DEPTH        32
> >>   #define NVME_NR_AEN_COMMANDS    1
> >>   #define NVME_AQ_BLK_MQ_DEPTH    (NVME_AQ_DEPTH -
> >> NVME_NR_AEN_COMMANDS) @@ -1036,6 +1047,7 @@ enum
> nvme_admin_opcode {
> >>       nvme_admin_sanitize_nvm        = 0x84,
> >>       nvme_admin_get_lba_status    = 0x86,
> >>       nvme_admin_vendor_start        = 0xC0,
> >> +    nvme_admin_dim            = 0x21,
> >>   };
> >>   #define nvme_admin_opcode_name(opcode)    { opcode, #opcode }
> @@
> >> -1316,6 +1328,29 @@ struct nvmf_common_command {
> >>       __u8    ts[24];
> >>   };
> >> +/**
> >> + * union nvmf_tsas - Transport Specific Address Subtype
> >> + * @qptype:  Queue Pair Service Type (NVMF_RDMA_QPTYPE_CONNECTED, NVMF_RDMA_QPTYPE_DATAGRAM)
> >> + * @prtype:  Provider Type (see enum nvme_rdma_prtype)
> >> + * @cms:     Connection Management Service (NVMF_RDMA_CMS_RDMA_CM)
> >> + * @pkey:    Partition Key
> >> + * @sectype: Security Type (NVMF_TCP_SECTYPE_NONE, NVMF_TCP_SECTYPE_TLS)
> >> + */
> >> +union nvmf_tsas {
> >> +    char        common[NVMF_TSAS_SIZE];
> >> +    struct rdma {
> >> +        __u8    qptype;
> >> +        __u8    prtype;
> >> +        __u8    cms;
> >> +        __u8    rsvd3[5];
> >> +        __u16    pkey;
> >> +        __u8    rsvd10[246];
> >> +    } rdma;
> >> +    struct tcp {
> >> +        __u8    sectype;
> >> +    } tcp;
> >> +};
> >> +
> >>   /*
> >>    * The legal cntlid range a NVMe Target will provide.
> >>    * Note that cntlid of value 0 is considered illegal in the fabrics world.
> >> @@ -1349,17 +1384,7 @@ struct nvmf_disc_rsp_page_entry {
> >>       __u8        resv64[192];
> >>       char        subnqn[NVMF_NQN_FIELD_LEN];
> >>       char        traddr[NVMF_TRADDR_SIZE];
> >> -    union tsas {
> >> -        char        common[NVMF_TSAS_SIZE];
> >> -        struct rdma {
> >> -            __u8    qptype;
> >> -            __u8    prtype;
> >> -            __u8    cms;
> >> -            __u8    resv3[5];
> >> -            __u16    pkey;
> >> -            __u8    resv10[246];
> >> -        } rdma;
> >> -    } tsas;
> >> +    union nvmf_tsas tsas;
> >>   };
> >>   /* Discovery log page header */
> >> @@ -1447,6 +1472,213 @@ struct streams_directive_params {
> >>       __u8    rsvd2[6];
> >>   };
> >> +/**
> >> + * Discovery Information Management (DIM) command. This is sent by a
> >> + * host to the Central Discovery Controller (CDC) to perform
> >> + * explicit registration.
> >> + */
> >> +enum nvmf_dim_tas {
> >> +    NVME_TAS_REGISTER   = 0x00,
> >> +    NVME_TAS_DEREGISTER = 0x01,
> >> +    NVME_TAS_UPDATE     = 0x02,
> >> +};
> >> +
> >> +struct nvmf_dim_command {
> >> +    __u8                  opcode;     /* nvme_admin_dim */
> >> +    __u8                  flags;
> >> +    __u16                 command_id;
> >> +    __le32                nsid;
> >> +    __u64                 rsvd2[2];
> >> +    union nvme_data_ptr   dptr;
> >> +    __le32                cdw10;      /* enum nvmf_dim_tas */
> >> +    __le32                cdw11;
> >> +    __le32                cdw12;
> >> +    __le32                cdw13;
> >> +    __le32                cdw14;
> >> +    __le32                cdw15;
> >> +};
> >> +
> >> +#define NVMF_ENAME_LEN    256
> >> +#define NVMF_EVER_LEN     64
> >> +
> >> +enum nvmf_dim_entfmt {
> >> +    NVME_ENTFMT_BASIC    = 0x01,
> >> +    NVME_ENTFMT_EXTENDED = 0x02,
> >> +};
> >> +
> >> +enum nvmf_dim_etype {
> >> +    NVME_REGISTRATION_HOST = 0x01,
> >> +    NVME_REGISTRATION_DDC  = 0x02,
> >> +    NVME_REGISTRATION_CDC  = 0x03,
> >> +};
> >> +
> >> +enum nvmf_exattype {
> >> +    NVME_EXATTYPE_HOSTID   = 0x01, /* Host Identifier */
> >> +    NVME_EXATTYPE_SYMNAME  = 0x02, /* Symbolic Name */
> >> +};
> >> +
> >> +/**
> >> + * struct nvmf_ext_attr - Extended Attribute (EXAT)
> >> + *
> >> + *       Bytes       Description
> >> + * @type 01:00       Extended Attribute Type (EXATTYPE) - enum nvmf_exattype
> >> + * @len  03:02       Extended Attribute Length (EXATLEN)
> >> + * @val  (EXATLEN-1) Extended Attribute Value (EXATVAL) - size allocated
> >> + *       +4:04       for array must be a multiple of 4 bytes
> >> + */
> >> +struct nvmf_ext_attr {
> >> +    __le16            type;
> >> +    __le16            len;
> >> +    __u8            val[];
> >> +};
> >> +#define SIZEOF_EXT_EXAT_WO_VAL offsetof(struct nvmf_ext_attr, val)
> >> +
> >> +/**
> >> + * struct nvmf_ext_die - Extended Discovery Information Entry (DIE)
> >> + *
> >> + *            Bytes           Description
> >> + * @trtype    00              Transport Type (enum nvme_trtype)
> >> + * @adrfam    01              Address Family (enum nvmf_addr_family)
> >> + * @subtype   02              Subsystem Type (enum nvme_subsys_type)
> >> + * @treq      03              Transport Requirements (enum nvmf_treq)
> >> + * @portid    05:04           Port ID
> >> + * @cntlid    07:06           Controller ID
> >> + * @asqsz     09:08           Admin Max SQ Size
> >> + *            31:10           Reserved
> >> + * @trsvcid   63:32           Transport Service Identifier
> >> + *            255:64          Reserved
> >> + * @nqn       511:256         NVM Qualified Name
> >> + * @traddr    767:512         Transport Address
> >> + * @tsas      1023:768        Transport Specific Address Subtype
> >> + * @tel       1027:1024       Total Entry Length
> >> + * @numexat   1029:1028       Number of Extended Attributes
> >> + *            1031:1030       Reserved
> >> + * @exat[0]   ((EXATLEN-1)+   Extended Attributes 0 (see @numexat)
> >> + *            4)+1032:1032    Cannot access as an array because each EXAT
> >> + *                            entry has an undetermined size.
> >> + * @exat[N]   TEL-1:TEL-      Extended Attributes N (w/ N = NUMEXAT-1)
> >> + *            (EXATLEN+4)
> >> + */
> >> +struct nvmf_ext_die {
> >> +    __u8            trtype;
> >> +    __u8            adrfam;
> >> +    __u8            subtype;
> >> +    __u8            treq;
> >> +    __le16            portid;
> >> +    __le16            cntlid;
> >> +    __le16            asqsz;
> >> +    __u8            resv10[22];
> >> +    char            trsvcid[NVMF_TRSVCID_SIZE];
> >> +    __u8            resv64[192];
> >> +    char            nqn[NVMF_NQN_FIELD_LEN];
> >> +    char            traddr[NVMF_TRADDR_SIZE];
> >> +    union nvmf_tsas        tsas;
> >> +    __le32            tel;
> >> +    __le16            numexat;
> >> +    __u16            resv1030;
> >> +    struct nvmf_ext_attr    exat;
> >> +};
> >> +#define SIZEOF_EXT_DIE_WO_EXAT offsetof(struct nvmf_ext_die, exat)
> >> +
> >> +/**
> >> + * union nvmf_die - Discovery Information Entry (DIE)
> >> + *
> >> + * Depending on the ENTFMT specified in the DIM, DIEs can be entered with
> >> + * the Basic or Extended formats. For Basic format, each entry has a fixed
> >> + * length. Therefore, the "basic" field defined below can be accessed as a
> >> + * C array. For the Extended format, however, each entry is of variable
> >> + * length (TEL). Therefore, the "extended" field defined below cannot be
> >> + * accessed as a C array. Instead, the "extended" field is akin to a
> >> + * linked-list, where one can "walk" through the list. To move to the next
> >> + * entry, one simply adds the current entry's length (TEL) to the "walk"
> >> + * pointer. The number of entries in the list is specified by NUMENT.
> >> + * Although extended entries are of variable length (TEL), TEL is
> >> + * always a multiple of 4 bytes.
> >> + */
> >> +union nvmf_die {
> >> +    struct nvmf_disc_rsp_page_entry    basic[0];
> >> +    struct nvmf_ext_die        extended;
> >> +};
> >> +
> >> +/**
> >> + * struct nvmf_dim_data - Discovery Information Management (DIM) - Data
> >> + *
> >> + *            Bytes       Description
> >> + * @tdl       03:00       Total Data Length
> >> + *            07:04       Reserved
> >> + * @nument    15:08       Number of entries
> >> + * @entfmt    17:16       Entry Format (enum nvmf_dim_entfmt)
> >> + * @etype     19:18       Entity Type (enum nvmf_dim_etype)
> >> + * @portlcl   20          Port Local
> >> + *            21          Reserved
> >> + * @ektype    23:22       Entry Key Type
> >> + * @eid       279:24      Entity Identifier (e.g. Host NQN)
> >> + * @ename     535:280     Entity Name (e.g. hostname)
> >> + * @ever      599:536     Entity Version (e.g. OS Name/Version)
> >> + *            1023:600    Reserved
> >> + * @die       TDL-1:1024  Discovery Information Entry (see @nument above)
> >> + */
> >> +struct nvmf_dim_data {
> >> +    __le32            tdl;
> >> +    __u32            rsvd4;
> >> +    __le64            nument;
> >> +    __le16            entfmt;
> >> +    __le16            etype;
> >> +    __u8            portlcl;
> >> +    __u8            rsvd21;
> >> +    __le16            ektype;
> >> +    char            eid[NVMF_NQN_FIELD_LEN];
> >> +    char            ename[NVMF_ENAME_LEN];
> >> +    char            ever[NVMF_EVER_LEN];
> >> +    __u8            rsvd600[424];
> >> +    union nvmf_die        die;
> >> +};
> >> +#define SIZEOF_DIM_WO_DIE offsetof(struct nvmf_dim_data, die)
> >> +
> >> +/**
> >> + * exat_len() - Return the size in bytes, rounded to a multiple of 4 (i.e.
> >> + *         size of u32), of the buffer needed to hold the exat value of
> >> + *         size @val_len.
> >> + */
> >> +static inline u16 exat_len(size_t val_len)
> >> +{
> >> +    return (u16)round_up(val_len, sizeof(u32));
> >> +}
> >> +
> >> +/**
> >> + * exat_size - return the size of the "struct nvmf_ext_attr"
> >> + *   needed to hold a value of size @val_len.
> >> + *
> >> + * @val_len: This is the length of the data to be copied to the "val"
> >> + *         field of a "struct nvmf_ext_attr".
> >> + *
> >> + * @return The size in bytes, rounded to a multiple of 4 (i.e. size of
> >> + *         u32), of the "struct nvmf_ext_attr" required to hold a string of
> >> + *         length @val_len.
> >> + */
> >> +static inline u16 exat_size(size_t val_len)
> >> +{
> >> +    return (u16)(SIZEOF_EXT_EXAT_WO_VAL + exat_len(val_len));
> >> +}
> >> +
> >> +/**
> >> + * exat_ptr_next - Increment @p to the next element in the array. Extended
> >> + *   attributes are saved to an array of "struct nvmf_ext_attr" where each
> >> + *   element of the array is of variable size. In order to move to the next
> >> + *   element in the array one must increment the pointer to the current
> >> + *   element (@p) by the size of the current element.
> >> + *
> >> + * @p: Pointer to an element of an array of "struct nvmf_ext_attr".
> >> + *
> >> + * @return Pointer to the next element in the array.
> >> + */
> >> +static inline struct nvmf_ext_attr *exat_ptr_next(struct nvmf_ext_attr *p)
> >> +{
> >> +    return (struct nvmf_ext_attr *)
> >> +        ((uintptr_t)p + (ptrdiff_t)exat_size(le16_to_cpu(p->len)));
> >> +}
> >> +
> >> +
> >>   struct nvme_command {
> >>       union {
> >>           struct nvme_common_command common;
> >> @@ -1470,6 +1702,7 @@ struct nvme_command {
> >>           struct nvmf_property_get_command prop_get;
> >>           struct nvme_dbbuf dbbuf;
> >>           struct nvme_directive_cmd directive;
> >> +        struct nvmf_dim_command dim;
> >>       };
> >>   };
> > I'd rather like to have the TLS bits in a separate patch; they are not
> > strictly related to this issue.
> >
> > And I have a rather general issue with this: Is it required to issue an
> > explicit registration directly after sending the 'connect' command?
> > IE do we need to have it as part of the 'connect' routine?
> >
> > Thing is, the linux 'connect' routine is doing several things in one go
> > (like establishing a transport association for the admin queue, send the
> > 'connect' command for the admin queue, create transport associations for
> > I/O queues and send 'connect' commands on all I/O queues, _and_ possibly
> > do authentication, too).
> > If we now start overloading it with yet more functionality it becomes
> > extremely hard to cleanup after an error, and that doesn't even mention
> > the non-existing error reporting to userspace.
> >
> > Wouldn't it make more sense to delegate explicit registration to
> > userspace (ie libnvme/nvme-cli), and leave the kernel out of it?
> 
> It would. There is no reason what-so-ever for all this register
> stuff to live in the kernel. We have a way to passthru commands so
> it can and should move to userspace.

I wish it could be delegated to a user-space app, and in fact that was my 
original design. 

Unfortunately, while testing I realized that the kernel can autonomously 
reconnect and user-space apps are completely unaware of it.

For example, let's say the network goes down momentarily. The kernel then 
tries to reconnect. Once it successfully reconnects it doesn't tell anyone 
about it. 

But let's say the kernel does send a signal to user-space on a reconnect. 
What if there is no user-space app to receive this signal? I'm thinking of 
the case where one uses nvme-cli to set up persistent connections to 
discovery controllers.  In that case there is no app to send the explicit 
registration on a re-connect.


^ permalink raw reply	[flat|nested] 32+ messages in thread

* RE: [PATCH 1/4] nvme-fabrics: add discovery controller type
  2022-01-27  9:16   ` Hannes Reinecke
@ 2022-01-27 13:37     ` Belanger, Martin
  0 siblings, 0 replies; 32+ messages in thread
From: Belanger, Martin @ 2022-01-27 13:37 UTC (permalink / raw)
  To: Hannes Reinecke, Martin Belanger, linux-nvme; +Cc: kbusch, axboe, hch, sagi

> On 1/25/22 15:59, Martin Belanger wrote:
> > From: Martin Belanger <martin.belanger@dell.com>
> >
> > TP8010 introduces the Central Discovery Controller (CDC). To
> > differentiate between a CDC and a DDC (Direct DC), the identify
> > response now carries a DCTYPE (Discovery Controller Type) field.
> >
> > Signed-off-by: Martin Belanger <martin.belanger@dell.com>
> > ---
> >   drivers/nvme/host/core.c |  3 +++
> >   drivers/nvme/host/nvme.h |  2 ++
> >   include/linux/nvme.h     | 10 +++++++++-
> >   3 files changed, 14 insertions(+), 1 deletion(-)
> >
> > diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
> > index 4fc794d9c2f4..cd34b92e8e9d 100644
> > --- a/drivers/nvme/host/core.c
> > +++ b/drivers/nvme/host/core.c
> > @@ -2978,6 +2978,7 @@ static int nvme_init_identify(struct nvme_ctrl *ctrl)
> >   	ctrl->kas = le16_to_cpu(id->kas);
> >   	ctrl->max_namespaces = le32_to_cpu(id->mnan);
> >   	ctrl->ctratt = le32_to_cpu(id->ctratt);
> > +	ctrl->dctype = id->dctype;
> >
> >   	if (id->rtd3e) {
> >   		/* us -> s */
> > @@ -3335,6 +3336,7 @@ nvme_show_int_function(numa_node);
> >   nvme_show_int_function(queue_count);
> >   nvme_show_int_function(sqsize);
> >   nvme_show_int_function(kato);
> > +nvme_show_int_function(dctype);
> >
> >   static ssize_t nvme_sysfs_delete(struct device *dev,
> >   				struct device_attribute *attr, const char *buf,
> > @@ -3533,6 +3535,7 @@ static struct attribute *nvme_dev_attrs[] = {
> >   	&dev_attr_reconnect_delay.attr,
> >   	&dev_attr_fast_io_fail_tmo.attr,
> >   	&dev_attr_kato.attr,
> > +	&dev_attr_dctype.attr,
> >   	NULL
> >   };
> >
> > diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
> > index fe224016418e..f8b60f7346fd 100644
> > --- a/drivers/nvme/host/nvme.h
> > +++ b/drivers/nvme/host/nvme.h
> > @@ -349,6 +349,8 @@ struct nvme_ctrl {
> >   	unsigned long discard_page_busy;
> >
> >   	struct nvme_fault_inject fault_inject;
> > +
> > +	enum nvme_dctype dctype;
> >   };
> >
> >   enum nvme_iopolicy {
> > diff --git a/include/linux/nvme.h b/include/linux/nvme.h
> > index 855dd9b3e84b..82567d493c51 100644
> > --- a/include/linux/nvme.h
> > +++ b/include/linux/nvme.h
> > @@ -240,6 +240,12 @@ enum nvme_ctrl_attr {
> >   	NVME_CTRL_ATTR_TBKAS		= (1 << 6),
> >   };
> >
> > +enum nvme_dctype {
> > +	NVME_DCTYPE_NOT_REPORTED = 0,
> > +	NVME_DCTYPE_DDC = 1,
> > +	NVME_DCTYPE_CDC = 2,
> > +};
> > +
> >   struct nvme_id_ctrl {
> >   	__le16			vid;
> >   	__le16			ssvid;
> > @@ -320,7 +326,9 @@ struct nvme_id_ctrl {
> >   	__le16			icdoff;
> >   	__u8			ctrattr;
> >   	__u8			msdbd;
> > -	__u8			rsvd1804[244];
> > +	__le16			ofcs;
> > +	__u8			dctype;
> > +	__u8			rsvd1807[241];
> >   	struct nvme_id_power_state	psd[32];
> >   	__u8			vs[1024];
> >   };
> 
> Please decode the fields for the sysfs attribute; 'cdc' is far easier to
> understand than '2'.

Ok, I will add a decoder, but just to explain why I did it this way...

I agree that 'cdc' is more human-readable than 2. On the other hand,
decoding to a string has the drawback that if the enumeration ever
changes, we need to add more decode code in the kernel.

As you know, it takes a while for kernel changes to make it into a distro.
By simply printing the decimal value we never have to change the kernel
again, and any value added to the enum in the future will just show up.
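
That said, the decoded version is easy enough. Something along these
lines (untested sketch; the strings are up for bikeshedding, and it
still needs to be wired into nvme_dev_attrs[]):

static ssize_t dctype_show(struct device *dev,
			   struct device_attribute *attr, char *buf)
{
	struct nvme_ctrl *ctrl = dev_get_drvdata(dev);

	switch (ctrl->dctype) {
	case NVME_DCTYPE_NOT_REPORTED:
		return sysfs_emit(buf, "none\n");
	case NVME_DCTYPE_DDC:
		return sysfs_emit(buf, "ddc\n");
	case NVME_DCTYPE_CDC:
		return sysfs_emit(buf, "cdc\n");
	default:
		/* future enum values still show up, just not decoded */
		return sysfs_emit(buf, "%d\n", ctrl->dctype);
	}
}
static DEVICE_ATTR_RO(dctype);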

> 
> Cheers,
> 
> Hannes
> --
> Dr. Hannes Reinecke		           Kernel Storage Architect
> hare@suse.de			                  +49 911 74053 688
> SUSE Software Solutions Germany GmbH, Maxfeldstr. 5, 90409 Nürnberg
> HRB 36809 (AG Nürnberg), GF: Felix Imendörffer


^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH 3/4] nvme-fabrics: add tp8010 support
  2022-01-27 13:30       ` Belanger, Martin
@ 2022-01-27 14:28         ` Hannes Reinecke
  2022-01-27 21:59         ` Sagi Grimberg
  1 sibling, 0 replies; 32+ messages in thread
From: Hannes Reinecke @ 2022-01-27 14:28 UTC (permalink / raw)
  To: Belanger, Martin, Sagi Grimberg, Martin Belanger, linux-nvme
  Cc: kbusch, axboe, hch

On 1/27/22 14:30, Belanger, Martin wrote:
>>>> TP8010 introduces Central Discovery Controllers (CDC) and the
>>>> Discovery Information Management (DIM) PDU, which is used to perform
>>>> explicit registration with Discovery Controllers.
>>>>
>>>> This patch defines the DIM PDU and adds logic to perform explicit
>>>> registration with DCs when registration has been requested by the
>>>> user.
>>>>
[ .. ]
>>> I'd rather like to have the TLS bits in a separate patch; they are not
>>> strictly related to this issue.
>>>
>>> And I have a rather general issue with this: Is it required to issue an
>>> explicit registration directly after sending the 'connect' command?
>>> IE do we need to have it as part of the 'connect' routine?
>>>
>>> Thing is, the linux 'connect' routine is doing several things in one go
>>> (like establishing a transport association for the admin queue, send the
>>> 'connect' command for the admin queue, create transport associations for
>>> I/O queues and send 'connect' commands on all I/O queues, _and_ possibly
>>> do authentication, too).
>>> If we now start overloading it with yet more functionality it becomes
>>> extremely hard to cleanup after an error, and that doesn't even mention
>>> the non-existing error reporting to userspace.
>>>
>>> Wouldn't it make more sense to delegate explicit registration to
>>> userspace (ie libnvme/nvme-cli), and leave the kernel out of it?
>>
>> It would. There is no reason what-so-ever for all this register
>> stuff to live in the kernel. We have a way to passthru commands so
>> it can and should move to userspace.
> 
> I wish it could be delegated to a user-space app, and in fact that was my
> original design.
> 
> Unfortunately, while testing I realized that the kernel can autonomously
> reconnect and user-space apps are completely unaware of it.
> 
> For example, let's say the network goes down momentarily. The kernel then
> tries to reconnect. Once it successfully reconnects it doesn't tell anyone
> about it.
> 
> But let's say the kernel does send a signal to user-space on a reconnect.
> What if there is no user-space app to receive this signal? I'm thinking of
> the case where one uses nvme-cli to set up persistent connections to
> discovery controllers.  In that case there is no app to send the explicit
> registration on a re-connect.
> 
Hmm. There must be something I'm not getting.

Either we have to re-register every time a connection drops, but then
the patch to de-register on disconnect is pointless.
Or we need to explicitly de-register when disconnecting, but then we
don't need to re-register after connection failures.

Which one is it?

Cheers,

Hannes
-- 
Dr. Hannes Reinecke		           Kernel Storage Architect
hare@suse.de			                  +49 911 74053 688
SUSE Software Solutions Germany GmbH, Maxfeldstr. 5, 90409 Nürnberg
HRB 36809 (AG Nürnberg), GF: Felix Imendörffer


^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH 3/4] nvme-fabrics: add tp8010 support
  2022-01-27 13:30       ` Belanger, Martin
  2022-01-27 14:28         ` Hannes Reinecke
@ 2022-01-27 21:59         ` Sagi Grimberg
  2022-01-28 17:55           ` Belanger, Martin
  1 sibling, 1 reply; 32+ messages in thread
From: Sagi Grimberg @ 2022-01-27 21:59 UTC (permalink / raw)
  To: Belanger, Martin, Hannes Reinecke, Martin Belanger, linux-nvme
  Cc: kbusch, axboe, hch


>>> Wouldn't it make more sense to delegate explicit registration to
>>> userspace (ie libnvme/nvme-cli), and leave the kernel out of it?
>>
>> It would. There is no reason what-so-ever for all this register
>> stuff to live in the kernel. We have a way to passthru commands so
>> it can and should move to userspace.
> 
> I wish it could be delegated to a user-space app, and in fact that was my
> original design.
> 
> Unfortunately, while testing I realized that the kernel can autonomously
> reconnect and user-space apps are completely unaware of it.
> 
> For example, let's say the network goes down momentarily. The kernel then
> tries to reconnect. Once it successfully reconnects it doesn't tell anyone
> about it.

Then add a uevent on the controller device-node. From there you should
trap it and do what you need (exactly like how discovery log change
events are handled).

BTW, I also don't understand why the host needs this reregistration on
reconnect, but that is besides the point.

> But let's say the kernel does send a signal to user-space on a reconnect.
> What if there is no user-space app to receive this signal? I'm thinking of
> the case where one uses nvme-cli to set up persistent connections to
> discovery controllers.  In that case there is no app to send the explicit
> registration on a re-connect.

This argument does not justify adding functionality in the kernel that
doesn't belong there. If we were to follow this argument we would be
placing everything in the kernel. If someone wants this functionality,
he/she needs to use the tools required for it to work.


^ permalink raw reply	[flat|nested] 32+ messages in thread

* RE: [PATCH 3/4] nvme-fabrics: add tp8010 support
  2022-01-27 21:59         ` Sagi Grimberg
@ 2022-01-28 17:55           ` Belanger, Martin
  2022-01-28 21:49             ` Hannes Reinecke
  0 siblings, 1 reply; 32+ messages in thread
From: Belanger, Martin @ 2022-01-28 17:55 UTC (permalink / raw)
  To: Sagi Grimberg, Hannes Reinecke, Martin Belanger, linux-nvme
  Cc: kbusch, axboe, hch

> >>> Wouldn't it make more sense to delegate explicit registration to
> >>> userspace (ie libnvme/nvme-cli), and leave the kernel out of it?
> >>
> >> It would. There is no reason what-so-ever for all this register
> >> stuff to live in the kernel. We have a way to passthru commands
> >> so it can and should move to userspace.
> >
> > I wish it could be delegated to a user-space app, and in fact that was
> > my original design.
> >
> > Unfortunately, while testing I realized that the kernel can
> > autonomously reconnect and user-space apps are completely unaware of it.
> >
> > For example, let's say the network goes down momentarily. The kernel
> > then tries to reconnect. Once it successfully reconnects it doesn't
> > tell anyone about it.
> 
> Then add a uevent on the controller device-node. From there you should
> trap it and do what you need (exactly like how discovery log change events
> are handled).
> 
> BTW, I also don't understand why the host needs this reregistration on
> reconnect, but that is besides the point.

The fact is that when connectivity is lost and restored, we don’t 
know whether changes have occurred on the host (e.g. maybe the 
symbolic name was changed while connectivity was lost, etc.)
and thus registration must be reapplied every time the host reconnects
to make sure there is no stale information at the discovery controller.

> 
> > But let's say the kernel does send a signal to user-space on a reconnect.
> > What if there is no user-space app to receive this signal? I'm
> > thinking of the case where one uses nvme-cli to set up persistent
> > connections to discovery controllers.  In that case there is no app to
> > send the explicit registration on a re-connect.
> 
> This argument does not justify adding functionality in the kernel that doesn't
> belong there. If we were to follow this argument we would be placing
> everything in the kernel. If someone wants this functionality, he/she needs
> to use the tools required for it to work.

Ok, Hannes, Sagi, Christoph, et al. I got the message loud and clear. 
Explicit Registration does not belong in the kernel. And, as you suggested,
the kernel needs to send a uevent when it loses connectivity and 
another uevent when connectivity is restored. This will allow userspace 
applications know when to reapply registration (if needed).

I'm not a uevent expert (at least not when it comes to sending uevents
from the kernel). From what I read, there's only a limited set of uevents
defined (i.e. add, remove, move, online, offline). The nvme driver already
uses the uevents "add" and "remove" when nvme devices are "created" and
"deleted" respectively. We also have the "change" event that is used
when there's a change in the Log Pages. May I suggest that we use the
"offline" and "online" events when connectivity is "lost" and "restored"
respectively? Please let me know.
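
For what it's worth, the userspace side of registration looks doable
with the existing passthru interface. A rough, untested sketch (using
the DIM opcode and TAS values from this series; building the actual
struct nvmf_dim_data payload and error handling are elided):

#include <stdint.h>
#include <string.h>
#include <sys/ioctl.h>
#include <linux/nvme_ioctl.h>

/* fd is an open handle to the discovery controller's char device,
 * e.g. /dev/nvme2; dim points to a fully built DIM payload. */
static int dim_send(int fd, void *dim, uint32_t len, uint32_t tas)
{
	struct nvme_admin_cmd cmd;

	memset(&cmd, 0, sizeof(cmd));
	cmd.opcode   = 0x21;           /* nvme_admin_dim (this series) */
	cmd.addr     = (uintptr_t)dim; /* struct nvmf_dim_data */
	cmd.data_len = len;
	cmd.cdw10    = tas;            /* NVME_TAS_REGISTER, etc. */

	return ioctl(fd, NVME_IOCTL_ADMIN_CMD, &cmd);
}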

Thanks,
Martin


^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH 3/4] nvme-fabrics: add tp8010 support
  2022-01-28 17:55           ` Belanger, Martin
@ 2022-01-28 21:49             ` Hannes Reinecke
  2022-01-28 23:02               ` Belanger, Martin
  0 siblings, 1 reply; 32+ messages in thread
From: Hannes Reinecke @ 2022-01-28 21:49 UTC (permalink / raw)
  To: Belanger, Martin, Sagi Grimberg, Martin Belanger, linux-nvme
  Cc: kbusch, axboe, hch

On 1/28/22 18:55, Belanger, Martin wrote:
>>>>> Wouldn't it make more sense to delegate explicit registration to
>>>>> userspace (ie libnvme/nvme-cli), and leave the kernel out of it?
>>>>
>>>> It would. There is no reason what-so-ever for all this register
>>>> stuff to live in the kernel. We have a way to passthru commands
>>>> so it can and should move to userspace.
>>>
>>> I wish it could be delegated to a user-space app, and in fact that was
>>> my original design.
>>>
>>> Unfortunately, while testing I realized that the kernel can
>>> autonomously reconnect and user-space apps are completely unaware of it.
>>>
>>> For example, let's say the network goes down momentarily. The kernel
>>> then tries to reconnect. Once it successfully reconnects it doesn't
>>> tell anyone about it.
>>
>> Then add a uevent on the controller device-node. From there you should
>> trap it and do what you need (exactly like how discovery log change events
>> are handled).
>>
>> BTW, I also don't understand why the host needs this reregistration on
>> reconnect, but that is besides the point.
> 
> The fact is that when connectivity is lost and restored, we don’t
> know whether changes have occurred on the host (e.g. maybe the
> symbolic name was changed while connectivity was lost, etc.)
> and thus registration must be reapplied every time the host reconnects
> to make sure there is no stale information at the discovery controller.
> 
>>
>>> But let's say the kernel does send a signal to user-space on a reconnect.
>>> What if there is no user-space app to receive this signal? I'm
>>> thinking of the case where one uses nvme-cli to set up persistent
>>> connections to discovery controllers.  In that case there is no app to
>>> send the explicit registration on a re-connect.
>>
>> This argument does not justify adding functionality in the kernel that doesn't
>> belong there. If we were to follow this argument we would be placing
>> everything in the kernel. If someone wants this functionality, he/she needs
>> to use the tools required for it to work.
> 
> Ok, Hannes, Sagi, Christoph, et al. I got the message loud and clear.
> Explicit Registration does not belong in the kernel. And, as you suggested,
> the kernel needs to send a uevent when it loses connectivity and
> another uevent when connectivity is restored. This will allow userspace
> applications know when to reapply registration (if needed).
> 
> I'm not a uevent expert (at least how to send uevents from the kernel).
>  From what I read, there's only a limited set of uevents defined (i.e. add,
> remove, move, online, offline). The nvme driver already uses the
> uevents "add"  and "remove" when nvme devices are "created" and
> "deleted" respectively . We also have  the "change" event that is used
> when there's a change in the Log Pages. May I suggest that we use the
> "offline" and "online" events when connectivity is "lost" and "restored"
> respectively? Please let me know.
> 

I guess we already have that; cf commit f6f09c15a767 ("nvme: generate 
uevent once a multipath namespace is operational again").

When multipath is activated (and I guess that will be for nvme-tcp) you 
will receive a 'change' uevent whenever a path gets reinstated.

So that will be the time when an explicit registration will need to be 
reapplied.

Cheers,

Hannes
-- 
Dr. Hannes Reinecke                Kernel Storage Architect
hare@suse.de                              +49 911 74053 688
SUSE Software Solutions GmbH, Maxfeldstr. 5, 90409 Nürnberg
HRB 36809 (AG Nürnberg), Geschäftsführer: Felix Imendörffer


^ permalink raw reply	[flat|nested] 32+ messages in thread

* RE: [PATCH 3/4] nvme-fabrics: add tp8010 support
  2022-01-28 21:49             ` Hannes Reinecke
@ 2022-01-28 23:02               ` Belanger, Martin
  2022-01-29  8:43                 ` Hannes Reinecke
  0 siblings, 1 reply; 32+ messages in thread
From: Belanger, Martin @ 2022-01-28 23:02 UTC (permalink / raw)
  To: Hannes Reinecke, Sagi Grimberg, Martin Belanger, linux-nvme
  Cc: kbusch, axboe, hch


> On 1/28/22 18:55, Belanger, Martin wrote:
> >>>>> Wouldn't it make more sense to delegate explicit registration to
> >>>>> userspace (ie libnvme/nvme-cli), and leave the kernel out of it?
> >>>>
> >>>> It would. There is no reason what-so-ever for all this
> >>>> register stuff to live in the kernel. We have a way to
> >>>> passthru commands so it can and should move to userspace.
> >>>
> >>> I wish it could be delegated to a user-space app, and in fact that
> >>> was my original design.
> >>>
> >>> Unfortunately, while testing I realized that the kernel can
> >>> autonomously reconnect and user-space apps are completely unaware of it.
> >>>
> >>> For example, let's say the network goes down momentarily. The kernel
> >>> then tries to reconnect. Once it successfully reconnects it doesn't
> >>> tell anyone about it.
> >>
> >> Then add a uevent on the controller device-node. From there you
> >> should trap it and do what you need (exactly like how discovery log
> >> change events are handled).
> >>
> >> BTW, I also don't understand why the host needs this reregistration
> >> on reconnect, but that is besides the point.
> >
> > The fact is that when connectivity is lost and restored, we don’t know
> > whether changes have occurred on the host (e.g. maybe the symbolic
> > name was changed while connectivity was lost, etc.) and thus
> > registration must be reapplied every time the host reconnects to make
> > sure there is no stale information at the discovery controller.
> >
> >>
> >>> But let's say the kernel does send a signal to user-space on a reconnect.
> >>> What if there is no user-space app to receive this signal? I'm
> >>> thinking of the case where one uses nvme-cli to set up persistent
> >>> connections to discovery controllers.  In that case there is no app
> >>> to send the explicit registration on a re-connect.
> >>
> >> This argument does not justify adding functionality in the kernel
> >> that doesn't belong there. If we were to follow this argument we
> >> would be placing everything in the kernel. If someone wants this
> >> functionality, he/she needs to use the tools required for it to work.
> >
> > Ok, Hannes, Sagi, Christoph, et al. I got the message loud and clear.
> > Explicit Registration does not belong in the kernel. And, as you
> > suggested, the kernel needs to send a uevent when it loses
> > connectivity and another uevent when connectivity is restored. This
> > will allow userspace applications know when to reapply registration (if
> needed).
> >
> > I'm not a uevent expert (at least how to send uevents from the kernel).
> >  From what I read, there's only a limited set of uevents defined (i.e.
> > add, remove, move, online, offline). The nvme driver already uses the
> > uevents "add"  and "remove" when nvme devices are "created" and
> > "deleted" respectively . We also have  the "change" event that is used
> > when there's a change in the Log Pages. May I suggest that we use the
> > "offline" and "online" events when connectivity is "lost" and "restored"
> > respectively? Please let me know.
> >
> 
> I guess we already have that; cf commit f6f09c15a767 ("nvme: generate
> uevent once a multipath namespace is operational again").
> 
> When multipath is activated (and I guess that will be for nvme-tcp) you will
> receive a 'change' uevent whenever a path gets reinstated.
> 
> So that will be the time when an explicit registration will need to be reapplied.
> 

Just want to make sure I understand. When the connection is lost the kernel 
sends nothing. When the connection is restored the kernel sends a "change" 
uevent. And when the Log Pages have changed the kernel also sends a "change" 
uevent.

It would be nice to have different events because if we use the "change"
uevent for both "log pages have changed" and "connection has been 
restored", then the user-space app will have to send both an "explicit 
registration" and a "get log page" whenever it receives a "change" uevent.
It won’t be able to tell what that "change" uevent is for.

Is that what you want me to do?

Regards,
Martin

> Cheers,
> 
> Hannes
> --
> Dr. Hannes Reinecke                Kernel Storage Architect
> hare@suse.de                              +49 911 74053 688
> SUSE Software Solutions GmbH, Maxfeldstr. 5, 90409 Nürnberg HRB 36809
> (AG Nürnberg), Geschäftsführer: Felix Imendörffer


^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH 3/4] nvme-fabrics: add tp8010 support
  2022-01-28 23:02               ` Belanger, Martin
@ 2022-01-29  8:43                 ` Hannes Reinecke
  2022-01-29 12:23                   ` Belanger, Martin
  0 siblings, 1 reply; 32+ messages in thread
From: Hannes Reinecke @ 2022-01-29  8:43 UTC (permalink / raw)
  To: Belanger, Martin, Sagi Grimberg, Martin Belanger, linux-nvme
  Cc: kbusch, axboe, hch

On 1/29/22 00:02, Belanger, Martin wrote:
> 
>> On 1/28/22 18:55, Belanger, Martin wrote:
>>>>>>> Wouldn't it make more sense to delegate explicit registration to
>>>>>>> userspace (ie libnvme/nvme-cli), and leave the kernel out of it?
>>>>>>
>>>>>> It would. There is no reason what-so-ever for all this
>>>>>> register stuff to live in the kernel. We have a way to
>>>>>> passthru commands so it can and should move to userspace.
>>>>>
>>>>> I wish it could be delegated to a user-space app, and in fact that
>>>>> was my original design.
>>>>>
>>>>> Unfortunately, while testing I realized that the kernel can
> >>>>> autonomously reconnect and user-space apps are completely unaware of it.
>>>>>
>>>>> For example, let's say the network goes down momentarily. The kernel
>>>>> then tries to reconnect. Once it successfully reconnects it doesn't
>>>>> tell anyone about it.
>>>>
>>>> Then add a uevent on the controller device-node. From there you
>>>> should trap it and do what you need (exactly like how discovery log
>>>> change events are handled).
>>>>
>>>> BTW, I also don't understand why the host needs this reregistration
>>>> on reconnect, but that is besides the point.
>>>
>>> The fact is that when connectivity is lost and restored, we don’t know
>>> whether changes have occurred on the host (e.g. maybe the symbolic
>>> name was changed while connectivity was lost, etc.) and thus
>>> registration must be reapplied every time the host reconnects to make
>>> sure there is no stale information at the discovery controller.
>>>
>>>>
>>>>> But let's say the kernel does send a signal to user-space on a reconnect.
>>>>> What if there is no user-space app to receive this signal? I'm
>>>>> thinking of the case where one uses nvme-cli to set up persistent
>>>>> connections to discovery controllers.  In that case there is no app
>>>>> to send the explicit registration on a re-connect.
>>>>
>>>> This argument does not justify adding functionality in the kernel
>>>> that doesn't belong there. If we were to follow this argument we
>>>> would be placing everything in the kernel. If someone wants this
>>>> functionality, he/she needs to use the tools required for it to work.
>>>
>>> Ok, Hannes, Sagi, Christoph, et al. I got the message loud and clear.
>>> Explicit Registration does not belong in the kernel. And, as you
>>> suggested, the kernel needs to send a uevent when it loses
>>> connectivity and another uevent when connectivity is restored. This
>>> will allow userspace applications know when to reapply registration (if
>> needed).
>>>
>>> I'm not a uevent expert (at least how to send uevents from the kernel).
>>>   From what I read, there's only a limited set of uevents defined (i.e.
>>> add, remove, move, online, offline). The nvme driver already uses the
>>> uevents "add"  and "remove" when nvme devices are "created" and
>>> "deleted" respectively . We also have  the "change" event that is used
>>> when there's a change in the Log Pages. May I suggest that we use the
>>> "offline" and "online" events when connectivity is "lost" and "restored"
>>> respectively? Please let me know.
>>>
>>
>> I guess we already have that; cf commit f6f09c15a767 ("nvme: generate
>> uevent once a multipath namespace is operational again").
>>
>> When multipath is activated (and I guess that will be for nvme-tcp) you will
>> receive a 'change' uevent whenever a path gets reinstated.
>>
>> So that will be the time when an explicit registration will need to be reapplied.
>>
> 
> Just want to make sure I understand. When the connection is lost the kernel
> sends nothing. When the connection is restored the kernel sends a "change"
> uevent. And when the Log Pages have changed the kernel also sends a "change"
> uevent.
> 

Yes.

> It would be nice to have different events because if we use the "change"
> uevent for both "log pages have changed" and "connection has been
> restored", then the user-space app will have to send both an "explicit
> registration" and a "get log page" whenever it receives a "change" uevent.
> It won’t be able to tell what that "change" uevent is for.
> 

Au contraire.
Change events come with a full payload detailing what the event is
about. In the case of a 'log page changed' uevent the event points to
the controller device, and the payload is a string
'NVME_AEN=<aen number>'.
In the case of the 'path reinstated' uevent the event points to the 
block device.

So it's easy to differentiate between those two.
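
Eg with a trivial monitor (rough, untested sketch; keying off the
NVME_AEN property):

#include <poll.h>
#include <stdio.h>
#include <string.h>
#include <libudev.h>

int main(void)
{
	struct udev *udev = udev_new();
	struct udev_monitor *mon = udev_monitor_new_from_netlink(udev, "udev");
	struct pollfd pfd;

	udev_monitor_filter_add_match_subsystem_devtype(mon, "nvme", NULL);
	udev_monitor_enable_receiving(mon);
	pfd.fd = udev_monitor_get_fd(mon);
	pfd.events = POLLIN;

	for (;;) {
		struct udev_device *dev;
		const char *aen;

		if (poll(&pfd, 1, -1) <= 0)
			continue;
		dev = udev_monitor_receive_device(mon);
		if (!dev)
			continue;
		if (!strcmp(udev_device_get_action(dev), "change")) {
			aen = udev_device_get_property_value(dev, "NVME_AEN");
			if (aen)	/* log page changed */
				printf("%s: AEN %s\n",
				       udev_device_get_sysname(dev), aen);
			else		/* path reinstated */
				printf("%s: back online\n",
				       udev_device_get_sysname(dev));
		}
		udev_device_unref(dev);
	}
}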

Cheers,

Hannes
-- 
Dr. Hannes Reinecke                Kernel Storage Architect
hare@suse.de                              +49 911 74053 688
SUSE Software Solutions GmbH, Maxfeldstr. 5, 90409 Nürnberg
HRB 36809 (AG Nürnberg), Geschäftsführer: Felix Imendörffer


^ permalink raw reply	[flat|nested] 32+ messages in thread

* RE: [PATCH 3/4] nvme-fabrics: add tp8010 support
  2022-01-29  8:43                 ` Hannes Reinecke
@ 2022-01-29 12:23                   ` Belanger, Martin
  2022-01-29 12:47                     ` Hannes Reinecke
  0 siblings, 1 reply; 32+ messages in thread
From: Belanger, Martin @ 2022-01-29 12:23 UTC (permalink / raw)
  To: Hannes Reinecke, Sagi Grimberg, Martin Belanger, linux-nvme
  Cc: kbusch, axboe, hch

> 
> On 1/29/22 00:02, Belanger, Martin wrote:
> >
> >> On 1/28/22 18:55, Belanger, Martin wrote:
> >>>>>>> Wouldn't it make more sense to delegate explicit registration to
> >>>>>>> userspace (ie libnvme/nvme-cli), and leave the kernel out of it?
> >>>>>>
> >>>>>> It would. There is no reason what-so-ever for all this
> >>>>>> register stuff to live in the kernel. We have a way to
> >>>>>> passthru commands so it can and should move to userspace.
> >>>>>
> >>>>> I wish it could be delegated to a user-space app, and in fact that
> >>>>> was my original design.
> >>>>>
> >>>>> Unfortunately, while testing I realized that the kernel can
> >>>>> autonomously reconnect and user-space apps are completely unaware of it.
> >>>>>
> >>>>> For example, let's say the network goes down momentarily. The
> >>>>> kernel then tries to reconnect. Once it successfully reconnects it
> >>>>> doesn't tell anyone about it.
> >>>>
> >>>> Then add a uevent on the controller device-node. From there you
> >>>> should trap it and do what you need (exactly like how discovery log
> >>>> change events are handled).
> >>>>
> >>>> BTW, I also don't understand why the host needs this reregistration
> >>>> on reconnect, but that is besides the point.
> >>>
> >>> The fact is that when connectivity is lost and restored, we don’t
> >>> know whether changes have occurred on the host (e.g. maybe the
> >>> symbolic name was changed while connectivity was lost, etc.) and
> >>> thus registration must be reapplied every time the host reconnects
> >>> to make sure there is no stale information at the discovery controller.
> >>>
> >>>>
> >>>>> But let's say the kernel does send a signal to user-space on a reconnect.
> >>>>> What if there is no user-space app to receive this signal? I'm
> >>>>> thinking of the case where one uses nvme-cli to set up persistent
> >>>>> connections to discovery controllers.  In that case there is no
> >>>>> app to send the explicit registration on a re-connect.
> >>>>
> >>>> This argument does not justify adding functionality in the kernel
> >>>> that doesn't belong there. If we were to follow this argument we
> >>>> would be placing everything in the kernel. If someone wants this
> >>>> functionality, he/she needs to use the tools required for it to work.
> >>>
> >>> Ok, Hannes, Sagi, Christoph, et al. I got the message loud and clear.
> >>> Explicit Registration does not belong in the kernel. And, as you
> >>> suggested, the kernel needs to send a uevent when it loses
> >>> connectivity and another uevent when connectivity is restored. This
> >>> will allow userspace applications know when to reapply registration
> >>> (if
> >> needed).
> >>>
> >>> I'm not a uevent expert (at least how to send uevents from the kernel).
> >>>   From what I read, there's only a limited set of uevents defined (i.e.
> >>> add, remove, move, online, offline). The nvme driver already uses
> >>> the uevents "add"  and "remove" when nvme devices are "created" and
> >>> "deleted" respectively . We also have  the "change" event that is
> >>> used when there's a change in the Log Pages. May I suggest that we
> >>> use the "offline" and "online" events when connectivity is "lost" and "restored"
> >>> respectively? Please let me know.
> >>>
> >>
> >> I guess we already have that; cf commit f6f09c15a767 ("nvme: generate
> >> uevent once a multipath namespace is operational again").
> >>
> >> When multipath is activated (and I guess that will be for nvme-tcp)
> >> you will receive a 'change' uevent whenever a path gets reinstated.
> >>
> >> So that will be the time when an explicit registration will need to be
> reapplied.
> >>
> >
> > Just want to make sure I understand. When the connection is lost the
> > kernel sends nothing. When the connection is restored the kernel sends a "change"
> > uevent. And when the Log Pages have changed the kernel also sends a "change"
> > uevent.
> >
> 
> Yes.
> 
> > It would be nice to have different events because if we use the "change"
> > uevent for both "log pages have changed" and "connection has been
> > restored", then the user-space app will have to send both an "explicit
> > registration" and a "get log page" whenever it receives a "change" uevent.
> > It won’t be able to tell what that "change" uevent is for.
> >
> 
> Au contraire.
> Change events come with a full payload detailing out what the event is about.
> In the case of a 'log page changed' uevent the event points to the controller
> device, and payload is a string 'NVME_AEN=<aen number>'.
> In the case of the 'path reinstated' uevent the event points to the block
> device.

There is no *block device*. It's a connection to a Discovery Controller.

> 
> So it's easy to differentiate between those two.
> 
> Cheers,
> 
> Hannes
> --
> Dr. Hannes Reinecke                Kernel Storage Architect
> hare@suse.de                              +49 911 74053 688
> SUSE Software Solutions GmbH, Maxfeldstr. 5, 90409 Nürnberg HRB 36809
> (AG Nürnberg), Geschäftsführer: Felix Imendörffer


^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH 3/4] nvme-fabrics: add tp8010 support
  2022-01-29 12:23                   ` Belanger, Martin
@ 2022-01-29 12:47                     ` Hannes Reinecke
  2022-01-30  8:46                       ` Sagi Grimberg
  0 siblings, 1 reply; 32+ messages in thread
From: Hannes Reinecke @ 2022-01-29 12:47 UTC (permalink / raw)
  To: Belanger, Martin, Sagi Grimberg, Martin Belanger, linux-nvme
  Cc: kbusch, axboe, hch

On 1/29/22 13:23, Belanger, Martin wrote:
>>
>> On 1/29/22 00:02, Belanger, Martin wrote:
>>>
[ .. ]
>>> It would be nice to have different events because if we use the "change"
>>> uevent for both "log pages have changed" and "connection has been
>>> restored", then the user-space app will have to send both an "explicit
>>> registration" and a "get log page" whenever it receives a "change" uevent.
>>> It won’t be able to tell what that "change" uevent is for.
>>>
>>
>> Au contraire.
>> Change events come with a full payload detailing out what the event is about.
>> In the case of a 'log page changed' uevent the event points to the controller
>> device, and payload is a string 'NVME_AEN=<aen number>'.
>> In the case of the 'path reinstated' uevent the event points to the block
>> device.
> 
> There is no *block device*. It's a connection to a Discovery Controller.
> 
Ah. Correct. Guess we'll have to add an uevent here.

Cheers,

Hannes
-- 
Dr. Hannes Reinecke                Kernel Storage Architect
hare@suse.de                              +49 911 74053 688
SUSE Software Solutions GmbH, Maxfeldstr. 5, 90409 Nürnberg
HRB 36809 (AG Nürnberg), Geschäftsführer: Felix Imendörffer


^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH 3/4] nvme-fabrics: add tp8010 support
  2022-01-29 12:47                     ` Hannes Reinecke
@ 2022-01-30  8:46                       ` Sagi Grimberg
  2022-01-30 13:18                         ` Sagi Grimberg
  2022-01-31 12:08                         ` Belanger, Martin
  0 siblings, 2 replies; 32+ messages in thread
From: Sagi Grimberg @ 2022-01-30  8:46 UTC (permalink / raw)
  To: Hannes Reinecke, Belanger, Martin, Martin Belanger, linux-nvme
  Cc: kbusch, axboe, hch


> [ .. ]
>>>> It would be nice to have different events because if we use the 
>>>> "change"
>>>> uevent for both "log pages have changed" and "connection has been
>>>> restored", then the user-space app will have to send both an "explicit
>>>> registration" and a "get log page" whenever it receives a "change" 
>>>> uevent.
>>>> It won’t be able to tell what that "change" uevent is for.
>>>>
>>>
>>> Au contraire.
>>> Change events come with a full payload detailing out what the event 
>>> is about.
>>> In the case of a 'log page changed' uevent the event points to the 
>>> controller
>>> device, and payload is a string 'NVME_AEN=<aen number>'.
>>> In the case of the 'path reinstated' uevent the event points to the 
>>> block
>>> device.
>>
>> There is no *block device*. It's a connection to a Discovery Controller.
>>
> Ah. Correct. Guess we'll have to add an uevent here.

Yes, there is no bdev for discovery controllers. We may add a uevent
for userspace to consume as soon as the transport reconnects or when
a controller (re)starts.

Don't see any point for the disconnect part as this is async and will
not have any ordering with the actual disconnect. Userspace will need
to deregister before it disconnects the discovery controller.
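
IOW the teardown order is userspace's responsibility. Something like
this (untested sketch; dim_send() being whatever passthru DIM helper
ends up in libnvme/nvme-cli, like the one sketched earlier in this
thread):

#include <stdio.h>
#include <stdint.h>

/* hypothetical passthru helper, as sketched earlier in this thread */
extern int dim_send(int fd, void *dim, uint32_t len, uint32_t tas);

/* dim/dim_len: a prepared struct nvmf_dim_data payload */
static int disconnect_dc(int fd, void *dim, uint32_t dim_len,
			 const char *ctrl /* e.g. "nvme2" */)
{
	char path[256];
	FILE *f;

	/* deregister first, while the association is still up */
	if (dim_send(fd, dim, dim_len, 0x01 /* NVME_TAS_DEREGISTER */) < 0)
		return -1;

	/* then tear down the persistent connection via sysfs */
	snprintf(path, sizeof(path),
		 "/sys/class/nvme/%s/delete_controller", ctrl);
	f = fopen(path, "w");
	if (!f)
		return -1;
	fputs("1", f);
	fclose(f);
	return 0;
}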


^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH 3/4] nvme-fabrics: add tp8010 support
  2022-01-30  8:46                       ` Sagi Grimberg
@ 2022-01-30 13:18                         ` Sagi Grimberg
  2022-01-31 12:08                         ` Belanger, Martin
  1 sibling, 0 replies; 32+ messages in thread
From: Sagi Grimberg @ 2022-01-30 13:18 UTC (permalink / raw)
  To: Hannes Reinecke, Belanger, Martin, Martin Belanger, linux-nvme
  Cc: kbusch, axboe, hch


> Yes, there is no bdev for discovery controllers. We may add a uevent
> for userspace to consume as soon as the transport reconnects or when
> a controller (re)starts.

Something like this should be sufficient:
--
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index dd18861f77c0..55e11ee19460 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -4396,6 +4396,7 @@ void nvme_start_ctrl(struct nvme_ctrl *ctrl)
                 nvme_queue_scan(ctrl);
                 nvme_start_queues(ctrl);
         }
+       kobject_uevent(&ctrl->device->kobj, KOBJ_ONLINE);
  }
  EXPORT_SYMBOL_GPL(nvme_start_ctrl);
--


^ permalink raw reply related	[flat|nested] 32+ messages in thread

* RE: [PATCH 3/4] nvme-fabrics: add tp8010 support
  2022-01-30  8:46                       ` Sagi Grimberg
  2022-01-30 13:18                         ` Sagi Grimberg
@ 2022-01-31 12:08                         ` Belanger, Martin
  2022-01-31 16:05                           ` Sagi Grimberg
  1 sibling, 1 reply; 32+ messages in thread
From: Belanger, Martin @ 2022-01-31 12:08 UTC (permalink / raw)
  To: Sagi Grimberg, Hannes Reinecke, Martin Belanger, linux-nvme
  Cc: kbusch, axboe, hch

> > [ .. ]
> >>>> It would be nice to have different events because if we use the
> >>>> "change"
> >>>> uevent for both "log pages have changed" and "connection has been
> >>>> "explicit registration" and a "get log page" whenever it receives a "change"
> >>>> uevent.
> >>>> uevent.
> >>>> It won’t be able to tell what that "change" uevent is for.
> >>>>
> >>>
> >>> Au contraire.
> >>> Change events come with a full payload detailing out what the event
> >>> is about.
> >>> In the case of a 'log page changed' uevent the event points to the
> >>> controller device, and payload is a string 'NVME_AEN=<aen number>'.
> >>> In the case of the 'path reinstated' uevent the event points to the
> >>> block device.
> >>
> >> There is no *block device*. It's a connection to a Discovery Controller.
> >>
> > Ah. Correct. Guess we'll have to add an uevent here.
> 
> Yes, thgere is no bdev for discovery controllers. We may add a uevent for
> userspace to consume as soon as the transport reconnects or when a
> controller (re)starts.
> 
> Don't see any point for the disconnect part as this is async and will not have
> any ordering with the actual disconnect. userspace will need to do it before it
> disconnects the discovery controller.

Hi Sagi. Thanks for the code snippet. I just had a question about your comment.

You say that you don’t see any point in sending an event on the disconnect
because it is async. I don’t quite understand that part. For example, I've
observed that when connectivity is lost the kernel will try to reconnect every
10 sec. And it will keep trying 60 times (i.e. 10 min) before
giving up. Wouldn't sending an "offline" event when connectivity is lost
give user space apps a heads-up that the nvme device may not respond?

Thanks,
Martin


^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH 3/4] nvme-fabrics: add tp8010 support
  2022-01-31 12:08                         ` Belanger, Martin
@ 2022-01-31 16:05                           ` Sagi Grimberg
  2022-01-31 16:16                             ` Belanger, Martin
  0 siblings, 1 reply; 32+ messages in thread
From: Sagi Grimberg @ 2022-01-31 16:05 UTC (permalink / raw)
  To: Belanger, Martin, Hannes Reinecke, Martin Belanger, linux-nvme
  Cc: kbusch, axboe, hch


> Hi Sagi. Thanks for the code snippet. I just had a question about your comment.
> 
> You say that you don’t see any point to send an event on the disconnect
> because it is async.  I don’t quite understand that part. For example, I've
> observed that when connectivity is lost the kernel will try to connect every
> 10 sec. And it will keep trying to connect 60 times (i.e. 10 min) before
> giving up. Wouldn't sending an "offline" event when connectivity is lost
> give user space apps a heads up that the nvme device may not respond.

Why does userspace need it?

When the host disconnects it will send the event asynchronously, so
userspace cannot reliably do anything before the host disconnects.
Other than that, I don't understand what userspace would do with
"offline - I/O is going to fail"...


^ permalink raw reply	[flat|nested] 32+ messages in thread

* RE: [PATCH 3/4] nvme-fabrics: add tp8010 support
  2022-01-31 16:05                           ` Sagi Grimberg
@ 2022-01-31 16:16                             ` Belanger, Martin
  2022-01-31 18:56                               ` Sagi Grimberg
  2022-02-02  8:36                               ` Hannes Reinecke
  0 siblings, 2 replies; 32+ messages in thread
From: Belanger, Martin @ 2022-01-31 16:16 UTC (permalink / raw)
  To: Sagi Grimberg, Hannes Reinecke, Martin Belanger, linux-nvme
  Cc: kbusch, axboe, hch


> > Hi Sagi. Thanks for the code snippet. I just had a question about your comment.
> >
> > You say that you don’t see any point to send an event on the
> > disconnect because it is async.  I don’t quite understand that part.
> > For example, I've observed that when connectivity is lost the kernel
> > will try to connect every
> > 10 sec. And it will keep trying to connect 60 times (i.e. 10 min)
> > before giving up. Wouldn't sending an "offline" event when
> > connectivity is lost give user space apps a heads up that the nvme device
> > may not respond.
> 
> Why does userspace need it?
> 
> When the host will disconnect it will send event asynchronously, so
> userspace cannot reliably do anything before the host disconnects.
> Other than that I don't understand what would userspace do with "offline -
> I/O is going to fail"...

I was just asking for completeness. Some apps may use it.


^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH 3/4] nvme-fabrics: add tp8010 support
  2022-01-25 14:59 ` [PATCH 3/4] nvme-fabrics: add tp8010 support Martin Belanger
  2022-01-27  9:30   ` Hannes Reinecke
@ 2022-01-31 17:03   ` John Meneghini
  2022-01-31 17:17     ` Belanger, Martin
  1 sibling, 1 reply; 32+ messages in thread
From: John Meneghini @ 2022-01-31 17:03 UTC (permalink / raw)
  To: Martin Belanger, linux-nvme
  Cc: kbusch, hare, axboe, hch, sagi, Martin Belanger

Hi Martin.

Properly speaking, registrations are sent to the CDC, not the DC.

Correct?

Can we please change this, everywhere, to be clear about when we are "talking" to the CDC (Centralized Discovery Controller) and 
when we are talking to the DC (Discovery Controller).

/John

On 1/25/22 09:59, Martin Belanger wrote:
> +/**
> + * nvme_get_adrfam() - Get address family for the address we're registering
> + * with the DC. We retrieve this info from the socket itself. If we can't
> + * get the source address from the socket, then we'll infer the address
> + * family from the address of the DC since the DC address has the same
> + * address family.
> + *
> + * @ctrl: Host NVMe controller instance maintaining the admin queue used to
> + *      submit the DIM command to the DC.
> + *
> + * Return: The address family of the source address associated with the
> + * socket connected to the DC.



^ permalink raw reply	[flat|nested] 32+ messages in thread

* RE: [PATCH 3/4] nvme-fabrics: add tp8010 support
  2022-01-31 17:03   ` John Meneghini
@ 2022-01-31 17:17     ` Belanger, Martin
  2022-01-31 18:28       ` John Meneghini
  0 siblings, 1 reply; 32+ messages in thread
From: Belanger, Martin @ 2022-01-31 17:17 UTC (permalink / raw)
  To: John Meneghini, Martin Belanger, linux-nvme
  Cc: kbusch, hare, axboe, hch, sagi

> Hi Martin.
> 
> Properly speaking, registrations are sent to the CDC, not the DC.
> 
> Correct?
> 
> Can we please change this, everywhere, to be clear about when we are
> "talking" to the CDC (Centralized Discovery Controller) and when we are
> talking to the DC (Discovery Controller)?
> 
> /John

Hi John. 

TP8010 originally intended registration (the DIM command) to be sent
to CDCs only. However, this was changed to include Direct Discovery
Controllers (DDCs) as well. That's why I use DC and not CDC.

Also note that TP8010 introduces a new DCTYPE field in the response to
the Identify command, which is used to differentiate between a CDC and
a DDC. Legacy DCs that do not comply with TP8010 will have DCTYPE=0
(i.e. the Discovery Controller type is not reported). So registration
is only attempted for DCTYPE 1 (DDC) and 2 (CDC).
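
As a rough sketch of that gating, assuming a hypothetical helper name
(the nvme_dctype values are the ones added in patch 1/4):

	/* Sketch only: attempt DIM registration solely for TP8010 DCs. */
	static bool nvme_discovery_reg_supported(struct nvme_ctrl *ctrl)
	{
		switch (ctrl->dctype) {
		case NVME_DCTYPE_DDC:		/* 1: Direct Discovery Controller */
		case NVME_DCTYPE_CDC:		/* 2: Central Discovery Controller */
			return true;
		case NVME_DCTYPE_NOT_REPORTED:	/* 0: legacy, pre-TP8010 DC */
		default:
			return false;
		}
	}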

Here you will find the ratified version of TP8010: 
https://nvmexpress.org/wp-content/uploads/NVM-Express-2.0-Ratified-TPs-01242022.zip

Martin

> 
> On 1/25/22 09:59, Martin Belanger wrote:
> > +/**
> > + * nvme_get_adrfam() - Get address family for the address we're registering
> > + * with the DC. We retrieve this info from the socket itself. If we can't
> > + * get the source address from the socket, then we'll infer the address
> > + * family from the address of the DC since the DC address has the same
> > + * address family.
> > + *
> > + * @ctrl: Host NVMe controller instance maintaining the admin queue used to
> > + *      submit the DIM command to the DC.
> > + *
> > + * Return: The address family of the source address associated with the
> > + * socket connected to the DC.


^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH 3/4] nvme-fabrics: add tp8010 support
  2022-01-31 17:17     ` Belanger, Martin
@ 2022-01-31 18:28       ` John Meneghini
  0 siblings, 0 replies; 32+ messages in thread
From: John Meneghini @ 2022-01-31 18:28 UTC (permalink / raw)
  To: Belanger, Martin, Martin Belanger, linux-nvme
  Cc: kbusch, hare, axboe, hch, sagi

OK. Thanks, Martin. That explains it.

On 1/31/22 12:17, Belanger, Martin wrote:
>> Hi Martin.
>>
>> Properly speaking, registrations are sent to the CDC, not the DC.
>>
>> Correct?
>>
>> Can we please change this, everywhere, to be clear about when we are
>> "talking" to the CDC (Centralized Discovery Controller) and when we are
>> talking to the DC (Discovery Controller)?
>>
>> /John
> 
> Hi John.
> 
> TP8010 originally intended registration (the DIM command) to be sent
> to CDCs only. However, this was changed to include Direct Discovery
> Controllers (DDCs) as well. That's why I use DC and not CDC.
> 
> Also note that TP8010 introduces a new DCTYPE field in the response to
> the Identify command, which is used to differentiate between a CDC and
> a DDC. Legacy DCs that do not comply with TP8010 will have DCTYPE=0
> (i.e. the Discovery Controller type is not reported). So registration
> is only attempted for DCTYPE 1 (DDC) and 2 (CDC).
> 
> Here you will find the ratified version of TP8010:
> https://nvmexpress.org/wp-content/uploads/NVM-Express-2.0-Ratified-TPs-01242022.zip
> 
> Martin
> 
>>
>> On 1/25/22 09:59, Martin Belanger wrote:
>>> +/**
>>> + * nvme_get_adrfam() - Get address family for the address we're registering
>>> + * with the DC. We retrieve this info from the socket itself. If we can't
>>> + * get the source address from the socket, then we'll infer the address
>>> + * family from the address of the DC since the DC address has the same
>>> + * address family.
>>> + *
>>> + * @ctrl: Host NVMe controller instance maintaining the admin queue used to
>>> + *      submit the DIM command to the DC.
>>> + *
>>> + * Return: The address family of the source address associated with the
>>> + * socket connected to the DC.



^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH 3/4] nvme-fabrics: add tp8010 support
  2022-01-31 16:16                             ` Belanger, Martin
@ 2022-01-31 18:56                               ` Sagi Grimberg
  2022-02-02  8:36                               ` Hannes Reinecke
  1 sibling, 0 replies; 32+ messages in thread
From: Sagi Grimberg @ 2022-01-31 18:56 UTC (permalink / raw)
  To: Belanger, Martin, Hannes Reinecke, Martin Belanger, linux-nvme
  Cc: kbusch, axboe, hch


>>> You say that you don't see any point in sending an event on
>>> disconnect because it is async. I don't quite understand that part.
>>> For example, I've observed that when connectivity is lost the kernel
>>> will try to reconnect every 10 sec, and it will keep trying 60 times
>>> (i.e. 10 min) before giving up. Wouldn't sending an "offline" event
>>> when connectivity is lost give user space apps a heads-up that the
>>> NVMe device may not respond?
>>
>> Why does userspace need it?
>>
>> When the host disconnects, it sends the event asynchronously, so
>> userspace cannot reliably do anything before the host disconnects.
>> Other than that, I don't understand what userspace would do with
>> "offline - I/O is going to fail"...
> 
> I was just asking for completeness. Some apps may use it.

I think we'd want a real use case before we decide to expose it.


^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH 3/4] nvme-fabrics: add tp8010 support
  2022-01-31 16:16                             ` Belanger, Martin
  2022-01-31 18:56                               ` Sagi Grimberg
@ 2022-02-02  8:36                               ` Hannes Reinecke
  1 sibling, 0 replies; 32+ messages in thread
From: Hannes Reinecke @ 2022-02-02  8:36 UTC (permalink / raw)
  To: Belanger, Martin, Sagi Grimberg, Martin Belanger, linux-nvme
  Cc: kbusch, axboe, hch

On 1/31/22 17:16, Belanger, Martin wrote:
> 
>>> Hi Sagi. Thanks for the code snippet. I just had a question about your
>>> comment.
>>>
>>> You say that you don't see any point in sending an event on
>>> disconnect because it is async. I don't quite understand that part.
>>> For example, I've observed that when connectivity is lost the kernel
>>> will try to reconnect every 10 sec, and it will keep trying 60 times
>>> (i.e. 10 min) before giving up. Wouldn't sending an "offline" event
>>> when connectivity is lost give user space apps a heads-up that the
>>> NVMe device may not respond?
>>
>> Why does userspace need it?
>>
>> When the host disconnects, it sends the event asynchronously, so
>> userspace cannot reliably do anything before the host disconnects.
>> Other than that, I don't understand what userspace would do with
>> "offline - I/O is going to fail"...
> 
> I was just asking for completeness. Some apps may use it.
> 
We had a similar discussion during udev development; originally we had
KOBJ_ONLINE and KOBJ_OFFLINE uevents. It turned out that the KOBJ_OFFLINE
events were quite pointless (as Sagi indicated), and hence we settled on a
single KOBJ_CHANGE event, which is typically sent when a device/system
gets back online.
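
For reference, a minimal sketch of emitting that single event from driver
code — the wrapper function and the device pointer here are hypothetical,
but kobject_uevent() and KOBJ_CHANGE are the real kernel interface:

	#include <linux/device.h>
	#include <linux/kobject.h>

	/* Sketch: userspace (udev) sees this as an ACTION=="change" uevent. */
	static void example_notify_back_online(struct device *dev)
	{
		kobject_uevent(&dev->kobj, KOBJ_CHANGE);
	}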

Cheers,

Hannes
-- 
Dr. Hannes Reinecke                Kernel Storage Architect
hare@suse.de                              +49 911 74053 688
SUSE Software Solutions GmbH, Maxfeldstr. 5, 90409 Nürnberg
HRB 36809 (AG Nürnberg), Geschäftsführer: Felix Imendörffer


^ permalink raw reply	[flat|nested] 32+ messages in thread

end of thread, other threads:[~2022-02-02  8:36 UTC | newest]

Thread overview: 32+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
     [not found] <20220125145956.14746-1-nitram_67@hotmail.com>
2022-01-25 14:59 ` [PATCH 1/4] nvme-fabrics: add discovery controller type Martin Belanger
2022-01-27  9:16   ` Hannes Reinecke
2022-01-27 13:37     ` Belanger, Martin
2022-01-27 13:16   ` Sagi Grimberg
2022-01-25 14:59 ` [PATCH 2/4] nvme: add host symbolic name Martin Belanger
2022-01-27  9:18   ` Hannes Reinecke
2022-01-27 13:20   ` Sagi Grimberg
2022-01-25 14:59 ` [PATCH 3/4] nvme-fabrics: add tp8010 support Martin Belanger
2022-01-27  9:30   ` Hannes Reinecke
2022-01-27 13:12     ` Sagi Grimberg
2022-01-27 13:30       ` Belanger, Martin
2022-01-27 14:28         ` Hannes Reinecke
2022-01-27 21:59         ` Sagi Grimberg
2022-01-28 17:55           ` Belanger, Martin
2022-01-28 21:49             ` Hannes Reinecke
2022-01-28 23:02               ` Belanger, Martin
2022-01-29  8:43                 ` Hannes Reinecke
2022-01-29 12:23                   ` Belanger, Martin
2022-01-29 12:47                     ` Hannes Reinecke
2022-01-30  8:46                       ` Sagi Grimberg
2022-01-30 13:18                         ` Sagi Grimberg
2022-01-31 12:08                         ` Belanger, Martin
2022-01-31 16:05                           ` Sagi Grimberg
2022-01-31 16:16                             ` Belanger, Martin
2022-01-31 18:56                               ` Sagi Grimberg
2022-02-02  8:36                               ` Hannes Reinecke
2022-01-31 17:03   ` John Meneghini
2022-01-31 17:17     ` Belanger, Martin
2022-01-31 18:28       ` John Meneghini
2022-01-25 14:59 ` [PATCH 4/4] nvme-fabrics: add explicit deregistration on disconnect Martin Belanger
2022-01-27  9:31   ` Hannes Reinecke
2022-01-27 13:14     ` Sagi Grimberg

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.