All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH] nvme: add support for TP4084 - Time-to-Ready Enhancements
@ 2022-05-18  6:40 Christoph Hellwig
  2022-05-18  9:26 ` Hannes Reinecke
  2022-05-18 14:36 ` Keith Busch
  0 siblings, 2 replies; 10+ messages in thread
From: Christoph Hellwig @ 2022-05-18  6:40 UTC (permalink / raw)
  To: sagi, kbusch; +Cc: linux-nvme

Add support for using longer timeouts during controller initialization
and letting the controller come up with namespaces that are not ready
for I/O yet.  We skip these not ready namespaces during scanning and
only bring them online once anoter scan is kicked off by the AEN that
is set when the NRDY bit gets set in the  I/O Command Set Independent
Identify Namespace Data Structure.   This asynchronous probing avoids
blocking the kernel boot when controllers take a very long time to
recover after unclean shutdowns (up to minutes).

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/nvme/host/constants.c |  1 +
 drivers/nvme/host/core.c      | 73 ++++++++++++++++++++++++++++++++---
 drivers/nvme/host/nvme.h      |  1 +
 include/linux/nvme.h          | 31 +++++++++++++++
 4 files changed, 101 insertions(+), 5 deletions(-)

diff --git a/drivers/nvme/host/constants.c b/drivers/nvme/host/constants.c
index 1dd1d78de2956..4910543f00ff9 100644
--- a/drivers/nvme/host/constants.c
+++ b/drivers/nvme/host/constants.c
@@ -91,6 +91,7 @@ static const char * const nvme_statuses[] = {
 	[NVME_SC_NS_WRITE_PROTECTED] = "Namespace is Write Protected",
 	[NVME_SC_CMD_INTERRUPTED] = "Command Interrupted",
 	[NVME_SC_TRANSIENT_TR_ERR] = "Transient Transport Error",
+	[NVME_SC_ADMIN_COMMAND_MEDIA_NOT_READY] = "Admin Command Media Not Ready",
 	[NVME_SC_INVALID_IO_CMD_SET] = "Invalid IO Command Set",
 	[NVME_SC_LBA_RANGE] = "LBA Out of Range",
 	[NVME_SC_CAP_EXCEEDED] = "Capacity Exceeded",
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 42f9772abc4d0..c822083c46a8b 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -1427,6 +1427,32 @@ static int nvme_identify_ns(struct nvme_ctrl *ctrl, unsigned nsid,
 	return error;
 }
 
+static int nvme_identify_ns_cs_indep(struct nvme_ctrl *ctrl, unsigned nsid,
+			struct nvme_id_ns_cs_indep **id)
+{
+	struct nvme_command c = {
+		.identify.opcode	= nvme_admin_identify,
+		.identify.nsid		= cpu_to_le32(nsid),
+		.identify.cns		= NVME_ID_CNS_NS_CS_INDEP,
+	};
+	int error;
+
+	*id = kmalloc(sizeof(**id), GFP_KERNEL);
+	if (!*id)
+		return -ENOMEM;
+
+	error = nvme_submit_sync_cmd(ctrl->admin_q, &c, *id, sizeof(**id));
+	if (error) {
+		dev_warn(ctrl->device,
+			 "Identify namespace (CS independent) failed (%d)\n",
+			 error);
+		kfree(*id);
+		return error;
+	}
+
+	return 0;
+}
+
 static int nvme_features(struct nvme_ctrl *dev, u8 op, unsigned int fid,
 		unsigned int dword11, void *buffer, size_t buflen, u32 *result)
 {
@@ -2103,10 +2129,9 @@ static const struct block_device_operations nvme_bdev_ops = {
 	.pr_ops		= &nvme_pr_ops,
 };
 
-static int nvme_wait_ready(struct nvme_ctrl *ctrl, u64 cap, bool enabled)
+static int nvme_wait_ready(struct nvme_ctrl *ctrl, bool enabled)
 {
-	unsigned long timeout =
-		((NVME_CAP_TIMEOUT(cap) + 1) * HZ / 2) + jiffies;
+	unsigned long timeout = ((ctrl->enable_timeout + 1) * HZ / 2) + jiffies;
 	u32 csts, bit = enabled ? NVME_CSTS_RDY : 0;
 	int ret;
 
@@ -2150,7 +2175,7 @@ int nvme_disable_ctrl(struct nvme_ctrl *ctrl)
 	if (ctrl->quirks & NVME_QUIRK_DELAY_BEFORE_CHK_RDY)
 		msleep(NVME_QUIRK_DELAY_AMOUNT);
 
-	return nvme_wait_ready(ctrl, ctrl->cap, false);
+	return nvme_wait_ready(ctrl, false);
 }
 EXPORT_SYMBOL_GPL(nvme_disable_ctrl);
 
@@ -2177,6 +2202,27 @@ int nvme_enable_ctrl(struct nvme_ctrl *ctrl)
 		ctrl->ctrl_config = NVME_CC_CSS_CSI;
 	else
 		ctrl->ctrl_config = NVME_CC_CSS_NVM;
+
+	if (ctrl->cap & NVME_CAP_CRMS_CRWMS) {
+		u32 crto;
+
+		ret = ctrl->ops->reg_read32(ctrl, NVME_REG_CRTO, &crto);
+		if (ret) {
+			dev_err(ctrl->device, "Reading CRTO failed (%d)\n",
+				ret);
+			return ret;
+		}
+
+		if (ctrl->cap & NVME_CAP_CRMS_CRIMS) {
+			ctrl->ctrl_config |= NVME_CC_CRIME;
+			ctrl->enable_timeout = NVME_CRTO_CRIMT(crto);
+		} else {
+			ctrl->enable_timeout = NVME_CRTO_CRWMT(crto);
+		}
+	} else {
+		ctrl->enable_timeout = NVME_CAP_TIMEOUT(ctrl->cap);
+	}
+
 	ctrl->ctrl_config |= (NVME_CTRL_PAGE_SHIFT - 12) << NVME_CC_MPS_SHIFT;
 	ctrl->ctrl_config |= NVME_CC_AMS_RR | NVME_CC_SHN_NONE;
 	ctrl->ctrl_config |= NVME_CC_IOSQES | NVME_CC_IOCQES;
@@ -2185,7 +2231,7 @@ int nvme_enable_ctrl(struct nvme_ctrl *ctrl)
 	ret = ctrl->ops->reg_write32(ctrl, NVME_REG_CC, ctrl->ctrl_config);
 	if (ret)
 		return ret;
-	return nvme_wait_ready(ctrl, ctrl->cap, true);
+	return nvme_wait_ready(ctrl, true);
 }
 EXPORT_SYMBOL_GPL(nvme_enable_ctrl);
 
@@ -4092,11 +4138,26 @@ static void nvme_validate_ns(struct nvme_ns *ns, struct nvme_ns_ids *ids)
 static void nvme_validate_or_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)
 {
 	struct nvme_ns_ids ids = { };
+	struct nvme_id_ns_cs_indep *id;
 	struct nvme_ns *ns;
+	bool ready = true;
 
 	if (nvme_identify_ns_descs(ctrl, nsid, &ids))
 		return;
 
+	/*
+	 * Check if the namespace is ready.  If not ignore it, we will get an
+	 * AEN once it becomes ready and restart the scan.
+	 */
+	if ((ctrl->cap & NVME_CAP_CRMS_CRIMS) &&
+	    !nvme_identify_ns_cs_indep(ctrl, nsid, &id)) {
+		ready = id->nstat & NVME_NSTAT_NRDY;
+		kfree(id);
+	}
+
+	if (!ready)
+		return;
+
 	ns = nvme_find_get_ns(ctrl, nsid);
 	if (ns) {
 		nvme_validate_ns(ns, &ids);
@@ -4841,6 +4902,8 @@ static inline void _nvme_check_size(void)
 	BUILD_BUG_ON(sizeof(struct nvme_command) != 64);
 	BUILD_BUG_ON(sizeof(struct nvme_id_ctrl) != NVME_IDENTIFY_DATA_SIZE);
 	BUILD_BUG_ON(sizeof(struct nvme_id_ns) != NVME_IDENTIFY_DATA_SIZE);
+	BUILD_BUG_ON(sizeof(struct nvme_id_ns_cs_indep) !=
+			NVME_IDENTIFY_DATA_SIZE);
 	BUILD_BUG_ON(sizeof(struct nvme_id_ns_zns) != NVME_IDENTIFY_DATA_SIZE);
 	BUILD_BUG_ON(sizeof(struct nvme_id_ns_nvm) != NVME_IDENTIFY_DATA_SIZE);
 	BUILD_BUG_ON(sizeof(struct nvme_id_ctrl_zns) != NVME_IDENTIFY_DATA_SIZE);
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index 81c4f5379c0cf..6cdda2119e13b 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -273,6 +273,7 @@ struct nvme_ctrl {
 	u32 queue_count;
 
 	u64 cap;
+	u32 enable_timeout;
 	u32 max_hw_sectors;
 	u32 max_segments;
 	u32 max_integrity_segments;
diff --git a/include/linux/nvme.h b/include/linux/nvme.h
index 5f6d432fa06a6..29ec3e3481ff6 100644
--- a/include/linux/nvme.h
+++ b/include/linux/nvme.h
@@ -137,6 +137,7 @@ enum {
 	NVME_REG_CMBMSC = 0x0050,	/* Controller Memory Buffer Memory
 					 * Space Control
 					 */
+	NVME_REG_CRTO	= 0x0068,	/* Controller Ready Timeouts */
 	NVME_REG_PMRCAP	= 0x0e00,	/* Persistent Memory Capabilities */
 	NVME_REG_PMRCTL	= 0x0e04,	/* Persistent Memory Region Control */
 	NVME_REG_PMRSTS	= 0x0e08,	/* Persistent Memory Region Status */
@@ -161,6 +162,9 @@ enum {
 #define NVME_CMB_BIR(cmbloc)	((cmbloc) & 0x7)
 #define NVME_CMB_OFST(cmbloc)	(((cmbloc) >> 12) & 0xfffff)
 
+#define NVME_CRTO_CRIMT(crto)	((crto) >> 16)
+#define NVME_CRTO_CRWMT(crto)	((crto) & 0xffff)
+
 enum {
 	NVME_CMBSZ_SQS		= 1 << 0,
 	NVME_CMBSZ_CQS		= 1 << 1,
@@ -204,6 +208,7 @@ enum {
 	NVME_CC_SHN_MASK	= 3 << NVME_CC_SHN_SHIFT,
 	NVME_CC_IOSQES		= NVME_NVM_IOSQES << NVME_CC_IOSQES_SHIFT,
 	NVME_CC_IOCQES		= NVME_NVM_IOCQES << NVME_CC_IOCQES_SHIFT,
+	NVME_CC_CRIME		= 1 << 24,
 };
 
 enum {
@@ -227,6 +232,11 @@ enum {
 	NVME_CAP_CSS_CSI	= 1 << 6,
 };
 
+enum {
+	NVME_CAP_CRMS_CRIMS	= 1ULL << 59,
+	NVME_CAP_CRMS_CRWMS	= 1ULL << 60,
+};
+
 struct nvme_id_power_state {
 	__le16			max_power;	/* centiwatts */
 	__u8			rsvd2;
@@ -414,6 +424,21 @@ struct nvme_id_ns {
 	__u8			vs[3712];
 };
 
+/* I/O Command Set Independent Identify Namespace Data Structure */
+struct nvme_id_ns_cs_indep {
+	__u8			nsfeat;
+	__u8			nmic;
+	__u8			rescap;
+	__u8			fpi;
+	__le32			anagrpid;
+	__u8			nsattr;
+	__u8			rsvd9;
+	__le16			nvmsetid;
+	__le16			endgid;
+	__u8			nstat;
+	__u8			rsvd15[4081];
+};
+
 struct nvme_zns_lbafe {
 	__le64			zsze;
 	__u8			zdes;
@@ -478,6 +503,7 @@ enum {
 	NVME_ID_CNS_NS_DESC_LIST	= 0x03,
 	NVME_ID_CNS_CS_NS		= 0x05,
 	NVME_ID_CNS_CS_CTRL		= 0x06,
+	NVME_ID_CNS_NS_CS_INDEP		= 0x08,
 	NVME_ID_CNS_NS_PRESENT_LIST	= 0x10,
 	NVME_ID_CNS_NS_PRESENT		= 0x11,
 	NVME_ID_CNS_CTRL_NS_LIST	= 0x12,
@@ -531,6 +557,10 @@ enum {
 	NVME_NS_DPS_PI_TYPE3	= 3,
 };
 
+enum {
+	NVME_NSTAT_NRDY		= 1 << 0,
+};
+
 enum {
 	NVME_NVM_NS_16B_GUARD	= 0,
 	NVME_NVM_NS_32B_GUARD	= 1,
@@ -1592,6 +1622,7 @@ enum {
 	NVME_SC_NS_WRITE_PROTECTED	= 0x20,
 	NVME_SC_CMD_INTERRUPTED		= 0x21,
 	NVME_SC_TRANSIENT_TR_ERR	= 0x22,
+	NVME_SC_ADMIN_COMMAND_MEDIA_NOT_READY = 0x24,
 	NVME_SC_INVALID_IO_CMD_SET	= 0x2C,
 
 	NVME_SC_LBA_RANGE		= 0x80,
-- 
2.30.2



^ permalink raw reply related	[flat|nested] 10+ messages in thread

* Re: [PATCH] nvme: add support for TP4084 - Time-to-Ready Enhancements
  2022-05-18  6:40 [PATCH] nvme: add support for TP4084 - Time-to-Ready Enhancements Christoph Hellwig
@ 2022-05-18  9:26 ` Hannes Reinecke
  2022-05-18 14:36 ` Keith Busch
  1 sibling, 0 replies; 10+ messages in thread
From: Hannes Reinecke @ 2022-05-18  9:26 UTC (permalink / raw)
  To: Christoph Hellwig, sagi, kbusch; +Cc: linux-nvme

On 5/18/22 08:40, Christoph Hellwig wrote:
> Add support for using longer timeouts during controller initialization
> and letting the controller come up with namespaces that are not ready
> for I/O yet.  We skip these not ready namespaces during scanning and
> only bring them online once another scan is kicked off by the AEN that
> is set when the NRDY bit gets set in the  I/O Command Set Independent
> Identify Namespace Data Structure.   This asynchronous probing avoids
> blocking the kernel boot when controllers take a very long time to
> recover after unclean shutdowns (up to minutes).
> 
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> ---
>   drivers/nvme/host/constants.c |  1 +
>   drivers/nvme/host/core.c      | 73 ++++++++++++++++++++++++++++++++---
>   drivers/nvme/host/nvme.h      |  1 +
>   include/linux/nvme.h          | 31 +++++++++++++++
>   4 files changed, 101 insertions(+), 5 deletions(-)
> 
Reviewed-by: Hannes Reinecke <hare@suse.de>

Cheers,

Hannes
-- 
Dr. Hannes Reinecke		           Kernel Storage Architect
hare@suse.de			                  +49 911 74053 688
SUSE Software Solutions Germany GmbH, Maxfeldstr. 5, 90409 Nürnberg
HRB 36809 (AG Nürnberg), GF: Felix Imendörffer


^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH] nvme: add support for TP4084 - Time-to-Ready Enhancements
  2022-05-18  6:40 [PATCH] nvme: add support for TP4084 - Time-to-Ready Enhancements Christoph Hellwig
  2022-05-18  9:26 ` Hannes Reinecke
@ 2022-05-18 14:36 ` Keith Busch
  2022-05-18 15:00   ` Christoph Hellwig
  1 sibling, 1 reply; 10+ messages in thread
From: Keith Busch @ 2022-05-18 14:36 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: sagi, linux-nvme

On Wed, May 18, 2022 at 08:40:40AM +0200, Christoph Hellwig wrote:
> +static int nvme_identify_ns_cs_indep(struct nvme_ctrl *ctrl, unsigned nsid,
> +			struct nvme_id_ns_cs_indep **id)
> +{
> +	struct nvme_command c = {
> +		.identify.opcode	= nvme_admin_identify,
> +		.identify.nsid		= cpu_to_le32(nsid),
> +		.identify.cns		= NVME_ID_CNS_NS_CS_INDEP,
> +	};
> +	int error;

Every other place in this driver calls this value 'ret'.

> -static int nvme_wait_ready(struct nvme_ctrl *ctrl, u64 cap, bool enabled)
> +static int nvme_wait_ready(struct nvme_ctrl *ctrl, bool enabled)
>  {
> -	unsigned long timeout =
> -		((NVME_CAP_TIMEOUT(cap) + 1) * HZ / 2) + jiffies;
> +	unsigned long timeout = ((ctrl->enable_timeout + 1) * HZ / 2) + jiffies;

I think we need to continue using the NVME_CAP_TIMEOUT() value when 'enabled'
is false. The enable_timeout is only applicable for the 0->1 transition and may
be too short if CRIME is set or too long when it's not.


^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH] nvme: add support for TP4084 - Time-to-Ready Enhancements
  2022-05-18 14:36 ` Keith Busch
@ 2022-05-18 15:00   ` Christoph Hellwig
  2022-05-18 15:09     ` Keith Busch
                       ` (2 more replies)
  0 siblings, 3 replies; 10+ messages in thread
From: Christoph Hellwig @ 2022-05-18 15:00 UTC (permalink / raw)
  To: Keith Busch; +Cc: Christoph Hellwig, sagi, linux-nvme

On Wed, May 18, 2022 at 08:36:47AM -0600, Keith Busch wrote:
> On Wed, May 18, 2022 at 08:40:40AM +0200, Christoph Hellwig wrote:
> > +static int nvme_identify_ns_cs_indep(struct nvme_ctrl *ctrl, unsigned nsid,
> > +			struct nvme_id_ns_cs_indep **id)
> > +{
> > +	struct nvme_command c = {
> > +		.identify.opcode	= nvme_admin_identify,
> > +		.identify.nsid		= cpu_to_le32(nsid),
> > +		.identify.cns		= NVME_ID_CNS_NS_CS_INDEP,
> > +	};
> > +	int error;
> 
> Every other place in this driver calls this value 'ret'.

Except for nvme_identify_ns, where I copied it from :)

But I can change it to ret.

> 
> > -static int nvme_wait_ready(struct nvme_ctrl *ctrl, u64 cap, bool enabled)
> > +static int nvme_wait_ready(struct nvme_ctrl *ctrl, bool enabled)
> >  {
> > -	unsigned long timeout =
> > -		((NVME_CAP_TIMEOUT(cap) + 1) * HZ / 2) + jiffies;
> > +	unsigned long timeout = ((ctrl->enable_timeout + 1) * HZ / 2) + jiffies;
> 
> I think we need to continue using the NVME_CAP_TIMEOUT() value when 'enabled'
> is false. The enable_timeout is only applicable for the 0->1 transition and may
> be too short if CRIME is set or too long when it's not.

Yeah.  So we could do something like this instead, which even avoids
the new member in struct nvme_ctrl:

---
From 032b8a803bf5bec4c4f206d9cc5a4b1160e0dc69 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 16 May 2022 15:09:21 +0200
Subject: nvme: add support for TP4084 - Time-to-Ready Enhancements

Add support for using longer timeouts during controller initialization
and letting the controller come up with namespaces that are not ready
for I/O yet.  We skip these not ready namespaces during scanning and
only bring them online once anoter scan is kicked off by the AEN that
is set when the NRDY bit gets set in the  I/O Command Set Independent
Identify Namespace Data Structure.   This asynchronous probing avoids
blocking the kernel boot when controllers take a very long time to
recover after unclean shutdowns (up to minutes).

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/nvme/host/constants.c |  1 +
 drivers/nvme/host/core.c      | 76 ++++++++++++++++++++++++++++++++---
 include/linux/nvme.h          | 31 ++++++++++++++
 3 files changed, 102 insertions(+), 6 deletions(-)

diff --git a/drivers/nvme/host/constants.c b/drivers/nvme/host/constants.c
index 1dd1d78de2956..4910543f00ff9 100644
--- a/drivers/nvme/host/constants.c
+++ b/drivers/nvme/host/constants.c
@@ -91,6 +91,7 @@ static const char * const nvme_statuses[] = {
 	[NVME_SC_NS_WRITE_PROTECTED] = "Namespace is Write Protected",
 	[NVME_SC_CMD_INTERRUPTED] = "Command Interrupted",
 	[NVME_SC_TRANSIENT_TR_ERR] = "Transient Transport Error",
+	[NVME_SC_ADMIN_COMMAND_MEDIA_NOT_READY] = "Admin Command Media Not Ready",
 	[NVME_SC_INVALID_IO_CMD_SET] = "Invalid IO Command Set",
 	[NVME_SC_LBA_RANGE] = "LBA Out of Range",
 	[NVME_SC_CAP_EXCEEDED] = "Capacity Exceeded",
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 42f9772abc4d0..faeb032719134 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -1427,6 +1427,32 @@ static int nvme_identify_ns(struct nvme_ctrl *ctrl, unsigned nsid,
 	return error;
 }
 
+static int nvme_identify_ns_cs_indep(struct nvme_ctrl *ctrl, unsigned nsid,
+			struct nvme_id_ns_cs_indep **id)
+{
+	struct nvme_command c = {
+		.identify.opcode	= nvme_admin_identify,
+		.identify.nsid		= cpu_to_le32(nsid),
+		.identify.cns		= NVME_ID_CNS_NS_CS_INDEP,
+	};
+	int ret;
+
+	*id = kmalloc(sizeof(**id), GFP_KERNEL);
+	if (!*id)
+		return -ENOMEM;
+
+	ret = nvme_submit_sync_cmd(ctrl->admin_q, &c, *id, sizeof(**id));
+	if (ret) {
+		dev_warn(ctrl->device,
+			 "Identify namespace (CS independent) failed (%d)\n",
+			 ret);
+		kfree(*id);
+		return ret;
+	}
+
+	return 0;
+}
+
 static int nvme_features(struct nvme_ctrl *dev, u8 op, unsigned int fid,
 		unsigned int dword11, void *buffer, size_t buflen, u32 *result)
 {
@@ -2103,10 +2129,9 @@ static const struct block_device_operations nvme_bdev_ops = {
 	.pr_ops		= &nvme_pr_ops,
 };
 
-static int nvme_wait_ready(struct nvme_ctrl *ctrl, u64 cap, bool enabled)
+static int nvme_wait_ready(struct nvme_ctrl *ctrl, u32 timeout, bool enabled)
 {
-	unsigned long timeout =
-		((NVME_CAP_TIMEOUT(cap) + 1) * HZ / 2) + jiffies;
+	unsigned long timeout_jiffies = ((timeout + 1) * HZ / 2) + jiffies;
 	u32 csts, bit = enabled ? NVME_CSTS_RDY : 0;
 	int ret;
 
@@ -2119,7 +2144,7 @@ static int nvme_wait_ready(struct nvme_ctrl *ctrl, u64 cap, bool enabled)
 		usleep_range(1000, 2000);
 		if (fatal_signal_pending(current))
 			return -EINTR;
-		if (time_after(jiffies, timeout)) {
+		if (time_after(jiffies, timeout_jiffies)) {
 			dev_err(ctrl->device,
 				"Device not ready; aborting %s, CSTS=0x%x\n",
 				enabled ? "initialisation" : "reset", csts);
@@ -2150,13 +2175,14 @@ int nvme_disable_ctrl(struct nvme_ctrl *ctrl)
 	if (ctrl->quirks & NVME_QUIRK_DELAY_BEFORE_CHK_RDY)
 		msleep(NVME_QUIRK_DELAY_AMOUNT);
 
-	return nvme_wait_ready(ctrl, ctrl->cap, false);
+	return nvme_wait_ready(ctrl, NVME_CAP_TIMEOUT(ctrl->cap), false);
 }
 EXPORT_SYMBOL_GPL(nvme_disable_ctrl);
 
 int nvme_enable_ctrl(struct nvme_ctrl *ctrl)
 {
 	unsigned dev_page_min;
+	u32 timeout;
 	int ret;
 
 	ret = ctrl->ops->reg_read64(ctrl, NVME_REG_CAP, &ctrl->cap);
@@ -2177,6 +2203,27 @@ int nvme_enable_ctrl(struct nvme_ctrl *ctrl)
 		ctrl->ctrl_config = NVME_CC_CSS_CSI;
 	else
 		ctrl->ctrl_config = NVME_CC_CSS_NVM;
+
+	if (ctrl->cap & NVME_CAP_CRMS_CRWMS) {
+		u32 crto;
+
+		ret = ctrl->ops->reg_read32(ctrl, NVME_REG_CRTO, &crto);
+		if (ret) {
+			dev_err(ctrl->device, "Reading CRTO failed (%d)\n",
+				ret);
+			return ret;
+		}
+
+		if (ctrl->cap & NVME_CAP_CRMS_CRIMS) {
+			ctrl->ctrl_config |= NVME_CC_CRIME;
+			timeout = NVME_CRTO_CRIMT(crto);
+		} else {
+			timeout = NVME_CRTO_CRWMT(crto);
+		}
+	} else {
+		timeout = NVME_CAP_TIMEOUT(ctrl->cap);
+	}
+
 	ctrl->ctrl_config |= (NVME_CTRL_PAGE_SHIFT - 12) << NVME_CC_MPS_SHIFT;
 	ctrl->ctrl_config |= NVME_CC_AMS_RR | NVME_CC_SHN_NONE;
 	ctrl->ctrl_config |= NVME_CC_IOSQES | NVME_CC_IOCQES;
@@ -2185,7 +2232,7 @@ int nvme_enable_ctrl(struct nvme_ctrl *ctrl)
 	ret = ctrl->ops->reg_write32(ctrl, NVME_REG_CC, ctrl->ctrl_config);
 	if (ret)
 		return ret;
-	return nvme_wait_ready(ctrl, ctrl->cap, true);
+	return nvme_wait_ready(ctrl, timeout, true);
 }
 EXPORT_SYMBOL_GPL(nvme_enable_ctrl);
 
@@ -4092,11 +4139,26 @@ static void nvme_validate_ns(struct nvme_ns *ns, struct nvme_ns_ids *ids)
 static void nvme_validate_or_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)
 {
 	struct nvme_ns_ids ids = { };
+	struct nvme_id_ns_cs_indep *id;
 	struct nvme_ns *ns;
+	bool ready = true;
 
 	if (nvme_identify_ns_descs(ctrl, nsid, &ids))
 		return;
 
+	/*
+	 * Check if the namespace is ready.  If not ignore it, we will get an
+	 * AEN once it becomes ready and restart the scan.
+	 */
+	if ((ctrl->cap & NVME_CAP_CRMS_CRIMS) &&
+	    !nvme_identify_ns_cs_indep(ctrl, nsid, &id)) {
+		ready = id->nstat & NVME_NSTAT_NRDY;
+		kfree(id);
+	}
+
+	if (!ready)
+		return;
+
 	ns = nvme_find_get_ns(ctrl, nsid);
 	if (ns) {
 		nvme_validate_ns(ns, &ids);
@@ -4841,6 +4903,8 @@ static inline void _nvme_check_size(void)
 	BUILD_BUG_ON(sizeof(struct nvme_command) != 64);
 	BUILD_BUG_ON(sizeof(struct nvme_id_ctrl) != NVME_IDENTIFY_DATA_SIZE);
 	BUILD_BUG_ON(sizeof(struct nvme_id_ns) != NVME_IDENTIFY_DATA_SIZE);
+	BUILD_BUG_ON(sizeof(struct nvme_id_ns_cs_indep) !=
+			NVME_IDENTIFY_DATA_SIZE);
 	BUILD_BUG_ON(sizeof(struct nvme_id_ns_zns) != NVME_IDENTIFY_DATA_SIZE);
 	BUILD_BUG_ON(sizeof(struct nvme_id_ns_nvm) != NVME_IDENTIFY_DATA_SIZE);
 	BUILD_BUG_ON(sizeof(struct nvme_id_ctrl_zns) != NVME_IDENTIFY_DATA_SIZE);
diff --git a/include/linux/nvme.h b/include/linux/nvme.h
index 5f6d432fa06a6..29ec3e3481ff6 100644
--- a/include/linux/nvme.h
+++ b/include/linux/nvme.h
@@ -137,6 +137,7 @@ enum {
 	NVME_REG_CMBMSC = 0x0050,	/* Controller Memory Buffer Memory
 					 * Space Control
 					 */
+	NVME_REG_CRTO	= 0x0068,	/* Controller Ready Timeouts */
 	NVME_REG_PMRCAP	= 0x0e00,	/* Persistent Memory Capabilities */
 	NVME_REG_PMRCTL	= 0x0e04,	/* Persistent Memory Region Control */
 	NVME_REG_PMRSTS	= 0x0e08,	/* Persistent Memory Region Status */
@@ -161,6 +162,9 @@ enum {
 #define NVME_CMB_BIR(cmbloc)	((cmbloc) & 0x7)
 #define NVME_CMB_OFST(cmbloc)	(((cmbloc) >> 12) & 0xfffff)
 
+#define NVME_CRTO_CRIMT(crto)	((crto) >> 16)
+#define NVME_CRTO_CRWMT(crto)	((crto) & 0xffff)
+
 enum {
 	NVME_CMBSZ_SQS		= 1 << 0,
 	NVME_CMBSZ_CQS		= 1 << 1,
@@ -204,6 +208,7 @@ enum {
 	NVME_CC_SHN_MASK	= 3 << NVME_CC_SHN_SHIFT,
 	NVME_CC_IOSQES		= NVME_NVM_IOSQES << NVME_CC_IOSQES_SHIFT,
 	NVME_CC_IOCQES		= NVME_NVM_IOCQES << NVME_CC_IOCQES_SHIFT,
+	NVME_CC_CRIME		= 1 << 24,
 };
 
 enum {
@@ -227,6 +232,11 @@ enum {
 	NVME_CAP_CSS_CSI	= 1 << 6,
 };
 
+enum {
+	NVME_CAP_CRMS_CRIMS	= 1ULL << 59,
+	NVME_CAP_CRMS_CRWMS	= 1ULL << 60,
+};
+
 struct nvme_id_power_state {
 	__le16			max_power;	/* centiwatts */
 	__u8			rsvd2;
@@ -414,6 +424,21 @@ struct nvme_id_ns {
 	__u8			vs[3712];
 };
 
+/* I/O Command Set Independent Identify Namespace Data Structure */
+struct nvme_id_ns_cs_indep {
+	__u8			nsfeat;
+	__u8			nmic;
+	__u8			rescap;
+	__u8			fpi;
+	__le32			anagrpid;
+	__u8			nsattr;
+	__u8			rsvd9;
+	__le16			nvmsetid;
+	__le16			endgid;
+	__u8			nstat;
+	__u8			rsvd15[4081];
+};
+
 struct nvme_zns_lbafe {
 	__le64			zsze;
 	__u8			zdes;
@@ -478,6 +503,7 @@ enum {
 	NVME_ID_CNS_NS_DESC_LIST	= 0x03,
 	NVME_ID_CNS_CS_NS		= 0x05,
 	NVME_ID_CNS_CS_CTRL		= 0x06,
+	NVME_ID_CNS_NS_CS_INDEP		= 0x08,
 	NVME_ID_CNS_NS_PRESENT_LIST	= 0x10,
 	NVME_ID_CNS_NS_PRESENT		= 0x11,
 	NVME_ID_CNS_CTRL_NS_LIST	= 0x12,
@@ -531,6 +557,10 @@ enum {
 	NVME_NS_DPS_PI_TYPE3	= 3,
 };
 
+enum {
+	NVME_NSTAT_NRDY		= 1 << 0,
+};
+
 enum {
 	NVME_NVM_NS_16B_GUARD	= 0,
 	NVME_NVM_NS_32B_GUARD	= 1,
@@ -1592,6 +1622,7 @@ enum {
 	NVME_SC_NS_WRITE_PROTECTED	= 0x20,
 	NVME_SC_CMD_INTERRUPTED		= 0x21,
 	NVME_SC_TRANSIENT_TR_ERR	= 0x22,
+	NVME_SC_ADMIN_COMMAND_MEDIA_NOT_READY = 0x24,
 	NVME_SC_INVALID_IO_CMD_SET	= 0x2C,
 
 	NVME_SC_LBA_RANGE		= 0x80,
-- 
2.30.2



^ permalink raw reply related	[flat|nested] 10+ messages in thread

* Re: [PATCH] nvme: add support for TP4084 - Time-to-Ready Enhancements
  2022-05-18 15:00   ` Christoph Hellwig
@ 2022-05-18 15:09     ` Keith Busch
  2022-05-18 16:14     ` Chaitanya Kulkarni
  2022-05-20 11:51     ` Niklas Cassel
  2 siblings, 0 replies; 10+ messages in thread
From: Keith Busch @ 2022-05-18 15:09 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: sagi, linux-nvme

On Wed, May 18, 2022 at 05:00:44PM +0200, Christoph Hellwig wrote:
> Yeah.  So we could do something like this instead, which even avoids
> the new member in struct nvme_ctrl:

Nice, the below patch looks good.

Reviewed-by: Keith Busch <kbusch@kernel.org>
 
> ---
> From 032b8a803bf5bec4c4f206d9cc5a4b1160e0dc69 Mon Sep 17 00:00:00 2001
> From: Christoph Hellwig <hch@lst.de>
> Date: Mon, 16 May 2022 15:09:21 +0200
> Subject: nvme: add support for TP4084 - Time-to-Ready Enhancements
> 
> Add support for using longer timeouts during controller initialization
> and letting the controller come up with namespaces that are not ready
> for I/O yet.  We skip these not ready namespaces during scanning and
> only bring them online once anoter scan is kicked off by the AEN that
> is set when the NRDY bit gets set in the  I/O Command Set Independent
> Identify Namespace Data Structure.   This asynchronous probing avoids
> blocking the kernel boot when controllers take a very long time to
> recover after unclean shutdowns (up to minutes).
> 
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> ---
>  drivers/nvme/host/constants.c |  1 +
>  drivers/nvme/host/core.c      | 76 ++++++++++++++++++++++++++++++++---
>  include/linux/nvme.h          | 31 ++++++++++++++
>  3 files changed, 102 insertions(+), 6 deletions(-)
> 
> diff --git a/drivers/nvme/host/constants.c b/drivers/nvme/host/constants.c
> index 1dd1d78de2956..4910543f00ff9 100644
> --- a/drivers/nvme/host/constants.c
> +++ b/drivers/nvme/host/constants.c
> @@ -91,6 +91,7 @@ static const char * const nvme_statuses[] = {
>  	[NVME_SC_NS_WRITE_PROTECTED] = "Namespace is Write Protected",
>  	[NVME_SC_CMD_INTERRUPTED] = "Command Interrupted",
>  	[NVME_SC_TRANSIENT_TR_ERR] = "Transient Transport Error",
> +	[NVME_SC_ADMIN_COMMAND_MEDIA_NOT_READY] = "Admin Command Media Not Ready",
>  	[NVME_SC_INVALID_IO_CMD_SET] = "Invalid IO Command Set",
>  	[NVME_SC_LBA_RANGE] = "LBA Out of Range",
>  	[NVME_SC_CAP_EXCEEDED] = "Capacity Exceeded",
> diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
> index 42f9772abc4d0..faeb032719134 100644
> --- a/drivers/nvme/host/core.c
> +++ b/drivers/nvme/host/core.c
> @@ -1427,6 +1427,32 @@ static int nvme_identify_ns(struct nvme_ctrl *ctrl, unsigned nsid,
>  	return error;
>  }
>  
> +static int nvme_identify_ns_cs_indep(struct nvme_ctrl *ctrl, unsigned nsid,
> +			struct nvme_id_ns_cs_indep **id)
> +{
> +	struct nvme_command c = {
> +		.identify.opcode	= nvme_admin_identify,
> +		.identify.nsid		= cpu_to_le32(nsid),
> +		.identify.cns		= NVME_ID_CNS_NS_CS_INDEP,
> +	};
> +	int ret;
> +
> +	*id = kmalloc(sizeof(**id), GFP_KERNEL);
> +	if (!*id)
> +		return -ENOMEM;
> +
> +	ret = nvme_submit_sync_cmd(ctrl->admin_q, &c, *id, sizeof(**id));
> +	if (ret) {
> +		dev_warn(ctrl->device,
> +			 "Identify namespace (CS independent) failed (%d)\n",
> +			 ret);
> +		kfree(*id);
> +		return ret;
> +	}
> +
> +	return 0;
> +}
> +
>  static int nvme_features(struct nvme_ctrl *dev, u8 op, unsigned int fid,
>  		unsigned int dword11, void *buffer, size_t buflen, u32 *result)
>  {
> @@ -2103,10 +2129,9 @@ static const struct block_device_operations nvme_bdev_ops = {
>  	.pr_ops		= &nvme_pr_ops,
>  };
>  
> -static int nvme_wait_ready(struct nvme_ctrl *ctrl, u64 cap, bool enabled)
> +static int nvme_wait_ready(struct nvme_ctrl *ctrl, u32 timeout, bool enabled)
>  {
> -	unsigned long timeout =
> -		((NVME_CAP_TIMEOUT(cap) + 1) * HZ / 2) + jiffies;
> +	unsigned long timeout_jiffies = ((timeout + 1) * HZ / 2) + jiffies;
>  	u32 csts, bit = enabled ? NVME_CSTS_RDY : 0;
>  	int ret;
>  
> @@ -2119,7 +2144,7 @@ static int nvme_wait_ready(struct nvme_ctrl *ctrl, u64 cap, bool enabled)
>  		usleep_range(1000, 2000);
>  		if (fatal_signal_pending(current))
>  			return -EINTR;
> -		if (time_after(jiffies, timeout)) {
> +		if (time_after(jiffies, timeout_jiffies)) {
>  			dev_err(ctrl->device,
>  				"Device not ready; aborting %s, CSTS=0x%x\n",
>  				enabled ? "initialisation" : "reset", csts);
> @@ -2150,13 +2175,14 @@ int nvme_disable_ctrl(struct nvme_ctrl *ctrl)
>  	if (ctrl->quirks & NVME_QUIRK_DELAY_BEFORE_CHK_RDY)
>  		msleep(NVME_QUIRK_DELAY_AMOUNT);
>  
> -	return nvme_wait_ready(ctrl, ctrl->cap, false);
> +	return nvme_wait_ready(ctrl, NVME_CAP_TIMEOUT(ctrl->cap), false);
>  }
>  EXPORT_SYMBOL_GPL(nvme_disable_ctrl);
>  
>  int nvme_enable_ctrl(struct nvme_ctrl *ctrl)
>  {
>  	unsigned dev_page_min;
> +	u32 timeout;
>  	int ret;
>  
>  	ret = ctrl->ops->reg_read64(ctrl, NVME_REG_CAP, &ctrl->cap);
> @@ -2177,6 +2203,27 @@ int nvme_enable_ctrl(struct nvme_ctrl *ctrl)
>  		ctrl->ctrl_config = NVME_CC_CSS_CSI;
>  	else
>  		ctrl->ctrl_config = NVME_CC_CSS_NVM;
> +
> +	if (ctrl->cap & NVME_CAP_CRMS_CRWMS) {
> +		u32 crto;
> +
> +		ret = ctrl->ops->reg_read32(ctrl, NVME_REG_CRTO, &crto);
> +		if (ret) {
> +			dev_err(ctrl->device, "Reading CRTO failed (%d)\n",
> +				ret);
> +			return ret;
> +		}
> +
> +		if (ctrl->cap & NVME_CAP_CRMS_CRIMS) {
> +			ctrl->ctrl_config |= NVME_CC_CRIME;
> +			timeout = NVME_CRTO_CRIMT(crto);
> +		} else {
> +			timeout = NVME_CRTO_CRWMT(crto);
> +		}
> +	} else {
> +		timeout = NVME_CAP_TIMEOUT(ctrl->cap);
> +	}
> +
>  	ctrl->ctrl_config |= (NVME_CTRL_PAGE_SHIFT - 12) << NVME_CC_MPS_SHIFT;
>  	ctrl->ctrl_config |= NVME_CC_AMS_RR | NVME_CC_SHN_NONE;
>  	ctrl->ctrl_config |= NVME_CC_IOSQES | NVME_CC_IOCQES;
> @@ -2185,7 +2232,7 @@ int nvme_enable_ctrl(struct nvme_ctrl *ctrl)
>  	ret = ctrl->ops->reg_write32(ctrl, NVME_REG_CC, ctrl->ctrl_config);
>  	if (ret)
>  		return ret;
> -	return nvme_wait_ready(ctrl, ctrl->cap, true);
> +	return nvme_wait_ready(ctrl, timeout, true);
>  }
>  EXPORT_SYMBOL_GPL(nvme_enable_ctrl);
>  
> @@ -4092,11 +4139,26 @@ static void nvme_validate_ns(struct nvme_ns *ns, struct nvme_ns_ids *ids)
>  static void nvme_validate_or_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)
>  {
>  	struct nvme_ns_ids ids = { };
> +	struct nvme_id_ns_cs_indep *id;
>  	struct nvme_ns *ns;
> +	bool ready = true;
>  
>  	if (nvme_identify_ns_descs(ctrl, nsid, &ids))
>  		return;
>  
> +	/*
> +	 * Check if the namespace is ready.  If not ignore it, we will get an
> +	 * AEN once it becomes ready and restart the scan.
> +	 */
> +	if ((ctrl->cap & NVME_CAP_CRMS_CRIMS) &&
> +	    !nvme_identify_ns_cs_indep(ctrl, nsid, &id)) {
> +		ready = id->nstat & NVME_NSTAT_NRDY;
> +		kfree(id);
> +	}
> +
> +	if (!ready)
> +		return;
> +
>  	ns = nvme_find_get_ns(ctrl, nsid);
>  	if (ns) {
>  		nvme_validate_ns(ns, &ids);
> @@ -4841,6 +4903,8 @@ static inline void _nvme_check_size(void)
>  	BUILD_BUG_ON(sizeof(struct nvme_command) != 64);
>  	BUILD_BUG_ON(sizeof(struct nvme_id_ctrl) != NVME_IDENTIFY_DATA_SIZE);
>  	BUILD_BUG_ON(sizeof(struct nvme_id_ns) != NVME_IDENTIFY_DATA_SIZE);
> +	BUILD_BUG_ON(sizeof(struct nvme_id_ns_cs_indep) !=
> +			NVME_IDENTIFY_DATA_SIZE);
>  	BUILD_BUG_ON(sizeof(struct nvme_id_ns_zns) != NVME_IDENTIFY_DATA_SIZE);
>  	BUILD_BUG_ON(sizeof(struct nvme_id_ns_nvm) != NVME_IDENTIFY_DATA_SIZE);
>  	BUILD_BUG_ON(sizeof(struct nvme_id_ctrl_zns) != NVME_IDENTIFY_DATA_SIZE);
> diff --git a/include/linux/nvme.h b/include/linux/nvme.h
> index 5f6d432fa06a6..29ec3e3481ff6 100644
> --- a/include/linux/nvme.h
> +++ b/include/linux/nvme.h
> @@ -137,6 +137,7 @@ enum {
>  	NVME_REG_CMBMSC = 0x0050,	/* Controller Memory Buffer Memory
>  					 * Space Control
>  					 */
> +	NVME_REG_CRTO	= 0x0068,	/* Controller Ready Timeouts */
>  	NVME_REG_PMRCAP	= 0x0e00,	/* Persistent Memory Capabilities */
>  	NVME_REG_PMRCTL	= 0x0e04,	/* Persistent Memory Region Control */
>  	NVME_REG_PMRSTS	= 0x0e08,	/* Persistent Memory Region Status */
> @@ -161,6 +162,9 @@ enum {
>  #define NVME_CMB_BIR(cmbloc)	((cmbloc) & 0x7)
>  #define NVME_CMB_OFST(cmbloc)	(((cmbloc) >> 12) & 0xfffff)
>  
> +#define NVME_CRTO_CRIMT(crto)	((crto) >> 16)
> +#define NVME_CRTO_CRWMT(crto)	((crto) & 0xffff)
> +
>  enum {
>  	NVME_CMBSZ_SQS		= 1 << 0,
>  	NVME_CMBSZ_CQS		= 1 << 1,
> @@ -204,6 +208,7 @@ enum {
>  	NVME_CC_SHN_MASK	= 3 << NVME_CC_SHN_SHIFT,
>  	NVME_CC_IOSQES		= NVME_NVM_IOSQES << NVME_CC_IOSQES_SHIFT,
>  	NVME_CC_IOCQES		= NVME_NVM_IOCQES << NVME_CC_IOCQES_SHIFT,
> +	NVME_CC_CRIME		= 1 << 24,
>  };
>  
>  enum {
> @@ -227,6 +232,11 @@ enum {
>  	NVME_CAP_CSS_CSI	= 1 << 6,
>  };
>  
> +enum {
> +	NVME_CAP_CRMS_CRIMS	= 1ULL << 59,
> +	NVME_CAP_CRMS_CRWMS	= 1ULL << 60,
> +};
> +
>  struct nvme_id_power_state {
>  	__le16			max_power;	/* centiwatts */
>  	__u8			rsvd2;
> @@ -414,6 +424,21 @@ struct nvme_id_ns {
>  	__u8			vs[3712];
>  };
>  
> +/* I/O Command Set Independent Identify Namespace Data Structure */
> +struct nvme_id_ns_cs_indep {
> +	__u8			nsfeat;
> +	__u8			nmic;
> +	__u8			rescap;
> +	__u8			fpi;
> +	__le32			anagrpid;
> +	__u8			nsattr;
> +	__u8			rsvd9;
> +	__le16			nvmsetid;
> +	__le16			endgid;
> +	__u8			nstat;
> +	__u8			rsvd15[4081];
> +};
> +
>  struct nvme_zns_lbafe {
>  	__le64			zsze;
>  	__u8			zdes;
> @@ -478,6 +503,7 @@ enum {
>  	NVME_ID_CNS_NS_DESC_LIST	= 0x03,
>  	NVME_ID_CNS_CS_NS		= 0x05,
>  	NVME_ID_CNS_CS_CTRL		= 0x06,
> +	NVME_ID_CNS_NS_CS_INDEP		= 0x08,
>  	NVME_ID_CNS_NS_PRESENT_LIST	= 0x10,
>  	NVME_ID_CNS_NS_PRESENT		= 0x11,
>  	NVME_ID_CNS_CTRL_NS_LIST	= 0x12,
> @@ -531,6 +557,10 @@ enum {
>  	NVME_NS_DPS_PI_TYPE3	= 3,
>  };
>  
> +enum {
> +	NVME_NSTAT_NRDY		= 1 << 0,
> +};
> +
>  enum {
>  	NVME_NVM_NS_16B_GUARD	= 0,
>  	NVME_NVM_NS_32B_GUARD	= 1,
> @@ -1592,6 +1622,7 @@ enum {
>  	NVME_SC_NS_WRITE_PROTECTED	= 0x20,
>  	NVME_SC_CMD_INTERRUPTED		= 0x21,
>  	NVME_SC_TRANSIENT_TR_ERR	= 0x22,
> +	NVME_SC_ADMIN_COMMAND_MEDIA_NOT_READY = 0x24,
>  	NVME_SC_INVALID_IO_CMD_SET	= 0x2C,
>  
>  	NVME_SC_LBA_RANGE		= 0x80,
> -- 
> 2.30.2
> 


^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH] nvme: add support for TP4084 - Time-to-Ready Enhancements
  2022-05-18 15:00   ` Christoph Hellwig
  2022-05-18 15:09     ` Keith Busch
@ 2022-05-18 16:14     ` Chaitanya Kulkarni
  2022-05-20 11:51     ` Niklas Cassel
  2 siblings, 0 replies; 10+ messages in thread
From: Chaitanya Kulkarni @ 2022-05-18 16:14 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: sagi, linux-nvme, Keith Busch

On 5/18/22 08:00, Christoph Hellwig wrote:
> On Wed, May 18, 2022 at 08:36:47AM -0600, Keith Busch wrote:
>> On Wed, May 18, 2022 at 08:40:40AM +0200, Christoph Hellwig wrote:
>>> +static int nvme_identify_ns_cs_indep(struct nvme_ctrl *ctrl, unsigned nsid,
>>> +			struct nvme_id_ns_cs_indep **id)
>>> +{
>>> +	struct nvme_command c = {
>>> +		.identify.opcode	= nvme_admin_identify,
>>> +		.identify.nsid		= cpu_to_le32(nsid),
>>> +		.identify.cns		= NVME_ID_CNS_NS_CS_INDEP,
>>> +	};
>>> +	int error;
>>
>> Every other place in this driver calls this value 'ret'.
> 
> Except for nvme_identify_ns, where I copied it from :)
> 
> But I can change it to ret.

that'd be great.

[...]

> 
> ---
>  From 032b8a803bf5bec4c4f206d9cc5a4b1160e0dc69 Mon Sep 17 00:00:00 2001
> From: Christoph Hellwig <hch@lst.de>
> Date: Mon, 16 May 2022 15:09:21 +0200
> Subject: nvme: add support for TP4084 - Time-to-Ready Enhancements
> 
> Add support for using longer timeouts during controller initialization
> and letting the controller come up with namespaces that are not ready
> for I/O yet.  We skip these not ready namespaces during scanning and
> only bring them online once anoter scan is kicked off by the AEN that
> is set when the NRDY bit gets set in the  I/O Command Set Independent
> Identify Namespace Data Structure.   This asynchronous probing avoids
> blocking the kernel boot when controllers take a very long time to
> recover after unclean shutdowns (up to minutes).
> 
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> ---


Thanks for not adding a new member to nvme_ctrl any new member.
Looks good.

Reviewed-by: Chaitanya Kulkarni <kch@nvidia.com>

-ck



^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH] nvme: add support for TP4084 - Time-to-Ready Enhancements
  2022-05-18 15:00   ` Christoph Hellwig
  2022-05-18 15:09     ` Keith Busch
  2022-05-18 16:14     ` Chaitanya Kulkarni
@ 2022-05-20 11:51     ` Niklas Cassel
  2022-05-20 14:13       ` Keith Busch
  2 siblings, 1 reply; 10+ messages in thread
From: Niklas Cassel @ 2022-05-20 11:51 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: Keith Busch, sagi, linux-nvme

On Wed, May 18, 2022 at 05:00:44PM +0200, Christoph Hellwig wrote:
> On Wed, May 18, 2022 at 08:36:47AM -0600, Keith Busch wrote:
> > On Wed, May 18, 2022 at 08:40:40AM +0200, Christoph Hellwig wrote:
> > > +static int nvme_identify_ns_cs_indep(struct nvme_ctrl *ctrl, unsigned nsid,
> > > +			struct nvme_id_ns_cs_indep **id)
> > > +{
> > > +	struct nvme_command c = {
> > > +		.identify.opcode	= nvme_admin_identify,
> > > +		.identify.nsid		= cpu_to_le32(nsid),
> > > +		.identify.cns		= NVME_ID_CNS_NS_CS_INDEP,
> > > +	};
> > > +	int error;
> > 
> > Every other place in this driver calls this value 'ret'.
> 
> Except for nvme_identify_ns, where I copied it from :)
> 
> But I can change it to ret.
> 
> > 
> > > -static int nvme_wait_ready(struct nvme_ctrl *ctrl, u64 cap, bool enabled)
> > > +static int nvme_wait_ready(struct nvme_ctrl *ctrl, bool enabled)
> > >  {
> > > -	unsigned long timeout =
> > > -		((NVME_CAP_TIMEOUT(cap) + 1) * HZ / 2) + jiffies;
> > > +	unsigned long timeout = ((ctrl->enable_timeout + 1) * HZ / 2) + jiffies;
> > 
> > I think we need to continue using the NVME_CAP_TIMEOUT() value when 'enabled'
> > is false. The enable_timeout is only applicable for the 0->1 transition and may
> > be too short if CRIME is set or too long when it's not.

From the CC.CRIME description:
Changing the value of this field may cause a change in the time reported in the
CAP.TO field. Refer to the definition of CAP.TO for more details.

The definition for CAP.TO:
If the Controller Ready Independent of Media Enable (CC.CRIME) bit is set to ‘1’
and the worst-case time for CSTS.RDY to change state is due to enabling the
controller after CC.EN transitions from ‘0’ to ‘1’, then this field shall be set
to:
a) the value in Controller Ready Independent of Media Timeout (CRTO.CRIMT); or
b) FFh if CRTO.CRIMT is greater than FFh.

This phrasing is quite confusing IMO.

I interpret this as:
If CC.CRIME is set, and CC.EN changes value from 0 to 1,
then the the controller shall write the value of CRTO.CRIMT to CAP.TO.
(If we want to make use of CAP.TO, then we would need to read the value
after setting CC.EN.)

When does the controller change CAP.TO to contain the "disable" timeout?
After CSTS.RDY has transitioned to 1...?

Since CAP.TO register contains either the enable or disable timeout,
it is quite important to know at what point in time it will contain
one or the other, so we can know if we should read CAP.TO before or
after setting a new CC.EN value.

In nvme_enable_ctrl(), we read CAP register first, then set the bits.
Since this patch never uses CAP.TO when CRIMS is supported, it should
not matter that we read the register before setting CC.EN.

nvme_disable_ctrl() doesn't re-read the register after setting CC.EN = 0.
It relies on the cached register value that was read by nvme_enable_ctrl().
Since nvme_enable_ctrl() reads the value before setting CC.CRIME
(and because the reset value of CC.CRIME is 0), the cached value should
always be the long timeout.

So nvme_enable_ctrl() and nvme_disable_ctrl() should both use the correct
timeout value with this patch, even though it seems a bit fragile.

If we ever read and update ctrl->caps, depending on the state of the
controller, NVME_CAP_TIMEOUT(ctrl->cap) can potentially contain the
CRTO.CRIMT value, and then using this value in nvme_disable_ctrl()
could potentially cause us to use the (short) enable timeout instead
of the (long) disable timeout.

It would have been way easier if it was just separate registers for
enable and disable timeouts...

If we want things to be more robust, I guess we could read the CAP.TO
register after a reset, and save that value in a ctrl->disable_timeout
which nvme_disable_ctrl() then reuses. Enable timeout can be as it is
in V2, it does not need to be saved.


Kind regards,
Niklas

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH] nvme: add support for TP4084 - Time-to-Ready Enhancements
  2022-05-20 11:51     ` Niklas Cassel
@ 2022-05-20 14:13       ` Keith Busch
  2022-05-25 16:19         ` Niklas Cassel
  0 siblings, 1 reply; 10+ messages in thread
From: Keith Busch @ 2022-05-20 14:13 UTC (permalink / raw)
  To: Niklas Cassel; +Cc: Christoph Hellwig, sagi, linux-nvme

On Fri, May 20, 2022 at 11:51:49AM +0000, Niklas Cassel wrote:
> From the CC.CRIME description:
> Changing the value of this field may cause a change in the time reported in the
> CAP.TO field. Refer to the definition of CAP.TO for more details.
> 
> The definition for CAP.TO:
> If the Controller Ready Independent of Media Enable (CC.CRIME) bit is set to ‘1’
> and the worst-case time for CSTS.RDY to change state is due to enabling the
> controller after CC.EN transitions from ‘0’ to ‘1’, then this field shall be set
> to:
> a) the value in Controller Ready Independent of Media Timeout (CRTO.CRIMT); or
> b) FFh if CRTO.CRIMT is greater than FFh.
> 
> This phrasing is quite confusing IMO.

Yes, that is rather confusing. I see you started a thread on the nvme workgroup
reflector. I hope they can clear this up.
 
> It would have been way easier if it was just separate registers for
> enable and disable timeouts...

That just makes too much sense. :) The two actions could reasonably have very
different expected times to complete, so why squeeze their timeouts into a
single value? It's not like we're running low on register space.


^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH] nvme: add support for TP4084 - Time-to-Ready Enhancements
  2022-05-20 14:13       ` Keith Busch
@ 2022-05-25 16:19         ` Niklas Cassel
  2022-05-25 16:23           ` Christoph Hellwig
  0 siblings, 1 reply; 10+ messages in thread
From: Niklas Cassel @ 2022-05-25 16:19 UTC (permalink / raw)
  To: Keith Busch; +Cc: Christoph Hellwig, sagi, linux-nvme

On Fri, May 20, 2022 at 08:13:04AM -0600, Keith Busch wrote:
> On Fri, May 20, 2022 at 11:51:49AM +0000, Niklas Cassel wrote:
> > From the CC.CRIME description:
> > Changing the value of this field may cause a change in the time reported in the
> > CAP.TO field. Refer to the definition of CAP.TO for more details.
> > 
> > The definition for CAP.TO:
> > If the Controller Ready Independent of Media Enable (CC.CRIME) bit is set to ‘1’
> > and the worst-case time for CSTS.RDY to change state is due to enabling the
> > controller after CC.EN transitions from ‘0’ to ‘1’, then this field shall be set
> > to:
> > a) the value in Controller Ready Independent of Media Timeout (CRTO.CRIMT); or
> > b) FFh if CRTO.CRIMT is greater than FFh.
> > 
> > This phrasing is quite confusing IMO.
> 
> Yes, that is rather confusing. I see you started a thread on the nvme workgroup
> reflector. I hope they can clear this up.

The current patch that is in Jens's tree is be fine, since we cache
the CAP register at start-up (after a reset), and never re-read it.

However, while reading through the spec, in NVMe 2.0b,
3.5.3 Controller Ready Modes During Initialization
it says:

[...] In this situation, the host should set the controller ready mode by
writing to the CC.CRIME bit before the controller is enabled [...].



Right now we do not write CC.CRIME bit _before_ the controller is enabled.
We set CC.CRIME and CC.ENABLE at the same time, which strictly speaking
is not according to spec.

Should we perhaps consider splitting the write up into two,
the first write sets everything except the enable bit,
and the second write sets everything + the enable bit?


Kind regards,
Niklas

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH] nvme: add support for TP4084 - Time-to-Ready Enhancements
  2022-05-25 16:19         ` Niklas Cassel
@ 2022-05-25 16:23           ` Christoph Hellwig
  0 siblings, 0 replies; 10+ messages in thread
From: Christoph Hellwig @ 2022-05-25 16:23 UTC (permalink / raw)
  To: Niklas Cassel; +Cc: Keith Busch, Christoph Hellwig, sagi, linux-nvme

On Wed, May 25, 2022 at 04:19:04PM +0000, Niklas Cassel wrote:
> Right now we do not write CC.CRIME bit _before_ the controller is enabled.
> We set CC.CRIME and CC.ENABLE at the same time, which strictly speaking
> is not according to spec.
> 
> Should we perhaps consider splitting the write up into two,
> the first write sets everything except the enable bit,
> and the second write sets everything + the enable bit?

Actually various other CC bits require that as well.  It hasn't been
an issue so far, but we might as well fix it.  That being said to
properly fix it we'd have to read back a register after setting
CC bits to flush posted writes as well.


^ permalink raw reply	[flat|nested] 10+ messages in thread

end of thread, other threads:[~2022-05-25 16:24 UTC | newest]

Thread overview: 10+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-05-18  6:40 [PATCH] nvme: add support for TP4084 - Time-to-Ready Enhancements Christoph Hellwig
2022-05-18  9:26 ` Hannes Reinecke
2022-05-18 14:36 ` Keith Busch
2022-05-18 15:00   ` Christoph Hellwig
2022-05-18 15:09     ` Keith Busch
2022-05-18 16:14     ` Chaitanya Kulkarni
2022-05-20 11:51     ` Niklas Cassel
2022-05-20 14:13       ` Keith Busch
2022-05-25 16:19         ` Niklas Cassel
2022-05-25 16:23           ` Christoph Hellwig

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.