* [PATCH] nvme/pci: Remove watchdog timer
@ 2017-05-26 12:16 Keith Busch
  2017-05-28  9:41 ` Christoph Hellwig
  2017-05-30 11:53 ` Sagi Grimberg
From: Keith Busch @ 2017-05-26 12:16 UTC


The controller status polling was added to preemptively reset a failed
controller. This early detection would give commands that would normally
time out a chance to be retried, and could find broken links on platforms
that didn't support hotplug.

This once-per-second MMIO read, however, creates more problems than it
solves: it races with PCIe hotplug events, requiring complicated
synchronization between work queues; it frequently triggers PCIe
Completion Timeout errors, which can escalate to fatal machine checks;
and it needlessly disrupts low-power modes by running on idle controllers.

This patch removes the watchdog timer and instead checks controller
health only on an IO timeout, when we have a reason to believe something
is wrong. If the controller has failed, the driver disables it
immediately and requests that a reset be scheduled.

Suggested-by: Andy Lutomirski <luto at amacapital.net>
Signed-off-by: Keith Busch <keith.busch at intel.com>
Cc: Christoph Hellwig <hch at lst.de>
---
 drivers/nvme/host/pci.c | 123 ++++++++++++++++++++++--------------------------
 1 file changed, 56 insertions(+), 67 deletions(-)

diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index d52701df..eedb9b4 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -94,7 +94,6 @@ struct nvme_dev {
 	void __iomem *bar;
 	struct work_struct reset_work;
 	struct work_struct remove_work;
-	struct timer_list watchdog_timer;
 	struct mutex shutdown_lock;
 	bool subsystem;
 	void __iomem *cmb;
@@ -950,6 +949,51 @@ static void abort_endio(struct request *req, int error)
 	blk_mq_free_request(req);
 }
 
+static bool nvme_should_reset(struct nvme_dev *dev, u32 csts)
+{
+
+	/* If true, indicates loss of adapter communication, possibly by a
+	 * NVMe Subsystem reset.
+	 */
+	bool nssro = dev->subsystem && (csts & NVME_CSTS_NSSRO);
+
+	/* If there is a reset ongoing, we shouldn't reset again. */
+	if (work_busy(&dev->reset_work))
+		return false;
+
+	/* We shouldn't reset unless the controller is on fatal error state
+	 * _or_ if we lost the communication with it.
+	 */
+	if (!(csts & NVME_CSTS_CFS) && !nssro)
+		return false;
+
+	/* If PCI error recovery process is happening, we cannot reset or
+	 * the recovery mechanism will surely fail.
+	 */
+	if (pci_channel_offline(to_pci_dev(dev->dev)))
+		return false;
+
+	return true;
+}
+
+static void nvme_warn_reset(struct nvme_dev *dev, u32 csts)
+{
+	/* Read a config register to help see what died. */
+	u16 pci_status;
+	int result;
+
+	result = pci_read_config_word(to_pci_dev(dev->dev), PCI_STATUS,
+				      &pci_status);
+	if (result == PCIBIOS_SUCCESSFUL)
+		dev_warn(dev->ctrl.device,
+			 "controller is down; will reset: CSTS=0x%x, PCI_STATUS=0x%hx\n",
+			 csts, pci_status);
+	else
+		dev_warn(dev->ctrl.device,
+			 "controller is down; will reset: CSTS=0x%x, PCI_STATUS read failed (%d)\n",
+			 csts, result);
+}
+
 static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved)
 {
 	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
@@ -957,6 +1001,17 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved)
 	struct nvme_dev *dev = nvmeq->dev;
 	struct request *abort_req;
 	struct nvme_command cmd;
+	u32 csts = readl(dev->bar + NVME_REG_CSTS);
+
+	/*
+	 * Reset immediately if the controller is failed
+	 */
+	if (nvme_should_reset(dev, readl(dev->bar + NVME_REG_CSTS))) {
+		nvme_warn_reset(dev, csts);
+		nvme_dev_disable(dev, false);
+		nvme_reset(dev);
+		return BLK_EH_HANDLED;
+	}
 
 	/*
 	 * Did we miss an interrupt?
@@ -1358,66 +1413,6 @@ static int nvme_configure_admin_queue(struct nvme_dev *dev)
 	return result;
 }
 
-static bool nvme_should_reset(struct nvme_dev *dev, u32 csts)
-{
-
-	/* If true, indicates loss of adapter communication, possibly by a
-	 * NVMe Subsystem reset.
-	 */
-	bool nssro = dev->subsystem && (csts & NVME_CSTS_NSSRO);
-
-	/* If there is a reset ongoing, we shouldn't reset again. */
-	if (work_busy(&dev->reset_work))
-		return false;
-
-	/* We shouldn't reset unless the controller is on fatal error state
-	 * _or_ if we lost the communication with it.
-	 */
-	if (!(csts & NVME_CSTS_CFS) && !nssro)
-		return false;
-
-	/* If PCI error recovery process is happening, we cannot reset or
-	 * the recovery mechanism will surely fail.
-	 */
-	if (pci_channel_offline(to_pci_dev(dev->dev)))
-		return false;
-
-	return true;
-}
-
-static void nvme_warn_reset(struct nvme_dev *dev, u32 csts)
-{
-	/* Read a config register to help see what died. */
-	u16 pci_status;
-	int result;
-
-	result = pci_read_config_word(to_pci_dev(dev->dev), PCI_STATUS,
-				      &pci_status);
-	if (result == PCIBIOS_SUCCESSFUL)
-		dev_warn(dev->ctrl.device,
-			 "controller is down; will reset: CSTS=0x%x, PCI_STATUS=0x%hx\n",
-			 csts, pci_status);
-	else
-		dev_warn(dev->ctrl.device,
-			 "controller is down; will reset: CSTS=0x%x, PCI_STATUS read failed (%d)\n",
-			 csts, result);
-}
-
-static void nvme_watchdog_timer(unsigned long data)
-{
-	struct nvme_dev *dev = (struct nvme_dev *)data;
-	u32 csts = readl(dev->bar + NVME_REG_CSTS);
-
-	/* Skip controllers under certain specific conditions. */
-	if (nvme_should_reset(dev, csts)) {
-		if (!nvme_reset(dev))
-			nvme_warn_reset(dev, csts);
-		return;
-	}
-
-	mod_timer(&dev->watchdog_timer, round_jiffies(jiffies + HZ));
-}
-
 static int nvme_create_io_queues(struct nvme_dev *dev)
 {
 	unsigned i, max;
@@ -1799,8 +1794,6 @@ static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown)
 	bool dead = true;
 	struct pci_dev *pdev = to_pci_dev(dev->dev);
 
-	del_timer_sync(&dev->watchdog_timer);
-
 	mutex_lock(&dev->shutdown_lock);
 	if (pci_is_enabled(pdev)) {
 		u32 csts = readl(dev->bar + NVME_REG_CSTS);
@@ -1964,8 +1957,6 @@ static void nvme_reset_work(struct work_struct *work)
 	if (dev->online_queues > 1)
 		nvme_queue_async_events(&dev->ctrl);
 
-	mod_timer(&dev->watchdog_timer, round_jiffies(jiffies + HZ));
-
 	/*
 	 * Keep the controller around but remove all namespaces if we don't have
 	 * any working I/O queue.
@@ -2120,8 +2111,6 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 
 	INIT_WORK(&dev->reset_work, nvme_reset_work);
 	INIT_WORK(&dev->remove_work, nvme_remove_dead_ctrl_work);
-	setup_timer(&dev->watchdog_timer, nvme_watchdog_timer,
-		(unsigned long)dev);
 	mutex_init(&dev->shutdown_lock);
 	init_completion(&dev->ioq_wait);
 
-- 
2.7.2


* [PATCH] nvme/pci: Remove watchdog timer
  2017-05-26 12:16 [PATCH] nvme/pci: Remove watchdog timer Keith Busch
@ 2017-05-28  9:41 ` Christoph Hellwig
  2017-05-30 14:20   ` Keith Busch
  2017-05-30 11:53 ` Sagi Grimberg
From: Christoph Hellwig @ 2017-05-28  9:41 UTC


On Fri, May 26, 2017 at 08:16:28AM -0400, Keith Busch wrote:
> [...]
>  static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved)
>  {
>  	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
> @@ -957,6 +1001,17 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved)
>  	struct nvme_dev *dev = nvmeq->dev;
>  	struct request *abort_req;
>  	struct nvme_command cmd;
> +	u32 csts = readl(dev->bar + NVME_REG_CSTS);
> +
> +	/*
> +	 * Reset immediately if the controller is failed

s/is/has/ ?

Otherwise this looks good to me, but I'd like to defer it until
we have the multiple reset issue sorted out, to avoid conflicts.


* [PATCH] nvme/pci: Remove watchdog timer
  2017-05-26 12:16 [PATCH] nvme/pci: Remove watchdog timer Keith Busch
  2017-05-28  9:41 ` Christoph Hellwig
@ 2017-05-30 11:53 ` Sagi Grimberg
  2017-05-31 16:21   ` Keith Busch
From: Sagi Grimberg @ 2017-05-30 11:53 UTC



>  static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved)
>  {
>  	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
> @@ -957,6 +1001,17 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved)
>  	struct nvme_dev *dev = nvmeq->dev;
>  	struct request *abort_req;
>  	struct nvme_command cmd;
> +	u32 csts = readl(dev->bar + NVME_REG_CSTS);
> +
> +	/*
> +	 * Reset immediately if the controller is failed
> +	 */
> +	if (nvme_should_reset(dev, readl(dev->bar + NVME_REG_CSTS))) {
> +		nvme_warn_reset(dev, csts);
> +		nvme_dev_disable(dev, false);

Why the nvme_dev_disable here? It's going to happen in reset_work...

Other than that, this change looks good!


* [PATCH] nvme/pci: Remove watchdog timer
  2017-05-28  9:41 ` Christoph Hellwig
@ 2017-05-30 14:20   ` Keith Busch
  2017-06-01 12:50     ` Christoph Hellwig
From: Keith Busch @ 2017-05-30 14:20 UTC


On Sun, May 28, 2017 at 11:41:52AM +0200, Christoph Hellwig wrote:
> 
> Otherwise this looks good to me, but I'd like to defer it until
> we have the multiple reset issue sorted out, to avoid conflicts.

That's okay to defer till after that settles. I'll need to respin this
anyway since I mistakenly re-read CSTS instead of reading it once.

I am hoping to get this in this round since it would quiet a lot of
errors I'm frequently pinged on: a non-posted request during a hot-remove
downstream of a switch creates a lot of problems for hardware if you don't
have DPC enabled.

One thing I'm just realizing, though: this will break subsystem reset.
We were depending on the watchdog to detect that event, but now we require
an IO timeout. It's easy enough to fix that from the host initiating
the NSSR, but there's no way to notify other hosts connected to the
same subsystem.
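
For the initiating side, the fix could be as small as scheduling our own
reset right after requesting the NSSR, roughly like this sketch (not part
of this patch; the hypothetical helper just pairs the spec's NSSR magic
value with the driver's existing nvme_reset()):

	static void nvme_host_initiated_nssr(struct nvme_dev *dev)
	{
		/* Writing ASCII "NVMe" to the NSSR register requests an
		 * NVMe Subsystem Reset (register offset 0x20 in the spec).
		 */
		writel(0x4E564D65, dev->bar + NVME_REG_NSSR);

		/* This host already knows the controller is going away;
		 * schedule the reset directly instead of waiting for an
		 * IO timeout to notice CSTS.NSSRO.
		 */
		nvme_reset(dev);
	}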

Despite that, I still think this patch is a step in the right direction
for the previously mentioned reasons.


* [PATCH] nvme/pci: Remove watchdog timer
  2017-05-30 11:53 ` Sagi Grimberg
@ 2017-05-31 16:21   ` Keith Busch
From: Keith Busch @ 2017-05-31 16:21 UTC


On Tue, May 30, 2017 at 02:53:28PM +0300, Sagi Grimberg wrote:
> 
> >  static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved)
> >  {
> >  	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
> > @@ -957,6 +1001,17 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved)
> >  	struct nvme_dev *dev = nvmeq->dev;
> >  	struct request *abort_req;
> >  	struct nvme_command cmd;
> > +	u32 csts = readl(dev->bar + NVME_REG_CSTS);
> > +
> > +	/*
> > +	 * Reset immediately if the controller is failed
> > +	 */
> > +	if (nvme_should_reset(dev, readl(dev->bar + NVME_REG_CSTS))) {
> > +		nvme_warn_reset(dev, csts);
> > +		nvme_dev_disable(dev, false);
> 
> Why the nvme_dev_disable here? It's going to happen in reset_work...

Not if the reset work is already running or we are trying to remove the
device. We have to disable inline with the timeout work to make sure we
reclaim the timed-out command.

Back in the day, we needed reset_work to do the disable because blk-mq's
timeout handler ran in soft-irq context, but that's no longer the case.
This patch should obviate the need for reset_work to ever disable the
controller.
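
To spell out the ordering, the new timeout path is roughly the hunk above
with annotations:

	if (nvme_should_reset(dev, csts)) {
		nvme_warn_reset(dev, csts);
		/* Quiesce the controller and reclaim every outstanding
		 * request, including the one that just timed out.
		 */
		nvme_dev_disable(dev, false);
		/* Queue reset_work; this does nothing if a reset is
		 * already queued or running.
		 */
		nvme_reset(dev);
		/* Safe because the disable above completed the timed-out
		 * request.
		 */
		return BLK_EH_HANDLED;
	}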


* [PATCH] nvme/pci: Remove watchdog timer
  2017-05-30 14:20   ` Keith Busch
@ 2017-06-01 12:50     ` Christoph Hellwig
  2017-06-01 13:13       ` Keith Busch
From: Christoph Hellwig @ 2017-06-01 12:50 UTC


On Tue, May 30, 2017 at 10:20:57AM -0400, Keith Busch wrote:
> I am hoping to get this in this round since it would quiet a lot of
> errors I'm frequently pinged on: a non-posted request during a hot-remove
> downstream of a switch creates a lot of problems for hardware if you don't
> have DPC enabled.

With this round do you mean 4.12-rc or 4.13?

> One thing I'm just realizing, though: this will break subsystem reset.
> We were depending on the watchdog to detect that event, but now we require
> an IO timeout. It's easy enough to fix that from the host initiating
> the NSSR, but there's no way to notify other hosts connected to the
> same subsystem.

I don't think I've actually seen it trigger for that case due to the
PCIe link going down earlier. But I'll need to double-check, as I
actually have access to a dual-ported drive at the moment; the machine
setup is a bit quirky, though.


* [PATCH] nvme/pci: Remove watchdog timer
  2017-06-01 12:50     ` Christoph Hellwig
@ 2017-06-01 13:13       ` Keith Busch
  2017-06-05  8:19         ` Christoph Hellwig
From: Keith Busch @ 2017-06-01 13:13 UTC


On Thu, Jun 01, 2017 at 02:50:23PM +0200, Christoph Hellwig wrote:
> On Tue, May 30, 2017 at 10:20:57AM -0400, Keith Busch wrote:
> > I am hoping to get this in this round since it would quiet a lot of
> > errors I'm frequently pinged on: a non-posted request during a hot-remove
> > downstream of a switch creates a lot of problems for hardware if you don't
> > have DPC enabled.
> 
> With this round do you mean 4.12-rc or 4.13?

I was hoping for 4.12, simply because the problems from doing this have
recently become more clear, but I guess there's no rush at this point.
 
> > One thing I'm just realizing, though: this will break subsystem reset.
> > We were depending on the watchdog to detect that event, but now we require
> > an IO timeout. It's easy enough to fix that from the host initiating
> > the NSSR, but there's no way to notify other hosts connected to the
> > same subsystem.
> 
> I don't think I've actually seen it trigger for that case due to the
> PCIe link going down earlier. But I'll need to double-check, as I
> actually have access to a dual-ported drive at the moment; the machine
> setup is a bit quirky, though.

The watchdog timer might see all 1's if it happens to run while the
link is in the detect state after NSSR, but normally the host should
just see CSTS.NSSRO and know it needs to reset its controller.
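
If we wanted the timeout handler to also cover that window, a check along
these lines would do (sketch only):

	u32 csts = readl(dev->bar + NVME_REG_CSTS);

	/* MMIO reads return all 1's while the link is down or still in
	 * detect, so treat that as "controller unreachable" rather than
	 * interpreting individual status bits.
	 */
	if (csts == ~0U) {
		nvme_dev_disable(dev, false);
		return BLK_EH_HANDLED;
	}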


* [PATCH] nvme/pci: Remove watchdog timer
  2017-06-01 13:13       ` Keith Busch
@ 2017-06-05  8:19         ` Christoph Hellwig
From: Christoph Hellwig @ 2017-06-05  8:19 UTC


On Thu, Jun 01, 2017 at 09:13:53AM -0400, Keith Busch wrote:
> On Thu, Jun 01, 2017 at 02:50:23PM +0200, Christoph Hellwig wrote:
> > On Tue, May 30, 2017 at 10:20:57AM -0400, Keith Busch wrote:
> > > I am hoping to get this in this round since it would quiet a lot of
> > > errors I'm frequently pinged on: a non-posted request during a hot-remove
> > > downstream of a switch creates a lot of problems for hardware if you don't
> > > have DPC enabled.
> > 
> > With this round do you mean 4.12-rc or 4.13?
> 
> I was hoping for 4.12, simply because the problems from doing this have
> recently become more clear, but I guess there's no rush at this point.

Given that we don't have an actual bug report caused by it yet, I'd
be tempted to leave it for 4.13. Otherwise we're going to run into
Jens, or worse Linus, screaming at us.

