linux-kernel.vger.kernel.org archive mirror
* [PATCH] nvme-pci: Prevent mmio reads if pci channel offline
@ 2019-02-22  1:05 Jon Derrick
  2019-02-22 21:28 ` Linus Torvalds
  0 siblings, 1 reply; 21+ messages in thread
From: Jon Derrick @ 2019-02-22  1:05 UTC (permalink / raw)
  To: linux-nvme
  Cc: Keith Busch, Jens Axboe, Christoph Hellwig, Sagi Grimberg,
	Alex Gagniuc, linux-kernel, Jon Derrick

Some platforms don't seem to easily tolerate non-posted mmio reads on
lost (hot removed) devices. This has been noted in previous
modifications to other layers where an mmio read to a lost device could
cause an undesired firmware intervention [1][2].

This patch reworks the nvme-pci reads to quickly check connectivity
prior to reading the BAR. The intent is to prevent a non-posted mmio
read which would definitely result in an error condition of some sort.
There is, of course, a chance the link becomes disconnected between the
check and the read. Like other similar checks, it is only intended to
reduce the likelihood of encountering these issues and not fully close
the gap.

mmio writes are posted and are in the fast path, so they have been left as-is.

[1] https://lkml.org/lkml/2018/7/30/858
[2] https://lkml.org/lkml/2018/9/18/1546

Signed-off-by: Jon Derrick <jonathan.derrick@intel.com>
---
 drivers/nvme/host/pci.c | 75 ++++++++++++++++++++++++++++++-----------
 1 file changed, 56 insertions(+), 19 deletions(-)

diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index f54718b63637..e555ac2afacd 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -1264,6 +1264,33 @@ static void nvme_warn_reset(struct nvme_dev *dev, u32 csts)
 			 csts, result);
 }
 
+static int nvme_reg_readl(struct nvme_dev *dev, u32 off, u32 *val)
+{
+	if (pci_channel_offline(to_pci_dev(dev->dev)))
+		return -ENODEV;
+
+	*val = readl(dev->bar + off);
+	return 0;
+}
+
+static int nvme_reg_readq(struct nvme_dev *dev, u32 off, u64 *val)
+{
+	if (pci_channel_offline(to_pci_dev(dev->dev)))
+		return -ENODEV;
+
+	*val = readq(dev->bar + off);
+	return 0;
+}
+
+static int nvme_reg_lo_hi_readq(struct nvme_dev *dev, u32 off, u64 *val)
+{
+	if (pci_channel_offline(to_pci_dev(dev->dev)))
+		return -ENODEV;
+
+	*val = lo_hi_readq(dev->bar + off);
+	return 0;
+}
+
 static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved)
 {
 	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
@@ -1271,13 +1298,13 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved)
 	struct nvme_dev *dev = nvmeq->dev;
 	struct request *abort_req;
 	struct nvme_command cmd;
-	u32 csts = readl(dev->bar + NVME_REG_CSTS);
+	u32 csts;
 
 	/* If PCI error recovery process is happening, we cannot reset or
 	 * the recovery mechanism will surely fail.
 	 */
 	mb();
-	if (pci_channel_offline(to_pci_dev(dev->dev)))
+	if (nvme_reg_readl(dev, NVME_REG_CSTS, &csts))
 		return BLK_EH_RESET_TIMER;
 
 	/*
@@ -1692,18 +1719,24 @@ static int nvme_remap_bar(struct nvme_dev *dev, unsigned long size)
 static int nvme_pci_configure_admin_queue(struct nvme_dev *dev)
 {
 	int result;
-	u32 aqa;
+	u32 csts, vs, aqa;
 	struct nvme_queue *nvmeq;
 
 	result = nvme_remap_bar(dev, db_bar_size(dev, 0));
 	if (result < 0)
 		return result;
 
-	dev->subsystem = readl(dev->bar + NVME_REG_VS) >= NVME_VS(1, 1, 0) ?
-				NVME_CAP_NSSRC(dev->ctrl.cap) : 0;
+	result = nvme_reg_readl(dev, NVME_REG_CSTS, &csts);
+	if (result)
+		return result;
+
+	result = nvme_reg_readl(dev, NVME_REG_VS, &vs);
+	if (result)
+		return result;
 
-	if (dev->subsystem &&
-	    (readl(dev->bar + NVME_REG_CSTS) & NVME_CSTS_NSSRO))
+	dev->subsystem = (vs >= NVME_VS(1, 1, 0)) ?
+				NVME_CAP_NSSRC(dev->ctrl.cap) : 0;
+	if (dev->subsystem && (csts & NVME_CSTS_NSSRO))
 		writel(NVME_CSTS_NSSRO, dev->bar + NVME_REG_CSTS);
 
 	result = nvme_disable_ctrl(&dev->ctrl, dev->ctrl.cap);
@@ -1808,10 +1841,11 @@ static void nvme_map_cmb(struct nvme_dev *dev)
 	if (dev->cmb_size)
 		return;
 
-	dev->cmbsz = readl(dev->bar + NVME_REG_CMBSZ);
-	if (!dev->cmbsz)
+	if (nvme_reg_readl(dev, NVME_REG_CMBSZ, &dev->cmbsz) || !dev->cmbsz)
+		return;
+
+	if (nvme_reg_readl(dev, NVME_REG_CMBLOC, &dev->cmbloc))
 		return;
-	dev->cmbloc = readl(dev->bar + NVME_REG_CMBLOC);
 
 	size = nvme_cmb_size_unit(dev) * nvme_cmb_size(dev);
 	offset = nvme_cmb_size_unit(dev) * NVME_CMB_OFST(dev->cmbloc);
@@ -2357,6 +2391,7 @@ static int nvme_pci_enable(struct nvme_dev *dev)
 {
 	int result = -ENOMEM;
 	struct pci_dev *pdev = to_pci_dev(dev->dev);
+	u32 csts;
 
 	if (pci_enable_device_mem(pdev))
 		return result;
@@ -2367,7 +2402,7 @@ static int nvme_pci_enable(struct nvme_dev *dev)
 	    dma_set_mask_and_coherent(dev->dev, DMA_BIT_MASK(32)))
 		goto disable;
 
-	if (readl(dev->bar + NVME_REG_CSTS) == -1) {
+	if (nvme_reg_readl(dev, NVME_REG_CSTS, &csts) || csts == -1) {
 		result = -ENODEV;
 		goto disable;
 	}
@@ -2381,7 +2416,9 @@ static int nvme_pci_enable(struct nvme_dev *dev)
 	if (result < 0)
 		return result;
 
-	dev->ctrl.cap = lo_hi_readq(dev->bar + NVME_REG_CAP);
+	result = nvme_reg_lo_hi_readq(dev, NVME_REG_CAP, &dev->ctrl.cap);
+	if (result)
+		return result;
 
 	dev->q_depth = min_t(int, NVME_CAP_MQES(dev->ctrl.cap) + 1,
 				io_queue_depth);
@@ -2442,13 +2479,15 @@ static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown)
 
 	mutex_lock(&dev->shutdown_lock);
 	if (pci_is_enabled(pdev)) {
-		u32 csts = readl(dev->bar + NVME_REG_CSTS);
+		u32 csts;
 
 		if (dev->ctrl.state == NVME_CTRL_LIVE ||
 		    dev->ctrl.state == NVME_CTRL_RESETTING)
 			nvme_start_freeze(&dev->ctrl);
-		dead = !!((csts & NVME_CSTS_CFS) || !(csts & NVME_CSTS_RDY) ||
-			pdev->error_state  != pci_channel_io_normal);
+
+		if (nvme_reg_readl(dev, NVME_REG_CSTS, &csts) == 0)
+			dead = !!((csts & NVME_CSTS_CFS) ||
+				!(csts & NVME_CSTS_RDY));
 	}
 
 	/*
@@ -2661,8 +2700,7 @@ static void nvme_remove_dead_ctrl_work(struct work_struct *work)
 
 static int nvme_pci_reg_read32(struct nvme_ctrl *ctrl, u32 off, u32 *val)
 {
-	*val = readl(to_nvme_dev(ctrl)->bar + off);
-	return 0;
+	return nvme_reg_readl(to_nvme_dev(ctrl), off, val);
 }
 
 static int nvme_pci_reg_write32(struct nvme_ctrl *ctrl, u32 off, u32 val)
@@ -2673,8 +2711,7 @@ static int nvme_pci_reg_write32(struct nvme_ctrl *ctrl, u32 off, u32 val)
 
 static int nvme_pci_reg_read64(struct nvme_ctrl *ctrl, u32 off, u64 *val)
 {
-	*val = readq(to_nvme_dev(ctrl)->bar + off);
-	return 0;
+	return nvme_reg_readq(to_nvme_dev(ctrl), off, val);
 }
 
 static int nvme_pci_get_address(struct nvme_ctrl *ctrl, char *buf, int size)
-- 
2.19.1



* Re: [PATCH] nvme-pci: Prevent mmio reads if pci channel offline
  2019-02-22  1:05 [PATCH] nvme-pci: Prevent mmio reads if pci channel offline Jon Derrick
@ 2019-02-22 21:28 ` Linus Torvalds
  2019-02-22 21:59   ` Keith Busch
  2019-02-24 20:37   ` Alex_Gagniuc
  0 siblings, 2 replies; 21+ messages in thread
From: Linus Torvalds @ 2019-02-22 21:28 UTC (permalink / raw)
  To: Jon Derrick
  Cc: linux-nvme, Keith Busch, Jens Axboe, Christoph Hellwig,
	Sagi Grimberg, Alex Gagniuc, Linux List Kernel Mailing

On Thu, Feb 21, 2019 at 5:07 PM Jon Derrick <jonathan.derrick@intel.com> wrote:
>
> Some platforms don't seem to easily tolerate non-posted mmio reads on
> lost (hot removed) devices. This has been noted in previous
> modifications to other layers where an mmio read to a lost device could
> cause an undesired firmware intervention [1][2].

This is broken, and whatever platform that requires this is broken.

This has absolutely nothing to do with nvme, and should not be handled
by a driver.

The platform code should be fixed.

What broken platform is this, and why is it causing problems?

None of this wishy-washy "some platforms". Name them, and let's get them fixed.

                 Linus


* Re: [PATCH] nvme-pci: Prevent mmio reads if pci channel offline
  2019-02-22 21:28 ` Linus Torvalds
@ 2019-02-22 21:59   ` Keith Busch
  2019-02-24 20:37   ` Alex_Gagniuc
  1 sibling, 0 replies; 21+ messages in thread
From: Keith Busch @ 2019-02-22 21:59 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Derrick, Jonathan, linux-nvme, Jens Axboe, Christoph Hellwig,
	Sagi Grimberg, Alex Gagniuc, Linux List Kernel Mailing

On Fri, Feb 22, 2019 at 01:28:42PM -0800, Linus Torvalds wrote:
> On Thu, Feb 21, 2019 at 5:07 PM Jon Derrick <jonathan.derrick@intel.com> wrote:
> >
> > Some platforms don't seem to easily tolerate non-posted mmio reads on
> > lost (hot removed) devices. This has been noted in previous
> > modifications to other layers where an mmio read to a lost device could
> > cause an undesired firmware intervention [1][2].
> 
> This is broken, and whatever platform that requires this is broken.
> 
> This has absolutely nothing to do with nvme, and should not be handled
> by a driver.
> 
> The platform code should be fixed.

This is, of course, the correct answer. We just find platform firmware
uncooperative, so we see these attempts to avoid them.

I really don't like this driver piecemeal approach if we're going to
quirk around these platforms, though. I'd rather see the invalidated
address ranges remapped to a fault handler fixup exception once and be
done with it.

Or we can say you don't get to use this feature if you bought that
hardware.


* Re: [PATCH] nvme-pci: Prevent mmio reads if pci channel offline
  2019-02-22 21:28 ` Linus Torvalds
  2019-02-22 21:59   ` Keith Busch
@ 2019-02-24 20:37   ` Alex_Gagniuc
  2019-02-24 22:42     ` Linus Torvalds
  1 sibling, 1 reply; 21+ messages in thread
From: Alex_Gagniuc @ 2019-02-24 20:37 UTC (permalink / raw)
  To: torvalds, jonathan.derrick
  Cc: linux-nvme, keith.busch, axboe, hch, sagi, linux-kernel, mr.nuke.me

On 2/22/19 3:29 PM, Linus Torvalds wrote:
> On Thu, Feb 21, 2019 at 5:07 PM Jon Derrick <jonathan.derrick@intel.com> wrote:
>>
>> Some platforms don't seem to easily tolerate non-posted mmio reads on
>> lost (hot removed) devices. This has been noted in previous
>> modifications to other layers where an mmio read to a lost device could
>> cause an undesired firmware intervention [1][2].
> 
> This is broken, and whatever platform that requires this is broken.
> 
> This has absolutely nothing to do with nvme, and should not be handled
> by a driver.
> 
> The platform code should be fixed.
> 
> What broken platform is this, and why is it causing problems?
> 
> None of this wishy-washy "some platforms". Name them, and let's get them fixed.

Dell r740xd to name one. r640 is even worse -- they probably didn't give 
me one because I'd have too much stuff to complain about.

On the above machines, firmware-first (FFS) tries to guess when there's 
a SURPRISE!!! removal of a PCIe card and suppress any errors reported to 
the OS. When the OS keeps firing IO over the dead link, FFS doesn't know 
if it can safely suppress the error. It reports it via NMI, and
drivers/acpi/apei/ghes.c panics whenever that happens.

Does this sound like horrible code?

As I see it, there's a more fundamental problem. As long as we accept 
platforms where firmware does some things first (FFS), we have much less 
control over what happens. The best we can do is wishy-washy fixes like 
this one.

Did I mention that FFS's explicit intent on the above machines is to 
make the OS crash?

Alex






* Re: [PATCH] nvme-pci: Prevent mmio reads if pci channel offline
  2019-02-24 20:37   ` Alex_Gagniuc
@ 2019-02-24 22:42     ` Linus Torvalds
  2019-02-24 23:27       ` Alex_Gagniuc
  0 siblings, 1 reply; 21+ messages in thread
From: Linus Torvalds @ 2019-02-24 22:42 UTC (permalink / raw)
  To: Alex Gagniuc
  Cc: Jon Derrick, linux-nvme, Keith Busch, Jens Axboe,
	Christoph Hellwig, Sagi Grimberg, Linux List Kernel Mailing,
	mr.nuke.me

On Sun, Feb 24, 2019 at 12:37 PM <Alex_Gagniuc@dellteam.com> wrote:
>
> Dell r740xd to name one. r640 is even worse -- they probably didn't give
> me one because I'd have too much stuff to complain about.
>
> On the above machines, firmware-first (FFS) tries to guess when there's
> a SURPRISE!!! removal of a PCIe card and suppress any errors reported to
> the OS. When the OS keeps firing IO over the dead link, FFS doesn't know
> if it can safely suppress the error. It reports it via NMI, and
> drivers/acpi/apei/ghes.c panics whenever that happens.

Can we just fix that ghes driver?

It's not useful to panic just for random reasons. I realize that some
of the RAS people have the mindset that "hey, I don't know what's
wrong, so I'd better kill the machine than continue", but that's
bogus.

What happens if we just fix that part?

> As I see it, there's a more fundamental problem. As long as we accept
> platforms where firmware does some things first (FFS), we have much less
> control over what happens. The best we can do is wishy-washy fixes like
> this one.

Oh, I agree that platforms with random firmware things are horrid. But
we've been able to handle them just fine before, without making every
single possible hotplug pci driver have nasty problems and
workarounds.

I suspect we'd be much better off having the ghes driver just not panic.

What is the actual ghes error? Is it the "unknown, just panic" case,
or something else?

               Linus


* Re: [PATCH] nvme-pci: Prevent mmio reads if pci channel offline
  2019-02-24 22:42     ` Linus Torvalds
@ 2019-02-24 23:27       ` Alex_Gagniuc
  2019-02-25  0:43         ` Linus Torvalds
  2019-02-25 15:55         ` Keith Busch
  0 siblings, 2 replies; 21+ messages in thread
From: Alex_Gagniuc @ 2019-02-24 23:27 UTC (permalink / raw)
  To: torvalds
  Cc: jonathan.derrick, linux-nvme, keith.busch, axboe, hch, sagi,
	linux-kernel, mr.nuke.me

On 2/24/19 4:42 PM, Linus Torvalds wrote:
> On Sun, Feb 24, 2019 at 12:37 PM <Alex_Gagniuc@dellteam.com> wrote:
>>
>> Dell r740xd to name one. r640 is even worse -- they probably didn't give
>> me one because I'd have too much stuff to complain about.
>>
>> On the above machines, firmware-first (FFS) tries to guess when there's
>> a SURPRISE!!! removal of a PCIe card and suppress any errors reported to
>> the OS. When the OS keeps firing IO over the dead link, FFS doesn't know
>> if it can safely suppress the error. It reports it via NMI, and
>> drivers/acpi/apei/ghes.c panics whenever that happens.
> 
> Can we just fix that ghes driver?
> 
> It's not useful to panic just for random reasons. I realize that some
> of the RAS people have the mindset that "hey, I don't know what's
> wrong, so I'd better kill the machine than continue", but that's
> bogus.

That's the first thing I tried, but Borislav didn't like it. And he's 
right in the strictest sense of the ACPI spec: a fatal GHES error must 
result in a machine reboot [1].

> What happens if we just fix that part?

On the r740xd, on an NVMe hotplug bay, the upstream port stops sending 
hotplug interrupts. We could fix that with a quirk by clearing a 
proprietary bit in the switch. However, FFS won't re-arm itself to 
receive any further errors, so we'd never get notified in case there is 
a genuine error.

>> As I see it, there's a more fundamental problem. As long as we accept
>> platforms where firmware does some things first (FFS), we have much less
>> control over what happens. The best we can do is wishy-washy fixes like
>> this one.
> 
> Oh, I agree that platforms with random firmware things are horrid. But
> we've been able to handle them just fine before, without making every
> single possible hotplug pci driver have nasty problems and
> workarounds.
> 
> I suspect we'd be much better off having the ghes driver just not panic.

Keith Busch of Intel at some point suggested remapping all MMIO 
resources of a dead PCIe device to a read-only page that returns all 
F's. Neither of us were too sure how to do that, or how to handle the 
problem of in-flight DMA, which wouldn't hit the page tables.

> What is the actual ghes error? Is it the "unknown, just panic" case,
> or something else?

More like "fatal error, just panic". It looks like this (from a serial 
console):

[   57.680494] {1}[Hardware Error]: Hardware error from APEI Generic 
Hardware Error Source: 1
[   57.680495] {1}[Hardware Error]: event severity: fatal
[   57.680496] {1}[Hardware Error]:  Error 0, type: fatal
[   57.680496] {1}[Hardware Error]:   section_type: PCIe error
[   57.680497] {1}[Hardware Error]:   port_type: 6, downstream switch port
[   57.680498] {1}[Hardware Error]:   version: 3.0
[   57.680498] {1}[Hardware Error]:   command: 0x0407, status: 0x0010
[   57.680499] {1}[Hardware Error]:   device_id: 0000:3c:07.0
[   57.680499] {1}[Hardware Error]:   slot: 1
[   57.680500] {1}[Hardware Error]:   secondary_bus: 0x40
[   57.680500] {1}[Hardware Error]:   vendor_id: 0x10b5, device_id: 0x9733
[   57.680501] {1}[Hardware Error]:   class_code: 000406
[   57.680502] {1}[Hardware Error]:   bridge: secondary_status: 0x0000, 
control: 0x0003
[   57.680503] Kernel panic - not syncing: Fatal hardware error!
[   57.680572] Kernel Offset: 0x2a000000 from 0xffffffff81000000 
(relocation range: 0xffffffff80000000-0xffffffffbfffffff)


Alex



[1] ACPI 6.3 - 18.1 Hardware Errors and Error Sources

"A fatal hardware error is an uncorrected or uncontained error condition 
that is determined to be unrecoverable by the hardware. When a fatal 
uncorrected error occurs, the system is restarted to prevent propagation 
of the error."




* Re: [PATCH] nvme-pci: Prevent mmio reads if pci channel offline
  2019-02-24 23:27       ` Alex_Gagniuc
@ 2019-02-25  0:43         ` Linus Torvalds
  2019-02-25 15:55         ` Keith Busch
  1 sibling, 0 replies; 21+ messages in thread
From: Linus Torvalds @ 2019-02-25  0:43 UTC (permalink / raw)
  To: Alex Gagniuc
  Cc: Jon Derrick, linux-nvme, Keith Busch, Jens Axboe,
	Christoph Hellwig, Sagi Grimberg, Linux List Kernel Mailing,
	mr.nuke.me

On Sun, Feb 24, 2019 at 3:27 PM <Alex_Gagniuc@dellteam.com> wrote:
>
> >
> > It's not useful to panic just for random reasons. I realize that some
> > of the RAS people have the mindset that "hey, I don't know what's
> > wrong, so I'd better kill the machine than continue", but that's
> > bogus.
>
> That's the first thing I tried, but Borislav didn't like it. And he's
> right in the strictest sense of the ACPI spec: a fatal GHES error must
> result in a machine reboot [1].
>
> > What happens if we just fix that part?
>
> On the r740xd, on an NVMe hotplug bay, the upstream port stops sending
> hotplug interrupts. We could fix that with a quirk by clearing a
> proprietary bit in the switch. However, FFS won't re-arm itself to
> receive any further errors, so we'd never get notified in case there is
> a genuine error.

But this is not a genuine fatal error.

When spec and reality collide, the spec is just so much toilet paper.

In fact, the spec is worth _less_ than toilet paper, because at least
toilet paper is useful for wiping your butt clean. The spec? Not so
much.

> Keith Busch of Intel at some point suggested remapping all MMIO
> resources of a dead PCIe device to a read-only page that returns all
> F's. Neither of us were too sure how to do that, or how to handle the
> problem of in-flight DMA, which wouldn't hit the page tables.

I agree that that would be a really cute and smart way to fix things,
but no, right now I don't think we have any kind of infrastructure in
place to do something like that.

> > What is the actual ghes error? Is it the "unknown, just panic" case,
> > or something else?
>
> More like "fatal error, just panic". It looks like this (from a serial
> console):
>
> [   57.680494] {1}[Hardware Error]: Hardware error from APEI Generic
> Hardware Error Source: 1
> [   57.680495] {1}[Hardware Error]: event severity: fatal

Ok, so the ghes information is actively wrong, and tries to kill the
machine when it shouldn't be killed.

I seriously think that the correct thing is to fix the problem at the
*source* - ie the ghes driver. That's the only driver that should care
about "this platform is broken and sends invalid fatal errors".

So instead of adding hacks to the nvme driver, I think the hacks
should be in the ghes driver. Possibly just a black-list of "this
platform is known broken, don't even enable the ghes driver for it".
Or possibly a bit more fine-grained in the sense that it knows that
"ok, this particular kind of error is due to a hotplug event, the
driver will handle it without help from us, so ignore it".

                 Linus


* Re: [PATCH] nvme-pci: Prevent mmio reads if pci channel offline
  2019-02-24 23:27       ` Alex_Gagniuc
  2019-02-25  0:43         ` Linus Torvalds
@ 2019-02-25 15:55         ` Keith Busch
  2019-02-26 22:37           ` Alex_Gagniuc
  1 sibling, 1 reply; 21+ messages in thread
From: Keith Busch @ 2019-02-25 15:55 UTC (permalink / raw)
  To: Alex_Gagniuc
  Cc: torvalds, axboe, sagi, linux-kernel, linux-nvme, mr.nuke.me, hch,
	Derrick, Jonathan

On Sun, Feb 24, 2019 at 03:27:09PM -0800, Alex_Gagniuc@Dellteam.com wrote:
> 
> More like "fatal error, just panic". It looks like this (from a serial 
> console):
> 
> [   57.680494] {1}[Hardware Error]: Hardware error from APEI Generic  Hardware Error Source: 1
> [   57.680495] {1}[Hardware Error]: event severity: fatal
> [   57.680496] {1}[Hardware Error]:  Error 0, type: fatal
> [   57.680496] {1}[Hardware Error]:   section_type: PCIe error
> [   57.680497] {1}[Hardware Error]:   port_type: 6, downstream switch port
> [   57.680498] {1}[Hardware Error]:   version: 3.0
> [   57.680498] {1}[Hardware Error]:   command: 0x0407, status: 0x0010
> [   57.680499] {1}[Hardware Error]:   device_id: 0000:3c:07.0
> [   57.680499] {1}[Hardware Error]:   slot: 1
> [   57.680500] {1}[Hardware Error]:   secondary_bus: 0x40
> [   57.680500] {1}[Hardware Error]:   vendor_id: 0x10b5, device_id: 0x9733
> [   57.680501] {1}[Hardware Error]:   class_code: 000406
> [   57.680502] {1}[Hardware Error]:   bridge: secondary_status: 0x0000, control: 0x0003

This is a reaction to an ERR_FATAL message, right? What happens if we
ignore firmware first and override control of the AER masking by setting
the Unsupported Request Error Mask in the root and downstream ports? You
can do a quick test like this for the hardware above:

  # setpci -s 3c:07.0 ECAP_AER+8.l=100000:100000

You'd probably have to do the same command on the root port BDF, and on any
other switches you have in the hierarchy.
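
(ECAP_AER+8 is the AER Uncorrectable Error Mask register; 0x100000 sets
bit 20, the Unsupported Request Error Mask, leaving the other mask bits
alone. The root port write would look the same, with its BDF substituted
for the placeholder below:)

  # setpci -s <root port BDF> ECAP_AER+8.l=100000:100000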


* Re: [PATCH] nvme-pci: Prevent mmio reads if pci channel offline
  2019-02-25 15:55         ` Keith Busch
@ 2019-02-26 22:37           ` Alex_Gagniuc
  2019-02-27  1:01             ` Linus Torvalds
  0 siblings, 1 reply; 21+ messages in thread
From: Alex_Gagniuc @ 2019-02-26 22:37 UTC (permalink / raw)
  To: keith.busch
  Cc: torvalds, axboe, sagi, linux-kernel, linux-nvme, mr.nuke.me, hch,
	jonathan.derrick

On 2/25/19 9:55 AM, Keith Busch wrote:
> On Sun, Feb 24, 2019 at 03:27:09PM -0800, Alex_Gagniuc@Dellteam.com wrote:
>> [   57.680494] {1}[Hardware Error]: Hardware error from APEI Generic  Hardware Error Source: 1
>> [   57.680495] {1}[Hardware Error]: event severity: fatal
>> [   57.680496] {1}[Hardware Error]:  Error 0, type: fatal
>> [   57.680496] {1}[Hardware Error]:   section_type: PCIe error
>> [   57.680497] {1}[Hardware Error]:   port_type: 6, downstream switch port
>> [   57.680498] {1}[Hardware Error]:   version: 3.0
>> [   57.680498] {1}[Hardware Error]:   command: 0x0407, status: 0x0010
>> [   57.680499] {1}[Hardware Error]:   device_id: 0000:3c:07.0
>> [   57.680499] {1}[Hardware Error]:   slot: 1
>> [   57.680500] {1}[Hardware Error]:   secondary_bus: 0x40
>> [   57.680500] {1}[Hardware Error]:   vendor_id: 0x10b5, device_id: 0x9733
>> [   57.680501] {1}[Hardware Error]:   class_code: 000406
>> [   57.680502] {1}[Hardware Error]:   bridge: secondary_status: 0x0000, control: 0x0003
> 
> This is a reaction to an ERR_FATAL message, right? What happens if we
> ignore firmware first and override control of the AER masking by setting
> the Unsupported Request Error Mask in the root and downstream ports? You
> can do a quick test like this for the hardware above:
> 
>    # setpci -s 3c:07.0 ECAP_AER+8.l=100000:100000
> 
> You'd probably have to do the same command on the root port BDF, and on any
> other switches you have in the hierarchy.

Then nobody gets the (error) message. You can go a bit further and try 
'pcie_ports=native'. Again, nobody gets the memo. ):

Alex



* Re: [PATCH] nvme-pci: Prevent mmio reads if pci channel offline
  2019-02-26 22:37           ` Alex_Gagniuc
@ 2019-02-27  1:01             ` Linus Torvalds
  2019-02-27 16:42               ` Alex_Gagniuc
  0 siblings, 1 reply; 21+ messages in thread
From: Linus Torvalds @ 2019-02-27  1:01 UTC (permalink / raw)
  To: Alex Gagniuc
  Cc: Keith Busch, Jens Axboe, Sagi Grimberg,
	Linux List Kernel Mailing, linux-nvme, mr.nuke.me,
	Christoph Hellwig, Jon Derrick

On Tue, Feb 26, 2019 at 2:37 PM <Alex_Gagniuc@dellteam.com> wrote:
>
> Then nobody gets the (error) message. You can go a bit further and try
> 'pcie_ports=native". Again, nobody gets the memo. ):

So? The error was bogus to begin with. Why would we care?

Yes, yes, PCI bridges have the ability to return errors in accesses to
non-existent devices. But that was always bogus, and is never useful.
The whole "you get an interrupt or NMI on a bad access" is simply a
horribly broken model. It's not useful.

We already have long depended on hotplug drivers noticing the "oh, I'm
getting all-ff returns, the device may be gone". It's usually trivial,
and works a whole lot better.
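
(As a rough sketch of that pattern, assuming the nvme driver's CSTS register
and a hypothetical helper name:)

	/* Hypothetical helper: a surprise-removed device reads back as all ones. */
	static bool nvme_ctrl_gone(struct nvme_dev *dev)
	{
		u32 csts = readl(dev->bar + NVME_REG_CSTS);

		return csts == ~0U;
	}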

It's not an error. Trying to force it to be an NMI or SCI or machine
check is bogus. It causes horrendous pain, because asynchronous
reporting doesn't work reliably anyway, and *synchronous* reporting is
impossible to sanely handle without crazy problems.

So the only sane model for hotplug devices is "IO still works, and
returns all ones". Maybe with an async one-time and *recoverable*
machine check or other reporting the access after the fact.

Anything else is simply broken. It would be broken even if firmware
wasn't involved, but obviously firmware people tend to often make a
bad situation even worse.

              Linus


* Re: [PATCH] nvme-pci: Prevent mmio reads if pci channel offline
  2019-02-27  1:01             ` Linus Torvalds
@ 2019-02-27 16:42               ` Alex_Gagniuc
  2019-02-27 17:51                 ` Keith Busch
  2019-02-27 17:55                 ` Austin.Bolen
  0 siblings, 2 replies; 21+ messages in thread
From: Alex_Gagniuc @ 2019-02-27 16:42 UTC (permalink / raw)
  To: torvalds
  Cc: keith.busch, axboe, sagi, linux-kernel, linux-nvme, mr.nuke.me,
	hch, jonathan.derrick

On 2/26/19 7:02 PM, Linus Torvalds wrote:
> On Tue, Feb 26, 2019 at 2:37 PM <Alex_Gagniuc@dellteam.com> wrote:
>>
>> Then nobody gets the (error) message. You can go a bit further and try
>> 'pcie_ports=native". Again, nobody gets the memo. ):
> 
> So? The error was bogus to begin with. Why would we care?

Of course nobody cares about that. We care about actual errors that we 
now know we won't be notified of. Imagine if we didn't get the memo that 
a piece of data is corrupt, and imagine the reaction of RAS folk.

And I know the counter to that is a panic() is much more likely to cause 
data corruption, and we're trading one piece of crap for an even 
stinkier one. Whatever we end up doing, we have to do better than 
silence errors and pretend nothing happened.


> Yes, yes, PCI bridges have the ability to return errors in accesses to
> non-existent devices. But that was always bogus, and is never useful.
> The whole "you get an interrupt or NMI on a bad access" is simply a
> horribly broken model. It's not useful.
> 
> We already have long depended on hotplug drivers noticing the "oh, I'm
> getting all-ff returns, the device may be gone". It's usually trivial,
> and works a whole lot better.

And that's been working great, hasn't it? I think you're thinking 
strictly about hotplug. There are other situations where things are all 
F'd, but the hardware isn't sending all F's. (example: ECRC errors)


> It's not an error. Trying to force it to be an NMI or SCI or machine
> check is bogus. It causes horrendous pain, because asynchronous
> reporting doesn't work reliably anyway, and *synchronous* reporting is
> impossible to sanely handle without crazy problems.
> 
> So the only sane model for hotplug devices is "IO still works, and
> returns all ones". Maybe with an async one-time and *recoverable*
> machine check or other reporting the access after the fact.

Exactly!!! A notification (not calling it an 'error') that something 
unusual has happened is good. Treating these things like errors is so 
obvious, even a caveman wouldn't do it.
In a world with FFS, we don't always get to have that model. Oh, FFS!


> Anything else is simply broken. It would be broken even if firmware
> wasn't involved, but obviously firmware people tend to often make a
> bad situation even worse.

Linus, be nice to firmware people. I've met a few, and I can vouch that 
they're very kind and nice. They're also very scared, especially when OS 
people want to ask them a few questions.

I think FFS should get out of the way when OS advertises it's capable of 
handling XYZ. There are some good arguments why this hasn't happened, 
but I won't get into details. I do think it's unlikely that machines 
will be moving back to an OS-controlled model.

And Linus, keep in mind, when these machines were developed, OSes 
couldn't handle recovery properly. None of this was ever an issue. It's 
our fault that we've changed the OS after the machines are on the market.

Alex


* Re: [PATCH] nvme-pci: Prevent mmio reads if pci channel offline
  2019-02-27 16:42               ` Alex_Gagniuc
@ 2019-02-27 17:51                 ` Keith Busch
  2019-02-27 18:07                   ` Alex_Gagniuc
  2019-02-27 17:55                 ` Austin.Bolen
  1 sibling, 1 reply; 21+ messages in thread
From: Keith Busch @ 2019-02-27 17:51 UTC (permalink / raw)
  To: Alex_Gagniuc
  Cc: torvalds, axboe, sagi, linux-kernel, linux-nvme, keith.busch,
	mr.nuke.me, hch, jonathan.derrick

On Wed, Feb 27, 2019 at 04:42:05PM +0000, Alex_Gagniuc@Dellteam.com wrote:
> On 2/26/19 7:02 PM, Linus Torvalds wrote:
> > On Tue, Feb 26, 2019 at 2:37 PM <Alex_Gagniuc@dellteam.com> wrote:
> >>
> >> Then nobody gets the (error) message. You can go a bit further and try
> >> 'pcie_ports=native". Again, nobody gets the memo. ):
> > 
> > So? The error was bogus to begin with. Why would we care?
> 
> Of course nobody cares about that. We care about actual errors that we 
> now know we won't be notified of. Imagine if we didn't get the memo that 
> a piece of data is corrupt, and imagine the reaction of RAS folk.
> 
> And I know the counter to that is a panic() is much more likely to cause 
> data corruption, and we're trading one piece of crap for an even 
> stinkier one. Whatever we end up doing, we have to do better than 
> silence errors and pretend nothing happened.
> 
> 
> > Yes, yes, PCI bridges have the ability to return errors in accesses to
> > non-existent devices. But that was always bogus, and is never useful.
> > The whole "you get an interrupt or NMI on a bad access" is simply a
> > horribly broken model. It's not useful.
> > 
> > We already have long depended on hotplug drivers noticing the "oh, I'm
> > getting all-ff returns, the device may be gone". It's usually trivial,
> > and works a whole lot better.
> 
> And that's been working great, hasn't it? I think you're thinking 
> strictly about hotplug. There are other situations where things are all 
> F'd, but the hardware isn't sending all F's. (example: ECRC errors)
> 
> 
> > It's not an error. Trying to force it to be an NMI or SCI or machine
> > check is bogus. It causes horrendous pain, because asynchronous
> > reporting doesn't work reliably anyway, and *synchronous* reporting is
> > impossible to sanely handle without crazy problems.
> > 
> > So the only sane model for hotplug devices is "IO still works, and
> > returns all ones". Maybe with an async one-time and *recoverable*
> > machine check or other reporting the access after the fact.
> 
> Exactly!!! A notification (not calling it an 'error') that something 
> unusual has happened is good. Treating these things like errors is so 
> obvious, even a caveman wouldn't do it.
> In a world with FFS, we don't always get to have that model. Oh, FFS!
> 
> 
> > Anything else is simply broken. It would be broken even if firmware
> > wasn't involved, but obviously firmware people tend to often make a
> > bad situation even worse.
> 
> Linus, be nice to firmware people. I've met a few, and I can vouch that 
> they're very kind and nice. They're also very scared, especially when OS 
> people want to ask them a few questions.
> 
> I think FFS should get out of the way when OS advertises it's capable of 
> handling XYZ. There are some good arguments why this hasn't happened, 
> but I won't get into details. I do think it's unlikely that machines 
> will be moving back to an OS-controlled model.
> 
> And Linus, keep in mind, when these machines were developed, OSes 
> couldn't handle recovery properly. None of this was ever an issue. It's 
> our fault that we've changed the OS after the machines are on the market.
> 
> Alex

I can't tell where you're going with this. It doesn't sound like you're
talking about hotplug anymore, at least.


* Re: [PATCH] nvme-pci: Prevent mmio reads if pci channel offline
  2019-02-27 16:42               ` Alex_Gagniuc
  2019-02-27 17:51                 ` Keith Busch
@ 2019-02-27 17:55                 ` Austin.Bolen
  2019-02-27 20:04                   ` Austin.Bolen
  1 sibling, 1 reply; 21+ messages in thread
From: Austin.Bolen @ 2019-02-27 17:55 UTC (permalink / raw)
  To: Alex_Gagniuc, torvalds
  Cc: axboe, sagi, linux-kernel, linux-nvme, keith.busch, mr.nuke.me,
	hch, jonathan.derrick

On 2/27/2019 10:42 AM, Gagniuc, Alexandru - Dell Team wrote:
> On 2/26/19 7:02 PM, Linus Torvalds wrote:
>> On Tue, Feb 26, 2019 at 2:37 PM <Alex_Gagniuc@dellteam.com> wrote:
>>>
>>> Then nobody gets the (error) message. You can go a bit further and try
>>> 'pcie_ports=native". Again, nobody gets the memo. ):
>>
>> So? The error was bogus to begin with. Why would we care?
> 
> Of course nobody cares about that. We care about actual errors that we
> now know we won't be notified of. Imagine if we didn't get the memo that
> a piece of data is corrupt, and imagine the reaction of RAS folk.
> 
> And I know the counter to that is a panic() is much more likely to cause
> data corruption, and we're trading one piece of crap for an even
> stinkier one. Whatever we end up doing, we have to do better than
> silence errors and pretend nothing happened.
> 
> 
>> Yes, yes, PCI bridges have the ability to return errors in accesses to
>> non-existent devices. But that was always bogus, and is never useful.
>> The whole "you get an interrupt or NMI on a bad access" is simply a
>> horribly broken model. It's not useful.
>>
>> We already have long depended on hotplug drivers noticing the "oh, I'm
>> getting all-ff returns, the device may be gone". It's usually trivial,
>> and works a whole lot better.
> 
> And that's been working great, hasn't it? I think you're thinking
> strictly about hotplug. There are other situations where things are all
> F'd, but the hardware isn't sending all F's. (example: ECRC errors)
> 
> 
>> It's not an error. Trying to force it to be an NMI or SCI or machine
>> check is bogus. It causes horrendous pain, because asynchronous
>> reporting doesn't work reliably anyway, and *synchronous* reporting is
>> impossible to sanely handle without crazy problems.
>>
>> So the only sane model for hotplug devices is "IO still works, and
>> returns all ones". Maybe with an async one-time and *recoverable*
>> machine check or other reporting the access after the fact.
> 
> Exactly!!! A notification (not calling it an 'error') that something
> unusual has happened is good. Treating these things like errors is so
> obvious, even a caveman wouldn't do it.
> In a world with FFS, we don't always get to have that model. Oh, FFS!
> 
> 
>> Anything else is simply broken. It would be broken even if firmware
>> wasn't involved, but obviously firmware people tend to often make a
>> bad situation even worse.
> 
> Linus, be nice to firmware people. I've met a few, and I can vouch that
> they're very kind and nice. They're also very scared, especially when OS
> people want to ask them a few questions.
> 
> I think FFS should get out of the way when OS advertises it's capable of
> handling XYZ. There are some good arguments why this hasn't happened,
> but I won't get into details. I do think it's unlikely that machines
> will be moving back to an OS-controlled model.
> 
> And Linus, keep in mind, when these machines were developed, OSes
> couldn't handle recovery properly. None of this was ever an issue. It's
> our fault that we've changed the OS after the machines are on the market.

Just to add some background on these particular systems... at the time 
they were designed none of the Linux distros we use supported the PCI 
Error Recovery Services or maybe they did and we simply didn't know 
about it.  We found out rather by accident after the systems were 
shipped that Linux had this ability.  It was a pleasant surprise as 
we've been asking OSVs to try to recover from PCI/PCIe errors for years. 
Linux is the first (and still only) OS we use that has a PCIe 
error recovery model so kudos on that!

100% agree that crashing the system on PCIe errors like this is terrible 
and we want to get to a recovery model.  It was too late for the 
generation of systems being discussed as it is a big paradigm shift and 
requires lots of changes/validation that folks are not comfortable doing on 
shipping products.  But it's coming in future products.

We also noticed that there was no mechanism in the recovery models for 
system firmware and the OS to interact and learn whether recovery services 
are available, no way for the OS to inform the platform whether error recovery 
was successful or not, and no way to establish control of Downstream Port 
Containment.  This model - which PCI-SIG is calling Containment Error 
Recovery - has been added in the relevant PCI-SIG documents and ACPI 
spec and I believe Intel will start pushing patches soon.  Hopefully 
this will provide the industry standard solution needed to get to a full 
error recovery model for PCIe-related errors.

There are other hardware additions in PCIe that could help with 
synchronizing errors such as the RP PIO synchronous exception handling 
added to PCIe as part of eDPC.  Hardware is sparse but shipping AMD 
Naples products already support this new hardware.  I expect more 
hardware to support it as the industry shifts to the Downstream Port 
Containment/Containment Error Recovery model.

For these issues on existing platforms, I'm not sure what could be done. 
  Continuing to crash may be necessary until the CER solution is in place.

BTW, this patch in particular is complaining about an error for a 
removed device.  The Dell servers referenced in this chain will check if 
the device is removed and if so it will suppress the error so I don't 
think they are susceptible to this particular issue and I agree it is 
broken if they do.  If that is the case we can and will fix it in firmware.

> 
> Alex
> 



* Re: [PATCH] nvme-pci: Prevent mmio reads if pci channel offline
  2019-02-27 17:51                 ` Keith Busch
@ 2019-02-27 18:07                   ` Alex_Gagniuc
  0 siblings, 0 replies; 21+ messages in thread
From: Alex_Gagniuc @ 2019-02-27 18:07 UTC (permalink / raw)
  To: kbusch
  Cc: torvalds, axboe, sagi, linux-kernel, linux-nvme, keith.busch,
	mr.nuke.me, hch, jonathan.derrick

On 2/27/19 11:51 AM, Keith Busch wrote:
> I can't tell where you're going with this. It doesn't sound like you're
> talking about hotplug anymore, at least.

We're trying to fix an issue related to hotplug. However, the proposed 
fixes may have unintended consequences and side-effects. I want to make 
sure that we're aware of those potential problems.

Alex




* Re: [PATCH] nvme-pci: Prevent mmio reads if pci channel offline
  2019-02-27 17:55                 ` Austin.Bolen
@ 2019-02-27 20:04                   ` Austin.Bolen
  2019-02-28 14:16                     ` Christoph Hellwig
  0 siblings, 1 reply; 21+ messages in thread
From: Austin.Bolen @ 2019-02-27 20:04 UTC (permalink / raw)
  To: Austin.Bolen, Alex_Gagniuc, torvalds
  Cc: keith.busch, sagi, linux-kernel, linux-nvme, axboe, mr.nuke.me,
	hch, jonathan.derrick

On 2/27/2019 11:56 AM, Bolen, Austin wrote:
> 
> BTW, this patch in particular is complaining about an error for a
> removed device.  The Dell servers referenced in this chain will check if
> the device is removed and if so it will suppress the error so I don't
> think they are susceptible to this particular issue and I agree it is
> broken if they do.  If that is the case we can and will fix it in firmware.
> 

Confirmed this issue does not apply to the referenced Dell servers so I 
don't have a stake in how this should be handled for those systems. 
It may be they just don't support surprise removal.  I know in our case 
all the Linux distributions we qualify (RHEL, SLES, Ubuntu Server) have 
told us they do not support surprise removal.  So I'm guessing that any 
issues found with surprise removal could potentially fall under the 
category of "unsupported".

Still though, the larger issue of recovering from other types of PCIe 
errors that are not due to device removal is still important.  I would 
expect many systems from many platform makers to not be able to recover 
PCIe errors in general and hopefully the new DPC CER model will help 
address this and provide added protection for cases like above as well.

Thanks,
Austin


* Re: [PATCH] nvme-pci: Prevent mmio reads if pci channel offline
  2019-02-27 20:04                   ` Austin.Bolen
@ 2019-02-28 14:16                     ` Christoph Hellwig
  2019-02-28 23:10                       ` Austin.Bolen
  0 siblings, 1 reply; 21+ messages in thread
From: Christoph Hellwig @ 2019-02-28 14:16 UTC (permalink / raw)
  To: Austin.Bolen
  Cc: Alex_Gagniuc, torvalds, keith.busch, sagi, linux-kernel,
	linux-nvme, axboe, mr.nuke.me, hch, jonathan.derrick

On Wed, Feb 27, 2019 at 08:04:35PM +0000, Austin.Bolen@dell.com wrote:
> Confirmed this issue does not apply to the referenced Dell servers so I 
> don't have a stake in how this should be handled for those systems. 
> It may be they just don't support surprise removal.  I know in our case 
> all the Linux distributions we qualify (RHEL, SLES, Ubuntu Server) have 
> told us they do not support surprise removal.  So I'm guessing that any 
> issues found with surprise removal could potentially fall under the 
> category of "unsupported".
> 
> Still though, the larger issue of recovering from other types of PCIe 
> errors that are not due to device removal is still important.  I would 
> expect many systems from many platform makers to not be able to recover 
> PCIe errors in general and hopefully the new DPC CER model will help 
> address this and provide added protection for cases like above as well.

FYI, a related issue I saw about a year or two ago with Dell servers,
with a dual ported NVMe add-in (non U.2) card, is that once you did
a subsystem reset, which would cause both controllers to retrain the link,
you'd run into a Firmware First error handling issue that would instantly
crash the system.  I don't really have the hardware anymore, but the
end result was that I think the affected product ended up shipping
with subsystem resets only enabled for the U.2 form factor.


* Re: [PATCH] nvme-pci: Prevent mmio reads if pci channel offline
  2019-02-28 14:16                     ` Christoph Hellwig
@ 2019-02-28 23:10                       ` Austin.Bolen
  2019-02-28 23:20                         ` Keith Busch
  0 siblings, 1 reply; 21+ messages in thread
From: Austin.Bolen @ 2019-02-28 23:10 UTC (permalink / raw)
  To: hch, Austin.Bolen
  Cc: Alex_Gagniuc, torvalds, keith.busch, sagi, linux-kernel,
	linux-nvme, axboe, mr.nuke.me, hch, jonathan.derrick

On 2/28/2019 8:17 AM, Christoph Hellwig wrote:
> On Wed, Feb 27, 2019 at 08:04:35PM +0000, Austin.Bolen@dell.com wrote:
>> Confirmed this issue does not apply to the referenced Dell servers so I
>> don't have a stake in how this should be handled for those systems.
>> It may be they just don't support surprise removal.  I know in our case
>> all the Linux distributions we qualify (RHEL, SLES, Ubuntu Server) have
>> told us they do not support surprise removal.  So I'm guessing that any
>> issues found with surprise removal could potentially fall under the
>> category of "unsupported".
>>
>> Still though, the larger issue of recovering from other types of PCIe
>> errors that are not due to device removal is still important.  I would
>> expect many systems from many platform makers to not be able to recover
>> PCIe errors in general and hopefully the new DPC CER model will help
>> address this and provide added protection for cases like above as well.
> 
> FYI, a related issue I saw about a year or two ago with Dell servers,
> with a dual ported NVMe add-in (non U.2) card, is that once you did
> a subsystem reset, which would cause both controllers to retrain the link,
> you'd run into a Firmware First error handling issue that would instantly
> crash the system.  I don't really have the hardware anymore, but the
> end result was that I think the affected product ended up shipping
> with subsystem resets only enabled for the U.2 form factor.
> 

Yes, that's another good one.  For add-in cards, they are not 
hot-pluggable and so the platform will not set the Hot-Plug Surprise bit 
in the port above them.  So when the surprise link down happens the 
platform will generate a fatal error.  For U.2, the Hot-Plug Surprise 
bit is set on these platforms, which suppresses the fatal error.  It's ok 
to suppress in this case since the OS will get notified via a hot-plug 
interrupt.  In the case of the add-in card there is no hot-plug 
interrupt, so the platform has no idea whether the OS will handle the 
surprise link down, and it has to err on the side of caution. 
This is another case where the new containment error recovery model 
will help by allowing the platform to know whether the OS can recover from this 
error or not.

Even if the system sets the Hot-Plug Surprise bit, the system can still 
crater if OS does an NSSR and then some sort of MMIO is generated to the 
downed port.  Platforms that suppress errors for removed devices will 
still escalate this error as fatal since the device is still present. 
But again the error containment model should protect this case as well.

I'd also note that in PCIe, things that intentionally take the link down 
like SBR or Link Disable suppress surprise down error reporting.  But 
NSSR doesn't have this requirement to suppress surprise down reporting. 
I think that's a gap on the part of the NVMe spec.

-Austin


* Re: [PATCH] nvme-pci: Prevent mmio reads if pci channel offline
  2019-02-28 23:10                       ` Austin.Bolen
@ 2019-02-28 23:20                         ` Keith Busch
  2019-02-28 23:43                           ` Austin.Bolen
  0 siblings, 1 reply; 21+ messages in thread
From: Keith Busch @ 2019-02-28 23:20 UTC (permalink / raw)
  To: Austin.Bolen
  Cc: hch, Alex_Gagniuc, torvalds, keith.busch, sagi, linux-kernel,
	linux-nvme, axboe, mr.nuke.me, hch, jonathan.derrick

On Thu, Feb 28, 2019 at 11:10:11PM +0000, Austin.Bolen@dell.com wrote:
> I'd also note that in PCIe, things that intentionally take the link down 
> like SBR or Link Disable suppress surprise down error reporting.  But 
> NSSR doesn't have this requirement to suppress surprise down reporting. 
> I think that's a gap on the part of the NVMe spec.

SBR and Link Disable are done from the downstream port, though, so the
host can still communicate with the function that took the link down.
That's entirely different than taking the link down from the end device,
so I'm not sure how NVMe can fix that.

But I can't even recall why NVMe defined NSSR to require PCIe LTSSM
Detect. That seems entirely unnecessary and is just asking for trouble.


* Re: [PATCH] nvme-pci: Prevent mmio reads if pci channel offline
  2019-02-28 23:20                         ` Keith Busch
@ 2019-02-28 23:43                           ` Austin.Bolen
  2019-03-01  0:30                             ` Keith Busch
  0 siblings, 1 reply; 21+ messages in thread
From: Austin.Bolen @ 2019-02-28 23:43 UTC (permalink / raw)
  To: kbusch, Austin.Bolen
  Cc: hch, Alex_Gagniuc, torvalds, keith.busch, sagi, linux-kernel,
	linux-nvme, axboe, mr.nuke.me, hch, jonathan.derrick

On 2/28/2019 5:20 PM, Keith Busch wrote:
> On Thu, Feb 28, 2019 at 11:10:11PM +0000, Austin.Bolen@dell.com wrote:
>> I'd also note that in PCIe, things that intentionally take the link down
>> like SBR or Link Disable suppress surprise down error reporting.  But
>> NSSR doesn't have this requirement to suppress surprise down reporting.
>> I think that's a gap on the part of the NVMe spec.
> 
> SBR and Link Disable are done from the down stream port, though, so the
> host can still communicate with the function that took the link down.
> That's entirely different than taking the link down from the end device,
> so I'm not sure how NVMe can fix that.
> 

Agreed it is different.  Here is one way they could have solved it: the host 
writes a magic value to NSSRC but the device latches this instead of 
resetting.  Then require the host to do an SBR.  When the device sees the 
SBR with the magic value in NSSRC it does an NSSR.

> But I can't even recall why NVMe defined NSSR to require PCIe LTSSM
> Detect. That seems entirely unnecessary and is just asking for trouble.
> 

Not sure why but maybe they wanted a full reset of everything including 
the PCIe block.  I can ask around.  Also agree it is asking for trouble 
to have the device take the link down without the parent bridge knowing what's 
going on.  There are new mechanisms being proposed in NVMe that would 
also allow the device to initiate link down with no co-ordination with 
the parent bridge, so we may need to think of ways to avoid this issue for 
these new, similar mechanisms.

-Austin


* Re: [PATCH] nvme-pci: Prevent mmio reads if pci channel offline
  2019-02-28 23:43                           ` Austin.Bolen
@ 2019-03-01  0:30                             ` Keith Busch
  2019-03-01  1:52                               ` Austin.Bolen
  0 siblings, 1 reply; 21+ messages in thread
From: Keith Busch @ 2019-03-01  0:30 UTC (permalink / raw)
  To: Austin.Bolen
  Cc: hch, Alex_Gagniuc, torvalds, keith.busch, sagi, linux-kernel,
	linux-nvme, axboe, mr.nuke.me, hch, jonathan.derrick

On Thu, Feb 28, 2019 at 11:43:46PM +0000, Austin.Bolen@dell.com wrote:
> On 2/28/2019 5:20 PM, Keith Busch wrote:
> > SBR and Link Disable are done from the down stream port, though, so the
> > host can still communicate with the function that took the link down.
> > That's entirely different than taking the link down from the end device,
> > so I'm not sure how NVMe can fix that.
> > 
> 
> Agreed it is different.  Here is one way they could have solved it: host 
> writes magic value to NSSRC but device latches this instead of 
> resetting.  Then require host to do SBR.  When device sees SBR with 
> magic value in NSSRC it does an NSSR.

For single port drives, yes, but that wouldn't work so well for multi-port
devices connected to different busses, maybe even across multiple hosts.
The equivalent of an FLR across all ports should have been sufficient,
IMO.


* Re: [PATCH] nvme-pci: Prevent mmio reads if pci channel offline
  2019-03-01  0:30                             ` Keith Busch
@ 2019-03-01  1:52                               ` Austin.Bolen
  0 siblings, 0 replies; 21+ messages in thread
From: Austin.Bolen @ 2019-03-01  1:52 UTC (permalink / raw)
  To: kbusch, Austin.Bolen
  Cc: hch, Alex_Gagniuc, torvalds, keith.busch, sagi, linux-kernel,
	linux-nvme, axboe, mr.nuke.me, hch, jonathan.derrick

On 2/28/2019 6:30 PM, Keith Busch wrote:
> 
> For single port drives, yes, but that wouldn't work so well for multi-port
> devices connected to different busses, maybe even across multiple hosts.
> The equivalent of an FLR across all ports should have been sufficient,
> IMO.
> 

In that case I'd argue that it really is a surprise down to the other 
host and escalating as such is appropriate.  Agree that something like 
FLR that reset everything in the subsystem but kept link up may have 
been sufficient.

