linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH v2 1/2] PCI/AER: Disable AER service when link is in L2/L3 ready, L2 and L3 state
@ 2022-01-27  2:54 Kai-Heng Feng
  2022-01-27  2:54 ` [PATCH v2 2/2] PCI/DPC: Disable DPC " Kai-Heng Feng
                   ` (3 more replies)
  0 siblings, 4 replies; 12+ messages in thread
From: Kai-Heng Feng @ 2022-01-27  2:54 UTC (permalink / raw)
  To: bhelgaas
  Cc: mika.westerberg, koba.ko, Kai-Heng Feng, Russell Currey,
	Oliver O'Halloran, Lalithambika Krishnakumar, Lu Baolu,
	Joerg Roedel, linuxppc-dev, linux-pci, linux-kernel

Commit 50310600ebda ("iommu/vt-d: Enable PCI ACS for platform opt in
hint") enables ACS, and some platforms lose its NVMe after resume from
S3:
[   50.947816] pcieport 0000:00:1b.0: DPC: containment event, status:0x1f01 source:0x0000
[   50.947817] pcieport 0000:00:1b.0: DPC: unmasked uncorrectable error detected
[   50.947829] pcieport 0000:00:1b.0: PCIe Bus Error: severity=Uncorrected (Non-Fatal), type=Transaction Layer, (Receiver ID)
[   50.947830] pcieport 0000:00:1b.0:   device [8086:06ac] error status/mask=00200000/00010000
[   50.947831] pcieport 0000:00:1b.0:    [21] ACSViol                (First)
[   50.947841] pcieport 0000:00:1b.0: AER: broadcast error_detected message
[   50.947843] nvme nvme0: frozen state error detected, reset controller

It happens right after ACS gets enabled during resume.

There's another case, when Thunderbolt reaches D3cold:
[   30.100211] pcieport 0000:00:1d.0: AER: Uncorrected (Non-Fatal) error received: 0000:00:1d.0
[   30.100251] pcieport 0000:00:1d.0: PCIe Bus Error: severity=Uncorrected (Non-Fatal), type=Transaction Layer, (Requester ID)
[   30.100256] pcieport 0000:00:1d.0:   device [8086:7ab0] error status/mask=00100000/00004000
[   30.100262] pcieport 0000:00:1d.0:    [20] UnsupReq               (First)
[   30.100267] pcieport 0000:00:1d.0: AER:   TLP Header: 34000000 08000052 00000000 00000000
[   30.100372] thunderbolt 0000:0a:00.0: AER: can't recover (no error_detected callback)
[   30.100401] xhci_hcd 0000:3e:00.0: AER: can't recover (no error_detected callback)
[   30.100427] pcieport 0000:00:1d.0: AER: device recovery failed

So disable AER service to avoid the noises from turning power rails
on/off when the device is in low power states (D3hot and D3cold), as
PCIe spec "5.2 Link State Power Management" states that TLP and DLLP
transmission is disabled for a Link in L2/L3 Ready (D3hot), L2 (D3cold
with aux power) and L3 (D3cold).

Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=209149
Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=215453
Fixes: 50310600ebda ("iommu/vt-d: Enable PCI ACS for platform opt in hint")
Signed-off-by: Kai-Heng Feng <kai.heng.feng@canonical.com>
---
v2:
 - Wording change.

 drivers/pci/pcie/aer.c | 31 +++++++++++++++++++++++++------
 1 file changed, 25 insertions(+), 6 deletions(-)

diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c
index 9fa1f97e5b270..e4e9d4a3098d7 100644
--- a/drivers/pci/pcie/aer.c
+++ b/drivers/pci/pcie/aer.c
@@ -1367,6 +1367,22 @@ static int aer_probe(struct pcie_device *dev)
 	return 0;
 }
 
+static int aer_suspend(struct pcie_device *dev)
+{
+	struct aer_rpc *rpc = get_service_data(dev);
+
+	aer_disable_rootport(rpc);
+	return 0;
+}
+
+static int aer_resume(struct pcie_device *dev)
+{
+	struct aer_rpc *rpc = get_service_data(dev);
+
+	aer_enable_rootport(rpc);
+	return 0;
+}
+
 /**
  * aer_root_reset - reset Root Port hierarchy, RCEC, or RCiEP
  * @dev: pointer to Root Port, RCEC, or RCiEP
@@ -1433,12 +1449,15 @@ static pci_ers_result_t aer_root_reset(struct pci_dev *dev)
 }
 
 static struct pcie_port_service_driver aerdriver = {
-	.name		= "aer",
-	.port_type	= PCIE_ANY_PORT,
-	.service	= PCIE_PORT_SERVICE_AER,
-
-	.probe		= aer_probe,
-	.remove		= aer_remove,
+	.name			= "aer",
+	.port_type		= PCIE_ANY_PORT,
+	.service		= PCIE_PORT_SERVICE_AER,
+	.probe			= aer_probe,
+	.suspend		= aer_suspend,
+	.resume			= aer_resume,
+	.runtime_suspend	= aer_suspend,
+	.runtime_resume		= aer_resume,
+	.remove			= aer_remove,
 };
 
 /**
-- 
2.33.1


^ permalink raw reply related	[flat|nested] 12+ messages in thread

* [PATCH v2 2/2] PCI/DPC: Disable DPC service when link is in L2/L3 ready, L2 and L3 state
  2022-01-27  2:54 [PATCH v2 1/2] PCI/AER: Disable AER service when link is in L2/L3 ready, L2 and L3 state Kai-Heng Feng
@ 2022-01-27  2:54 ` Kai-Heng Feng
  2022-01-27  6:31   ` Mika Westerberg
  2022-03-19 20:40   ` Sathyanarayanan Kuppuswamy
  2022-01-27  6:30 ` [PATCH v2 1/2] PCI/AER: Disable AER " Mika Westerberg
                   ` (2 subsequent siblings)
  3 siblings, 2 replies; 12+ messages in thread
From: Kai-Heng Feng @ 2022-01-27  2:54 UTC (permalink / raw)
  To: bhelgaas
  Cc: mika.westerberg, koba.ko, Kai-Heng Feng, Russell Currey,
	Oliver O'Halloran, linuxppc-dev, linux-pci, linux-kernel

Since TLP and DLLP transmission is disabled for a Link in L2/L3 Ready,
L2 and L3 (i.e. device in D3hot and D3cold), and DPC depends on AER, so
also disable DPC here.

Signed-off-by: Kai-Heng Feng <kai.heng.feng@canonical.com>
---
v2:
 - Wording change.
 - Empty line dropped.

 drivers/pci/pcie/dpc.c | 60 +++++++++++++++++++++++++++++++-----------
 1 file changed, 44 insertions(+), 16 deletions(-)

diff --git a/drivers/pci/pcie/dpc.c b/drivers/pci/pcie/dpc.c
index 3e9afee02e8d1..414258967f08e 100644
--- a/drivers/pci/pcie/dpc.c
+++ b/drivers/pci/pcie/dpc.c
@@ -343,13 +343,33 @@ void pci_dpc_init(struct pci_dev *pdev)
 	}
 }
 
+static void dpc_enable(struct pcie_device *dev)
+{
+	struct pci_dev *pdev = dev->port;
+	u16 ctl;
+
+	pci_read_config_word(pdev, pdev->dpc_cap + PCI_EXP_DPC_CTL, &ctl);
+	ctl = (ctl & 0xfff4) | PCI_EXP_DPC_CTL_EN_FATAL | PCI_EXP_DPC_CTL_INT_EN;
+	pci_write_config_word(pdev, pdev->dpc_cap + PCI_EXP_DPC_CTL, ctl);
+}
+
+static void dpc_disable(struct pcie_device *dev)
+{
+	struct pci_dev *pdev = dev->port;
+	u16 ctl;
+
+	pci_read_config_word(pdev, pdev->dpc_cap + PCI_EXP_DPC_CTL, &ctl);
+	ctl &= ~(PCI_EXP_DPC_CTL_EN_FATAL | PCI_EXP_DPC_CTL_INT_EN);
+	pci_write_config_word(pdev, pdev->dpc_cap + PCI_EXP_DPC_CTL, ctl);
+}
+
 #define FLAG(x, y) (((x) & (y)) ? '+' : '-')
 static int dpc_probe(struct pcie_device *dev)
 {
 	struct pci_dev *pdev = dev->port;
 	struct device *device = &dev->device;
 	int status;
-	u16 ctl, cap;
+	u16 cap;
 
 	if (!pcie_aer_is_native(pdev) && !pcie_ports_dpc_native)
 		return -ENOTSUPP;
@@ -364,10 +384,7 @@ static int dpc_probe(struct pcie_device *dev)
 	}
 
 	pci_read_config_word(pdev, pdev->dpc_cap + PCI_EXP_DPC_CAP, &cap);
-	pci_read_config_word(pdev, pdev->dpc_cap + PCI_EXP_DPC_CTL, &ctl);
-
-	ctl = (ctl & 0xfff4) | PCI_EXP_DPC_CTL_EN_FATAL | PCI_EXP_DPC_CTL_INT_EN;
-	pci_write_config_word(pdev, pdev->dpc_cap + PCI_EXP_DPC_CTL, ctl);
+	dpc_enable(dev);
 	pci_info(pdev, "enabled with IRQ %d\n", dev->irq);
 
 	pci_info(pdev, "error containment capabilities: Int Msg #%d, RPExt%c PoisonedTLP%c SwTrigger%c RP PIO Log %d, DL_ActiveErr%c\n",
@@ -380,22 +397,33 @@ static int dpc_probe(struct pcie_device *dev)
 	return status;
 }
 
-static void dpc_remove(struct pcie_device *dev)
+static int dpc_suspend(struct pcie_device *dev)
 {
-	struct pci_dev *pdev = dev->port;
-	u16 ctl;
+	dpc_disable(dev);
+	return 0;
+}
 
-	pci_read_config_word(pdev, pdev->dpc_cap + PCI_EXP_DPC_CTL, &ctl);
-	ctl &= ~(PCI_EXP_DPC_CTL_EN_FATAL | PCI_EXP_DPC_CTL_INT_EN);
-	pci_write_config_word(pdev, pdev->dpc_cap + PCI_EXP_DPC_CTL, ctl);
+static int dpc_resume(struct pcie_device *dev)
+{
+	dpc_enable(dev);
+	return 0;
+}
+
+static void dpc_remove(struct pcie_device *dev)
+{
+	dpc_disable(dev);
 }
 
 static struct pcie_port_service_driver dpcdriver = {
-	.name		= "dpc",
-	.port_type	= PCIE_ANY_PORT,
-	.service	= PCIE_PORT_SERVICE_DPC,
-	.probe		= dpc_probe,
-	.remove		= dpc_remove,
+	.name			= "dpc",
+	.port_type		= PCIE_ANY_PORT,
+	.service		= PCIE_PORT_SERVICE_DPC,
+	.probe			= dpc_probe,
+	.suspend		= dpc_suspend,
+	.resume			= dpc_resume,
+	.runtime_suspend	= dpc_suspend,
+	.runtime_resume		= dpc_resume,
+	.remove			= dpc_remove,
 };
 
 int __init pcie_dpc_init(void)
-- 
2.33.1


^ permalink raw reply related	[flat|nested] 12+ messages in thread

* Re: [PATCH v2 1/2] PCI/AER: Disable AER service when link is in L2/L3 ready, L2 and L3 state
  2022-01-27  2:54 [PATCH v2 1/2] PCI/AER: Disable AER service when link is in L2/L3 ready, L2 and L3 state Kai-Heng Feng
  2022-01-27  2:54 ` [PATCH v2 2/2] PCI/DPC: Disable DPC " Kai-Heng Feng
@ 2022-01-27  6:30 ` Mika Westerberg
  2022-01-27  7:01 ` Lu Baolu
  2022-03-19 20:38 ` Sathyanarayanan Kuppuswamy
  3 siblings, 0 replies; 12+ messages in thread
From: Mika Westerberg @ 2022-01-27  6:30 UTC (permalink / raw)
  To: Kai-Heng Feng
  Cc: bhelgaas, koba.ko, Russell Currey, Oliver O'Halloran,
	Lalithambika Krishnakumar, Lu Baolu, Joerg Roedel, linuxppc-dev,
	linux-pci, linux-kernel

On Thu, Jan 27, 2022 at 10:54:17AM +0800, Kai-Heng Feng wrote:
> Commit 50310600ebda ("iommu/vt-d: Enable PCI ACS for platform opt in
> hint") enables ACS, and some platforms lose its NVMe after resume from
> S3:
> [   50.947816] pcieport 0000:00:1b.0: DPC: containment event, status:0x1f01 source:0x0000
> [   50.947817] pcieport 0000:00:1b.0: DPC: unmasked uncorrectable error detected
> [   50.947829] pcieport 0000:00:1b.0: PCIe Bus Error: severity=Uncorrected (Non-Fatal), type=Transaction Layer, (Receiver ID)
> [   50.947830] pcieport 0000:00:1b.0:   device [8086:06ac] error status/mask=00200000/00010000
> [   50.947831] pcieport 0000:00:1b.0:    [21] ACSViol                (First)
> [   50.947841] pcieport 0000:00:1b.0: AER: broadcast error_detected message
> [   50.947843] nvme nvme0: frozen state error detected, reset controller
> 
> It happens right after ACS gets enabled during resume.
> 
> There's another case, when Thunderbolt reaches D3cold:
> [   30.100211] pcieport 0000:00:1d.0: AER: Uncorrected (Non-Fatal) error received: 0000:00:1d.0
> [   30.100251] pcieport 0000:00:1d.0: PCIe Bus Error: severity=Uncorrected (Non-Fatal), type=Transaction Layer, (Requester ID)
> [   30.100256] pcieport 0000:00:1d.0:   device [8086:7ab0] error status/mask=00100000/00004000
> [   30.100262] pcieport 0000:00:1d.0:    [20] UnsupReq               (First)
> [   30.100267] pcieport 0000:00:1d.0: AER:   TLP Header: 34000000 08000052 00000000 00000000
> [   30.100372] thunderbolt 0000:0a:00.0: AER: can't recover (no error_detected callback)
> [   30.100401] xhci_hcd 0000:3e:00.0: AER: can't recover (no error_detected callback)
> [   30.100427] pcieport 0000:00:1d.0: AER: device recovery failed
> 
> So disable AER service to avoid the noises from turning power rails
> on/off when the device is in low power states (D3hot and D3cold), as
> PCIe spec "5.2 Link State Power Management" states that TLP and DLLP
> transmission is disabled for a Link in L2/L3 Ready (D3hot), L2 (D3cold
> with aux power) and L3 (D3cold).
> 
> Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=209149
> Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=215453
> Fixes: 50310600ebda ("iommu/vt-d: Enable PCI ACS for platform opt in hint")
> Signed-off-by: Kai-Heng Feng <kai.heng.feng@canonical.com>

Reviewed-by: Mika Westerberg <mika.westerberg@linux.intel.com>

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH v2 2/2] PCI/DPC: Disable DPC service when link is in L2/L3 ready, L2 and L3 state
  2022-01-27  2:54 ` [PATCH v2 2/2] PCI/DPC: Disable DPC " Kai-Heng Feng
@ 2022-01-27  6:31   ` Mika Westerberg
  2022-03-19 20:40   ` Sathyanarayanan Kuppuswamy
  1 sibling, 0 replies; 12+ messages in thread
From: Mika Westerberg @ 2022-01-27  6:31 UTC (permalink / raw)
  To: Kai-Heng Feng
  Cc: bhelgaas, koba.ko, Russell Currey, Oliver O'Halloran,
	linuxppc-dev, linux-pci, linux-kernel

On Thu, Jan 27, 2022 at 10:54:18AM +0800, Kai-Heng Feng wrote:
> Since TLP and DLLP transmission is disabled for a Link in L2/L3 Ready,
> L2 and L3 (i.e. device in D3hot and D3cold), and DPC depends on AER, so
> also disable DPC here.
> 
> Signed-off-by: Kai-Heng Feng <kai.heng.feng@canonical.com>

Reviewed-by: Mika Westerberg <mika.westerberg@linux.intel.com>

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH v2 1/2] PCI/AER: Disable AER service when link is in L2/L3 ready, L2 and L3 state
  2022-01-27  2:54 [PATCH v2 1/2] PCI/AER: Disable AER service when link is in L2/L3 ready, L2 and L3 state Kai-Heng Feng
  2022-01-27  2:54 ` [PATCH v2 2/2] PCI/DPC: Disable DPC " Kai-Heng Feng
  2022-01-27  6:30 ` [PATCH v2 1/2] PCI/AER: Disable AER " Mika Westerberg
@ 2022-01-27  7:01 ` Lu Baolu
  2022-01-27 11:14   ` Kai-Heng Feng
  2022-03-19 20:38 ` Sathyanarayanan Kuppuswamy
  3 siblings, 1 reply; 12+ messages in thread
From: Lu Baolu @ 2022-01-27  7:01 UTC (permalink / raw)
  To: Kai-Heng Feng, bhelgaas
  Cc: baolu.lu, mika.westerberg, koba.ko, Russell Currey,
	Oliver O'Halloran, Lalithambika Krishnakumar, Joerg Roedel,
	linuxppc-dev, linux-pci, linux-kernel

On 2022/1/27 10:54, Kai-Heng Feng wrote:
> Commit 50310600ebda ("iommu/vt-d: Enable PCI ACS for platform opt in
> hint") enables ACS, and some platforms lose its NVMe after resume from
> S3:
> [   50.947816] pcieport 0000:00:1b.0: DPC: containment event, status:0x1f01 source:0x0000
> [   50.947817] pcieport 0000:00:1b.0: DPC: unmasked uncorrectable error detected
> [   50.947829] pcieport 0000:00:1b.0: PCIe Bus Error: severity=Uncorrected (Non-Fatal), type=Transaction Layer, (Receiver ID)
> [   50.947830] pcieport 0000:00:1b.0:   device [8086:06ac] error status/mask=00200000/00010000
> [   50.947831] pcieport 0000:00:1b.0:    [21] ACSViol                (First)
> [   50.947841] pcieport 0000:00:1b.0: AER: broadcast error_detected message
> [   50.947843] nvme nvme0: frozen state error detected, reset controller
> 
> It happens right after ACS gets enabled during resume.
> 
> There's another case, when Thunderbolt reaches D3cold:
> [   30.100211] pcieport 0000:00:1d.0: AER: Uncorrected (Non-Fatal) error received: 0000:00:1d.0
> [   30.100251] pcieport 0000:00:1d.0: PCIe Bus Error: severity=Uncorrected (Non-Fatal), type=Transaction Layer, (Requester ID)
> [   30.100256] pcieport 0000:00:1d.0:   device [8086:7ab0] error status/mask=00100000/00004000
> [   30.100262] pcieport 0000:00:1d.0:    [20] UnsupReq               (First)
> [   30.100267] pcieport 0000:00:1d.0: AER:   TLP Header: 34000000 08000052 00000000 00000000
> [   30.100372] thunderbolt 0000:0a:00.0: AER: can't recover (no error_detected callback)
> [   30.100401] xhci_hcd 0000:3e:00.0: AER: can't recover (no error_detected callback)
> [   30.100427] pcieport 0000:00:1d.0: AER: device recovery failed
> 
> So disable AER service to avoid the noises from turning power rails
> on/off when the device is in low power states (D3hot and D3cold), as
> PCIe spec "5.2 Link State Power Management" states that TLP and DLLP
> transmission is disabled for a Link in L2/L3 Ready (D3hot), L2 (D3cold
> with aux power) and L3 (D3cold).
> 
> Bugzilla:https://bugzilla.kernel.org/show_bug.cgi?id=209149
> Bugzilla:https://bugzilla.kernel.org/show_bug.cgi?id=215453
> Fixes: 50310600ebda ("iommu/vt-d: Enable PCI ACS for platform opt in hint")

I don't know what this fix has to do with the commit 50310600ebda.
Commit 50310600ebda only makes sure that PCI ACS is enabled whenever
Intel IOMMU is on. Before this commit, PCI ACS could also be enabled
and result in the same problem. Or anything I missed?

Best regards,
baolu

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH v2 1/2] PCI/AER: Disable AER service when link is in L2/L3 ready, L2 and L3 state
  2022-01-27  7:01 ` Lu Baolu
@ 2022-01-27 11:14   ` Kai-Heng Feng
  2022-01-28  2:53     ` Lu Baolu
  0 siblings, 1 reply; 12+ messages in thread
From: Kai-Heng Feng @ 2022-01-27 11:14 UTC (permalink / raw)
  To: Lu Baolu
  Cc: bhelgaas, mika.westerberg, koba.ko, Russell Currey,
	Oliver O'Halloran, Lalithambika Krishnakumar, Joerg Roedel,
	linuxppc-dev, linux-pci, linux-kernel

On Thu, Jan 27, 2022 at 3:01 PM Lu Baolu <baolu.lu@linux.intel.com> wrote:
>
> On 2022/1/27 10:54, Kai-Heng Feng wrote:
> > Commit 50310600ebda ("iommu/vt-d: Enable PCI ACS for platform opt in
> > hint") enables ACS, and some platforms lose its NVMe after resume from
> > S3:
> > [   50.947816] pcieport 0000:00:1b.0: DPC: containment event, status:0x1f01 source:0x0000
> > [   50.947817] pcieport 0000:00:1b.0: DPC: unmasked uncorrectable error detected
> > [   50.947829] pcieport 0000:00:1b.0: PCIe Bus Error: severity=Uncorrected (Non-Fatal), type=Transaction Layer, (Receiver ID)
> > [   50.947830] pcieport 0000:00:1b.0:   device [8086:06ac] error status/mask=00200000/00010000
> > [   50.947831] pcieport 0000:00:1b.0:    [21] ACSViol                (First)
> > [   50.947841] pcieport 0000:00:1b.0: AER: broadcast error_detected message
> > [   50.947843] nvme nvme0: frozen state error detected, reset controller
> >
> > It happens right after ACS gets enabled during resume.
> >
> > There's another case, when Thunderbolt reaches D3cold:
> > [   30.100211] pcieport 0000:00:1d.0: AER: Uncorrected (Non-Fatal) error received: 0000:00:1d.0
> > [   30.100251] pcieport 0000:00:1d.0: PCIe Bus Error: severity=Uncorrected (Non-Fatal), type=Transaction Layer, (Requester ID)
> > [   30.100256] pcieport 0000:00:1d.0:   device [8086:7ab0] error status/mask=00100000/00004000
> > [   30.100262] pcieport 0000:00:1d.0:    [20] UnsupReq               (First)
> > [   30.100267] pcieport 0000:00:1d.0: AER:   TLP Header: 34000000 08000052 00000000 00000000
> > [   30.100372] thunderbolt 0000:0a:00.0: AER: can't recover (no error_detected callback)
> > [   30.100401] xhci_hcd 0000:3e:00.0: AER: can't recover (no error_detected callback)
> > [   30.100427] pcieport 0000:00:1d.0: AER: device recovery failed
> >
> > So disable AER service to avoid the noises from turning power rails
> > on/off when the device is in low power states (D3hot and D3cold), as
> > PCIe spec "5.2 Link State Power Management" states that TLP and DLLP
> > transmission is disabled for a Link in L2/L3 Ready (D3hot), L2 (D3cold
> > with aux power) and L3 (D3cold).
> >
> > Bugzilla:https://bugzilla.kernel.org/show_bug.cgi?id=209149
> > Bugzilla:https://bugzilla.kernel.org/show_bug.cgi?id=215453
> > Fixes: 50310600ebda ("iommu/vt-d: Enable PCI ACS for platform opt in hint")
>
> I don't know what this fix has to do with the commit 50310600ebda.

Commit 50310600ebda only exposed the underlying issue. Do you think
"Fixes:" tag should change to other commits?

> Commit 50310600ebda only makes sure that PCI ACS is enabled whenever
> Intel IOMMU is on. Before this commit, PCI ACS could also be enabled
> and result in the same problem. Or anything I missed?

The system in question didn't enable ACS before commit 50310600ebda.

Kai-Heng

>
> Best regards,
> baolu

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH v2 1/2] PCI/AER: Disable AER service when link is in L2/L3 ready, L2 and L3 state
  2022-01-27 11:14   ` Kai-Heng Feng
@ 2022-01-28  2:53     ` Lu Baolu
  2022-01-28  3:29       ` Kai-Heng Feng
  0 siblings, 1 reply; 12+ messages in thread
From: Lu Baolu @ 2022-01-28  2:53 UTC (permalink / raw)
  To: Kai-Heng Feng
  Cc: baolu.lu, bhelgaas, mika.westerberg, koba.ko, Russell Currey,
	Oliver O'Halloran, Lalithambika Krishnakumar, Joerg Roedel,
	linuxppc-dev, linux-pci, linux-kernel

On 1/27/22 7:14 PM, Kai-Heng Feng wrote:
> On Thu, Jan 27, 2022 at 3:01 PM Lu Baolu <baolu.lu@linux.intel.com> wrote:
>>
>> On 2022/1/27 10:54, Kai-Heng Feng wrote:
>>> Commit 50310600ebda ("iommu/vt-d: Enable PCI ACS for platform opt in
>>> hint") enables ACS, and some platforms lose its NVMe after resume from
>>> S3:
>>> [   50.947816] pcieport 0000:00:1b.0: DPC: containment event, status:0x1f01 source:0x0000
>>> [   50.947817] pcieport 0000:00:1b.0: DPC: unmasked uncorrectable error detected
>>> [   50.947829] pcieport 0000:00:1b.0: PCIe Bus Error: severity=Uncorrected (Non-Fatal), type=Transaction Layer, (Receiver ID)
>>> [   50.947830] pcieport 0000:00:1b.0:   device [8086:06ac] error status/mask=00200000/00010000
>>> [   50.947831] pcieport 0000:00:1b.0:    [21] ACSViol                (First)
>>> [   50.947841] pcieport 0000:00:1b.0: AER: broadcast error_detected message
>>> [   50.947843] nvme nvme0: frozen state error detected, reset controller
>>>
>>> It happens right after ACS gets enabled during resume.
>>>
>>> There's another case, when Thunderbolt reaches D3cold:
>>> [   30.100211] pcieport 0000:00:1d.0: AER: Uncorrected (Non-Fatal) error received: 0000:00:1d.0
>>> [   30.100251] pcieport 0000:00:1d.0: PCIe Bus Error: severity=Uncorrected (Non-Fatal), type=Transaction Layer, (Requester ID)
>>> [   30.100256] pcieport 0000:00:1d.0:   device [8086:7ab0] error status/mask=00100000/00004000
>>> [   30.100262] pcieport 0000:00:1d.0:    [20] UnsupReq               (First)
>>> [   30.100267] pcieport 0000:00:1d.0: AER:   TLP Header: 34000000 08000052 00000000 00000000
>>> [   30.100372] thunderbolt 0000:0a:00.0: AER: can't recover (no error_detected callback)
>>> [   30.100401] xhci_hcd 0000:3e:00.0: AER: can't recover (no error_detected callback)
>>> [   30.100427] pcieport 0000:00:1d.0: AER: device recovery failed
>>>
>>> So disable AER service to avoid the noises from turning power rails
>>> on/off when the device is in low power states (D3hot and D3cold), as
>>> PCIe spec "5.2 Link State Power Management" states that TLP and DLLP
>>> transmission is disabled for a Link in L2/L3 Ready (D3hot), L2 (D3cold
>>> with aux power) and L3 (D3cold).
>>>
>>> Bugzilla:https://bugzilla.kernel.org/show_bug.cgi?id=209149
>>> Bugzilla:https://bugzilla.kernel.org/show_bug.cgi?id=215453
>>> Fixes: 50310600ebda ("iommu/vt-d: Enable PCI ACS for platform opt in hint")
>>
>> I don't know what this fix has to do with the commit 50310600ebda.
> 
> Commit 50310600ebda only exposed the underlying issue. Do you think
> "Fixes:" tag should change to other commits?
> 
>> Commit 50310600ebda only makes sure that PCI ACS is enabled whenever
>> Intel IOMMU is on. Before this commit, PCI ACS could also be enabled
>> and result in the same problem. Or anything I missed?
> 
> The system in question didn't enable ACS before commit 50310600ebda.

This commit exposed the issue on your configuration doesn't mean the
fix should be back ported as far as that commit. I believe if you add
intel-iommu=on in the kernel parameter, the issue still exists even you
revert commit 50310600ebda or checkout a tag before it.

Best regards,
baolu

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH v2 1/2] PCI/AER: Disable AER service when link is in L2/L3 ready, L2 and L3 state
  2022-01-28  2:53     ` Lu Baolu
@ 2022-01-28  3:29       ` Kai-Heng Feng
  0 siblings, 0 replies; 12+ messages in thread
From: Kai-Heng Feng @ 2022-01-28  3:29 UTC (permalink / raw)
  To: Lu Baolu
  Cc: bhelgaas, mika.westerberg, koba.ko, Russell Currey,
	Oliver O'Halloran, Lalithambika Krishnakumar, Joerg Roedel,
	linuxppc-dev, linux-pci, linux-kernel

On Fri, Jan 28, 2022 at 10:57 AM Lu Baolu <baolu.lu@linux.intel.com> wrote:
>
> On 1/27/22 7:14 PM, Kai-Heng Feng wrote:
> > On Thu, Jan 27, 2022 at 3:01 PM Lu Baolu <baolu.lu@linux.intel.com> wrote:
> >>
> >> On 2022/1/27 10:54, Kai-Heng Feng wrote:
> >>> Commit 50310600ebda ("iommu/vt-d: Enable PCI ACS for platform opt in
> >>> hint") enables ACS, and some platforms lose its NVMe after resume from
> >>> S3:
> >>> [   50.947816] pcieport 0000:00:1b.0: DPC: containment event, status:0x1f01 source:0x0000
> >>> [   50.947817] pcieport 0000:00:1b.0: DPC: unmasked uncorrectable error detected
> >>> [   50.947829] pcieport 0000:00:1b.0: PCIe Bus Error: severity=Uncorrected (Non-Fatal), type=Transaction Layer, (Receiver ID)
> >>> [   50.947830] pcieport 0000:00:1b.0:   device [8086:06ac] error status/mask=00200000/00010000
> >>> [   50.947831] pcieport 0000:00:1b.0:    [21] ACSViol                (First)
> >>> [   50.947841] pcieport 0000:00:1b.0: AER: broadcast error_detected message
> >>> [   50.947843] nvme nvme0: frozen state error detected, reset controller
> >>>
> >>> It happens right after ACS gets enabled during resume.
> >>>
> >>> There's another case, when Thunderbolt reaches D3cold:
> >>> [   30.100211] pcieport 0000:00:1d.0: AER: Uncorrected (Non-Fatal) error received: 0000:00:1d.0
> >>> [   30.100251] pcieport 0000:00:1d.0: PCIe Bus Error: severity=Uncorrected (Non-Fatal), type=Transaction Layer, (Requester ID)
> >>> [   30.100256] pcieport 0000:00:1d.0:   device [8086:7ab0] error status/mask=00100000/00004000
> >>> [   30.100262] pcieport 0000:00:1d.0:    [20] UnsupReq               (First)
> >>> [   30.100267] pcieport 0000:00:1d.0: AER:   TLP Header: 34000000 08000052 00000000 00000000
> >>> [   30.100372] thunderbolt 0000:0a:00.0: AER: can't recover (no error_detected callback)
> >>> [   30.100401] xhci_hcd 0000:3e:00.0: AER: can't recover (no error_detected callback)
> >>> [   30.100427] pcieport 0000:00:1d.0: AER: device recovery failed
> >>>
> >>> So disable AER service to avoid the noises from turning power rails
> >>> on/off when the device is in low power states (D3hot and D3cold), as
> >>> PCIe spec "5.2 Link State Power Management" states that TLP and DLLP
> >>> transmission is disabled for a Link in L2/L3 Ready (D3hot), L2 (D3cold
> >>> with aux power) and L3 (D3cold).
> >>>
> >>> Bugzilla:https://bugzilla.kernel.org/show_bug.cgi?id=209149
> >>> Bugzilla:https://bugzilla.kernel.org/show_bug.cgi?id=215453
> >>> Fixes: 50310600ebda ("iommu/vt-d: Enable PCI ACS for platform opt in hint")
> >>
> >> I don't know what this fix has to do with the commit 50310600ebda.
> >
> > Commit 50310600ebda only exposed the underlying issue. Do you think
> > "Fixes:" tag should change to other commits?
> >
> >> Commit 50310600ebda only makes sure that PCI ACS is enabled whenever
> >> Intel IOMMU is on. Before this commit, PCI ACS could also be enabled
> >> and result in the same problem. Or anything I missed?
> >
> > The system in question didn't enable ACS before commit 50310600ebda.
>
> This commit exposed the issue on your configuration doesn't mean the
> fix should be back ported as far as that commit. I believe if you add
> intel-iommu=on in the kernel parameter, the issue still exists even you
> revert commit 50310600ebda or checkout a tag before it.

That's true.

I guess it's better to drop the "Fixes:" tag.

Bjorn, should I send another version of it?

Kai-Heng

>
> Best regards,
> baolu

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH v2 1/2] PCI/AER: Disable AER service when link is in L2/L3 ready, L2 and L3 state
  2022-01-27  2:54 [PATCH v2 1/2] PCI/AER: Disable AER service when link is in L2/L3 ready, L2 and L3 state Kai-Heng Feng
                   ` (2 preceding siblings ...)
  2022-01-27  7:01 ` Lu Baolu
@ 2022-03-19 20:38 ` Sathyanarayanan Kuppuswamy
  2022-03-21  2:38   ` Kai-Heng Feng
  3 siblings, 1 reply; 12+ messages in thread
From: Sathyanarayanan Kuppuswamy @ 2022-03-19 20:38 UTC (permalink / raw)
  To: Kai-Heng Feng, bhelgaas
  Cc: mika.westerberg, koba.ko, Russell Currey, Oliver O'Halloran,
	Lalithambika Krishnakumar, Lu Baolu, Joerg Roedel, linuxppc-dev,
	linux-pci, linux-kernel



On 1/26/22 6:54 PM, Kai-Heng Feng wrote:
> Commit 50310600ebda ("iommu/vt-d: Enable PCI ACS for platform opt in
> hint") enables ACS, and some platforms lose its NVMe after resume from

Why enabling ACS makes platform lose NVMe? Can you add more details
about the problem?

> S3:
> [   50.947816] pcieport 0000:00:1b.0: DPC: containment event, status:0x1f01 source:0x0000
> [   50.947817] pcieport 0000:00:1b.0: DPC: unmasked uncorrectable error detected
> [   50.947829] pcieport 0000:00:1b.0: PCIe Bus Error: severity=Uncorrected (Non-Fatal), type=Transaction Layer, (Receiver ID)
> [   50.947830] pcieport 0000:00:1b.0:   device [8086:06ac] error status/mask=00200000/00010000
> [   50.947831] pcieport 0000:00:1b.0:    [21] ACSViol                (First)
> [   50.947841] pcieport 0000:00:1b.0: AER: broadcast error_detected message
> [   50.947843] nvme nvme0: frozen state error detected, reset controller
> 
> It happens right after ACS gets enabled during resume.
> 
> There's another case, when Thunderbolt reaches D3cold:
> [   30.100211] pcieport 0000:00:1d.0: AER: Uncorrected (Non-Fatal) error received: 0000:00:1d.0
> [   30.100251] pcieport 0000:00:1d.0: PCIe Bus Error: severity=Uncorrected (Non-Fatal), type=Transaction Layer, (Requester ID)
> [   30.100256] pcieport 0000:00:1d.0:   device [8086:7ab0] error status/mask=00100000/00004000
> [   30.100262] pcieport 0000:00:1d.0:    [20] UnsupReq               (First)
> [   30.100267] pcieport 0000:00:1d.0: AER:   TLP Header: 34000000 08000052 00000000 00000000
> [   30.100372] thunderbolt 0000:0a:00.0: AER: can't recover (no error_detected callback)

no callback message means one or more devices in the given port does not
support error handler. How is this related to ACS?

> [   30.100401] xhci_hcd 0000:3e:00.0: AER: can't recover (no error_detected callback)
> [   30.100427] pcieport 0000:00:1d.0: AER: device recovery failed
> 
> So disable AER service to avoid the noises from turning power rails
> on/off when the device is in low power states (D3hot and D3cold), as
> PCIe spec "5.2 Link State Power Management" states that TLP and DLLP
> transmission is disabled for a Link in L2/L3 Ready (D3hot), L2 (D3cold
> with aux power) and L3 (D3cold).
> 
> Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=209149
> Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=215453
> Fixes: 50310600ebda ("iommu/vt-d: Enable PCI ACS for platform opt in hint")
> Signed-off-by: Kai-Heng Feng <kai.heng.feng@canonical.com>
> ---
> v2:
>   - Wording change.
> 
>   drivers/pci/pcie/aer.c | 31 +++++++++++++++++++++++++------
>   1 file changed, 25 insertions(+), 6 deletions(-)
> 
> diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c
> index 9fa1f97e5b270..e4e9d4a3098d7 100644
> --- a/drivers/pci/pcie/aer.c
> +++ b/drivers/pci/pcie/aer.c
> @@ -1367,6 +1367,22 @@ static int aer_probe(struct pcie_device *dev)
>   	return 0;
>   }
>   
> +static int aer_suspend(struct pcie_device *dev)
> +{
> +	struct aer_rpc *rpc = get_service_data(dev);
> +
> +	aer_disable_rootport(rpc);
> +	return 0;
> +}
> +
> +static int aer_resume(struct pcie_device *dev)
> +{
> +	struct aer_rpc *rpc = get_service_data(dev);
> +
> +	aer_enable_rootport(rpc);
> +	return 0;
> +}
> +
>   /**
>    * aer_root_reset - reset Root Port hierarchy, RCEC, or RCiEP
>    * @dev: pointer to Root Port, RCEC, or RCiEP
> @@ -1433,12 +1449,15 @@ static pci_ers_result_t aer_root_reset(struct pci_dev *dev)
>   }
>   
>   static struct pcie_port_service_driver aerdriver = {
> -	.name		= "aer",
> -	.port_type	= PCIE_ANY_PORT,
> -	.service	= PCIE_PORT_SERVICE_AER,
> -
> -	.probe		= aer_probe,
> -	.remove		= aer_remove,
> +	.name			= "aer",
> +	.port_type		= PCIE_ANY_PORT,
> +	.service		= PCIE_PORT_SERVICE_AER,
> +	.probe			= aer_probe,
> +	.suspend		= aer_suspend,
> +	.resume			= aer_resume,
> +	.runtime_suspend	= aer_suspend,
> +	.runtime_resume		= aer_resume,
> +	.remove			= aer_remove,
>   };
>   
>   /**

-- 
Sathyanarayanan Kuppuswamy
Linux Kernel Developer

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH v2 2/2] PCI/DPC: Disable DPC service when link is in L2/L3 ready, L2 and L3 state
  2022-01-27  2:54 ` [PATCH v2 2/2] PCI/DPC: Disable DPC " Kai-Heng Feng
  2022-01-27  6:31   ` Mika Westerberg
@ 2022-03-19 20:40   ` Sathyanarayanan Kuppuswamy
  1 sibling, 0 replies; 12+ messages in thread
From: Sathyanarayanan Kuppuswamy @ 2022-03-19 20:40 UTC (permalink / raw)
  To: Kai-Heng Feng, bhelgaas
  Cc: mika.westerberg, koba.ko, Russell Currey, Oliver O'Halloran,
	linuxppc-dev, linux-pci, linux-kernel



On 1/26/22 6:54 PM, Kai-Heng Feng wrote:
> Since TLP and DLLP transmission is disabled for a Link in L2/L3 Ready,
> L2 and L3 (i.e. device in D3hot and D3cold), and DPC depends on AER, so

Better description about the problem would be helpful. I know you
have included a log in AER patch. But a quick summary of the problem
in this patch will make it easier to read the patch.

> also disable DPC here.
> 
> Signed-off-by: Kai-Heng Feng <kai.heng.feng@canonical.com>
> ---
> v2:
>   - Wording change.
>   - Empty line dropped.
> 
>   drivers/pci/pcie/dpc.c | 60 +++++++++++++++++++++++++++++++-----------
>   1 file changed, 44 insertions(+), 16 deletions(-)
> 
> diff --git a/drivers/pci/pcie/dpc.c b/drivers/pci/pcie/dpc.c
> index 3e9afee02e8d1..414258967f08e 100644
> --- a/drivers/pci/pcie/dpc.c
> +++ b/drivers/pci/pcie/dpc.c
> @@ -343,13 +343,33 @@ void pci_dpc_init(struct pci_dev *pdev)
>   	}
>   }
>   
> +static void dpc_enable(struct pcie_device *dev)
> +{
> +	struct pci_dev *pdev = dev->port;
> +	u16 ctl;
> +
> +	pci_read_config_word(pdev, pdev->dpc_cap + PCI_EXP_DPC_CTL, &ctl);
> +	ctl = (ctl & 0xfff4) | PCI_EXP_DPC_CTL_EN_FATAL | PCI_EXP_DPC_CTL_INT_EN;
> +	pci_write_config_word(pdev, pdev->dpc_cap + PCI_EXP_DPC_CTL, ctl);
> +}
> +
> +static void dpc_disable(struct pcie_device *dev)
> +{
> +	struct pci_dev *pdev = dev->port;
> +	u16 ctl;
> +
> +	pci_read_config_word(pdev, pdev->dpc_cap + PCI_EXP_DPC_CTL, &ctl);
> +	ctl &= ~(PCI_EXP_DPC_CTL_EN_FATAL | PCI_EXP_DPC_CTL_INT_EN);
> +	pci_write_config_word(pdev, pdev->dpc_cap + PCI_EXP_DPC_CTL, ctl);
> +}
> +
>   #define FLAG(x, y) (((x) & (y)) ? '+' : '-')
>   static int dpc_probe(struct pcie_device *dev)
>   {
>   	struct pci_dev *pdev = dev->port;
>   	struct device *device = &dev->device;
>   	int status;
> -	u16 ctl, cap;
> +	u16 cap;
>   
>   	if (!pcie_aer_is_native(pdev) && !pcie_ports_dpc_native)
>   		return -ENOTSUPP;
> @@ -364,10 +384,7 @@ static int dpc_probe(struct pcie_device *dev)
>   	}
>   
>   	pci_read_config_word(pdev, pdev->dpc_cap + PCI_EXP_DPC_CAP, &cap);
> -	pci_read_config_word(pdev, pdev->dpc_cap + PCI_EXP_DPC_CTL, &ctl);
> -
> -	ctl = (ctl & 0xfff4) | PCI_EXP_DPC_CTL_EN_FATAL | PCI_EXP_DPC_CTL_INT_EN;
> -	pci_write_config_word(pdev, pdev->dpc_cap + PCI_EXP_DPC_CTL, ctl);
> +	dpc_enable(dev);
>   	pci_info(pdev, "enabled with IRQ %d\n", dev->irq);
>   
>   	pci_info(pdev, "error containment capabilities: Int Msg #%d, RPExt%c PoisonedTLP%c SwTrigger%c RP PIO Log %d, DL_ActiveErr%c\n",
> @@ -380,22 +397,33 @@ static int dpc_probe(struct pcie_device *dev)
>   	return status;
>   }
>   
> -static void dpc_remove(struct pcie_device *dev)
> +static int dpc_suspend(struct pcie_device *dev)
>   {
> -	struct pci_dev *pdev = dev->port;
> -	u16 ctl;
> +	dpc_disable(dev);
> +	return 0;
> +}
>   
> -	pci_read_config_word(pdev, pdev->dpc_cap + PCI_EXP_DPC_CTL, &ctl);
> -	ctl &= ~(PCI_EXP_DPC_CTL_EN_FATAL | PCI_EXP_DPC_CTL_INT_EN);
> -	pci_write_config_word(pdev, pdev->dpc_cap + PCI_EXP_DPC_CTL, ctl);
> +static int dpc_resume(struct pcie_device *dev)
> +{
> +	dpc_enable(dev);
> +	return 0;
> +}
> +
> +static void dpc_remove(struct pcie_device *dev)
> +{
> +	dpc_disable(dev);
>   }
>   
>   static struct pcie_port_service_driver dpcdriver = {
> -	.name		= "dpc",
> -	.port_type	= PCIE_ANY_PORT,
> -	.service	= PCIE_PORT_SERVICE_DPC,
> -	.probe		= dpc_probe,
> -	.remove		= dpc_remove,
> +	.name			= "dpc",
> +	.port_type		= PCIE_ANY_PORT,
> +	.service		= PCIE_PORT_SERVICE_DPC,
> +	.probe			= dpc_probe,
> +	.suspend		= dpc_suspend,
> +	.resume			= dpc_resume,
> +	.runtime_suspend	= dpc_suspend,
> +	.runtime_resume		= dpc_resume,
> +	.remove			= dpc_remove,
>   };
>   
>   int __init pcie_dpc_init(void)

-- 
Sathyanarayanan Kuppuswamy
Linux Kernel Developer

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH v2 1/2] PCI/AER: Disable AER service when link is in L2/L3 ready, L2 and L3 state
  2022-03-19 20:38 ` Sathyanarayanan Kuppuswamy
@ 2022-03-21  2:38   ` Kai-Heng Feng
  2022-03-21  3:52     ` Sathyanarayanan Kuppuswamy
  0 siblings, 1 reply; 12+ messages in thread
From: Kai-Heng Feng @ 2022-03-21  2:38 UTC (permalink / raw)
  To: Sathyanarayanan Kuppuswamy
  Cc: bhelgaas, mika.westerberg, koba.ko, Russell Currey,
	Oliver O'Halloran, Lalithambika Krishnakumar, Lu Baolu,
	Joerg Roedel, linuxppc-dev, linux-pci, linux-kernel

On Sun, Mar 20, 2022 at 4:38 AM Sathyanarayanan Kuppuswamy
<sathyanarayanan.kuppuswamy@linux.intel.com> wrote:
>
>
>
> On 1/26/22 6:54 PM, Kai-Heng Feng wrote:
> > Commit 50310600ebda ("iommu/vt-d: Enable PCI ACS for platform opt in
> > hint") enables ACS, and some platforms lose its NVMe after resume from
>
> Why enabling ACS makes platform lose NVMe? Can you add more details
> about the problem?

I don't have a hardware analyzer, so the only detail I can provide is
the symptom.
I believe the affected system was sent Intel, and there wasn't any
feedback since then.

>
> > S3:
> > [   50.947816] pcieport 0000:00:1b.0: DPC: containment event, status:0x1f01 source:0x0000
> > [   50.947817] pcieport 0000:00:1b.0: DPC: unmasked uncorrectable error detected
> > [   50.947829] pcieport 0000:00:1b.0: PCIe Bus Error: severity=Uncorrected (Non-Fatal), type=Transaction Layer, (Receiver ID)
> > [   50.947830] pcieport 0000:00:1b.0:   device [8086:06ac] error status/mask=00200000/00010000
> > [   50.947831] pcieport 0000:00:1b.0:    [21] ACSViol                (First)
> > [   50.947841] pcieport 0000:00:1b.0: AER: broadcast error_detected message
> > [   50.947843] nvme nvme0: frozen state error detected, reset controller
> >
> > It happens right after ACS gets enabled during resume.
> >
> > There's another case, when Thunderbolt reaches D3cold:
> > [   30.100211] pcieport 0000:00:1d.0: AER: Uncorrected (Non-Fatal) error received: 0000:00:1d.0
> > [   30.100251] pcieport 0000:00:1d.0: PCIe Bus Error: severity=Uncorrected (Non-Fatal), type=Transaction Layer, (Requester ID)
> > [   30.100256] pcieport 0000:00:1d.0:   device [8086:7ab0] error status/mask=00100000/00004000
> > [   30.100262] pcieport 0000:00:1d.0:    [20] UnsupReq               (First)
> > [   30.100267] pcieport 0000:00:1d.0: AER:   TLP Header: 34000000 08000052 00000000 00000000
> > [   30.100372] thunderbolt 0000:0a:00.0: AER: can't recover (no error_detected callback)
>
> no callback message means one or more devices in the given port does not
> support error handler. How is this related to ACS?

This case is about D3cold, not related to ACS.
And no error_detected is just part of the message. The whole AER
message is more important.

Kai-Heng

>
> > [   30.100401] xhci_hcd 0000:3e:00.0: AER: can't recover (no error_detected callback)
> > [   30.100427] pcieport 0000:00:1d.0: AER: device recovery failed
> >
> > So disable AER service to avoid the noises from turning power rails
> > on/off when the device is in low power states (D3hot and D3cold), as
> > PCIe spec "5.2 Link State Power Management" states that TLP and DLLP
> > transmission is disabled for a Link in L2/L3 Ready (D3hot), L2 (D3cold
> > with aux power) and L3 (D3cold).
> >
> > Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=209149
> > Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=215453
> > Fixes: 50310600ebda ("iommu/vt-d: Enable PCI ACS for platform opt in hint")
> > Signed-off-by: Kai-Heng Feng <kai.heng.feng@canonical.com>
> > ---
> > v2:
> >   - Wording change.
> >
> >   drivers/pci/pcie/aer.c | 31 +++++++++++++++++++++++++------
> >   1 file changed, 25 insertions(+), 6 deletions(-)
> >
> > diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c
> > index 9fa1f97e5b270..e4e9d4a3098d7 100644
> > --- a/drivers/pci/pcie/aer.c
> > +++ b/drivers/pci/pcie/aer.c
> > @@ -1367,6 +1367,22 @@ static int aer_probe(struct pcie_device *dev)
> >       return 0;
> >   }
> >
> > +static int aer_suspend(struct pcie_device *dev)
> > +{
> > +     struct aer_rpc *rpc = get_service_data(dev);
> > +
> > +     aer_disable_rootport(rpc);
> > +     return 0;
> > +}
> > +
> > +static int aer_resume(struct pcie_device *dev)
> > +{
> > +     struct aer_rpc *rpc = get_service_data(dev);
> > +
> > +     aer_enable_rootport(rpc);
> > +     return 0;
> > +}
> > +
> >   /**
> >    * aer_root_reset - reset Root Port hierarchy, RCEC, or RCiEP
> >    * @dev: pointer to Root Port, RCEC, or RCiEP
> > @@ -1433,12 +1449,15 @@ static pci_ers_result_t aer_root_reset(struct pci_dev *dev)
> >   }
> >
> >   static struct pcie_port_service_driver aerdriver = {
> > -     .name           = "aer",
> > -     .port_type      = PCIE_ANY_PORT,
> > -     .service        = PCIE_PORT_SERVICE_AER,
> > -
> > -     .probe          = aer_probe,
> > -     .remove         = aer_remove,
> > +     .name                   = "aer",
> > +     .port_type              = PCIE_ANY_PORT,
> > +     .service                = PCIE_PORT_SERVICE_AER,
> > +     .probe                  = aer_probe,
> > +     .suspend                = aer_suspend,
> > +     .resume                 = aer_resume,
> > +     .runtime_suspend        = aer_suspend,
> > +     .runtime_resume         = aer_resume,
> > +     .remove                 = aer_remove,
> >   };
> >
> >   /**
>
> --
> Sathyanarayanan Kuppuswamy
> Linux Kernel Developer

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH v2 1/2] PCI/AER: Disable AER service when link is in L2/L3 ready, L2 and L3 state
  2022-03-21  2:38   ` Kai-Heng Feng
@ 2022-03-21  3:52     ` Sathyanarayanan Kuppuswamy
  0 siblings, 0 replies; 12+ messages in thread
From: Sathyanarayanan Kuppuswamy @ 2022-03-21  3:52 UTC (permalink / raw)
  To: Kai-Heng Feng
  Cc: bhelgaas, mika.westerberg, koba.ko, Russell Currey,
	Oliver O'Halloran, Lalithambika Krishnakumar, Lu Baolu,
	Joerg Roedel, linuxppc-dev, linux-pci, linux-kernel



On 3/20/22 7:38 PM, Kai-Heng Feng wrote:
> On Sun, Mar 20, 2022 at 4:38 AM Sathyanarayanan Kuppuswamy
> <sathyanarayanan.kuppuswamy@linux.intel.com> wrote:
>>
>>
>>
>> On 1/26/22 6:54 PM, Kai-Heng Feng wrote:
>>> Commit 50310600ebda ("iommu/vt-d: Enable PCI ACS for platform opt in
>>> hint") enables ACS, and some platforms lose its NVMe after resume from
>>
>> Why enabling ACS makes platform lose NVMe? Can you add more details
>> about the problem?
> 
> I don't have a hardware analyzer, so the only detail I can provide is
> the symptom.
> I believe the affected system was sent Intel, and there wasn't any
> feedback since then.

Since your commit log refers to ACS, I think first we need to understand
following points.

1. Why we get ACSViol during S3 resume. Is this just a noise?
2. Why AER recovery fails?
3. Is this common for all platforms, or only happens in your test
    platform?

If you are not clear about above points, I think you can submit this
patch as adding suspend/resume support to AER/DPC driver and not include
the issue about ACS.

 From your commit log, the problem is not very clear.

> 
>>
>>> S3:
>>> [   50.947816] pcieport 0000:00:1b.0: DPC: containment event, status:0x1f01 source:0x0000
>>> [   50.947817] pcieport 0000:00:1b.0: DPC: unmasked uncorrectable error detected
>>> [   50.947829] pcieport 0000:00:1b.0: PCIe Bus Error: severity=Uncorrected (Non-Fatal), type=Transaction Layer, (Receiver ID)
>>> [   50.947830] pcieport 0000:00:1b.0:   device [8086:06ac] error status/mask=00200000/00010000
>>> [   50.947831] pcieport 0000:00:1b.0:    [21] ACSViol                (First)
>>> [   50.947841] pcieport 0000:00:1b.0: AER: broadcast error_detected message
>>> [   50.947843] nvme nvme0: frozen state error detected, reset controller
>>>
>>> It happens right after ACS gets enabled during resume.
>>>
>>> There's another case, when Thunderbolt reaches D3cold:
>>> [   30.100211] pcieport 0000:00:1d.0: AER: Uncorrected (Non-Fatal) error received: 0000:00:1d.0
>>> [   30.100251] pcieport 0000:00:1d.0: PCIe Bus Error: severity=Uncorrected (Non-Fatal), type=Transaction Layer, (Requester ID)
>>> [   30.100256] pcieport 0000:00:1d.0:   device [8086:7ab0] error status/mask=00100000/00004000
>>> [   30.100262] pcieport 0000:00:1d.0:    [20] UnsupReq               (First)
>>> [   30.100267] pcieport 0000:00:1d.0: AER:   TLP Header: 34000000 08000052 00000000 00000000
>>> [   30.100372] thunderbolt 0000:0a:00.0: AER: can't recover (no error_detected callback)
>>
>> no callback message means one or more devices in the given port does not
>> support error handler. How is this related to ACS?
> 
> This case is about D3cold, not related to ACS.
> And no error_detected is just part of the message. The whole AER
> message is more important.
> 
> Kai-Heng
> 
>>
>>> [   30.100401] xhci_hcd 0000:3e:00.0: AER: can't recover (no error_detected callback)
>>> [   30.100427] pcieport 0000:00:1d.0: AER: device recovery failed
>>>
>>> So disable AER service to avoid the noises from turning power rails
>>> on/off when the device is in low power states (D3hot and D3cold), as
>>> PCIe spec "5.2 Link State Power Management" states that TLP and DLLP
>>> transmission is disabled for a Link in L2/L3 Ready (D3hot), L2 (D3cold
>>> with aux power) and L3 (D3cold).
>>>
>>> Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=209149
>>> Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=215453
>>> Fixes: 50310600ebda ("iommu/vt-d: Enable PCI ACS for platform opt in hint")
>>> Signed-off-by: Kai-Heng Feng <kai.heng.feng@canonical.com>
>>> ---
>>> v2:
>>>    - Wording change.
>>>
>>>    drivers/pci/pcie/aer.c | 31 +++++++++++++++++++++++++------
>>>    1 file changed, 25 insertions(+), 6 deletions(-)
>>>
>>> diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c
>>> index 9fa1f97e5b270..e4e9d4a3098d7 100644
>>> --- a/drivers/pci/pcie/aer.c
>>> +++ b/drivers/pci/pcie/aer.c
>>> @@ -1367,6 +1367,22 @@ static int aer_probe(struct pcie_device *dev)
>>>        return 0;
>>>    }
>>>
>>> +static int aer_suspend(struct pcie_device *dev)
>>> +{
>>> +     struct aer_rpc *rpc = get_service_data(dev);
>>> +
>>> +     aer_disable_rootport(rpc);
>>> +     return 0;
>>> +}
>>> +
>>> +static int aer_resume(struct pcie_device *dev)
>>> +{
>>> +     struct aer_rpc *rpc = get_service_data(dev);
>>> +
>>> +     aer_enable_rootport(rpc);
>>> +     return 0;
>>> +}
>>> +
>>>    /**
>>>     * aer_root_reset - reset Root Port hierarchy, RCEC, or RCiEP
>>>     * @dev: pointer to Root Port, RCEC, or RCiEP
>>> @@ -1433,12 +1449,15 @@ static pci_ers_result_t aer_root_reset(struct pci_dev *dev)
>>>    }
>>>
>>>    static struct pcie_port_service_driver aerdriver = {
>>> -     .name           = "aer",
>>> -     .port_type      = PCIE_ANY_PORT,
>>> -     .service        = PCIE_PORT_SERVICE_AER,
>>> -
>>> -     .probe          = aer_probe,
>>> -     .remove         = aer_remove,
>>> +     .name                   = "aer",
>>> +     .port_type              = PCIE_ANY_PORT,
>>> +     .service                = PCIE_PORT_SERVICE_AER,
>>> +     .probe                  = aer_probe,
>>> +     .suspend                = aer_suspend,
>>> +     .resume                 = aer_resume,
>>> +     .runtime_suspend        = aer_suspend,
>>> +     .runtime_resume         = aer_resume,
>>> +     .remove                 = aer_remove,
>>>    };
>>>
>>>    /**
>>
>> --
>> Sathyanarayanan Kuppuswamy
>> Linux Kernel Developer

-- 
Sathyanarayanan Kuppuswamy
Linux Kernel Developer

^ permalink raw reply	[flat|nested] 12+ messages in thread

end of thread, other threads:[~2022-03-21  3:52 UTC | newest]

Thread overview: 12+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-01-27  2:54 [PATCH v2 1/2] PCI/AER: Disable AER service when link is in L2/L3 ready, L2 and L3 state Kai-Heng Feng
2022-01-27  2:54 ` [PATCH v2 2/2] PCI/DPC: Disable DPC " Kai-Heng Feng
2022-01-27  6:31   ` Mika Westerberg
2022-03-19 20:40   ` Sathyanarayanan Kuppuswamy
2022-01-27  6:30 ` [PATCH v2 1/2] PCI/AER: Disable AER " Mika Westerberg
2022-01-27  7:01 ` Lu Baolu
2022-01-27 11:14   ` Kai-Heng Feng
2022-01-28  2:53     ` Lu Baolu
2022-01-28  3:29       ` Kai-Heng Feng
2022-03-19 20:38 ` Sathyanarayanan Kuppuswamy
2022-03-21  2:38   ` Kai-Heng Feng
2022-03-21  3:52     ` Sathyanarayanan Kuppuswamy

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).