linuxppc-dev.lists.ozlabs.org archive mirror
 help / color / mirror / Atom feed
* [PATCH v4 1/2] PCI/AER: Disable AER service when link is in L2/L3 ready, L2 and L3 state
@ 2022-04-08 15:31 Kai-Heng Feng
  2022-04-08 15:31 ` [PATCH v4 2/2] PCI/DPC: Disable DPC " Kai-Heng Feng
  2022-04-22 22:24 ` [PATCH v4 1/2] PCI/AER: Disable AER " Bjorn Helgaas
  0 siblings, 2 replies; 8+ messages in thread
From: Kai-Heng Feng @ 2022-04-08 15:31 UTC (permalink / raw)
  To: bhelgaas
  Cc: sathyanarayanan.kuppuswamy, linuxppc-dev, linux-pci,
	linux-kernel, koba.ko, Kai-Heng Feng, Oliver O'Halloran,
	mika.westerberg, baolu.lu

On Intel Alder Lake platforms, Thunderbolt entering D3cold can cause
some errors to be reported by AER:
[   30.100211] pcieport 0000:00:1d.0: AER: Uncorrected (Non-Fatal) error received: 0000:00:1d.0
[   30.100251] pcieport 0000:00:1d.0: PCIe Bus Error: severity=Uncorrected (Non-Fatal), type=Transaction Layer, (Requester ID)
[   30.100256] pcieport 0000:00:1d.0:   device [8086:7ab0] error status/mask=00100000/00004000
[   30.100262] pcieport 0000:00:1d.0:    [20] UnsupReq               (First)
[   30.100267] pcieport 0000:00:1d.0: AER:   TLP Header: 34000000 08000052 00000000 00000000
[   30.100372] thunderbolt 0000:0a:00.0: AER: can't recover (no error_detected callback)
[   30.100401] xhci_hcd 0000:3e:00.0: AER: can't recover (no error_detected callback)
[   30.100427] pcieport 0000:00:1d.0: AER: device recovery failed

So disable the AER service to avoid the noise from turning power rails
on/off when the device is in low-power states (D3hot and D3cold), as
PCIe Base Spec 5.0, section 5.2 "Link State Power Management" states
that TLP and DLLP transmission is disabled for a Link in L2/L3 Ready
(D3hot), L2 (D3cold with aux power) and L3 (D3cold).

Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=215453
Reviewed-by: Mika Westerberg <mika.westerberg@linux.intel.com>
Signed-off-by: Kai-Heng Feng <kai.heng.feng@canonical.com>
---
v4:
 - Explicitly state the spec version.
 - Wording change.

v3:
 - Remove reference to ACS.
 - Wording change.

v2:
 - Wording change.

 drivers/pci/pcie/aer.c | 31 +++++++++++++++++++++++++------
 1 file changed, 25 insertions(+), 6 deletions(-)

diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c
index 9fa1f97e5b270..e4e9d4a3098d7 100644
--- a/drivers/pci/pcie/aer.c
+++ b/drivers/pci/pcie/aer.c
@@ -1367,6 +1367,22 @@ static int aer_probe(struct pcie_device *dev)
 	return 0;
 }
 
+static int aer_suspend(struct pcie_device *dev)
+{
+	struct aer_rpc *rpc = get_service_data(dev);
+
+	aer_disable_rootport(rpc);
+	return 0;
+}
+
+static int aer_resume(struct pcie_device *dev)
+{
+	struct aer_rpc *rpc = get_service_data(dev);
+
+	aer_enable_rootport(rpc);
+	return 0;
+}
+
 /**
  * aer_root_reset - reset Root Port hierarchy, RCEC, or RCiEP
  * @dev: pointer to Root Port, RCEC, or RCiEP
@@ -1433,12 +1449,15 @@ static pci_ers_result_t aer_root_reset(struct pci_dev *dev)
 }
 
 static struct pcie_port_service_driver aerdriver = {
-	.name		= "aer",
-	.port_type	= PCIE_ANY_PORT,
-	.service	= PCIE_PORT_SERVICE_AER,
-
-	.probe		= aer_probe,
-	.remove		= aer_remove,
+	.name			= "aer",
+	.port_type		= PCIE_ANY_PORT,
+	.service		= PCIE_PORT_SERVICE_AER,
+	.probe			= aer_probe,
+	.suspend		= aer_suspend,
+	.resume			= aer_resume,
+	.runtime_suspend	= aer_suspend,
+	.runtime_resume		= aer_resume,
+	.remove			= aer_remove,
 };
 
 /**
-- 
2.34.1


^ permalink raw reply related	[flat|nested] 8+ messages in thread

* [PATCH v4 2/2] PCI/DPC: Disable DPC service when link is in L2/L3 ready, L2 and L3 state
  2022-04-08 15:31 [PATCH v4 1/2] PCI/AER: Disable AER service when link is in L2/L3 ready, L2 and L3 state Kai-Heng Feng
@ 2022-04-08 15:31 ` Kai-Heng Feng
  2022-04-18  2:41   ` Sathyanarayanan Kuppuswamy
  2022-04-22 22:24 ` [PATCH v4 1/2] PCI/AER: Disable AER " Bjorn Helgaas
  1 sibling, 1 reply; 8+ messages in thread
From: Kai-Heng Feng @ 2022-04-08 15:31 UTC (permalink / raw)
  To: bhelgaas
  Cc: sathyanarayanan.kuppuswamy, linuxppc-dev, linux-pci,
	linux-kernel, koba.ko, Kai-Heng Feng, Oliver O'Halloran,
	mika.westerberg, baolu.lu

On Intel Alder Lake platforms, Thunderbolt entering D3cold can cause
some errors to be reported by AER:
[   30.100211] pcieport 0000:00:1d.0: AER: Uncorrected (Non-Fatal) error received: 0000:00:1d.0
[   30.100251] pcieport 0000:00:1d.0: PCIe Bus Error: severity=Uncorrected (Non-Fatal), type=Transaction Layer, (Requester ID)
[   30.100256] pcieport 0000:00:1d.0:   device [8086:7ab0] error status/mask=00100000/00004000
[   30.100262] pcieport 0000:00:1d.0:    [20] UnsupReq               (First)
[   30.100267] pcieport 0000:00:1d.0: AER:   TLP Header: 34000000 08000052 00000000 00000000
[   30.100372] thunderbolt 0000:0a:00.0: AER: can't recover (no error_detected callback)
[   30.100401] xhci_hcd 0000:3e:00.0: AER: can't recover (no error_detected callback)
[   30.100427] pcieport 0000:00:1d.0: AER: device recovery failed

Since AER is disabled in the previous patch for a Link in L2/L3 Ready,
L2 and L3, also disable DPC here, as DPC depends on AER to work.

Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=215453
Reviewed-by: Mika Westerberg <mika.westerberg@linux.intel.com>
Signed-off-by: Kai-Heng Feng <kai.heng.feng@canonical.com>
---
v4:
 - Wording change.

v3:
 - Wording change to make the patch more clear.

v2:
 - Wording change.
 - Empty line dropped.

 drivers/pci/pcie/dpc.c | 60 +++++++++++++++++++++++++++++++-----------
 1 file changed, 44 insertions(+), 16 deletions(-)

diff --git a/drivers/pci/pcie/dpc.c b/drivers/pci/pcie/dpc.c
index 3e9afee02e8d1..414258967f08e 100644
--- a/drivers/pci/pcie/dpc.c
+++ b/drivers/pci/pcie/dpc.c
@@ -343,13 +343,33 @@ void pci_dpc_init(struct pci_dev *pdev)
 	}
 }
 
+static void dpc_enable(struct pcie_device *dev)
+{
+	struct pci_dev *pdev = dev->port;
+	u16 ctl;
+
+	pci_read_config_word(pdev, pdev->dpc_cap + PCI_EXP_DPC_CTL, &ctl);
+	ctl = (ctl & 0xfff4) | PCI_EXP_DPC_CTL_EN_FATAL | PCI_EXP_DPC_CTL_INT_EN;
+	pci_write_config_word(pdev, pdev->dpc_cap + PCI_EXP_DPC_CTL, ctl);
+}
+
+static void dpc_disable(struct pcie_device *dev)
+{
+	struct pci_dev *pdev = dev->port;
+	u16 ctl;
+
+	pci_read_config_word(pdev, pdev->dpc_cap + PCI_EXP_DPC_CTL, &ctl);
+	ctl &= ~(PCI_EXP_DPC_CTL_EN_FATAL | PCI_EXP_DPC_CTL_INT_EN);
+	pci_write_config_word(pdev, pdev->dpc_cap + PCI_EXP_DPC_CTL, ctl);
+}
+
 #define FLAG(x, y) (((x) & (y)) ? '+' : '-')
 static int dpc_probe(struct pcie_device *dev)
 {
 	struct pci_dev *pdev = dev->port;
 	struct device *device = &dev->device;
 	int status;
-	u16 ctl, cap;
+	u16 cap;
 
 	if (!pcie_aer_is_native(pdev) && !pcie_ports_dpc_native)
 		return -ENOTSUPP;
@@ -364,10 +384,7 @@ static int dpc_probe(struct pcie_device *dev)
 	}
 
 	pci_read_config_word(pdev, pdev->dpc_cap + PCI_EXP_DPC_CAP, &cap);
-	pci_read_config_word(pdev, pdev->dpc_cap + PCI_EXP_DPC_CTL, &ctl);
-
-	ctl = (ctl & 0xfff4) | PCI_EXP_DPC_CTL_EN_FATAL | PCI_EXP_DPC_CTL_INT_EN;
-	pci_write_config_word(pdev, pdev->dpc_cap + PCI_EXP_DPC_CTL, ctl);
+	dpc_enable(dev);
 	pci_info(pdev, "enabled with IRQ %d\n", dev->irq);
 
 	pci_info(pdev, "error containment capabilities: Int Msg #%d, RPExt%c PoisonedTLP%c SwTrigger%c RP PIO Log %d, DL_ActiveErr%c\n",
@@ -380,22 +397,33 @@ static int dpc_probe(struct pcie_device *dev)
 	return status;
 }
 
-static void dpc_remove(struct pcie_device *dev)
+static int dpc_suspend(struct pcie_device *dev)
 {
-	struct pci_dev *pdev = dev->port;
-	u16 ctl;
+	dpc_disable(dev);
+	return 0;
+}
 
-	pci_read_config_word(pdev, pdev->dpc_cap + PCI_EXP_DPC_CTL, &ctl);
-	ctl &= ~(PCI_EXP_DPC_CTL_EN_FATAL | PCI_EXP_DPC_CTL_INT_EN);
-	pci_write_config_word(pdev, pdev->dpc_cap + PCI_EXP_DPC_CTL, ctl);
+static int dpc_resume(struct pcie_device *dev)
+{
+	dpc_enable(dev);
+	return 0;
+}
+
+static void dpc_remove(struct pcie_device *dev)
+{
+	dpc_disable(dev);
 }
 
 static struct pcie_port_service_driver dpcdriver = {
-	.name		= "dpc",
-	.port_type	= PCIE_ANY_PORT,
-	.service	= PCIE_PORT_SERVICE_DPC,
-	.probe		= dpc_probe,
-	.remove		= dpc_remove,
+	.name			= "dpc",
+	.port_type		= PCIE_ANY_PORT,
+	.service		= PCIE_PORT_SERVICE_DPC,
+	.probe			= dpc_probe,
+	.suspend		= dpc_suspend,
+	.resume			= dpc_resume,
+	.runtime_suspend	= dpc_suspend,
+	.runtime_resume		= dpc_resume,
+	.remove			= dpc_remove,
 };
 
 int __init pcie_dpc_init(void)
-- 
2.34.1


^ permalink raw reply related	[flat|nested] 8+ messages in thread

* Re: [PATCH v4 2/2] PCI/DPC: Disable DPC service when link is in L2/L3 ready, L2 and L3 state
  2022-04-08 15:31 ` [PATCH v4 2/2] PCI/DPC: Disable DPC " Kai-Heng Feng
@ 2022-04-18  2:41   ` Sathyanarayanan Kuppuswamy
  2022-06-21  2:27     ` Kai-Heng Feng
  0 siblings, 1 reply; 8+ messages in thread
From: Sathyanarayanan Kuppuswamy @ 2022-04-18  2:41 UTC (permalink / raw)
  To: Kai-Heng Feng, bhelgaas
  Cc: linuxppc-dev, linux-pci, linux-kernel, koba.ko,
	Oliver O'Halloran, mika.westerberg, baolu.lu



On 4/8/22 8:31 AM, Kai-Heng Feng wrote:
> On Intel Alder Lake platforms, Thunderbolt entering D3cold can cause
> some errors reported by AER:
> [   30.100211] pcieport 0000:00:1d.0: AER: Uncorrected (Non-Fatal) error received: 0000:00:1d.0
> [   30.100251] pcieport 0000:00:1d.0: PCIe Bus Error: severity=Uncorrected (Non-Fatal), type=Transaction Layer, (Requester ID)
> [   30.100256] pcieport 0000:00:1d.0:   device [8086:7ab0] error status/mask=00100000/00004000
> [   30.100262] pcieport 0000:00:1d.0:    [20] UnsupReq               (First)
> [   30.100267] pcieport 0000:00:1d.0: AER:   TLP Header: 34000000 08000052 00000000 00000000
> [   30.100372] thunderbolt 0000:0a:00.0: AER: can't recover (no error_detected callback)
> [   30.100401] xhci_hcd 0000:3e:00.0: AER: can't recover (no error_detected callback)
> [   30.100427] pcieport 0000:00:1d.0: AER: device recovery failed
> 
> Since AER is disabled in previous patch for a Link in L2/L3 Ready, L2
> and L3, also disable DPC here as DPC depends on AER to work.
> 
> Bugzilla:https://bugzilla.kernel.org/show_bug.cgi?id=215453
> Reviewed-by: Mika Westerberg<mika.westerberg@linux.intel.com>
> Signed-off-by: Kai-Heng Feng<kai.heng.feng@canonical.com>

Reviewed-by: Kuppuswamy Sathyanarayanan 
<sathyanarayanan.kuppuswamy@linux.intel.com>
-- 
Sathyanarayanan Kuppuswamy
Linux Kernel Developer

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH v4 1/2] PCI/AER: Disable AER service when link is in L2/L3 ready, L2 and L3 state
  2022-04-08 15:31 [PATCH v4 1/2] PCI/AER: Disable AER service when link is in L2/L3 ready, L2 and L3 state Kai-Heng Feng
  2022-04-08 15:31 ` [PATCH v4 2/2] PCI/DPC: Disable DPC " Kai-Heng Feng
@ 2022-04-22 22:24 ` Bjorn Helgaas
  2022-04-22 22:26   ` Bjorn Helgaas
  1 sibling, 1 reply; 8+ messages in thread
From: Bjorn Helgaas @ 2022-04-22 22:24 UTC (permalink / raw)
  To: Kai-Heng Feng
  Cc: sathyanarayanan.kuppuswamy, linuxppc-dev, linux-pci,
	linux-kernel, koba.ko, Rajvi Jingar, Oliver O'Halloran,
	david.e.box, bhelgaas, mika.westerberg, baolu.lu

[+cc Rajvi, David]

On Fri, Apr 08, 2022 at 11:31:58PM +0800, Kai-Heng Feng wrote:
> On Intel Alder Lake platforms, Thunderbolt entering D3cold can cause
> some errors reported by AER:
> [   30.100211] pcieport 0000:00:1d.0: AER: Uncorrected (Non-Fatal) error received: 0000:00:1d.0
> [   30.100251] pcieport 0000:00:1d.0: PCIe Bus Error: severity=Uncorrected (Non-Fatal), type=Transaction Layer, (Requester ID)
> [   30.100256] pcieport 0000:00:1d.0:   device [8086:7ab0] error status/mask=00100000/00004000
> [   30.100262] pcieport 0000:00:1d.0:    [20] UnsupReq               (First)
> [   30.100267] pcieport 0000:00:1d.0: AER:   TLP Header: 34000000 08000052 00000000 00000000
> [   30.100372] thunderbolt 0000:0a:00.0: AER: can't recover (no error_detected callback)
> [   30.100401] xhci_hcd 0000:3e:00.0: AER: can't recover (no error_detected callback)
> [   30.100427] pcieport 0000:00:1d.0: AER: device recovery failed
> 
> So disable AER service to avoid the noises from turning power rails
> on/off when the device is in low power states (D3hot and D3cold), as
> PCIe Base Spec 5.0, section 5.2 "Link State Power Management" states
> that TLP and DLLP transmission is disabled for a Link in L2/L3 Ready
> (D3hot), L2 (D3cold with aux power) and L3 (D3cold).

Help me walk through what's happening here, because I'm never very
confident about how error reporting works.  I *think* the Unsupported
Request error means some request was in progress and was not
completed.  I don't think a link going down should by itself cause
an Unsupported Request error because there's no *request*.

I have a theory about what happened here.  Decoding the TLP Header
(from PCIe r6.0, sec 2.2.1.1, 2.2.8.10) gives:

  34000000 (0011 0100 ...):
    Fmt               001        4 DW header, no data
    Type           1 0100        Msg, Local - Terminate at Receiver

  08000052 (0800 ... 0101 0010)
    Requester ID     0800        08:00.0
    Message Code     0101 0010   PTM Request

From your lspci in bugzilla, 08:00 has PTM enabled.  So my theory is
that:

  - 08:00.0 sent a PTM Request Message (a Posted Request)
  - 00:1d.0 received the PTM Request Message
  - The link transitioned to DL_Down
  - Per sec 2.9.1, 00:1d.0 discarded the Request and reported an
    Unsupported Request
  - Or, per sec 6.21.3, if 00:1d.0 received a PTM Request when its
    own PTM Enable was clear, it would also be treated as an
    Unsupported Request

So I suspect we should disable PTM on 08:00.0 before putting it in a
low-power state.  If you manually disable PTM on 08:00.0, do these
errors stop happening?

David did something like this [1], but just for Root Ports.  That
looks wrong to me because sec 6.21.3 says we should not have PTM
enabled in an Upstream Port (i.e., in a downstream device like
08:00.0) unless it is already enabled in the Downstream Port (i.e., in
the Root Port 00:1d.0).

Nit: can you remove the timestamps from the log?  They add clutter but
no useful information.

[1] https://git.kernel.org/linus/a697f072f5da

> Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=215453
> Reviewed-by: Mika Westerberg <mika.westerberg@linux.intel.com>
> Signed-off-by: Kai-Heng Feng <kai.heng.feng@canonical.com>
> ---
> v4:
>  - Explicitly states the spec version.
>  - Wording change. 
> 
> v3:
>  - Remove reference to ACS.
>  - Wording change.
> 
> v2:
>  - Wording change.
> 
>  drivers/pci/pcie/aer.c | 31 +++++++++++++++++++++++++------
>  1 file changed, 25 insertions(+), 6 deletions(-)
> 
> diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c
> index 9fa1f97e5b270..e4e9d4a3098d7 100644
> --- a/drivers/pci/pcie/aer.c
> +++ b/drivers/pci/pcie/aer.c
> @@ -1367,6 +1367,22 @@ static int aer_probe(struct pcie_device *dev)
>  	return 0;
>  }
>  
> +static int aer_suspend(struct pcie_device *dev)
> +{
> +	struct aer_rpc *rpc = get_service_data(dev);
> +
> +	aer_disable_rootport(rpc);
> +	return 0;
> +}
> +
> +static int aer_resume(struct pcie_device *dev)
> +{
> +	struct aer_rpc *rpc = get_service_data(dev);
> +
> +	aer_enable_rootport(rpc);
> +	return 0;
> +}
> +
>  /**
>   * aer_root_reset - reset Root Port hierarchy, RCEC, or RCiEP
>   * @dev: pointer to Root Port, RCEC, or RCiEP
> @@ -1433,12 +1449,15 @@ static pci_ers_result_t aer_root_reset(struct pci_dev *dev)
>  }
>  
>  static struct pcie_port_service_driver aerdriver = {
> -	.name		= "aer",
> -	.port_type	= PCIE_ANY_PORT,
> -	.service	= PCIE_PORT_SERVICE_AER,
> -
> -	.probe		= aer_probe,
> -	.remove		= aer_remove,
> +	.name			= "aer",
> +	.port_type		= PCIE_ANY_PORT,
> +	.service		= PCIE_PORT_SERVICE_AER,
> +	.probe			= aer_probe,
> +	.suspend		= aer_suspend,
> +	.resume			= aer_resume,
> +	.runtime_suspend	= aer_suspend,
> +	.runtime_resume		= aer_resume,
> +	.remove			= aer_remove,
>  };
>  
>  /**
> -- 
> 2.34.1
> 

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH v4 1/2] PCI/AER: Disable AER service when link is in L2/L3 ready, L2 and L3 state
  2022-04-22 22:24 ` [PATCH v4 1/2] PCI/AER: Disable AER " Bjorn Helgaas
@ 2022-04-22 22:26   ` Bjorn Helgaas
  2022-07-01  4:06     ` Kai-Heng Feng
  0 siblings, 1 reply; 8+ messages in thread
From: Bjorn Helgaas @ 2022-04-22 22:26 UTC (permalink / raw)
  To: Kai-Heng Feng
  Cc: Rafael J. Wysocki, sathyanarayanan.kuppuswamy, linux-pm,
	linuxppc-dev, linux-pci, linux-kernel, koba.ko, Rajvi Jingar,
	Oliver O'Halloran, david.e.box, bhelgaas, mika.westerberg,
	baolu.lu

[+cc Rafael, linux-pm; sorry forgot this last time]

On Fri, Apr 22, 2022 at 05:24:36PM -0500, Bjorn Helgaas wrote:
> On Fri, Apr 08, 2022 at 11:31:58PM +0800, Kai-Heng Feng wrote:
> > On Intel Alder Lake platforms, Thunderbolt entering D3cold can cause
> > some errors reported by AER:
> > [   30.100211] pcieport 0000:00:1d.0: AER: Uncorrected (Non-Fatal) error received: 0000:00:1d.0
> > [   30.100251] pcieport 0000:00:1d.0: PCIe Bus Error: severity=Uncorrected (Non-Fatal), type=Transaction Layer, (Requester ID)
> > [   30.100256] pcieport 0000:00:1d.0:   device [8086:7ab0] error status/mask=00100000/00004000
> > [   30.100262] pcieport 0000:00:1d.0:    [20] UnsupReq               (First)
> > [   30.100267] pcieport 0000:00:1d.0: AER:   TLP Header: 34000000 08000052 00000000 00000000
> > [   30.100372] thunderbolt 0000:0a:00.0: AER: can't recover (no error_detected callback)
> > [   30.100401] xhci_hcd 0000:3e:00.0: AER: can't recover (no error_detected callback)
> > [   30.100427] pcieport 0000:00:1d.0: AER: device recovery failed
> > 
> > So disable AER service to avoid the noises from turning power rails
> > on/off when the device is in low power states (D3hot and D3cold), as
> > PCIe Base Spec 5.0, section 5.2 "Link State Power Management" states
> > that TLP and DLLP transmission is disabled for a Link in L2/L3 Ready
> > (D3hot), L2 (D3cold with aux power) and L3 (D3cold).
> 
> Help me walk through what's happening here, because I'm never very
> confident about how error reporting works.  I *think* the Unsupported
> Request error means some request was in progress and was not
> completed.  I don't think a link going down should by itself cause
> an Unsupported Request error because there's no *request*.
> 
> I have a theory about what happened here.  Decoding the TLP Header
> (from PCIe r6.0, sec 2.2.1.1, 2.2.8.10) gives:
> 
>   34000000 (0011 0100 ...):
>     Fmt               001        4 DW header, no data
>     Type           1 0100        Msg, Local - Terminate at Receiver
> 
>   08000052 (0800 ... 0101 0010)
>     Requester ID     0800        00:08.0
>     Message Code     0101 0010   PTM Request
> 
> From your lspci in bugzilla, 08:00 has PTM enabled.  So my theory is
> that:
> 
>   - 08:00.0 sent a PTM Request Message (a Posted Request)
>   - 00:1d.0 received the PTM Request Message
>   - The link transitioned to DL_Down
>   - Per sec 2.9.1, 00:1d.0 discarded the Request and reported an
>     Unsupported Request
>   - Or, per sec 6.21.3, if 00:1d.0 received a PTM Request when its
>     own PTM Enable was clear, it would also be treated as an
>     Unsupported Request
> 
> So I suspect we should disable PTM on 08:00.0 before putting it in a
> low-power state.  If you manually disable PTM on 08:00.0, do these
> errors stop happening?
> 
> David did something like this [1], but just for Root Ports.  That
> looks wrong to me because sec 6.21.3 says we should not have PTM
> enabled in an Upstream Port (i.e., in a downstream device like
> 08:00.0) unless it is already enabled in the Downstream Port (i.e., in
> the Root Port 00:1d.0).
> 
> Nit: can you remove the timestamps from the log?  They add clutter but
> no useful information.
> 
> [1] https://git.kernel.org/linus/a697f072f5da
> 
> > Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=215453
> > Reviewed-by: Mika Westerberg <mika.westerberg@linux.intel.com>
> > Signed-off-by: Kai-Heng Feng <kai.heng.feng@canonical.com>
> > ---
> > v4:
> >  - Explicitly states the spec version.
> >  - Wording change. 
> > 
> > v3:
> >  - Remove reference to ACS.
> >  - Wording change.
> > 
> > v2:
> >  - Wording change.
> > 
> >  drivers/pci/pcie/aer.c | 31 +++++++++++++++++++++++++------
> >  1 file changed, 25 insertions(+), 6 deletions(-)
> > 
> > diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c
> > index 9fa1f97e5b270..e4e9d4a3098d7 100644
> > --- a/drivers/pci/pcie/aer.c
> > +++ b/drivers/pci/pcie/aer.c
> > @@ -1367,6 +1367,22 @@ static int aer_probe(struct pcie_device *dev)
> >  	return 0;
> >  }
> >  
> > +static int aer_suspend(struct pcie_device *dev)
> > +{
> > +	struct aer_rpc *rpc = get_service_data(dev);
> > +
> > +	aer_disable_rootport(rpc);
> > +	return 0;
> > +}
> > +
> > +static int aer_resume(struct pcie_device *dev)
> > +{
> > +	struct aer_rpc *rpc = get_service_data(dev);
> > +
> > +	aer_enable_rootport(rpc);
> > +	return 0;
> > +}
> > +
> >  /**
> >   * aer_root_reset - reset Root Port hierarchy, RCEC, or RCiEP
> >   * @dev: pointer to Root Port, RCEC, or RCiEP
> > @@ -1433,12 +1449,15 @@ static pci_ers_result_t aer_root_reset(struct pci_dev *dev)
> >  }
> >  
> >  static struct pcie_port_service_driver aerdriver = {
> > -	.name		= "aer",
> > -	.port_type	= PCIE_ANY_PORT,
> > -	.service	= PCIE_PORT_SERVICE_AER,
> > -
> > -	.probe		= aer_probe,
> > -	.remove		= aer_remove,
> > +	.name			= "aer",
> > +	.port_type		= PCIE_ANY_PORT,
> > +	.service		= PCIE_PORT_SERVICE_AER,
> > +	.probe			= aer_probe,
> > +	.suspend		= aer_suspend,
> > +	.resume			= aer_resume,
> > +	.runtime_suspend	= aer_suspend,
> > +	.runtime_resume		= aer_resume,
> > +	.remove			= aer_remove,
> >  };
> >  
> >  /**
> > -- 
> > 2.34.1
> > 

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH v4 2/2] PCI/DPC: Disable DPC service when link is in L2/L3 ready, L2 and L3 state
  2022-04-18  2:41   ` Sathyanarayanan Kuppuswamy
@ 2022-06-21  2:27     ` Kai-Heng Feng
  2022-06-23 17:28       ` Bjorn Helgaas
  0 siblings, 1 reply; 8+ messages in thread
From: Kai-Heng Feng @ 2022-06-21  2:27 UTC (permalink / raw)
  To: Sathyanarayanan Kuppuswamy
  Cc: linuxppc-dev, linux-pci, linux-kernel, koba.ko,
	Oliver O'Halloran, bhelgaas, mika.westerberg, baolu.lu

On Mon, Apr 18, 2022 at 10:41 AM Sathyanarayanan Kuppuswamy
<sathyanarayanan.kuppuswamy@linux.intel.com> wrote:
>
>
>
> On 4/8/22 8:31 AM, Kai-Heng Feng wrote:
> > On Intel Alder Lake platforms, Thunderbolt entering D3cold can cause
> > some errors reported by AER:
> > [   30.100211] pcieport 0000:00:1d.0: AER: Uncorrected (Non-Fatal) error received: 0000:00:1d.0
> > [   30.100251] pcieport 0000:00:1d.0: PCIe Bus Error: severity=Uncorrected (Non-Fatal), type=Transaction Layer, (Requester ID)
> > [   30.100256] pcieport 0000:00:1d.0:   device [8086:7ab0] error status/mask=00100000/00004000
> > [   30.100262] pcieport 0000:00:1d.0:    [20] UnsupReq               (First)
> > [   30.100267] pcieport 0000:00:1d.0: AER:   TLP Header: 34000000 08000052 00000000 00000000
> > [   30.100372] thunderbolt 0000:0a:00.0: AER: can't recover (no error_detected callback)
> > [   30.100401] xhci_hcd 0000:3e:00.0: AER: can't recover (no error_detected callback)
> > [   30.100427] pcieport 0000:00:1d.0: AER: device recovery failed
> >
> > Since AER is disabled in previous patch for a Link in L2/L3 Ready, L2
> > and L3, also disable DPC here as DPC depends on AER to work.
> >
> > Bugzilla:https://bugzilla.kernel.org/show_bug.cgi?id=215453
> > Reviewed-by: Mika Westerberg<mika.westerberg@linux.intel.com>
> > Signed-off-by: Kai-Heng Feng<kai.heng.feng@canonical.com>
>
> Reviewed-by: Kuppuswamy Sathyanarayanan
> <sathyanarayanan.kuppuswamy@linux.intel.com>

A gentle ping...

> --
> Sathyanarayanan Kuppuswamy
> Linux Kernel Developer

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH v4 2/2] PCI/DPC: Disable DPC service when link is in L2/L3 ready, L2 and L3 state
  2022-06-21  2:27     ` Kai-Heng Feng
@ 2022-06-23 17:28       ` Bjorn Helgaas
  0 siblings, 0 replies; 8+ messages in thread
From: Bjorn Helgaas @ 2022-06-23 17:28 UTC (permalink / raw)
  To: Kai-Heng Feng
  Cc: Sathyanarayanan Kuppuswamy, mika.westerberg, linux-pci,
	linux-kernel, koba.ko, Oliver O'Halloran, bhelgaas,
	linuxppc-dev, baolu.lu

On Tue, Jun 21, 2022 at 10:27:31AM +0800, Kai-Heng Feng wrote:
> On Mon, Apr 18, 2022 at 10:41 AM Sathyanarayanan Kuppuswamy
> <sathyanarayanan.kuppuswamy@linux.intel.com> wrote:
> > On 4/8/22 8:31 AM, Kai-Heng Feng wrote:
> > > On Intel Alder Lake platforms, Thunderbolt entering D3cold can cause
> > > some errors reported by AER:
> > > [   30.100211] pcieport 0000:00:1d.0: AER: Uncorrected (Non-Fatal) error received: 0000:00:1d.0
> > > [   30.100251] pcieport 0000:00:1d.0: PCIe Bus Error: severity=Uncorrected (Non-Fatal), type=Transaction Layer, (Requester ID)
> > > [   30.100256] pcieport 0000:00:1d.0:   device [8086:7ab0] error status/mask=00100000/00004000
> > > [   30.100262] pcieport 0000:00:1d.0:    [20] UnsupReq               (First)
> > > [   30.100267] pcieport 0000:00:1d.0: AER:   TLP Header: 34000000 08000052 00000000 00000000
> > > [   30.100372] thunderbolt 0000:0a:00.0: AER: can't recover (no error_detected callback)
> > > [   30.100401] xhci_hcd 0000:3e:00.0: AER: can't recover (no error_detected callback)
> > > [   30.100427] pcieport 0000:00:1d.0: AER: device recovery failed
> > >
> > > Since AER is disabled in previous patch for a Link in L2/L3 Ready, L2
> > > and L3, also disable DPC here as DPC depends on AER to work.
> > >
> > > Bugzilla:https://bugzilla.kernel.org/show_bug.cgi?id=215453
> > > Reviewed-by: Mika Westerberg<mika.westerberg@linux.intel.com>
> > > Signed-off-by: Kai-Heng Feng<kai.heng.feng@canonical.com>
> >
> > Reviewed-by: Kuppuswamy Sathyanarayanan
> > <sathyanarayanan.kuppuswamy@linux.intel.com>
> 
> A gentle ping...

See questions here:
https://lore.kernel.org/r/20220422222433.GA1464120@bhelgaas

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH v4 1/2] PCI/AER: Disable AER service when link is in L2/L3 ready, L2 and L3 state
  2022-04-22 22:26   ` Bjorn Helgaas
@ 2022-07-01  4:06     ` Kai-Heng Feng
  0 siblings, 0 replies; 8+ messages in thread
From: Kai-Heng Feng @ 2022-07-01  4:06 UTC (permalink / raw)
  To: Bjorn Helgaas
  Cc: Rafael J. Wysocki, sathyanarayanan.kuppuswamy, linux-pm,
	linuxppc-dev, linux-pci, linux-kernel, koba.ko, Rajvi Jingar,
	Oliver O'Halloran, david.e.box, bhelgaas, mika.westerberg,
	baolu.lu

On Sat, Apr 23, 2022 at 6:26 AM Bjorn Helgaas <helgaas@kernel.org> wrote:
>
> [+cc Rafael, linux-pm; sorry forgot this last time]
>
> On Fri, Apr 22, 2022 at 05:24:36PM -0500, Bjorn Helgaas wrote:
> > On Fri, Apr 08, 2022 at 11:31:58PM +0800, Kai-Heng Feng wrote:
> > > On Intel Alder Lake platforms, Thunderbolt entering D3cold can cause
> > > some errors reported by AER:
> > > [   30.100211] pcieport 0000:00:1d.0: AER: Uncorrected (Non-Fatal) error received: 0000:00:1d.0
> > > [   30.100251] pcieport 0000:00:1d.0: PCIe Bus Error: severity=Uncorrected (Non-Fatal), type=Transaction Layer, (Requester ID)
> > > [   30.100256] pcieport 0000:00:1d.0:   device [8086:7ab0] error status/mask=00100000/00004000
> > > [   30.100262] pcieport 0000:00:1d.0:    [20] UnsupReq               (First)
> > > [   30.100267] pcieport 0000:00:1d.0: AER:   TLP Header: 34000000 08000052 00000000 00000000
> > > [   30.100372] thunderbolt 0000:0a:00.0: AER: can't recover (no error_detected callback)
> > > [   30.100401] xhci_hcd 0000:3e:00.0: AER: can't recover (no error_detected callback)
> > > [   30.100427] pcieport 0000:00:1d.0: AER: device recovery failed
> > >
> > > So disable AER service to avoid the noises from turning power rails
> > > on/off when the device is in low power states (D3hot and D3cold), as
> > > PCIe Base Spec 5.0, section 5.2 "Link State Power Management" states
> > > that TLP and DLLP transmission is disabled for a Link in L2/L3 Ready
> > > (D3hot), L2 (D3cold with aux power) and L3 (D3cold).
> >
> > Help me walk through what's happening here, because I'm never very
> > confident about how error reporting works.  I *think* the Unsupported
> > Request error means some request was in progress and was not
> > completed.  I don't think a link going down should by itself cause
> > an Unsupported Request error because there's no *request*.
> >
> > I have a theory about what happened here.  Decoding the TLP Header
> > (from PCIe r6.0, sec 2.2.1.1, 2.2.8.10) gives:
> >
> >   34000000 (0011 0100 ...):
> >     Fmt               001        4 DW header, no data
> >     Type           1 0100        Msg, Local - Terminate at Receiver
> >
> >   08000052 (0800 ... 0101 0010)
> >     Requester ID     0800        00:08.0
> >     Message Code     0101 0010   PTM Request

Is there any TLP decoder software available? That would be really
helpful for debugging.

> >
> > From your lspci in bugzilla, 08:00 has PTM enabled.  So my theory is
> > that:
> >
> >   - 08:00.0 sent a PTM Request Message (a Posted Request)
> >   - 00:1d.0 received the PTM Request Message
> >   - The link transitioned to DL_Down
> >   - Per sec 2.9.1, 00:1d.0 discarded the Request and reported an
> >     Unsupported Request
> >   - Or, per sec 6.21.3, if 00:1d.0 received a PTM Request when its
> >     own PTM Enable was clear, it would also be treated as an
> >     Unsupported Request
> >
> > So I suspect we should disable PTM on 08:00.0 before putting it in a
> > low-power state.  If you manually disable PTM on 08:00.0, do these
> > errors stop happening?

Yes, disabling PTM on the upstream port solves the issue.
Thanks for finding the root cause!

> >
> > David did something like this [1], but just for Root Ports.  That
> > looks wrong to me because sec 6.21.3 says we should not have PTM
> > enabled in an Upstream Port (i.e., in a downstream device like
> > 08:00.0) unless it is already enabled in the Downstream Port (i.e., in
> > the Root Port 00:1d.0).

So I think it should be like this?
diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
index cfaf40a540a82..8ba8a0e12946e 100644
--- a/drivers/pci/pci.c
+++ b/drivers/pci/pci.c
@@ -2717,7 +2717,8 @@ int pci_prepare_to_sleep(struct pci_dev *dev)
         * port to enter a lower-power PM state and the SoC to reach a
         * lower-power idle state as a whole.
         */
-       if (pci_pcie_type(dev) == PCI_EXP_TYPE_ROOT_PORT)
+       if (pci_pcie_type(dev) == PCI_EXP_TYPE_ROOT_PORT ||
+           pci_pcie_type(dev) == PCI_EXP_TYPE_UPSTREAM)
                pci_disable_ptm(dev);

        pci_enable_wake(dev, target_state, wakeup);
@@ -2775,7 +2776,8 @@ int pci_finish_runtime_suspend(struct pci_dev *dev)
         * port to enter a lower-power PM state and the SoC to reach a
         * lower-power idle state as a whole.
         */
-       if (pci_pcie_type(dev) == PCI_EXP_TYPE_ROOT_PORT)
+       if (pci_pcie_type(dev) == PCI_EXP_TYPE_ROOT_PORT ||
+           pci_pcie_type(dev) == PCI_EXP_TYPE_UPSTREAM)
                pci_disable_ptm(dev);

        __pci_enable_wake(dev, target_state, pci_dev_run_wake(dev));


> >
> > Nit: can you remove the timestamps from the log?  They add clutter but
> > no useful information.

Sure.

> >
> > [1] https://git.kernel.org/linus/a697f072f5da
> >
> > > Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=215453
> > > Reviewed-by: Mika Westerberg <mika.westerberg@linux.intel.com>
> > > Signed-off-by: Kai-Heng Feng <kai.heng.feng@canonical.com>
> > > ---
> > > v4:
> > >  - Explicitly states the spec version.
> > >  - Wording change.
> > >
> > > v3:
> > >  - Remove reference to ACS.
> > >  - Wording change.
> > >
> > > v2:
> > >  - Wording change.
> > >
> > >  drivers/pci/pcie/aer.c | 31 +++++++++++++++++++++++++------
> > >  1 file changed, 25 insertions(+), 6 deletions(-)
> > >
> > > diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c
> > > index 9fa1f97e5b270..e4e9d4a3098d7 100644
> > > --- a/drivers/pci/pcie/aer.c
> > > +++ b/drivers/pci/pcie/aer.c
> > > @@ -1367,6 +1367,22 @@ static int aer_probe(struct pcie_device *dev)
> > >     return 0;
> > >  }
> > >
> > > +static int aer_suspend(struct pcie_device *dev)
> > > +{
> > > +   struct aer_rpc *rpc = get_service_data(dev);
> > > +
> > > +   aer_disable_rootport(rpc);
> > > +   return 0;
> > > +}
> > > +
> > > +static int aer_resume(struct pcie_device *dev)
> > > +{
> > > +   struct aer_rpc *rpc = get_service_data(dev);
> > > +
> > > +   aer_enable_rootport(rpc);
> > > +   return 0;
> > > +}
> > > +
> > >  /**
> > >   * aer_root_reset - reset Root Port hierarchy, RCEC, or RCiEP
> > >   * @dev: pointer to Root Port, RCEC, or RCiEP
> > > @@ -1433,12 +1449,15 @@ static pci_ers_result_t aer_root_reset(struct pci_dev *dev)
> > >  }
> > >
> > >  static struct pcie_port_service_driver aerdriver = {
> > > -   .name           = "aer",
> > > -   .port_type      = PCIE_ANY_PORT,
> > > -   .service        = PCIE_PORT_SERVICE_AER,
> > > -
> > > -   .probe          = aer_probe,
> > > -   .remove         = aer_remove,
> > > +   .name                   = "aer",
> > > +   .port_type              = PCIE_ANY_PORT,
> > > +   .service                = PCIE_PORT_SERVICE_AER,
> > > +   .probe                  = aer_probe,
> > > +   .suspend                = aer_suspend,
> > > +   .resume                 = aer_resume,
> > > +   .runtime_suspend        = aer_suspend,
> > > +   .runtime_resume         = aer_resume,
> > > +   .remove                 = aer_remove,
> > >  };
> > >
> > >  /**
> > > --
> > > 2.34.1
> > >

^ permalink raw reply related	[flat|nested] 8+ messages in thread

end of thread, other threads:[~2022-07-01  4:07 UTC | newest]

Thread overview: 8+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-04-08 15:31 [PATCH v4 1/2] PCI/AER: Disable AER service when link is in L2/L3 ready, L2 and L3 state Kai-Heng Feng
2022-04-08 15:31 ` [PATCH v4 2/2] PCI/DPC: Disable DPC " Kai-Heng Feng
2022-04-18  2:41   ` Sathyanarayanan Kuppuswamy
2022-06-21  2:27     ` Kai-Heng Feng
2022-06-23 17:28       ` Bjorn Helgaas
2022-04-22 22:24 ` [PATCH v4 1/2] PCI/AER: Disable AER " Bjorn Helgaas
2022-04-22 22:26   ` Bjorn Helgaas
2022-07-01  4:06     ` Kai-Heng Feng

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).