From: Rajat Khandelwal <rajat.khandelwal@linux.intel.com> To: ruscur@russell.cc, oohall@gmail.com, bhelgaas@google.com Cc: linuxppc-dev@lists.ozlabs.org, linux-pci@vger.kernel.org, linux-kernel@vger.kernel.org, rajat.khandelwal@intel.com, Rajat Khandelwal <rajat.khandelwal@linux.intel.com> Subject: [PATCH] PCI/AER: Rate limit the reporting of the correctable errors Date: Tue, 3 Jan 2023 22:25:48 +0530 [thread overview] Message-ID: <20230103165548.570377-1-rajat.khandelwal@linux.intel.com> (raw) There are many instances where correctable errors tend to inundate the message buffer. We observe such instances during thunderbolt PCIe tunneling. It's true that they are mitigated by the hardware and are non-fatal but we shouldn't be spamming the logs with such correctable errors as it confuses other kernel developers less familiar with PCI errors, support staff, and users who happen to look at the logs, hence rate limit them. A typical example log inside an HP TBT4 dock: [54912.661142] pcieport 0000:00:07.0: AER: Multiple Corrected error received: 0000:2b:00.0 [54912.661194] igc 0000:2b:00.0: PCIe Bus Error: severity=Corrected, type=Data Link Layer, (Transmitter ID) [54912.661203] igc 0000:2b:00.0: device [8086:5502] error status/mask=00001100/00002000 [54912.661211] igc 0000:2b:00.0: [ 8] Rollover [54912.661219] igc 0000:2b:00.0: [12] Timeout [54982.838760] pcieport 0000:00:07.0: AER: Corrected error received: 0000:2b:00.0 [54982.838798] igc 0000:2b:00.0: PCIe Bus Error: severity=Corrected, type=Data Link Layer, (Transmitter ID) [54982.838808] igc 0000:2b:00.0: device [8086:5502] error status/mask=00001000/00002000 [54982.838817] igc 0000:2b:00.0: [12] Timeout This gets repeated continuously, thus inundating the buffer. Signed-off-by: Rajat Khandelwal <rajat.khandelwal@linux.intel.com> --- drivers/pci/pcie/aer.c | 54 +++++++++++++++++++++++++++--------------- include/linux/pci.h | 3 +++ 2 files changed, 38 insertions(+), 19 deletions(-) diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c index e2d8a74f83c3..7ae6761a8e59 100644 --- a/drivers/pci/pcie/aer.c +++ b/drivers/pci/pcie/aer.c @@ -684,23 +684,24 @@ static void __aer_print_error(struct pci_dev *dev, { const char **strings; unsigned long status = info->status & ~info->mask; - const char *level, *errmsg; + const char *errmsg; int i; - if (info->severity == AER_CORRECTABLE) { + if (info->severity == AER_CORRECTABLE) strings = aer_correctable_error_string; - level = KERN_WARNING; - } else { + else strings = aer_uncorrectable_error_string; - level = KERN_ERR; - } for_each_set_bit(i, &status, 32) { errmsg = strings[i]; if (!errmsg) errmsg = "Unknown Error Bit"; - pci_printk(level, dev, " [%2d] %-22s%s\n", i, errmsg, + if (info->severity == AER_CORRECTABLE) + pci_warn_ratelimited(dev, " [%2d] %-22s%s\n", i, errmsg, + info->first_error == i ? " (First)" : ""); + else + pci_err(dev, " [%2d] %-22s%s\n", i, errmsg, info->first_error == i ? " (First)" : ""); } pci_dev_aer_stats_incr(dev, info); @@ -710,7 +711,6 @@ void aer_print_error(struct pci_dev *dev, struct aer_err_info *info) { int layer, agent; int id = ((dev->bus->number << 8) | dev->devfn); - const char *level; if (!info->status) { pci_err(dev, "PCIe Bus Error: severity=%s, type=Inaccessible, (Unregistered Agent ID)\n", @@ -721,14 +718,21 @@ void aer_print_error(struct pci_dev *dev, struct aer_err_info *info) layer = AER_GET_LAYER_ERROR(info->severity, info->status); agent = AER_GET_AGENT(info->severity, info->status); - level = (info->severity == AER_CORRECTABLE) ? KERN_WARNING : KERN_ERR; + if (info->severity == AER_CORRECTABLE) { + pci_warn_ratelimited(dev, "PCIe Bus Error: severity=%s, type=%s, (%s)\n", + aer_error_severity_string[info->severity], + aer_error_layer[layer], aer_agent_string[agent]); - pci_printk(level, dev, "PCIe Bus Error: severity=%s, type=%s, (%s)\n", - aer_error_severity_string[info->severity], - aer_error_layer[layer], aer_agent_string[agent]); + pci_warn_ratelimited(dev, " device [%04x:%04x] error status/mask=%08x/%08x\n", + dev->vendor, dev->device, info->status, info->mask); + } else { + pci_err(dev, "PCIe Bus Error: severity=%s, type=%s, (%s)\n", + aer_error_severity_string[info->severity], + aer_error_layer[layer], aer_agent_string[agent]); - pci_printk(level, dev, " device [%04x:%04x] error status/mask=%08x/%08x\n", - dev->vendor, dev->device, info->status, info->mask); + pci_err(dev, " device [%04x:%04x] error status/mask=%08x/%08x\n", + dev->vendor, dev->device, info->status, info->mask); + } __aer_print_error(dev, info); @@ -748,11 +755,19 @@ static void aer_print_port_info(struct pci_dev *dev, struct aer_err_info *info) u8 bus = info->id >> 8; u8 devfn = info->id & 0xff; - pci_info(dev, "%s%s error received: %04x:%02x:%02x.%d\n", - info->multi_error_valid ? "Multiple " : "", - aer_error_severity_string[info->severity], - pci_domain_nr(dev->bus), bus, PCI_SLOT(devfn), - PCI_FUNC(devfn)); + if (info->severity == AER_CORRECTABLE) + pci_info_ratelimited(dev, "%s%s error received: %04x:%02x:%02x.%d\n", + info->multi_error_valid ? "Multiple " : "", + aer_error_severity_string[info->severity], + pci_domain_nr(dev->bus), bus, PCI_SLOT(devfn), + PCI_FUNC(devfn)); + else + pci_info(dev, "%s%s error received: %04x:%02x:%02x.%d\n", + info->multi_error_valid ? "Multiple " : "", + aer_error_severity_string[info->severity], + pci_domain_nr(dev->bus), bus, PCI_SLOT(devfn), + PCI_FUNC(devfn)); + } #ifdef CONFIG_ACPI_APEI_PCIEAER diff --git a/include/linux/pci.h b/include/linux/pci.h index 060af91bafcd..d9434bae10c8 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -2491,6 +2491,9 @@ void pci_uevent_ers(struct pci_dev *pdev, enum pci_ers_result err_type); #define pci_info_ratelimited(pdev, fmt, arg...) \ dev_info_ratelimited(&(pdev)->dev, fmt, ##arg) +#define pci_warn_ratelimited(pdev, fmt, arg...) \ + dev_warn_ratelimited(&(pdev)->dev, fmt, ##arg) + #define pci_WARN(pdev, condition, fmt, arg...) \ WARN(condition, "%s %s: " fmt, \ dev_driver_string(&(pdev)->dev), pci_name(pdev), ##arg) -- 2.34.1
WARNING: multiple messages have this Message-ID (diff)
From: Rajat Khandelwal <rajat.khandelwal@linux.intel.com> To: ruscur@russell.cc, oohall@gmail.com, bhelgaas@google.com Cc: linux-pci@vger.kernel.org, Rajat Khandelwal <rajat.khandelwal@linux.intel.com>, linuxppc-dev@lists.ozlabs.org, linux-kernel@vger.kernel.org, rajat.khandelwal@intel.com Subject: [PATCH] PCI/AER: Rate limit the reporting of the correctable errors Date: Tue, 3 Jan 2023 22:25:48 +0530 [thread overview] Message-ID: <20230103165548.570377-1-rajat.khandelwal@linux.intel.com> (raw) There are many instances where correctable errors tend to inundate the message buffer. We observe such instances during thunderbolt PCIe tunneling. It's true that they are mitigated by the hardware and are non-fatal but we shouldn't be spamming the logs with such correctable errors as it confuses other kernel developers less familiar with PCI errors, support staff, and users who happen to look at the logs, hence rate limit them. A typical example log inside an HP TBT4 dock: [54912.661142] pcieport 0000:00:07.0: AER: Multiple Corrected error received: 0000:2b:00.0 [54912.661194] igc 0000:2b:00.0: PCIe Bus Error: severity=Corrected, type=Data Link Layer, (Transmitter ID) [54912.661203] igc 0000:2b:00.0: device [8086:5502] error status/mask=00001100/00002000 [54912.661211] igc 0000:2b:00.0: [ 8] Rollover [54912.661219] igc 0000:2b:00.0: [12] Timeout [54982.838760] pcieport 0000:00:07.0: AER: Corrected error received: 0000:2b:00.0 [54982.838798] igc 0000:2b:00.0: PCIe Bus Error: severity=Corrected, type=Data Link Layer, (Transmitter ID) [54982.838808] igc 0000:2b:00.0: device [8086:5502] error status/mask=00001000/00002000 [54982.838817] igc 0000:2b:00.0: [12] Timeout This gets repeated continuously, thus inundating the buffer. Signed-off-by: Rajat Khandelwal <rajat.khandelwal@linux.intel.com> --- drivers/pci/pcie/aer.c | 54 +++++++++++++++++++++++++++--------------- include/linux/pci.h | 3 +++ 2 files changed, 38 insertions(+), 19 deletions(-) diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c index e2d8a74f83c3..7ae6761a8e59 100644 --- a/drivers/pci/pcie/aer.c +++ b/drivers/pci/pcie/aer.c @@ -684,23 +684,24 @@ static void __aer_print_error(struct pci_dev *dev, { const char **strings; unsigned long status = info->status & ~info->mask; - const char *level, *errmsg; + const char *errmsg; int i; - if (info->severity == AER_CORRECTABLE) { + if (info->severity == AER_CORRECTABLE) strings = aer_correctable_error_string; - level = KERN_WARNING; - } else { + else strings = aer_uncorrectable_error_string; - level = KERN_ERR; - } for_each_set_bit(i, &status, 32) { errmsg = strings[i]; if (!errmsg) errmsg = "Unknown Error Bit"; - pci_printk(level, dev, " [%2d] %-22s%s\n", i, errmsg, + if (info->severity == AER_CORRECTABLE) + pci_warn_ratelimited(dev, " [%2d] %-22s%s\n", i, errmsg, + info->first_error == i ? " (First)" : ""); + else + pci_err(dev, " [%2d] %-22s%s\n", i, errmsg, info->first_error == i ? " (First)" : ""); } pci_dev_aer_stats_incr(dev, info); @@ -710,7 +711,6 @@ void aer_print_error(struct pci_dev *dev, struct aer_err_info *info) { int layer, agent; int id = ((dev->bus->number << 8) | dev->devfn); - const char *level; if (!info->status) { pci_err(dev, "PCIe Bus Error: severity=%s, type=Inaccessible, (Unregistered Agent ID)\n", @@ -721,14 +718,21 @@ void aer_print_error(struct pci_dev *dev, struct aer_err_info *info) layer = AER_GET_LAYER_ERROR(info->severity, info->status); agent = AER_GET_AGENT(info->severity, info->status); - level = (info->severity == AER_CORRECTABLE) ? KERN_WARNING : KERN_ERR; + if (info->severity == AER_CORRECTABLE) { + pci_warn_ratelimited(dev, "PCIe Bus Error: severity=%s, type=%s, (%s)\n", + aer_error_severity_string[info->severity], + aer_error_layer[layer], aer_agent_string[agent]); - pci_printk(level, dev, "PCIe Bus Error: severity=%s, type=%s, (%s)\n", - aer_error_severity_string[info->severity], - aer_error_layer[layer], aer_agent_string[agent]); + pci_warn_ratelimited(dev, " device [%04x:%04x] error status/mask=%08x/%08x\n", + dev->vendor, dev->device, info->status, info->mask); + } else { + pci_err(dev, "PCIe Bus Error: severity=%s, type=%s, (%s)\n", + aer_error_severity_string[info->severity], + aer_error_layer[layer], aer_agent_string[agent]); - pci_printk(level, dev, " device [%04x:%04x] error status/mask=%08x/%08x\n", - dev->vendor, dev->device, info->status, info->mask); + pci_err(dev, " device [%04x:%04x] error status/mask=%08x/%08x\n", + dev->vendor, dev->device, info->status, info->mask); + } __aer_print_error(dev, info); @@ -748,11 +755,19 @@ static void aer_print_port_info(struct pci_dev *dev, struct aer_err_info *info) u8 bus = info->id >> 8; u8 devfn = info->id & 0xff; - pci_info(dev, "%s%s error received: %04x:%02x:%02x.%d\n", - info->multi_error_valid ? "Multiple " : "", - aer_error_severity_string[info->severity], - pci_domain_nr(dev->bus), bus, PCI_SLOT(devfn), - PCI_FUNC(devfn)); + if (info->severity == AER_CORRECTABLE) + pci_info_ratelimited(dev, "%s%s error received: %04x:%02x:%02x.%d\n", + info->multi_error_valid ? "Multiple " : "", + aer_error_severity_string[info->severity], + pci_domain_nr(dev->bus), bus, PCI_SLOT(devfn), + PCI_FUNC(devfn)); + else + pci_info(dev, "%s%s error received: %04x:%02x:%02x.%d\n", + info->multi_error_valid ? "Multiple " : "", + aer_error_severity_string[info->severity], + pci_domain_nr(dev->bus), bus, PCI_SLOT(devfn), + PCI_FUNC(devfn)); + } #ifdef CONFIG_ACPI_APEI_PCIEAER diff --git a/include/linux/pci.h b/include/linux/pci.h index 060af91bafcd..d9434bae10c8 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -2491,6 +2491,9 @@ void pci_uevent_ers(struct pci_dev *pdev, enum pci_ers_result err_type); #define pci_info_ratelimited(pdev, fmt, arg...) \ dev_info_ratelimited(&(pdev)->dev, fmt, ##arg) +#define pci_warn_ratelimited(pdev, fmt, arg...) \ + dev_warn_ratelimited(&(pdev)->dev, fmt, ##arg) + #define pci_WARN(pdev, condition, fmt, arg...) \ WARN(condition, "%s %s: " fmt, \ dev_driver_string(&(pdev)->dev), pci_name(pdev), ##arg) -- 2.34.1
next reply other threads:[~2023-01-03 16:55 UTC|newest] Thread overview: 8+ messages / expand[flat|nested] mbox.gz Atom feed top 2023-01-03 16:55 Rajat Khandelwal [this message] 2023-01-03 16:55 ` [PATCH] PCI/AER: Rate limit the reporting of the correctable errors Rajat Khandelwal 2023-01-03 19:14 ` Bjorn Helgaas 2023-01-03 19:14 ` Bjorn Helgaas 2023-01-04 4:57 ` Rajat Khandelwal 2023-01-04 6:46 ` Leon Romanovsky 2023-01-04 6:46 ` Leon Romanovsky 2023-01-04 13:04 ` Rajat Khandelwal
Reply instructions: You may reply publicly to this message via plain-text email using any one of the following methods: * Save the following mbox file, import it into your mail client, and reply-to-all from there: mbox Avoid top-posting and favor interleaved quoting: https://en.wikipedia.org/wiki/Posting_style#Interleaved_style * Reply using the --to, --cc, and --in-reply-to switches of git-send-email(1): git send-email \ --in-reply-to=20230103165548.570377-1-rajat.khandelwal@linux.intel.com \ --to=rajat.khandelwal@linux.intel.com \ --cc=bhelgaas@google.com \ --cc=linux-kernel@vger.kernel.org \ --cc=linux-pci@vger.kernel.org \ --cc=linuxppc-dev@lists.ozlabs.org \ --cc=oohall@gmail.com \ --cc=rajat.khandelwal@intel.com \ --cc=ruscur@russell.cc \ /path/to/YOUR_REPLY https://kernel.org/pub/software/scm/git/docs/git-send-email.html * If your mail client supports setting the In-Reply-To header via mailto: links, try the mailto: linkBe sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes, see mirroring instructions on how to clone and mirror all data and code used by this external index.