linux-pci.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH RESEND] PCI/AER: Use a common function to print AER error bits
@ 2018-04-17 17:09 Alexandru Gagniuc
  2018-04-26 17:27 ` Tyler Baicar
                   ` (2 more replies)
  0 siblings, 3 replies; 10+ messages in thread
From: Alexandru Gagniuc @ 2018-04-17 17:09 UTC (permalink / raw)
  To: bhelgaas, linux-pci
  Cc: gregkh, fred, linux-kernel, alex_gagniuc, austin_bolen,
	keith.busch, Alexandru Gagniuc

On errors reported from CPER, cper_print_bits() was used to log the
AER bits. This resulted in hard-to-understand messages, without a
prefix. Instead use __aer_print_error() for both native AER and CPER
to provide a more consistent log format.

Signed-off-by: Alexandru Gagniuc <mr.nuke.me@gmail.com>
---
 drivers/pci/pcie/aer/aerdrv_errprint.c | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

diff --git a/drivers/pci/pcie/aer/aerdrv_errprint.c b/drivers/pci/pcie/aer/aerdrv_errprint.c
index cfc89dd57831..cfae4d52f848 100644
--- a/drivers/pci/pcie/aer/aerdrv_errprint.c
+++ b/drivers/pci/pcie/aer/aerdrv_errprint.c
@@ -216,28 +216,30 @@ EXPORT_SYMBOL_GPL(cper_severity_to_aer);
 void cper_print_aer(struct pci_dev *dev, int aer_severity,
 		    struct aer_capability_regs *aer)
 {
-	int layer, agent, status_strs_size, tlp_header_valid = 0;
+	int layer, agent, tlp_header_valid = 0;
 	u32 status, mask;
-	const char **status_strs;
+	struct aer_err_info info;
 
 	if (aer_severity == AER_CORRECTABLE) {
 		status = aer->cor_status;
 		mask = aer->cor_mask;
-		status_strs = aer_correctable_error_string;
-		status_strs_size = ARRAY_SIZE(aer_correctable_error_string);
 	} else {
 		status = aer->uncor_status;
 		mask = aer->uncor_mask;
-		status_strs = aer_uncorrectable_error_string;
-		status_strs_size = ARRAY_SIZE(aer_uncorrectable_error_string);
 		tlp_header_valid = status & AER_LOG_TLP_MASKS;
 	}
 
 	layer = AER_GET_LAYER_ERROR(aer_severity, status);
 	agent = AER_GET_AGENT(aer_severity, status);
 
+	memset(&info, 0, sizeof(info));
+	info.severity = aer_severity;
+	info.status = status;
+	info.mask = mask;
+	info.first_error = 0x1f;
+
 	pci_err(dev, "aer_status: 0x%08x, aer_mask: 0x%08x\n", status, mask);
-	cper_print_bits("", status, status_strs, status_strs_size);
+	__aer_print_error(dev, &info);
 	pci_err(dev, "aer_layer=%s, aer_agent=%s\n",
 		aer_error_layer[layer], aer_agent_string[agent]);
 
-- 
2.14.3

^ permalink raw reply related	[flat|nested] 10+ messages in thread

* Re: [PATCH RESEND] PCI/AER: Use a common function to print AER error bits
  2018-04-17 17:09 [PATCH RESEND] PCI/AER: Use a common function to print AER error bits Alexandru Gagniuc
@ 2018-04-26 17:27 ` Tyler Baicar
  2018-04-27 22:43 ` Bjorn Helgaas
  2018-04-30 19:52 ` [PATCH v2] " Alexandru Gagniuc
  2 siblings, 0 replies; 10+ messages in thread
From: Tyler Baicar @ 2018-04-26 17:27 UTC (permalink / raw)
  To: Alexandru Gagniuc, bhelgaas, linux-pci
  Cc: gregkh, fred, linux-kernel, alex_gagniuc, austin_bolen, keith.busch

On 4/17/2018 1:09 PM, Alexandru Gagniuc wrote:
> On errors reported from CPER, cper_print_bits() was used to log the
> AER bits. This resulted in hard-to-understand messages, without a
> prefix. Instead use __aer_print_error() for both native AER and CPER
> to provide a more consistent log format.
>
> Signed-off-by: Alexandru Gagniuc <mr.nuke.me@gmail.com>
Tested-by: Tyler Baicar <tbaicar@codeaurora.org>

Thanks!
> ---
>   drivers/pci/pcie/aer/aerdrv_errprint.c | 16 +++++++++-------
>   1 file changed, 9 insertions(+), 7 deletions(-)
>
> diff --git a/drivers/pci/pcie/aer/aerdrv_errprint.c b/drivers/pci/pcie/aer/aerdrv_errprint.c
> index cfc89dd57831..cfae4d52f848 100644
> --- a/drivers/pci/pcie/aer/aerdrv_errprint.c
> +++ b/drivers/pci/pcie/aer/aerdrv_errprint.c
> @@ -216,28 +216,30 @@ EXPORT_SYMBOL_GPL(cper_severity_to_aer);
>   void cper_print_aer(struct pci_dev *dev, int aer_severity,
>   		    struct aer_capability_regs *aer)
>   {
> -	int layer, agent, status_strs_size, tlp_header_valid = 0;
> +	int layer, agent, tlp_header_valid = 0;
>   	u32 status, mask;
> -	const char **status_strs;
> +	struct aer_err_info info;
>   
>   	if (aer_severity == AER_CORRECTABLE) {
>   		status = aer->cor_status;
>   		mask = aer->cor_mask;
> -		status_strs = aer_correctable_error_string;
> -		status_strs_size = ARRAY_SIZE(aer_correctable_error_string);
>   	} else {
>   		status = aer->uncor_status;
>   		mask = aer->uncor_mask;
> -		status_strs = aer_uncorrectable_error_string;
> -		status_strs_size = ARRAY_SIZE(aer_uncorrectable_error_string);
>   		tlp_header_valid = status & AER_LOG_TLP_MASKS;
>   	}
>   
>   	layer = AER_GET_LAYER_ERROR(aer_severity, status);
>   	agent = AER_GET_AGENT(aer_severity, status);
>   
> +	memset(&info, 0, sizeof(info));
> +	info.severity = aer_severity;
> +	info.status = status;
> +	info.mask = mask;
> +	info.first_error = 0x1f;
> +
>   	pci_err(dev, "aer_status: 0x%08x, aer_mask: 0x%08x\n", status, mask);
> -	cper_print_bits("", status, status_strs, status_strs_size);
> +	__aer_print_error(dev, &info);
>   	pci_err(dev, "aer_layer=%s, aer_agent=%s\n",
>   		aer_error_layer[layer], aer_agent_string[agent]);
>   

-- 
Qualcomm Datacenter Technologies, Inc. as an affiliate of Qualcomm Technologies, Inc.
Qualcomm Technologies, Inc. is a member of the Code Aurora Forum,
a Linux Foundation Collaborative Project.

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH RESEND] PCI/AER: Use a common function to print AER error bits
  2018-04-17 17:09 [PATCH RESEND] PCI/AER: Use a common function to print AER error bits Alexandru Gagniuc
  2018-04-26 17:27 ` Tyler Baicar
@ 2018-04-27 22:43 ` Bjorn Helgaas
  2018-04-28 16:46   ` Alex G.
  2018-04-30 19:52 ` [PATCH v2] " Alexandru Gagniuc
  2 siblings, 1 reply; 10+ messages in thread
From: Bjorn Helgaas @ 2018-04-27 22:43 UTC (permalink / raw)
  To: Alexandru Gagniuc
  Cc: bhelgaas, linux-pci, gregkh, fred, linux-kernel, alex_gagniuc,
	austin_bolen, keith.busch

On Tue, Apr 17, 2018 at 12:09:43PM -0500, Alexandru Gagniuc wrote:
> On errors reported from CPER, cper_print_bits() was used to log the
> AER bits. This resulted in hard-to-understand messages, without a
> prefix. Instead use __aer_print_error() for both native AER and CPER
> to provide a more consistent log format.
> 
> Signed-off-by: Alexandru Gagniuc <mr.nuke.me@gmail.com>
> ---
>  drivers/pci/pcie/aer/aerdrv_errprint.c | 16 +++++++++-------
>  1 file changed, 9 insertions(+), 7 deletions(-)
> 
> diff --git a/drivers/pci/pcie/aer/aerdrv_errprint.c b/drivers/pci/pcie/aer/aerdrv_errprint.c
> index cfc89dd57831..cfae4d52f848 100644
> --- a/drivers/pci/pcie/aer/aerdrv_errprint.c
> +++ b/drivers/pci/pcie/aer/aerdrv_errprint.c
> @@ -216,28 +216,30 @@ EXPORT_SYMBOL_GPL(cper_severity_to_aer);
>  void cper_print_aer(struct pci_dev *dev, int aer_severity,
>  		    struct aer_capability_regs *aer)
>  {
> -	int layer, agent, status_strs_size, tlp_header_valid = 0;
> +	int layer, agent, tlp_header_valid = 0;
>  	u32 status, mask;
> -	const char **status_strs;
> +	struct aer_err_info info;
>  
>  	if (aer_severity == AER_CORRECTABLE) {
>  		status = aer->cor_status;
>  		mask = aer->cor_mask;
> -		status_strs = aer_correctable_error_string;
> -		status_strs_size = ARRAY_SIZE(aer_correctable_error_string);
>  	} else {
>  		status = aer->uncor_status;
>  		mask = aer->uncor_mask;
> -		status_strs = aer_uncorrectable_error_string;
> -		status_strs_size = ARRAY_SIZE(aer_uncorrectable_error_string);
>  		tlp_header_valid = status & AER_LOG_TLP_MASKS;
>  	}
>  
>  	layer = AER_GET_LAYER_ERROR(aer_severity, status);
>  	agent = AER_GET_AGENT(aer_severity, status);
>  
> +	memset(&info, 0, sizeof(info));
> +	info.severity = aer_severity;
> +	info.status = status;
> +	info.mask = mask;
> +	info.first_error = 0x1f;

I like this patch a lot, but where does this "first_error = 0x1f" come
from?

I assume this is supposed to be the "First Error Pointer" in the
Advanced Error Capabilities and Control register (PCIe r4.0, sec
7.8.4.7).  There is a "cap_control" field in struct
aer_capability_regs; should we be using that here?

> +
>  	pci_err(dev, "aer_status: 0x%08x, aer_mask: 0x%08x\n", status, mask);
> -	cper_print_bits("", status, status_strs, status_strs_size);
> +	__aer_print_error(dev, &info);
>  	pci_err(dev, "aer_layer=%s, aer_agent=%s\n",
>  		aer_error_layer[layer], aer_agent_string[agent]);
>  
> -- 
> 2.14.3
> 

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH RESEND] PCI/AER: Use a common function to print AER error bits
  2018-04-27 22:43 ` Bjorn Helgaas
@ 2018-04-28 16:46   ` Alex G.
  2018-04-28 17:07     ` Alex G.
  0 siblings, 1 reply; 10+ messages in thread
From: Alex G. @ 2018-04-28 16:46 UTC (permalink / raw)
  To: Bjorn Helgaas
  Cc: bhelgaas, linux-pci, gregkh, fred, linux-kernel, alex_gagniuc,
	austin_bolen, keith.busch

On 04/27/2018 05:43 PM, Bjorn Helgaas wrote:
> On Tue, Apr 17, 2018 at 12:09:43PM -0500, Alexandru Gagniuc wrote:
>> On errors reported from CPER, cper_print_bits() was used to log the
>> AER bits. This resulted in hard-to-understand messages, without a
>> prefix. Instead use __aer_print_error() for both native AER and CPER
>> to provide a more consistent log format.
>>
>> Signed-off-by: Alexandru Gagniuc <mr.nuke.me@gmail.com>
>> ---
>>   drivers/pci/pcie/aer/aerdrv_errprint.c | 16 +++++++++-------
>>   1 file changed, 9 insertions(+), 7 deletions(-)
>>
>> diff --git a/drivers/pci/pcie/aer/aerdrv_errprint.c b/drivers/pci/pcie/aer/aerdrv_errprint.c
>> index cfc89dd57831..cfae4d52f848 100644
>> --- a/drivers/pci/pcie/aer/aerdrv_errprint.c
>> +++ b/drivers/pci/pcie/aer/aerdrv_errprint.c
>> @@ -216,28 +216,30 @@ EXPORT_SYMBOL_GPL(cper_severity_to_aer);
>>   void cper_print_aer(struct pci_dev *dev, int aer_severity,
>>   		    struct aer_capability_regs *aer)
>>   {
>> -	int layer, agent, status_strs_size, tlp_header_valid = 0;
>> +	int layer, agent, tlp_header_valid = 0;
>>   	u32 status, mask;
>> -	const char **status_strs;
>> +	struct aer_err_info info;
>>   
>>   	if (aer_severity == AER_CORRECTABLE) {
>>   		status = aer->cor_status;
>>   		mask = aer->cor_mask;
>> -		status_strs = aer_correctable_error_string;
>> -		status_strs_size = ARRAY_SIZE(aer_correctable_error_string);
>>   	} else {
>>   		status = aer->uncor_status;
>>   		mask = aer->uncor_mask;
>> -		status_strs = aer_uncorrectable_error_string;
>> -		status_strs_size = ARRAY_SIZE(aer_uncorrectable_error_string);
>>   		tlp_header_valid = status & AER_LOG_TLP_MASKS;
>>   	}
>>   
>>   	layer = AER_GET_LAYER_ERROR(aer_severity, status);
>>   	agent = AER_GET_AGENT(aer_severity, status);
>>   
>> +	memset(&info, 0, sizeof(info));
>> +	info.severity = aer_severity;
>> +	info.status = status;
>> +	info.mask = mask;
>> +	info.first_error = 0x1f;
> 
> I like this patch a lot, but where does this "first_error = 0x1f" come
> from?

The aer_(un)correctable_error_string arrays don't go up to index [0x1f],
so this guarantees that we don't print "(First)".

> I assume this is supposed to be the "First Error Pointer" in the
> Advanced Error Capabilities and Control register (PCIe r4.0, sec
> 7.8.4.7).  There is a "cap_control" field in struct
> aer_capability_regs; should we be using that here?

There is a way to extract it from the PCI regs, and it's quite simple. 
IIRC, it should be all f's when the capability is not implemented. I 
wanted to avoid any further parsing of PCI regs in this patch.

I can see a way to use even more common printk code, but that requires 
validating the PCI regs we get from firmware. That means we need to make 
a guarantee about CPER that is beyond the scope of this patch.

Alex

>> +
>>   	pci_err(dev, "aer_status: 0x%08x, aer_mask: 0x%08x\n", status, mask);
>> -	cper_print_bits("", status, status_strs, status_strs_size);
>> +	__aer_print_error(dev, &info);
>>   	pci_err(dev, "aer_layer=%s, aer_agent=%s\n",
>>   		aer_error_layer[layer], aer_agent_string[agent]);
>>   
>> -- 
>> 2.14.3
>>

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH RESEND] PCI/AER: Use a common function to print AER error bits
  2018-04-28 16:46   ` Alex G.
@ 2018-04-28 17:07     ` Alex G.
  2018-04-30 17:15       ` Bjorn Helgaas
  0 siblings, 1 reply; 10+ messages in thread
From: Alex G. @ 2018-04-28 17:07 UTC (permalink / raw)
  To: Bjorn Helgaas
  Cc: bhelgaas, linux-pci, gregkh, fred, linux-kernel, alex_gagniuc,
	austin_bolen, keith.busch

On 04/28/2018 11:46 AM, Alex G. wrote:
> On 04/27/2018 05:43 PM, Bjorn Helgaas wrote:
>> On Tue, Apr 17, 2018 at 12:09:43PM -0500, Alexandru Gagniuc wrote:
(snip)
>>> +    memset(&info, 0, sizeof(info));
>>> +    info.severity = aer_severity;
>>> +    info.status = status;
>>> +    info.mask = mask;
>>> +    info.first_error = 0x1f;
>>
>> I like this patch a lot, but where does this "first_error = 0x1f" come
>> from?
> 
> aer_(un)correctable_error_string don't go to [0x1f], so this guarantees 
> us we don't print "(First)".
> 
>> I assume this is supposed to be the "First Error Pointer" in the
>> Advanced Error Capabilities and Control register (PCIe r4.0, sec
>> 7.8.4.7).  There is a "cap_control" field in struct
>> aer_capability_regs; should we be using that here?
> 
> There is a way to extract it from the PCI regs, and it's quite simple. 
> IIRC, it should be all f's when the capability is not implemented. I 
> wanted to avoid any further parsing of PCI regs in this patch.

I could update the offending line to say:
  +	info.first_error = PCI_ERR_CAP_FEP(aer->cap_control);

Though I still have the concerns with validating CPER data:

> I can see a way to use even more common printk code, but that requires 
> validating the PCI regs we get from firmware. That means we need to make 
> a guarantee about CPER that is beyond the scope of this patch.
> 
> Alex
> 

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH RESEND] PCI/AER: Use a common function to print AER error bits
  2018-04-28 17:07     ` Alex G.
@ 2018-04-30 17:15       ` Bjorn Helgaas
  2018-04-30 17:41         ` Alex G.
  0 siblings, 1 reply; 10+ messages in thread
From: Bjorn Helgaas @ 2018-04-30 17:15 UTC (permalink / raw)
  To: Alex G.
  Cc: bhelgaas, linux-pci, gregkh, fred, linux-kernel, alex_gagniuc,
	austin_bolen, keith.busch

On Sat, Apr 28, 2018 at 12:07:48PM -0500, Alex G. wrote:
> On 04/28/2018 11:46 AM, Alex G. wrote:
> > On 04/27/2018 05:43 PM, Bjorn Helgaas wrote:
> > > On Tue, Apr 17, 2018 at 12:09:43PM -0500, Alexandru Gagniuc wrote:
> (snip)
> > > > +    memset(&info, 0, sizeof(info));
> > > > +    info.severity = aer_severity;
> > > > +    info.status = status;
> > > > +    info.mask = mask;
> > > > +    info.first_error = 0x1f;
> > > 
> > > I like this patch a lot, but where does this "first_error = 0x1f" come
> > > from?
> > 
> > aer_(un)correctable_error_string don't go to [0x1f], so this guarantees
> > us we don't print "(First)".
> > 
> > > I assume this is supposed to be the "First Error Pointer" in the
> > > Advanced Error Capabilities and Control register (PCIe r4.0, sec
> > > 7.8.4.7).  There is a "cap_control" field in struct
> > > aer_capability_regs; should we be using that here?
> > 
> > There is a way to extract it from the PCI regs, and it's quite simple.
> > IIRC, it should be all f's when the capability is not implemented. I
> > wanted to avoid any further parsing of PCI regs in this patch.
> 
> I could update the offending line to say:
>  +	info.first_error = PCI_ERR_CAP_FEP(aer->cap_control);

That's what I would have expected.  So I'd say either do this, or add
a comment about why it's not the right thing to do.

> Though I still have the concerns with validating CPER data:
> 
> > I can see a way to use even more common printk code, but that requires
> > validating the PCI regs we get from firmware. That means we need to make
> > a guarantee about CPER that is beyond the scope of this patch.

Sounds like this is material for another patch, but if/when you do
that, I'd like to understand your concern about validating the
registers we get from firmware.  Are you worried about getting
incorrect register contents, then printing the wrong info, making
the wrong decision about how to recover, or something else?

Bjorn

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH RESEND] PCI/AER: Use a common function to print AER error bits
  2018-04-30 17:15       ` Bjorn Helgaas
@ 2018-04-30 17:41         ` Alex G.
  2018-05-07 22:06           ` Bjorn Helgaas
  0 siblings, 1 reply; 10+ messages in thread
From: Alex G. @ 2018-04-30 17:41 UTC (permalink / raw)
  To: Bjorn Helgaas
  Cc: bhelgaas, linux-pci, gregkh, fred, linux-kernel, alex_gagniuc,
	austin_bolen, keith.busch



On 04/30/2018 12:15 PM, Bjorn Helgaas wrote:
> On Sat, Apr 28, 2018 at 12:07:48PM -0500, Alex G. wrote:

(snip)
>> I could update the offending line to say:
>>  +	info.first_error = PCI_ERR_CAP_FEP(aer->cap_control);
> 
> That's what I would have expected.  So I'd say either do this, or add
> a comment about why it's not the right thing to do.

Okay.

>> Though I still have the concerns with validating CPER data:
>>
>>> I can see a way to use even more common printk code, but that requires
>>> validating the PCI regs we get from firmware. That means we need to make
>>> a guarantee about CPER that is beyond the scope of this patch.
> 
> Sounds like this is material for another patch, but if/when you do
> that, I'd like to understand your concern about validating the
> registers we get from firmware.  Are you worried about getting
> incorrect register contents, then printing the wrong info, making
> the wrong decision about how to recover, something else?

I don't trust firmware, and I have daymares about firmware leaving these
fields uninitialized. In jargon, I'd like to treat it as external
untrusted serialized data.

Alex

^ permalink raw reply	[flat|nested] 10+ messages in thread

* [PATCH v2] PCI/AER: Use a common function to print AER error bits
  2018-04-17 17:09 [PATCH RESEND] PCI/AER: Use a common function to print AER error bits Alexandru Gagniuc
  2018-04-26 17:27 ` Tyler Baicar
  2018-04-27 22:43 ` Bjorn Helgaas
@ 2018-04-30 19:52 ` Alexandru Gagniuc
  2018-05-07 22:13   ` Bjorn Helgaas
  2 siblings, 1 reply; 10+ messages in thread
From: Alexandru Gagniuc @ 2018-04-30 19:52 UTC (permalink / raw)
  To: bhelgaas
  Cc: alex_gagniuc, austin_bolen, shyam_iyer, Alexandru Gagniuc,
	Frederick Lawler, Greg Kroah-Hartman, open list:PCI SUBSYSTEM,
	open list

On errors reported from CPER, cper_print_bits() was used to log the
AER bits. This resulted in hard-to-understand messages, without a
prefix. Instead use __aer_print_error() for both native AER and CPER
to provide a more consistent log format.

Signed-off-by: Alexandru Gagniuc <mr.nuke.me@gmail.com>
---

Changes since v1:
  - Parse AER regs for the first error pointer instead of disabling it.
    On the Dell machine where I tested this, the first error pointer is
    reported correctly by firmware.

 drivers/pci/pcie/aer/aerdrv_errprint.c | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

diff --git a/drivers/pci/pcie/aer/aerdrv_errprint.c b/drivers/pci/pcie/aer/aerdrv_errprint.c
index cfc89dd57831..b5612cc51b63 100644
--- a/drivers/pci/pcie/aer/aerdrv_errprint.c
+++ b/drivers/pci/pcie/aer/aerdrv_errprint.c
@@ -216,28 +216,30 @@ EXPORT_SYMBOL_GPL(cper_severity_to_aer);
 void cper_print_aer(struct pci_dev *dev, int aer_severity,
 		    struct aer_capability_regs *aer)
 {
-	int layer, agent, status_strs_size, tlp_header_valid = 0;
+	int layer, agent, tlp_header_valid = 0;
 	u32 status, mask;
-	const char **status_strs;
+	struct aer_err_info info;
 
 	if (aer_severity == AER_CORRECTABLE) {
 		status = aer->cor_status;
 		mask = aer->cor_mask;
-		status_strs = aer_correctable_error_string;
-		status_strs_size = ARRAY_SIZE(aer_correctable_error_string);
 	} else {
 		status = aer->uncor_status;
 		mask = aer->uncor_mask;
-		status_strs = aer_uncorrectable_error_string;
-		status_strs_size = ARRAY_SIZE(aer_uncorrectable_error_string);
 		tlp_header_valid = status & AER_LOG_TLP_MASKS;
 	}
 
 	layer = AER_GET_LAYER_ERROR(aer_severity, status);
 	agent = AER_GET_AGENT(aer_severity, status);
 
+	memset(&info, 0, sizeof(info));
+	info.severity = aer_severity;
+	info.status = status;
+	info.mask = mask;
+	info.first_error = PCI_ERR_CAP_FEP(aer->cap_control);
+
 	pci_err(dev, "aer_status: 0x%08x, aer_mask: 0x%08x\n", status, mask);
-	cper_print_bits("", status, status_strs, status_strs_size);
+	__aer_print_error(dev, &info);
 	pci_err(dev, "aer_layer=%s, aer_agent=%s\n",
 		aer_error_layer[layer], aer_agent_string[agent]);
 
-- 
2.14.3

^ permalink raw reply related	[flat|nested] 10+ messages in thread

* Re: [PATCH RESEND] PCI/AER: Use a common function to print AER error bits
  2018-04-30 17:41         ` Alex G.
@ 2018-05-07 22:06           ` Bjorn Helgaas
  0 siblings, 0 replies; 10+ messages in thread
From: Bjorn Helgaas @ 2018-05-07 22:06 UTC (permalink / raw)
  To: Alex G.
  Cc: bhelgaas, linux-pci, gregkh, fred, linux-kernel, alex_gagniuc,
	austin_bolen, keith.busch

On Mon, Apr 30, 2018 at 12:41:26PM -0500, Alex G. wrote:
> On 04/30/2018 12:15 PM, Bjorn Helgaas wrote:
> > On Sat, Apr 28, 2018 at 12:07:48PM -0500, Alex G. wrote:
> 
> (snip)
> >> I could update the offending line to say:
> >>  +	info.first_error = PCI_ERR_CAP_FEP(aer->cap_control);
> > 
> > That's what I would have expected.  So I'd say either do this, or add
> > a comment about why it's not the right thing to do.
> 
> Okay.
> 
> >> Though I still have the concerns with validating CPER data:
> >>
> >>> I can see a way to use even more common printk code, but that requires
> >>> validating the PCI regs we get from firmware. That means we need to make
> >>> a guarantee about CPER that is beyond the scope of this patch.
> > 
> > Sounds like this is material for another patch, but if/when you do
> > that, I'd like to understand your concern about validating the
> > registers we get from firmware.  Are you worried about getting
> > incorrect register contents, then printing the wrong info, making
> > the wrong decision about how to recover, something else?
> 
> I don't trust firmware, and I have daymares about firmware leaving these
> fields uninitialized. In jargon, I'd like to treat it as external
> untrusted serialized data.

That makes good sense to me.

In this particular case, we only test first_error for equality:

  __aer_print_error(...)
  {
    ...

      pci_err(dev, "   [%2d] %-22s%s\n", i, errmsg,
	info->first_error == i ? " (First)" : "");

so I don't think there's any danger.  If we were using it to index an
array or something, we should certainly validate it first.

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH v2] PCI/AER: Use a common function to print AER error bits
  2018-04-30 19:52 ` [PATCH v2] " Alexandru Gagniuc
@ 2018-05-07 22:13   ` Bjorn Helgaas
  0 siblings, 0 replies; 10+ messages in thread
From: Bjorn Helgaas @ 2018-05-07 22:13 UTC (permalink / raw)
  To: Alexandru Gagniuc
  Cc: bhelgaas, alex_gagniuc, austin_bolen, shyam_iyer,
	Frederick Lawler, Greg Kroah-Hartman, open list:PCI SUBSYSTEM,
	open list

On Mon, Apr 30, 2018 at 02:52:15PM -0500, Alexandru Gagniuc wrote:
> On errors reported from CPER, cper_print_bits() was used to log the
> AER bits. This resulted in hard-to-understand messages, without a
> prefix. Instead use __aer_print_error() for both native AER and CPER
> to provide a more consistent log format.
> 
> Signed-off-by: Alexandru Gagniuc <mr.nuke.me@gmail.com>

Applied to pci/aer for v4.18, thanks!

> ---
> 
> Changes since v1:
>   - Parse aer regs for first error pointer instead of disabling it
> On the Dell machine where I tested this, the first error pointer is
> reported correctly by firmware.
> 
>  drivers/pci/pcie/aer/aerdrv_errprint.c | 16 +++++++++-------
>  1 file changed, 9 insertions(+), 7 deletions(-)
> 
> diff --git a/drivers/pci/pcie/aer/aerdrv_errprint.c b/drivers/pci/pcie/aer/aerdrv_errprint.c
> index cfc89dd57831..b5612cc51b63 100644
> --- a/drivers/pci/pcie/aer/aerdrv_errprint.c
> +++ b/drivers/pci/pcie/aer/aerdrv_errprint.c
> @@ -216,28 +216,30 @@ EXPORT_SYMBOL_GPL(cper_severity_to_aer);
>  void cper_print_aer(struct pci_dev *dev, int aer_severity,
>  		    struct aer_capability_regs *aer)
>  {
> -	int layer, agent, status_strs_size, tlp_header_valid = 0;
> +	int layer, agent, tlp_header_valid = 0;
>  	u32 status, mask;
> -	const char **status_strs;
> +	struct aer_err_info info;
>  
>  	if (aer_severity == AER_CORRECTABLE) {
>  		status = aer->cor_status;
>  		mask = aer->cor_mask;
> -		status_strs = aer_correctable_error_string;
> -		status_strs_size = ARRAY_SIZE(aer_correctable_error_string);
>  	} else {
>  		status = aer->uncor_status;
>  		mask = aer->uncor_mask;
> -		status_strs = aer_uncorrectable_error_string;
> -		status_strs_size = ARRAY_SIZE(aer_uncorrectable_error_string);
>  		tlp_header_valid = status & AER_LOG_TLP_MASKS;
>  	}
>  
>  	layer = AER_GET_LAYER_ERROR(aer_severity, status);
>  	agent = AER_GET_AGENT(aer_severity, status);
>  
> +	memset(&info, 0, sizeof(info));
> +	info.severity = aer_severity;
> +	info.status = status;
> +	info.mask = mask;
> +	info.first_error = PCI_ERR_CAP_FEP(aer->cap_control);
> +
>  	pci_err(dev, "aer_status: 0x%08x, aer_mask: 0x%08x\n", status, mask);
> -	cper_print_bits("", status, status_strs, status_strs_size);
> +	__aer_print_error(dev, &info);
>  	pci_err(dev, "aer_layer=%s, aer_agent=%s\n",
>  		aer_error_layer[layer], aer_agent_string[agent]);
>  
> -- 
> 2.14.3
> 

^ permalink raw reply	[flat|nested] 10+ messages in thread

end of thread, other threads:[~2018-05-07 22:14 UTC | newest]

Thread overview: 10+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2018-04-17 17:09 [PATCH RESEND] PCI/AER: Use a common function to print AER error bits Alexandru Gagniuc
2018-04-26 17:27 ` Tyler Baicar
2018-04-27 22:43 ` Bjorn Helgaas
2018-04-28 16:46   ` Alex G.
2018-04-28 17:07     ` Alex G.
2018-04-30 17:15       ` Bjorn Helgaas
2018-04-30 17:41         ` Alex G.
2018-05-07 22:06           ` Bjorn Helgaas
2018-04-30 19:52 ` [PATCH v2] " Alexandru Gagniuc
2018-05-07 22:13   ` Bjorn Helgaas

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).