linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH 0/2] Restructure and fix GHES PCIe AER handling
@ 2017-11-08 19:13 Tyler Baicar
  2017-11-08 19:13 ` [PATCH 1/2] acpi: apei: handle PCIe AER errors in separate function Tyler Baicar
  2017-11-08 19:13 ` [PATCH V3 2/2] acpi: apei: call into AER handling regardless of severity Tyler Baicar
  0 siblings, 2 replies; 14+ messages in thread
From: Tyler Baicar @ 2017-11-08 19:13 UTC (permalink / raw)
  To: rjw, tony.luck, bp, bp, will.deacon, james.morse, linux-acpi,
	linux-kernel
  Cc: Tyler Baicar

First, break the PCIe AER handling out into its own function to separate
it from the standard GHES processing

Then fix the AER handling to process all errors in the AER driver rather
than only handling recoverable errors.

Tyler Baicar (2):
  acpi: apei: handle PCIe AER errors in separate function
  acpi: apei: call into AER handling regardless of severity

 drivers/acpi/apei/ghes.c | 62 +++++++++++++++++++++++++-----------------------
 1 file changed, 32 insertions(+), 30 deletions(-)

-- 
Qualcomm Datacenter Technologies, Inc. as an affiliate of Qualcomm Technologies, Inc.
Qualcomm Technologies, Inc. is a member of the Code Aurora Forum,
a Linux Foundation Collaborative Project.

^ permalink raw reply	[flat|nested] 14+ messages in thread

* [PATCH 1/2] acpi: apei: handle PCIe AER errors in separate function
  2017-11-08 19:13 [PATCH 0/2] Restructure and fix GHES PCIe AER handling Tyler Baicar
@ 2017-11-08 19:13 ` Tyler Baicar
  2017-11-09 17:21   ` Borislav Petkov
  2017-11-08 19:13 ` [PATCH V3 2/2] acpi: apei: call into AER handling regardless of severity Tyler Baicar
  1 sibling, 1 reply; 14+ messages in thread
From: Tyler Baicar @ 2017-11-08 19:13 UTC (permalink / raw)
  To: rjw, tony.luck, bp, bp, will.deacon, james.morse, linux-acpi,
	linux-kernel
  Cc: Tyler Baicar

Move PCIe AER error handling code into a separate function.

Signed-off-by: Tyler Baicar <tbaicar@codeaurora.org>
---
 drivers/acpi/apei/ghes.c | 64 +++++++++++++++++++++++++-----------------------
 1 file changed, 34 insertions(+), 30 deletions(-)

diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c
index 3c3a37b..839c3d5 100644
--- a/drivers/acpi/apei/ghes.c
+++ b/drivers/acpi/apei/ghes.c
@@ -458,6 +458,39 @@ static void ghes_handle_memory_failure(struct acpi_hest_generic_data *gdata, int
 #endif
 }
 
+static void ghes_handle_aer(struct acpi_hest_generic_data *gdata, int sev, int sec_sev)
+{
+#ifdef CONFIG_ACPI_APEI_PCIEAER
+	struct cper_sec_pcie *pcie_err = acpi_hest_get_payload(gdata);
+
+	if (sev == GHES_SEV_RECOVERABLE &&
+	    sec_sev == GHES_SEV_RECOVERABLE &&
+	    pcie_err->validation_bits & CPER_PCIE_VALID_DEVICE_ID &&
+	    pcie_err->validation_bits & CPER_PCIE_VALID_AER_INFO) {
+		unsigned int devfn;
+		int aer_severity;
+
+		devfn = PCI_DEVFN(pcie_err->device_id.device,
+				  pcie_err->device_id.function);
+		aer_severity = cper_severity_to_aer(gdata->error_severity);
+
+		/*
+		 * If firmware reset the component to contain
+		 * the error, we must reinitialize it before
+		 * use, so treat it as a fatal AER error.
+		 */
+		if (gdata->flags & CPER_SEC_RESET)
+			aer_severity = AER_FATAL;
+
+		aer_recover_queue(pcie_err->device_id.segment,
+				  pcie_err->device_id.bus,
+				  devfn, aer_severity,
+				  (struct aer_capability_regs *)
+				  pcie_err->aer_info);
+	}
+#endif
+}
+
 static void ghes_do_proc(struct ghes *ghes,
 			 const struct acpi_hest_generic_status *estatus)
 {
@@ -485,38 +518,9 @@ static void ghes_do_proc(struct ghes *ghes,
 			arch_apei_report_mem_error(sev, mem_err);
 			ghes_handle_memory_failure(gdata, sev);
 		}
-#ifdef CONFIG_ACPI_APEI_PCIEAER
 		else if (guid_equal(sec_type, &CPER_SEC_PCIE)) {
-			struct cper_sec_pcie *pcie_err = acpi_hest_get_payload(gdata);
-
-			if (sev == GHES_SEV_RECOVERABLE &&
-			    sec_sev == GHES_SEV_RECOVERABLE &&
-			    pcie_err->validation_bits & CPER_PCIE_VALID_DEVICE_ID &&
-			    pcie_err->validation_bits & CPER_PCIE_VALID_AER_INFO) {
-				unsigned int devfn;
-				int aer_severity;
-
-				devfn = PCI_DEVFN(pcie_err->device_id.device,
-						  pcie_err->device_id.function);
-				aer_severity = cper_severity_to_aer(gdata->error_severity);
-
-				/*
-				 * If firmware reset the component to contain
-				 * the error, we must reinitialize it before
-				 * use, so treat it as a fatal AER error.
-				 */
-				if (gdata->flags & CPER_SEC_RESET)
-					aer_severity = AER_FATAL;
-
-				aer_recover_queue(pcie_err->device_id.segment,
-						  pcie_err->device_id.bus,
-						  devfn, aer_severity,
-						  (struct aer_capability_regs *)
-						  pcie_err->aer_info);
-			}
-
+			ghes_handle_aer(gdata, sev, sec_sev);
 		}
-#endif
 		else if (guid_equal(sec_type, &CPER_SEC_PROC_ARM)) {
 			struct cper_sec_proc_arm *err = acpi_hest_get_payload(gdata);
 
-- 
Qualcomm Datacenter Technologies, Inc. as an affiliate of Qualcomm Technologies, Inc.
Qualcomm Technologies, Inc. is a member of the Code Aurora Forum,
a Linux Foundation Collaborative Project.

^ permalink raw reply related	[flat|nested] 14+ messages in thread

* [PATCH V3 2/2] acpi: apei: call into AER handling regardless of severity
  2017-11-08 19:13 [PATCH 0/2] Restructure and fix GHES PCIe AER handling Tyler Baicar
  2017-11-08 19:13 ` [PATCH 1/2] acpi: apei: handle PCIe AER errors in separate function Tyler Baicar
@ 2017-11-08 19:13 ` Tyler Baicar
  2017-11-09  9:46   ` Borislav Petkov
  2017-11-13 12:36   ` Dongdong Liu
  1 sibling, 2 replies; 14+ messages in thread
From: Tyler Baicar @ 2017-11-08 19:13 UTC (permalink / raw)
  To: rjw, tony.luck, bp, bp, will.deacon, james.morse, linux-acpi,
	linux-kernel
  Cc: Tyler Baicar

Currently the GHES code only calls into the AER driver for
recoverable type errors. This is incorrect because errors of
other severities do not get logged by the AER driver and do not
get exposed to user space via the AER trace event. So, call
into the AER driver for PCIe errors regardless of the severity

Signed-off-by: Tyler Baicar <tbaicar@codeaurora.org>
---
 drivers/acpi/apei/ghes.c | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c
index 839c3d5..bb65fa6 100644
--- a/drivers/acpi/apei/ghes.c
+++ b/drivers/acpi/apei/ghes.c
@@ -458,14 +458,12 @@ static void ghes_handle_memory_failure(struct acpi_hest_generic_data *gdata, int
 #endif
 }
 
-static void ghes_handle_aer(struct acpi_hest_generic_data *gdata, int sev, int sec_sev)
+static void ghes_handle_aer(struct acpi_hest_generic_data *gdata)
 {
 #ifdef CONFIG_ACPI_APEI_PCIEAER
 	struct cper_sec_pcie *pcie_err = acpi_hest_get_payload(gdata);
 
-	if (sev == GHES_SEV_RECOVERABLE &&
-	    sec_sev == GHES_SEV_RECOVERABLE &&
-	    pcie_err->validation_bits & CPER_PCIE_VALID_DEVICE_ID &&
+	if (pcie_err->validation_bits & CPER_PCIE_VALID_DEVICE_ID &&
 	    pcie_err->validation_bits & CPER_PCIE_VALID_AER_INFO) {
 		unsigned int devfn;
 		int aer_severity;
@@ -519,7 +517,7 @@ static void ghes_do_proc(struct ghes *ghes,
 			ghes_handle_memory_failure(gdata, sev);
 		}
 		else if (guid_equal(sec_type, &CPER_SEC_PCIE)) {
-			ghes_handle_aer(gdata, sev, sec_sev);
+			ghes_handle_aer(gdata);
 		}
 		else if (guid_equal(sec_type, &CPER_SEC_PROC_ARM)) {
 			struct cper_sec_proc_arm *err = acpi_hest_get_payload(gdata);
-- 
Qualcomm Datacenter Technologies, Inc. as an affiliate of Qualcomm Technologies, Inc.
Qualcomm Technologies, Inc. is a member of the Code Aurora Forum,
a Linux Foundation Collaborative Project.

^ permalink raw reply related	[flat|nested] 14+ messages in thread

* Re: [PATCH V3 2/2] acpi: apei: call into AER handling regardless of severity
  2017-11-08 19:13 ` [PATCH V3 2/2] acpi: apei: call into AER handling regardless of severity Tyler Baicar
@ 2017-11-09  9:46   ` Borislav Petkov
  2017-11-09 14:37     ` Tyler Baicar
  2017-11-09 15:14     ` Tyler Baicar
  2017-11-13 12:36   ` Dongdong Liu
  1 sibling, 2 replies; 14+ messages in thread
From: Borislav Petkov @ 2017-11-09  9:46 UTC (permalink / raw)
  To: Tyler Baicar
  Cc: rjw, tony.luck, will.deacon, james.morse, linux-acpi, linux-kernel

On Wed, Nov 08, 2017 at 12:13:12PM -0700, Tyler Baicar wrote:
> Currently the GHES code only calls into the AER driver for
> recoverable type errors. This is incorrect because errors of
> other severities do not get logged by the AER driver and do not
> get exposed to user space via the AER trace event. So, call
> into the AER driver for PCIe errors regardless of the severity
> 
> Signed-off-by: Tyler Baicar <tbaicar@codeaurora.org>
> ---
>  drivers/acpi/apei/ghes.c | 8 +++-----
>  1 file changed, 3 insertions(+), 5 deletions(-)
> 
> diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c
> index 839c3d5..bb65fa6 100644
> --- a/drivers/acpi/apei/ghes.c
> +++ b/drivers/acpi/apei/ghes.c
> @@ -458,14 +458,12 @@ static void ghes_handle_memory_failure(struct acpi_hest_generic_data *gdata, int
>  #endif
>  }

Where did the explanatory comment go?

+/*
+ * PCIe AER errors need to be sent to the AER driver for reporting and
+ * recovery. The GHES severities map to the following AER severities and
+ * require the following handling:
+ *
+ * GHES_SEV_CORRECTABLE -> AER_CORRECTABLE
+ *     These need to be reported by the AER driver but no recovery is
+ *     necessary.
+ * GHES_SEV_RECOVERABLE -> AER_NONFATAL
+ * GHES_SEV_RECOVERABLE && CPER_SEC_RESET -> AER_FATAL
+ *     These both need to be reported and recovered from by the AER driver.
+ * GHES_SEV_PANIC does not make it to this handling since the kernel must
+ *     panic.
+ */

<--- ???

> -static void ghes_handle_aer(struct acpi_hest_generic_data *gdata, int sev, int sec_sev)
> +static void ghes_handle_aer(struct acpi_hest_generic_data *gdata)
>  {
>  #ifdef CONFIG_ACPI_APEI_PCIEAER
>  	struct cper_sec_pcie *pcie_err = acpi_hest_get_payload(gdata);
>  
> -	if (sev == GHES_SEV_RECOVERABLE &&
> -	    sec_sev == GHES_SEV_RECOVERABLE &&
> -	    pcie_err->validation_bits & CPER_PCIE_VALID_DEVICE_ID &&
> +	if (pcie_err->validation_bits & CPER_PCIE_VALID_DEVICE_ID &&
>  	    pcie_err->validation_bits & CPER_PCIE_VALID_AER_INFO) {
>  		unsigned int devfn;
>  		int aer_severity;
> @@ -519,7 +517,7 @@ static void ghes_do_proc(struct ghes *ghes,
>  			ghes_handle_memory_failure(gdata, sev);
>  		}
>  		else if (guid_equal(sec_type, &CPER_SEC_PCIE)) {
> -			ghes_handle_aer(gdata, sev, sec_sev);
> +			ghes_handle_aer(gdata);
>  		}
>  		else if (guid_equal(sec_type, &CPER_SEC_PROC_ARM)) {
>  			struct cper_sec_proc_arm *err = acpi_hest_get_payload(gdata);
> -- 

-- 
Regards/Gruss,
    Boris.

SUSE Linux GmbH, GF: Felix Imendörffer, Jane Smithard, Graham Norton, HRB 21284 (AG Nürnberg)
-- 

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH V3 2/2] acpi: apei: call into AER handling regardless of severity
  2017-11-09  9:46   ` Borislav Petkov
@ 2017-11-09 14:37     ` Tyler Baicar
  2017-11-09 15:01       ` Borislav Petkov
  2017-11-09 15:14     ` Tyler Baicar
  1 sibling, 1 reply; 14+ messages in thread
From: Tyler Baicar @ 2017-11-09 14:37 UTC (permalink / raw)
  To: Borislav Petkov
  Cc: rjw, tony.luck, will.deacon, james.morse, linux-acpi, linux-kernel

On 11/9/2017 4:46 AM, Borislav Petkov wrote:
> On Wed, Nov 08, 2017 at 12:13:12PM -0700, Tyler Baicar wrote:
>> Currently the GHES code only calls into the AER driver for
>> recoverable type errors. This is incorrect because errors of
>> other severities do not get logged by the AER driver and do not
>> get exposed to user space via the AER trace event. So, call
>> into the AER driver for PCIe errors regardless of the severity
>>
>> Signed-off-by: Tyler Baicar <tbaicar@codeaurora.org>
>> ---
>>   drivers/acpi/apei/ghes.c | 8 +++-----
>>   1 file changed, 3 insertions(+), 5 deletions(-)
>>
>> diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c
>> index 839c3d5..bb65fa6 100644
>> --- a/drivers/acpi/apei/ghes.c
>> +++ b/drivers/acpi/apei/ghes.c
>> @@ -458,14 +458,12 @@ static void ghes_handle_memory_failure(struct acpi_hest_generic_data *gdata, int
>>   #endif
>>   }
> Where did the explanatory comment go?
Ah, forgot to but that back in. I'll send an update shortly.

Thanks,
Tyler

-- 
Qualcomm Datacenter Technologies, Inc. as an affiliate of Qualcomm Technologies, Inc.
Qualcomm Technologies, Inc. is a member of the Code Aurora Forum,
a Linux Foundation Collaborative Project.

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH V3 2/2] acpi: apei: call into AER handling regardless of severity
  2017-11-09 14:37     ` Tyler Baicar
@ 2017-11-09 15:01       ` Borislav Petkov
  0 siblings, 0 replies; 14+ messages in thread
From: Borislav Petkov @ 2017-11-09 15:01 UTC (permalink / raw)
  To: Tyler Baicar
  Cc: rjw, tony.luck, will.deacon, james.morse, linux-acpi, linux-kernel

On Thu, Nov 09, 2017 at 09:37:45AM -0500, Tyler Baicar wrote:
> Ah, forgot to but that back in. I'll send an update shortly.

Just the one patch which needs updating pls, as a reply to the the
respective message.

Thx.

-- 
Regards/Gruss,
    Boris.

SUSE Linux GmbH, GF: Felix Imendörffer, Jane Smithard, Graham Norton, HRB 21284 (AG Nürnberg)
-- 

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH V3 2/2] acpi: apei: call into AER handling regardless of severity
  2017-11-09  9:46   ` Borislav Petkov
  2017-11-09 14:37     ` Tyler Baicar
@ 2017-11-09 15:14     ` Tyler Baicar
  2017-11-09 17:29       ` Borislav Petkov
  1 sibling, 1 reply; 14+ messages in thread
From: Tyler Baicar @ 2017-11-09 15:14 UTC (permalink / raw)
  To: Borislav Petkov
  Cc: rjw, tony.luck, will.deacon, james.morse, linux-acpi, linux-kernel

On 11/9/2017 4:46 AM, Borislav Petkov wrote:
> On Wed, Nov 08, 2017 at 12:13:12PM -0700, Tyler Baicar wrote:
>> Currently the GHES code only calls into the AER driver for
>> recoverable type errors. This is incorrect because errors of
>> other severities do not get logged by the AER driver and do not
>> get exposed to user space via the AER trace event. So, call
>> into the AER driver for PCIe errors regardless of the severity
>>
>> Signed-off-by: Tyler Baicar <tbaicar@codeaurora.org>
>> ---
>>   drivers/acpi/apei/ghes.c | 8 +++-----
>>   1 file changed, 3 insertions(+), 5 deletions(-)
>>
>> diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c
>> index 839c3d5..bb65fa6 100644
>> --- a/drivers/acpi/apei/ghes.c
>> +++ b/drivers/acpi/apei/ghes.c
>> @@ -458,14 +458,12 @@ static void ghes_handle_memory_failure(struct acpi_hest_generic_data *gdata, int
>>   #endif
>>   }
> Where did the explanatory comment go?
>
> +/*
> + * PCIe AER errors need to be sent to the AER driver for reporting and
> + * recovery. The GHES severities map to the following AER severities and
> + * require the following handling:
> + *
> + * GHES_SEV_CORRECTABLE -> AER_CORRECTABLE
> + *     These need to be reported by the AER driver but no recovery is
> + *     necessary.
> + * GHES_SEV_RECOVERABLE -> AER_NONFATAL
> + * GHES_SEV_RECOVERABLE && CPER_SEC_RESET -> AER_FATAL
> + *     These both need to be reported and recovered from by the AER driver.
> + * GHES_SEV_PANIC does not make it to this handling since the kernel must
> + *     panic.
> + */
>
> <--- ???
Updated patch including the comment:

Currently the GHES code only calls into the AER driver for
recoverable type errors. This is incorrect because errors of
other severities do not get logged by the AER driver and do not
get exposed to user space via the AER trace event. So, call
into the AER driver for PCIe errors regardless of the severity

Signed-off-by: Tyler Baicar <tbaicar@codeaurora.org>
---
  drivers/acpi/apei/ghes.c | 22 +++++++++++++++++-----
  1 file changed, 17 insertions(+), 5 deletions(-)

diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c
index 839c3d5..15dbf65 100644
--- a/drivers/acpi/apei/ghes.c
+++ b/drivers/acpi/apei/ghes.c
@@ -458,14 +458,26 @@ static void ghes_handle_memory_failure(struct acpi_hest_generic_data *gdata, int
  #endif
  }
  
-static void ghes_handle_aer(struct acpi_hest_generic_data *gdata, int sev, int sec_sev)
+/*
+ * PCIe AER errors need to be sent to the AER driver for reporting and
+ * recovery. The GHES severities map to the following AER severities and
+ * require the following handling:
+ *
+ * GHES_SEV_CORRECTABLE -> AER_CORRECTABLE
+ *     These need to be reported by the AER driver but no recovery is
+ *     necessary.
+ * GHES_SEV_RECOVERABLE -> AER_NONFATAL
+ * GHES_SEV_RECOVERABLE && CPER_SEC_RESET -> AER_FATAL
+ *     These both need to be reported and recovered from by the AER driver.
+ * GHES_SEV_PANIC does not make it to this handling since the kernel must
+ *     panic.
+ */
+static void ghes_handle_aer(struct acpi_hest_generic_data *gdata)
  {
  #ifdef CONFIG_ACPI_APEI_PCIEAER
  	struct cper_sec_pcie *pcie_err = acpi_hest_get_payload(gdata);
  
-	if (sev == GHES_SEV_RECOVERABLE &&
-	    sec_sev == GHES_SEV_RECOVERABLE &&
-	    pcie_err->validation_bits & CPER_PCIE_VALID_DEVICE_ID &&
+	if (pcie_err->validation_bits & CPER_PCIE_VALID_DEVICE_ID &&
  	    pcie_err->validation_bits & CPER_PCIE_VALID_AER_INFO) {
  		unsigned int devfn;
  		int aer_severity;
@@ -519,7 +531,7 @@ static void ghes_do_proc(struct ghes *ghes,
  			ghes_handle_memory_failure(gdata, sev);
  		}
  		else if (guid_equal(sec_type, &CPER_SEC_PCIE)) {
-			ghes_handle_aer(gdata, sev, sec_sev);
+			ghes_handle_aer(gdata);
  		}
  		else if (guid_equal(sec_type, &CPER_SEC_PROC_ARM)) {
  			struct cper_sec_proc_arm *err = acpi_hest_get_payload(gdata);
--

Thanks,
Tyler

-- 
Qualcomm Datacenter Technologies, Inc. as an affiliate of Qualcomm Technologies, Inc.
Qualcomm Technologies, Inc. is a member of the Code Aurora Forum,
a Linux Foundation Collaborative Project.

^ permalink raw reply related	[flat|nested] 14+ messages in thread

* Re: [PATCH 1/2] acpi: apei: handle PCIe AER errors in separate function
  2017-11-08 19:13 ` [PATCH 1/2] acpi: apei: handle PCIe AER errors in separate function Tyler Baicar
@ 2017-11-09 17:21   ` Borislav Petkov
  0 siblings, 0 replies; 14+ messages in thread
From: Borislav Petkov @ 2017-11-09 17:21 UTC (permalink / raw)
  To: Tyler Baicar
  Cc: rjw, tony.luck, will.deacon, james.morse, linux-acpi, linux-kernel

On Wed, Nov 08, 2017 at 12:13:11PM -0700, Tyler Baicar wrote:
> Move PCIe AER error handling code into a separate function.
> 
> Signed-off-by: Tyler Baicar <tbaicar@codeaurora.org>
> ---
>  drivers/acpi/apei/ghes.c | 64 +++++++++++++++++++++++++-----------------------
>  1 file changed, 34 insertions(+), 30 deletions(-)

Reviewed-by: Borislav Petkov <bp@suse.de>

-- 
Regards/Gruss,
    Boris.

SUSE Linux GmbH, GF: Felix Imendörffer, Jane Smithard, Graham Norton, HRB 21284 (AG Nürnberg)
-- 

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH V3 2/2] acpi: apei: call into AER handling regardless of severity
  2017-11-09 15:14     ` Tyler Baicar
@ 2017-11-09 17:29       ` Borislav Petkov
  0 siblings, 0 replies; 14+ messages in thread
From: Borislav Petkov @ 2017-11-09 17:29 UTC (permalink / raw)
  To: Tyler Baicar
  Cc: rjw, tony.luck, will.deacon, james.morse, linux-acpi, linux-kernel

On Thu, Nov 09, 2017 at 10:14:35AM -0500, Tyler Baicar wrote:
> On 11/9/2017 4:46 AM, Borislav Petkov wrote:
> > On Wed, Nov 08, 2017 at 12:13:12PM -0700, Tyler Baicar wrote:
> > > Currently the GHES code only calls into the AER driver for
> > > recoverable type errors. This is incorrect because errors of
> > > other severities do not get logged by the AER driver and do not
> > > get exposed to user space via the AER trace event. So, call
> > > into the AER driver for PCIe errors regardless of the severity
> > > 
> > > Signed-off-by: Tyler Baicar <tbaicar@codeaurora.org>
> > > ---
> > >   drivers/acpi/apei/ghes.c | 8 +++-----
> > >   1 file changed, 3 insertions(+), 5 deletions(-)
> > > 
> > > diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c
> > > index 839c3d5..bb65fa6 100644
> > > --- a/drivers/acpi/apei/ghes.c
> > > +++ b/drivers/acpi/apei/ghes.c
> > > @@ -458,14 +458,12 @@ static void ghes_handle_memory_failure(struct acpi_hest_generic_data *gdata, int
> > >   #endif
> > >   }
> > Where did the explanatory comment go?
> > 
> > +/*
> > + * PCIe AER errors need to be sent to the AER driver for reporting and
> > + * recovery. The GHES severities map to the following AER severities and
> > + * require the following handling:
> > + *
> > + * GHES_SEV_CORRECTABLE -> AER_CORRECTABLE
> > + *     These need to be reported by the AER driver but no recovery is
> > + *     necessary.
> > + * GHES_SEV_RECOVERABLE -> AER_NONFATAL
> > + * GHES_SEV_RECOVERABLE && CPER_SEC_RESET -> AER_FATAL
> > + *     These both need to be reported and recovered from by the AER driver.
> > + * GHES_SEV_PANIC does not make it to this handling since the kernel must
> > + *     panic.
> > + */
> > 
> > <--- ???
> Updated patch including the comment:

When you decide to do the reckless thing of pasting a patch into
thunderbird on *windoze*, first send it to yourself only and try
applying it.

Because I see this:

[boris@pd: ~/kernel/linux> test-apply.sh /tmp/tbaicar.02
checking file drivers/acpi/apei/ghes.c
patch: **** malformed patch at line 64: @@ -519,7 +531,7 @@ static void ghes_do_proc(struct ghes *ghes,

Not good.

-- 
Regards/Gruss,
    Boris.

SUSE Linux GmbH, GF: Felix Imendörffer, Jane Smithard, Graham Norton, HRB 21284 (AG Nürnberg)
-- 

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH V3 2/2] acpi: apei: call into AER handling regardless of severity
  2017-11-08 19:13 ` [PATCH V3 2/2] acpi: apei: call into AER handling regardless of severity Tyler Baicar
  2017-11-09  9:46   ` Borislav Petkov
@ 2017-11-13 12:36   ` Dongdong Liu
  2017-11-13 15:34     ` Tyler Baicar
  1 sibling, 1 reply; 14+ messages in thread
From: Dongdong Liu @ 2017-11-13 12:36 UTC (permalink / raw)
  To: Tyler Baicar, rjw, tony.luck, bp, bp, will.deacon, james.morse,
	linux-acpi, linux-kernel


在 2017/11/9 3:13, Tyler Baicar 写道:
> Currently the GHES code only calls into the AER driver for
> recoverable type errors. This is incorrect because errors of
> other severities do not get logged by the AER driver and do not
> get exposed to user space via the AER trace event. So, call
> into the AER driver for PCIe errors regardless of the severity

It will also call do_recovery() regardless of the severity for AER correctable errors.
Correctable errors include those error conditions where hardware can recover without any loss of information.
Hardware corrects these errors and software intervention is not required.
So we'd better modify the code as below.
diff --git a/drivers/pci/pcie/aer/aerdrv_core.c b/drivers/pci/pcie/aer/aerdrv_core.c
index 7448052..a7f77549 100644
--- a/drivers/pci/pcie/aer/aerdrv_core.c
+++ b/drivers/pci/pcie/aer/aerdrv_core.c
@@ -633,7 +633,8 @@ static void aer_recover_work_func(struct work_struct *work)
                         continue;
                 }
                 cper_print_aer(pdev, entry.severity, entry.regs);
-           do_recovery(pdev, entry.severity);
+         if(entry.severity != AER_CORRECTABLE)
+                 do_recovery(pdev, entry.severity);
                 pci_dev_put(pdev);
         }
  }

Thanks,
Dongdong
>
> Signed-off-by: Tyler Baicar <tbaicar@codeaurora.org>
> ---
>  drivers/acpi/apei/ghes.c | 8 +++-----
>  1 file changed, 3 insertions(+), 5 deletions(-)
>
> diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c
> index 839c3d5..bb65fa6 100644
> --- a/drivers/acpi/apei/ghes.c
> +++ b/drivers/acpi/apei/ghes.c
> @@ -458,14 +458,12 @@ static void ghes_handle_memory_failure(struct acpi_hest_generic_data *gdata, int
>  #endif
>  }
>
> -static void ghes_handle_aer(struct acpi_hest_generic_data *gdata, int sev, int sec_sev)
> +static void ghes_handle_aer(struct acpi_hest_generic_data *gdata)
>  {
>  #ifdef CONFIG_ACPI_APEI_PCIEAER
>  	struct cper_sec_pcie *pcie_err = acpi_hest_get_payload(gdata);
>
> -	if (sev == GHES_SEV_RECOVERABLE &&
> -	    sec_sev == GHES_SEV_RECOVERABLE &&
> -	    pcie_err->validation_bits & CPER_PCIE_VALID_DEVICE_ID &&
> +	if (pcie_err->validation_bits & CPER_PCIE_VALID_DEVICE_ID &&
>  	    pcie_err->validation_bits & CPER_PCIE_VALID_AER_INFO) {
>  		unsigned int devfn;
>  		int aer_severity;
> @@ -519,7 +517,7 @@ static void ghes_do_proc(struct ghes *ghes,
>  			ghes_handle_memory_failure(gdata, sev);
>  		}
>  		else if (guid_equal(sec_type, &CPER_SEC_PCIE)) {
> -			ghes_handle_aer(gdata, sev, sec_sev);
> +			ghes_handle_aer(gdata);
>  		}
>  		else if (guid_equal(sec_type, &CPER_SEC_PROC_ARM)) {
>  			struct cper_sec_proc_arm *err = acpi_hest_get_payload(gdata);
>

^ permalink raw reply related	[flat|nested] 14+ messages in thread

* Re: [PATCH V3 2/2] acpi: apei: call into AER handling regardless of severity
  2017-11-13 12:36   ` Dongdong Liu
@ 2017-11-13 15:34     ` Tyler Baicar
  0 siblings, 0 replies; 14+ messages in thread
From: Tyler Baicar @ 2017-11-13 15:34 UTC (permalink / raw)
  To: Dongdong Liu, rjw, tony.luck, bp, bp, will.deacon, james.morse,
	linux-acpi, linux-kernel, Bjorn Helgaas

On 11/13/2017 7:36 AM, Dongdong Liu wrote:
>
> 在 2017/11/9 3:13, Tyler Baicar 写道:
>> Currently the GHES code only calls into the AER driver for
>> recoverable type errors. This is incorrect because errors of
>> other severities do not get logged by the AER driver and do not
>> get exposed to user space via the AER trace event. So, call
>> into the AER driver for PCIe errors regardless of the severity
>
> It will also call do_recovery() regardless of the severity for AER correctable 
> errors.
> Correctable errors include those error conditions where hardware can recover 
> without any loss of information.
> Hardware corrects these errors and software intervention is not required.
> So we'd better modify the code as below.
> diff --git a/drivers/pci/pcie/aer/aerdrv_core.c 
> b/drivers/pci/pcie/aer/aerdrv_core.c
> index 7448052..a7f77549 100644
> --- a/drivers/pci/pcie/aer/aerdrv_core.c
> +++ b/drivers/pci/pcie/aer/aerdrv_core.c
> @@ -633,7 +633,8 @@ static void aer_recover_work_func(struct work_struct *work)
>                         continue;
>                 }
>                 cper_print_aer(pdev, entry.severity, entry.regs);
> -           do_recovery(pdev, entry.severity);
> +         if(entry.severity != AER_CORRECTABLE)
> +                 do_recovery(pdev, entry.severity);
>                 pci_dev_put(pdev);
>         }
>  }
Hello Dongdong,

Yes, I have a patch for this that needs to be picked up.

https://lkml.org/lkml/2017/8/28/848

Thanks,
Tyler

-- 
Qualcomm Datacenter Technologies, Inc. as an affiliate of Qualcomm Technologies, Inc.
Qualcomm Technologies, Inc. is a member of the Code Aurora Forum,
a Linux Foundation Collaborative Project.

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH V3 2/2] acpi: apei: call into AER handling regardless of severity
  2017-11-15 15:14 Tyler Baicar
  2017-11-18 14:54 ` Rafael J. Wysocki
@ 2017-11-28 10:57 ` Borislav Petkov
  1 sibling, 0 replies; 14+ messages in thread
From: Borislav Petkov @ 2017-11-28 10:57 UTC (permalink / raw)
  To: Tyler Baicar
  Cc: rjw, tony.luck, will.deacon, james.morse, linux-acpi, linux-kernel

On Wed, Nov 15, 2017 at 08:14:27AM -0700, Tyler Baicar wrote:
> Currently the GHES code only calls into the AER driver for
> recoverable type errors. This is incorrect because errors of
> other severities do not get logged by the AER driver and do not
> get exposed to user space via the AER trace event. So, call
> into the AER driver for PCIe errors regardless of the severity
> 
> Signed-off-by: Tyler Baicar <tbaicar@codeaurora.org>
> ---
>  drivers/acpi/apei/ghes.c | 22 +++++++++++++++++-----
>  1 file changed, 17 insertions(+), 5 deletions(-)

Reviewed-by: Borislav Petkov <bp@suse.de>

-- 
Regards/Gruss,
    Boris.

SUSE Linux GmbH, GF: Felix Imendörffer, Jane Smithard, Graham Norton, HRB 21284 (AG Nürnberg)
-- 

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH V3 2/2] acpi: apei: call into AER handling regardless of severity
  2017-11-15 15:14 Tyler Baicar
@ 2017-11-18 14:54 ` Rafael J. Wysocki
  2017-11-28 10:57 ` Borislav Petkov
  1 sibling, 0 replies; 14+ messages in thread
From: Rafael J. Wysocki @ 2017-11-18 14:54 UTC (permalink / raw)
  To: Tyler Baicar
  Cc: bp, tony.luck, will.deacon, james.morse, linux-acpi, linux-kernel

On Wednesday, November 15, 2017 4:14:27 PM CET Tyler Baicar wrote:
> Currently the GHES code only calls into the AER driver for
> recoverable type errors. This is incorrect because errors of
> other severities do not get logged by the AER driver and do not
> get exposed to user space via the AER trace event. So, call
> into the AER driver for PCIe errors regardless of the severity

Can you please resend the [1/2] too for completeness?

Thanks,
Rafael

^ permalink raw reply	[flat|nested] 14+ messages in thread

* [PATCH V3 2/2] acpi: apei: call into AER handling regardless of severity
@ 2017-11-15 15:14 Tyler Baicar
  2017-11-18 14:54 ` Rafael J. Wysocki
  2017-11-28 10:57 ` Borislav Petkov
  0 siblings, 2 replies; 14+ messages in thread
From: Tyler Baicar @ 2017-11-15 15:14 UTC (permalink / raw)
  To: bp, rjw, tony.luck, will.deacon, james.morse, linux-acpi, linux-kernel
  Cc: Tyler Baicar

Currently the GHES code only calls into the AER driver for
recoverable type errors. This is incorrect because errors of
other severities do not get logged by the AER driver and do not
get exposed to user space via the AER trace event. So, call
into the AER driver for PCIe errors regardless of the severity

Signed-off-by: Tyler Baicar <tbaicar@codeaurora.org>
---
 drivers/acpi/apei/ghes.c | 22 +++++++++++++++++-----
 1 file changed, 17 insertions(+), 5 deletions(-)

diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c
index 839c3d5..15dbf65 100644
--- a/drivers/acpi/apei/ghes.c
+++ b/drivers/acpi/apei/ghes.c
@@ -458,14 +458,26 @@ static void ghes_handle_memory_failure(struct acpi_hest_generic_data *gdata, int
 #endif
 }
 
-static void ghes_handle_aer(struct acpi_hest_generic_data *gdata, int sev, int sec_sev)
+/*
+ * PCIe AER errors need to be sent to the AER driver for reporting and
+ * recovery. The GHES severities map to the following AER severities and
+ * require the following handling:
+ *
+ * GHES_SEV_CORRECTABLE -> AER_CORRECTABLE
+ *     These need to be reported by the AER driver but no recovery is
+ *     necessary.
+ * GHES_SEV_RECOVERABLE -> AER_NONFATAL
+ * GHES_SEV_RECOVERABLE && CPER_SEC_RESET -> AER_FATAL
+ *     These both need to be reported and recovered from by the AER driver.
+ * GHES_SEV_PANIC does not make it to this handling since the kernel must
+ *     panic.
+ */
+static void ghes_handle_aer(struct acpi_hest_generic_data *gdata)
 {
 #ifdef CONFIG_ACPI_APEI_PCIEAER
 	struct cper_sec_pcie *pcie_err = acpi_hest_get_payload(gdata);
 
-	if (sev == GHES_SEV_RECOVERABLE &&
-	    sec_sev == GHES_SEV_RECOVERABLE &&
-	    pcie_err->validation_bits & CPER_PCIE_VALID_DEVICE_ID &&
+	if (pcie_err->validation_bits & CPER_PCIE_VALID_DEVICE_ID &&
 	    pcie_err->validation_bits & CPER_PCIE_VALID_AER_INFO) {
 		unsigned int devfn;
 		int aer_severity;
@@ -519,7 +531,7 @@ static void ghes_do_proc(struct ghes *ghes,
 			ghes_handle_memory_failure(gdata, sev);
 		}
 		else if (guid_equal(sec_type, &CPER_SEC_PCIE)) {
-			ghes_handle_aer(gdata, sev, sec_sev);
+			ghes_handle_aer(gdata);
 		}
 		else if (guid_equal(sec_type, &CPER_SEC_PROC_ARM)) {
 			struct cper_sec_proc_arm *err = acpi_hest_get_payload(gdata);
-- 
Qualcomm Datacenter Technologies, Inc. as an affiliate of Qualcomm Technologies, Inc.
Qualcomm Technologies, Inc. is a member of the Code Aurora Forum,
a Linux Foundation Collaborative Project.

^ permalink raw reply related	[flat|nested] 14+ messages in thread

end of thread, other threads:[~2017-11-28 10:57 UTC | newest]

Thread overview: 14+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2017-11-08 19:13 [PATCH 0/2] Restructure and fix GHES PCIe AER handling Tyler Baicar
2017-11-08 19:13 ` [PATCH 1/2] acpi: apei: handle PCIe AER errors in separate function Tyler Baicar
2017-11-09 17:21   ` Borislav Petkov
2017-11-08 19:13 ` [PATCH V3 2/2] acpi: apei: call into AER handling regardless of severity Tyler Baicar
2017-11-09  9:46   ` Borislav Petkov
2017-11-09 14:37     ` Tyler Baicar
2017-11-09 15:01       ` Borislav Petkov
2017-11-09 15:14     ` Tyler Baicar
2017-11-09 17:29       ` Borislav Petkov
2017-11-13 12:36   ` Dongdong Liu
2017-11-13 15:34     ` Tyler Baicar
2017-11-15 15:14 Tyler Baicar
2017-11-18 14:54 ` Rafael J. Wysocki
2017-11-28 10:57 ` Borislav Petkov

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).