Linux-ACPI Archive on lore.kernel.org
 help / color / Atom feed
* [PATCH v5] ACPI / APEI: fix the regression of synchronous external aborts occur in user-mode
@ 2020-12-10 12:09 Xiaofei Tan
  2021-01-26 13:32 ` tanxiaofei
  2021-05-11  1:31 ` Xiaofei Tan
  0 siblings, 2 replies; 5+ messages in thread
From: Xiaofei Tan @ 2020-12-10 12:09 UTC (permalink / raw)
  To: james.morse, rafael, rjw, lenb, tony.luck, bp, akpm, jroedel, peterz
  Cc: linux-acpi, linux-kernel, linuxarm, Xiaofei Tan

After commit 8fcc4ae6faf8 ("arm64: acpi: Make apei_claim_sea()
synchronise with APEI's irq work") was applied, do_sea() returns directly
for user-mode if apei_claim_sea() handled any error record. Therefore,
each error record reported by a user-mode SEA must be effectively
processed in the APEI GHES driver.

Currently, the GHES driver only processes the Memory Error Section
(ignoring the PCIe Error Section, as it has nothing to do with SEA).
This is not enough, because the ARM Processor Error Section could also
be used for SEA on some hardware platforms, such as the Kunpeng9xx
series. We can't ask them to switch to the Memory Error Section for
two reasons:
1) The servers have already been delivered to customers, so switching
would introduce compatibility issues.
2) It makes sense to use the ARM Processor Error Section, because either
cache or memory errors could generate an SEA when consumed by a processor.

Do memory failure handling for the ARM Processor Error Section, just as
is already done for the Memory Error Section.

Signed-off-by: Xiaofei Tan <tanxiaofei@huawei.com>
---
Changes since v4:
- 1. Change the patch title from "ACPI / APEI: do memory failure on the
physical address reported by ARM processor error section" to the
current, more appropriate one.
- 2. Add a comment in the code explaining why corrected errors are not
filtered out of an uncorrected section.

Changes since v3:
- Print unhandled error following James Morse's advice.

Changes since v2:
- Updated commit log
---
 drivers/acpi/apei/ghes.c | 76 +++++++++++++++++++++++++++++++++++++-----------
 1 file changed, 59 insertions(+), 17 deletions(-)

diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c
index fce7ade..0893968 100644
--- a/drivers/acpi/apei/ghes.c
+++ b/drivers/acpi/apei/ghes.c
@@ -441,28 +441,35 @@ static void ghes_kick_task_work(struct callback_head *head)
 	gen_pool_free(ghes_estatus_pool, (unsigned long)estatus_node, node_len);
 }
 
-static bool ghes_handle_memory_failure(struct acpi_hest_generic_data *gdata,
-				       int sev)
+static bool ghes_do_memory_failure(u64 physical_addr, int flags)
 {
 	unsigned long pfn;
-	int flags = -1;
-	int sec_sev = ghes_severity(gdata->error_severity);
-	struct cper_sec_mem_err *mem_err = acpi_hest_get_payload(gdata);
 
 	if (!IS_ENABLED(CONFIG_ACPI_APEI_MEMORY_FAILURE))
 		return false;
 
-	if (!(mem_err->validation_bits & CPER_MEM_VALID_PA))
-		return false;
-
-	pfn = mem_err->physical_addr >> PAGE_SHIFT;
+	pfn = PHYS_PFN(physical_addr);
 	if (!pfn_valid(pfn)) {
 		pr_warn_ratelimited(FW_WARN GHES_PFX
 		"Invalid address in generic error data: %#llx\n",
-		mem_err->physical_addr);
+		physical_addr);
 		return false;
 	}
 
+	memory_failure_queue(pfn, flags);
+	return true;
+}
+
+static bool ghes_handle_memory_failure(struct acpi_hest_generic_data *gdata,
+				       int sev)
+{
+	int flags = -1;
+	int sec_sev = ghes_severity(gdata->error_severity);
+	struct cper_sec_mem_err *mem_err = acpi_hest_get_payload(gdata);
+
+	if (!(mem_err->validation_bits & CPER_MEM_VALID_PA))
+		return false;
+
 	/* iff following two events can be handled properly by now */
 	if (sec_sev == GHES_SEV_CORRECTED &&
 	    (gdata->flags & CPER_SEC_ERROR_THRESHOLD_EXCEEDED))
@@ -470,14 +477,51 @@ static bool ghes_handle_memory_failure(struct acpi_hest_generic_data *gdata,
 	if (sev == GHES_SEV_RECOVERABLE && sec_sev == GHES_SEV_RECOVERABLE)
 		flags = 0;
 
-	if (flags != -1) {
-		memory_failure_queue(pfn, flags);
-		return true;
-	}
+	if (flags != -1)
+		return ghes_do_memory_failure(mem_err->physical_addr, flags);
 
 	return false;
 }
 
+static bool ghes_handle_arm_hw_error(struct acpi_hest_generic_data *gdata, int sev)
+{
+	struct cper_sec_proc_arm *err = acpi_hest_get_payload(gdata);
+	struct cper_arm_err_info *err_info;
+	bool queued = false;
+	int sec_sev, i;
+
+	log_arm_hw_error(err);
+
+	sec_sev = ghes_severity(gdata->error_severity);
+	if (sev != GHES_SEV_RECOVERABLE || sec_sev != GHES_SEV_RECOVERABLE)
+		return false;
+
+	err_info = (struct cper_arm_err_info *) (err + 1);
+	for (i = 0; i < err->err_info_num; i++, err_info++) {
+		bool is_cache = (err_info->type == CPER_ARM_CACHE_ERROR);
+		bool has_pa = (err_info->validation_bits & CPER_ARM_INFO_VALID_PHYSICAL_ADDR);
+
+		/*
+		 * The field (err_info->error_info & BIT(26)) is always set to 1
+		 * by some old firmware of HiSilicon Kunpeng920. We assume that
+		 * firmware won't mix corrected errors in an uncorrected section,
+		 * and so don't filter out 'corrected' errors here.
+		 */
+		if (!is_cache || !has_pa) {
+			pr_warn_ratelimited(FW_WARN GHES_PFX
+			"Unhandled processor error type %s\n",
+			err_info->type < ARRAY_SIZE(cper_proc_error_type_strs) ?
+			cper_proc_error_type_strs[err_info->type] : "unknown error");
+			continue;
+		}
+
+		if (ghes_do_memory_failure(err_info->physical_fault_addr, 0))
+			queued = true;
+	}
+
+	return queued;
+}
+
 /*
  * PCIe AER errors need to be sent to the AER driver for reporting and
  * recovery. The GHES severities map to the following AER severities and
@@ -605,9 +649,7 @@ static bool ghes_do_proc(struct ghes *ghes,
 			ghes_handle_aer(gdata);
 		}
 		else if (guid_equal(sec_type, &CPER_SEC_PROC_ARM)) {
-			struct cper_sec_proc_arm *err = acpi_hest_get_payload(gdata);
-
-			log_arm_hw_error(err);
+			queued = ghes_handle_arm_hw_error(gdata, sev);
 		} else {
 			void *err = acpi_hest_get_payload(gdata);
 
-- 
2.8.1


^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH v5] ACPI / APEI: fix the regression of synchronous external aborts occur in user-mode
  2020-12-10 12:09 [PATCH v5] ACPI / APEI: fix the regression of synchronous external aborts occur in user-mode Xiaofei Tan
@ 2021-01-26 13:32 ` tanxiaofei
  2021-02-05 12:55   ` Rafael J. Wysocki
  2021-05-11  1:31 ` Xiaofei Tan
  1 sibling, 1 reply; 5+ messages in thread
From: tanxiaofei @ 2021-01-26 13:32 UTC (permalink / raw)
  To: james.morse, lenb
  Cc: rafael, rjw, tony.luck, bp, akpm, jroedel, peterz, linux-acpi,
	linux-kernel, linuxarm

@James
Hi James, please help to review this patch. Thank you very much. :)

On 2020/12/10 20:09, Xiaofei Tan wrote:
> After the commit 8fcc4ae6faf8 ("arm64: acpi: Make apei_claim_sea()
> synchronise with APEI's irq work") applied, do_sea() return directly
> for user-mode if apei_claim_sea() handled any error record. Therefore,
> each error record reported by the user-mode SEA must be effectively
> processed in APEI GHES driver.
>
> Currently, GHES driver only processes Memory Error Section.(Ignore PCIe
> Error Section, as it has nothing to do with SEA). It is not enough.
> Because ARM Processor Error could also be used for SEA in some hardware
> platforms, such as Kunpeng9xx series. We can't ask them to switch to
> use Memory Error Section for two reasons:
> 1)The server was delivered to customers, and it will introduce
> compatibility issue.
> 2)It make sense to use ARM Processor Error Section. Because either
> cache or memory errors could generate SEA when consumed by a processor.
>
> Do memory failure handling for ARM Processor Error Section just like
> for Memory Error Section.
>
> Signed-off-by: Xiaofei Tan <tanxiaofei@huawei.com>
> ---
> Changes since v4:
> - 1. Change the patch name from " ACPI / APEI: do memory failure on the
> physical address reported by ARM processor error section" to this
> more proper one.
> - 2. Add a comment in the code to tell why not filter out corrected
> error in an uncorrected section.
>
> Changes since v3:
> - Print unhandled error following James Morse's advice.
>
> Changes since v2:
> - Updated commit log
> ---
>  drivers/acpi/apei/ghes.c | 76 +++++++++++++++++++++++++++++++++++++-----------
>  1 file changed, 59 insertions(+), 17 deletions(-)
>
> diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c
> index fce7ade..0893968 100644
> --- a/drivers/acpi/apei/ghes.c
> +++ b/drivers/acpi/apei/ghes.c
> @@ -441,28 +441,35 @@ static void ghes_kick_task_work(struct callback_head *head)
>  	gen_pool_free(ghes_estatus_pool, (unsigned long)estatus_node, node_len);
>  }
>
> -static bool ghes_handle_memory_failure(struct acpi_hest_generic_data *gdata,
> -				       int sev)
> +static bool ghes_do_memory_failure(u64 physical_addr, int flags)
>  {
>  	unsigned long pfn;
> -	int flags = -1;
> -	int sec_sev = ghes_severity(gdata->error_severity);
> -	struct cper_sec_mem_err *mem_err = acpi_hest_get_payload(gdata);
>
>  	if (!IS_ENABLED(CONFIG_ACPI_APEI_MEMORY_FAILURE))
>  		return false;
>
> -	if (!(mem_err->validation_bits & CPER_MEM_VALID_PA))
> -		return false;
> -
> -	pfn = mem_err->physical_addr >> PAGE_SHIFT;
> +	pfn = PHYS_PFN(physical_addr);
>  	if (!pfn_valid(pfn)) {
>  		pr_warn_ratelimited(FW_WARN GHES_PFX
>  		"Invalid address in generic error data: %#llx\n",
> -		mem_err->physical_addr);
> +		physical_addr);
>  		return false;
>  	}
>
> +	memory_failure_queue(pfn, flags);
> +	return true;
> +}
> +
> +static bool ghes_handle_memory_failure(struct acpi_hest_generic_data *gdata,
> +				       int sev)
> +{
> +	int flags = -1;
> +	int sec_sev = ghes_severity(gdata->error_severity);
> +	struct cper_sec_mem_err *mem_err = acpi_hest_get_payload(gdata);
> +
> +	if (!(mem_err->validation_bits & CPER_MEM_VALID_PA))
> +		return false;
> +
>  	/* iff following two events can be handled properly by now */
>  	if (sec_sev == GHES_SEV_CORRECTED &&
>  	    (gdata->flags & CPER_SEC_ERROR_THRESHOLD_EXCEEDED))
> @@ -470,14 +477,51 @@ static bool ghes_handle_memory_failure(struct acpi_hest_generic_data *gdata,
>  	if (sev == GHES_SEV_RECOVERABLE && sec_sev == GHES_SEV_RECOVERABLE)
>  		flags = 0;
>
> -	if (flags != -1) {
> -		memory_failure_queue(pfn, flags);
> -		return true;
> -	}
> +	if (flags != -1)
> +		return ghes_do_memory_failure(mem_err->physical_addr, flags);
>
>  	return false;
>  }
>
> +static bool ghes_handle_arm_hw_error(struct acpi_hest_generic_data *gdata, int sev)
> +{
> +	struct cper_sec_proc_arm *err = acpi_hest_get_payload(gdata);
> +	struct cper_arm_err_info *err_info;
> +	bool queued = false;
> +	int sec_sev, i;
> +
> +	log_arm_hw_error(err);
> +
> +	sec_sev = ghes_severity(gdata->error_severity);
> +	if (sev != GHES_SEV_RECOVERABLE || sec_sev != GHES_SEV_RECOVERABLE)
> +		return false;
> +
> +	err_info = (struct cper_arm_err_info *) (err + 1);
> +	for (i = 0; i < err->err_info_num; i++, err_info++) {
> +		bool is_cache = (err_info->type == CPER_ARM_CACHE_ERROR);
> +		bool has_pa = (err_info->validation_bits & CPER_ARM_INFO_VALID_PHYSICAL_ADDR);
> +
> +		/*
> +		 * The field (err_info->error_info & BIT(26)) is fixed to set to
> +		 * 1 in some old firmware of HiSilicon Kunpeng920. We assume that
> +		 * firmware won't mix corrected errors in an uncorrected section,
> +		 * and don't filter out 'corrected' error here.
> +		 */
> +		if (!is_cache || !has_pa) {
> +			pr_warn_ratelimited(FW_WARN GHES_PFX
> +			"Unhandled processor error type %s\n",
> +			err_info->type < ARRAY_SIZE(cper_proc_error_type_strs) ?
> +			cper_proc_error_type_strs[err_info->type] : "unknown error");
> +			continue;
> +		}
> +
> +		if (ghes_do_memory_failure(err_info->physical_fault_addr, 0))
> +			queued = true;
> +	}
> +
> +	return queued;
> +}
> +
>  /*
>   * PCIe AER errors need to be sent to the AER driver for reporting and
>   * recovery. The GHES severities map to the following AER severities and
> @@ -605,9 +649,7 @@ static bool ghes_do_proc(struct ghes *ghes,
>  			ghes_handle_aer(gdata);
>  		}
>  		else if (guid_equal(sec_type, &CPER_SEC_PROC_ARM)) {
> -			struct cper_sec_proc_arm *err = acpi_hest_get_payload(gdata);
> -
> -			log_arm_hw_error(err);
> +			queued = ghes_handle_arm_hw_error(gdata, sev);
>  		} else {
>  			void *err = acpi_hest_get_payload(gdata);
>
>


^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH v5] ACPI / APEI: fix the regression of synchronous external aborts occur in user-mode
  2021-01-26 13:32 ` tanxiaofei
@ 2021-02-05 12:55   ` Rafael J. Wysocki
  2021-04-10  7:09     ` Xiaofei Tan
  0 siblings, 1 reply; 5+ messages in thread
From: Rafael J. Wysocki @ 2021-02-05 12:55 UTC (permalink / raw)
  To: tanxiaofei, James Morse, Borislav Petkov
  Cc: Len Brown, Rafael J. Wysocki, Rafael J. Wysocki, Tony Luck,
	Andrew Morton, Joerg Roedel, Peter Zijlstra,
	ACPI Devel Maling List, Linux Kernel Mailing List, Linuxarm

On Tue, Jan 26, 2021 at 2:32 PM tanxiaofei <tanxiaofei@huawei.com> wrote:
>
> @James
> Hi James, please help to review this patch. Thank you very much. :)

James, Boris, any comments?

> On 2020/12/10 20:09, Xiaofei Tan wrote:
> > After the commit 8fcc4ae6faf8 ("arm64: acpi: Make apei_claim_sea()
> > synchronise with APEI's irq work") applied, do_sea() return directly
> > for user-mode if apei_claim_sea() handled any error record. Therefore,
> > each error record reported by the user-mode SEA must be effectively
> > processed in APEI GHES driver.
> >
> > Currently, GHES driver only processes Memory Error Section.(Ignore PCIe
> > Error Section, as it has nothing to do with SEA). It is not enough.
> > Because ARM Processor Error could also be used for SEA in some hardware
> > platforms, such as Kunpeng9xx series. We can't ask them to switch to
> > use Memory Error Section for two reasons:
> > 1)The server was delivered to customers, and it will introduce
> > compatibility issue.
> > 2)It make sense to use ARM Processor Error Section. Because either
> > cache or memory errors could generate SEA when consumed by a processor.
> >
> > Do memory failure handling for ARM Processor Error Section just like
> > for Memory Error Section.
> >
> > Signed-off-by: Xiaofei Tan <tanxiaofei@huawei.com>
> > ---
> > Changes since v4:
> > - 1. Change the patch name from " ACPI / APEI: do memory failure on the
> > physical address reported by ARM processor error section" to this
> > more proper one.
> > - 2. Add a comment in the code to tell why not filter out corrected
> > error in an uncorrected section.
> >
> > Changes since v3:
> > - Print unhandled error following James Morse's advice.
> >
> > Changes since v2:
> > - Updated commit log
> > ---
> >  drivers/acpi/apei/ghes.c | 76 +++++++++++++++++++++++++++++++++++++-----------
> >  1 file changed, 59 insertions(+), 17 deletions(-)
> >
> > diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c
> > index fce7ade..0893968 100644
> > --- a/drivers/acpi/apei/ghes.c
> > +++ b/drivers/acpi/apei/ghes.c
> > @@ -441,28 +441,35 @@ static void ghes_kick_task_work(struct callback_head *head)
> >       gen_pool_free(ghes_estatus_pool, (unsigned long)estatus_node, node_len);
> >  }
> >
> > -static bool ghes_handle_memory_failure(struct acpi_hest_generic_data *gdata,
> > -                                    int sev)
> > +static bool ghes_do_memory_failure(u64 physical_addr, int flags)
> >  {
> >       unsigned long pfn;
> > -     int flags = -1;
> > -     int sec_sev = ghes_severity(gdata->error_severity);
> > -     struct cper_sec_mem_err *mem_err = acpi_hest_get_payload(gdata);
> >
> >       if (!IS_ENABLED(CONFIG_ACPI_APEI_MEMORY_FAILURE))
> >               return false;
> >
> > -     if (!(mem_err->validation_bits & CPER_MEM_VALID_PA))
> > -             return false;
> > -
> > -     pfn = mem_err->physical_addr >> PAGE_SHIFT;
> > +     pfn = PHYS_PFN(physical_addr);
> >       if (!pfn_valid(pfn)) {
> >               pr_warn_ratelimited(FW_WARN GHES_PFX
> >               "Invalid address in generic error data: %#llx\n",
> > -             mem_err->physical_addr);
> > +             physical_addr);
> >               return false;
> >       }
> >
> > +     memory_failure_queue(pfn, flags);
> > +     return true;
> > +}
> > +
> > +static bool ghes_handle_memory_failure(struct acpi_hest_generic_data *gdata,
> > +                                    int sev)
> > +{
> > +     int flags = -1;
> > +     int sec_sev = ghes_severity(gdata->error_severity);
> > +     struct cper_sec_mem_err *mem_err = acpi_hest_get_payload(gdata);
> > +
> > +     if (!(mem_err->validation_bits & CPER_MEM_VALID_PA))
> > +             return false;
> > +
> >       /* iff following two events can be handled properly by now */
> >       if (sec_sev == GHES_SEV_CORRECTED &&
> >           (gdata->flags & CPER_SEC_ERROR_THRESHOLD_EXCEEDED))
> > @@ -470,14 +477,51 @@ static bool ghes_handle_memory_failure(struct acpi_hest_generic_data *gdata,
> >       if (sev == GHES_SEV_RECOVERABLE && sec_sev == GHES_SEV_RECOVERABLE)
> >               flags = 0;
> >
> > -     if (flags != -1) {
> > -             memory_failure_queue(pfn, flags);
> > -             return true;
> > -     }
> > +     if (flags != -1)
> > +             return ghes_do_memory_failure(mem_err->physical_addr, flags);
> >
> >       return false;
> >  }
> >
> > +static bool ghes_handle_arm_hw_error(struct acpi_hest_generic_data *gdata, int sev)
> > +{
> > +     struct cper_sec_proc_arm *err = acpi_hest_get_payload(gdata);
> > +     struct cper_arm_err_info *err_info;
> > +     bool queued = false;
> > +     int sec_sev, i;
> > +
> > +     log_arm_hw_error(err);
> > +
> > +     sec_sev = ghes_severity(gdata->error_severity);
> > +     if (sev != GHES_SEV_RECOVERABLE || sec_sev != GHES_SEV_RECOVERABLE)
> > +             return false;
> > +
> > +     err_info = (struct cper_arm_err_info *) (err + 1);
> > +     for (i = 0; i < err->err_info_num; i++, err_info++) {
> > +             bool is_cache = (err_info->type == CPER_ARM_CACHE_ERROR);
> > +             bool has_pa = (err_info->validation_bits & CPER_ARM_INFO_VALID_PHYSICAL_ADDR);
> > +
> > +             /*
> > +              * The field (err_info->error_info & BIT(26)) is fixed to set to
> > +              * 1 in some old firmware of HiSilicon Kunpeng920. We assume that
> > +              * firmware won't mix corrected errors in an uncorrected section,
> > +              * and don't filter out 'corrected' error here.
> > +              */
> > +             if (!is_cache || !has_pa) {
> > +                     pr_warn_ratelimited(FW_WARN GHES_PFX
> > +                     "Unhandled processor error type %s\n",
> > +                     err_info->type < ARRAY_SIZE(cper_proc_error_type_strs) ?
> > +                     cper_proc_error_type_strs[err_info->type] : "unknown error");
> > +                     continue;
> > +             }
> > +
> > +             if (ghes_do_memory_failure(err_info->physical_fault_addr, 0))
> > +                     queued = true;
> > +     }
> > +
> > +     return queued;
> > +}
> > +
> >  /*
> >   * PCIe AER errors need to be sent to the AER driver for reporting and
> >   * recovery. The GHES severities map to the following AER severities and
> > @@ -605,9 +649,7 @@ static bool ghes_do_proc(struct ghes *ghes,
> >                       ghes_handle_aer(gdata);
> >               }
> >               else if (guid_equal(sec_type, &CPER_SEC_PROC_ARM)) {
> > -                     struct cper_sec_proc_arm *err = acpi_hest_get_payload(gdata);
> > -
> > -                     log_arm_hw_error(err);
> > +                     queued = ghes_handle_arm_hw_error(gdata, sev);
> >               } else {
> >                       void *err = acpi_hest_get_payload(gdata);
> >
> >
>

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH v5] ACPI / APEI: fix the regression of synchronous external aborts occur in user-mode
  2021-02-05 12:55   ` Rafael J. Wysocki
@ 2021-04-10  7:09     ` Xiaofei Tan
  0 siblings, 0 replies; 5+ messages in thread
From: Xiaofei Tan @ 2021-04-10  7:09 UTC (permalink / raw)
  To: James Morse, Borislav Petkov
  Cc: Rafael J. Wysocki, Len Brown, Rafael J. Wysocki, Tony Luck,
	Andrew Morton, Joerg Roedel, Peter Zijlstra,
	ACPI Devel Maling List, Linux Kernel Mailing List, Linuxarm

Hi James, Boris

A friendly ping.
Any comments on this?

On 2021/2/5 20:55, Rafael J. Wysocki wrote:
> On Tue, Jan 26, 2021 at 2:32 PM tanxiaofei <tanxiaofei@huawei.com> wrote:
>>
>> @James
>> Hi James, please help to review this patch. Thank you very much. :)
>
> James, Boris, any comments?
>
>> On 2020/12/10 20:09, Xiaofei Tan wrote:
>>> After the commit 8fcc4ae6faf8 ("arm64: acpi: Make apei_claim_sea()
>>> synchronise with APEI's irq work") applied, do_sea() return directly
>>> for user-mode if apei_claim_sea() handled any error record. Therefore,
>>> each error record reported by the user-mode SEA must be effectively
>>> processed in APEI GHES driver.
>>>
>>> Currently, GHES driver only processes Memory Error Section.(Ignore PCIe
>>> Error Section, as it has nothing to do with SEA). It is not enough.
>>> Because ARM Processor Error could also be used for SEA in some hardware
>>> platforms, such as Kunpeng9xx series. We can't ask them to switch to
>>> use Memory Error Section for two reasons:
>>> 1)The server was delivered to customers, and it will introduce
>>> compatibility issue.
>>> 2)It make sense to use ARM Processor Error Section. Because either
>>> cache or memory errors could generate SEA when consumed by a processor.
>>>
>>> Do memory failure handling for ARM Processor Error Section just like
>>> for Memory Error Section.
>>>
>>> Signed-off-by: Xiaofei Tan <tanxiaofei@huawei.com>
>>> ---
>>> Changes since v4:
>>> - 1. Change the patch name from " ACPI / APEI: do memory failure on the
>>> physical address reported by ARM processor error section" to this
>>> more proper one.
>>> - 2. Add a comment in the code to tell why not filter out corrected
>>> error in an uncorrected section.
>>>
>>> Changes since v3:
>>> - Print unhandled error following James Morse's advice.
>>>
>>> Changes since v2:
>>> - Updated commit log
>>> ---
>>>  drivers/acpi/apei/ghes.c | 76 +++++++++++++++++++++++++++++++++++++-----------
>>>  1 file changed, 59 insertions(+), 17 deletions(-)
>>>
>>> diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c
>>> index fce7ade..0893968 100644
>>> --- a/drivers/acpi/apei/ghes.c
>>> +++ b/drivers/acpi/apei/ghes.c
>>> @@ -441,28 +441,35 @@ static void ghes_kick_task_work(struct callback_head *head)
>>>       gen_pool_free(ghes_estatus_pool, (unsigned long)estatus_node, node_len);
>>>  }
>>>
>>> -static bool ghes_handle_memory_failure(struct acpi_hest_generic_data *gdata,
>>> -                                    int sev)
>>> +static bool ghes_do_memory_failure(u64 physical_addr, int flags)
>>>  {
>>>       unsigned long pfn;
>>> -     int flags = -1;
>>> -     int sec_sev = ghes_severity(gdata->error_severity);
>>> -     struct cper_sec_mem_err *mem_err = acpi_hest_get_payload(gdata);
>>>
>>>       if (!IS_ENABLED(CONFIG_ACPI_APEI_MEMORY_FAILURE))
>>>               return false;
>>>
>>> -     if (!(mem_err->validation_bits & CPER_MEM_VALID_PA))
>>> -             return false;
>>> -
>>> -     pfn = mem_err->physical_addr >> PAGE_SHIFT;
>>> +     pfn = PHYS_PFN(physical_addr);
>>>       if (!pfn_valid(pfn)) {
>>>               pr_warn_ratelimited(FW_WARN GHES_PFX
>>>               "Invalid address in generic error data: %#llx\n",
>>> -             mem_err->physical_addr);
>>> +             physical_addr);
>>>               return false;
>>>       }
>>>
>>> +     memory_failure_queue(pfn, flags);
>>> +     return true;
>>> +}
>>> +
>>> +static bool ghes_handle_memory_failure(struct acpi_hest_generic_data *gdata,
>>> +                                    int sev)
>>> +{
>>> +     int flags = -1;
>>> +     int sec_sev = ghes_severity(gdata->error_severity);
>>> +     struct cper_sec_mem_err *mem_err = acpi_hest_get_payload(gdata);
>>> +
>>> +     if (!(mem_err->validation_bits & CPER_MEM_VALID_PA))
>>> +             return false;
>>> +
>>>       /* iff following two events can be handled properly by now */
>>>       if (sec_sev == GHES_SEV_CORRECTED &&
>>>           (gdata->flags & CPER_SEC_ERROR_THRESHOLD_EXCEEDED))
>>> @@ -470,14 +477,51 @@ static bool ghes_handle_memory_failure(struct acpi_hest_generic_data *gdata,
>>>       if (sev == GHES_SEV_RECOVERABLE && sec_sev == GHES_SEV_RECOVERABLE)
>>>               flags = 0;
>>>
>>> -     if (flags != -1) {
>>> -             memory_failure_queue(pfn, flags);
>>> -             return true;
>>> -     }
>>> +     if (flags != -1)
>>> +             return ghes_do_memory_failure(mem_err->physical_addr, flags);
>>>
>>>       return false;
>>>  }
>>>
>>> +static bool ghes_handle_arm_hw_error(struct acpi_hest_generic_data *gdata, int sev)
>>> +{
>>> +     struct cper_sec_proc_arm *err = acpi_hest_get_payload(gdata);
>>> +     struct cper_arm_err_info *err_info;
>>> +     bool queued = false;
>>> +     int sec_sev, i;
>>> +
>>> +     log_arm_hw_error(err);
>>> +
>>> +     sec_sev = ghes_severity(gdata->error_severity);
>>> +     if (sev != GHES_SEV_RECOVERABLE || sec_sev != GHES_SEV_RECOVERABLE)
>>> +             return false;
>>> +
>>> +     err_info = (struct cper_arm_err_info *) (err + 1);
>>> +     for (i = 0; i < err->err_info_num; i++, err_info++) {
>>> +             bool is_cache = (err_info->type == CPER_ARM_CACHE_ERROR);
>>> +             bool has_pa = (err_info->validation_bits & CPER_ARM_INFO_VALID_PHYSICAL_ADDR);
>>> +
>>> +             /*
>>> +              * The field (err_info->error_info & BIT(26)) is fixed to set to
>>> +              * 1 in some old firmware of HiSilicon Kunpeng920. We assume that
>>> +              * firmware won't mix corrected errors in an uncorrected section,
>>> +              * and don't filter out 'corrected' error here.
>>> +              */
>>> +             if (!is_cache || !has_pa) {
>>> +                     pr_warn_ratelimited(FW_WARN GHES_PFX
>>> +                     "Unhandled processor error type %s\n",
>>> +                     err_info->type < ARRAY_SIZE(cper_proc_error_type_strs) ?
>>> +                     cper_proc_error_type_strs[err_info->type] : "unknown error");
>>> +                     continue;
>>> +             }
>>> +
>>> +             if (ghes_do_memory_failure(err_info->physical_fault_addr, 0))
>>> +                     queued = true;
>>> +     }
>>> +
>>> +     return queued;
>>> +}
>>> +
>>>  /*
>>>   * PCIe AER errors need to be sent to the AER driver for reporting and
>>>   * recovery. The GHES severities map to the following AER severities and
>>> @@ -605,9 +649,7 @@ static bool ghes_do_proc(struct ghes *ghes,
>>>                       ghes_handle_aer(gdata);
>>>               }
>>>               else if (guid_equal(sec_type, &CPER_SEC_PROC_ARM)) {
>>> -                     struct cper_sec_proc_arm *err = acpi_hest_get_payload(gdata);
>>> -
>>> -                     log_arm_hw_error(err);
>>> +                     queued = ghes_handle_arm_hw_error(gdata, sev);
>>>               } else {
>>>                       void *err = acpi_hest_get_payload(gdata);
>>>
>>>
>>
>
> .
>


^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH v5] ACPI / APEI: fix the regression of synchronous external aborts occur in user-mode
  2020-12-10 12:09 [PATCH v5] ACPI / APEI: fix the regression of synchronous external aborts occur in user-mode Xiaofei Tan
  2021-01-26 13:32 ` tanxiaofei
@ 2021-05-11  1:31 ` Xiaofei Tan
  1 sibling, 0 replies; 5+ messages in thread
From: Xiaofei Tan @ 2021-05-11  1:31 UTC (permalink / raw)
  To: james.morse
  Cc: rafael, rjw, lenb, tony.luck, bp, akpm, jroedel, peterz,
	linux-acpi, linux-kernel, linuxarm


ping..

On 2020/12/10 20:09, Xiaofei Tan wrote:
> After the commit 8fcc4ae6faf8 ("arm64: acpi: Make apei_claim_sea()
> synchronise with APEI's irq work") applied, do_sea() return directly
> for user-mode if apei_claim_sea() handled any error record. Therefore,
> each error record reported by the user-mode SEA must be effectively
> processed in APEI GHES driver.
>
> Currently, GHES driver only processes Memory Error Section.(Ignore PCIe
> Error Section, as it has nothing to do with SEA). It is not enough.
> Because ARM Processor Error could also be used for SEA in some hardware
> platforms, such as Kunpeng9xx series. We can't ask them to switch to
> use Memory Error Section for two reasons:
> 1)The server was delivered to customers, and it will introduce
> compatibility issue.
> 2)It make sense to use ARM Processor Error Section. Because either
> cache or memory errors could generate SEA when consumed by a processor.
>
> Do memory failure handling for ARM Processor Error Section just like
> for Memory Error Section.
>
> Signed-off-by: Xiaofei Tan <tanxiaofei@huawei.com>
> ---
> Changes since v4:
> - 1. Change the patch name from " ACPI / APEI: do memory failure on the
> physical address reported by ARM processor error section" to this
> more proper one.
> - 2. Add a comment in the code to tell why not filter out corrected
> error in an uncorrected section.
>
> Changes since v3:
> - Print unhandled error following James Morse's advice.
>
> Changes since v2:
> - Updated commit log
> ---
>  drivers/acpi/apei/ghes.c | 76 +++++++++++++++++++++++++++++++++++++-----------
>  1 file changed, 59 insertions(+), 17 deletions(-)
>
> diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c
> index fce7ade..0893968 100644
> --- a/drivers/acpi/apei/ghes.c
> +++ b/drivers/acpi/apei/ghes.c
> @@ -441,28 +441,35 @@ static void ghes_kick_task_work(struct callback_head *head)
>  	gen_pool_free(ghes_estatus_pool, (unsigned long)estatus_node, node_len);
>  }
>
> -static bool ghes_handle_memory_failure(struct acpi_hest_generic_data *gdata,
> -				       int sev)
> +static bool ghes_do_memory_failure(u64 physical_addr, int flags)
>  {
>  	unsigned long pfn;
> -	int flags = -1;
> -	int sec_sev = ghes_severity(gdata->error_severity);
> -	struct cper_sec_mem_err *mem_err = acpi_hest_get_payload(gdata);
>
>  	if (!IS_ENABLED(CONFIG_ACPI_APEI_MEMORY_FAILURE))
>  		return false;
>
> -	if (!(mem_err->validation_bits & CPER_MEM_VALID_PA))
> -		return false;
> -
> -	pfn = mem_err->physical_addr >> PAGE_SHIFT;
> +	pfn = PHYS_PFN(physical_addr);
>  	if (!pfn_valid(pfn)) {
>  		pr_warn_ratelimited(FW_WARN GHES_PFX
>  		"Invalid address in generic error data: %#llx\n",
> -		mem_err->physical_addr);
> +		physical_addr);
>  		return false;
>  	}
>
> +	memory_failure_queue(pfn, flags);
> +	return true;
> +}
> +
> +static bool ghes_handle_memory_failure(struct acpi_hest_generic_data *gdata,
> +				       int sev)
> +{
> +	int flags = -1;
> +	int sec_sev = ghes_severity(gdata->error_severity);
> +	struct cper_sec_mem_err *mem_err = acpi_hest_get_payload(gdata);
> +
> +	if (!(mem_err->validation_bits & CPER_MEM_VALID_PA))
> +		return false;
> +
>  	/* iff following two events can be handled properly by now */
>  	if (sec_sev == GHES_SEV_CORRECTED &&
>  	    (gdata->flags & CPER_SEC_ERROR_THRESHOLD_EXCEEDED))
> @@ -470,14 +477,51 @@ static bool ghes_handle_memory_failure(struct acpi_hest_generic_data *gdata,
>  	if (sev == GHES_SEV_RECOVERABLE && sec_sev == GHES_SEV_RECOVERABLE)
>  		flags = 0;
>
> -	if (flags != -1) {
> -		memory_failure_queue(pfn, flags);
> -		return true;
> -	}
> +	if (flags != -1)
> +		return ghes_do_memory_failure(mem_err->physical_addr, flags);
>
>  	return false;
>  }
>
> +static bool ghes_handle_arm_hw_error(struct acpi_hest_generic_data *gdata, int sev)
> +{
> +	struct cper_sec_proc_arm *err = acpi_hest_get_payload(gdata);
> +	struct cper_arm_err_info *err_info;
> +	bool queued = false;
> +	int sec_sev, i;
> +
> +	log_arm_hw_error(err);
> +
> +	sec_sev = ghes_severity(gdata->error_severity);
> +	if (sev != GHES_SEV_RECOVERABLE || sec_sev != GHES_SEV_RECOVERABLE)
> +		return false;
> +
> +	err_info = (struct cper_arm_err_info *) (err + 1);
> +	for (i = 0; i < err->err_info_num; i++, err_info++) {
> +		bool is_cache = (err_info->type == CPER_ARM_CACHE_ERROR);
> +		bool has_pa = (err_info->validation_bits & CPER_ARM_INFO_VALID_PHYSICAL_ADDR);
> +
> +		/*
> +		 * The field (err_info->error_info & BIT(26)) is fixed to set to
> +		 * 1 in some old firmware of HiSilicon Kunpeng920. We assume that
> +		 * firmware won't mix corrected errors in an uncorrected section,
> +		 * and don't filter out 'corrected' error here.
> +		 */
> +		if (!is_cache || !has_pa) {
> +			pr_warn_ratelimited(FW_WARN GHES_PFX
> +			"Unhandled processor error type %s\n",
> +			err_info->type < ARRAY_SIZE(cper_proc_error_type_strs) ?
> +			cper_proc_error_type_strs[err_info->type] : "unknown error");
> +			continue;
> +		}
> +
> +		if (ghes_do_memory_failure(err_info->physical_fault_addr, 0))
> +			queued = true;
> +	}
> +
> +	return queued;
> +}
> +
>  /*
>   * PCIe AER errors need to be sent to the AER driver for reporting and
>   * recovery. The GHES severities map to the following AER severities and
> @@ -605,9 +649,7 @@ static bool ghes_do_proc(struct ghes *ghes,
>  			ghes_handle_aer(gdata);
>  		}
>  		else if (guid_equal(sec_type, &CPER_SEC_PROC_ARM)) {
> -			struct cper_sec_proc_arm *err = acpi_hest_get_payload(gdata);
> -
> -			log_arm_hw_error(err);
> +			queued = ghes_handle_arm_hw_error(gdata, sev);
>  		} else {
>  			void *err = acpi_hest_get_payload(gdata);
>
>


^ permalink raw reply	[flat|nested] 5+ messages in thread

end of thread, back to index

Thread overview: 5+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2020-12-10 12:09 [PATCH v5] ACPI / APEI: fix the regression of synchronous external aborts occur in user-mode Xiaofei Tan
2021-01-26 13:32 ` tanxiaofei
2021-02-05 12:55   ` Rafael J. Wysocki
2021-04-10  7:09     ` Xiaofei Tan
2021-05-11  1:31 ` Xiaofei Tan

Linux-ACPI Archive on lore.kernel.org

Archives are clonable:
	git clone --mirror https://lore.kernel.org/linux-acpi/0 linux-acpi/git/0.git

	# If you have public-inbox 1.1+ installed, you may
	# initialize and index your mirror using the following commands:
	public-inbox-init -V2 linux-acpi linux-acpi/ https://lore.kernel.org/linux-acpi \
		linux-acpi@vger.kernel.org
	public-inbox-index linux-acpi

Example config snippet for mirrors

Newsgroup available over NNTP:
	nntp://nntp.lore.kernel.org/org.kernel.vger.linux-acpi


AGPL code for this site: git clone https://public-inbox.org/public-inbox.git