linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH v6] ACPI / APEI: fix the regression of synchronous external aborts occur in user-mode
@ 2021-06-09  6:03 Xiaofei Tan
  2021-06-09 13:22 ` Rafael J. Wysocki
  0 siblings, 1 reply; 3+ messages in thread
From: Xiaofei Tan @ 2021-06-09  6:03 UTC (permalink / raw)
  To: james.morse, rafael, rjw, lenb, tony.luck, bp, akpm, jroedel, peterz
  Cc: linux-acpi, linux-kernel, linuxarm, Xiaofei Tan

Before commit 8fcc4ae6faf8 ("arm64: acpi: Make apei_claim_sea()
synchronise with APEI's irq work"), do_sea() would unconditionally
signal the affected task from the arch code. Since that change,
the GHES driver sends the signals.

This exposes a problem: errors that the GHES driver doesn't understand
or doesn't handle effectively are silently ignored. As a result, the
errors are taken again and recur endlessly, and the user-space task
gets stuck in this loop.

Existing firmware on Kunpeng9xx systems reports cache errors with the
'ARM Processor Error' CPER records.

Do memory failure handling for ARM Processor Error Section just like
for Memory Error Section.

Signed-off-by: Xiaofei Tan <tanxiaofei@huawei.com>
Reviewed-by: James Morse <james.morse@arm.com>

---
Changes since v5:
- Do some changes following James's suggestions: 1) optimize commit log
2) use err_info->length instead of err_info++ 3) some coding style
advice.

Changes since v4:
- 1. Change the patch name from " ACPI / APEI: do memory failure on the
physical address reported by ARM processor error section" to this
more proper one.
- 2. Add a comment in the code to tell why not filter out corrected
error in an uncorrected section.

Changes since v3:
- Print unhandled error following James Morse's advice.

Changes since v2:
- Updated commit log
---
 drivers/acpi/apei/ghes.c | 81 ++++++++++++++++++++++++++++++++++++++----------
 1 file changed, 64 insertions(+), 17 deletions(-)

diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c
index fce7ade..0c8330e 100644
--- a/drivers/acpi/apei/ghes.c
+++ b/drivers/acpi/apei/ghes.c
@@ -441,28 +441,35 @@ static void ghes_kick_task_work(struct callback_head *head)
 	gen_pool_free(ghes_estatus_pool, (unsigned long)estatus_node, node_len);
 }
 
-static bool ghes_handle_memory_failure(struct acpi_hest_generic_data *gdata,
-				       int sev)
+static bool ghes_do_memory_failure(u64 physical_addr, int flags)
 {
 	unsigned long pfn;
-	int flags = -1;
-	int sec_sev = ghes_severity(gdata->error_severity);
-	struct cper_sec_mem_err *mem_err = acpi_hest_get_payload(gdata);
 
 	if (!IS_ENABLED(CONFIG_ACPI_APEI_MEMORY_FAILURE))
 		return false;
 
-	if (!(mem_err->validation_bits & CPER_MEM_VALID_PA))
-		return false;
-
-	pfn = mem_err->physical_addr >> PAGE_SHIFT;
+	pfn = PHYS_PFN(physical_addr);
 	if (!pfn_valid(pfn)) {
 		pr_warn_ratelimited(FW_WARN GHES_PFX
 		"Invalid address in generic error data: %#llx\n",
-		mem_err->physical_addr);
+		physical_addr);
 		return false;
 	}
 
+	memory_failure_queue(pfn, flags);
+	return true;
+}
+
+static bool ghes_handle_memory_failure(struct acpi_hest_generic_data *gdata,
+				       int sev)
+{
+	int flags = -1;
+	int sec_sev = ghes_severity(gdata->error_severity);
+	struct cper_sec_mem_err *mem_err = acpi_hest_get_payload(gdata);
+
+	if (!(mem_err->validation_bits & CPER_MEM_VALID_PA))
+		return false;
+
 	/* iff following two events can be handled properly by now */
 	if (sec_sev == GHES_SEV_CORRECTED &&
 	    (gdata->flags & CPER_SEC_ERROR_THRESHOLD_EXCEEDED))
@@ -470,14 +477,56 @@ static bool ghes_handle_memory_failure(struct acpi_hest_generic_data *gdata,
 	if (sev == GHES_SEV_RECOVERABLE && sec_sev == GHES_SEV_RECOVERABLE)
 		flags = 0;
 
-	if (flags != -1) {
-		memory_failure_queue(pfn, flags);
-		return true;
-	}
+	if (flags != -1)
+		return ghes_do_memory_failure(mem_err->physical_addr, flags);
 
 	return false;
 }
 
+static bool ghes_handle_arm_hw_error(struct acpi_hest_generic_data *gdata, int sev)
+{
+	struct cper_sec_proc_arm *err = acpi_hest_get_payload(gdata);
+	bool queued = false;
+	int sec_sev, i;
+	char *p;
+
+	log_arm_hw_error(err);
+
+	sec_sev = ghes_severity(gdata->error_severity);
+	if (sev != GHES_SEV_RECOVERABLE || sec_sev != GHES_SEV_RECOVERABLE)
+		return false;
+
+	p = (char *)(err + 1);
+	for (i = 0; i < err->err_info_num; i++) {
+		struct cper_arm_err_info *err_info = (struct cper_arm_err_info *)p;
+		bool is_cache = (err_info->type == CPER_ARM_CACHE_ERROR);
+		bool has_pa = (err_info->validation_bits & CPER_ARM_INFO_VALID_PHYSICAL_ADDR);
+		const char *error_type = "unknown error";
+
+		/*
+		 * The field (err_info->error_info & BIT(26)) is fixed to set to
+		 * 1 in some old firmware of HiSilicon Kunpeng920. We assume that
+		 * firmware won't mix corrected errors in an uncorrected section,
+		 * and don't filter out 'corrected' error here.
+		 */
+		if (is_cache && has_pa) {
+			queued = ghes_do_memory_failure(err_info->physical_fault_addr, 0);
+			p += err_info->length;
+			continue;
+		}
+
+		if (err_info->type < ARRAY_SIZE(cper_proc_error_type_strs))
+			error_type = cper_proc_error_type_strs[err_info->type];
+
+		pr_warn_ratelimited(FW_WARN GHES_PFX
+				    "Unhandled processor error type: %s\n",
+				    error_type);
+		p += err_info->length;
+	}
+
+	return queued;
+}
+
 /*
  * PCIe AER errors need to be sent to the AER driver for reporting and
  * recovery. The GHES severities map to the following AER severities and
@@ -605,9 +654,7 @@ static bool ghes_do_proc(struct ghes *ghes,
 			ghes_handle_aer(gdata);
 		}
 		else if (guid_equal(sec_type, &CPER_SEC_PROC_ARM)) {
-			struct cper_sec_proc_arm *err = acpi_hest_get_payload(gdata);
-
-			log_arm_hw_error(err);
+			queued = ghes_handle_arm_hw_error(gdata, sev);
 		} else {
 			void *err = acpi_hest_get_payload(gdata);
 
-- 
2.8.1


^ permalink raw reply related	[flat|nested] 3+ messages in thread

* Re: [PATCH v6] ACPI / APEI: fix the regression of synchronous external aborts occur in user-mode
  2021-06-09  6:03 [PATCH v6] ACPI / APEI: fix the regression of synchronous external aborts occur in user-mode Xiaofei Tan
@ 2021-06-09 13:22 ` Rafael J. Wysocki
  2021-06-10  6:44   ` Xiaofei Tan
  0 siblings, 1 reply; 3+ messages in thread
From: Rafael J. Wysocki @ 2021-06-09 13:22 UTC (permalink / raw)
  To: Xiaofei Tan
  Cc: James Morse, Rafael J. Wysocki, Rafael J. Wysocki, Len Brown,
	Tony Luck, Borislav Petkov, Andrew Morton, Joerg Roedel,
	Peter Zijlstra, ACPI Devel Mailing List,
	Linux Kernel Mailing List, linuxarm

On Wed, Jun 9, 2021 at 8:06 AM Xiaofei Tan <tanxiaofei@huawei.com> wrote:
>
> Before commit 8fcc4ae6faf8 ("arm64: acpi: Make apei_claim_sea()
> synchronise with APEI's irq work"), do_sea() would unconditionally
> signal the affected task from the arch code. Since that change,
> the GHES driver sends the signals.

Since this fixes a regression apparently introduced by the above
commit, please add a Fixes tag pointing to that commit to it.

> This exposes a problem as errors the GHES driver doesn't understand
> or doesn't handle effectively are silently ignored. It will cause
> the errors get taken again, and circulate endlessly. User-space task
> get stuck in this loop.
>
> Existing firmware on Kunpeng9xx systems reports cache errors with the
> 'ARM Processor Error' CPER records.
>
> Do memory failure handling for ARM Processor Error Section just like
> for Memory Error Section.

So why is this the right thing to do?

I guess it doesn't address the problem entirely, but only in this
particular case, so what if the firmware on some other platform
reports errors with a new type unknown to the GHES driver?  Will the
problem show up again?

> Signed-off-by: Xiaofei Tan <tanxiaofei@huawei.com>
> Reviewed-by: James Morse <james.morse@arm.com>
>
> ---
> Changes since v5:
> - Do some changes following James's suggestions: 1) optimize commit log
> 2) use err_info->length instead of err_info++' 3) some coding style
> advice.
>
> Changes since v4:
> - 1. Change the patch name from " ACPI / APEI: do memory failure on the
> physical address reported by ARM processor error section" to this
> more proper one.
> - 2. Add a comment in the code to tell why not filter out corrected
> error in an uncorrected section.
>
> Changes since v3:
> - Print unhandled error following James Morse's advice.
>
> Changes since v2:
> - Updated commit log
> ---
>  drivers/acpi/apei/ghes.c | 81 ++++++++++++++++++++++++++++++++++++++----------
>  1 file changed, 64 insertions(+), 17 deletions(-)
>
> diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c
> index fce7ade..0c8330e 100644
> --- a/drivers/acpi/apei/ghes.c
> +++ b/drivers/acpi/apei/ghes.c
> @@ -441,28 +441,35 @@ static void ghes_kick_task_work(struct callback_head *head)
>         gen_pool_free(ghes_estatus_pool, (unsigned long)estatus_node, node_len);
>  }
>
> -static bool ghes_handle_memory_failure(struct acpi_hest_generic_data *gdata,
> -                                      int sev)
> +static bool ghes_do_memory_failure(u64 physical_addr, int flags)
>  {
>         unsigned long pfn;
> -       int flags = -1;
> -       int sec_sev = ghes_severity(gdata->error_severity);
> -       struct cper_sec_mem_err *mem_err = acpi_hest_get_payload(gdata);
>
>         if (!IS_ENABLED(CONFIG_ACPI_APEI_MEMORY_FAILURE))
>                 return false;
>
> -       if (!(mem_err->validation_bits & CPER_MEM_VALID_PA))
> -               return false;
> -
> -       pfn = mem_err->physical_addr >> PAGE_SHIFT;
> +       pfn = PHYS_PFN(physical_addr);
>         if (!pfn_valid(pfn)) {
>                 pr_warn_ratelimited(FW_WARN GHES_PFX
>                 "Invalid address in generic error data: %#llx\n",
> -               mem_err->physical_addr);
> +               physical_addr);
>                 return false;
>         }
>
> +       memory_failure_queue(pfn, flags);
> +       return true;
> +}
> +
> +static bool ghes_handle_memory_failure(struct acpi_hest_generic_data *gdata,
> +                                      int sev)
> +{
> +       int flags = -1;
> +       int sec_sev = ghes_severity(gdata->error_severity);
> +       struct cper_sec_mem_err *mem_err = acpi_hest_get_payload(gdata);
> +
> +       if (!(mem_err->validation_bits & CPER_MEM_VALID_PA))
> +               return false;
> +
>         /* iff following two events can be handled properly by now */
>         if (sec_sev == GHES_SEV_CORRECTED &&
>             (gdata->flags & CPER_SEC_ERROR_THRESHOLD_EXCEEDED))
> @@ -470,14 +477,56 @@ static bool ghes_handle_memory_failure(struct acpi_hest_generic_data *gdata,
>         if (sev == GHES_SEV_RECOVERABLE && sec_sev == GHES_SEV_RECOVERABLE)
>                 flags = 0;
>
> -       if (flags != -1) {
> -               memory_failure_queue(pfn, flags);
> -               return true;
> -       }
> +       if (flags != -1)
> +               return ghes_do_memory_failure(mem_err->physical_addr, flags);
>
>         return false;
>  }
>
> +static bool ghes_handle_arm_hw_error(struct acpi_hest_generic_data *gdata, int sev)
> +{
> +       struct cper_sec_proc_arm *err = acpi_hest_get_payload(gdata);
> +       bool queued = false;
> +       int sec_sev, i;
> +       char *p;
> +
> +       log_arm_hw_error(err);
> +
> +       sec_sev = ghes_severity(gdata->error_severity);
> +       if (sev != GHES_SEV_RECOVERABLE || sec_sev != GHES_SEV_RECOVERABLE)
> +               return false;
> +
> +       p = (char *)(err + 1);
> +       for (i = 0; i < err->err_info_num; i++) {
> +               struct cper_arm_err_info *err_info = (struct cper_arm_err_info *)p;
> +               bool is_cache = (err_info->type == CPER_ARM_CACHE_ERROR);
> +               bool has_pa = (err_info->validation_bits & CPER_ARM_INFO_VALID_PHYSICAL_ADDR);
> +               const char *error_type = "unknown error";
> +
> +               /*
> +                * The field (err_info->error_info & BIT(26)) is fixed to set to
> +                * 1 in some old firmware of HiSilicon Kunpeng920. We assume that
> +                * firmware won't mix corrected errors in an uncorrected section,
> +                * and don't filter out 'corrected' error here.
> +                */
> +               if (is_cache && has_pa) {
> +                       queued = ghes_do_memory_failure(err_info->physical_fault_addr, 0);
> +                       p += err_info->length;
> +                       continue;
> +               }
> +
> +               if (err_info->type < ARRAY_SIZE(cper_proc_error_type_strs))
> +                       error_type = cper_proc_error_type_strs[err_info->type];
> +
> +               pr_warn_ratelimited(FW_WARN GHES_PFX
> +                                   "Unhandled processor error type: %s\n",
> +                                   error_type);
> +               p += err_info->length;
> +       }
> +
> +       return queued;
> +}
> +
>  /*
>   * PCIe AER errors need to be sent to the AER driver for reporting and
>   * recovery. The GHES severities map to the following AER severities and
> @@ -605,9 +654,7 @@ static bool ghes_do_proc(struct ghes *ghes,
>                         ghes_handle_aer(gdata);
>                 }
>                 else if (guid_equal(sec_type, &CPER_SEC_PROC_ARM)) {
> -                       struct cper_sec_proc_arm *err = acpi_hest_get_payload(gdata);
> -
> -                       log_arm_hw_error(err);
> +                       queued = ghes_handle_arm_hw_error(gdata, sev);
>                 } else {
>                         void *err = acpi_hest_get_payload(gdata);
>
> --
> 2.8.1
>

^ permalink raw reply	[flat|nested] 3+ messages in thread

* Re: [PATCH v6] ACPI / APEI: fix the regression of synchronous external aborts occur in user-mode
  2021-06-09 13:22 ` Rafael J. Wysocki
@ 2021-06-10  6:44   ` Xiaofei Tan
  0 siblings, 0 replies; 3+ messages in thread
From: Xiaofei Tan @ 2021-06-10  6:44 UTC (permalink / raw)
  To: Rafael J. Wysocki
  Cc: James Morse, Rafael J. Wysocki, Len Brown, Tony Luck,
	Borislav Petkov, Andrew Morton, Joerg Roedel, Peter Zijlstra,
	ACPI Devel Mailing List, Linux Kernel Mailing List, linuxarm

Hi Rafael,

On 2021/6/9 21:22, Rafael J. Wysocki wrote:
> On Wed, Jun 9, 2021 at 8:06 AM Xiaofei Tan <tanxiaofei@huawei.com> wrote:
>>
>> Before commit 8fcc4ae6faf8 ("arm64: acpi: Make apei_claim_sea()
>> synchronise with APEI's irq work"), do_sea() would unconditionally
>> signal the affected task from the arch code. Since that change,
>> the GHES driver sends the signals.
>
> Since this fixes a regression apparently introduced by the above
> commit, please add a Fixes tag pointing to that commit to it.
>

OK.

>> This exposes a problem as errors the GHES driver doesn't understand
>> or doesn't handle effectively are silently ignored. It will cause
>> the errors get taken again, and circulate endlessly. User-space task
>> get stuck in this loop.
>>
>> Existing firmware on Kunpeng9xx systems reports cache errors with the
>> 'ARM Processor Error' CPER records.
>>
>> Do memory failure handling for ARM Processor Error Section just like
>> for Memory Error Section.
>
> So why is this the right thing to do?
>
> I guess it doesn't address the problem entirely, but only in this
> particular case, so what if the firmware on some other platform
> reports errors with a new type unknown to the GHES driver?  Will the
> problem show up again?

Yes. The GHES driver should give the right feedback to the arch code.
I mean that apei_claim_sea() or ghes_notify_sea() should not return 0 if the error is unknown.
But this seems difficult to achieve with the current architecture of the GHES driver.

>
>> Signed-off-by: Xiaofei Tan <tanxiaofei@huawei.com>
>> Reviewed-by: James Morse <james.morse@arm.com>
>>
>> ---
>> Changes since v5:
>> - Do some changes following James's suggestions: 1) optimize commit log
>> 2) use err_info->length instead of err_info++' 3) some coding style
>> advice.
>>
>> Changes since v4:
>> - 1. Change the patch name from " ACPI / APEI: do memory failure on the
>> physical address reported by ARM processor error section" to this
>> more proper one.
>> - 2. Add a comment in the code to tell why not filter out corrected
>> error in an uncorrected section.
>>
>> Changes since v3:
>> - Print unhandled error following James Morse's advice.
>>
>> Changes since v2:
>> - Updated commit log
>> ---
>>  drivers/acpi/apei/ghes.c | 81 ++++++++++++++++++++++++++++++++++++++----------
>>  1 file changed, 64 insertions(+), 17 deletions(-)
>>
>> diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c
>> index fce7ade..0c8330e 100644
>> --- a/drivers/acpi/apei/ghes.c
>> +++ b/drivers/acpi/apei/ghes.c
>> @@ -441,28 +441,35 @@ static void ghes_kick_task_work(struct callback_head *head)
>>         gen_pool_free(ghes_estatus_pool, (unsigned long)estatus_node, node_len);
>>  }
>>
>> -static bool ghes_handle_memory_failure(struct acpi_hest_generic_data *gdata,
>> -                                      int sev)
>> +static bool ghes_do_memory_failure(u64 physical_addr, int flags)
>>  {
>>         unsigned long pfn;
>> -       int flags = -1;
>> -       int sec_sev = ghes_severity(gdata->error_severity);
>> -       struct cper_sec_mem_err *mem_err = acpi_hest_get_payload(gdata);
>>
>>         if (!IS_ENABLED(CONFIG_ACPI_APEI_MEMORY_FAILURE))
>>                 return false;
>>
>> -       if (!(mem_err->validation_bits & CPER_MEM_VALID_PA))
>> -               return false;
>> -
>> -       pfn = mem_err->physical_addr >> PAGE_SHIFT;
>> +       pfn = PHYS_PFN(physical_addr);
>>         if (!pfn_valid(pfn)) {
>>                 pr_warn_ratelimited(FW_WARN GHES_PFX
>>                 "Invalid address in generic error data: %#llx\n",
>> -               mem_err->physical_addr);
>> +               physical_addr);
>>                 return false;
>>         }
>>
>> +       memory_failure_queue(pfn, flags);
>> +       return true;
>> +}
>> +
>> +static bool ghes_handle_memory_failure(struct acpi_hest_generic_data *gdata,
>> +                                      int sev)
>> +{
>> +       int flags = -1;
>> +       int sec_sev = ghes_severity(gdata->error_severity);
>> +       struct cper_sec_mem_err *mem_err = acpi_hest_get_payload(gdata);
>> +
>> +       if (!(mem_err->validation_bits & CPER_MEM_VALID_PA))
>> +               return false;
>> +
>>         /* iff following two events can be handled properly by now */
>>         if (sec_sev == GHES_SEV_CORRECTED &&
>>             (gdata->flags & CPER_SEC_ERROR_THRESHOLD_EXCEEDED))
>> @@ -470,14 +477,56 @@ static bool ghes_handle_memory_failure(struct acpi_hest_generic_data *gdata,
>>         if (sev == GHES_SEV_RECOVERABLE && sec_sev == GHES_SEV_RECOVERABLE)
>>                 flags = 0;
>>
>> -       if (flags != -1) {
>> -               memory_failure_queue(pfn, flags);
>> -               return true;
>> -       }
>> +       if (flags != -1)
>> +               return ghes_do_memory_failure(mem_err->physical_addr, flags);
>>
>>         return false;
>>  }
>>
>> +static bool ghes_handle_arm_hw_error(struct acpi_hest_generic_data *gdata, int sev)
>> +{
>> +       struct cper_sec_proc_arm *err = acpi_hest_get_payload(gdata);
>> +       bool queued = false;
>> +       int sec_sev, i;
>> +       char *p;
>> +
>> +       log_arm_hw_error(err);
>> +
>> +       sec_sev = ghes_severity(gdata->error_severity);
>> +       if (sev != GHES_SEV_RECOVERABLE || sec_sev != GHES_SEV_RECOVERABLE)
>> +               return false;
>> +
>> +       p = (char *)(err + 1);
>> +       for (i = 0; i < err->err_info_num; i++) {
>> +               struct cper_arm_err_info *err_info = (struct cper_arm_err_info *)p;
>> +               bool is_cache = (err_info->type == CPER_ARM_CACHE_ERROR);
>> +               bool has_pa = (err_info->validation_bits & CPER_ARM_INFO_VALID_PHYSICAL_ADDR);
>> +               const char *error_type = "unknown error";
>> +
>> +               /*
>> +                * The field (err_info->error_info & BIT(26)) is fixed to set to
>> +                * 1 in some old firmware of HiSilicon Kunpeng920. We assume that
>> +                * firmware won't mix corrected errors in an uncorrected section,
>> +                * and don't filter out 'corrected' error here.
>> +                */
>> +               if (is_cache && has_pa) {
>> +                       queued = ghes_do_memory_failure(err_info->physical_fault_addr, 0);
>> +                       p += err_info->length;
>> +                       continue;
>> +               }
>> +
>> +               if (err_info->type < ARRAY_SIZE(cper_proc_error_type_strs))
>> +                       error_type = cper_proc_error_type_strs[err_info->type];
>> +
>> +               pr_warn_ratelimited(FW_WARN GHES_PFX
>> +                                   "Unhandled processor error type: %s\n",
>> +                                   error_type);
>> +               p += err_info->length;
>> +       }
>> +
>> +       return queued;
>> +}
>> +
>>  /*
>>   * PCIe AER errors need to be sent to the AER driver for reporting and
>>   * recovery. The GHES severities map to the following AER severities and
>> @@ -605,9 +654,7 @@ static bool ghes_do_proc(struct ghes *ghes,
>>                         ghes_handle_aer(gdata);
>>                 }
>>                 else if (guid_equal(sec_type, &CPER_SEC_PROC_ARM)) {
>> -                       struct cper_sec_proc_arm *err = acpi_hest_get_payload(gdata);
>> -
>> -                       log_arm_hw_error(err);
>> +                       queued = ghes_handle_arm_hw_error(gdata, sev);
>>                 } else {
>>                         void *err = acpi_hest_get_payload(gdata);
>>
>> --
>> 2.8.1
>>
>
> .
>


^ permalink raw reply	[flat|nested] 3+ messages in thread

end of thread, other threads:[~2021-06-10  6:44 UTC | newest]

Thread overview: 3+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-06-09  6:03 [PATCH v6] ACPI / APEI: fix the regression of synchronous external aborts occur in user-mode Xiaofei Tan
2021-06-09 13:22 ` Rafael J. Wysocki
2021-06-10  6:44   ` Xiaofei Tan

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).