linux-acpi.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH v2] ACPI / APEI: do memory failure on the physical address reported by ARM processor error section
@ 2020-08-05  9:12 Xiaofei Tan
  2020-08-17  2:49 ` Xiaofei Tan
  2020-08-31  1:28 ` Xiaofei Tan
  0 siblings, 2 replies; 3+ messages in thread
From: Xiaofei Tan @ 2020-08-05  9:12 UTC (permalink / raw)
  To: linux-acpi, linux-kernel, rjw, lenb, james.morse, tony.luck, bp
  Cc: linuxarm, shiju.jose, jonathan.cameron, Xiaofei Tan

After commit 8fcc4ae6faf8 ("arm64: acpi: Make apei_claim_sea()
synchronise with APEI's irq work") was applied, user-mode SEA is
preferentially processed by APEI, which does memory failure handling
to recover.

But there are some problems:
1) The fact that apei_claim_sea() has processed a CPER does not mean
that memory failure handling has been done, because a firmware-first
RAS error is reported by both the producer and the consumer. An SEA is
mostly reported, as a consumer, using the ARM processor error section.
(The producer could be the DDRC or a cache, and uses the memory error
section or another error section to report.) But memory failure
handling for the ARM processor error section is not yet supported.
This patch adds it.

2) Some hardware platforms can't record the physical address every
time, but they may still always report a firmware-first RAS error
using the ARM processor error section. Such platforms should update
their firmware: it should not report the RAS error in the SEA
processing flow when the physical address is not recorded.

Signed-off-by: Xiaofei Tan <tanxiaofei@huawei.com>
---
 drivers/acpi/apei/ghes.c | 70 ++++++++++++++++++++++++++++++++++++------------
 1 file changed, 53 insertions(+), 17 deletions(-)

diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c
index 81bf71b..aee7787 100644
--- a/drivers/acpi/apei/ghes.c
+++ b/drivers/acpi/apei/ghes.c
@@ -429,28 +429,35 @@ static void ghes_kick_task_work(struct callback_head *head)
 	gen_pool_free(ghes_estatus_pool, (unsigned long)estatus_node, node_len);
 }
 
-static bool ghes_handle_memory_failure(struct acpi_hest_generic_data *gdata,
-				       int sev)
+static bool ghes_do_memory_failure(u64 physical_addr, int flags)
 {
 	unsigned long pfn;
-	int flags = -1;
-	int sec_sev = ghes_severity(gdata->error_severity);
-	struct cper_sec_mem_err *mem_err = acpi_hest_get_payload(gdata);
 
 	if (!IS_ENABLED(CONFIG_ACPI_APEI_MEMORY_FAILURE))
 		return false;
 
-	if (!(mem_err->validation_bits & CPER_MEM_VALID_PA))
-		return false;
-
-	pfn = mem_err->physical_addr >> PAGE_SHIFT;
+	pfn = PHYS_PFN(physical_addr);
 	if (!pfn_valid(pfn)) {
 		pr_warn_ratelimited(FW_WARN GHES_PFX
 		"Invalid address in generic error data: %#llx\n",
-		mem_err->physical_addr);
+		physical_addr);
 		return false;
 	}
 
+	memory_failure_queue(pfn, flags);
+	return true;
+}
+
+static bool ghes_handle_memory_failure(struct acpi_hest_generic_data *gdata,
+				       int sev)
+{
+	int flags = -1;
+	int sec_sev = ghes_severity(gdata->error_severity);
+	struct cper_sec_mem_err *mem_err = acpi_hest_get_payload(gdata);
+
+	if (!(mem_err->validation_bits & CPER_MEM_VALID_PA))
+		return false;
+
 	/* iff following two events can be handled properly by now */
 	if (sec_sev == GHES_SEV_CORRECTED &&
 	    (gdata->flags & CPER_SEC_ERROR_THRESHOLD_EXCEEDED))
@@ -458,14 +465,45 @@ static bool ghes_handle_memory_failure(struct acpi_hest_generic_data *gdata,
 	if (sev == GHES_SEV_RECOVERABLE && sec_sev == GHES_SEV_RECOVERABLE)
 		flags = 0;
 
-	if (flags != -1) {
-		memory_failure_queue(pfn, flags);
-		return true;
-	}
+	if (flags != -1)
+		return ghes_do_memory_failure(mem_err->physical_addr, flags);
 
 	return false;
 }
 
+static bool ghes_handle_arm_hw_error(struct acpi_hest_generic_data *gdata, int sev)
+{
+	struct cper_sec_proc_arm *err = acpi_hest_get_payload(gdata);
+	struct cper_arm_err_info *err_info;
+	bool queued = false;
+	int sec_sev, i;
+
+	log_arm_hw_error(err);
+
+	sec_sev = ghes_severity(gdata->error_severity);
+	if (sev != GHES_SEV_RECOVERABLE || sec_sev != GHES_SEV_RECOVERABLE)
+		return false;
+
+	err_info = (struct cper_arm_err_info *) (err + 1);
+	for (i = 0; i < err->err_info_num; i++, err_info++) {
+		if (!(err_info->validation_bits & CPER_ARM_INFO_VALID_PHYSICAL_ADDR))
+			continue;
+
+		if (err_info->type != CPER_ARM_CACHE_ERROR) {
+			pr_warn_ratelimited(FW_WARN GHES_PFX
+			"Physical address should be invalid for %s\n",
+			err_info->type < ARRAY_SIZE(cper_proc_error_type_strs) ?
+			cper_proc_error_type_strs[err_info->type] : "unknown error type");
+			continue;
+		}
+
+		if (ghes_do_memory_failure(err_info->physical_fault_addr, 0))
+			queued = true;
+	}
+
+	return queued;
+}
+
 /*
  * PCIe AER errors need to be sent to the AER driver for reporting and
  * recovery. The GHES severities map to the following AER severities and
@@ -543,9 +581,7 @@ static bool ghes_do_proc(struct ghes *ghes,
 			ghes_handle_aer(gdata);
 		}
 		else if (guid_equal(sec_type, &CPER_SEC_PROC_ARM)) {
-			struct cper_sec_proc_arm *err = acpi_hest_get_payload(gdata);
-
-			log_arm_hw_error(err);
+			queued = ghes_handle_arm_hw_error(gdata, sev);
 		} else {
 			void *err = acpi_hest_get_payload(gdata);
 
-- 
2.8.1


^ permalink raw reply related	[flat|nested] 3+ messages in thread

* Re: [PATCH v2] ACPI / APEI: do memory failure on the physical address reported by ARM processor error section
  2020-08-05  9:12 [PATCH v2] ACPI / APEI: do memory failure on the physical address reported by ARM processor error section Xiaofei Tan
@ 2020-08-17  2:49 ` Xiaofei Tan
  2020-08-31  1:28 ` Xiaofei Tan
  1 sibling, 0 replies; 3+ messages in thread
From: Xiaofei Tan @ 2020-08-17  2:49 UTC (permalink / raw)
  To: linux-acpi, linux-kernel, rjw, lenb, james.morse, tony.luck, bp
  Cc: linuxarm, shiju.jose, jonathan.cameron

@James, could you help check whether I have addressed your review comments? Thanks.

On 2020/8/5 17:12, Xiaofei Tan wrote:
> After the commit 8fcc4ae6faf8 ("arm64: acpi: Make apei_claim_sea()
> synchronise with APEI's irq work") applied, user-mode SEA is
> preferentially processed by APEI. Do memory failure to recover.
> 
> But there are some problems:
> 1) The function apei_claim_sea() has processed an CPER, does not
> mean that memory failure handling has done. Because the firmware-first
> RAS error is reported by both producer and consumer. Mostly SEA uses
> ARM processor error section to report as a consumer. (The producer could
> be DDRC and cache, and use memory error section and other error section
> to report). But memory failure handling for ARM processor error section
> has not been supported. This patch will add it.
> 
> 2) Some hardware platforms can't record physical address each time. But
> they could always have reported a firmware-first RAS error using ARM
> processor error section. Such platform should update firmware. Don't
> report the RAS error in SEA processing flow when physical address is
> not recorded.
> 
> Signed-off-by: Xiaofei Tan <tanxiaofei@huawei.com>
> ---
>  drivers/acpi/apei/ghes.c | 70 ++++++++++++++++++++++++++++++++++++------------
>  1 file changed, 53 insertions(+), 17 deletions(-)
> 
> diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c
> index 81bf71b..aee7787 100644
> --- a/drivers/acpi/apei/ghes.c
> +++ b/drivers/acpi/apei/ghes.c
> @@ -429,28 +429,35 @@ static void ghes_kick_task_work(struct callback_head *head)
>  	gen_pool_free(ghes_estatus_pool, (unsigned long)estatus_node, node_len);
>  }
>  
> -static bool ghes_handle_memory_failure(struct acpi_hest_generic_data *gdata,
> -				       int sev)
> +static bool ghes_do_memory_failure(u64 physical_addr, int flags)
>  {
>  	unsigned long pfn;
> -	int flags = -1;
> -	int sec_sev = ghes_severity(gdata->error_severity);
> -	struct cper_sec_mem_err *mem_err = acpi_hest_get_payload(gdata);
>  
>  	if (!IS_ENABLED(CONFIG_ACPI_APEI_MEMORY_FAILURE))
>  		return false;
>  
> -	if (!(mem_err->validation_bits & CPER_MEM_VALID_PA))
> -		return false;
> -
> -	pfn = mem_err->physical_addr >> PAGE_SHIFT;
> +	pfn = PHYS_PFN(physical_addr);
>  	if (!pfn_valid(pfn)) {
>  		pr_warn_ratelimited(FW_WARN GHES_PFX
>  		"Invalid address in generic error data: %#llx\n",
> -		mem_err->physical_addr);
> +		physical_addr);
>  		return false;
>  	}
>  
> +	memory_failure_queue(pfn, flags);
> +	return true;
> +}
> +
> +static bool ghes_handle_memory_failure(struct acpi_hest_generic_data *gdata,
> +				       int sev)
> +{
> +	int flags = -1;
> +	int sec_sev = ghes_severity(gdata->error_severity);
> +	struct cper_sec_mem_err *mem_err = acpi_hest_get_payload(gdata);
> +
> +	if (!(mem_err->validation_bits & CPER_MEM_VALID_PA))
> +		return false;
> +
>  	/* iff following two events can be handled properly by now */
>  	if (sec_sev == GHES_SEV_CORRECTED &&
>  	    (gdata->flags & CPER_SEC_ERROR_THRESHOLD_EXCEEDED))
> @@ -458,14 +465,45 @@ static bool ghes_handle_memory_failure(struct acpi_hest_generic_data *gdata,
>  	if (sev == GHES_SEV_RECOVERABLE && sec_sev == GHES_SEV_RECOVERABLE)
>  		flags = 0;
>  
> -	if (flags != -1) {
> -		memory_failure_queue(pfn, flags);
> -		return true;
> -	}
> +	if (flags != -1)
> +		return ghes_do_memory_failure(mem_err->physical_addr, flags);
>  
>  	return false;
>  }
>  
> +static bool ghes_handle_arm_hw_error(struct acpi_hest_generic_data *gdata, int sev)
> +{
> +	struct cper_sec_proc_arm *err = acpi_hest_get_payload(gdata);
> +	struct cper_arm_err_info *err_info;
> +	bool queued = false;
> +	int sec_sev, i;
> +
> +	log_arm_hw_error(err);
> +
> +	sec_sev = ghes_severity(gdata->error_severity);
> +	if (sev != GHES_SEV_RECOVERABLE || sec_sev != GHES_SEV_RECOVERABLE)
> +		return false;
> +
> +	err_info = (struct cper_arm_err_info *) (err + 1);
> +	for (i = 0; i < err->err_info_num; i++, err_info++) {
> +		if (!(err_info->validation_bits & CPER_ARM_INFO_VALID_PHYSICAL_ADDR))
> +			continue;
> +
> +		if (err_info->type != CPER_ARM_CACHE_ERROR) {
> +			pr_warn_ratelimited(FW_WARN GHES_PFX
> +			"Physical address should be invalid for %s\n",
> +			err_info->type < ARRAY_SIZE(cper_proc_error_type_strs) ?
> +			cper_proc_error_type_strs[err_info->type] : "unknown error type");
> +			continue;
> +		}
> +
> +		if (ghes_do_memory_failure(err_info->physical_fault_addr, 0))
> +			queued = true;
> +	}
> +
> +	return queued;
> +}
> +
>  /*
>   * PCIe AER errors need to be sent to the AER driver for reporting and
>   * recovery. The GHES severities map to the following AER severities and
> @@ -543,9 +581,7 @@ static bool ghes_do_proc(struct ghes *ghes,
>  			ghes_handle_aer(gdata);
>  		}
>  		else if (guid_equal(sec_type, &CPER_SEC_PROC_ARM)) {
> -			struct cper_sec_proc_arm *err = acpi_hest_get_payload(gdata);
> -
> -			log_arm_hw_error(err);
> +			queued = ghes_handle_arm_hw_error(gdata, sev);
>  		} else {
>  			void *err = acpi_hest_get_payload(gdata);
>  
> 

-- 
 thanks
tanxiaofei


^ permalink raw reply	[flat|nested] 3+ messages in thread

* Re: [PATCH v2] ACPI / APEI: do memory failure on the physical address reported by ARM processor error section
  2020-08-05  9:12 [PATCH v2] ACPI / APEI: do memory failure on the physical address reported by ARM processor error section Xiaofei Tan
  2020-08-17  2:49 ` Xiaofei Tan
@ 2020-08-31  1:28 ` Xiaofei Tan
  1 sibling, 0 replies; 3+ messages in thread
From: Xiaofei Tan @ 2020-08-31  1:28 UTC (permalink / raw)
  To: linux-acpi, linux-kernel, rjw, lenb, james.morse, tony.luck, bp
  Cc: linuxarm, shiju.jose, jonathan.cameron


ping...

On 2020/8/5 17:12, Xiaofei Tan wrote:
> After the commit 8fcc4ae6faf8 ("arm64: acpi: Make apei_claim_sea()
> synchronise with APEI's irq work") applied, user-mode SEA is
> preferentially processed by APEI. Do memory failure to recover.
> 
> But there are some problems:
> 1) The function apei_claim_sea() has processed an CPER, does not
> mean that memory failure handling has done. Because the firmware-first
> RAS error is reported by both producer and consumer. Mostly SEA uses
> ARM processor error section to report as a consumer. (The producer could
> be DDRC and cache, and use memory error section and other error section
> to report). But memory failure handling for ARM processor error section
> has not been supported. This patch will add it.
> 
> 2) Some hardware platforms can't record physical address each time. But
> they could always have reported a firmware-first RAS error using ARM
> processor error section. Such platform should update firmware. Don't
> report the RAS error in SEA processing flow when physical address is
> not recorded.
> 
> Signed-off-by: Xiaofei Tan <tanxiaofei@huawei.com>
> ---
>  drivers/acpi/apei/ghes.c | 70 ++++++++++++++++++++++++++++++++++++------------
>  1 file changed, 53 insertions(+), 17 deletions(-)
> 
> diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c
> index 81bf71b..aee7787 100644
> --- a/drivers/acpi/apei/ghes.c
> +++ b/drivers/acpi/apei/ghes.c
> @@ -429,28 +429,35 @@ static void ghes_kick_task_work(struct callback_head *head)
>  	gen_pool_free(ghes_estatus_pool, (unsigned long)estatus_node, node_len);
>  }
>  
> -static bool ghes_handle_memory_failure(struct acpi_hest_generic_data *gdata,
> -				       int sev)
> +static bool ghes_do_memory_failure(u64 physical_addr, int flags)
>  {
>  	unsigned long pfn;
> -	int flags = -1;
> -	int sec_sev = ghes_severity(gdata->error_severity);
> -	struct cper_sec_mem_err *mem_err = acpi_hest_get_payload(gdata);
>  
>  	if (!IS_ENABLED(CONFIG_ACPI_APEI_MEMORY_FAILURE))
>  		return false;
>  
> -	if (!(mem_err->validation_bits & CPER_MEM_VALID_PA))
> -		return false;
> -
> -	pfn = mem_err->physical_addr >> PAGE_SHIFT;
> +	pfn = PHYS_PFN(physical_addr);
>  	if (!pfn_valid(pfn)) {
>  		pr_warn_ratelimited(FW_WARN GHES_PFX
>  		"Invalid address in generic error data: %#llx\n",
> -		mem_err->physical_addr);
> +		physical_addr);
>  		return false;
>  	}
>  
> +	memory_failure_queue(pfn, flags);
> +	return true;
> +}
> +
> +static bool ghes_handle_memory_failure(struct acpi_hest_generic_data *gdata,
> +				       int sev)
> +{
> +	int flags = -1;
> +	int sec_sev = ghes_severity(gdata->error_severity);
> +	struct cper_sec_mem_err *mem_err = acpi_hest_get_payload(gdata);
> +
> +	if (!(mem_err->validation_bits & CPER_MEM_VALID_PA))
> +		return false;
> +
>  	/* iff following two events can be handled properly by now */
>  	if (sec_sev == GHES_SEV_CORRECTED &&
>  	    (gdata->flags & CPER_SEC_ERROR_THRESHOLD_EXCEEDED))
> @@ -458,14 +465,45 @@ static bool ghes_handle_memory_failure(struct acpi_hest_generic_data *gdata,
>  	if (sev == GHES_SEV_RECOVERABLE && sec_sev == GHES_SEV_RECOVERABLE)
>  		flags = 0;
>  
> -	if (flags != -1) {
> -		memory_failure_queue(pfn, flags);
> -		return true;
> -	}
> +	if (flags != -1)
> +		return ghes_do_memory_failure(mem_err->physical_addr, flags);
>  
>  	return false;
>  }
>  
> +static bool ghes_handle_arm_hw_error(struct acpi_hest_generic_data *gdata, int sev)
> +{
> +	struct cper_sec_proc_arm *err = acpi_hest_get_payload(gdata);
> +	struct cper_arm_err_info *err_info;
> +	bool queued = false;
> +	int sec_sev, i;
> +
> +	log_arm_hw_error(err);
> +
> +	sec_sev = ghes_severity(gdata->error_severity);
> +	if (sev != GHES_SEV_RECOVERABLE || sec_sev != GHES_SEV_RECOVERABLE)
> +		return false;
> +
> +	err_info = (struct cper_arm_err_info *) (err + 1);
> +	for (i = 0; i < err->err_info_num; i++, err_info++) {
> +		if (!(err_info->validation_bits & CPER_ARM_INFO_VALID_PHYSICAL_ADDR))
> +			continue;
> +
> +		if (err_info->type != CPER_ARM_CACHE_ERROR) {
> +			pr_warn_ratelimited(FW_WARN GHES_PFX
> +			"Physical address should be invalid for %s\n",
> +			err_info->type < ARRAY_SIZE(cper_proc_error_type_strs) ?
> +			cper_proc_error_type_strs[err_info->type] : "unknown error type");
> +			continue;
> +		}
> +
> +		if (ghes_do_memory_failure(err_info->physical_fault_addr, 0))
> +			queued = true;
> +	}
> +
> +	return queued;
> +}
> +
>  /*
>   * PCIe AER errors need to be sent to the AER driver for reporting and
>   * recovery. The GHES severities map to the following AER severities and
> @@ -543,9 +581,7 @@ static bool ghes_do_proc(struct ghes *ghes,
>  			ghes_handle_aer(gdata);
>  		}
>  		else if (guid_equal(sec_type, &CPER_SEC_PROC_ARM)) {
> -			struct cper_sec_proc_arm *err = acpi_hest_get_payload(gdata);
> -
> -			log_arm_hw_error(err);
> +			queued = ghes_handle_arm_hw_error(gdata, sev);
>  		} else {
>  			void *err = acpi_hest_get_payload(gdata);
>  
> 

-- 
 thanks
tanxiaofei


^ permalink raw reply	[flat|nested] 3+ messages in thread

end of thread, other threads:[~2020-08-31  1:29 UTC | newest]

Thread overview: 3+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2020-08-05  9:12 [PATCH v2] ACPI / APEI: do memory failure on the physical address reported by ARM processor error section Xiaofei Tan
2020-08-17  2:49 ` Xiaofei Tan
2020-08-31  1:28 ` Xiaofei Tan

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).